Why is the dataframe not displayed in the console using PySpark? - dataframe

This is the session object I have created for dataframe
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
This is the code for Dataframe creation
data = [("James","","Smith",30,"M",60000),
columns = ["first_name","middle_name","last_name","Age","gender","salary"]
pysparkDF = spark.createDataFrame(data = data, schema = columns)
I want the dataframe to be displayed in the console with its six columns, but it is showing me a Py4JJavaError and the output exceeds the size limit:
Output exceeds the size limit. Open the full output data in a text editor
Py4JJavaError Traceback (most recent call last)
Cell In[28], line 10
8 pysparkDF = spark.createDataFrame(data = data, schema = columns)
9 pysparkDF.printSchema()
---> 10 pysparkDF.show(truncate=False)
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pyspark\sql\dataframe.py:615, in DataFrame.show(self, n, truncate, vertical)
610 except ValueError:
611 raise TypeError(
612 "Parameter 'truncate={}' should be either bool or int.".format(truncate)
613 )
--> 615 print(self._jdf.showString(n, int_truncate, vertical))
File c:\Users\baps\AppData\Local\Programs\Python\Python38-32\lib\site-packages\py4j\java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
at java.lang.ProcessImpl.<init>(ProcessImpl.java:453)
at java.lang.ProcessImpl.start(ProcessImpl.java:140)
at java.lang.ProcessBuilder.start(ProcessBuilder.java:1029)
... 30 more


How to change the value in a Koalas dataframe based on a condition

I am using Koalas and I want to change the value of a column based on a condition.
In pandas I can do that using:
import pandas as pd
df_test = pd.DataFrame({
'a': [1,2,3]
,'b': ['one','two','three']})
df_test2 = pd.DataFrame({
'c': [2,1,3]
,'d': ['one','two','three']})
df_test.loc[df_test.a.isin(df_test2['c']),'b'] = 'four'
a b
0 1 four
1 2 four
2 3 four
I am trying to use the same in Koalas, but I have this error:
PandasNotImplementedError Traceback (most recent call last)
<ipython-input-15-814219258adb> in <module>
5 new_loans['write_offs'] = 0
----> 7 new_loans.loc[(new_loans['ID'].isin(userinput_write_offs['id'])),'write_offs'] = 1
8 new_loans.loc[new_loans['write_offs']==1,'is_active'] = 0
9 new_loans = new_loans.sort_values(by = ['ZOHOID','Disb Date'])
/usr/local/lib/python3.7/dist-packages/databricks/koalas/base.py in isin(self, values)
894 )
--> 896 return self._with_new_scol(self.spark.column.isin(list(values)))
898 def isnull(self) -> Union["Series", "Index"]:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/series.py in __iter__(self)
5872 def __iter__(self):
-> 5873 return MissingPandasLikeSeries.__iter__(self)
5875 if sys.version_info >= (3, 7):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/missing/__init__.py in unsupported_function(*args, **kwargs)
21 def unsupported_function(*args, **kwargs):
22 raise PandasNotImplementedError(
---> 23 class_name=class_name, method_name=method_name, reason=reason
24 )
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
How could I do the same operation in Koalas?
Following this question: Assign Koalas Column from Numpy Result I have done:
df_test.loc[df_test.a.isin(df_test2['c'].to_list()),'b'] = 'four'
But now I have this error:
PythonException Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in __repr__(self)
10614 return self._to_internal_pandas().to_string()
> 10616 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
10617 pdf_length = len(pdf)
10618 pdf = pdf.iloc[:max_display_count]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
10606 def _get_or_create_repr_pandas_cache(self, n):
10607 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
> 10608 self._repr_pandas_cache = {n: self.head(n + 1)._to_internal_pandas()}
10609 return self._repr_pandas_cache[n]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _to_internal_pandas(self)
10602 This method is for internal use only.
10603 """
> 10604 return self._internal.to_pandas_frame
10606 def _get_or_create_repr_pandas_cache(self, n):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
514 def wrapped_lazy_property(self):
515 if not hasattr(self, attr_name):
--> 516 setattr(self, attr_name, fn(self))
517 return getattr(self, attr_name)
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
/usr/local/spark/python/pyspark/sql/utils.py in raise_from(e)
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 254, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 74, in read_command
command = serializer._read_with_length(file)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 458, in loads
return pickle.loads(obj, encoding=encoding)
File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 1110, in subimport
ModuleNotFoundError: No module named 'pandas'
Why is it trying to use pandas?
The Koalas package exposes pandas-like APIs at a high level for users, but under the hood the implementation is done using PySpark APIs.
I observed that, in the stack trace you pasted, a pandas DataFrame is being created from the Spark DataFrame sdf using the toPandas() method and assigned to pdf.
In the implementation of the toPandas() function, pandas and numpy are imported.
Check line numbers 809 & 138.
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
you can check out the implementation of toPandas() function at the following link:

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe, looking like this (this is the head of the dataframe)
import pandas
import dask.dataframe as dd
data = dd.read_csv(data_path)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in the wrong way, or there is a better-fitting function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would find a function that works well on a single row and then use the apply method or a function that works well on a single Pandas dataframe and then use the map_partitions method.
For the error that you're getting I would first verify that your function works on a pandas dataframe.

Can't perform calculations on DataFrame values

I am trying to apply a formula to each value in a Pandas DataFrame, however, I am getting an error.
def transform_x(x):
return x/0.65
transformed = input_df.applymap(transform_x)
This returns the following error:
TypeError Traceback (most recent call last)
<ipython-input-72-66afcc1d1b80> in <module>
----> 5 transformed = input_df.applymap(transform_x)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in applymap(self, func)
6551 return lib.map_infer(x.astype(object).values, func)
-> 6553 return self.apply(infer)
6555 # ----------------------------------------------------------------------
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6489 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
149 return self.apply_raw()
--> 151 return self.apply_standard()
153 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
259 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in infer(x)
6549 if x.empty:
6550 return lib.map_infer(x, func)
-> 6551 return lib.map_infer(x.astype(object).values, func)
6553 return self.apply(infer)
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-72-66afcc1d1b80> in transform_x(x)
1 def transform_x(x):
----> 2 return x/0.65
5 transformed = input_df.applymap(transform_x)
TypeError: ("unsupported operand type(s) for /: 'str' and 'float'", 'occurred at index (column_a)')
I have tried converting the type of the DataFrame to float, as I thought that this might be the issue, however, I am encountering a different problem.
input_df = input_df.astype(float)
ValueError Traceback (most recent call last)
<ipython-input-71-2102a8e5c505> in <module>
----> 1 input_df= input_df.astype(float)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
533 def convert(self, **kwargs):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
635 # TODO(extension)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
704 return arr.view(dtype)
ValueError: could not convert string to float:
I am really not sure what is going wrong. I have tried exporting the DataFrames as a csv and, aside from the indexes which do contain text, the values are all floats. Is this something to do with the indexes perhaps?
As an addendum, I tried using pd.to_numeric outside of a lambda function but it also returned an error:
input_df = pd.to_numeric(input_df, errors='coerce')
TypeError Traceback (most recent call last)
<ipython-input-93-7178dce9054b> in <module>
----> 1 input_df = pd.to_numeric(input_df, errors='coerce')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
120 values = np.array([arg], dtype='O')
121 elif getattr(arg, 'ndim', 1) > 1:
--> 122 raise TypeError('arg must be a list, tuple, 1-d array, or Series')
123 else:
124 values = arg
TypeError: arg must be a list, tuple, 1-d array, or Series
You may try something like:
input_df = input_df.apply(lambda x: pd.to_numeric(x,errors='coerce')).applymap(transform_x)
The input_df is a 2-D structure, but pd.to_numeric() accepts only a list, tuple, 1-d array, or Series, so you cannot pass a whole DataFrame to it. Hence we use apply with a lambda to pass each column (a Series) individually.
Once the whole DataFrame holds numeric data, apply your function.

pandas to_parquet fails on large datasets

I'm trying to save a very large dataset using pandas to_parquet, and it seems to fail when exceeding a certain limit, both with 'pyarrow' and 'fastparquet'. I reproduced the errors I am getting with the following code, and would be happy to hear ideas on how to overcome that issue:
Using Pyarrow:
low = 3
high = 8
for n in np.logspace(low, high, high-low+1):
t0 = time()
df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
pd.read_parquet(tmp_file, engine='pyarrow')
print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
10^3.0 read-write took 0.012851715087890625 seconds
10^4.0 read-write took 0.05722832679748535 seconds
10^5.0 read-write took 0.46846866607666016 seconds
10^6.0 read-write took 4.4494054317474365 seconds
10^7.0 read-write took 43.0602171421051 seconds
ArrowIOError Traceback (most recent call last)
<ipython-input-51-cad917a26b91> in <module>()
5 df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
6 df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
----> 7 pd.read_parquet(tmp_file, engine='pyarrow')
8 print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in read_parquet(path, engine, columns, **kwargs)
256 impl = get_engine(engine)
--> 257 return impl.read(path, columns=columns, **kwargs)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in read(self, path, columns, **kwargs)
128 kwargs['use_pandas_metadata'] = True
129 return self.api.parquet.read_table(path, columns=columns,
--> 130 **kwargs).to_pandas()
132 def _validate_write_lt_070(self, df):
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pyarrow/parquet.py in read_table(source, columns, nthreads, metadata, use_pandas_metadata)
939 pf = ParquetFile(source, metadata=metadata)
940 return pf.read(columns=columns, nthreads=nthreads,
--> 941 use_pandas_metadata=use_pandas_metadata)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pyarrow/parquet.py in read(self, columns, nthreads, use_pandas_metadata)
148 columns, use_pandas_metadata=use_pandas_metadata)
149 return self.reader.read_all(column_indices=column_indices,
--> 150 nthreads=nthreads)
152 def scan_contents(self, columns=None, batch_size=65536):
_parquet.pyx in pyarrow._parquet.ParquetReader.read_all()
error.pxi in pyarrow.lib.check_status()
ArrowIOError: Arrow error: Invalid: BinaryArray cannot contain more than 2147483646 bytes, have 2147483650
Using fastparquet:
low = 3
high = 8
for n in np.logspace(low, high, high-low+1):
t0 = time()
df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
df.to_parquet(tmp_file, engine='fastparquet', compression='gzip')
pd.read_parquet(tmp_file, engine='fastparquet')
print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
10^3.0 read-write took 0.17770028114318848 seconds
10^4.0 read-write took 0.06351733207702637 seconds
10^5.0 read-write took 0.46896958351135254 seconds
10^6.0 read-write took 5.464379549026489 seconds
10^7.0 read-write took 50.26520347595215 seconds
OverflowError Traceback (most recent call last)
<ipython-input-49-234a889ae790> in <module>()
4 t0 = time()
5 df = pd.DataFrame.from_records([(f'ind_{x}', ''.join(['x']*50)) for x in range(int(n))], columns=['a', 'b']).set_index('a')
----> 6 df.to_parquet(tmp_file, engine='fastparquet', compression='gzip')
7 pd.read_parquet(tmp_file, engine='fastparquet')
8 print(f'10^{np.log10(int(n))} read-write took {time()-t0} seconds')
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in to_parquet(self, fname, engine, compression, **kwargs)
1647 from pandas.io.parquet import to_parquet
1648 to_parquet(self, fname, engine,
-> 1649 compression=compression, **kwargs)
1651 #Substitution(header='Write out the column names. If a list of strings '
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in to_parquet(df, path, engine, compression, **kwargs)
225 """
226 impl = get_engine(engine)
--> 227 return impl.write(df, path, compression=compression, **kwargs)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/pandas/io/parquet.py in write(self, df, path, compression, **kwargs)
198 with catch_warnings(record=True):
199 self.api.write(path, df,
--> 200 compression=compression, **kwargs)
202 def read(self, path, columns=None, **kwargs):
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write(filename, data, row_group_offsets, compression, file_scheme, open_with, mkdirs, has_nulls, write_index, partition_on, fixed_text, append, object_encoding, times)
846 if file_scheme == 'simple':
847 write_simple(filename, data, fmd, row_group_offsets,
--> 848 compression, open_with, has_nulls, append)
849 elif file_scheme in ['hive', 'drill']:
850 if append:
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write_simple(fn, data, fmd, row_group_offsets, compression, open_with, has_nulls, append)
715 else None)
716 rg = make_row_group(f, data[start:end], fmd.schema,
--> 717 compression=compression)
718 if rg is not None:
719 fmd.row_groups.append(rg)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in make_row_group(f, data, schema, compression)
612 comp = compression
613 chunk = write_column(f, data[column.name], column,
--> 614 compression=comp)
615 rg.columns.append(chunk)
616 rg.total_byte_size = sum([c.meta_data.total_uncompressed_size for c in
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/writer.py in write_column(f, data, selement, compression)
545 data_page_header=dph, crc=None)
--> 547 write_thrift(f, ph)
548 f.write(bdata)
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/thrift_structures.py in write_thrift(fobj, thrift)
49 pout = TCompactProtocol(fobj)
50 try:
---> 51 thrift.write(pout)
52 fail = False
53 except TProtocolException as e:
~/.conda/envs/anaconda3/lib/python3.6/site-packages/fastparquet/parquet_thrift/parquet/ttypes.py in write(self, oprot)
1028 def write(self, oprot):
1029 if oprot._fast_encode is not None and self.thrift_spec is not None:
-> 1030 oprot.trans.write(oprot._fast_encode(self, [self.__class__, self.thrift_spec]))
1031 return
1032 oprot.writeStructBegin('PageHeader')
OverflowError: int out of range
It seems you succeeded in writing with Pyarrow but not in reading, and failed to write with fastparquet, so you never got to the read step. I suggest you write the data with Pyarrow and read it with fastparquet in chunks, iterating through the row-groups:
from fastparquet import ParquetFile
df.to_parquet(tmp_file, engine='pyarrow', compression='gzip')
pf = ParquetFile(tmp_file)
for df in pf.iter_row_groups():
I had a similar issue; upgrading to pyarrow 0.12 worked for me and let me read the file in one go (instead of in chunks).

Overflow error when trying to save a numpy array as an image using scipy.misc.imsave

I'm trying to view a 71290x71290 array of numpy.float32s. My computer runs out of memory when I try to use matplotlib to view it interactively, so I'm trying to save it as an image file and view it outside of ipython. The following should work, afaik from reading other SO posts:
import numpy as np
W = np.zeros((71290, 71290), dtype='float32')
from scipy.misc import imsave
imsave('test.png', W)
But the last line gives me an overflow error:
OverflowError: size does not fit in an int
Here's the full stack trace:
OverflowError Traceback (most recent call last)
/home/agittens/Documents/langmodel/<ipython-input-4-bf9f2254f869> in <module>()
----> 1 imsave('test.png', W)
/usr/local/lib/python2.7/dist-packages/scipy/misc/pilutil.pyc in imsave(name, arr)
161 """
--> 162 im = toimage(arr)
163 im.save(name)
164 return
/usr/local/lib/python2.7/dist-packages/scipy/misc/pilutil.pyc in toimage(arr, high, low, cmin, cmax, pal, mode, channel_axis)
235 if mode in [None, 'L', 'P']:
236 bytedata = bytescale(data,high=high,low=low,cmin=cmin,cmax=cmax)
--> 237 image = Image.frombytes('L',shape,bytedata.tostring())
238 if pal is not None:
239 image.putpalette(asarray(pal,dtype=uint8).tostring())
/usr/lib/python2.7/dist-packages/PIL/Image.pyc in fromstring(mode, size, data, decoder_name, *args)
1796 im = new(mode, size)
-> 1797 im.fromstring(data, decoder_name, args)
1798 return im
/usr/lib/python2.7/dist-packages/PIL/Image.pyc in fromstring(self, data, decoder_name, *args)
589 d = _getdecoder(self.mode, decoder_name, args)
590 d.setimage(self.im)
--> 591 s = d.decode(data)
593 if s[0] >= 0:
OverflowError: size does not fit in an int