How to convert coordinate columns to Point column with Shapely and Dask? - pandas

I have the following problem. My data is a huge dataframe, looking like this (this is the head of the dataframe)
import pandas
import dask.dataframe as dd
data = dd.read_csv(data_path)
data.persist()
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think, I am using pandas.assign() function in a wrong way, or there should be a better fitting function, I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?

What you're doing seems fine. I would find a function that works well on a single row and then use the apply method or a function that works well on a single Pandas dataframe and then use the map_partitions method.
For the error that you're getting I would first verify that your function works on a pandas dataframe.

Related

ArrowInvalid: Could not convert ... with type DataFrame: did not recognize Python value type when inferring an Arrow data type

Using IForest library implementing a function for detection outliers using the following code:
import pyspark.pandas as pd
import numpy as np
from alibi_detect.od import IForest
# **************** Modelo IForest ******************************************
# IForest rta - Outlier ---> 1, Not-Outlier ----> 0
od = IForest(
threshold=0.,
n_estimators=5
)
def mode(lm):
freqs = groupby(Counter(lm).most_common(), lambda x:x[1])
m=[val for val,count in next(freqs)[1]]
if len(m)>1:
m=np.median(lm)
else:
m=float(m[0])
return m
def disper(x):
x_pred = x[['precio_local', 'precio_contenido']]
insumo_std = x_pred.std().to_frame().T
mod = mode(x_pred['precio_local'])
x_send2 = pd.DataFrame(
index=x_pred.index,
columns=['Std_precio','Std_prec_cont','cant_muestras','Moda_precio_local','IsFo']
)
x_send2.loc[:,'Std_precio'] = insumo_std.loc[0,'precio_local']
x_send2.loc[:,'Std_prec_cont'] = insumo_std.loc[0,'precio_local']
x_send2.loc[:,'Moda_precio_local'] = mod
mod_cont = mode(x_pred['precio_contenido'])
x_send2.loc[:,'Moda_precio_contenido_std'] = mod_cont
ctn = x_pred.shape[0]
x_send2.loc[:,'cant_muestras'] = ctn
if x_pred.shape[0]>3:
od.fit(x_pred)
preds = od.predict(
x_pred,
return_instance_score=True
)
x_preds = preds['data']['is_outlier']
#x_send2.loc[:,'IsFo']=x_preds
pd.set_option('compute.ops_on_diff_frames', True)
x_send2.loc[:,'IsFo']= pd.Series(x_preds, index=x_pred.index)
#x_send2.insert(x_pred.index, 'IsFo', x_preds)
else:
x_send2.loc[:,'IsFo'] = 0
print(type(x_send2))
print(x_send2)
return x_send2
insumo_all_pd = insumo_all.to_pandas_on_spark()
I get the error:
ArrowInvalid Traceback (most recent call last)
<command-1939548125702628> in <module>
----> 1 df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
2 display(df_result)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
192 start = time.perf_counter()
193 try:
--> 194 res = func(*args, **kwargs)
195 logger.log_success(
196 class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/pandas/groupby.py in apply(self, func, *args, **kwargs)
1200 else:
1201 pser_or_pdf = grouped.apply(pandas_apply, *args, **kwargs)
-> 1202 psser_or_psdf = ps.from_pandas(pser_or_pdf)
1203
1204 if len(pdf) <= limit:
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/namespace.py in from_pandas(pobj)
143 """
144 if isinstance(pobj, pd.Series):
--> 145 return Series(pobj)
146 elif isinstance(pobj, pd.DataFrame):
147 return DataFrame(pobj)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
424 data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
425 )
--> 426 internal = InternalFrame.from_pandas(pd.DataFrame(s))
427 if s.name is None:
428 internal = internal.copy(column_labels=[None])
/databricks/spark/python/pyspark/pandas/internal.py in from_pandas(pdf)
1458 data_columns,
1459 data_fields,
-> 1460 ) = InternalFrame.prepare_pandas_frame(pdf)
1461
1462 schema = StructType([field.struct_field for field in index_fields + data_fields])
/databricks/spark/python/pyspark/pandas/internal.py in prepare_pandas_frame(pdf, retain_index)
1531
1532 for col, dtype in zip(reset_index.columns, reset_index.dtypes):
-> 1533 spark_type = infer_pd_series_spark_type(reset_index[col], dtype)
1534 reset_index[col] = DataTypeOps(dtype, spark_type).prepare(reset_index[col])
1535
/databricks/spark/python/pyspark/pandas/typedef/typehints.py in infer_pd_series_spark_type(pser, dtype)
327 return pser.iloc[0].__UDT__
328 else:
--> 329 return from_arrow_type(pa.Array.from_pandas(pser).type)
330 elif isinstance(dtype, CategoricalDtype):
331 if isinstance(pser.dtype, CategoricalDtype):
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.Array.from_pandas()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()
/databricks/python/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Could not convert Std_precio Std_prec_cont cant_muestras Moda_precio_local IsFo Moda_precio_contenido_std
107 0.0 0.0 3 1.0 0 1.666667
252 0.0 0.0 3 1.0 0 1.666667
396 0.0 0.0 3 1.0 0 1.666667 with type DataFrame: did not recognize Python value type when inferring an Arrow data type
The error encountered by using:
df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
The schema of dataframe insumo_all_pd is:
fecha_ola datetime64[ns]
pais object
categoria object
marca object
submarca object
contenido_std float64
unidad_std object
barcode object
precio_local float64
cantidad float64
descripcion object
id_ticket object
id_item object
id_pdv object
fecha_transaccion datetime64[ns]
id_ref float64
precio_contenido float64
dtype: object
It is not clear to me what is causing the error but it seems that the data types are being inferred incorrectly.
I have tried to convert the data types resulting from the "disper" function to float but it gives the same error.
I appreciate any help or guidance you can give me.
The new Jupyter, apparently, has changed some of the pandas related libraries. The solution's upgrading to Jupyter 5.

NotImplementedError when trying to concat dataframes in Pandas

I have a pandas dataframe where there is a column called 'CITY' with various city names. I did one-hot encoding on that column to convert the categorical features to numeric features.
dummy_CITY = pd.get_dummies(df['CITY'], drop_first=False)
dummy_CITY.head()
Next I'm trying to concatenate the new dataframe obtained after one-hot encoding, as shown below:
df_cat = pd.concat([df, dummy_CITY])
for which I'm getting the following error:
`
NotImplementedError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8620/1976427847.py in <module>
----> 1 df_cat = pd.concat([df, dummy_CITY])
~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in get_result(self)
530 mgrs_indexers.append((obj._mgr, indexers))
531
--> 532 new_data = concatenate_managers(
533 mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
534 )
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in concatenate_managers(mgrs_indexers, axes, concat_axis, copy)
224 fastpath = blk.values.dtype == values.dtype
225 else:
--> 226 values = _concatenate_join_units(join_units, concat_axis, copy=copy)
227 fastpath = False
228
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in _concatenate_join_units(join_units, concat_axis, copy)
486
487 has_none_blocks = any(unit.block is None for unit in join_units)
--> 488 upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
489
490 to_concat = [
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in _dtype_to_na_value(dtype, has_none_blocks)
546 elif dtype.kind == "O":
547 return np.nan
--> 548 raise NotImplementedError
549
550
NotImplementedError:
I expected the new dataframe to be concatenated to the old one without any errors, as the number of rows match for both dataframes.

Dask Cluster: AttributeError: 'DataFrame' object has no attribute '_data'

I'm working with a Dask Cluster on GCP. I'm using this code to deploy it:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
enviroment_vars = {
'EXTRA_PIP_PACKAGES': '"gcsfs"'
}
cluster = GCPCluster(
n_workers=32,
docker_image='daskdev/dask:2021.2.0',
env_vars=enviroment_vars,
network='my-network',
#filesystem_size=150,
machine_type='e2-standard-16',
projectid='my-project-id',
zone='us-central1-a',
on_host_maintenance="MIGRATE"
client = Client(cluster)
Then I read csv files, with the following code:
import dask.dataframe as dd
import csv
col_dtypes = {
'var1': 'float64',
'var2': 'object',
'var3': 'object',
'var4': 'float64'
}
df = dd.read_csv('gs://my_bucket/files-*.csv', blocksize=None, dtype= col_dtypes)
df = df.persist()
Everything works fine, but when I try to do some queries, or calculation, I get an error. For instance this piece of code:
df.var1.value_counts().compute()
This is the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
279 dask.base.compute
280 """
--> 281 (result,) = compute(self, traverse=False, **kwargs)
282 return result
283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
561 postcomputes.append(x.__dask_postcompute__())
562
--> 563 results = schedule(dsk, keys, **kwargs)
564 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2653 should_rejoin = False
2654 try:
-> 2655 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2656 finally:
2657 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1962 else:
1963 local_worker = None
-> 1964 return self.sync(
1965 self._gather,
1966 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
836 return future
837 else:
--> 838 return sync(
839 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1827 exc = CancelledError(key)
1828 else:
-> 1829 raise exception.with_traceback(traceback)
1830 raise exc
1831 if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
961 if not len(args) == len(self.inkeys):
962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
964
965 def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
33 def apply(func, args, kwargs=None):
34 if kwargs:
---> 35 return func(*args, **kwargs)
36 else:
37 return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
5474 return meta
5475 if is_dataframe_like(df):
-> 5476 check_matching_columns(meta, df)
5477 c = meta.columns
5478 else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
690 def check_matching_columns(meta, actual):
691 # Need nan_to_num otherwise nan comparison gives False
--> 692 if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
693 extra = methods.tolist(actual.columns.difference(meta.columns))
694 missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
AttributeError: 'DataFrame' object has no attribute '_data'
The version of Pandas in my docker file is 1.0.1, so I already try upgrading Pandas (to version 1.2.2), but it didn't work, what am I doing wrong?
My guess is that you have a version mismatch somewhere. What does client.get_versions(check=True) say?

How change the value in a koalas dataframe based in a condition

I am using Koalas and I want to change the value of a column based on a condition.
In pandas I can do that using:
import pandas as pd
df_test = pd.DataFrame({
'a': [1,2,3]
,'b': ['one','two','three']})
df_test2 = pd.DataFrame({
'c': [2,1,3]
,'d': ['one','two','three']})
df_test.loc[df_test.a.isin(df_test2['c']),'b'] = 'four'
df_test.head()
a b
0 1 four
1 2 four
2 3 four
I am trying to use the same in Koalas, but I have this error:
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
<ipython-input-15-814219258adb> in <module>
5 new_loans['write_offs'] = 0
6
----> 7 new_loans.loc[(new_loans['ID'].isin(userinput_write_offs['id'])),'write_offs'] = 1
8 new_loans.loc[new_loans['write_offs']==1,'is_active'] = 0
9 new_loans = new_loans.sort_values(by = ['ZOHOID','Disb Date'])
/usr/local/lib/python3.7/dist-packages/databricks/koalas/base.py in isin(self, values)
894 )
895
--> 896 return self._with_new_scol(self.spark.column.isin(list(values)))
897
898 def isnull(self) -> Union["Series", "Index"]:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/series.py in __iter__(self)
5871
5872 def __iter__(self):
-> 5873 return MissingPandasLikeSeries.__iter__(self)
5874
5875 if sys.version_info >= (3, 7):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/missing/__init__.py in unsupported_function(*args, **kwargs)
21 def unsupported_function(*args, **kwargs):
22 raise PandasNotImplementedError(
---> 23 class_name=class_name, method_name=method_name, reason=reason
24 )
25
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
How could I do the same operation in Koalas?
UPDATE
Following this question: Assign Koalas Column from Numpy Result I have done:
df_test.loc[df_test.a.isin(df_test2['c'].to_list()),'b'] = 'four'
But now I have this error:
---------------------------------------------------------------------------
PythonException Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in __repr__(self)
10614 return self._to_internal_pandas().to_string()
10615
> 10616 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
10617 pdf_length = len(pdf)
10618 pdf = pdf.iloc[:max_display_count]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
10606 def _get_or_create_repr_pandas_cache(self, n):
10607 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
> 10608 self._repr_pandas_cache = {n: self.head(n + 1)._to_internal_pandas()}
10609 return self._repr_pandas_cache[n]
10610
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _to_internal_pandas(self)
10602 This method is for internal use only.
10603 """
> 10604 return self._internal.to_pandas_frame
10605
10606 def _get_or_create_repr_pandas_cache(self, n):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
514 def wrapped_lazy_property(self):
515 if not hasattr(self, attr_name):
--> 516 setattr(self, attr_name, fn(self))
517 return getattr(self, attr_name)
518
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
/usr/local/spark/python/pyspark/sql/utils.py in raise_from(e)
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 254, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 74, in read_command
command = serializer._read_with_length(file)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 458, in loads
return pickle.loads(obj, encoding=encoding)
File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pandas'
Why is trying to use pandas?
Koalas package exposes Pandas Like APIs on high level for the users but under the hood implementation is done using PySpark APIs.
I observed that within the stack track log you have pasted, a pandas dataframe is being created from sdf spark Dataframe using toPandas() method and assigned to pdf.
In the implementation of toPandas() function, pandas and numpy are being imported.
check line numbers 809 & 138.
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
you can check out the implementation of toPandas() function at the following link:
https://github.com/apache/spark/blob/master/python/pyspark/sql/pandas/conversion.py

Can't perform calculations on DataFrame values

I am trying to apply a formula to each value in a Pandas DataFrame, however, I am getting an error.
def transform_x(x):
return x/0.65
transformed = input_df.applymap(transform_x)
This returns the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-66afcc1d1b80> in <module>
3
4
----> 5 transformed = input_df.applymap(transform_x)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in applymap(self, func)
6551 return lib.map_infer(x.astype(object).values, func)
6552
-> 6553 return self.apply(infer)
6554
6555 # ----------------------------------------------------------------------
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6485 args=args,
6486 kwds=kwds)
-> 6487 return op.get_result()
6488
6489 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
149 return self.apply_raw()
150
--> 151 return self.apply_standard()
152
153 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
255
256 # compute the result using the series generator
--> 257 self.apply_series_generator()
258
259 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
284 try:
285 for i, v in enumerate(series_gen):
--> 286 results[i] = self.f(v)
287 keys.append(v.name)
288 except Exception as e:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in infer(x)
6549 if x.empty:
6550 return lib.map_infer(x, func)
-> 6551 return lib.map_infer(x.astype(object).values, func)
6552
6553 return self.apply(infer)
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-72-66afcc1d1b80> in transform_x(x)
1 def transform_x(x):
----> 2 return x/0.65
3
4
5 transformed = input_df.applymap(transform_x)
TypeError: ("unsupported operand type(s) for /: 'str' and 'float'", 'occurred at index (column_a)')
I have tried converting the type of the DataFrame to float, as I thought that this might be the issue, however, I am encountering a different problem.
input_df = input_df.astype(float)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-71-2102a8e5c505> in <module>
----> 1 input_df= input_df.astype(float)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in astype(self, dtype, copy, errors, **kwargs)
5689 # else, only a single dtype is given
5690 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5691 **kwargs)
5692 return self._constructor(new_data).__finalize__(self)
5693
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in astype(self, dtype, **kwargs)
529
530 def astype(self, dtype, **kwargs):
--> 531 return self.apply('astype', dtype=dtype, **kwargs)
532
533 def convert(self, **kwargs):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
393 copy=align_copy)
394
--> 395 applied = getattr(b, f)(**kwargs)
396 result_blocks = _extend_blocks(applied, result_blocks)
397
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in astype(self, dtype, copy, errors, values, **kwargs)
532 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
533 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 534 **kwargs)
535
536 def _astype(self, dtype, copy=False, errors='raise', values=None,
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\blocks.py in _astype(self, dtype, copy, errors, values, **kwargs)
631
632 # _astype_nansafe works fine with 1-d only
--> 633 values = astype_nansafe(values.ravel(), dtype, copy=True)
634
635 # TODO(extension)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\dtypes\cast.py in astype_nansafe(arr, dtype, copy, skipna)
700 if copy or is_object_dtype(arr) or is_object_dtype(dtype):
701 # Explicit copy, or required since NumPy can't view from / to object.
--> 702 return arr.astype(dtype, copy=True)
703
704 return arr.view(dtype)
ValueError: could not convert string to float:
I am really not sure what is going wrong. I have tried exporting the DataFrames as a csv and, aside from the indexes which do contain text, the values are all floats. Is this something to do with the indexes perhaps?
As an addendum, I tried using pd.to_numeric outside of a lambda function but it also returned an error:
input_df = pd.to_numeric(input_df, errors='coerce')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-7178dce9054b> in <module>
----> 1 input_df = pd.to_numeric(input_df, errors='coerce')
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\tools\numeric.py in to_numeric(arg, errors, downcast)
120 values = np.array([arg], dtype='O')
121 elif getattr(arg, 'ndim', 1) > 1:
--> 122 raise TypeError('arg must be a list, tuple, 1-d array, or Series')
123 else:
124 values = arg
TypeError: arg must be a list, tuple, 1-d array, or Series
You may try something like:
input_df = input_df.apply(lambda x: pd.to_neumeric(x,errors='coerce')).applymap(transform_x)
the input_df is a 2D array but pd.to_neumeric() takes only list, tuple, 1-d array, or Series so you cannot call a dataframe under it.Hence we take the help of lambda x to pass each series individually .
Once all the df has neumeric data, apply your function.