update dynamodb table with pandas dataframe in lambda function - pandas

I am trying to upload 2 CSV files from 2 different URLs to a DynamoDB table. I am using pandas to get the desired data from the URLs and merge the 2 dataframes into df3. I'm running into an issue when I use put_item to update the database. I have tried converting the pandas Series into strings, but that doesn't seem to work either.
Here is the lambda function:
import csv
import pandas as pd
import io
import requests
import numpy as np
import boto3
from datetime import datetime
import json
from decimal import Decimal
# Lambda entry point: reads two COVID-19 CSVs with pandas, joins them on the
# index into df3, and tries to write the values to the DynamoDB table
# 'covidstatstable' (region eu-west-2) with put_item. Any exception is caught
# and printed; the function has no return statement.
def lambda_handler(event, context):
url1 = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv"
url2 = "https://raw.githubusercontent.com/datasets/covid-19/master/data/time-series-19-covid-combined.csv"
df1 = pd.read_csv(url1)
# read_csv already returns a DataFrame, so this wrap does not change the data
df1 = pd.DataFrame(df1)
# drop the row whose index label is 0
df1 = df1.drop(0)
df2 = pd.read_csv(url2, delimiter=',')
df2 = pd.DataFrame(df2)
# treat missing 'Recovered' values as 0 and store the column as int64
df2['Recovered'] = df2['Recovered'].fillna(0).astype(np.int64)
# keep only the 'Recovered' column of the US rows (the result is a Series)
df2 = df2.loc[df2['Country/Region'] == 'US', 'Recovered']
# relabel the Series 1..len(df2) so its labels line up with df1, whose label 0 was dropped
df2 = df2.reset_index(drop=True)
df2.index = np.arange(1, len(df2) + 1)
# index-aligned join: adds 'Recovered' next to df1's columns
df3 = df1.join(df2)
region = 'eu-west-2'
try:
dyndb = boto3.client('dynamodb', region_name=region)
firstrecord = True
# NOTE: iterating a DataFrame yields its column labels, not its rows, so
# 'row' is a column name here; it is never used below
for row in df1:
# skip the first iteration
if firstrecord:
firstrecord = False
continue
# NOTE: each of these selects the whole column as a pandas Series rather than
# one row's value; the function log shows put_item rejecting them because the
# 'N' and 'S' attribute values must be of type str
cases = df3['cases']
date = df3['date']
deaths = df3['deaths']
Recovered = df3['Recovered']
response = dyndb.put_item(TableName='covidstatstable',
Item={
'cases': {'N': cases},
'date': {'S': date},
'deaths': {'N': deaths},
'Recovered': {'N': Recovered},
})
print('Put succeeded:')
# broad catch: the error is only printed, so the invocation itself does not fail
except Exception as e:
print(str(e))
and here is the function logs:
Test Event Name
test1
Response
null
Function Logs
START RequestId: d5192687-d8ef-41dc-bd2f-efbcb1c7ff6f Version: $LATEST
Parameter validation failed:
Invalid type for parameter Item.cases.N, value: 1 1
2 1
3 2
4 3
5 5
...
607 42066372
608 42274530
609 42404490
610 42551956
611 42678374
Name: cases, Length: 611, dtype: int64, type: <class 'pandas.core.series.Series'>, valid types: <class 'str'>
Invalid type for parameter Item.date.S, value: 1 2020-01-22
2 2020-01-23
3 2020-01-24
4 2020-01-25
5 2020-01-26
...
607 2021-09-19
608 2021-09-20
609 2021-09-21
610 2021-09-22
611 2021-09-23
Name: date, Length: 611, dtype: object, type: <class 'pandas.core.series.Series'>, valid types: <class 'str'>
Invalid type for parameter Item.deaths.N, value: 1 0
2 0
3 0
4 0
5 0
...
607 673939
608 676191
609 678556
610 681343
611 684488
Name: deaths, Length: 611, dtype: int64, type: <class 'pandas.core.series.Series'>, valid types: <class 'str'>
Invalid type for parameter Item.Recovered.N, value: 1 0
2 0
3 0
4 0
5 0
..
607 0
608 0
609 0
610 0
611 0
Name: Recovered, Length: 611, dtype: int64, type: <class 'pandas.core.series.Series'>, valid types: <class 'str'>
END RequestId: d5192687-d8ef-41dc-bd2f-efbcb1c7ff6f
REPORT RequestId: d5192687-d8ef-41dc-bd2f-efbcb1c7ff6f Duration: 902.87 ms Billed Duration: 903 ms Memory Size: 512 MB Max Memory Used: 172 MB Init Duration: 1806.56 ms
Request ID
d5192687-d8ef-41dc-bd2f-efbcb1c7ff6f

Related

ArrowInvalid: Could not convert ... with type DataFrame: did not recognize Python value type when inferring an Arrow data type

Using IForest library implementing a function for detection outliers using the following code:
import pyspark.pandas as pd
import numpy as np
from alibi_detect.od import IForest
# **************** IForest model ******************************************
# IForest output - Outlier ---> 1, Not-Outlier ----> 0
# Module-level detector instance; disper() below fits and queries it.
od = IForest(
threshold=0.,
n_estimators=5
)
# Returns the most frequent value of lm as a float; when several values tie
# for the highest count, returns np.median(lm) instead.
def mode(lm):
# most_common() lists (value, count) pairs by descending count; grouping on the
# count and taking the first group yields every value that shares the top count
# NOTE(review): groupby and Counter are not imported in the visible snippet --
# presumably itertools.groupby and collections.Counter; confirm
freqs = groupby(Counter(lm).most_common(), lambda x:x[1])
m=[val for val,count in next(freqs)[1]]
if len(m)>1:
# tie: the median is taken over all of lm, not just the tied values
m=np.median(lm)
else:
m=float(m[0])
return m
# Builds dispersion features for the rows in x (one group of the groupby):
# standard deviation and mode of the price columns, the sample count, and an
# IForest outlier flag. Returns a DataFrame that shares x's index.
def disper(x):
x_pred = x[['precio_local', 'precio_contenido']]
# one-row frame holding the standard deviation of each price column
insumo_std = x_pred.std().to_frame().T
mod = mode(x_pred['precio_local'])
# empty result frame on x_pred's index; its columns are filled in below
x_send2 = pd.DataFrame(
index=x_pred.index,
columns=['Std_precio','Std_prec_cont','cant_muestras','Moda_precio_local','IsFo']
)
x_send2.loc[:,'Std_precio'] = insumo_std.loc[0,'precio_local']
# NOTE(review): this also reads 'precio_local'; 'precio_contenido' was probably
# intended for Std_prec_cont -- confirm
x_send2.loc[:,'Std_prec_cont'] = insumo_std.loc[0,'precio_local']
x_send2.loc[:,'Moda_precio_local'] = mod
mod_cont = mode(x_pred['precio_contenido'])
# this column is not in the columns list above, so it is added as a new column
x_send2.loc[:,'Moda_precio_contenido_std'] = mod_cont
ctn = x_pred.shape[0]
x_send2.loc[:,'cant_muestras'] = ctn
# the outlier model is only fitted when the group has more than 3 rows
if x_pred.shape[0]>3:
od.fit(x_pred)
preds = od.predict(
x_pred,
return_instance_score=True
)
x_preds = preds['data']['is_outlier']
#x_send2.loc[:,'IsFo']=x_preds
pd.set_option('compute.ops_on_diff_frames', True)
# wrap the predictions in a Series carrying x_pred's index before assigning
x_send2.loc[:,'IsFo']= pd.Series(x_preds, index=x_pred.index)
#x_send2.insert(x_pred.index, 'IsFo', x_preds)
else:
# groups with 3 rows or fewer are flagged 0 (not an outlier)
x_send2.loc[:,'IsFo'] = 0
print(type(x_send2))
print(x_send2)
return x_send2
insumo_all_pd = insumo_all.to_pandas_on_spark()
I get the error:
ArrowInvalid Traceback (most recent call last)
<command-1939548125702628> in <module>
----> 1 df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
2 display(df_result)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
192 start = time.perf_counter()
193 try:
--> 194 res = func(*args, **kwargs)
195 logger.log_success(
196 class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/pandas/groupby.py in apply(self, func, *args, **kwargs)
1200 else:
1201 pser_or_pdf = grouped.apply(pandas_apply, *args, **kwargs)
-> 1202 psser_or_psdf = ps.from_pandas(pser_or_pdf)
1203
1204 if len(pdf) <= limit:
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/namespace.py in from_pandas(pobj)
143 """
144 if isinstance(pobj, pd.Series):
--> 145 return Series(pobj)
146 elif isinstance(pobj, pd.DataFrame):
147 return DataFrame(pobj)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
424 data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
425 )
--> 426 internal = InternalFrame.from_pandas(pd.DataFrame(s))
427 if s.name is None:
428 internal = internal.copy(column_labels=[None])
/databricks/spark/python/pyspark/pandas/internal.py in from_pandas(pdf)
1458 data_columns,
1459 data_fields,
-> 1460 ) = InternalFrame.prepare_pandas_frame(pdf)
1461
1462 schema = StructType([field.struct_field for field in index_fields + data_fields])
/databricks/spark/python/pyspark/pandas/internal.py in prepare_pandas_frame(pdf, retain_index)
1531
1532 for col, dtype in zip(reset_index.columns, reset_index.dtypes):
-> 1533 spark_type = infer_pd_series_spark_type(reset_index[col], dtype)
1534 reset_index[col] = DataTypeOps(dtype, spark_type).prepare(reset_index[col])
1535
/databricks/spark/python/pyspark/pandas/typedef/typehints.py in infer_pd_series_spark_type(pser, dtype)
327 return pser.iloc[0].__UDT__
328 else:
--> 329 return from_arrow_type(pa.Array.from_pandas(pser).type)
330 elif isinstance(dtype, CategoricalDtype):
331 if isinstance(pser.dtype, CategoricalDtype):
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.Array.from_pandas()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()
/databricks/python/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Could not convert Std_precio Std_prec_cont cant_muestras Moda_precio_local IsFo Moda_precio_contenido_std
107 0.0 0.0 3 1.0 0 1.666667
252 0.0 0.0 3 1.0 0 1.666667
396 0.0 0.0 3 1.0 0 1.666667 with type DataFrame: did not recognize Python value type when inferring an Arrow data type
The error encountered by using:
df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
The schema of dataframe insumo_all_pd is:
fecha_ola datetime64[ns]
pais object
categoria object
marca object
submarca object
contenido_std float64
unidad_std object
barcode object
precio_local float64
cantidad float64
descripcion object
id_ticket object
id_item object
id_pdv object
fecha_transaccion datetime64[ns]
id_ref float64
precio_contenido float64
dtype: object
It is not clear to me what is causing the error but it seems that the data types are being inferred incorrectly.
I have tried to convert the data types resulting from the "disper" function to float but it gives the same error.
I appreciate any help or guidance you can give me.
Apparently, the new Jupyter version changed some of the pandas-related libraries. The solution is to upgrade to Jupyter 5.

SpecificationError: Function names must be unique if there is no new column names assigned

I want to create a new column in the clin dataframe based on the following conditions:
1 if vals>=2*365 or is NAN
otherwise 0
I then assign the new column name as SURV.
import numpy as np
vals = clin['days_to_death'].astype(np.float32)
# non-LTS is 0, LTS is 1
surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
clin['SURV'] = clin.apply(surv, axis=1)
Traceback:
SpecificationError: Function names must be unique if there is no new column names assigned
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
<ipython-input-31-603dee8413ce> in <module>
5 # non-LTS is 0, LTS is 1
6 surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
----> 7 clin['SURV'] = clin.apply(surv, axis=1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
146 # multiple values for keyword argument "axis"
147 return self.obj.aggregate( # type: ignore[misc]
--> 148 self.f, axis=self.axis, *self.args, **self.kwds
149 )
150
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
7572 axis = self._get_axis_number(axis)
7573
-> 7574 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7575
7576 result = None
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/aggregation.py in reconstruct_func(func, **kwargs)
93 # there is no reassigned name
94 raise SpecificationError(
---> 95 "Function names must be unique if there is no new column names "
96 "assigned"
97 )
SpecificationError: Function names must be unique if there is no new column names assigned
clin
clin = pd.DataFrame([[1, '466', '47', 0, '90'],
[1, '357', '54', 1, '80'],
[1, '108', '72', 1, '60'],
[1, '254', '51', 0, '80'],
[1, '138', '78', 1, '80'],
[0, nan, '67', 0, '60']], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
'gender', 'karnofsky_performance_score'], index=['TCGA-06-1806', 'TCGA-06-5408', 'TCGA-06-5410', 'TCGA-06-5411',
'TCGA-06-5412', 'TCGA-06-5413'])
Expected output:
vital_status
days_to_death
age_at_initial_pathologic_diagnosis
gender
karnofsky_performance_score
SURV
TCGA-06-1806
1
466
47
0
90
0
TCGA-06-5408
1
357
54
1
80
0
TCGA-06-5410
1
108
72
1
60
0
TCGA-06-5411
1
254
51
0
80
0
TCGA-06-5412
1
138
78
1
80
0
TCGA-06-5413
0
nan
67
0
60
1
Make a new column of all 0's and then update the column with your desired parameters.
clin['SURV'] = 0
clin.loc[pd.to_numeric(clin.days_to_death).ge(2*365) | clin.days_to_death.isna(), 'SURV'] = 1
print(clin)
Output:
vital_status days_to_death age_at_initial_pathologic_diagnosis gender karnofsky_performance_score SURV
TCGA-06-1806 1 466 47 0 90 0
TCGA-06-5408 1 357 54 1 80 0
TCGA-06-5410 1 108 72 1 60 0
TCGA-06-5411 1 254 51 0 80 0
TCGA-06-5412 1 138 78 1 80 0
TCGA-06-5413 0 NaN 67 0 60 1

pyspark toPandas() IndexError: index is out of bounds

I'm experiencing a weird behaviour of pyspark's .toPandas() method when running from Jupyter. For example, if I try this:
data = [{"Category": 'Category A', "ID": 1, "Value": 12.40},
{"Category": 'Category B', "ID": 2, "Value": 30.10},
{"Category": 'Category C', "ID": 3, "Value": 100.01}
]
# Create data frame (where spark is a SparkSession)
df = spark.createDataFrame(data)
df.show()
I'm able to successfully create the pyspark dataframe. However, when converting to pandas I get IndexError: index is out of bounds:
IndexError Traceback (most recent call last)
<path_to_python>/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
<path_to_python>/lib/python3.7/site-packages/IPython/lib/pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
<path_to_python>/lib/python3.7/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
<path_to_python>/lib/python3.7/site-packages/pandas/core/base.py in __repr__(self)
76 Yields Bytestring in Py2, Unicode String in py3.
77 """
---> 78 return str(self)
79
80
<path_to_python>/lib/python3.7/site-packages/pandas/core/base.py in __str__(self)
55
56 if compat.PY3:
---> 57 return self.__unicode__()
58 return self.__bytes__()
59
<path_to_python>/lib/python3.7/site-packages/pandas/core/frame.py in __unicode__(self)
632 width = None
633 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 634 line_width=width, show_dimensions=show_dimensions)
635
636 return buf.getvalue()
<path_to_python>/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width)
719 decimal=decimal,
720 line_width=line_width)
--> 721 formatter.to_string()
722
723 if buf is None:
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in to_string(self)
596 else:
597
--> 598 strcols = self._to_str_columns()
599 if self.line_width is None: # no need to wrap around just print
600 # the whole frame
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _to_str_columns(self)
527 str_columns = [[label] for label in self.header]
528 else:
--> 529 str_columns = self._get_formatted_column_labels(frame)
530
531 stringified = []
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatted_column_labels(self, frame)
770 need_leadsp[x] else x]
771 for i, (col, x) in enumerate(zip(columns,
--> 772 fmt_columns))]
773
774 if self.show_row_idx_names:
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
769 str_columns = [[' ' + x if not self._get_formatter(i) and
770 need_leadsp[x] else x]
--> 771 for i, (col, x) in enumerate(zip(columns,
772 fmt_columns))]
773
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatter(self, i)
362 else:
363 if is_integer(i) and i not in self.columns:
--> 364 i = self.columns[i]
365 return self.formatters.get(i, None)
366
<path_to_python>/lib/python3.7/site-packages/pandas/core/indexes/base.py in __getitem__(self, key)
3956 if is_scalar(key):
3957 key = com.cast_scalar_indexer(key)
-> 3958 return getitem(key)
3959
3960 if isinstance(key, slice):
IndexError: index 3 is out of bounds for axis 0 with size 3
I'm not sure where the problem can be, I've used this many times without problems but this time I tried a new environment and I got this issue. In case it can help my configuration is:
Python: 3.7.6;
Pandas: 0.24.2;
PySpark: 2.4.5
Any idea?
Thanks :)
I found the issue. Trying to minimize the code to reproduce the error I omitted that I was adding a pandas setting:
pd.set_option('display.max_columns', -1)
This caused the error independently of the dataframe being converted. To fix it I just specified a positive number of columns or None.

How to change the value in a Koalas dataframe based on a condition

I am using Koalas and I want to change the value of a column based on a condition.
In pandas I can do that using:
import pandas as pd
df_test = pd.DataFrame({
'a': [1,2,3]
,'b': ['one','two','three']})
df_test2 = pd.DataFrame({
'c': [2,1,3]
,'d': ['one','two','three']})
df_test.loc[df_test.a.isin(df_test2['c']),'b'] = 'four'
df_test.head()
a b
0 1 four
1 2 four
2 3 four
I am trying to use the same in Koalas, but I have this error:
---------------------------------------------------------------------------
PandasNotImplementedError Traceback (most recent call last)
<ipython-input-15-814219258adb> in <module>
5 new_loans['write_offs'] = 0
6
----> 7 new_loans.loc[(new_loans['ID'].isin(userinput_write_offs['id'])),'write_offs'] = 1
8 new_loans.loc[new_loans['write_offs']==1,'is_active'] = 0
9 new_loans = new_loans.sort_values(by = ['ZOHOID','Disb Date'])
/usr/local/lib/python3.7/dist-packages/databricks/koalas/base.py in isin(self, values)
894 )
895
--> 896 return self._with_new_scol(self.spark.column.isin(list(values)))
897
898 def isnull(self) -> Union["Series", "Index"]:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/series.py in __iter__(self)
5871
5872 def __iter__(self):
-> 5873 return MissingPandasLikeSeries.__iter__(self)
5874
5875 if sys.version_info >= (3, 7):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/missing/__init__.py in unsupported_function(*args, **kwargs)
21 def unsupported_function(*args, **kwargs):
22 raise PandasNotImplementedError(
---> 23 class_name=class_name, method_name=method_name, reason=reason
24 )
25
PandasNotImplementedError: The method `pd.Series.__iter__()` is not implemented. If you want to collect your data as an NumPy array, use 'to_numpy()' instead.
How could I do the same operation in Koalas?
UPDATE
Following this question: Assign Koalas Column from Numpy Result I have done:
df_test.loc[df_test.a.isin(df_test2['c'].to_list()),'b'] = 'four'
But now I have this error:
---------------------------------------------------------------------------
PythonException Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in pretty(self, obj)
392 if cls is not object \
393 and callable(cls.__dict__.get('__repr__')):
--> 394 return _repr_pprint(obj, self, cycle)
395
396 return _default_pprint(obj, self, cycle)
/usr/local/lib/python3.7/dist-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
698 """A pprint that just redirects to the normal repr function."""
699 # Find newlines and replace them with p.break_()
--> 700 output = repr(obj)
701 lines = output.splitlines()
702 with p.group():
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in __repr__(self)
10614 return self._to_internal_pandas().to_string()
10615
> 10616 pdf = self._get_or_create_repr_pandas_cache(max_display_count)
10617 pdf_length = len(pdf)
10618 pdf = pdf.iloc[:max_display_count]
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _get_or_create_repr_pandas_cache(self, n)
10606 def _get_or_create_repr_pandas_cache(self, n):
10607 if not hasattr(self, "_repr_pandas_cache") or n not in self._repr_pandas_cache:
> 10608 self._repr_pandas_cache = {n: self.head(n + 1)._to_internal_pandas()}
10609 return self._repr_pandas_cache[n]
10610
/usr/local/lib/python3.7/dist-packages/databricks/koalas/frame.py in _to_internal_pandas(self)
10602 This method is for internal use only.
10603 """
> 10604 return self._internal.to_pandas_frame
10605
10606 def _get_or_create_repr_pandas_cache(self, n):
/usr/local/lib/python3.7/dist-packages/databricks/koalas/utils.py in wrapped_lazy_property(self)
514 def wrapped_lazy_property(self):
515 if not hasattr(self, attr_name):
--> 516 setattr(self, attr_name, fn(self))
517 return getattr(self, attr_name)
518
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
/usr/local/lib/python3.7/dist-packages/py4j/java_gateway.py in __call__(self, *args)
1303 answer = self.gateway_client.send_command(command)
1304 return_value = get_return_value(
-> 1305 answer, self.gateway_client, self.target_id, self.name)
1306
1307 for temp_arg in temp_args:
/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
132 # Hide where the exception came from that shows a non-Pythonic
133 # JVM exception message.
--> 134 raise_from(converted)
135 else:
136 raise
/usr/local/spark/python/pyspark/sql/utils.py in raise_from(e)
PythonException:
An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 589, in main
func, profiler, deserializer, serializer = read_udfs(pickleSer, infile, eval_type)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 447, in read_udfs
udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 254, in read_single_udf
f, return_type = read_command(pickleSer, infile)
File "/opt/spark/python/lib/pyspark.zip/pyspark/worker.py", line 74, in read_command
command = serializer._read_with_length(file)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 172, in _read_with_length
return self.loads(obj)
File "/opt/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 458, in loads
return pickle.loads(obj, encoding=encoding)
File "/opt/spark/python/lib/pyspark.zip/pyspark/cloudpickle.py", line 1110, in subimport
__import__(name)
ModuleNotFoundError: No module named 'pandas'
Why is it trying to use pandas?
The Koalas package exposes pandas-like APIs to users at a high level, but under the hood it is implemented with PySpark APIs.
In the stack trace you pasted, a pandas DataFrame is created from the Spark DataFrame sdf using the toPandas() method and assigned to pdf.
The implementation of toPandas() imports pandas and numpy.
See lines 809 and 138 in the frames below:
/usr/local/lib/python3.7/dist-packages/databricks/koalas/internal.py in to_pandas_frame(self)
807 """ Return as pandas DataFrame. """
808 sdf = self.to_internal_spark_frame
--> 809 pdf = sdf.toPandas()
810 if len(pdf) == 0 and len(sdf.schema) > 0:
811 pdf = pdf.astype(
/usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self)
136
137 # Below is toPandas without Arrow optimization.
--> 138 pdf = pd.DataFrame.from_records(self.collect(), columns=self.columns)
139 column_counter = Counter(self.columns)
140
/usr/local/spark/python/pyspark/sql/dataframe.py in collect(self)
594 """
595 with SCCallSiteSync(self._sc) as css:
--> 596 sock_info = self._jdf.collectToPython()
597 return list(_load_from_socket(sock_info, BatchedSerializer(PickleSerializer())))
598
you can check out the implementation of toPandas() function at the following link:
https://github.com/apache/spark/blob/master/python/pyspark/sql/pandas/conversion.py

pandas dataframe convert column type to string or categorical

How do I convert a single column of a pandas dataframe to type string? In the df of housing data below I need to convert zipcode to string so that when I run linear regression, zipcode is treated as categorical and not numeric. Thanks!
df = pd.DataFrame({'zipcode': {17384: 98125, 2680: 98107, 722: 98005, 18754: 98109, 14554: 98155}, 'bathrooms': {17384: 1.5, 2680: 0.75, 722: 3.25, 18754: 1.0, 14554: 2.5}, 'sqft_lot': {17384: 1650, 2680: 3700, 722: 51836, 18754: 2640, 14554: 9603}, 'bedrooms': {17384: 2, 2680: 2, 722: 4, 18754: 2, 14554: 4}, 'sqft_living': {17384: 1430, 2680: 1440, 722: 4670, 18754: 1130, 14554: 3180}, 'floors': {17384: 3.0, 2680: 1.0, 722: 2.0, 18754: 1.0, 14554: 2.0}})
print (df)
bathrooms bedrooms floors sqft_living sqft_lot zipcode
722 3.25 4 2.0 4670 51836 98005
2680 0.75 2 1.0 1440 3700 98107
14554 2.50 4 2.0 3180 9603 98155
17384 1.50 2 3.0 1430 1650 98125
18754 1.00 2 1.0 1130 2640 98109
You need astype:
df['zipcode'] = df.zipcode.astype(str)
#df.zipcode = df.zipcode.astype(str)
For converting to categorical:
df['zipcode'] = df.zipcode.astype('category')
#df.zipcode = df.zipcode.astype('category')
Another solution is Categorical:
df['zipcode'] = pd.Categorical(df.zipcode)
Sample with data:
import pandas as pd
df = pd.DataFrame({'zipcode': {17384: 98125, 2680: 98107, 722: 98005, 18754: 98109, 14554: 98155}, 'bathrooms': {17384: 1.5, 2680: 0.75, 722: 3.25, 18754: 1.0, 14554: 2.5}, 'sqft_lot': {17384: 1650, 2680: 3700, 722: 51836, 18754: 2640, 14554: 9603}, 'bedrooms': {17384: 2, 2680: 2, 722: 4, 18754: 2, 14554: 4}, 'sqft_living': {17384: 1430, 2680: 1440, 722: 4670, 18754: 1130, 14554: 3180}, 'floors': {17384: 3.0, 2680: 1.0, 722: 2.0, 18754: 1.0, 14554: 2.0}})
print (df)
bathrooms bedrooms floors sqft_living sqft_lot zipcode
722 3.25 4 2.0 4670 51836 98005
2680 0.75 2 1.0 1440 3700 98107
14554 2.50 4 2.0 3180 9603 98155
17384 1.50 2 3.0 1430 1650 98125
18754 1.00 2 1.0 1130 2640 98109
print (df.dtypes)
bathrooms float64
bedrooms int64
floors float64
sqft_living int64
sqft_lot int64
zipcode int64
dtype: object
df['zipcode'] = df.zipcode.astype('category')
print (df)
bathrooms bedrooms floors sqft_living sqft_lot zipcode
722 3.25 4 2.0 4670 51836 98005
2680 0.75 2 1.0 1440 3700 98107
14554 2.50 4 2.0 3180 9603 98155
17384 1.50 2 3.0 1430 1650 98125
18754 1.00 2 1.0 1130 2640 98109
print (df.dtypes)
bathrooms float64
bedrooms int64
floors float64
sqft_living int64
sqft_lot int64
zipcode category
dtype: object
With pandas >= 1.0 there is now a dedicated string datatype:
1) You can convert your column to this pandas string datatype using .astype('string'):
df['zipcode'] = df['zipcode'].astype('string')
2) This is different from using str which sets the pandas object datatype:
df['zipcode'] = df['zipcode'].astype(str)
3) For changing into categorical datatype use:
df['zipcode'] = df['zipcode'].astype('category')
You can see this difference in datatypes when you look at the info of the dataframe:
df = pd.DataFrame({
'zipcode_str': [90210, 90211] ,
'zipcode_string': [90210, 90211],
'zipcode_category': [90210, 90211],
})
df['zipcode_str'] = df['zipcode_str'].astype(str)
df['zipcode_string'] = df['zipcode_str'].astype('string')
df['zipcode_category'] = df['zipcode_category'].astype('category')
df.info()
# you can see that the first column has dtype object
# while the second column has the new dtype string
# the third column has dtype category
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 zipcode_str 2 non-null object
1 zipcode_string 2 non-null string
2 zipcode_category 2 non-null category
dtypes: category(1), object(1), string(1)
From the docs:
The 'string' extension type solves several issues with object-dtype
NumPy arrays:
You can accidentally store a mixture of strings and non-strings in an
object dtype array. A StringArray can only store strings.
object dtype breaks dtype-specific operations like
DataFrame.select_dtypes(). There isn’t a clear way to select just text
while excluding non-text, but still object-dtype columns.
When reading code, the contents of an object dtype array is less clear
than string.
More info on working with the new string datatype can be found here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html
Prior answers focused on nominal data (e.g. unordered). If there is a reason to impose order for an ordinal variable, then one would use:
# Transform to category
df['zipcode_category'] = df['zipcode_category'].astype('category')
# Add the ordered category column. set_categories returns a new Series, so
# assign its result: the `inplace` argument was deprecated in pandas 1.3 and
# removed in pandas 2.0, and this form works on older versions as well.
# The list order defines the ordering (90211 < 90210).
df['zipcode_ordered'] = df['zipcode_category'].cat.set_categories(
    new_categories=[90211, 90210], ordered=True
)
# Output IDs: the integer code of each value is its position in the category list
df['zipcode_ordered_id'] = df.zipcode_ordered.cat.codes
print(df)
# zipcode_category zipcode_ordered zipcode_ordered_id
# 90210 90210 1
# 90211 90211 0
More details on setting ordered categories can be found at the pandas website:
https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html#sorting-and-order
To convert a column into a string type (that will be an object column per se in pandas), use astype:
# Convert through the DataFrame's own column; the result has object dtype
df.zipcode = df.zipcode.astype(str)
If you want to get a Categorical column, you can pass the parameter 'category' to the function:
# Convert through the DataFrame's own column; the result has category dtype
df.zipcode = df.zipcode.astype('category')