Seaborn pairplot not running only on a specific system - pandas

I have data in a file named 'Salaries.csv'. Its columns are Index(['yearID', 'teamID', 'lgID', 'salary', 'num_feat'], dtype='object'). Please note that I added the column num_feat to the DataFrame myself.
I want to do a Seaborn pairplot for team 'ATL' to plot scatter plots among all numeric features in the data frame.
I have the following code:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

var_set = [
    "yearID",
    "teamID",
    "lgID",
    "playerID",
    "salary"
]
head_set = []
head_set.extend(var_set)
head_set.append("num_feat")

df = pd.read_csv('Salaries.csv', index_col='playerID', header=None, names=head_set)
df['num_feat'] = 100 * np.random.random_sample(df.shape[0])  # Adding column num_feat
df_copy = df
cols_with_team_ATL = df_copy.loc[df_copy.teamID == "ATL", :]

# Create the default pairplot
pairplot_fig = sns.pairplot(cols_with_team_ATL, vars=['yearID', 'salary', 'num_feat'])
plt.subplots_adjust(top=0.9)
pairplot_fig.fig.suptitle("Scatter plots among all numeric features in the data frame for teamID = ATL",
                          fontsize=18, alpha=0.9, weight='bold')
plt.show()
The same code runs perfectly on my friend's system but not on mine. It shows the following error on my system:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/ch/6r9p7n0j3xg1l79lz1zdkvsh0000gq/T/ipykernel_97373/3735184261.py in <module>
25 # Create the default pairplot
26 print(df.columns)
---> 27 pairplot_fig = sns.pairplot(cols_with_team_ATL, vars=['yearID', 'salary', 'num_feat'])
28 plt.subplots_adjust(top=0.9)
29 pairplot_fig.fig.suptitle("Scatter plots among all numeric features in the data frame for teamID = ATL", fontsize=18, alpha=0.9, weight='bold')
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
2124 diag_kws.setdefault("legend", False)
2125 if diag_kind == "hist":
-> 2126 grid.map_diag(histplot, **diag_kws)
2127 elif diag_kind == "kde":
2128 diag_kws.setdefault("fill", True)
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
1476 plot_kwargs.setdefault("hue_order", self._hue_order)
1477 plot_kwargs.setdefault("palette", self._orig_palette)
-> 1478 func(x=vector, **plot_kwargs)
1479 ax.legend_ = None
1480
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py in histplot(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs)
1460 if p.univariate:
1461
-> 1462 p.plot_univariate_histogram(
1463 multiple=multiple,
1464 element=element,
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py in plot_univariate_histogram(self, multiple, element, fill, common_norm, common_bins, shrink, kde, kde_kws, color, legend, line_kws, estimate_kws, **plot_kws)
426
427 # First pass through the data to compute the histograms
--> 428 for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):
429
430 # Prepare the relevant data
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_core.py in iter_data(self, grouping_vars, reverse, from_comp_data)
981
982 if from_comp_data:
--> 983 data = self.comp_data
984 else:
985 data = self.plot_data
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_core.py in comp_data(self)
1055 orig = self.plot_data[var].dropna()
1056 comp_col = pd.Series(index=orig.index, dtype=float, name=var)
-> 1057 comp_col.loc[orig.index] = pd.to_numeric(axis.convert_units(orig))
1058
1059 if axis.get_scale() == "log":
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
721
722 iloc = self if self.name == "iloc" else self.obj.iloc
--> 723 iloc._setitem_with_indexer(indexer, value, self.name)
724
725 def _validate_key(self, key, axis: int):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value, name)
1730 self._setitem_with_indexer_split_path(indexer, value, name)
1731 else:
-> 1732 self._setitem_single_block(indexer, value, name)
1733
1734 def _setitem_with_indexer_split_path(self, indexer, value, name: str):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_single_block(self, indexer, value, name)
1966
1967 # actually do the set
-> 1968 self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
1969 self.obj._maybe_update_cacher(clear=True)
1970
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in setitem(self, indexer, value)
353
354 def setitem(self: T, indexer, value) -> T:
--> 355 return self.apply("setitem", indexer=indexer, value=value)
356
357 def putmask(self, mask, new, align: bool = True):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in setitem(self, indexer, value)
941
942 # length checking
--> 943 check_setitem_lengths(indexer, value, values)
944 exact_match = is_exact_shape_match(values, arr_value)
945
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexers.py in check_setitem_lengths(indexer, value, values)
174 and len(indexer[indexer]) == len(value)
175 ):
--> 176 raise ValueError(
177 "cannot set using a list-like indexer "
178 "with a different length than the value"
ValueError: cannot set using a list-like indexer with a different length than the value
Why does it fail only on my system? Is there a problem with the Python version or with Jupyter Notebook?
Please help.
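Since the question is whether the two environments differ, a minimal way to compare them (just a sketch; the versions actually installed on either machine are unknown here) is to print the relevant library versions on both systems:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib

# Run this on both machines and compare the output; a mismatch in the
# seaborn/pandas versions is a common reason the same code fails on one system only.
print(sys.version)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("seaborn:", sns.__version__)
print("matplotlib:", matplotlib.__version__)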

Related

ArrowInvalid: Could not convert ... with type DataFrame: did not recognize Python value type when inferring an Arrow data type

I am using the IForest library to implement a function for outlier detection, with the following code:
import pyspark.pandas as pd
import numpy as np
from collections import Counter
from itertools import groupby
from alibi_detect.od import IForest

# **************** IForest model ******************************************
# IForest output - Outlier ---> 1, Not-Outlier ----> 0
od = IForest(
    threshold=0.,
    n_estimators=5
)

def mode(lm):
    freqs = groupby(Counter(lm).most_common(), lambda x: x[1])
    m = [val for val, count in next(freqs)[1]]
    if len(m) > 1:
        m = np.median(lm)
    else:
        m = float(m[0])
    return m

def disper(x):
    x_pred = x[['precio_local', 'precio_contenido']]
    insumo_std = x_pred.std().to_frame().T
    mod = mode(x_pred['precio_local'])
    x_send2 = pd.DataFrame(
        index=x_pred.index,
        columns=['Std_precio', 'Std_prec_cont', 'cant_muestras', 'Moda_precio_local', 'IsFo']
    )
    x_send2.loc[:, 'Std_precio'] = insumo_std.loc[0, 'precio_local']
    x_send2.loc[:, 'Std_prec_cont'] = insumo_std.loc[0, 'precio_local']
    x_send2.loc[:, 'Moda_precio_local'] = mod
    mod_cont = mode(x_pred['precio_contenido'])
    x_send2.loc[:, 'Moda_precio_contenido_std'] = mod_cont
    ctn = x_pred.shape[0]
    x_send2.loc[:, 'cant_muestras'] = ctn
    if x_pred.shape[0] > 3:
        od.fit(x_pred)
        preds = od.predict(
            x_pred,
            return_instance_score=True
        )
        x_preds = preds['data']['is_outlier']
        #x_send2.loc[:,'IsFo'] = x_preds
        pd.set_option('compute.ops_on_diff_frames', True)
        x_send2.loc[:, 'IsFo'] = pd.Series(x_preds, index=x_pred.index)
        #x_send2.insert(x_pred.index, 'IsFo', x_preds)
    else:
        x_send2.loc[:, 'IsFo'] = 0
    print(type(x_send2))
    print(x_send2)
    return x_send2

insumo_all_pd = insumo_all.to_pandas_on_spark()
I get the error:
ArrowInvalid Traceback (most recent call last)
<command-1939548125702628> in <module>
----> 1 df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
2 display(df_result)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
192 start = time.perf_counter()
193 try:
--> 194 res = func(*args, **kwargs)
195 logger.log_success(
196 class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/pandas/groupby.py in apply(self, func, *args, **kwargs)
1200 else:
1201 pser_or_pdf = grouped.apply(pandas_apply, *args, **kwargs)
-> 1202 psser_or_psdf = ps.from_pandas(pser_or_pdf)
1203
1204 if len(pdf) <= limit:
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/namespace.py in from_pandas(pobj)
143 """
144 if isinstance(pobj, pd.Series):
--> 145 return Series(pobj)
146 elif isinstance(pobj, pd.DataFrame):
147 return DataFrame(pobj)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
424 data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
425 )
--> 426 internal = InternalFrame.from_pandas(pd.DataFrame(s))
427 if s.name is None:
428 internal = internal.copy(column_labels=[None])
/databricks/spark/python/pyspark/pandas/internal.py in from_pandas(pdf)
1458 data_columns,
1459 data_fields,
-> 1460 ) = InternalFrame.prepare_pandas_frame(pdf)
1461
1462 schema = StructType([field.struct_field for field in index_fields + data_fields])
/databricks/spark/python/pyspark/pandas/internal.py in prepare_pandas_frame(pdf, retain_index)
1531
1532 for col, dtype in zip(reset_index.columns, reset_index.dtypes):
-> 1533 spark_type = infer_pd_series_spark_type(reset_index[col], dtype)
1534 reset_index[col] = DataTypeOps(dtype, spark_type).prepare(reset_index[col])
1535
/databricks/spark/python/pyspark/pandas/typedef/typehints.py in infer_pd_series_spark_type(pser, dtype)
327 return pser.iloc[0].__UDT__
328 else:
--> 329 return from_arrow_type(pa.Array.from_pandas(pser).type)
330 elif isinstance(dtype, CategoricalDtype):
331 if isinstance(pser.dtype, CategoricalDtype):
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.Array.from_pandas()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()
/databricks/python/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Could not convert Std_precio Std_prec_cont cant_muestras Moda_precio_local IsFo Moda_precio_contenido_std
107 0.0 0.0 3 1.0 0 1.666667
252 0.0 0.0 3 1.0 0 1.666667
396 0.0 0.0 3 1.0 0 1.666667 with type DataFrame: did not recognize Python value type when inferring an Arrow data type
The error is encountered when using:
df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
The schema of dataframe insumo_all_pd is:
fecha_ola datetime64[ns]
pais object
categoria object
marca object
submarca object
contenido_std float64
unidad_std object
barcode object
precio_local float64
cantidad float64
descripcion object
id_ticket object
id_item object
id_pdv object
fecha_transaccion datetime64[ns]
id_ref float64
precio_contenido float64
dtype: object
It is not clear to me what is causing the error, but it seems that the data types are being inferred incorrectly.
I have tried to convert the data types resulting from the "disper" function to float, but it gives the same error.
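For illustration only, that float conversion would look roughly like the following, placed just before the return in disper (a hypothetical sketch of what I tried; it still produced the same error):
# Hypothetical sketch: force every column of the disper result to float before
# returning it; in my case this still produced the same ArrowInvalid error.
x_send2 = x_send2.astype('float64')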
I appreciate any help or guidance you can give me.
The new Jupyter, apparently, has changed some of the pandas-related libraries. The solution is upgrading to Jupyter 5.

ValueError: Number of columns must be a positive integer, not 0

When I try to execute the code below and plot the figure:
from pandas.plotting import scatter_matrix
scatter_matrix(total_frame)
total_frame is a pandas DataFrame. The error is:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_11336\1619863705.py in <module>
1 total_frame.dropna(how='any')
----> 2 scatter_matrix(total_frame)
3 plt.show()
~\.conda\envs\env2\lib\site-packages\pandas\plotting\_misc.py in scatter_matrix(frame, alpha, figsize, ax, grid, diagonal, marker, density_kwds, hist_kwds, range_padding, **kwargs)
137 hist_kwds=hist_kwds,
138 range_padding=range_padding,
--> 139 **kwargs,
140 )
141
~\.conda\envs\env2\lib\site-packages\pandas\plotting\_matplotlib\misc.py in scatter_matrix(frame, alpha, figsize, ax, grid, diagonal, marker, density_kwds, hist_kwds, range_padding, **kwds)
48 n = df.columns.size
49 naxes = n * n
---> 50 fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
51
52 # no gaps between subplots
~\.conda\envs\env2\lib\site-packages\pandas\plotting\_matplotlib\tools.py in create_subplots(naxes, sharex, sharey, squeeze, subplot_kw, ax, layout, layout_type, **fig_kw)
265
266 # Create first subplot separately, so we can share it if requested
--> 267 ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw)
268
269 if sharex:
~\.conda\envs\env2\lib\site-packages\matplotlib\figure.py in add_subplot(self, *args, **kwargs)
770 projection_class, pkw = self._process_projection_requirements(
771 *args, **kwargs)
--> 772 ax = subplot_class_factory(projection_class)(self, *args, **pkw)
773 key = (projection_class, pkw)
774 return self._add_axes_internal(ax, key)
~\.conda\envs\env2\lib\site-packages\matplotlib\axes\_subplots.py in __init__(self, fig, *args, **kwargs)
34 self._axes_class.__init__(self, fig, [0, 0, 1, 1], **kwargs)
35 # This will also update the axes position.
---> 36 self.set_subplotspec(SubplotSpec._from_subplot_args(fig, args))
37
38 #_api.deprecated(
~\.conda\envs\env2\lib\site-packages\matplotlib\gridspec.py in _from_subplot_args(figure, args)
595 f"{len(args)} were given")
596
--> 597 gs = GridSpec._check_gridspec_exists(figure, rows, cols)
598 if gs is None:
599 gs = GridSpec(rows, cols, figure=figure)
~\.conda\envs\env2\lib\site-packages\matplotlib\gridspec.py in _check_gridspec_exists(figure, nrows, ncols)
223 return gs
224 # else gridspec not found:
--> 225 return GridSpec(nrows, ncols, figure=figure)
226
227 def __getitem__(self, key):
~\.conda\envs\env2\lib\site-packages\matplotlib\gridspec.py in __init__(self, nrows, ncols, figure, left, bottom, right, top, wspace, hspace, width_ratios, height_ratios)
385 super().__init__(nrows, ncols,
386 width_ratios=width_ratios,
--> 387 height_ratios=height_ratios)
388
389 _AllowedKeys = ["left", "bottom", "right", "top", "wspace", "hspace"]
~\.conda\envs\env2\lib\site-packages\matplotlib\gridspec.py in __init__(self, nrows, ncols, height_ratios, width_ratios)
51 if not isinstance(ncols, Integral) or ncols <= 0:
52 raise ValueError(
---> 53 f"Number of columns must be a positive integer, not {ncols!r}")
54 self._nrows, self._ncols = nrows, ncols
55 self.set_height_ratios(height_ratios)
ValueError: Number of columns must be a positive integer, not 0
<Figure size 432x288 with 0 Axes>
I searched for this error and didn't find anything. Please help me!
I have solved it:
my data's dtype was object, but the function needs numeric columns,
so I used DataFrame.convert_dtypes() and it works.
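A minimal sketch of that fix, assuming total_frame's numeric values are currently stored in object columns:
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt

# convert_dtypes() lets pandas infer proper dtypes for columns stored as object,
# so scatter_matrix finds numeric columns to plot instead of zero.
total_frame = total_frame.convert_dtypes()
scatter_matrix(total_frame)
plt.show()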

pyspark toPandas() IndexError: index is out of bounds

I'm experiencing weird behaviour of pyspark's .toPandas() method running from Jupyter. For example, if I try this:
data = [{"Category": 'Category A', "ID": 1, "Value": 12.40},
{"Category": 'Category B', "ID": 2, "Value": 30.10},
{"Category": 'Category C', "ID": 3, "Value": 100.01}
]
# Create data frame (where spark is a SparkSession)
df = spark.createDataFrame(data)
df.show()
I'm able to successfully create the pyspark dataframe. However, when converting to pandas I get IndexError: index is out of bounds:
IndexError Traceback (most recent call last)
<path_to_python>/lib/python3.7/site-packages/IPython/core/formatters.py in __call__(self, obj)
700 type_pprinters=self.type_printers,
701 deferred_pprinters=self.deferred_printers)
--> 702 printer.pretty(obj)
703 printer.flush()
704 return stream.getvalue()
<path_to_python>/lib/python3.7/site-packages/IPython/lib/pretty.py in pretty(self, obj)
400 if cls is not object \
401 and callable(cls.__dict__.get('__repr__')):
--> 402 return _repr_pprint(obj, self, cycle)
403
404 return _default_pprint(obj, self, cycle)
<path_to_python>/lib/python3.7/site-packages/IPython/lib/pretty.py in _repr_pprint(obj, p, cycle)
695 """A pprint that just redirects to the normal repr function."""
696 # Find newlines and replace them with p.break_()
--> 697 output = repr(obj)
698 for idx,output_line in enumerate(output.splitlines()):
699 if idx:
<path_to_python>/lib/python3.7/site-packages/pandas/core/base.py in __repr__(self)
76 Yields Bytestring in Py2, Unicode String in py3.
77 """
---> 78 return str(self)
79
80
<path_to_python>/lib/python3.7/site-packages/pandas/core/base.py in __str__(self)
55
56 if compat.PY3:
---> 57 return self.__unicode__()
58 return self.__bytes__()
59
<path_to_python>/lib/python3.7/site-packages/pandas/core/frame.py in __unicode__(self)
632 width = None
633 self.to_string(buf=buf, max_rows=max_rows, max_cols=max_cols,
--> 634 line_width=width, show_dimensions=show_dimensions)
635
636 return buf.getvalue()
<path_to_python>/lib/python3.7/site-packages/pandas/core/frame.py in to_string(self, buf, columns, col_space, header, index, na_rep, formatters, float_format, sparsify, index_names, justify, max_rows, max_cols, show_dimensions, decimal, line_width)
719 decimal=decimal,
720 line_width=line_width)
--> 721 formatter.to_string()
722
723 if buf is None:
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in to_string(self)
596 else:
597
--> 598 strcols = self._to_str_columns()
599 if self.line_width is None: # no need to wrap around just print
600 # the whole frame
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _to_str_columns(self)
527 str_columns = [[label] for label in self.header]
528 else:
--> 529 str_columns = self._get_formatted_column_labels(frame)
530
531 stringified = []
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatted_column_labels(self, frame)
770 need_leadsp[x] else x]
771 for i, (col, x) in enumerate(zip(columns,
--> 772 fmt_columns))]
773
774 if self.show_row_idx_names:
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in <listcomp>(.0)
769 str_columns = [[' ' + x if not self._get_formatter(i) and
770 need_leadsp[x] else x]
--> 771 for i, (col, x) in enumerate(zip(columns,
772 fmt_columns))]
773
<path_to_python>/lib/python3.7/site-packages/pandas/io/formats/format.py in _get_formatter(self, i)
362 else:
363 if is_integer(i) and i not in self.columns:
--> 364 i = self.columns[i]
365 return self.formatters.get(i, None)
366
<path_to_python>/lib/python3.7/site-packages/pandas/core/indexes/base.py in __getitem__(self, key)
3956 if is_scalar(key):
3957 key = com.cast_scalar_indexer(key)
-> 3958 return getitem(key)
3959
3960 if isinstance(key, slice):
IndexError: index 3 is out of bounds for axis 0 with size 3
I'm not sure where the problem can be; I've used this many times without problems, but this time I tried a new environment and got this issue. In case it helps, my configuration is:
Python: 3.7.6;
Pandas: 0.24.2;
PySpark: 2.4.5
Any idea?
Thanks :)
I found the issue. Trying to minimize the code to reproduce the error, I omitted that I was adding a pandas setting:
pd.set_option('display.max_columns', -1)
This caused the error independently of the dataframe being converted. To fix it I just specified a positive number of columns or None.
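A minimal sketch of the working setting (either a positive integer or None, as described above):
import pandas as pd

# None means "no limit"; unlike -1, it does not break DataFrame display.
pd.set_option('display.max_columns', None)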

matplotlib issue: how to erase this one?

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(30, 3) * 100 + 1000,
                  index=pd.date_range(start='2018-09-01', periods=30, freq='D'),
                  columns=['1', '2', '3'])
df[:5].plot.bar()
Looking at the graph, each x label has '00:00:00', which is unnecessary.
So I tried to delete these by writing this code:
df[:5].plot.bar(x=df[:5].index.date)
But it raises an error like this.
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-52-92dd89374fec> in <module>
----> 1 df[:5].plot.bar(x=df[:5].index.date, stacked=True)
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in bar(self, x, y, **kwargs)
1001 >>> ax = df.plot.bar(x='lifespan', rot=0)
1002 """
-> 1003 return self(kind="bar", x=x, y=y, **kwargs)
1004
1005 def barh(self, x=None, y=None, **kwargs):
~\anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
810 if is_integer(x) and not data.columns.holds_integer():
811 x = data_cols[x]
--> 812 elif not isinstance(data[x], ABCSeries):
813 raise ValueError("x must be a label or position")
814 data = data.set_index(x)
~\anaconda3\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2804 if is_iterator(key):
2805 key = list(key)
-> 2806 indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
2807
2808 # take() does not accept boolean indexers
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
1550 keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
1551
-> 1552 self._validate_read_indexer(
1553 keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
1554 )
~\anaconda3\lib\site-packages\pandas\core\indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
1638 if missing == len(indexer):
1639 axis_name = self.obj._get_axis_name(axis)
-> 1640 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
1641
1642 # We (temporarily) allow for some missing keys with .loc, except in
KeyError: "None of [Index([2018-09-01, 2018-09-02, 2018-09-03, 2018-09-04, 2018-09-05], dtype='object')] are in the [columns]"
What's the problem? I just followed the book, but this error came out.
You can change the index values before selecting the first 5 rows:
df.index = df.index.date
df[:5].plot.bar()
Or:
df.rename(lambda x: x.date())[:5].plot.bar()

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe, looking like this (this is the head of the dataframe)
import pandas as pd
import dask.dataframe as dd

data = dd.read_csv(data_path)
data = data.persist()  # persist returns a new collection; keep the reference
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in a wrong way, or there should be a better-fitting function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would either find a function that works well on a single row and then use the apply method, or find a function that works well on a single pandas DataFrame and then use the map_partitions method.
For the error that you're getting, I would first verify that your function works on a pandas DataFrame.
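As a sketch of that advice (the column names come from the head shown in the question; the meta dtypes are assumptions), you could write the per-partition function against plain pandas first and then hand it to map_partitions:
import pandas as pd
from shapely.geometry import Point

def add_point_column(df):
    # Plain pandas function: test it on data.head() before mapping it over Dask partitions.
    df = df.copy()
    df['geom'] = [Point(x, y) for x, y in zip(df['x_mp_100m'], df['y_mp_100m'])]
    return df

# meta describes the output schema so Dask does not have to guess it;
# the dtypes here are assumptions based on the head shown above.
meta = {'Gitter_ID_100m': 'object', 'x_mp_100m': 'int64',
        'y_mp_100m': 'int64', 'Einwohner': 'int64', 'geom': 'object'}
data_out = data.map_partitions(add_point_column, meta=meta)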