xarray: mean of data stored via OPeNDAP - numpy

I'm using xarray's very cool pydap back-end (http://xarray.pydata.org/en/stable/io.html#opendap) to read data stored via OPenDAP at IRI:
import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
print(remote_data)
#<xarray.DataArray 'zg' (P: 2, S: 6569, M: 3, L: 45, Y: 181, X: 360)>
#[115569730800 values with dtype=float32]
#Coordinates:
# * L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
# * Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
# * S (S) datetime64[ns] 1999-01-07 1999-01-08 1999-01-09 1999-01-10 ...
# * M (M) float32 1.0 2.0 3.0
# * X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
# * P (P) int32 500 200
#Attributes:
# level_type: pressure level
# standard_name: geopotential_height
# long_name: Geopotential Height
# units: m
For reference it's sub-seasonal forecast data where L is lead-time (45 days forecasts), S is initialization date and M is ensemble.
I would like to do an ensemble mean and i'm only interested in the 500 hPa level. However, it crashes out and gives a RuntimeError: NetCDF: Access failure:
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
RuntimeError Traceback (most recent call last)
<ipython-input-46-eca488e9def5> in <module>()
1 remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models' '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
2 da = remote_data.sel(P=500)
----> 3 da_ensmean = da.mean(dim='M')
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
20 keep_attrs=False, **kwargs):
21 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22 skipna=skipna, allow_lazy=True, **kwargs)
23 else:
24 def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in reduce(self, func, dim, axis, keep_attrs, **kwargs)
1359 summarized data and the indicated dimension(s) removed.
1360 """
-> 1361 var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
1362 return self._replace_maybe_drop_dims(var)
1363
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, allow_lazy, **kwargs)
1264 if dim is not None:
1265 axis = self.get_axis_num(dim)
-> 1266 data = func(self.data if allow_lazy else self.values,
1267 axis=axis, **kwargs)
1268
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in data(self)
293 return self._data
294 else:
--> 295 return self.values
296
297 #data.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in values(self)
385 def values(self):
386 """The variable's data as a numpy.ndarray"""
--> 387 return _as_array_or_item(self._data)
388
389 #values.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in _as_array_or_item(data)
209 TODO: remove this (replace with np.asarray) once these issues are fixed
210 """
--> 211 data = np.asarray(data)
212 if data.ndim == 0:
213 if data.dtype.kind == 'M':
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
622
623 def __array__(self, dtype=None):
--> 624 self._ensure_cached()
625 return np.asarray(self.array, dtype=dtype)
626
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in _ensure_cached(self)
619 def _ensure_cached(self):
620 if not isinstance(self.array, NumpyIndexingAdapter):
--> 621 self.array = NumpyIndexingAdapter(np.asarray(self.array))
622
623 def __array__(self, dtype=None):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
600
601 def __array__(self, dtype=None):
--> 602 return np.asarray(self.array, dtype=dtype)
603
604 def __getitem__(self, key):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in __getitem__(self, key)
64
65 def __getitem__(self, key):
---> 66 return self.func(self.array[key])
67
68 def __repr__(self):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in _apply_mask(data, encoded_fill_values, decoded_fill_value, dtype)
133 for fv in encoded_fill_values:
134 condition |= data == fv
--> 135 data = np.asarray(data, dtype=dtype)
136 return np.where(condition, decoded_fill_value, data)
137
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/netCDF4_.py in __getitem__(self, key)
63 with self.datastore.ensure_open(autoclose=True):
64 try:
---> 65 array = getitem(self.get_array(), key.tuple)
66 except IndexError:
67 # Catch IndexError in netCDF4 and return a more informative
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/common.py in robust_getitem(array, key, catch, max_retries, initial_delay)
114 for n in range(max_retries + 1):
115 try:
--> 116 return array[key]
117 except catch:
118 if n == max_retries:
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable.__getitem__()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable._get()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
RuntimeError: NetCDF: Access failure
Breaking down the calculation removes the RuntimeError. Guess it was just too hefty of a calculation with all the start times. Shouldn't be too difficult to put in a loop over S:
da = remote_data.isel(P=0,S=0)
da_ensmean = da.mean(dim='M')
print(da_ensmean)
<xarray.DataArray 'zg' (L: 45, Y: 181, X: 360)>
array([[[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
...,
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383],
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383]],
[[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
...,
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ],
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ]],
...,
[[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
...,
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173],
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173]],
[[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
...,
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063],
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063]]], dtype=float32)
Coordinates:
* L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
* Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
S datetime64[ns] 1999-01-07
* X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
P int32 500

This is a good use-case for chunking with dask, e.g.,
import xarray as xr
url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
remote_data = xr.open_dataarray(url, chunks={'S': 1, 'L': 1})
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
This version will access the data server in parallel, using many smaller chunks. It will still be slow to download 231 GB of data, but your request will have much better odds of success.

Related

ArrowInvalid: Could not convert ... with type DataFrame: did not recognize Python value type when inferring an Arrow data type

Using IForest library implementing a function for detection outliers using the following code:
import pyspark.pandas as pd
import numpy as np
from alibi_detect.od import IForest
# **************** Modelo IForest ******************************************
# IForest rta - Outlier ---> 1, Not-Outlier ----> 0
od = IForest(
threshold=0.,
n_estimators=5
)
def mode(lm):
freqs = groupby(Counter(lm).most_common(), lambda x:x[1])
m=[val for val,count in next(freqs)[1]]
if len(m)>1:
m=np.median(lm)
else:
m=float(m[0])
return m
def disper(x):
x_pred = x[['precio_local', 'precio_contenido']]
insumo_std = x_pred.std().to_frame().T
mod = mode(x_pred['precio_local'])
x_send2 = pd.DataFrame(
index=x_pred.index,
columns=['Std_precio','Std_prec_cont','cant_muestras','Moda_precio_local','IsFo']
)
x_send2.loc[:,'Std_precio'] = insumo_std.loc[0,'precio_local']
x_send2.loc[:,'Std_prec_cont'] = insumo_std.loc[0,'precio_local']
x_send2.loc[:,'Moda_precio_local'] = mod
mod_cont = mode(x_pred['precio_contenido'])
x_send2.loc[:,'Moda_precio_contenido_std'] = mod_cont
ctn = x_pred.shape[0]
x_send2.loc[:,'cant_muestras'] = ctn
if x_pred.shape[0]>3:
od.fit(x_pred)
preds = od.predict(
x_pred,
return_instance_score=True
)
x_preds = preds['data']['is_outlier']
#x_send2.loc[:,'IsFo']=x_preds
pd.set_option('compute.ops_on_diff_frames', True)
x_send2.loc[:,'IsFo']= pd.Series(x_preds, index=x_pred.index)
#x_send2.insert(x_pred.index, 'IsFo', x_preds)
else:
x_send2.loc[:,'IsFo'] = 0
print(type(x_send2))
print(x_send2)
return x_send2
insumo_all_pd = insumo_all.to_pandas_on_spark()
I get the error:
ArrowInvalid Traceback (most recent call last)
<command-1939548125702628> in <module>
----> 1 df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
2 display(df_result)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
192 start = time.perf_counter()
193 try:
--> 194 res = func(*args, **kwargs)
195 logger.log_success(
196 class_name, function_name, time.perf_counter() - start, signature
/databricks/spark/python/pyspark/pandas/groupby.py in apply(self, func, *args, **kwargs)
1200 else:
1201 pser_or_pdf = grouped.apply(pandas_apply, *args, **kwargs)
-> 1202 psser_or_psdf = ps.from_pandas(pser_or_pdf)
1203
1204 if len(pdf) <= limit:
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/namespace.py in from_pandas(pobj)
143 """
144 if isinstance(pobj, pd.Series):
--> 145 return Series(pobj)
146 elif isinstance(pobj, pd.DataFrame):
147 return DataFrame(pobj)
/databricks/spark/python/pyspark/pandas/usage_logging/__init__.py in wrapper(*args, **kwargs)
187 if hasattr(_local, "logging") and _local.logging:
188 # no need to log since this should be internal call.
--> 189 return func(*args, **kwargs)
190 _local.logging = True
191 try:
/databricks/spark/python/pyspark/pandas/series.py in __init__(self, data, index, dtype, name, copy, fastpath)
424 data=data, index=index, dtype=dtype, name=name, copy=copy, fastpath=fastpath
425 )
--> 426 internal = InternalFrame.from_pandas(pd.DataFrame(s))
427 if s.name is None:
428 internal = internal.copy(column_labels=[None])
/databricks/spark/python/pyspark/pandas/internal.py in from_pandas(pdf)
1458 data_columns,
1459 data_fields,
-> 1460 ) = InternalFrame.prepare_pandas_frame(pdf)
1461
1462 schema = StructType([field.struct_field for field in index_fields + data_fields])
/databricks/spark/python/pyspark/pandas/internal.py in prepare_pandas_frame(pdf, retain_index)
1531
1532 for col, dtype in zip(reset_index.columns, reset_index.dtypes):
-> 1533 spark_type = infer_pd_series_spark_type(reset_index[col], dtype)
1534 reset_index[col] = DataTypeOps(dtype, spark_type).prepare(reset_index[col])
1535
/databricks/spark/python/pyspark/pandas/typedef/typehints.py in infer_pd_series_spark_type(pser, dtype)
327 return pser.iloc[0].__UDT__
328 else:
--> 329 return from_arrow_type(pa.Array.from_pandas(pser).type)
330 elif isinstance(dtype, CategoricalDtype):
331 if isinstance(pser.dtype, CategoricalDtype):
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.Array.from_pandas()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib.array()
/databricks/python/lib/python3.8/site-packages/pyarrow/array.pxi in pyarrow.lib._ndarray_to_array()
/databricks/python/lib/python3.8/site-packages/pyarrow/error.pxi in pyarrow.lib.check_status()
ArrowInvalid: Could not convert Std_precio Std_prec_cont cant_muestras Moda_precio_local IsFo Moda_precio_contenido_std
107 0.0 0.0 3 1.0 0 1.666667
252 0.0 0.0 3 1.0 0 1.666667
396 0.0 0.0 3 1.0 0 1.666667 with type DataFrame: did not recognize Python value type when inferring an Arrow data type
The error encountered by using:
df_result = insumo_all_pd.groupby(by=['categoria','marca','submarca','barcode','contenido_std','unidad_std']).apply(disper)
The schema of dataframe insumo_all_pd is:
fecha_ola datetime64[ns]
pais object
categoria object
marca object
submarca object
contenido_std float64
unidad_std object
barcode object
precio_local float64
cantidad float64
descripcion object
id_ticket object
id_item object
id_pdv object
fecha_transaccion datetime64[ns]
id_ref float64
precio_contenido float64
dtype: object
It is not clear to me what is causing the error but it seems that the data types are being inferred incorrectly.
I have tried to convert the data types resulting from the "disper" function to float but it gives the same error.
I appreciate any help or guidance you can give me.
The new Jupyter, apparently, has changed some of the pandas related libraries. The solution's upgrading to Jupyter 5.

Seaborn pairplot not running only on a specific system

I have the following data with the name 'Salaries.csv'. It looks like the following:[The dataset has some columns like Index(['yearID', 'teamID', 'lgID', 'salary', 'num_feat'], dtype='object'). Please note that the column num_feat I have added to the DataFrame.
I want to do a Seaborn pairplot for team 'ATL' to plot scatter plots among all numeric features in the data frame.
I have the following code :
import seaborn as sns
var_set = [
"yearID",
"teamID",
"lgID",
"playerID",
"salary"
]
head_set = []
head_set.extend(var_set)
head_set.append("num_feat")
df = pd.read_csv('Salaries.csv',index_col='playerID', header=None, names=head_set)
df['num_feat'] = 100 * np.random.random_sample(df.shape[0]). #Adding column num_feat
df_copy = df
cols_with_team_ATL = df_copy.loc[df_copy.teamID=="ATL", ]
# Create the default pairplot
pairplot_fig = sns.pairplot(cols_with_team_ATL, vars=['yearID', 'salary', 'num_feat'])
plt.subplots_adjust(top=0.9)
pairplot_fig.fig.suptitle("Scatter plots among all numeric features in the data frame for teamID = ATL", fontsize=18, alpha=0.9, weight='bold')
plt.show()
The same code runs perfectly on my friend's system but not on mine. It shows the following error in my system :
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/var/folders/ch/6r9p7n0j3xg1l79lz1zdkvsh0000gq/T/ipykernel_97373/3735184261.py in <module>
25 # Create the default pairplot
26 print(df.columns)
---> 27 pairplot_fig = sns.pairplot(cols_with_team_ATL, vars=['yearID', 'salary', 'num_feat'])
28 plt.subplots_adjust(top=0.9)
29 pairplot_fig.fig.suptitle("Scatter plots among all numeric features in the data frame for teamID = ATL", fontsize=18, alpha=0.9, weight='bold')
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_decorators.py in inner_f(*args, **kwargs)
44 )
45 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 46 return f(**kwargs)
47 return inner_f
48
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py in pairplot(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)
2124 diag_kws.setdefault("legend", False)
2125 if diag_kind == "hist":
-> 2126 grid.map_diag(histplot, **diag_kws)
2127 elif diag_kind == "kde":
2128 diag_kws.setdefault("fill", True)
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/axisgrid.py in map_diag(self, func, **kwargs)
1476 plot_kwargs.setdefault("hue_order", self._hue_order)
1477 plot_kwargs.setdefault("palette", self._orig_palette)
-> 1478 func(x=vector, **plot_kwargs)
1479 ax.legend_ = None
1480
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py in histplot(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs)
1460 if p.univariate:
1461
-> 1462 p.plot_univariate_histogram(
1463 multiple=multiple,
1464 element=element,
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/distributions.py in plot_univariate_histogram(self, multiple, element, fill, common_norm, common_bins, shrink, kde, kde_kws, color, legend, line_kws, estimate_kws, **plot_kws)
426
427 # First pass through the data to compute the histograms
--> 428 for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):
429
430 # Prepare the relevant data
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_core.py in iter_data(self, grouping_vars, reverse, from_comp_data)
981
982 if from_comp_data:
--> 983 data = self.comp_data
984 else:
985 data = self.plot_data
~/USC/anaconda3/lib/python3.9/site-packages/seaborn/_core.py in comp_data(self)
1055 orig = self.plot_data[var].dropna()
1056 comp_col = pd.Series(index=orig.index, dtype=float, name=var)
-> 1057 comp_col.loc[orig.index] = pd.to_numeric(axis.convert_units(orig))
1058
1059 if axis.get_scale() == "log":
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
721
722 iloc = self if self.name == "iloc" else self.obj.iloc
--> 723 iloc._setitem_with_indexer(indexer, value, self.name)
724
725 def _validate_key(self, key, axis: int):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value, name)
1730 self._setitem_with_indexer_split_path(indexer, value, name)
1731 else:
-> 1732 self._setitem_single_block(indexer, value, name)
1733
1734 def _setitem_with_indexer_split_path(self, indexer, value, name: str):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexing.py in _setitem_single_block(self, indexer, value, name)
1966
1967 # actually do the set
-> 1968 self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
1969 self.obj._maybe_update_cacher(clear=True)
1970
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in setitem(self, indexer, value)
353
354 def setitem(self: T, indexer, value) -> T:
--> 355 return self.apply("setitem", indexer=indexer, value=value)
356
357 def putmask(self, mask, new, align: bool = True):
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
325 applied = b.apply(f, **kwargs)
326 else:
--> 327 applied = getattr(b, f)(**kwargs)
328 except (TypeError, NotImplementedError):
329 if not ignore_failures:
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py in setitem(self, indexer, value)
941
942 # length checking
--> 943 check_setitem_lengths(indexer, value, values)
944 exact_match = is_exact_shape_match(values, arr_value)
945
~/USC/anaconda3/lib/python3.9/site-packages/pandas/core/indexers.py in check_setitem_lengths(indexer, value, values)
174 and len(indexer[indexer]) == len(value)
175 ):
--> 176 raise ValueError(
177 "cannot set using a list-like indexer "
178 "with a different length than the value"
ValueError: cannot set using a list-like indexer with a different length than the value
Why is it not running particularly on my system? Is there any problem with the python version or Jupyter Notebook?
Please help.

Dask Cluster: AttributeError: 'DataFrame' object has no attribute '_data'

I'm working with a Dask Cluster on GCP. I'm using this code to deploy it:
from dask_cloudprovider.gcp import GCPCluster
from dask.distributed import Client
enviroment_vars = {
'EXTRA_PIP_PACKAGES': '"gcsfs"'
}
cluster = GCPCluster(
n_workers=32,
docker_image='daskdev/dask:2021.2.0',
env_vars=enviroment_vars,
network='my-network',
#filesystem_size=150,
machine_type='e2-standard-16',
projectid='my-project-id',
zone='us-central1-a',
on_host_maintenance="MIGRATE"
client = Client(cluster)
Then I read csv files, with the following code:
import dask.dataframe as dd
import csv
col_dtypes = {
'var1': 'float64',
'var2': 'object',
'var3': 'object',
'var4': 'float64'
}
df = dd.read_csv('gs://my_bucket/files-*.csv', blocksize=None, dtype= col_dtypes)
df = df.persist()
Everything works fine, but when I try to do some queries, or calculation, I get an error. For instance this piece of code:
df.var1.value_counts().compute()
This is the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-14-711a7c21ed42> in <module>
----> 1 df.var1.value_counts().compute()
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(self, **kwargs)
279 dask.base.compute
280 """
--> 281 (result,) = compute(self, traverse=False, **kwargs)
282 return result
283
/opt/conda/lib/python3.8/site-packages/dask/base.py in compute(*args, **kwargs)
561 postcomputes.append(x.__dask_postcompute__())
562
--> 563 results = schedule(dsk, keys, **kwargs)
564 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
565
/opt/conda/lib/python3.8/site-packages/distributed/client.py in get(self, dsk, keys, workers, allow_other_workers, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2653 should_rejoin = False
2654 try:
-> 2655 results = self.gather(packed, asynchronous=asynchronous, direct=direct)
2656 finally:
2657 for f in futures.values():
/opt/conda/lib/python3.8/site-packages/distributed/client.py in gather(self, futures, errors, direct, asynchronous)
1962 else:
1963 local_worker = None
-> 1964 return self.sync(
1965 self._gather,
1966 futures,
/opt/conda/lib/python3.8/site-packages/distributed/client.py in sync(self, func, asynchronous, callback_timeout, *args, **kwargs)
836 return future
837 else:
--> 838 return sync(
839 self.loop, func, *args, callback_timeout=callback_timeout, **kwargs
840 )
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in sync(loop, func, callback_timeout, *args, **kwargs)
338 if error[0]:
339 typ, exc, tb = error[0]
--> 340 raise exc.with_traceback(tb)
341 else:
342 return result[0]
/opt/conda/lib/python3.8/site-packages/distributed/utils.py in f()
322 if callback_timeout is not None:
323 future = asyncio.wait_for(future, callback_timeout)
--> 324 result[0] = yield future
325 except Exception as exc:
326 error[0] = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/tornado/gen.py in run(self)
760
761 try:
--> 762 value = future.result()
763 except Exception:
764 exc_info = sys.exc_info()
/opt/conda/lib/python3.8/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1827 exc = CancelledError(key)
1828 else:
-> 1829 raise exception.with_traceback(traceback)
1830 raise exc
1831 if errors == "skip":
/opt/conda/lib/python3.8/site-packages/dask/optimization.py in __call__()
961 if not len(args) == len(self.inkeys):
962 raise ValueError("Expected %d args, got %d" % (len(self.inkeys), len(args)))
--> 963 return core.get(self.dsk, self.outkey, dict(zip(self.inkeys, args)))
964
965 def __reduce__(self):
/opt/conda/lib/python3.8/site-packages/dask/core.py in get()
149 for key in toposort(dsk):
150 task = dsk[key]
--> 151 result = _execute_task(task, cache)
152 cache[key] = result
153 result = _execute_task(out, cache)
/opt/conda/lib/python3.8/site-packages/dask/core.py in _execute_task()
119 # temporaries by their reference count and can execute certain
120 # operations in-place.
--> 121 return func(*(_execute_task(a, cache) for a in args))
122 elif not ishashable(arg):
123 return arg
/opt/conda/lib/python3.8/site-packages/dask/utils.py in apply()
33 def apply(func, args, kwargs=None):
34 if kwargs:
---> 35 return func(*args, **kwargs)
36 else:
37 return func(*args)
/opt/conda/lib/python3.8/site-packages/dask/dataframe/core.py in apply_and_enforce()
5474 return meta
5475 if is_dataframe_like(df):
-> 5476 check_matching_columns(meta, df)
5477 c = meta.columns
5478 else:
/opt/conda/lib/python3.8/site-packages/dask/dataframe/utils.py in check_matching_columns()
690 def check_matching_columns(meta, actual):
691 # Need nan_to_num otherwise nan comparison gives False
--> 692 if not np.array_equal(np.nan_to_num(meta.columns), np.nan_to_num(actual.columns)):
693 extra = methods.tolist(actual.columns.difference(meta.columns))
694 missing = methods.tolist(meta.columns.difference(actual.columns))
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__get__()
/opt/conda/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__()
5268 or name in self._accessors
5269 ):
-> 5270 return object.__getattribute__(self, name)
5271 else:
5272 if self._info_axis._can_hold_identifiers_and_holds_name(name):
AttributeError: 'DataFrame' object has no attribute '_data'
The version of Pandas in my docker file is 1.0.1, so I already try upgrading Pandas (to version 1.2.2), but it didn't work, what am I doing wrong?
My guess is that you have a version mismatch somewhere. What does client.get_versions(check=True) say?

ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42)

I already looked at the other similar questions, but they did not help me. I'm attempting to use GridSearchCV. I'm using three pipelines to predict nfl play data. It works pretty well until the grid search part.
Here is my code.
pipe_nfl1_1 = Pipeline([
('ssc', StandardScaler()),
('lr', LogisticRegression(random_state=42))
])
pipe_nfl1_2 = Pipeline([
('mms', MinMaxScaler()),
('rfc', RandomForestClassifier(random_state=42))
])
pipe_nfl1_3 = Pipeline([
('mms', MinMaxScaler()),
('svc', svm.SVC(random_state=42))
])
pipelines1 = [pipe_nfl1_1, pipe_nfl1_2, pipe_nfl1_3]
pipe_dict1 = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'SVC'}
for pipe in pipelines1:
pipe.fit(X_train1, y_train1)
print('Pipeline test accuracy for predicting 1st downs:')
for idx, val in enumerate(pipelines1):
print(' %s: %.4f' % (pipe_dict1[idx], val.score(X_test1, y_test1)))
best_acc1 = 0.0
best_clf1 = 0
best_pipe1 = ''
for idx, val in enumerate(pipelines1):
if val.score(X_test1, y_test1) > best_acc1:
best_acc1 = val.score(X_test1, y_test1)
best_pipe1 = val
best_clf1 = idx
best_acc1 *= 100
print('Classifier with best accuracy for predicting 1st downs is %s with %.2f' % (pipe_dict1[best_clf1], best_acc1) + '%')
param_grid1 = {
'lr__n_estimators': [2, 4, 6]
}
grid_search1 = GridSearchCV(pipe_nfl1_1, param_grid1, cv=2)
# fine-tune the hyperparameters
grid_search1.fit(X_train1, y_train1)
# get the best model
final_model1 = grid_search1.best_estimator_
grid_search.best_score_
But I'm getting an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-33-6b0007d9b8f1> in <module>
2
3 # fine-tune the hyperparameters
----> 4 grid_search1.fit(X_train1, y_train1)
5
6 # get the best model
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
518 cloned_parameters[k] = clone(v, safe=False)
519
--> 520 estimator = estimator.set_params(**cloned_parameters)
521
522 start_time = time.time()
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in set_params(self, **kwargs)
139 self
140 """
--> 141 self._set_params('steps', **kwargs)
142 return self
143
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self, attr, **params)
51 self._replace_estimator(attr, name, params.pop(name))
52 # 3. Step parameters and other initialisation arguments
---> 53 super().set_params(**params)
54 return self
55
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
259
260 for key, sub_params in nested_params.items():
--> 261 valid_params[key].set_params(**sub_params)
262
263 return self
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
247 key, delim, sub_key = key.partition('__')
248 if key not in valid_params:
--> 249 raise ValueError('Invalid parameter %s for estimator %s. '
250 'Check the list of available parameters '
251 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.
I've done LogisticRegression.get_params().keys() to get the keys, but it returns get_params() missing 1 required positional argument: 'self'.
You shouldn't have the leading underscores in the parameter names. You want your param_grid1 dict to consist of keys that are actually parameters accepted by the model you're using. That would be n_estimators for RandomForest, and C for LogisticRegression. With that said, n_estimators is a parameter for the model RandomForest, but it's not a parameter for LogisticRegression. C is a parameter for LogisticRegression.
I think what you want to do is a grid search over the parameter space of the model that performs best, right? In that case, your param_grid1 variable should be updated to the model that performs best. The parameters accepted by the models you're testing vary from model to model.

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe, looking like this (this is the head of the dataframe)
import pandas
import dask.dataframe as dd
data = dd.read_csv(data_path)
data.persist()
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think, I am using pandas.assign() function in a wrong way, or there should be a better fitting function, I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would find a function that works well on a single row and then use the apply method or a function that works well on a single Pandas dataframe and then use the map_partitions method.
For the error that you're getting I would first verify that your function works on a pandas dataframe.