Pandas SystemError: \objects\tupleobject.c:914: bad argument to internal function

I am sporadically encountering the error message below.
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3827 from pandas.core.reshape.reshape import unstack
3828
-> 3829 return unstack(self, level, fill_value)
3830
3831 # ----------------------------------------------------------------------
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
428 if is_extension_array_dtype(obj.dtype):
429 return _unstack_extension_series(obj, level, fill_value)
--> 430 unstacker = _Unstacker(
431 obj.index, level=level, constructor=obj._constructor_expanddim
432 )
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\reshape\reshape.py in __init__(self, index, level, constructor)
116 raise ValueError("Unstacked DataFrame is too big, causing int32 overflow")
117
--> 118 self._make_selectors()
119
120 @cache_readonly
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\reshape\reshape.py in _make_selectors(self)
150
151 # make the mask
--> 152 remaining_labels = self.sorted_labels[:-1]
153 level_sizes = [len(x) for x in new_levels]
154
pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\reshape\reshape.py in sorted_labels(self)
137 @cache_readonly
138 def sorted_labels(self):
--> 139 indexer, to_sort = self._indexer_and_to_sort
140 return [line.take(indexer) for line in to_sort]
141
pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\reshape\reshape.py in _indexer_and_to_sort(self)
127 sizes = [len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]]
128
--> 129 comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
130 ngroups = len(obs_ids)
131
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\sorting.py in get_compressed_ids(labels, sizes)
195 tuple of (comp_ids, obs_group_ids)
196 """
--> 197 ids = get_group_index(labels, sizes, sort=True, xnull=False)
198 return compress_group_index(ids, sort=True)
199
C:\ProgramData\Anaconda3\envs\py39\lib\site-packages\pandas\core\sorting.py in get_group_index(labels, shape, sort, xnull)
139 labels = map(ensure_int64, labels)
140 if not xnull:
--> 141 labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
142
143 labels = list(labels)
SystemError: \objects\tupleobject.c:914: bad argument to internal function
The above was called from my actual code:
1539 slice_tmp = my_orderlog.groupby(['ticker','slice']).activeOrderSize.sum().to_frame('size').reset_index()
1540 slice_tmp['unit_size'] = slice_tmp['size']/slice_tmp['ticker'].map(wt_map)
1541 logger.info(f'TRYING TO DEBUG: {slice_tmp}')
-> 1542 breakdown = slice_tmp.groupby(['ticker','slice']).unit_size.sum().unstack(level=0)
1543 logger.info(f'TRYING TO DEBUG {breakdown}')
This SystemError about a bad argument to an internal function is not repeatable and very difficult (if not impossible) to catch.
Note that I log the dataframe in question, "slice_tmp", but when I take that logged output and run the exact same code, i.e. groupby(['ticker','slice']).unit_size.sum().unstack(level=0), in a Jupyter notebook, it runs with no problem.
Any pointers, or similar experiences with this odd one? FWIW, I'm running pandas 1.2.4.
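For reference, a minimal sketch of a defensive wrapper that would capture the offending frame when the sporadic failure hits (the retry count and pickle path are illustrative choices, not from the real code):
import pickle

def unstack_with_capture(df, retries=1, dump_path='slice_tmp_failure.pkl'):
    """Retry the unstack and pickle the input if the sporadic SystemError hits."""
    for attempt in range(retries + 1):
        try:
            return df.groupby(['ticker', 'slice']).unit_size.sum().unstack(level=0)
        except SystemError:
            # Persist the exact input so the failure can be examined post-mortem.
            with open(dump_path, 'wb') as f:
                pickle.dump(df, f)
            if attempt == retries:
                raise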

Related

AttributeError: 'Adam' object has no attribute 'get_weights'

I am pretty new to TensorFlow/Keras, and there is a problem running cross-validation that I could not fix. It all worked before I installed featurewiz (conda install -c conda-forge featurewiz).
from sklearn.model_selection import KFold, cross_validate, cross_val_score
from scikeras.wrappers import KerasClassifier
estimator = KerasClassifier(model, epochs=500, batch_size=10) #, verbose = 0
kfold = KFold(n_splits=5, shuffle=True)
results = cross_validate(estimator, X, y, cv=kfold, scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], return_train_score=True)
print(results)
Error:
WARNING:absl:Found untraced functions such as _update_step_xla while saving (showing 1 of 1). These functions will not be directly callable after loading.
INFO:tensorflow:Assets written to: ram:///var/folders/c4/ywdtx99d1vl0ptsg1fy494_40000gn/T/tmpsuvxkjb9/assets
INFO:tensorflow:Assets written to: ram:///var/folders/c4/ywdtx99d1vl0ptsg1fy494_40000gn/T/tmpsuvxkjb9/assets
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:862, in Parallel.dispatch_one_batch(self, iterator)
861 try:
--> 862 tasks = self._ready_batches.get(block=False)
863 except queue.Empty:
864 # slice the iterator n_jobs * batchsize items at a time. If the
865 # slice returns less than that, then the current batchsize puts
(...)
868 # accordingly to distribute evenly the last items between all
869 # workers.
File ~/tensorflow-test/env/lib/python3.8/queue.py:167, in Queue.get(self, block, timeout)
166 if not self._qsize():
--> 167 raise Empty
168 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
AttributeError Traceback (most recent call last)
Cell In[5], line 6
4 estimator = KerasClassifier(model, epochs=500, batch_size=10) #, verbose = 0
5 kfold = KFold(n_splits=5, shuffle=True) # seed so the shuffle stays the same, random_state=1337
----> 6 results = cross_validate(estimator, X, y, cv=kfold, scoring=['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted'], return_train_score=True)
8 print(results)
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
1076 try:
1077 # Only set self._iterating to True if at least a batch
1078 # was dispatched. In particular this covers the edge
(...)
1082 # was very quick and its callback already dispatched all the
1083 # remaining jobs.
1084 self._iterating = False
-> 1085 if self.dispatch_one_batch(iterator):
1086 self._iterating = self._original_iterator is not None
1088 while self.dispatch_one_batch(iterator):
File ~/tensorflow-test/env/lib/python3.8/site-packages/joblib/parallel.py:873, in Parallel.dispatch_one_batch(self, iterator)
870 n_jobs = self._cached_effective_n_jobs
871 big_batch_size = batch_size * n_jobs
--> 873 islice = list(itertools.islice(iterator, big_batch_size))
874 if len(islice) == 0:
875 return False
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:268, in <genexpr>(.0)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
266 results = parallel(
267 delayed(_fit_and_score)(
--> 268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/base.py:89, in clone(estimator, safe)
87 new_object_params = estimator.get_params(deep=False)
88 for name, param in new_object_params.items():
---> 89 new_object_params[name] = clone(param, safe=False)
90 new_object = klass(**new_object_params)
91 params_set = new_object.get_params(deep=False)
File ~/tensorflow-test/env/lib/python3.8/site-packages/sklearn/base.py:70, in clone(estimator, safe)
68 elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
69 if not safe:
---> 70 return copy.deepcopy(estimator)
71 else:
72 if isinstance(estimator, type):
File ~/tensorflow-test/env/lib/python3.8/copy.py:153, in deepcopy(x, memo, _nil)
151 copier = getattr(x, "__deepcopy__", None)
152 if copier is not None:
--> 153 y = copier(memo)
154 else:
155 reductor = dispatch_table.get(cls)
File ~/tensorflow-test/env/lib/python3.8/site-packages/scikeras/_saving_utils.py:117, in deepcopy_model(model, memo)
116 def deepcopy_model(model: keras.Model, memo: Dict[Hashable, Any]) -> keras.Model:
--> 117 _, (model_bytes, optimizer_weights) = pack_keras_model(model)
118 new_model = unpack_keras_model(model_bytes, optimizer_weights)
119 memo[model] = new_model
File ~/tensorflow-test/env/lib/python3.8/site-packages/scikeras/_saving_utils.py:108, in pack_keras_model(model)
106 optimizer_weights = None
107 if model.optimizer is not None:
--> 108 optimizer_weights = model.optimizer.get_weights()
109 model_bytes = np.asarray(memoryview(b.read()))
110 return (
111 unpack_keras_model,
112 (model_bytes, optimizer_weights),
113 )
AttributeError: 'Adam' object has no attribute 'get_weights'
I created a TensorFlow environment on my M1 MacBook following https://github.com/mrdbourke/m1-machine-learning-test.
It all worked; I got the following results:
TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.11.0
I also installed featurewiz; I am not sure whether there were problems installing it (I did conda install -c conda-forge featurewiz).
SciKeras doesn't work with TensorFlow 2.11. The TensorFlow team released a breaking change in a minor version bump (they removed the get_weights() method). It will be fixed in SciKeras soon: https://github.com/adriangb/scikeras/pull/287
Edit: that PR has been merged, so the new version of SciKeras (v0.10.0) should solve this issue.
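As a quick check, a minimal sketch to confirm which versions are installed (the fix being to upgrade SciKeras to at least v0.10.0, or to pin TensorFlow below 2.11 in the meantime):
from importlib.metadata import version

# SciKeras >= 0.10.0 restores compatibility with TensorFlow 2.11;
# otherwise pin tensorflow < 2.11 until an upgrade is possible.
print("scikeras:", version("scikeras"))
print("tensorflow:", version("tensorflow"))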

NotImplementedError when trying to concat dataframes in Pandas

I have a pandas dataframe where there is a column called 'CITY' with various city names. I did one-hot encoding on that column to convert the categorical features to numeric features.
dummy_CITY = pd.get_dummies(df['CITY'], drop_first=False)
dummy_CITY.head()
Next I'm trying to concatenate the new dataframe obtained after one-hot encoding, as shown below:
df_cat = pd.concat([df, dummy_CITY])
for which I'm getting the following error:
NotImplementedError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8620/1976427847.py in <module>
----> 1 df_cat = pd.concat([df, dummy_CITY])
~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
305 )
306
--> 307 return op.get_result()
308
309
~\anaconda3\lib\site-packages\pandas\core\reshape\concat.py in get_result(self)
530 mgrs_indexers.append((obj._mgr, indexers))
531
--> 532 new_data = concatenate_managers(
533 mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy
534 )
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in concatenate_managers(mgrs_indexers, axes, concat_axis, copy)
224 fastpath = blk.values.dtype == values.dtype
225 else:
--> 226 values = _concatenate_join_units(join_units, concat_axis, copy=copy)
227 fastpath = False
228
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in _concatenate_join_units(join_units, concat_axis, copy)
486
487 has_none_blocks = any(unit.block is None for unit in join_units)
--> 488 upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks)
489
490 to_concat = [
~\anaconda3\lib\site-packages\pandas\core\internals\concat.py in _dtype_to_na_value(dtype, has_none_blocks)
546 elif dtype.kind == "O":
547 return np.nan
--> 548 raise NotImplementedError
549
550
NotImplementedError:
I expected the new dataframe to be concatenated to the old one without any errors, as the number of rows matches for both dataframes.
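A minimal sketch of the column-wise variant, reusing df from the question, on the assumption that the intent is to attach the dummy columns to the same rows (which the matching row counts suggest):
import pandas as pd

# Concatenate column-wise (axis=1) so each row keeps its dummy columns;
# the default axis=0 stacks the two frames vertically instead.
dummy_CITY = pd.get_dummies(df['CITY'], drop_first=False)
df_cat = pd.concat([df, dummy_CITY], axis=1)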

ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42)

I already looked at the other similar questions, but they did not help me. I'm attempting to use GridSearchCV. I'm using three pipelines to predict NFL play data. It works pretty well until the grid search part.
Here is my code.
pipe_nfl1_1 = Pipeline([
    ('ssc', StandardScaler()),
    ('lr', LogisticRegression(random_state=42))
])
pipe_nfl1_2 = Pipeline([
    ('mms', MinMaxScaler()),
    ('rfc', RandomForestClassifier(random_state=42))
])
pipe_nfl1_3 = Pipeline([
    ('mms', MinMaxScaler()),
    ('svc', svm.SVC(random_state=42))
])
pipelines1 = [pipe_nfl1_1, pipe_nfl1_2, pipe_nfl1_3]
pipe_dict1 = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'SVC'}
for pipe in pipelines1:
    pipe.fit(X_train1, y_train1)
print('Pipeline test accuracy for predicting 1st downs:')
for idx, val in enumerate(pipelines1):
    print(' %s: %.4f' % (pipe_dict1[idx], val.score(X_test1, y_test1)))
best_acc1 = 0.0
best_clf1 = 0
best_pipe1 = ''
for idx, val in enumerate(pipelines1):
    if val.score(X_test1, y_test1) > best_acc1:
        best_acc1 = val.score(X_test1, y_test1)
        best_pipe1 = val
        best_clf1 = idx
best_acc1 *= 100
print('Classifier with best accuracy for predicting 1st downs is %s with %.2f' % (pipe_dict1[best_clf1], best_acc1) + '%')
param_grid1 = {
    'lr__n_estimators': [2, 4, 6]
}
grid_search1 = GridSearchCV(pipe_nfl1_1, param_grid1, cv=2)
# fine-tune the hyperparameters
grid_search1.fit(X_train1, y_train1)
# get the best model
final_model1 = grid_search1.best_estimator_
grid_search1.best_score_
But I'm getting an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-33-6b0007d9b8f1> in <module>
2
3 # fine-tune the hyperparameters
----> 4 grid_search1.fit(X_train1, y_train1)
5
6 # get the best model
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
518 cloned_parameters[k] = clone(v, safe=False)
519
--> 520 estimator = estimator.set_params(**cloned_parameters)
521
522 start_time = time.time()
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in set_params(self, **kwargs)
139 self
140 """
--> 141 self._set_params('steps', **kwargs)
142 return self
143
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self, attr, **params)
51 self._replace_estimator(attr, name, params.pop(name))
52 # 3. Step parameters and other initialisation arguments
---> 53 super().set_params(**params)
54 return self
55
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
259
260 for key, sub_params in nested_params.items():
--> 261 valid_params[key].set_params(**sub_params)
262
263 return self
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
247 key, delim, sub_key = key.partition('__')
248 if key not in valid_params:
--> 249 raise ValueError('Invalid parameter %s for estimator %s. '
250 'Check the list of available parameters '
251 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.
I've done LogisticRegression.get_params().keys() to get the keys, but it raises: get_params() missing 1 required positional argument: 'self'.
The lr__ prefix correctly targets the lr step of your pipeline; the problem is the parameter name after it. You want your param_grid1 dict to consist of keys that are actually parameters accepted by the model you're tuning: n_estimators is a parameter of RandomForestClassifier, but it is not a parameter of LogisticRegression, where you would tune something like C instead. (Also, get_params() must be called on an instance, e.g. LogisticRegression().get_params().keys(), which is why calling it on the class complains about the missing self.)
I think what you want to do is a grid search over the parameter space of the model that performs best, right? In that case, your param_grid1 variable should be updated to match that model, since the parameters accepted by the models you're testing vary from model to model.
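A minimal sketch of per-model grids for the pipelines above (the grid values here are illustrative, not prescriptive):
from sklearn.model_selection import GridSearchCV

# Valid keys for a pipeline can be listed from an instance, e.g.:
# print(pipe_nfl1_1.get_params().keys())
param_grid_lr = {'lr__C': [0.1, 1.0, 10.0]}        # C belongs to LogisticRegression
param_grid_rfc = {'rfc__n_estimators': [2, 4, 6]}  # n_estimators belongs to RandomForestClassifier

grid_search_lr = GridSearchCV(pipe_nfl1_1, param_grid_lr, cv=2)
grid_search_rfc = GridSearchCV(pipe_nfl1_2, param_grid_rfc, cv=2)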

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe that looks like this (this is the head of the dataframe):
import pandas as pd
import dask.dataframe as dd

data = dd.read_csv(data_path)
data = data.persist()  # persist() returns the persisted collection
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in the wrong way, or there is a better-fitting function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would either find a function that works well on a single row and then use the apply method, or a function that works well on a single Pandas dataframe and then use the map_partitions method.
For the error that you're getting, I would first verify that your function works on a pandas dataframe.
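A minimal sketch of that approach, assuming the column names from the question (Point() takes scalar coordinates, which is why passing whole Series objects raised the TypeError above):
import pandas as pd
from shapely.geometry import Point

def add_point_column(df: pd.DataFrame) -> pd.DataFrame:
    # Build one Point per row; Point() expects scalar x/y values.
    df = df.copy()
    df['geom'] = df.apply(lambda row: Point(row['x_mp_100m'], row['y_mp_100m']), axis=1)
    return df

# Verify on a plain pandas frame first, then map over the Dask partitions:
# data_out = data.map_partitions(add_point_column)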

Folium Choropleth + GeoJSON raises AttributeError: 'NoneType'

I'm trying to do a choropleth using Folium, which offers a great link between GeoJSON, Pandas and Leaflet.
The GeoJSON format is like below:
{
  "type": "FeatureCollection",
  "features": [
    {
      "type": "Feature",
      "geometry": {
        "type": "Polygon",
        "coordinates": [[[-1.6704591323124895, 49.62681486270549], .....
      },
      "properties": {
        "insee": "50173",
        "nom": "Équeurdreville-Hainneville",
        "wikipedia": "fr:Équeurdreville-Hainneville",
        "surf_m2": 12940306
      }
    },
Pandas DataFrame:
postal_count.head(5)
Out[98]:
Code_commune_INSEE CP_count
0 75120 723
1 75115 698
2 75112 671
3 75118 627
4 75111 622
"Code_communes_INSEE" corresponds to the attribute "insee" in the GeoJSON. I'd like to do a choropleth using the variable "CP_count" in the above DataFrame.
Here is my code (snippet from this notebook)
map_france = folium.Map(location=[47.000000, 2.000000], zoom_start=6)
map_france.choropleth(
    geo_str=open(geo_path + 'simplified_communes100m.json').read(),
    data=postal_count,
    columns=['Code_commune_INSEE', 'CP_count'],
    key_on='feature.geometry.properties.insee',
    fill_color='YlGn',
)
map_france.save(table_path + 'choro_test1.html')
I'm still getting this error again and again:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-83-ea0fd2c1c207> in <module>()
8 fill_color='YlGn',
9 )
---> 10 map_france.save('/media/flo/Stockage/Data/MesAides/map/choro_test1.html')
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/element.py in save(self, outfile, close_file, **kwargs)
151
152 root = self.get_root()
--> 153 html = root.render(**kwargs)
154 fid.write(html.encode('utf8'))
155 if close_file:
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/element.py in render(self, **kwargs)
357 """Renders the HTML representation of the element."""
358 for name, child in self._children.items():
--> 359 child.render(**kwargs)
360 return self._template.render(this=self, kwargs=kwargs)
361
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/element.py in render(self, **kwargs)
665
666 for name, element in self._children.items():
--> 667 element.render(**kwargs)
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/element.py in render(self, **kwargs)
661 script = self._template.module.__dict__.get('script', None)
662 if script is not None:
--> 663 figure.script.add_children(Element(script(self, kwargs)),
664 name=self.get_name())
665
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/jinja2/runtime.py in __call__(self, *args, **kwargs)
434 raise TypeError('macro %r takes not more than %d argument(s)' %
435 (self.name, len(self.arguments)))
--> 436 return self._func(*arguments)
437
438 def __repr__(self):
<template> in macro(l_this, l_kwargs)
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/jinja2/runtime.py in call(_Context__self, _Context__obj, *args, **kwargs)
194 args = (__self.environment,) + args
195 try:
--> 196 return __obj(*args, **kwargs)
197 except StopIteration:
198 return __self.environment.undefined('value was undefined because '
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/features.py in style_data(self)
352
353 for feature in self.data['features']:
--> 354 feature.setdefault('properties', {}).setdefault('style', {}).update(self.style_function(feature)) # noqa
355 return json.dumps(self.data, sort_keys=True)
356
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in style_function(x)
671 "color": line_color,
672 "fillOpacity": fill_opacity,
--> 673 "fillColor": color_scale_fun(x)
674 }
675
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in color_scale_fun(x)
659 def color_scale_fun(x):
660 return color_range[len(
--> 661 [u for u in color_domain if
662 u <= color_data[get_by_key(x, key_on)]])]
663 else:
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in <listcomp>(.0)
660 return color_range[len(
661 [u for u in color_domain if
--> 662 u <= color_data[get_by_key(x, key_on)]])]
663 else:
664 def color_scale_fun(x):
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in get_by_key(obj, key)
655 return (obj.get(key, None) if len(key.split('.')) <= 1 else
656 get_by_key(obj.get(key.split('.')[0], None),
--> 657 '.'.join(key.split('.')[1:])))
658
659 def color_scale_fun(x):
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in get_by_key(obj, key)
655 return (obj.get(key, None) if len(key.split('.')) <= 1 else
656 get_by_key(obj.get(key.split('.')[0], None),
--> 657 '.'.join(key.split('.')[1:])))
658
659 def color_scale_fun(x):
/home/flo/.virtualenvs/mesaides/lib/python3.4/site-packages/folium/folium.py in get_by_key(obj, key)
653
654 def get_by_key(obj, key):
--> 655 return (obj.get(key, None) if len(key.split('.')) <= 1 else
656 get_by_key(obj.get(key.split('.')[0], None),
657 '.'.join(key.split('.')[1:])))
AttributeError: 'NoneType' object has no attribute 'get'
I tried playing with key_on='feature.geometry.properties.insee' without any success.
There were two problems:
1. The correct way to access the 'insee' attribute is key_on='feature.properties.insee'. The best way to find the right key_on is to play with the GeoJSON dict to make sure you are calling the right properties.
2. Once you have the right key_on parameter, you need to make sure that all the keys available in the GeoJSON are contained in your Pandas DataFrame (otherwise it will raise a KeyError).
In this case, I used the following command line to get all the insee keys contained in my GeoJSON:
ogrinfo -ro -al communes-20150101-100m.shp -geom=NO | grep insee > list_code_insee.txt
If you are experiencing the same issue, this should solve your problem.
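A minimal sketch of the corrected call, reusing the objects from the question (only key_on changes, since properties hangs off the feature itself, not off its geometry):
map_france = folium.Map(location=[47.000000, 2.000000], zoom_start=6)
map_france.choropleth(
    geo_str=open(geo_path + 'simplified_communes100m.json').read(),
    data=postal_count,
    columns=['Code_commune_INSEE', 'CP_count'],
    key_on='feature.properties.insee',  # was feature.geometry.properties.insee
    fill_color='YlGn',
)
map_france.save(table_path + 'choro_test1.html')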
I had the same problem on JupyterLab (on labs.cognitiveclass.ai) using Folium 0.5.0. Then I copied my code and ran it in PyCharm, and it worked! I don't understand why; perhaps there is some backend issue?
If you want to display a Folium map outside of a Jupyter notebook, you have to save the map to HTML:
map_france.save('map_france.html')
and open the HTML file in your browser.