np.where condition is not getting satisfied - pandas

In the following line of code, I get the error shown below.
d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
If the numerator or denominator is 0, the np.nan condition should be satisfied and d3["WOE"] should be NaN. Why is the following error being produced?
---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
<ipython-input-56-a9b015683238> in <module>
----> 1 final_iv, IV = data_vars(df_leads_short,df_leads_short.close_flag)
2 IV.sort_values('IV')
<ipython-input-55-5530ad13fa5a> in data_vars(df1, target)
122 count = count + 1
123 else:
--> 124 conv = char_bin(target, df1[i])
125 conv["VAR_NAME"] = i
126 count = count + 1
<ipython-input-55-5530ad13fa5a> in char_bin(Y, X)
92 d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
93 d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
---> 94 d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
95 #d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
96 d3["IV"] = np.where((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0 ),np.nan ,(d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1934 self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
1935 ):
-> 1936 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
1937
1938 # ideally we would define this to avoid the getattr checks, but
/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py in array_ufunc(self, ufunc, method, *inputs, **kwargs)
356 # ufunc(series, ...)
357 inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
--> 358 result = getattr(ufunc, method)(*inputs, **kwargs)
359 else:
360 # ufunc(dataframe)
FloatingPointError: divide by zero encountered in log

We can do
cond = ((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0))
d3.loc[~cond,"WOE"] = np.log(d3.loc[~cond,"DIST_EVENT"]/d3.loc[~cond,"DIST_NON_EVENT"])
np.where still evaluates np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT) for every row before making the selection, so the log of zero happens anyway and raises the same error; np.where only selects between already-computed arrays. Masking with .loc avoids computing the log for the zero rows in the first place.
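If keeping the single np.where line is preferred, another option (not part of the original answer, and it assumes the FloatingPointError only appears because error handling was set to 'raise' somewhere, e.g. via np.seterr) is to compute inside an np.errstate block so log(0) yields -inf instead of raising, and let np.where substitute NaN afterwards:
with np.errstate(divide='ignore', invalid='ignore'):
    # log(0) produces -inf here instead of raising; np.where then replaces those rows with NaN.
    d3["WOE"] = np.where((d3.DIST_EVENT == 0) | (d3.DIST_NON_EVENT == 0),
                         np.nan,
                         np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT))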


I am unable to retrieve data from pandas_datareader. How will it work for Yahoo data?

PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[34], line 2
1 # !pip install pandas_datareader
----> 2 PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas_datareader\data.py:379, in DataReader(name, data_source, start, end, retry_count, pause, session, api_key)
367 raise NotImplementedError(msg)
369 if data_source == "yahoo":
370 return YahooDailyReader(
371 symbols=name,
372 start=start,
373 end=end,
374 adjust_price=False,
375 chunksize=25,
376 retry_count=retry_count,
377 pause=pause,
378 session=session,
--> 379 ).read()
381 elif data_source == "iex":
...
--> 153 data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
154 except KeyError:
155 msg = "No data fetched for symbol {} using {}"
TypeError: string indices must be integers
I need PG stock price information by date.
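The thread contains no accepted fix. A commonly used workaround (an assumption, not from the original post) is to fetch the data with the yfinance package instead, since Yahoo changed its endpoints and older pandas_datareader releases can no longer parse the response; a minimal sketch:
# Hypothetical alternative using yfinance instead of pandas_datareader's Yahoo reader.
import yfinance as yf
# Daily PG prices for 2000; the result is a pandas DataFrame indexed by date.
PG = yf.download('PG', start='2000-01-01', end='2001-01-01')
print(PG.head())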

pandas 0.25.x to 1.5.2, nested renamer migration

[In]:
pd.set_option('display.max_colwidth', 200)
topic_stats_df = corpus_topic_df.groupby('Dominant Topic').agg({
    'Dominant Topic': {
        'Doc Count': np.size,
        '% Total Docs': np.size }
})
topic_stats_df = topic_stats_df['Dominant Topic'].reset_index()
topic_stats_df['% Total Docs'] = topic_stats_df['% Total Docs'].apply(lambda row: round((row*100) / len(papers), 2))
topic_stats_df['Topic Desc'] = [topics_df.iloc[t]['Terms per Topic'] for t in range(len(topic_stats_df))]
topic_stats_df
[Out]:
---------------------------------------------------------------------------
SpecificationError Traceback (most recent call last)
Cell In[47], line 2
1 pd.set_option('display.max_colwidth', 200)
----> 2 topic_stats_df = corpus_topic_df.groupby('Dominant Topic').agg({
3 'Dominant Topic': {
4 'Doc Count': np.size,
5 '% Total Docs': np.size }
6 })
7 topic_stats_df = topic_stats_df['Dominant Topic'].reset_index()
8 topic_stats_df['% Total Docs'] = topic_stats_df['% Total Docs'].apply(lambda row: round((row*100) / len(papers), 2))
File ~/miniconda3/envs/nlp/lib/python3.8/site-packages/pandas/core/groupby/generic.py:894, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
891 func = maybe_mangle_lambdas(func)
893 op = GroupByApply(self, func, args, kwargs)
--> 894 result = op.agg()
895 if not is_dict_like(func) and result is not None:
896 return result
File ~/miniconda3/envs/nlp/lib/python3.8/site-packages/pandas/core/apply.py:169, in Apply.agg(self)
166 return self.apply_str()
168 if is_dict_like(arg):
--> 169 return self.agg_dict_like()
170 elif is_list_like(arg):
171 # we require a list, but not a 'str'
172 return self.agg_list_like()
File ~/miniconda3/envs/nlp/lib/python3.8/site-packages/pandas/core/apply.py:478, in Apply.agg_dict_like(self)
475 selected_obj = obj._selected_obj
476 selection = obj._selection
--> 478 arg = self.normalize_dictlike_arg("agg", selected_obj, arg)
480 if selected_obj.ndim == 1:
481 # key only used for output
482 colg = obj._gotitem(selection, ndim=1)
File ~/miniconda3/envs/nlp/lib/python3.8/site-packages/pandas/core/apply.py:594, in Apply.normalize_dictlike_arg(self, how, obj, func)
587 # Can't use func.values(); wouldn't work for a Series
588 if (
589 how == "agg"
590 and isinstance(obj, ABCSeries)
591 and any(is_list_like(v) for _, v in func.items())
592 ) or (any(is_dict_like(v) for _, v in func.items())):
593 # GH 15931 - deprecation of renaming keys
--> 594 raise SpecificationError("nested renamer is not supported")
596 if obj.ndim != 1:
597 # Check for missing columns on a frame
598 cols = set(func.keys()) - set(obj.columns)
SpecificationError: nested renamer is not supported
The code is credited to Sarkar, D. (2019), Text Analytics with Python, Apress, Topic modeling section.
Installing pandas 0.25.3 with pip fails because I'm on an M1 Mac.
Have tried: pip install pandas==0.25.3
Have tried: arch -x86_64 pip install pandas==0.25.3
Pandas removed nested renaming a while back in favor of pd.NamedAgg (named aggregation).
topic_stats_df = corpus_topic_df.groupby('Dominant Topic').agg({
    'Dominant Topic': {
        'Doc Count': np.size,
        '% Total Docs': np.size }
})
This statement can be rewritten as follows:
topic_stats_df = corpus_topic_df.groupby('Dominant Topic')\
                                .agg(Doc_count=('Dominant Topic', np.size),
                                     Pct_Total_Docs=('Dominant Topic', np.size))
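If the later lines that reference the original labels ('Dominant Topic', '% Total Docs') should keep working unchanged, the same named aggregation can be written with dict unpacking so the output columns keep their spaces (the intermediate topic_stats_df['Dominant Topic'] selection is then no longer needed). A minimal sketch, assuming the corpus_topic_df and papers objects from the question:
topic_stats_df = (corpus_topic_df.groupby('Dominant Topic')
                  .agg(**{'Doc Count': ('Dominant Topic', 'size'),
                          '% Total Docs': ('Dominant Topic', 'size')})
                  .reset_index())
# The percentage column can then be derived exactly as before.
topic_stats_df['% Total Docs'] = topic_stats_df['% Total Docs'].apply(
    lambda row: round((row * 100) / len(papers), 2))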

Regarding the error message "Invalid comparison between dtype=datetime64[ns] and date"

I was trying to run the following two segments, part of this Databricks tutorial.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from datetime import date
# get historical actuals & predictions for comparison
actuals_pd = history_pd[ history_pd['ds'] < date(2018, 1, 1) ]['y'] # line 1
predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat'] # line 2
However, I got the error TypeError: Invalid comparison between dtype=datetime64[ns] and date from predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat']. The previous line, which looks very similar, does not raise this error. I also printed the types of predicted_pd and actuals_pd for reference.
TypeError Traceback (most recent call last)
<ipython-input-15-748394f8994f> in <module>
----> 1 predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat']
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\__init__.py in wrapper(self, other)
368 rvalues = extract_array(other, extract_numpy=True)
369
--> 370 res_values = comparison_op(lvalues, rvalues, op)
371
372 return self._construct_result(res_values, name=res_name)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\array_ops.py in comparison_op(left, right, op)
228 if should_extension_dispatch(lvalues, rvalues):
229 # Call the method on lvalues
--> 230 res_values = op(lvalues, rvalues)
231
232 elif is_scalar(rvalues) and isna(rvalues):
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\arrays\datetimelike.py in wrapper(self, other)
116 other = _validate_comparison_value(self, other)
117 except InvalidComparison:
--> 118 return invalid_comparison(self, other, op)
119
120 dtype = getattr(other, "dtype", None)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\invalid.py in invalid_comparison(left, right, op)
32 else:
33 typ = type(right).__name__
---> 34 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
35 return res_values
36
TypeError: Invalid comparison between dtype=datetime64[ns] and date
Pandas datetime columns default to datetime64[ns], so you don't want to compare them to datetime.date objects. Instead, you can just use a date string and pandas will handle the comparison correctly. Also, if you use loc to select the rows and columns, you get cleaner syntax than in your examples.
datestr = '2018-01-01'
actuals_pd = history_pd.loc[history_pd['ds'] < datestr, 'y'] # line 1
predicted_pd = forecast_pd.loc[forecast_pd['ds'] < datestr, 'yhat'] # line 2
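If an actual datetime-like object is preferred over a string, converting the cutoff to a pandas Timestamp also avoids the error; a minimal sketch under that assumption:
import pandas as pd
cutoff = pd.Timestamp(2018, 1, 1)  # scalar compatible with datetime64[ns]
actuals_pd = history_pd.loc[history_pd['ds'] < cutoff, 'y']
predicted_pd = forecast_pd.loc[forecast_pd['ds'] < cutoff, 'yhat']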

ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42)

I already looked at other similar questions, but they did not help me. I'm attempting to use GridSearchCV with three pipelines to predict NFL play data. It works pretty well until the grid search part.
Here is my code.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV

pipe_nfl1_1 = Pipeline([
    ('ssc', StandardScaler()),
    ('lr', LogisticRegression(random_state=42))
])
pipe_nfl1_2 = Pipeline([
    ('mms', MinMaxScaler()),
    ('rfc', RandomForestClassifier(random_state=42))
])
pipe_nfl1_3 = Pipeline([
    ('mms', MinMaxScaler()),
    ('svc', svm.SVC(random_state=42))
])

pipelines1 = [pipe_nfl1_1, pipe_nfl1_2, pipe_nfl1_3]
pipe_dict1 = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'SVC'}

for pipe in pipelines1:
    pipe.fit(X_train1, y_train1)

print('Pipeline test accuracy for predicting 1st downs:')
for idx, val in enumerate(pipelines1):
    print(' %s: %.4f' % (pipe_dict1[idx], val.score(X_test1, y_test1)))

best_acc1 = 0.0
best_clf1 = 0
best_pipe1 = ''
for idx, val in enumerate(pipelines1):
    if val.score(X_test1, y_test1) > best_acc1:
        best_acc1 = val.score(X_test1, y_test1)
        best_pipe1 = val
        best_clf1 = idx
best_acc1 *= 100
print('Classifier with best accuracy for predicting 1st downs is %s with %.2f' % (pipe_dict1[best_clf1], best_acc1) + '%')

param_grid1 = {
    'lr__n_estimators': [2, 4, 6]
}
grid_search1 = GridSearchCV(pipe_nfl1_1, param_grid1, cv=2)

# fine-tune the hyperparameters
grid_search1.fit(X_train1, y_train1)

# get the best model
final_model1 = grid_search1.best_estimator_
grid_search1.best_score_
But I'm getting an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-33-6b0007d9b8f1> in <module>
2
3 # fine-tune the hyperparameters
----> 4 grid_search1.fit(X_train1, y_train1)
5
6 # get the best model
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
518 cloned_parameters[k] = clone(v, safe=False)
519
--> 520 estimator = estimator.set_params(**cloned_parameters)
521
522 start_time = time.time()
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in set_params(self, **kwargs)
139 self
140 """
--> 141 self._set_params('steps', **kwargs)
142 return self
143
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self, attr, **params)
51 self._replace_estimator(attr, name, params.pop(name))
52 # 3. Step parameters and other initialisation arguments
---> 53 super().set_params(**params)
54 return self
55
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
259
260 for key, sub_params in nested_params.items():
--> 261 valid_params[key].set_params(**sub_params)
262
263 return self
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
247 key, delim, sub_key = key.partition('__')
248 if key not in valid_params:
--> 249 raise ValueError('Invalid parameter %s for estimator %s. '
250 'Check the list of available parameters '
251 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.
I've tried LogisticRegression.get_params().keys() to get the keys, but it raises get_params() missing 1 required positional argument: 'self'.
The lr__ prefix itself is fine: for a pipeline, grid-search keys are built as <step name>__<parameter>, and your Logistic Regression step is named 'lr'. The real problem is the parameter name. Your param_grid1 dict must only contain keys that map to parameters the targeted estimator actually accepts: n_estimators is a parameter of RandomForestClassifier, not of LogisticRegression, where the usual parameter to tune is C.
I think what you want to do is a grid search over the parameter space of the model that performs best, right? In that case, param_grid1 should be written for that model and use that pipeline's step name, since the parameters accepted by the models you're testing vary from model to model.
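As an illustration (a sketch that reuses the pipelines and data from the question, not code from the original answer; the value ranges are arbitrary):
# Logistic Regression pipeline (step named 'lr'): tune C, not n_estimators.
param_grid_lr = {'lr__C': [0.1, 1.0, 10.0]}
grid_search_lr = GridSearchCV(pipe_nfl1_1, param_grid_lr, cv=2)
grid_search_lr.fit(X_train1, y_train1)

# Random Forest pipeline (step named 'rfc'): n_estimators is valid here.
param_grid_rfc = {'rfc__n_estimators': [2, 4, 6]}
grid_search_rfc = GridSearchCV(pipe_nfl1_2, param_grid_rfc, cv=2)
grid_search_rfc.fit(X_train1, y_train1)

# Valid keys can be listed from an instance (not the class) or from the pipeline itself:
print(LogisticRegression().get_params().keys())
print(pipe_nfl1_1.get_params().keys())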

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe that looks like this (this is the head of the dataframe):
import pandas as pd
import dask.dataframe as dd
data = dd.read_csv(data_path)
data = data.persist()
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in the wrong way, or there is a better-fitting function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. The assign call fails because the lambda receives whole columns, so Point() is handed two pandas Series rather than two scalar coordinates, which is why Shapely raises "cannot convert the series to <class 'float'>". I would either write a function that works well on a single row and use the apply method, or a function that works well on a single pandas DataFrame and use the map_partitions method.
For the error that you're getting, I would first verify that your function works on a plain pandas DataFrame.
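A minimal sketch of the row-wise version described above, reusing the column names from the question; the helper name and the meta construction are illustrative, not from the original answer:
from shapely.geometry import Point

def add_point_column(df):
    # Runs on a plain pandas DataFrame, one partition at a time.
    df = df.copy()
    df['geom'] = df.apply(lambda row: Point(row['x_mp_100m'], row['y_mp_100m']), axis=1)
    return df

# Check the function on a small pandas sample first, as suggested above.
sample = add_point_column(data.head())
print(sample)

# meta tells Dask the output schema; an emptied copy of the checked sample works for that.
data_out = data.map_partitions(add_point_column, meta=sample.iloc[:0])
print(data_out.head())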