Regarding the error message "Invalid comparison between dtype=datetime64[ns] and date" - pandas

I was trying to run the following two lines, which are part of this Databricks tutorial.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from datetime import date
# get historical actuals & predictions for comparison
actuals_pd = history_pd[ history_pd['ds'] < date(2018, 1, 1) ]['y'] # line 1
predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat'] # line 2
However, line 2, predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat'], raises TypeError: Invalid comparison between dtype=datetime64[ns] and date. Line 1, which seems very similar, does not raise this error. I also printed the types of predicted_pd and actuals_pd for reference.
TypeError Traceback (most recent call last)
<ipython-input-15-748394f8994f> in <module>
----> 1 predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat']
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\__init__.py in wrapper(self, other)
368 rvalues = extract_array(other, extract_numpy=True)
369
--> 370 res_values = comparison_op(lvalues, rvalues, op)
371
372 return self._construct_result(res_values, name=res_name)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\array_ops.py in comparison_op(left, right, op)
228 if should_extension_dispatch(lvalues, rvalues):
229 # Call the method on lvalues
--> 230 res_values = op(lvalues, rvalues)
231
232 elif is_scalar(rvalues) and isna(rvalues):
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
64
---> 65 return method(self, other)
66
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\arrays\datetimelike.py in wrapper(self, other)
116 other = _validate_comparison_value(self, other)
117 except InvalidComparison:
--> 118 return invalid_comparison(self, other, op)
119
120 dtype = getattr(other, "dtype", None)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\invalid.py in invalid_comparison(left, right, op)
32 else:
33 typ = type(right).__name__
---> 34 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
35 return res_values
36
TypeError: Invalid comparison between dtype=datetime64[ns] and date

Pandas dates default to datetime64[ns], so you can't compare them directly to datetime.date objects. Instead, you can just use a date string and pandas will handle the comparison correctly. Also, if you use loc to specify the rows and columns, you get cleaner syntax than in your examples.
datestr = '2018-01-01'
actuals_pd = history_pd.loc[history_pd['ds'] < datestr, 'y'] # line 1
predicted_pd = forecast_pd.loc[forecast_pd['ds'] < datestr, 'yhat'] # line 2
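If you would rather keep working with datetime.date objects, another option (a minimal sketch using the same column names as above) is to wrap the date in a pandas Timestamp, which compares cleanly against datetime64[ns]:
from datetime import date
import pandas as pd

cutoff = pd.Timestamp(date(2018, 1, 1))  # Timestamp compares correctly with datetime64[ns]
actuals_pd = history_pd.loc[history_pd['ds'] < cutoff, 'y']
predicted_pd = forecast_pd.loc[forecast_pd['ds'] < cutoff, 'yhat']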

Related

I am unable to retrieve data from pandas_datareader. How will it work for Yahoo data?

PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[34], line 2
1 # !pip install pandas_datareader
----> 2 PG = wb.DataReader('PG',data_source = 'yahoo',start = '2000-1-1', end = '2001-1-1')
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File c:\Users\intiz\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas_datareader\data.py:379, in DataReader(name, data_source, start, end, retry_count, pause, session, api_key)
367 raise NotImplementedError(msg)
369 if data_source == "yahoo":
370 return YahooDailyReader(
371 symbols=name,
372 start=start,
373 end=end,
374 adjust_price=False,
375 chunksize=25,
376 retry_count=retry_count,
377 pause=pause,
378 session=session,
--> 379 ).read()
381 elif data_source == "iex":
...
--> 153 data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]
154 except KeyError:
155 msg = "No data fetched for symbol {} using {}"
TypeError: string indices must be integers
I need PG stock price information by date.
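The "string indices must be integers" failure happens inside pandas_datareader's Yahoo reader while parsing a response whose format Yahoo has since changed; the Yahoo source in pandas_datareader is known to break this way. One commonly suggested workaround (an assumption about your setup, not part of the original post) is to fetch the data with the yfinance package instead:
# pip install yfinance
import yfinance as yf

# yfinance maintains its own Yahoo Finance client and returns a DataFrame
PG = yf.download('PG', start='2000-01-01', end='2001-01-01')
print(PG.head())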

Translate Tweets using googletrans - AttributeError: 'NoneType' object has no attribute 'group'

I have a pandas dataframe with some German tweets. I want to translate these tweets to English using googletrans, applying it via a lambda function to my dataframe.
I use this code:
from googletrans import Translator
df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
And get this error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-16-ed0b3a6e6dd8> in <module>
----> 1 df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\pandas\core\series.py in apply(self, func, convert_dtype, args, **kwds)
4136 else:
4137 values = self.astype(object)._values
-> 4138 mapped = lib.map_infer(values, f, convert=convert_dtype)
4139
4140 if len(mapped) and isinstance(mapped[0], Series):
pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()
<ipython-input-16-ed0b3a6e6dd8> in <lambda>(x)
----> 1 df1['translated_tweet'] = df1['tweet'].apply(lambda x: Translator().translate(x, dest='en').text)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\client.py in translate(self, text, dest, src, **kwargs)
180
181 origin = text
--> 182 data = self._translate(text, dest, src, kwargs)
183
184 # this code will be updated when the format is changed.
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\client.py in _translate(self, text, dest, src, override)
76
77 def _translate(self, text, dest, src, override):
---> 78 token = self.token_acquirer.do(text)
79 params = utils.build_params(query=text, src=src, dest=dest,
80 token=token, override=override)
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\gtoken.py in do(self, text)
192
193 def do(self, text):
--> 194 self._update()
195 tk = self.acquire(text)
196 return tk
~\Downloads\WPy64-3920\python-3.9.2.amd64\lib\site-packages\googletrans\gtoken.py in _update(self)
60
61 # this will be the same as python code after stripping out a reserved word 'var'
---> 62 code = self.RE_TKK.search(r.text).group(1).replace('var ', '')
63 # unescape special ascii characters such like a \x3d(=)
64 code = code.encode().decode('unicode-escape')
AttributeError: 'NoneType' object has no attribute 'group'
What do I have to change here?
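The traceback ends in googletrans's token acquirer: RE_TKK.search() finds no match in Google's response, returns None, and the .group(1) call fails. This is a known breakage in the googletrans 3.0.0 release caused by a change on Google's side. A commonly suggested fix (a sketch; the exact version pin may need adjusting for your environment) is to install the 4.0.0rc1 pre-release and reuse a single Translator instance instead of constructing one per row:
# pip install googletrans==4.0.0rc1
from googletrans import Translator

translator = Translator()  # one shared instance instead of one per tweet
df1['translated_tweet'] = df1['tweet'].apply(
    lambda x: translator.translate(x, dest='en').text)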

np.where condition is not getting satisfied

In the following line of code, I get the error shown below.
d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
If the numerator or denominator is 0, the condition should be satisfied and d3["WOE"] should be nan. Why is the following error being produced?
---------------------------------------------------------------------------
FloatingPointError Traceback (most recent call last)
<ipython-input-56-a9b015683238> in <module>
----> 1 final_iv, IV = data_vars(df_leads_short,df_leads_short.close_flag)
2 IV.sort_values('IV')
<ipython-input-55-5530ad13fa5a> in data_vars(df1, target)
122 count = count + 1
123 else:
--> 124 conv = char_bin(target, df1[i])
125 conv["VAR_NAME"] = i
126 count = count + 1
<ipython-input-55-5530ad13fa5a> in char_bin(Y, X)
92 d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
93 d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
---> 94 d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
95 #d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
96 d3["IV"] = np.where((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0 ),np.nan ,(d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1934 self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
1935 ):
-> 1936 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
1937
1938 # ideally we would define this to avoid the getattr checks, but
/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py in array_ufunc(self, ufunc, method, *inputs, **kwargs)
356 # ufunc(series, ...)
357 inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
--> 358 result = getattr(ufunc, method)(*inputs, **kwargs)
359 else:
360 # ufunc(dataframe)
FloatingPointError: divide by zero encountered in log
We can do:
cond = (d3.DIST_EVENT == 0) | (d3.DIST_NON_EVENT == 0)
d3.loc[~cond, "WOE"] = np.log(d3.loc[~cond, "DIST_EVENT"] / d3.loc[~cond, "DIST_NON_EVENT"])
np.where still evaluates np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT) for every row before selecting anything, so it raises the same error; np.where is just selection between already-computed arrays.
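If you prefer to keep the single-expression np.where style, another option (a sketch, assuming the FloatingPointError comes from a global np.seterr set to raise) is to suppress the floating-point error while the log is computed and let np.where discard the affected rows:
import numpy as np

cond = (d3.DIST_EVENT == 0) | (d3.DIST_NON_EVENT == 0)
with np.errstate(divide='ignore', invalid='ignore'):
    # log(0) now yields -inf instead of raising; np.where replaces those rows with nan
    d3["WOE"] = np.where(cond, np.nan, np.log(d3.DIST_EVENT / d3.DIST_NON_EVENT))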

How to convert coordinate columns to Point column with Shapely and Dask?

I have the following problem. My data is a huge dataframe that looks like this (this is the head of the dataframe):
import pandas as pd
import dask.dataframe as dd
data = dd.read_csv(data_path)
data = data.persist()  # persist returns a new collection, so reassign it
print(data.head())
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner
0 100mN26840E43341 4334150 2684050 -1
1 100mN26840E43342 4334250 2684050 -1
2 100mN26840E43343 4334350 2684050 -1
3 100mN26840E43344 4334450 2684050 -1
4 100mN26840E43345 4334550 2684050 -1
I am using Dask to handle it. I now want to create a new column where the 'x_mp_100m' and 'y_mp_100m' are converted into a Shapely Point. For a single row, it would look like this:
from shapely.geometry import Point
test_df = data.head(1)
test_df = test_df.assign(geom=lambda k: Point(k.x_mp_100m,k.y_mp_100m))
print(test_df)
Gitter_ID_100m x_mp_100m y_mp_100m Einwohner geom
0 100mN26840E43341 4334150 2684050 -1 POINT (4334150 2684050)
I already tried the following code with Dask:
data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
When doing that, I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-17-b8de11d9b9b3> in <module>
----> 1 data_out.compute()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
395 keys = [x.__dask_keys__() for x in collections]
396 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 397 results = schedule(dsk, keys, **kwargs)
398 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
399
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, actors, **kwargs)
2319 try:
2320 results = self.gather(packed, asynchronous=asynchronous,
-> 2321 direct=direct)
2322 finally:
2323 for f in futures.values():
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1653 return self.sync(self._gather, futures, errors=errors,
1654 direct=direct, local_worker=local_worker,
-> 1655 asynchronous=asynchronous)
1656
1657 #gen.coroutine
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
671 return future
672 else:
--> 673 return sync(self.loop, func, *args, **kwargs)
674
675 def __repr__(self):
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
~\AppData\Local\Continuum\anaconda3\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
~\AppData\Local\Continuum\anaconda3\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1498 six.reraise(type(exception),
1499 exception,
-> 1500 traceback)
1501 if errors == 'skip':
1502 bad_keys.add(key)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\dask\dataframe\core.py in apply_and_enforce()
3682
3683 Ensures the output has the same columns, even if empty."""
-> 3684 df = func(*args, **kwargs)
3685 if isinstance(df, (pd.DataFrame, pd.Series, pd.Index)):
3686 if len(df) == 0:
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\frame.py in assign()
3549 if PY36:
3550 for k, v in kwargs.items():
-> 3551 data[k] = com.apply_if_callable(v, data)
3552 else:
3553 # <= 3.5: do all calculations first...
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\common.py in apply_if_callable()
327
328 if callable(maybe_callable):
--> 329 return maybe_callable(obj, **kwargs)
330
331 return maybe_callable
<ipython-input-16-d5710cb00158> in <lambda>()
----> 1 data_out = data.map_partitions(lambda df: df.assign(geom= lambda k: Point(k.x_mp_100m,k.y_mp_100m)), meta=pd.DataFrame)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in __init__()
47 BaseGeometry.__init__(self)
48 if len(args) > 0:
---> 49 self._set_coords(*args)
50
51 # Coordinate getters and setters
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in _set_coords()
130 self._geom, self._ndim = geos_point_from_py(args[0])
131 else:
--> 132 self._geom, self._ndim = geos_point_from_py(tuple(args))
133
134 coords = property(BaseGeometry._get_coords, _set_coords)
~\AppData\Local\Continuum\anaconda3\lib\site-packages\shapely\geometry\point.py in geos_point_from_py()
207 coords = ob
208 n = len(coords)
--> 209 dx = c_double(coords[0])
210 dy = c_double(coords[1])
211 dz = None
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\series.py in wrapper()
91 return converter(self.iloc[0])
92 raise TypeError("cannot convert the series to "
---> 93 "{0}".format(str(converter)))
94
95 wrapper.__name__ = "__{name}__".format(name=converter.__name__)
TypeError: cannot convert the series to <class 'float'>
So I think I am using the pandas assign() function in the wrong way, or there is a better-fitting function; I just cannot seem to wrap my head around it. Do you know a better way to handle this?
I also found this way:
data_out = data.map_partitions(lambda df: df.apply(lambda row: Point(row['x_mp_100m'],row['y_mp_100m']), axis=1))
But is that the most efficient way?
What you're doing seems fine. I would either find a function that works well on a single row and use the apply method, or find a function that works well on a single pandas dataframe and use the map_partitions method.
For the error you're getting, I would first verify that your function works on a pandas dataframe: in your assign call, Point(k.x_mp_100m, k.y_mp_100m) receives whole Series rather than scalar coordinates, which is what raises "cannot convert the series to <class 'float'>".
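As a concrete sketch of the map_partitions route (column names are taken from your head() output; the meta dtypes are assumptions):
from shapely.geometry import Point

def add_point_column(df):
    # Point() expects scalar coordinates, so build one Point per row
    return df.assign(
        geom=df.apply(lambda row: Point(row['x_mp_100m'], row['y_mp_100m']), axis=1))

data_out = data.map_partitions(
    add_point_column,
    meta={'Gitter_ID_100m': object, 'x_mp_100m': 'i8', 'y_mp_100m': 'i8',
          'Einwohner': 'i8', 'geom': object})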

How to fix "Exception: Data must be 1-dimensional" error when running Kmeans

I have resolved all errors up till now. I am not quite sure I understand the problem, except that I get the error "Exception: Data must be 1-dimensional".
Here is my code, and here is a link to the Excel file I'm using.
import pandas as pd
import numpy as np
import warnings
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans
df1 = pd.read_excel('PERM_Disclosure_Data_FY2018_EOYV2.xlsx', 'PERM_FY2018')
warnings.filterwarnings("ignore")
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
df1.loc[df1['CASE_STATUS'] == 'Certified-Expired', 'CASE_STATUS'] = 'Certified'
df1 = df1[df1.CASE_STATUS != 'Withdrawn']
df1 = df1.dropna()
df1 = df1[df1.PW_AMOUNT_9089 != '#############']
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
df1.PW_AMOUNT_9089 = df1.PW_AMOUNT_9089.astype(float)
df1=df1.iloc[:, [2,4,5]]
enc = LabelBinarizer()
y = enc.fit_transform(df1.CASE_STATUS)[:, [0]]
At this point the output for y is an array:
array([[0],
[0],
[0],
...,
[1],
[1],
[0]])
Then I define XZ:
le = preprocessing.LabelEncoder()
X = df1.iloc[:, [1]]
Z = df1.iloc[:, [2]]
X2 = X.apply(le.fit_transform)
XZ = pd.concat([X2,Z], axis=1)
The output for XZ is:
PW_SOC_TITLE PW_AMOUNT_9089
12 176 60778.0
13 456 100901.0
14 134 134389.0
15 134 104936.0
16 134 95160.0
17 294 66976.0
18 73 38610.0
19 598 122533.0
20 220 109574.0
21 99 67850.0
22 399 132018.0
23 68 56118.0
24 139 136781.0
25 134 111405.0
26 598 58573.0
27 362 75067.0
28 598 85862.0
29 572 33301.0
30 598 112840.0
31 134 134971.0
32 176 100568.0
33 176 100568.0
34 626 19614.0
35 153 26354.0
36 405 79248.0
37 220 93350.0
38 139 153213.0
39 598 131997.0
40 598 131997.0
41 1 90438.0
... ... ...
119741 495 23005.0
119742 63 46030.0
119743 153 20301.0
119744 95 21965.0
119745 153 29890.0
119746 295 79680.0
119747 349 79498.0
119748 223 38930.0
119749 223 38930.0
119750 570 39160.0
119751 302 119392.0
119752 598 106001.0
119753 416 64230.0
119754 598 115482.0
119755 99 80205.0
119756 134 78329.0
119757 598 109325.0
119758 598 109325.0
119759 570 49770.0
119760 194 18117.0
119761 404 46987.0
119762 189 35131.0
119763 73 49900.0
119764 323 32240.0
119765 372 28122.0
119766 468 67974.0
119767 399 78520.0
119768 329 25875.0
119769 329 25875.0
119770 601 82098.0
I then continue:
from sklearn.model_selection import train_test_split
XZ_train, XZ_test, y_train, y_test = train_test_split(XZ, y,
                                                      test_size=.25,
                                                      random_state=20,
                                                      stratify=y)
# loading library
from pandas_ml import ConfusionMatrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# instantiate learning model loop(k = i)
for weights in ['uniform', 'distance']:
    for i in range(1, 11, 2):
        knn = KNeighborsClassifier(n_neighbors=i, weights=weights)
        # fitting the model
        knn.fit(XZ_train, y_train)
        # predict the response
        pred = knn.predict(XZ_test)
        confusion = ConfusionMatrix(y_test, pred)
        if i < 11:
            # evaluate accuracy
            print('Weight Measure:', knn.weights)
            print('n_neighbors=', knn.n_neighbors)
            print('Accuracy=', accuracy_score(y_test, pred))
            #print('')
            #print('Confusion Matrix')
            #print(confusion)
            print('-----------------------------')
The error I get is as follows:
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:11: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
# This is added back by InteractiveShellApp.init_path()
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-20-bf6054d911ba> in <module>
12 # predict the response
13 pred = knn.predict(XZ_test)
---> 14 confusion = ConfusionMatrix(y_test, pred)
15 if i<11:
16 # evaluate accuracy
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\cm.py in __new__(cls, y_true, y_pred, *args, **kwargs)
21 if len(set(uniq_true) - set(uniq_pred)) == 0:
22 from pandas_ml.confusion_matrix.bcm import BinaryConfusionMatrix
---> 23 return BinaryConfusionMatrix(y_true, y_pred, *args, **kwargs)
24 return LabeledConfusionMatrix(y_true, y_pred, *args, **kwargs)
25
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\bcm.py in __init__(self, *args, **kwargs)
19 def __init__(self, *args, **kwargs):
20 # super(BinaryConfusionMatrix, self).__init__(y_true, y_pred)
---> 21 super(BinaryConfusionMatrix, self).__init__(*args, **kwargs)
22 assert self.len() == 2, \
23 "Binary confusion matrix must have len=2 but \
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\abstract.py in __init__(self, y_true, y_pred, labels, display_sum, backend, true_name, pred_name)
31 self._y_true.name = self.true_name
32 else:
---> 33 self._y_true = pd.Series(y_true, name=self.true_name)
34
35 if isinstance(y_pred, pd.Series):
G:\Anaconda\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
273 else:
274 data = _sanitize_array(data, index, dtype, copy,
--> 275 raise_cast_failure=True)
276
277 data = SingleBlockManager(data, index, fastpath=True)
G:\Anaconda\lib\site-packages\pandas\core\series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
4163 elif subarr.ndim > 1:
4164 if isinstance(data, np.ndarray):
-> 4165 raise Exception('Data must be 1-dimensional')
4166 else:
4167 subarr = com._asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
Is the data I am passing through not the correct type? The datatypes match the ones I've used in a past project, so I thought I could replicate that here. For those wondering: X is company names that I encoded, y is the binarized case status, and Z is a wage amount in the float dtype.
"...the output for y is an array..." The array that you show is two-dimensional, with shape (n, 1). (One of the dimensions is trivial, but it is still 2-d.) Do something like y[:, 0] or y.ravel() to get a 1-d version.