SpecificationError: Function names must be unique if there is no new column names assigned - pandas

I want to create a new column in the clin dataframe based on the following conditions:
1 if vals>=2*365 or is NAN
otherwise 0
I then assign the new column name as SURV.
import numpy as np
vals = clin['days_to_death'].astype(np.float32)
# non-LTS is 0, LTS is 1
surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
clin['SURV'] = clin.apply(surv, axis=1)
SpecificationError: Function names must be unique if there is no new column names assigned
SpecificationError Traceback (most recent call last)
<ipython-input-31-603dee8413ce> in <module>
5 # non-LTS is 0, LTS is 1
6 surv = [1 if ( v>=2*365 or np.isnan(v) ) else 0 for v in vals ]
----> 7 clin['SURV'] = clin.apply(surv, axis=1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/apply.py in get_result(self)
146 # multiple values for keyword argument "axis"
147 return self.obj.aggregate( # type: ignore[misc]
--> 148 self.f, axis=self.axis, *self.args, **self.kwds
149 )
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/frame.py in aggregate(self, func, axis, *args, **kwargs)
7572 axis = self._get_axis_number(axis)
-> 7574 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
7576 result = None
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/aggregation.py in reconstruct_func(func, **kwargs)
93 # there is no reassigned name
94 raise SpecificationError(
---> 95 "Function names must be unique if there is no new column names "
96 "assigned"
97 )
SpecificationError: Function names must be unique if there is no new column names assigned
clin = pd.DataFrame([[1, '466', '47', 0, '90'],
[1, '357', '54', 1, '80'],
[1, '108', '72', 1, '60'],
[1, '254', '51', 0, '80'],
[1, '138', '78', 1, '80'],
[0, nan, '67', 0, '60']], columns=['vital_status', 'days_to_death', 'age_at_initial_pathologic_diagnosis',
'gender', 'karnofsky_performance_score'], index=['TCGA-06-1806', 'TCGA-06-5408', 'TCGA-06-5410', 'TCGA-06-5411',
'TCGA-06-5412', 'TCGA-06-5413'])
Expected output:

Make a new column of all 0's and then update the column with your desired parameters.
# SURV is 1 when days_to_death is at least 2*365 or is missing, otherwise 0.
days_numeric = pd.to_numeric(clin.days_to_death)
is_long_term = days_numeric.ge(2*365) | clin.days_to_death.isna()
clin['SURV'] = 0
clin.loc[is_long_term, 'SURV'] = 1
vital_status days_to_death age_at_initial_pathologic_diagnosis gender karnofsky_performance_score SURV
TCGA-06-1806 1 466 47 0 90 0
TCGA-06-5408 1 357 54 1 80 0
TCGA-06-5410 1 108 72 1 60 0
TCGA-06-5411 1 254 51 0 80 0
TCGA-06-5412 1 138 78 1 80 0
TCGA-06-5413 0 NaN 67 0 60 1


np.where condition is not getting satisfied

In the following line of code, I get the error shown below.
d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
If the numerator or denominator is 0, the np.nan branch should be selected and d3["WOE"] should be NaN. Why is the following error being produced?
FloatingPointError Traceback (most recent call last)
<ipython-input-56-a9b015683238> in <module>
----> 1 final_iv, IV = data_vars(df_leads_short,df_leads_short.close_flag)
2 IV.sort_values('IV')
<ipython-input-55-5530ad13fa5a> in data_vars(df1, target)
122 count = count + 1
123 else:
--> 124 conv = char_bin(target, df1[i])
125 conv["VAR_NAME"] = i
126 count = count + 1
<ipython-input-55-5530ad13fa5a> in char_bin(Y, X)
92 d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
---> 94 d3["WOE"] = np.where(((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0)) ,np.nan ,np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
95 #d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
96 d3["IV"] = np.where((d3.DIST_EVENT==0) | (d3.DIST_NON_EVENT ==0 ),np.nan ,(d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
/opt/conda/lib/python3.7/site-packages/pandas/core/generic.py in __array_ufunc__(self, ufunc, method, *inputs, **kwargs)
1934 self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any
1935 ):
-> 1936 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
1938 # ideally we would define this to avoid the getattr checks, but
/opt/conda/lib/python3.7/site-packages/pandas/core/arraylike.py in array_ufunc(self, ufunc, method, *inputs, **kwargs)
356 # ufunc(series, ...)
357 inputs = tuple(extract_array(x, extract_numpy=True) for x in inputs)
--> 358 result = getattr(ufunc, method)(*inputs, **kwargs)
359 else:
360 # ufunc(dataframe)
FloatingPointError: divide by zero encountered in log
We can do
# Rows where either distribution is zero cannot be fed to the log/division.
cond = (d3.DIST_EVENT == 0) | (d3.DIST_NON_EVENT == 0)
# Compute the log only on the valid rows; rows excluded by the mask are not
# written at all (they show up as NaN when "WOE" is a new column).
d3.loc[~cond, "WOE"] = np.log(d3.loc[~cond, "DIST_EVENT"] / d3.loc[~cond, "DIST_NON_EVENT"])
np.where still has to evaluate np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT) for every row before it can choose between the two branches, so the original line still raises the same error; np.where only performs the selection.

Regarding the error message "Invalid comparison between dtype=datetime64[ns] and date"

I was trying to run the following two segments, which are part of this Databricks tutorial.
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
from datetime import date
# get historical actuals & predictions for comparison
actuals_pd = history_pd[ history_pd['ds'] < date(2018, 1, 1) ]['y'] # line 1
predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat'] # line 2
However, I got the error message TypeError: Invalid comparison between dtype=datetime64[ns] and date from predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat']. The previous line, which seems very similar, does not raise this error. I also printed the types of predicted_pd and actuals_pd for reference.
TypeError Traceback (most recent call last)
<ipython-input-15-748394f8994f> in <module>
----> 1 predicted_pd = forecast_pd[ forecast_pd['ds'] < date(2018, 1, 1) ]['yhat']
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
---> 65 return method(self, other)
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\__init__.py in wrapper(self, other)
368 rvalues = extract_array(other, extract_numpy=True)
--> 370 res_values = comparison_op(lvalues, rvalues, op)
372 return self._construct_result(res_values, name=res_name)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\array_ops.py in comparison_op(left, right, op)
228 if should_extension_dispatch(lvalues, rvalues):
229 # Call the method on lvalues
--> 230 res_values = op(lvalues, rvalues)
232 elif is_scalar(rvalues) and isna(rvalues):
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\common.py in new_method(self, other)
63 other = item_from_zerodim(other)
---> 65 return method(self, other)
67 return new_method
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\arrays\datetimelike.py in wrapper(self, other)
116 other = _validate_comparison_value(self, other)
117 except InvalidComparison:
--> 118 return invalid_comparison(self, other, op)
120 dtype = getattr(other, "dtype", None)
~\Anaconda3\envs\sparkenv\lib\site-packages\pandas\core\ops\invalid.py in invalid_comparison(left, right, op)
32 else:
33 typ = type(right).__name__
---> 34 raise TypeError(f"Invalid comparison between dtype={left.dtype} and {typ}")
35 return res_values
TypeError: Invalid comparison between dtype=datetime64[ns] and date
Pandas dates default to datetime64[ns], so you don't want to compare them to datetime.date objects. Instead, you can just use a date string and pandas will handle the comparison correctly. Also, if you use loc to specify the rows and columns, you will get a cleaner syntax than in your examples.
import pandas as pd

# Normalize both 'ds' columns to datetime64 and compare against a Timestamp.
# A column of plain datetime.date objects (object dtype) cannot be compared
# to a string, while a datetime64[ns] column cannot be compared to a date;
# converting first makes the same filter work for either representation.
datestr = '2018-01-01'
cutoff = pd.Timestamp(datestr)
actuals_pd = history_pd.loc[pd.to_datetime(history_pd['ds']) < cutoff, 'y'] # line 1
predicted_pd = forecast_pd.loc[pd.to_datetime(forecast_pd['ds']) < cutoff, 'yhat'] # line 2

What am i missing? sklearn fit module

I think the input to .fit in machine learning cannot be text. What should I use instead, or how should I change my code? This AI should be trained on Year, Month and Day, and it should output the crops (First, Second, Third). (I don't know what more details to add, but I need to post more details, so I'm just typing random stuff at the moment.)
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
event_data = pd.read_excel("Jacob's Farming Contest.xlsx")
event_data.fillna(0, inplace=True)
X = event_data.drop(columns=['First Crop', 'Second Crop', 'Third Crop'])
y = event_data.drop(columns=['Year', 'Month', 'Day'])
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# predictions = model.predict(X_test)
TypeError Traceback (most recent call last)
<ipython-input-44-3fbcae548642> in <module>
12 model = DecisionTreeClassifier()
---> 13 model.fit(X_train, y_train)
14 # predictions = model.predict(X_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
888 """
--> 890 super().fit(
891 X, y,
892 sample_weight=sample_weight,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\tree\_classes.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
180 if is_classification:
--> 181 check_classification_targets(y)
182 y = np.copy(y)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y)
167 y : array-like
168 """
--> 169 y_type = type_of_target(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in type_of_target(y)
248 raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
--> 250 if is_multilabel(y):
251 return 'multilabel-indicator'
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py in is_multilabel(y)
150 _is_integral_float(np.unique(y.data))))
151 else:
--> 152 labels = np.unique(y)
154 return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint
<__array_function__ internals> in unique(*args, **kwargs)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in unique(ar, return_index, return_inverse, return_counts, axis)
261 ar = np.asanyarray(ar)
262 if axis is None:
--> 263 ret = _unique1d(ar, return_index, return_inverse, return_counts)
264 return _unpack_tuple(ret)
C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\arraysetops.py in _unique1d(ar, return_index, return_inverse, return_counts)
309 aux = ar[perm]
310 else:
--> 311 ar.sort()
312 aux = ar
313 mask = np.empty(aux.shape, dtype=np.bool_)
TypeError: '<' not supported between instances of 'str' and 'int'
This is Jacob's Farming Contest.xlsx:
Year Month Day First Crop Second Crop Third Crop
0 101 1 1 0 0 0
1 101 1 2 Cactus Carrot Cocoa Beans
2 101 1 3 0 0 0
3 101 1 4 0 0 0
4 101 1 5 Mushroom Sugar CaNether Wart Wheat
... ... ... ... ... ... ...
367 101 12 27 Cactus Carrot Mushroom
368 101 12 28 0 0 0
369 101 12 29 0 0 0
370 101 12 30 Cocoa Beans Pumpkin Sugar CaNether Wart
371 101 12 31 0 0 0

Sorting pandas dataframe by groups

I would like to sort a dataframe by certain priority rules.
I've achieved this in the code below but I think this is a very hacky solution.
Is there a more proper Pandas way of doing this?
import pandas as pd
import numpy as np
df=pd.DataFrame({"Primary Metric":[80,100,90,100,80,100,80,90,90,100,90,90,80,90,90,80,80,80,90,90,100,80,80,100,80],
"Secondary Metric Flag":[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0],
"Secondary Value":[15, 59, 70, 56, 73, 88, 83, 64, 12, 90, 64, 18, 100, 79, 7, 71, 83, 3, 26, 73, 44, 46, 99,24, 20],
"Final Metric":[222, 883, 830, 907, 589, 93, 479, 498, 636, 761, 851, 349, 25, 405, 132, 491, 253, 318, 183, 635, 419, 885, 305, 258, 924]})
Primary_List=list(np.unique(df['Primary Metric']))
for p in Primary_List:
lol=df[df["Primary Metric"]==p]
lol.sort_values(["Secondary Metric Flag"],ascending = False)
pt1=lol[lol["Secondary Metric Flag"]==1].sort_values(by=['Secondary Value', 'Final Metric'], ascending=[False, False])
pt0=lol[lol["Secondary Metric Flag"]==0].sort_values(["Final Metric"],ascending = False)
The priority rules are:
First sort by the 'Primary Metric', then by the 'Secondary Metric Flag'.
If the 'Secondary Metric Flag' == 1, sort by 'Secondary Value', then
by the 'Final Metric'.
If it is 0, go straight to the 'Final Metric'.
Appreciate any feedback.
You do not need a for loop or groupby here; just split the frame by the flag and use sort_values on each part.
# Flagged rows: order by primary metric, then secondary value and final metric (both descending).
df1 = df.loc[df['Secondary Metric Flag'] == 1].sort_values(
    by=['Primary Metric', 'Secondary Value', 'Final Metric'],
    ascending=[True, False, False])
# Unflagged rows: order by primary metric, then final metric (descending).
df0 = df.loc[df['Secondary Metric Flag'] == 0].sort_values(
    ['Primary Metric', 'Final Metric'], ascending=[True, False])
# Interleave the two parts by primary metric. The sort must be stable so the
# order established above (flagged rows first, each part already ordered) is
# kept within every primary-metric group; the default quicksort gives no such
# guarantee.
df = pd.concat([df1, df0]).sort_values('Primary Metric', kind='mergesort')
sorted with loc
def k(t):
    """Return the sort key for the row of the module-level ``df`` labelled ``t``.

    The row is unpacked positionally as (primary, flag, value, final), so
    ``df`` is assumed to have exactly those four columns in that order.
    Every component is negated so that an ascending ``sorted`` yields
    descending order; ``-s * v`` is zero for rows whose flag is 0, which
    removes the secondary value from the comparison for unflagged rows.
    """
    p, s, v, f = df.loc[t]
    return (-p, -s, -s * v, -f)
df.loc[sorted(df.index, key=k)]
Primary Metric Secondary Metric Flag Secondary Value Final Metric
9 100 1 90 761
5 100 1 88 93
1 100 1 59 883
3 100 1 56 907
23 100 1 24 258
20 100 0 44 419
13 90 1 79 405
19 90 1 73 635
7 90 1 64 498
11 90 1 18 349
10 90 0 64 851
2 90 0 70 830
8 90 0 12 636
18 90 0 26 183
14 90 0 7 132
15 80 1 71 491
21 80 1 46 885
17 80 1 3 318
24 80 0 20 924
4 80 0 73 589
6 80 0 83 479
22 80 0 99 305
16 80 0 83 253
0 80 0 15 222
12 80 0 100 25
sorted with itertuples
def k(t):
    """Return the sort key for one ``(index, primary, flag, value, final)`` tuple.

    The leading index element is ignored. Every component is negated so that
    an ascending ``sorted`` yields descending order; ``-s * v`` is zero when
    the flag is 0, which removes the secondary value from the comparison for
    unflagged rows.
    """
    _, p, s, v, f = t
    return (-p, -s, -s * v, -f)
idx, *tups = zip(*sorted(df.itertuples(), key=k))
pd.DataFrame(dict(zip(df, tups)), idx)
# lexsort variant: np.lexsort uses the LAST key as the primary sort key, so
# the keys are written in priority order and the list is reversed.
p = df['Primary Metric']
s = df['Secondary Metric Flag']
v = df['Secondary Value']
f = df['Final Metric']
a = np.lexsort([
    -p, -s, -s * v, -f
][::-1])
# `a` holds positional indices, hence iloc rather than loc.
df.iloc[a]
Construct New DataFrame
# Build a helper frame whose plain ascending sort gives the wanted order:
# primary, flag and final metric are negated, and the secondary value is
# multiplied by the (already negated) flag so it is 0 for unflagged rows.
# The helper is sorted on all its columns and its index is used to reorder
# the original df.
df.mul([-1, -1, 1, -1]).assign(
    **{'Secondary Value': lambda d: d['Secondary Metric Flag'] * d['Secondary Value']}
).pipe(
    lambda d: df.loc[d.sort_values([*d]).index]
)

How to fix "Exception: Data must be 1-dimensional" error when running Kmeans

I have resolved all errors up until now, but I do not understand why I get the error "Exception: Data must be 1-dimensional".
Here is my code. Here is a link to the Excel file I'm using.
import pandas as pd
import numpy as np
import warnings
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer
from sklearn.cluster import KMeans
df1 = pd.read_excel('PERM_Disclosure_Data_FY2018_EOYV2.xlsx', 'PERM_FY2018')
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
df1.CASE_STATUS[df1['CASE_STATUS']=='Certified-Expired'] = 'Certified'
df1 = df1[df1.CASE_STATUS != 'Withdrawn']
df1 = df1.dropna()
df1 = df1[df1.PW_AMOUNT_9089 != '#############']
df1 = df1.dropna(subset=['PW_AMOUNT_9089'])
df1 = df1.dropna(subset=['CASE_STATUS'])
df1 = df1.dropna(subset=['PW_SOC_TITLE'])
df1.PW_AMOUNT_9089 = df1.PW_AMOUNT_9089.astype(float)
df1=df1.iloc[:, [2,4,5]]
enc = LabelBinarizer()
y = enc.fit_transform(df1.CASE_STATUS)[:, [0]]
at this point the output for y is an array:
then I define XZ
le = preprocessing.LabelEncoder()
X = df1.iloc[:, [1]]
Z = df1.iloc[:, [2]]
X2 = X.apply(le.fit_transform)
XZ = pd.concat([X2,Z], axis=1)
the output for XZ is:
12 176 60778.0
13 456 100901.0
14 134 134389.0
15 134 104936.0
16 134 95160.0
17 294 66976.0
18 73 38610.0
19 598 122533.0
20 220 109574.0
21 99 67850.0
22 399 132018.0
23 68 56118.0
24 139 136781.0
25 134 111405.0
26 598 58573.0
27 362 75067.0
28 598 85862.0
29 572 33301.0
30 598 112840.0
31 134 134971.0
32 176 100568.0
33 176 100568.0
34 626 19614.0
35 153 26354.0
36 405 79248.0
37 220 93350.0
38 139 153213.0
39 598 131997.0
40 598 131997.0
41 1 90438.0
... ... ...
119741 495 23005.0
119742 63 46030.0
119743 153 20301.0
119744 95 21965.0
119745 153 29890.0
119746 295 79680.0
119747 349 79498.0
119748 223 38930.0
119749 223 38930.0
119750 570 39160.0
119751 302 119392.0
119752 598 106001.0
119753 416 64230.0
119754 598 115482.0
119755 99 80205.0
119756 134 78329.0
119757 598 109325.0
119758 598 109325.0
119759 570 49770.0
119760 194 18117.0
119761 404 46987.0
119762 189 35131.0
119763 73 49900.0
119764 323 32240.0
119765 372 28122.0
119766 468 67974.0
119767 399 78520.0
119768 329 25875.0
119769 329 25875.0
119770 601 82098.0
I then continue:
from sklearn.model_selection import train_test_split
XZ_train, XZ_test, y_train, y_test = train_test_split(XZ, y,
test_size = .25,
stratify=y )
# loading library
from pandas_ml import ConfusionMatrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# instantiate learning model loop(k = i)
for weights in ['uniform', 'distance']:
for i in range(1,11,2):
knn = KNeighborsClassifier(n_neighbors=i, weights=weights)
# fitting the model
knn.fit(XZ_train, y_train)
# predict the response
pred = knn.predict(XZ_test)
confusion = ConfusionMatrix(y_test, pred)
if i<11:
# evaluate accuracy
print('Weight Measure:', knn.weights)
print('n_neighbors=', knn.n_neighbors)
print('Accuracy=', accuracy_score(y_test, pred))
#print('Confusion Matrix')
The error I get is as follows:
G:\Anaconda\lib\site-packages\ipykernel_launcher.py:11: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
# This is added back by InteractiveShellApp.init_path()
Exception Traceback (most recent call last)
<ipython-input-20-bf6054d911ba> in <module>
12 # predict the response
13 pred = knn.predict(XZ_test)
---> 14 confusion = ConfusionMatrix(y_test, pred)
15 if i<11:
16 # evaluate accuracy
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\cm.py in __new__(cls, y_true, y_pred, *args, **kwargs)
21 if len(set(uniq_true) - set(uniq_pred)) == 0:
22 from pandas_ml.confusion_matrix.bcm import BinaryConfusionMatrix
---> 23 return BinaryConfusionMatrix(y_true, y_pred, *args, **kwargs)
24 return LabeledConfusionMatrix(y_true, y_pred, *args, **kwargs)
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\bcm.py in __init__(self, *args, **kwargs)
19 def __init__(self, *args, **kwargs):
20 # super(BinaryConfusionMatrix, self).__init__(y_true, y_pred)
---> 21 super(BinaryConfusionMatrix, self).__init__(*args, **kwargs)
22 assert self.len() == 2, \
23 "Binary confusion matrix must have len=2 but \
G:\Anaconda\lib\site-packages\pandas_ml\confusion_matrix\abstract.py in __init__(self, y_true, y_pred, labels, display_sum, backend, true_name, pred_name)
31 self._y_true.name = self.true_name
32 else:
---> 33 self._y_true = pd.Series(y_true, name=self.true_name)
35 if isinstance(y_pred, pd.Series):
G:\Anaconda\lib\site-packages\pandas\core\series.py in __init__(self, data, index, dtype, name, copy, fastpath)
273 else:
274 data = _sanitize_array(data, index, dtype, copy,
--> 275 raise_cast_failure=True)
277 data = SingleBlockManager(data, index, fastpath=True)
G:\Anaconda\lib\site-packages\pandas\core\series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
4163 elif subarr.ndim > 1:
4164 if isinstance(data, np.ndarray):
-> 4165 raise Exception('Data must be 1-dimensional')
4166 else:
4167 subarr = com._asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
Is the data I am passing through not the correct type? The datatypes match the datatypes I've used in a past project so I thought I could replicate it here. For those wondering X is Company names that I encoded, Y is binarized case status, and Z is a wage amount in the float dtype.
"...the output for y is an array..." The array that you show is two-dimensional, with shape (n, 1). (One of the dimensions is trivial, but it is still 2-d.) Do something like y[:, 0] or y.ravel() to get a 1-d version.