Alright, I just started a new job and have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor, but I have been working on this code for hours and can't get it to work. Hopefully somebody here can help me.
Here is the code I have been working on:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv', converters={'GENDER': lambda x: int(x == 'Male')}, usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()
Output: MEMBER_ID Age Dement GENDER
0 000000002 01 36 NaN 0
1 000000002 01 36 NaN 0
2 000000002 01 36 NaN 0
3 000000002 01 36 NaN 0
4 000000002 01 36 NaN 0
df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID 999 non-null object
Age 999 non-null int64
Dement 999 non-null int64
GENDER 999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB
freq = ((df_gp.Age.value_counts(normalize = True).reset_index().sort_values(by = 'index').Age)*100).tolist()
number_gp = 7
def ax_settings(ax, var_name, x_min, x_max):
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_edgecolor('#444444')
    ax.spines['bottom'].set_linewidth(2)
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold", transform=ax.transAxes)
    return None
fig = plt.figure(figsize=(12, 7))
gs = gridspec.GridSpec(nrows=number_gp,
                       ncols=2,
                       figure=fig,
                       width_ratios=[3, 1],
                       height_ratios=[1] * number_gp,
                       wspace=0.2, hspace=0.05
                       )
ax = [None]*(number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
for i in range(number_gp):
    ax[i] = fig.add_subplot(gs[i, 0])
    ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
    if i < (number_gp - 1): ax[i].set_xticks([])
ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0,100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform = ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize = 14)
plt.show()
I am then met with:
C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = method(y)
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-38-8665030edb1c> in <module>()
24 ax[i] = fig.add_subplot(gs[i, 0])
25 ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
27 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
28 if i < (number_gp - 1): ax[i].set_xticks([])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1281
1282 with np.errstate(all='ignore'):
-> 1283 res = na_op(values, other)
1284 if is_scalar(res):
1285 raise TypeError('Could not compare {typ} type with Series'
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1167 result = method(y)
1168 if result is NotImplemented:
-> 1169 raise TypeError("invalid type comparison")
1170 else:
1171 result = op(x, y)
TypeError: invalid type comparison
Please help, I have been faced with an absurd number of errors this week.
Related
What is the solution to the following error in TensorFlow?
ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2.
I tried TensorFlow versions 2.9.1 and 2.4.0.
The following toy example reproduces the error.
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x: tf.cast([x], tf.float32))
def func1(x):
    y1 = 2.0 * x
    y2 = -3.0 * x
    return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=(None,))
for x, y in d2.as_numpy_iterator():
    pass
The full error is:
ValueError Traceback (most recent call last)
~/Documents/pythonProject/tfProjects/asr/transformer/dataset.py in <module>
256 return tuple([y1, y2])
257 d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
---> 258 d3 = d2.padded_batch(3, padded_shapes=(None,))
259 for x, y in d2.as_numpy_iterator():
260 pass
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in padded_batch(self, batch_size, padded_shapes, padding_values, drop_remainder, name)
1887 padding_values,
1888 drop_remainder,
-> 1889 name=name)
1890
1891 def map(self,
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, batch_size, padded_shapes, padding_values, drop_remainder, name)
5171
5172 input_shapes = get_legacy_output_shapes(input_dataset)
-> 5173 flat_padded_shapes = nest.flatten_up_to(input_shapes, padded_shapes)
5174
5175 flat_padded_shapes_as_tensors = []
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in flatten_up_to(shallow_tree, input_tree)
377 `input_tree`.
378 """
--> 379 assert_shallow_structure(shallow_tree, input_tree)
380 return list(_yield_flat_up_to(shallow_tree, input_tree))
381
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in assert_shallow_structure(shallow_tree, input_tree, check_types)
290 if len(input_tree) != len(shallow_tree):
291 raise ValueError(
--> 292 "The two structures don't have the same sequence length. Input "
293 f"structure has length {len(input_tree)}, while shallow structure "
294 f"has length {len(shallow_tree)}.")
ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2.
The dataset d2 yields a tuple of two tensors, so padded_shapes must be a matching tuple of two shapes rather than a single shape. The following modification of the padded_shapes argument resolves the error.
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x: tf.cast([x], tf.float32))
def func1(x):
    y1 = 2.0 * x
    y2 = -3.0 * x
    return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=([None], [None]))
for x, y in d2.as_numpy_iterator():
    pass
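A quick way to sanity-check what structure padded_shapes has to match is to inspect the dataset's element_spec; a minimal sketch, assuming TensorFlow 2.x and the d2 defined above:
# element_spec mirrors the structure padded_shapes must match: here a 2-tuple
# of float32 tensors (shapes unknown because of py_function), so padded_shapes
# needs a matching 2-tuple such as ([None], [None]).
print(d2.element_spec)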
Some formerly working code fails after I migrated from pandas 0.25.3 to 1.2.4. Here is a reproducible example:
import numpy as np
import pandas as pd
print(f"pandas: {pd.__version__}")
!python --version
cols = pd.MultiIndex.from_product([['coz',], ['alpha', 'beta', 'gamma']], names=['health', 'protocol'])
index=pd.date_range(start="1jan2020", end=None, periods=5, freq="d", name="Date")
data = np.random.rand(5,3)
df = pd.DataFrame(data=data, index=index, columns=cols)
def foo(row):
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']
df.apply(foo, axis="columns")
in 0.25.3 this worked as I wanted:
pandas: 0.25.3
Python 3.7.11
Date
2020-01-01 False
2020-01-02 True
2020-01-03 False
2020-01-04 True
2020-01-05 False
Freq: D, dtype: bool
but in 1.2.4 the same code throws an error apparently due to the droplevel:
pandas: 1.2.4
Python 3.9.4
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-22-4242d4f13ab1> in <module>
15 return row['beta'] > row['alpha']
16
---> 17 df.apply(foo, axis="columns")
~\.conda\envs\yagi\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in get_result(self)
183 return self.apply_raw()
184
--> 185 return self.apply_standard()
186
187 def apply_empty_result(self):
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in apply_standard(self)
274
275 def apply_standard(self):
--> 276 results, res_index = self.apply_series_generator()
277
278 # wrap results
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
288 for i, v in enumerate(series_gen):
289 # ignore SettingWithCopy here in case the user mutates
--> 290 results[i] = self.f(v)
291 if isinstance(results[i], ABCSeries):
292 # If we have a view on v, we need to make a copy because
<ipython-input-22-4242d4f13ab1> in foo(row)
12
13 def foo(row):
---> 14 row.index = row.index.droplevel(0)
15 return row['beta'] > row['alpha']
16
~\.conda\envs\yagi\lib\site-packages\pandas\core\indexes\base.py in droplevel(self, level)
1609 levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]
1610
-> 1611 return self._drop_level_numbers(levnums)
1612
1613 def _drop_level_numbers(self, levnums: List[int]):
~\.conda\envs\yagi\lib\site-packages\pandas\core\indexes\base.py in _drop_level_numbers(self, levnums)
1619 return self
1620 if len(levnums) >= self.nlevels:
-> 1621 raise ValueError(
1622 f"Cannot remove {len(levnums)} levels from an index with "
1623 f"{self.nlevels} levels: at least one level must be left."
ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.
What seems to be happening in 1.2.4 is that the droplevel is accumulating: the first row passed into apply() has a 2-level index, but the second row passed into apply() has a single-level index, and that is where the error is raised. This I don't understand at all.
Here is the same toy example with a print diagnostic:
import numpy as np
import pandas as pd
print(f"pandas: {pd.__version__}")
!python --version
cols = pd.MultiIndex.from_product([['coz',], ['alpha', 'beta', 'gamma']], names=['health', 'protocol'])
index=pd.date_range(start="1jan2020", end=None, periods=5, freq="d", name="Date")
data = np.random.rand(5,3)
df = pd.DataFrame(data=data, index=index, columns=cols)
def foo(row):
    print(f"\nROW: {row} END")
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']
foo = df.apply(foo, axis="columns")
correct output:
pandas: 0.25.3
Python 3.7.11
ROW: health protocol
coz alpha 0.054421
beta 0.922885
gamma 0.843888
Name: 2020-01-01T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.962803
beta 0.827594
gamma 0.260147
Name: 2020-01-02T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.680902
beta 0.124468
gamma 0.960604
Name: 2020-01-03T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.133331
beta 0.664735
gamma 0.623440
Name: 2020-01-04T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.984164
beta 0.578701
gamma 0.538993
Name: 2020-01-05T00:00:00.000000000, dtype: float64 END
Date
2020-01-01 True
2020-01-02 False
2020-01-03 False
2020-01-04 True
2020-01-05 False
Freq: D, dtype: bool
failing output:
pandas: 1.2.4
Python 3.9.4
ROW: health protocol
coz alpha 0.374974
beta 0.137263
gamma 0.494556
Name: 2020-01-01 00:00:00, dtype: float64 END
ROW: protocol
alpha 0.591057
beta 0.560530
gamma 0.183457
Name: 2020-01-02 00:00:00, dtype: float64 END
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-bbef1b39f13a> in <module>
16 return row['beta'] > row['alpha']
17
---> 18 foo = df.apply(foo, axis="columns")
...
ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.
========
So I can fix this by operating on a .copy() of the row, but this feels like a hack. I don't understand why the code started behaving this way after the version change.
def foo(row):
    # print(f"\nROW: {row} END")
    row = row.copy()
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']
https://pandas.pydata.org/docs/user_guide/gotchas.html#mutating-with-user-defined-function-udf-methods
Do not mutate inside user-defined functions passed to methods like .apply(). In newer pandas versions, apply() may reuse the same row Series object between iterations for performance, so mutating row.index in place leaks into the next call, which is exactly why the second row already arrives with a single-level index. I was just lucky that it worked in 0.25.3.
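As an alternative to copying every row, a minimal non-mutating sketch (assuming the same df as in the question) is to drop the extra column level once on the frame, so the per-row function never touches shared state:
import numpy as np
import pandas as pd

cols = pd.MultiIndex.from_product([['coz'], ['alpha', 'beta', 'gamma']], names=['health', 'protocol'])
index = pd.date_range(start="1jan2020", periods=5, freq="d", name="Date")
df = pd.DataFrame(data=np.random.rand(5, 3), index=index, columns=cols)

# Drop the 'health' level from the columns once, outside apply(),
# instead of mutating row.index inside the applied function.
result = df.droplevel('health', axis=1).apply(lambda row: row['beta'] > row['alpha'], axis=1)
print(result)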
I want to store NumPy arrays as values for cells in my DataFrame. Is there any way to do this?
Basically I have pixel data, a (512, 512) NumPy array, that I want to save as the value of the pixel_data column corresponding to its particular id in the ID column of my DataFrame. How can I do this?
Here's what I tried:
for f in train_files[:10]:
    id_tmp = f.split('/')[4].split('.')[0]
    first_dcm = pydicom.read_file(f)
    img = first_dcm.pixel_array
    window = get_windowing(first_dcm)
    image = window_image(img, *window)
    train.loc[train.Image == id_tmp, 'img_before_w'] = img
    train.loc[train.Image == id_tmp, 'img_after_w'] = image
The error I got:
ValueError Traceback (most recent call last)
<ipython-input-47-32236f8c9ccc> in <module>
5 window = get_windowing(first_dcm)
6 image = window_image(img, *window)
----> 7 train.loc[train.Image == id_tmp, 'img_before_w'] = img
8 train.loc[train.Image == id_tmp, 'img_after_w'] = image
9
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in __setitem__(self, key, value)
203 key = com.apply_if_callable(key, self.obj)
204 indexer = self._get_setitem_indexer(key)
--> 205 self._setitem_with_indexer(indexer, value)
206
207 def _validate_key(self, key, axis: int):
/opt/conda/lib/python3.6/site-packages/pandas/core/indexing.py in _setitem_with_indexer(self, indexer, value)
525 if len(labels) != value.shape[1]:
526 raise ValueError(
--> 527 "Must have equal len keys and value "
528 "when setting with an ndarray"
529 )
ValueError: Must have equal len keys and value when setting with an ndarray
Take a sample dataframe as below:
train=pd.DataFrame({'Image':[1,2,3,2],'img_before_w':[np.nan, np.nan, np.nan,np.nan]})
print(train) gives
Image img_before_w
0 1 NaN
1 2 NaN
2 3 NaN
3 2 NaN
Now, for example, if you want to insert pixel data where train.Image == 2, it can be done with the code below:
mask = train.Image == 2 # contains True for desired rows
target_index=mask[mask==True].index # gives index of rows, wherever condition is met
train.loc[mask, 'img_before_w'] = pd.Series([[512,512]]*len(target_index), index=target_index) # inserts [512,512] array in rows wherever condition is met, in given column
Now print(train) gives the desired output:
Image img_before_w
0 1 NaN
1 2 [512, 512]
2 3 NaN
3 2 [512, 512]
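The same pattern should carry over to the actual (512, 512) pixel arrays; here is a minimal sketch, using a zero array as a stand-in for pixel_array and casting the column to object dtype first so each cell can hold an array:
import numpy as np
import pandas as pd

train = pd.DataFrame({'Image': [1, 2, 3, 2],
                      'img_before_w': [np.nan, np.nan, np.nan, np.nan]})
train['img_before_w'] = train['img_before_w'].astype(object)  # each cell may hold an arbitrary object

img = np.zeros((512, 512))  # stand-in for first_dcm.pixel_array
mask = train.Image == 2
target_index = train.index[mask]
# One array object per matching row, wrapped in a Series aligned on the index,
# so pandas stores the arrays instead of trying to broadcast their values.
train.loc[mask, 'img_before_w'] = pd.Series([img] * len(target_index), index=target_index)
print(train['img_before_w'].map(lambda v: getattr(v, 'shape', v)))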
I am trying to identify the peak coordinates (x, y) of a kde/gaussian curve.
How can I get the X values and Y values from losing_mae.plot.kde(...) so that I can get the argmax()?
losing_mae.tail(10)
238 -500.0
239 -637.5
240 -412.5
242 -1062.5
243 -562.5
245 -412.5
247 -437.5
252 -800.0
254 -662.5
255 -1062.5
Name: mae, Length: 113, dtype: float64
losing_mae.hist(ax=ax, bins=25, color='c', alpha=0.5)
losing_mae.plot.kde(color='c', ax=ax2, lw=1)
Set up:
import numpy as np
import pandas as pd
losing_mae = pd.DataFrame.from_dict({1: {0: -500.0,
                                         1: -637.5,
                                         2: -412.5,
                                         3: -1062.5,
                                         4: -562.5,
                                         5: -412.5,
                                         6: -437.5,
                                         7: -800.0,
                                         8: -662.5,
                                         9: -1062.5}})
The kde plot returns an axes object. You can drill down to find the x and y:
d = losing_mae.plot.kde()
print(d.get_children())
Which gives a list of the objects. You probably want to drill down into the Line2D:
[<matplotlib.lines.Line2D at 0x7fb82ce67550>,
<matplotlib.spines.Spine at 0x7fb82d237e80>,
<matplotlib.spines.Spine at 0x7fb84003cd30>,
<matplotlib.spines.Spine at 0x7fb82d221b38>,
<matplotlib.spines.Spine at 0x7fb82d221748>,
<matplotlib.axis.XAxis at 0x7fb82d2590f0>,
<matplotlib.axis.YAxis at 0x7fb82d221400>,
Text(0.5, 1.0, ''),
Text(0.0, 1.0, ''),
Text(1.0, 1.0, ''),
<matplotlib.legend.Legend at 0x7fb82ce67400>,
<matplotlib.patches.Rectangle at 0x7fb82cea6940>]
Now grab the line and its path and then you can get the vertices:
l = d.get_children()[0].get_path()
l = l.vertices
print(l)
array([[-1.38750000e+03, 5.87608940e-05],
[-1.38619870e+03, 5.97906082e-05],
[-1.38489740e+03, 6.08341884e-05],
.... # and so on for ~2000 points
Separate the X and Y:
x, y = np.split(l, 2, 1)
And then you can just call max on both to get the points you want:
peakX, peakY = x.max(), y.max()
print(peakX, peakY)
87.5 0.0015392054229208412
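Note that y.max() is the peak height, but x.max() is only the right-most point on the curve; to get the x position of the peak, take the x value at y's argmax. A small sketch, reusing the Axes d from above and the Line2D's get_xdata()/get_ydata() instead of the path:
import numpy as np

line = d.get_children()[0]   # the Line2D drawn by plot.kde()
x = line.get_xdata()
y = line.get_ydata()
peak_y = y.max()             # height of the peak
peak_x = x[np.argmax(y)]     # x position where that peak occurs
print(peak_x, peak_y)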
I have numeric features and a binary response. I am trying to build ensemble tree models such as random forests and gradient-boosted trees. However, I get an error. I have reproduced the error with the iris data.
The error is below and the whole error message is at the bottom.
TypeError: Could not convert 12.631578947368421 to int
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import numpy as np
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
y = list(iris.target)
df = pd.read_csv("https://raw.githubusercontent.com/venky14/Machine- Learning-with-Iris-Dataset/master/Iris.csv")
df = df.drop(['Species'], axis = 1)
df['label'] = y
spark_df = spark.createDataFrame(df).drop('Id')
cols = spark_df.drop('label').columns
assembler = VectorAssembler(inputCols = cols, outputCol = 'features')
output_dat = assembler.transform(spark_df).select('label', 'features')
rf = RandomForestClassifier(labelCol = "label", featuresCol = "features")
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
    .addGrid(rf.numTrees, np.linspace(10, 60, 20)).build()
crossval_rf = CrossValidator(estimator = rf,
                             estimatorParamMaps = paramGrid_rf,
                             evaluator = BinaryClassificationEvaluator(),
                             numFolds = 5)
cvModel_rf = crossval_rf.fit(output_dat)
TypeError Traceback (most recent call last)
<ipython-input-24-44f8f759ed8e> in <module>
2 paramGrid_rf = ParamGridBuilder() \
3 .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
----> 4 .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
5 .build()
6
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in build(self)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in to_key_value_pairs(keys, values)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/param/__init__.py in toInt(value)
197 return int(value)
198 else:
--> 199 raise TypeError("Could not convert %s to int" % value)
200
201 @staticmethod
TypeError: Could not convert 12.631578947368421 to int
Both maxDepth and numTrees need to be integers; NumPy linspace produces floats:
import numpy as np
np.linspace(10, 60, 20)
Result:
array([ 10. , 12.63157895, 15.26315789, 17.89473684,
20.52631579, 23.15789474, 25.78947368, 28.42105263,
31.05263158, 33.68421053, 36.31578947, 38.94736842,
41.57894737, 44.21052632, 46.84210526, 49.47368421,
52.10526316, 54.73684211, 57.36842105, 60. ])
So your code trips on the first non-integer value (here 12.63157895) and raises the error.
Use arange instead:
np.arange(10, 60, 20)
# array([10, 30, 50])
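Note that np.arange(10, 60, 20) only yields three grid points; if the intent was really 20 values between 10 and 60, another sketch is to keep linspace and cast to plain ints (dropping any duplicates):
import numpy as np

num_trees_grid = sorted(set(np.linspace(10, 60, 20).astype(int).tolist()))
print(num_trees_grid)
# these Python ints can then be passed to .addGrid(rf.numTrees, num_trees_grid)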