Pandas PerformanceWarning: DataFrame is highly fragmented. What's the efficient solution?

Here is generic code representing what is happening in my script:
import pandas as pd
import numpy as np

dic = {}
for i in np.arange(0, 10):
    dic[str(i)] = df = pd.DataFrame(np.random.randint(0, 1000, size=(5000, 20)),
                                    columns=list('ABCDEFGHIJKLMNOPQRST'))

df_out = pd.DataFrame(index=df.index)
for i in np.arange(0, 10):
    df_out['A_' + str(i)] = dic[str(i)]['A'].astype('int')
    df_out['D_' + str(i)] = dic[str(i)]['D'].astype('int')
    df_out['H_' + str(i)] = dic[str(i)]['H'].astype('int')
    df_out['I_' + str(i)] = dic[str(i)]['I'].astype('int')
    df_out['M_' + str(i)] = dic[str(i)]['M'].astype('int')
    df_out['O_' + str(i)] = dic[str(i)]['O'].astype('int')
    df_out['Q_' + str(i)] = dic[str(i)]['Q'].astype('int')
    df_out['R_' + str(i)] = dic[str(i)]['R'].astype('int')
    df_out['S_' + str(i)] = dic[str(i)]['S'].astype('int')
    df_out['T_' + str(i)] = dic[str(i)]['T'].astype('int')
    df_out['C_' + str(i)] = dic[str(i)]['C'].astype('int')
You will notice that as soon as the number of columns inserted into df_out exceeds 100, I get the following warning:
PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling frame.insert many times, which has poor performance. Consider using pd.concat instead
The question is: how could I use pd.concat() and still have custom column names that depend on the dictionary key?
IMPORTANT: I would still like to keep a specific selection of columns, not all of them, as in the example: A, D, H, I, etc.
SPECIAL EDIT (based on Corralien's answer)
cols = {'A': 'float',
        'D': 'bool'}
out = pd.DataFrame()
for c, df in dic.items():
    for col, ftype in cols.items():
        out = pd.concat([out, df[[col]].add_suffix(f'_{c}')],
                        axis=1).astype(ftype)
Many thanks for your help!

You can use a comprehension with pd.concat:
cols = {'A': 'float', 'D': 'bool'}
out = pd.concat([df[cols].astype(cols).add_prefix(f'{k}_')
                 for k, df in dic.items()], axis=1)
print(out)
# Output:
     0_A   0_D    1_A   1_D    2_A   2_D    3_A   3_D
0  116.0  True  396.0  True  944.0  True  398.0  True
1  128.0  True  102.0  True  561.0  True   70.0  True
2  982.0  True  613.0  True  822.0  True  246.0  True
3  830.0  True  366.0  True  861.0  True  906.0  True
4  533.0  True  741.0  True  305.0  True  874.0  True
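If you prefer the question's A_0-style names, add_suffix works the same way (a sketch, assuming the same dic and cols as above):

out = pd.concat([df[cols].astype(cols).add_suffix(f'_{k}')
                 for k, df in dic.items()], axis=1)

Because everything is built in a single pd.concat call, all columns are created in one pass and the frame is never fragmented.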

Use concat and flatten the resulting MultiIndex with map:
cols = ['A','D']
df_out = pd.concat({k: v[cols] for k, v in dic.items()}, axis=1).astype('int')
df_out.columns = df_out.columns.map(lambda x: f'{x[1]}_{x[0]}')
print(df_out)
   A_0  D_0  A_1  D_1  A_2  D_2  A_3  D_3
0  116  341  396  502  944  483  398  839
1  128  621  102   70  561  656   70  169
2  982   44  613  775  822  379  246   25
3  830  987  366  481  861  632  906  676
4  533  349  741  410  305  422  874   19
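If you would rather have the columns grouped by letter (A_0, A_1, ..., then D_0, D_1, ...), sorting the flattened columns is enough here, since the suffixes are single digits (a sketch):

df_out = df_out.sort_index(axis=1)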

Related

Pandas Index.droplevel() works in 0.25.3 but not in 1.2.4

Some formerly working code fails after I migrated from pandas 0.25.3 to 1.2.4. Here is a reproducible example:
import numpy as np
import pandas as pd

print(f"pandas: {pd.__version__}")
!python --version

cols = pd.MultiIndex.from_product([['coz'], ['alpha', 'beta', 'gamma']],
                                  names=['health', 'protocol'])
index = pd.date_range(start="1jan2020", end=None, periods=5, freq="d", name="Date")
data = np.random.rand(5, 3)
df = pd.DataFrame(data=data, index=index, columns=cols)

def foo(row):
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']

df.apply(foo, axis="columns")
In 0.25.3 this worked as I wanted:
pandas: 0.25.3
Python 3.7.11
Date
2020-01-01 False
2020-01-02 True
2020-01-03 False
2020-01-04 True
2020-01-05 False
Freq: D, dtype: bool
but in 1.2.4 the same code throws an error apparently due to the droplevel:
pandas: 1.2.4
Python 3.9.4
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-22-4242d4f13ab1> in <module>
15 return row['beta'] > row['alpha']
16
---> 17 df.apply(foo, axis="columns")
~\.conda\envs\yagi\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
7766 kwds=kwds,
7767 )
-> 7768 return op.get_result()
7769
7770 def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in get_result(self)
183 return self.apply_raw()
184
--> 185 return self.apply_standard()
186
187 def apply_empty_result(self):
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in apply_standard(self)
274
275 def apply_standard(self):
--> 276 results, res_index = self.apply_series_generator()
277
278 # wrap results
~\.conda\envs\yagi\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
288 for i, v in enumerate(series_gen):
289 # ignore SettingWithCopy here in case the user mutates
--> 290 results[i] = self.f(v)
291 if isinstance(results[i], ABCSeries):
292 # If we have a view on v, we need to make a copy because
<ipython-input-22-4242d4f13ab1> in foo(row)
12
13 def foo(row):
---> 14 row.index = row.index.droplevel(0)
15 return row['beta'] > row['alpha']
16
~\.conda\envs\yagi\lib\site-packages\pandas\core\indexes\base.py in droplevel(self, level)
1609 levnums = sorted(self._get_level_number(lev) for lev in level)[::-1]
1610
-> 1611 return self._drop_level_numbers(levnums)
1612
1613 def _drop_level_numbers(self, levnums: List[int]):
~\.conda\envs\yagi\lib\site-packages\pandas\core\indexes\base.py in _drop_level_numbers(self, levnums)
1619 return self
1620 if len(levnums) >= self.nlevels:
-> 1621 raise ValueError(
1622 f"Cannot remove {len(levnums)} levels from an index with "
1623 f"{self.nlevels} levels: at least one level must be left."
ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.
What seems to be happening is that in 1.2.4 the droplevel accumulates! The first row passed into apply() has a two-level index, but the second row passed into apply() already has a single-level index, and that is where the error is raised. This I don't understand at all.
Here is the same toy example with a print diagnostic:
import numpy as np
import pandas as pd

print(f"pandas: {pd.__version__}")
!python --version

cols = pd.MultiIndex.from_product([['coz'], ['alpha', 'beta', 'gamma']],
                                  names=['health', 'protocol'])
index = pd.date_range(start="1jan2020", end=None, periods=5, freq="d", name="Date")
data = np.random.rand(5, 3)
df = pd.DataFrame(data=data, index=index, columns=cols)

def foo(row):
    print(f"\nROW: {row} END")
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']

foo = df.apply(foo, axis="columns")
correct output:
pandas: 0.25.3
Python 3.7.11
ROW: health protocol
coz alpha 0.054421
beta 0.922885
gamma 0.843888
Name: 2020-01-01T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.962803
beta 0.827594
gamma 0.260147
Name: 2020-01-02T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.680902
beta 0.124468
gamma 0.960604
Name: 2020-01-03T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.133331
beta 0.664735
gamma 0.623440
Name: 2020-01-04T00:00:00.000000000, dtype: float64 END
ROW: health protocol
coz alpha 0.984164
beta 0.578701
gamma 0.538993
Name: 2020-01-05T00:00:00.000000000, dtype: float64 END
Date
2020-01-01 True
2020-01-02 False
2020-01-03 False
2020-01-04 True
2020-01-05 False
Freq: D, dtype: bool
failing output:
pandas: 1.2.4
Python 3.9.4
ROW: health protocol
coz alpha 0.374974
beta 0.137263
gamma 0.494556
Name: 2020-01-01 00:00:00, dtype: float64 END
ROW: protocol
alpha 0.591057
beta 0.560530
gamma 0.183457
Name: 2020-01-02 00:00:00, dtype: float64 END
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-bbef1b39f13a> in <module>
16 return row['beta'] > row['alpha']
17
---> 18 foo = df.apply(foo, axis="columns")
...
ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.
========
So I can fix this by operating on a .copy() of the row, but this feels like a hack. I don't understand why the code started behaving this way after the version change.
def foo(row):
    # print(f"\nROW: {row} END")
    row = row.copy()
    row.index = row.index.droplevel(0)
    return row['beta'] > row['alpha']
https://pandas.pydata.org/docs/user_guide/gotchas.html#mutating-with-user-defined-function-udf-methods
Do not mutate inside user-defined functions passed to methods like .apply(). I was just lucky that it worked in 0.25.3.
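For reference, a mutation-free way to get the same result is to drop the column level on the whole frame once and compare the columns vectorized, instead of per row inside apply (a sketch, assuming the df defined above):

flat = df.droplevel('health', axis=1)  # remove the top column level once
result = flat['beta'] > flat['alpha']  # same boolean Series, no apply needed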

Unable to filter dataframe list

I have a list of dataframes from which I'd like to filter out some rows.
d = {'id': ['111', '222', '333'], 'queries': ['High', 'Mid', 'Low'], 'time_stay': ['High', 'Mid', 'Low']}
dd = pd.DataFrame(data=d)
d2 = {'id': ['444', '555'], 'queries': ['True', 'False'], 'time_stay': ['High', 'Mid']}
dd2 = pd.DataFrame(data=d2)
df_list = [dd, dd2]
df_list
[    id queries time_stay
 0  111    High      High
 1  222     Mid       Mid
 2  333     Low       Low,
     id queries time_stay
 0  444    True      High
 1  555   False       Mid]
If the dataframe length == 3, I'd like to keep only the rows where queries == 'Low':
for df in df_list:
    if len(df) == 3:
        df = df.loc[df['queries'] == 'Low']
    else:
        pass
The code df.loc[df['queries'] == 'Low'] works as expected, but when I check df_list[0], the result is not saved.
The expected output should be:
df_list[0]
    id queries time_stay
2  333     Low       Low
I would also like to rewrite it in a list comprehension format.
You can map a lambda function:
df_list = list(map(lambda df: df[df.queries == 'Low'] if len(df) == 3 else df, df_list))
Output:
df_list
[    id queries time_stay
 2  333     Low       Low,
     id queries time_stay
 0  444    True      High
 1  555   False       Mid]
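For completeness: the original loop did not work because df = df.loc[...] only rebinds the local name df; it never touches the list. The list comprehension you asked for, or assigning back by index, both work (a sketch, assuming df_list as above):

# List-comprehension form of the same filter:
df_list = [df.loc[df['queries'] == 'Low'] if len(df) == 3 else df
           for df in df_list]

# Or assign back by position inside the loop:
for i, df in enumerate(df_list):
    if len(df) == 3:
        df_list[i] = df.loc[df['queries'] == 'Low']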

Spark random forest - could not convert float to int error

I have features which are numeric and a binary response. I am trying to build ensemble decision trees such as random forests and gradient-boosted trees. However, I get an error. I have reproduced the error with the iris data.
The error is below and the whole error message is at the bottom.
TypeError: Could not convert 12.631578947368421 to int
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import numpy as np
import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
y = list(iris.target)

df = pd.read_csv("https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv")
df = df.drop(['Species'], axis=1)
df['label'] = y

# `spark` is the active SparkSession
spark_df = spark.createDataFrame(df).drop('Id')
cols = spark_df.drop('label').columns

assembler = VectorAssembler(inputCols=cols, outputCol='features')
output_dat = assembler.transform(spark_df).select('label', 'features')

rf = RandomForestClassifier(labelCol="label", featuresCol="features")
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
    .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
    .build()

crossval_rf = CrossValidator(estimator=rf,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=BinaryClassificationEvaluator(),
                             numFolds=5)
cvModel_rf = crossval_rf.fit(output_dat)
TypeError Traceback (most recent call last)
<ipython-input-24-44f8f759ed8e> in <module>
2 paramGrid_rf = ParamGridBuilder() \
3 .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
----> 4 .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
5 .build()
6
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in build(self)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in to_key_value_pairs(keys, values)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/param/__init__.py in toInt(value)
197 return int(value)
198 else:
--> 199 raise TypeError("Could not convert %s to int" % value)
200
201 @staticmethod
TypeError: Could not convert 12.631578947368421 to int
Both maxDepth and numTrees need to be integers; NumPy's linspace produces floats:
import numpy as np
np.linspace(10, 60, 20)
Result:
array([ 10. , 12.63157895, 15.26315789, 17.89473684,
20.52631579, 23.15789474, 25.78947368, 28.42105263,
31.05263158, 33.68421053, 36.31578947, 38.94736842,
41.57894737, 44.21052632, 46.84210526, 49.47368421,
52.10526316, 54.73684211, 57.36842105, 60. ])
So your code trips on the first non-integer value (here 12.63157895) and raises the error.
Use arange instead:
np.arange(10, 60, 20)
# array([10, 30, 50])
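If you specifically want linspace-style endpoints, you can also ask linspace for integers directly, provided you pick a point count whose values land on whole numbers (a sketch; with a non-integer step the values would silently be truncated):

np.linspace(10, 60, 6, dtype=int)
# array([10, 20, 30, 40, 50, 60])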

Out of bounds error when unstacking MultiIndex pandas dataframe after filtering

I have a multi-index pandas DataFrame that I perform some operations on (including dropping columns with null values) and then try to unstack; however, this results in an IndexError. Any way to fix this? Code below:
ds = ds.unstack(level='Symbol')
ds.columns = ds.columns.swaplevel(0, 1)
ds = ds[start:end]
ds = ds[equities]
ds = ds.stack(level='Symbol')
ds.dropna(axis=1, inplace=True) # this line breaks the code
ds = ds.unstack(level='Symbol')
ds.head()
Without the dropna line the code runs fine, so something about it is breaking the indexing, which seems like a bug to me. It throws an error with some data frames but not all, so it is probably specific to certain circumstances. Any help would be greatly appreciated!
Dumping error log below:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-341-efb7e680485a> in <module>()
9 #ds.dropna(axis=1, inplace=True)
10
---> 11 ds = ds.unstack(level='Symbol')
12
13 ds.head()
~/.local/lib/python3.5/site-packages/pandas/core/frame.py in unstack(self, level, fill_value)
4567 """
4568 from pandas.core.reshape.reshape import unstack
-> 4569 return unstack(self, level, fill_value)
4570
4571 _shared_docs['melt'] = ("""
~/.local/lib/python3.5/site-packages/pandas/core/reshape/reshape.py in unstack(obj, level, fill_value)
467 if isinstance(obj, DataFrame):
468 if isinstance(obj.index, MultiIndex):
--> 469 return _unstack_frame(obj, level, fill_value=fill_value)
470 else:
471 return obj.T.stack(dropna=False)
~/.local/lib/python3.5/site-packages/pandas/core/reshape/reshape.py in _unstack_frame(obj, level, fill_value)
480 unstacker = partial(_Unstacker, index=obj.index,
481 level=level, fill_value=fill_value)
--> 482 blocks = obj._data.unstack(unstacker)
483 klass = type(obj)
484 return klass(blocks)
~/.local/lib/python3.5/site-packages/pandas/core/internals.py in unstack(self, unstacker_func)
4349 new_columns = new_columns[columns_mask]
4350
-> 4351 bm = BlockManager(new_blocks, [new_columns, new_index])
4352 return bm
4353
~/.local/lib/python3.5/site-packages/pandas/core/internals.py in __init__(self, blocks, axes, do_integrity_check, fastpath)
3035 self._consolidate_check()
3036
-> 3037 self._rebuild_blknos_and_blklocs()
3038
3039 def make_empty(self, axes=None):
~/.local/lib/python3.5/site-packages/pandas/core/internals.py in _rebuild_blknos_and_blklocs(self)
3123 for blkno, blk in enumerate(self.blocks):
3124 rl = blk.mgr_locs
-> 3125 new_blknos[rl.indexer] = blkno
3126 new_blklocs[rl.indexer] = np.arange(len(rl))
3127
IndexError: index 100352 is out of bounds for axis 1 with size 100352
The problem is that dropna removes some rows, and with them values of the MultiIndex, but the MultiIndex levels are not changed by default. So you need to remove these unused values from the MultiIndex with MultiIndex.remove_unused_levels:
ds = ds.stack(level='Symbol')
ds.dropna(axis=1, inplace=True)
ds.index = ds.index.remove_unused_levels()
ds = ds.unstack(level='Symbol')
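A minimal sketch (toy data, not the asker's frame) of what remove_unused_levels does: filtering drops the rows, but the index levels still remember the old values until you remove them.

import pandas as pd

mi = pd.MultiIndex.from_product([['AAPL', 'MSFT'], [1, 2]], names=['Symbol', 'n'])
s = pd.DataFrame({'x': range(4)}, index=mi)
s = s[s['x'] > 1]                                # all 'AAPL' rows are gone...
print(s.index.levels[0])                         # ...but the level still lists 'AAPL'
print(s.index.remove_unused_levels().levels[0])  # only 'MSFT' remains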

How to pass stopwords from a column of dataframe

I am getting an error while passing a dataframe column directly as stop words.
How can I resolve this?
stop_words_corpus = pd.DataFrame(word_dictionary_corpus.Word.unique(), columns=feature_names)
cv = CountVectorizer(max_features=200, analyzer='word', stop_words=stop_words_corpus)
cv_txt = cv.fit_transform(data.pop('Clean_addr'))
Updated error:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
783 vocabulary.default_factory = vocabulary.__len__
784
--> 785 analyze = self.build_analyzer()
786 j_indices = []
787 indptr = _make_int_array()
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py in build_analyzer(self)
260
261 elif self.analyzer == 'word':
--> 262 stop_words = self.get_stop_words()
263 tokenize = self.build_tokenizer()
264
I fixed that error but am still having the issue.
Try this:
cv = CountVectorizer(max_features = 200,
analyzer='word',
stop_words=stop_words_corpus.stack().unique())
We need to turn the dataframe column into a NumPy array to pass the stopwords into the CountVectorizer:
stop_word = stop_words_corpus['Word'].values
cv = CountVectorizer(max_features=200,
                     analyzer='word',
                     stop_words=stop_word)
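The key point is that stop_words must be a plain list-like of strings, not a DataFrame. A minimal self-contained sketch, with hypothetical toy data standing in for word_dictionary_corpus and Clean_addr:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

stop_words_corpus = pd.DataFrame({'Word': ['the', 'a', 'an']})      # hypothetical
docs = pd.Series(['the red house', 'a tall tree', 'an old house'])  # hypothetical

cv = CountVectorizer(max_features=200, analyzer='word',
                     stop_words=stop_words_corpus['Word'].tolist())
cv_txt = cv.fit_transform(docs)
print(cv_txt.shape)  # (3 documents, vocabulary size without the stopwords)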