My question is just below the code snippet:
I have raw sensor time series data that is getting really close to being usable now :)
import numpy as np
import pandas as pd

locDf = locationDf.copy()
locDf.set_index('date', inplace=True)

locDfs = {}
for user, user_loc_dc in locDf.groupby('user'):
    locDfs[user] = user_loc_dc.resample('15T').agg('max').bfill()

aDf = appDf.copy()
aDf.set_index('date', inplace=True)

userLocAppDfs = {}
appDfs = []
for user, a2_df in aDf.groupby('user'):
    userDf = a2_df.resample('15T').agg('min')
    userDf.reset_index(inplace=True)
    userDf = pd.crosstab(index=userDf['date'], columns=userDf['app'], values=userDf['metric'], aggfunc=np.mean).fillna(0, downcast='infer')
    userDf['user'] = user
    userDf.reset_index(inplace=True)
    userDf.set_index('date', inplace=True)
    appDfs.append(userDf)
    userLocAppDfs[user] = userDf

    loDf = locDfs[user]
    loDf.reset_index(inplace=True)
    loDf = pd.crosstab([loDf.date, loDf.user], loDf.location)
    loDf.reset_index(inplace=True)
    loDf.set_index('date', inplace=True)
    loDf.drop('user', axis=1, inplace=True)

    userLocAppDfs[user] = userLocAppDfs[user].join(loDf, how='outer')
    userLocAppDfs[user]['user'].fillna(user, inplace=True)

    #for app in a2_df['app'].unique():
    #    userLocAppDfs[user][app] = userLocAppDfs[user][app].fillna(0, inplace=True)

userLocAppDfs['user_1'].head(5)
Question
If I uncomment those last two lines to try to fill the NaNs at the start, I don't get zeros. I get None. :( Can anyone tell me why?
I'd like to.. you know, get 0's there:
2017-08-28 00:00:00 0 0 user_1 0.0 0.0 0.0 1.0 0.0
2017-08-28 00:15:00 0 0 user_1 0.0 0.0 1.0 0.0 0.0
2017-08-28 00:30:00 0 0 user_1 0.0 0.0 1.0 0.0 0.0
2017-08-28 00:45:00 0 0 user_1 0.0 0.0 1.0 0.0 0.0
2017-08-28 01:00:00 0 0 user_1 0.0 0.0 1.0 0.0 0.0
The last step will be for me to get the rolling average of those app_* numbers, so that I get a curve.
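(For reference, a minimal sketch of that rolling-average step, assuming the app columns are literally named app_1, app_2, ... — adjust the filter to the real names:)
app_cols = [c for c in userLocAppDfs['user_1'].columns if c.startswith('app_')]  # hypothetical naming
smoothed = userLocAppDfs['user_1'][app_cols].rolling(window=4, min_periods=1).mean()  # 4 x 15min bins = 1h window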
Try
for app in a2_df['app'].unique():
    userLocAppDfs[user][app].fillna(0, inplace=True)
    # or userLocAppDfs[user][app] = userLocAppDfs[user][app].fillna(0)
This happens because you specified inplace=True and assigned the result back at the same time.
Note that df.fillna(0, inplace=True) does not return a value. It modifies the original df directly and returns None. Try print(df.fillna(0, inplace=True)); it will give you None. So what you did above was assign None to the app columns.
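A minimal demonstration of that behavior:
import numpy as np
import pandas as pd

df = pd.DataFrame({'app_1': [np.nan, 1.0]})
print(df['app_1'].fillna(0))       # returns a new Series; df itself is unchanged
print(df.fillna(0, inplace=True))  # prints None: inplace=True mutates df and returns nothing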
Related
I have been stuck on how to convert the result of a OneHotEncoder back to a DataFrame. The idea is that I have separated numeric columns from categorical columns as follows:
feats = df.drop(["Transported"], axis=1)
target = df["Transported"]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size = 0.2,
random_state=42)
Here, after doing the split, I needed to separate the numeric from the categorical columns for the training set, and I did this:
num_train = X_train.select_dtypes(include=['float64', 'int64'])
cat_train = X_train.select_dtypes(include=['object'])
num_test = X_test.select_dtypes(include=['float64', 'int64'])
cat_test = X_test.select_dtypes(include=['object'])
After this I applied the SimpleImputer and it worked:
import numpy as np
from sklearn.impute import SimpleImputer

imputer_median = SimpleImputer(missing_values=np.nan, strategy='median')
imputer_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
num = ["Age", "RoomService", "FoodCourt", "ShoppingMall","Spa","VRDeck"]
num_train.loc[:,num] = imputer_median.fit_transform(num_train[num])
num_test.loc[:,num] = imputer_median.transform(num_test[num])
cat = ["HomePlanet", "CryoSleep", "Destination","VIP"]
cat_train.loc[:,cat] = imputer_most_frequent.fit_transform(cat_train[cat])
cat_test.loc[:,cat] = imputer_most_frequent.transform(cat_test[cat])
and this is the head of cat_train:
cat_train.head()
HomePlanet CryoSleep Destination VIP
2333 Earth False TRAPPIST-1e False
2589 Earth False TRAPPIST-1e False
8302 Europa True 55 Cancri e False
8177 Mars False TRAPPIST-1e False
500 Europa True 55 Cancri e False
But after this I needed to apply the OneHotEncoder, just like this:
from sklearn.preprocessing import OneHotEncoder
oneh = OneHotEncoder( drop='first',sparse=False)
cat_train.loc[:,cat] = oneh.fit_transform(cat_train[cat])
cat_train.loc[:,cat] = oneh.fit_transform(cat_train[cat])
And I got this error:
shape mismatch: value array of shape (6954,6) could not be broadcast to indexing result of shape (6954,4)
I tried several ways, but every time I could not get a DataFrame back after the OneHotEncoder. Please help me out, I am stuck on this and I cannot continue the rest of the work. Thanks in advance.
Here is the full traceback:
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_16200\2252764984.py in <module>
      3 oneh = OneHotEncoder( drop='first',sparse=False)
      4
----> 5 cat_train.loc[:,cat] = oneh.fit_transform(cat_train[cat])
      6 cat_train.loc[:,cat] = oneh.fit_transform(cat_train[cat])

~\anaconda3\lib\site-packages\pandas\core\indexing.py in __setitem__(self, key, value)
    714
    715         iloc = self if self.name == "iloc" else self.obj.iloc
--> 716         iloc._setitem_with_indexer(indexer, value, self.name)
    717
    718     def _validate_key(self, key, axis: int):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_with_indexer(self, indexer, value, name)
   1691             self._setitem_with_indexer_split_path(indexer, value, name)
   1692         else:
-> 1693             self._setitem_single_block(indexer, value, name)
   1694
   1695     def _setitem_with_indexer_split_path(self, indexer, value, name: str):

~\anaconda3\lib\site-packages\pandas\core\indexing.py in _setitem_single_block(self, indexer, value, name)
   1941
   1942         # actually do the set
-> 1943         self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
   1944         self.obj._maybe_update_cacher(clear=True, inplace=True)
   1945

~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in setitem(self, indexer, value)
    335         For SingleBlockManager, this backs s[indexer] = value
    336         """
--> 337         return self.apply("setitem", indexer=indexer, value=value)
    338
    339     def putmask(self, mask, new, align: bool = True):

~\anaconda3\lib\site-packages\pandas\core\internals\managers.py in apply(self, f, align_keys, ignore_failures, **kwargs)
    302                     applied = b.apply(f, **kwargs)
    303                 else:
--> 304                     applied = getattr(b, f)(**kwargs)
    305             except (TypeError, NotImplementedError):
    306                 if not ignore_failures:

~\anaconda3\lib\site-packages\pandas\core\internals\blocks.py in setitem(self, indexer, value)
    957         else:
    958             value = setitem_datetimelike_compat(values, len(values[indexer]), value)
--> 959         values[indexer] = value
    960
    961         return self

ValueError: shape mismatch: value array of shape (6954,6) could not be broadcast to indexing result of shape (6954,4)
This time I tried the following:
from sklearn.preprocessing import OneHotEncoder
oneh = OneHotEncoder(handle_unknown='ignore')
cat_train.loc[:,cat] = oneh.fit_transform(cat_train[cat])
cat_test.loc[:,cat] = oneh.transform(cat_test)
and I got this DataFrame, but it is not what I am looking for:
      HomePlanet                                       CryoSleep  Destination  VIP
2333  (0, 0)\t1.0\n (0, 3)\t1.0\n (0, 7)\t1.0\n ...   ...         ...          ...
2589  (0, 0)\t1.0\n (0, 3)\t1.0\n (0, 7)\t1.0\n ...   ...         ...          ...
I also used ColumnTransformer, but it's not getting me back the DataFrame I want (I mean a DataFrame with the original columns used before the OneHotEncoder; see cat_train above). These are the steps I did:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(
transformers=[("OneHotEncoder", OneHotEncoder(drop='first',
sparse=False), cat)],
remainder='passthrough'
)
cat_train = ct.fit_transform(cat_train)
cat_test = ct.transform(cat_test)
cat_train = pd.DataFrame(cat_train, columns=ct.get_feature_names())
cat_test = pd.DataFrame(cat_test, columns=ct.get_feature_names())
cat_train
and the cat_train.head() I got is:
OneHotEncoder__x0_Europa OneHotEncoder__x0_Mars OneHotEncoder__x1_True OneHotEncoder__x2_PSO J318.5-22 OneHotEncoder__x2_TRAPPIST-1e OneHotEncoder__x3_True
0 0.0 0.0 0.0 0.0 1.0 0.0
1 0.0 0.0 0.0 0.0 1.0 0.0
2 1.0 0.0 1.0 0.0 0.0
This is weird, because next I need to concatenate cat_train with num_train (and likewise for the test set), and when I did this, a lot of NaN values appeared, whereas I had already imputed all the NaN values before. Any idea?
The first error is because you try to assign the one-hot-encoded data, which has more columns than the original, back to the same original columns. You need instead to add the new dummy columns and delete the original ones. In any case, applying fit_transform to both train and test (assuming the repeated train row is a typo for cat_test) is a bad idea: fit on train only, then transform the test set.
The second error appears to be due to the one-hot-encoded data being sparse. You can specify sparse=False in the OneHotEncoder to fix that, but then probably you'll have the same issue as above.
The best thing to do is to use a ColumnTransformer; it would handle all the concatenation for you.
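For illustration, a minimal sketch of that approach, reusing the question's variables (get_feature_names_out assumes scikit-learn >= 1.0; newer versions rename sparse= to sparse_output=):
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [("ohe", OneHotEncoder(drop='first', sparse=False), cat)],
    remainder='passthrough',
)
# Fit on the training split only, then reuse the fitted transformer on the test split.
cat_train_enc = pd.DataFrame(ct.fit_transform(cat_train),
                             columns=ct.get_feature_names_out(),
                             index=cat_train.index)
cat_test_enc = pd.DataFrame(ct.transform(cat_test),
                            columns=ct.get_feature_names_out(),
                            index=cat_test.index)
Keeping the original index here makes the later concatenation with the numeric frames align by row instead of producing NaNs.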
I managed to find the solution. In fact, I was expecting to get the original columns back (since I had 4 columns, I thought I should get those 4 columns back) as they were before the OneHotEncoder, which is generally not possible. In my case, each cat_train column has more than one modality, so the result after a OneHotEncoder must have more columns than before. Based on this, I rewrote the code as follows:
feats = df.drop(["Transported"], axis=1)
target = df["Transported"]
X_train, X_test, y_train, y_test = train_test_split(feats, target,
test_size = 0.2, random_state=42)
Separate numeric columns from categorical columns
import numpy as np
num_train = X_train.select_dtypes(include=[np.number])
cat_train = X_train.select_dtypes(exclude=[np.number])
num_test = X_test.select_dtypes(include=[np.number])
cat_test = X_test.select_dtypes(exclude=[np.number])
Fill in missing values
from sklearn.impute import SimpleImputer

num_imp = SimpleImputer(strategy='median')
num_train = num_imp.fit_transform(num_train)
num_test = num_imp.transform(num_test)
cat_imp = SimpleImputer(strategy='most_frequent')
cat_train = cat_imp.fit_transform(cat_train)
cat_test = cat_imp.transform(cat_test)
Encode categorical variables
from sklearn.preprocessing import OneHotEncoder

cat_enc = OneHotEncoder(handle_unknown='ignore')
cat_train = cat_enc.fit_transform(cat_train)
cat_test = cat_enc.transform(cat_test)
And now the magic part: reconstitute the training and test sets
X_train = pd.concat([pd.DataFrame(num_train),
pd.DataFrame(cat_train.toarray())], axis=1)
X_test = pd.concat([pd.DataFrame(num_test),
pd.DataFrame(cat_test.toarray())], axis=1)
The DataFrame is now as it should be:
X_train.head()
0 1 2 3 4 5 0 1 2 3 4 5 6 7 8 9
0 28.0 0.0 55.0 0.0 656.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
1 17.0 0.0 1195.0 31.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
2 28.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0
3 20.0 0.0 2.0 289.0 976.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
4 36.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0 0.0
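If readable column names are preferred over the duplicated integer labels above, one possible refinement (assuming the same variables; note that num_train's names must be captured before the imputer turns it into a plain array):
num_cols = num_train.columns  # run this line before the imputation step
X_train = pd.concat(
    [pd.DataFrame(num_train, columns=num_cols),
     pd.DataFrame(cat_train.toarray(), columns=cat_enc.get_feature_names_out())],
    axis=1)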
I'm trying to find strings with a specific pattern in my DataFrame.
import re
import pandas as pd
import numpy as np
df = pd.read_excel(io = "mydata.xlsx", sheet_name = 'Sheet1', index_col = 0)
To find the '\[a-z]' string:
header = df.select_dtypes(['object']).columns
df_header = df[header]
p = re.compile('\[a-z]')
df_header_check = df_header.apply(lambda x: x.str.contains(p, na=False))
df_header.loc[df_header_check.any(1), df_header_check.any()]
And I don't get any results. Not an error message, just an empty dataframe.
I've tried:
p = re.compile(r'\\[a-z]')
but that also does not work.
The sample dataset:
TIME11 WARNEMOTION4 WARNEMOTION4DTL TIME12 WARNSIGN_DTL EVENT_DTL EVENT_DTL_2 EXCLUDE
1_3 1 NaN 2.0 1.0 2.0 2.0 1.0 2.0 2.0 2.0 ... NaN NaN NaN NaN NaN NaN NaN Language: ****************** 1. Deceased person info : ***_*****-*******_x000D__x000D_\n2. Date found : **** year ** month ** day **:**_x000D__x000D_\n3. Province... NaN
And I expect the DataFrame output to look like the above.
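A quick aside on the two patterns, with hypothetical strings: '\[a-z]' matches the literal text '[a-z]', and r'\\[a-z]' only matches when a cell contains a literal backslash character; if a cell holds a real newline instead, neither pattern matches, which may explain the empty result.
import re

s = r'info_x000D_\n2.'             # literal backslash + n, as displayed in the cell
print(re.findall('\[a-z]', s))     # [] : '\[a-z]' means the literal text '[a-z]'
print(re.findall(r'\\[a-z]', s))   # ['\\n'] : a backslash followed by a lowercase letter
s2 = 'line1\nline2'                # a real newline character, no backslash in the data
print(re.findall(r'\\[a-z]', s2))  # [] : nothing to match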
I was trying to fit a OneHotEncoder on X_train and then transform X_train and X_test.
However, this resulted in an error:
# One hot encoding
from sklearn.preprocessing import OneHotEncoder
encode_columns = ['borough','building_class_category', 'commercial_units','residential_units']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[encode_columns])
X_train = enc.transform(X_train[encode_columns])
X_test = enc.transform(X_test[encode_columns])
X_train.head()
Error:
      4
      5 enc = OneHotEncoder(handle_unknown='ignore')
----> 6 enc.fit(X_train[encode_columns])
      7 X_train = enc.transform(X_train[encode_columns])
      8 X_test = enc.transform(X_test[encode_columns])

TypeError: cannot perform reduce with flexible type
TL;DR: You probably ran the cell with fit and transform multiple times, and .transform() doesn't work the way you think it works.
Why are you getting this error?
If you have the data definition in one cell:
X_train = pd.DataFrame({'borough': ["Queens", "Brooklyn", "Queens", "Queens", "Brooklyn"],
'building_class_category': ["01", "02", "02", "01", "13"],
'commercial_units': ["O", "O", "O", "O", "A"],
'residential_units': [1,2,2,1,1]})
And fit the one-hot encoder in a second one:
encode_columns = ['borough','building_class_category', 'commercial_units','residential_units']
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[encode_columns])
X_train = enc.transform(X_train[encode_columns])
The cell above works the first time, but because it overwrites X_train, running it a second time raises:
TypeError: cannot perform reduce with flexible type
So the first part of the answer is: use different names for the input and the output.
What does OneHotEncoder's transform return?
If you print out enc.transform(X_train[encode_columns]) you'll get:
<5x9 sparse matrix of type '<class 'numpy.float64'>'
with 20 stored elements in Compressed Sparse Row format>
By default, OneHotEncoder's transform doesn't return a pandas DataFrame (or even a numpy array) but a sparse matrix. To get a numpy array you have to either convert it:
enc.transform(X_train[encode_columns]).toarray()
or set sparse=False in definition of OneHotEncoder:
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
Bonus: how to get descriptive feature names?
After setting sparse=False, enc.transform(X_train[encode_columns]) returns a numpy array. Even if you convert it to a pd.DataFrame, the column names won't tell you much:
pd.DataFrame(enc.transform(X_train[encode_columns]))
# 0 1 2 3 4 5 6 7 8
#0 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
#1 1.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
#2 0.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0
#3 0.0 1.0 1.0 0.0 0.0 0.0 1.0 1.0 0.0
#4 1.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0
To get proper column names, use the get_feature_names_out() method:
pd.DataFrame(enc.transform(X_train[encode_columns]), columns = enc.get_feature_names_out())
# borough_Brooklyn borough_Queens ... residential_units_2
#0 0.0 1.0 ... 0.0
#1 1.0 0.0 ... 1.0
#2 0.0 1.0 ... 1.0
#3 0.0 1.0 ... 0.0
#4 1.0 0.0 ... 0.0
Whole code:
X_train = pd.DataFrame({'borough': ["Queens", "Brooklyn", "Queens", "Queens", "Brooklyn"],
'building_class_category': ["01", "02", "02", "01", "13"],
'commercial_units': ["O", "O", "O", "O", "A"],
'residential_units': [1,2,2,1,1]})
encode_columns = ['borough','building_class_category', 'commercial_units','residential_units']
enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
enc.fit(X_train[encode_columns])
X_train_encoded = pd.DataFrame(enc.transform(X_train[encode_columns]), columns = enc.get_feature_names_out())
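The fitted encoder can then be reused on the test split in the same way (X_test is assumed to come from the same train/test split):
X_test_encoded = pd.DataFrame(enc.transform(X_test[encode_columns]),
                              columns=enc.get_feature_names_out(),
                              index=X_test.index)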
When I download data from yfinance, I get 8 columns (Open, High, Low, etc.) per ticker. Since I am downloading 15 tickers, I have 120 columns plus 1 index column (date), all side by side. See Image 1.
Instead of having that many columns in 2 levels, I want just the 8 unique columns, plus one new column that identifies the ticker. See Image 2.
Image 1: Current Form
Image 1 but in raw text:
Adj Close ... Volume
DANHOS13.MX FCFE18.MX FHIPO14.MX FIBRAHD15.MX FIBRAMQ12.MX FIBRAPL14.MX FIHO12.MX FINN13.MX FMTY14.MX FNOVA17.MX ... FIBRAPL14.MX FIHO12.MX FINN13.MX FMTY14.MX FNOVA17.MX FPLUS16.MX FSHOP13.MX FUNO11.MX FVIA16.MX TERRA13.MX
Date
2015-01-02 26.065336 NaN 18.526043 NaN 16.337654 18.520781 14.683501 11.301384 9.247743 NaN ... 338697 189552 148064 57 NaN NaN 212451 2649823 NaN 1111343
2015-01-05 24.670488 NaN 18.436762 NaN 15.857328 17.859756 13.795850 11.071105 9.209846 NaN ... 449555 364819 244594 19330 NaN NaN 491587 3317923 NaN 1255128
Image 2: Desired outcome
The code I'm applying is:
start = dt.datetime(2015,1,1)
end = dt.datetime.now()
df = yf.download("FUNO11.MX FIBRAMQ12.MX FIHO12.MX DANHOS13.MX FINN13.MX FSHOP13.MX TERRA13.MX FMTY14.MX FIBRAPL14.MX FHIPO14.MX FIBRAHD15.MX FPLUS16.MX FVIA16.MX FNOVA17.MX FCFE18.MX",
start = start,
end = end,
group_by = 'Ticker',
actions = True)
I will download the data a little differently:
import yfinance as yf
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta
start = dt(2015,1,1)
end = dt.now()
symbols = ["FUNO11.MX", "FIBRAMQ12.MX", "FIHO12.MX", "DANHOS13.MX", "FINN13.MX", "FSHOP13.MX", "TERRA13.MX", "FMTY14.MX",
"FIBRAPL14.MX", "FHIPO14.MX", "FIBRAHD15.MX", "FPLUS16.MX", "FVIA16.MX", "FNOVA17.MX", "FCFE18.MX"]
data = yf.download(symbols, start=start, end=end, actions=True)
And then
Option 1:
def reshaper(symb, dframe):
    df = dframe.unstack().reset_index()
    df.columns = ['variable','symbol','Date','Value']
    df = df.loc[df.symbol==symb,['Date','variable','Value']].pivot_table(index='Date', columns='variable', values='Value').reset_index()
    df.columns.name = ''
    df['Ticker'] = symb
    return df
frames = [reshaper(s, data) for s in symbols]
h = pd.concat(frames, ignore_index=True)  # DataFrame.append was removed in pandas 2.0, so collect and concat
h
Option 2: For a one-liner, you could do this:
data.stack().reset_index().rename(columns={'level_1':'Ticker'})
A slightly simpler version relies on first stacking both column index levels (measure and ticker) to get long-form tidy data, and then unstacking the measure level, keeping ticker and date as the index:
import yfinance as yf
symbols = ["FUNO11.MX", "FIBRAMQ12.MX", "FIHO12.MX", "DANHOS13.MX",
"FINN13.MX", "FSHOP13.MX", "TERRA13.MX", "FMTY14.MX",
"FIBRAPL14.MX", "FHIPO14.MX", "FIBRAHD15.MX", "FPLUS16.MX",
"FVIA16.MX", "FNOVA17.MX", "FCFE18.MX"]
data = yf.download(symbols, start='2015-01-01', end='2020-11-15', actions=True)
data_reshape=data.stack(level=[0,1]).unstack(1)
data_reshape.index=data_reshape.index.set_names(['ticker'],level=[1])
data_reshape.head()
Adj Close Close Dividends High \
Date ticker
2015-01-02 DANHOS13.MX 26.065336 37.000000 0.0 37.400002
FHIPO14.MX 18.526043 24.900000 0.0 24.900000
FIBRAMQ12.MX 16.337654 24.490000 0.0 25.110001
FIBRAPL14.MX 18.520781 26.740801 0.0 27.118500
FIHO12.MX 14.683501 21.670000 0.0 22.190001
Low Open Stock Splits Volume
Date ticker
2015-01-02 DANHOS13.MX 36.330002 36.330002 0.0 82849.0
FHIPO14.MX 24.900000 24.900000 0.0 94007.0
FIBRAMQ12.MX 24.350000 24.990000 0.0 1172917.0
FIBRAPL14.MX 26.343100 26.750700 0.0 338697.0
FIHO12.MX 21.209999 22.120001 0.0 189552.0
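To get the ticker back as an ordinary column rather than an index level, matching the shape asked for in the question, one more step:
flat = data_reshape.reset_index()  # Date and ticker become regular columns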
I have 2 dataframes of equal length. The source has one column, ML_PREDICTION, that I want to copy to the target dataframe, which already has some values that I don't want to overwrite.
#Select only blank values in target dataframe
mask = br_df['RECOMMENDED_ACTION'] == ''
# Attempt 1 - Results: KeyError: "['Retain' 'Retain' '' ... '' '' 'Retain'] not in index"
br_df.loc[br_df['RECOMMENDED_ACTION'][mask]] = ML_df['ML_PREDICTION'][mask]
br_df.loc['REASON_CODE'][mask] = 'ML01'
br_df.loc['COMMENT'][mask] = 'Automated Prediction'
# Attempt 2 - Results: Overwrites all values in target dataframe
br_df['RECOMMENDED_ACTION'].where(mask, other=ML_df['ML_PREDICTION'], inplace=True)
br_df['REASON_CODE'].where(mask, other='ML01', inplace=True)
br_df['COMMENT'].where(mask, other='Automated Prediction', inplace=True)
# Attempt 3 - Results: Overwrites all values in target dataframe
br_df['RECOMMENDED_ACTION'] = [x for x in ML_df['ML_PREDICTION'] if [mask] ]
br_df['REASON_CODE'] = ['ML01' for x in ML_df['ML_PREDICTION'] if [mask]]
br_df['COMMENT'] = ['Automated Prediction' for x in ML_df['ML_PREDICTION'] if [mask]]
Attempt 4 - Results: Values in target (br_df) were unchanged
br_df.loc[br_df['RECOMMENDED_ACTION'].isnull(), 'REASON_CODE'] = 'ML01'
br_df.loc[br_df['RECOMMENDED_ACTION'].isnull(), 'COMMENT'] = 'Automated Prediction'
br_df.loc[br_df['RECOMMENDED_ACTION'].isnull(), 'RECOMMENDED_ACTION'] = ML_df['ML_PREDICTION']
Attempt 5 (from Dipanjan's answer)
# Before - br_df['REASON_CODE'].value_counts()
BR03 10
BR01 8
Name: REASON_CODE, dtype: int64
#Attempt 5
br_df.loc['REASON_CODE'] = br_df['REASON_CODE'].fillna('ML01')
br_df.loc['COMMENT'] = br_df['COMMENT'].fillna('Automated Prediction')
br_df.loc['RECOMMENDED_ACTION'] = br_df['RECOMMENDED_ACTION'].fillna(ML_df['ML_PREDICTION'])
# After -- print(br_df['REASON_CODE'].value_counts())
BR03 10
BR01 8
ML01 2
Automated Prediction 1
Name: REASON_CODE, dtype: int64
#WTF? -- br_df[br_df['REASON_CODE'] == 'Automated Prediction']
PERSON_STATUS ... RECOMMENDED_ACTION REASON_CODE COMMENT
COMMENT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN Automated Prediction Automated Prediction Automated Prediction
What am I missing here?
Use one of the options below. (Note also that in Attempt 5, br_df.loc['REASON_CODE'] = ... indexes by row label, not by column, which is why a whole new row of 'Automated Prediction' appeared; write br_df['REASON_CODE'] = ... instead.)
df.loc[df['A'].isnull(), 'A'] = df['B']
or
df['A'] = df['A'].fillna(df['B'])
import numpy as np
df_a = pd.DataFrame([0,1,np.nan])
df_b = pd.DataFrame([0,np.nan,2])
df_a
0
0 0.0
1 1.0
2 NaN
df_b
0
0 0.0
1 NaN
2 2.0
df_a[0] = df_a[0].fillna(df_b[0])
Final output:
df_a
0
0 0.0
1 1.0
2 2.0
Ultimately, this is the syntax that appears to solve my problem:
mask = mask[:len(br_df)] # create the boolean index
br_df = br_df[:len(mask)] # make sure they are the same length
br_df['RECOMMENDED_ACTION'].loc[mask] = ML_df['ML_PREDICTION'].loc[mask]
br_df['REASON_CODE'].loc[mask] = 'ML01'
br_df['COMMENT'].loc[mask] = 'Automated Prediction'
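One caveat on that final form: chained indexing like br_df['COMMENT'].loc[mask] = ... operates on a temporary and can raise SettingWithCopyWarning (and silently do nothing in newer pandas). A sketch of the more robust single-.loc form, using the same names as above and assuming the two frames share an index:
mask = br_df['RECOMMENDED_ACTION'] == ''                      # rows still blank in the target
br_df.loc[mask, 'RECOMMENDED_ACTION'] = ML_df.loc[mask, 'ML_PREDICTION']
br_df.loc[mask, 'REASON_CODE'] = 'ML01'
br_df.loc[mask, 'COMMENT'] = 'Automated Prediction'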