Remove the first or last char so the values from a column should start with numbers - pandas

I'm new to Pandas and I'd like to ask your advice.
Let's take this dataframe:
df_test = pd.DataFrame({'Dimensions': ['22.67x23.5', '22x24.6', '45x56', 'x23x56.22','46x23x','34x45'],
'Other': [59, 29, 73, 56,48,22]})
I want to detect the rows that start with "x" (row 4) or end with "x" (row 5) and then remove that character, so that my dataframe looks like this:
Dimensions Other
22.67x23.5 59
22x24.6 29
45x56 73
23x56.22 56
46x23 48
34x45 22
I wanted to create a function and apply it to a column
def remove_x(x):
if (x.str.match('^[a-zA-Z]') == True):
x = x[1:]
return x
if (x.str.match('.*[a-zA-Z]$') == True):
x = x[:-1]
return x
If I apply this function to the column
df_test['Dimensions'] = df_test['Dimensions'].apply(remove_x)
I got an error 'str' object has no attribute 'str'
I delete 'str' from the function and re-run all but no success.
What should I do?
Thank you for any suggestions or if there is another way to do it I'm interested in.

Just use str.strip:
df_test['Dimensions'] = df_test['Dimensions'].str.strip('x')
For general patterns, you can try str.replace:
# Remove one leading or one trailing 'x'. regex=True is needed because
# Series.str.replace treats the pattern as a literal string by default in pandas >= 2.0.
df_test['Dimensions'].str.replace(r'(^x)|(x$)', '', regex=True)
Output:
Dimensions Other
0 22.67x23.5 59
1 22x24.6 29
2 45x56 73
3 23x56.22 56
4 46x23 48
5 34x45 22

@QuangHoang's answer is better (for simplicity and efficiency), but here's what went wrong in your approach. Inside your apply function you are trying to use the .str accessor, which only exists on a Series. But when you call df_test['Dimensions'].apply(remove_x), the values passed to remove_x are the elements of df_test['Dimensions'], i.e. the str values themselves. So you should write the function as if x is an incoming str.
Here's how you could implement that (avoiding any regex):
def remove_x(x):
    """Drop one leading or one trailing 'x' from the string *x*.

    The leading position is checked first; when it matches, the trailing
    position is left alone, so at most one character is removed. Strings
    with no 'x' at either end, including the empty string, are returned
    unchanged.
    """
    # startswith/endswith are safe on '', whereas x[0] would raise IndexError.
    if x.startswith('x'):
        return x[1:]
    if x.endswith('x'):
        return x[:-1]
    return x
More idiomatically:
def remove_x(x):
    """Return the string *x* with all leading and trailing 'x' characters removed."""
    # The result must be returned explicitly; otherwise every row becomes None.
    return x.strip('x')
Or even:
df_test['Dimensions'] = df_test['Dimensions'].apply(lambda x : x.strip('x'))
All that said, better to not use apply and follow the built-ins shown by Quang.

Related

Map elements of multiple columns in Pandas

I'm trying to label some values in a DataFrame in Pandas based on the value itself, in-place.
df = pd.read_csv('data/extrusion.csv')
# get list of columns that contain thickness
columns = [c for c in data.columns if 'SDickeIst'.lower() in c.lower()]
# create a function that returns the class based on value
def get_label(ser):
ser.map(lambda x : x if x == 0 else 1)
df[columns].apply(get_label)
I would expect that the apply function takes each column in particular and applies get_label on it. In turn, get_label gets the ser argument as a Series and uses map to map each element != 0 with 1.
get_label doesn't return anything.
You want to return ser.map(lambda x : x if x == 0 else 1).
def get_label(ser):
    """Label each element of *ser*: zeros are kept as they are, anything else becomes 1."""
    def to_label(value):
        if value == 0:
            return value
        return 1

    return ser.map(to_label)
Besides that, apply doesn't act in-place, it always returns a new object. Therefore you need
df[columns] = df[columns].apply(get_label)
But in this simple case, using DataFrame.where should be much faster if you are dealing with large DataFrames.
df[columns] = df[columns].where(lambda x: x == 0, 1)

Pandas Data frame column condition check based on length of the value

I have a pandas data frame which is created by reading an Excel file. The Excel file has a column called serial number. I then pass each serial number to another function, which connects to an API and fetches the result set for that serial number.
My Code -:
def create_excel(filename):
    """Export the "Fannie" rows of a workbook and look up each serial number.

    Reads the selected columns of *filename*, keeps the rows whose
    'Subject Organization' contains "Fannie" (case-insensitive), writes them
    to Data.xlsx and calls fetch_by_ser_no for every serial number.

    Returns the filtered rows with the lookup results in an extra 'Output'
    column. Exits the process with an error message if the workbook cannot
    be read.
    """
    try:
        # The converter left-pads every serial number with zeros to 32 characters.
        data = pd.read_excel(filename, usecols=[4, 18, 19, 20, 26, 27, 28],
                             converters={'Serial Number': '{:0>32}'.format})
    except Exception as e:
        sys.exit("Error reading %s: %s" % (filename, e))
    # Assign the result back: fillna(inplace=True) on a selected column is a
    # chained operation that may act on a temporary copy.
    data["Subject Organization"] = data["Subject Organization"].fillna("N/A")
    df = data[data['Subject Organization'].str.contains("Fannie", case=False)]
    print(df)
    df.to_excel(r'Data.xlsx', index=False)
    output = df['Serial Number'].apply(fetch_by_ser_no)
    df2 = pd.DataFrame(output)
    df2.columns = ['Output']
    # Hand the combined frame back to the caller instead of discarding it.
    return pd.concat([df, df2], axis=1)
The problem I am facing is that I want to check whether the result returned by fetch_by_ser_no() (stored in df5) is blank and, if it is, make the serial number 34 characters long by adding two more leading zeros and then call the function again.
How can I do it by not creating multiple dataframe
Any help!!
Thanks
You can try to use if ... else ...:
output = df['Serial Number'].apply(lambda x: 'ok' if fetch_by_ser_no(x) else 'badly')

("'function' object is not subscriptable", 'occurred at index 0')

I have a dataframe (maple) that, amongst others, has the columns 'THM', which is filled with float64 and 'Season_index', which is filled with int64. The 'THM' column has some missing values, and I want to fill them using the following function:
def fill_thm(cols):
THM = cols[0]
Season_index = cols[1]
if pd.isnull[THM]:
if Season_index == 1:
return 10
elif Season_index == 2:
return 20
elif Season_index == 3:
return 30
else:
return 40
else:
return THM
Then, to apply the function I used
maple['THM']= maple[['THM','Season_index']].apply(fill_thm,axis=1)
But I am getting the ("'function' object is not subscriptable", 'occurred at index 0') error. Anyone has any idea why? Thanks!
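pd.isnull is a function, so it has to be called with parentheses, pd.isnull(THM); indexing it with square brackets, pd.isnull[THM], is what raises "'function' object is not subscriptable". You can also pass the two values to the function explicitly. Try this: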
def fill_thm(THM, S_i):
    """Return *THM*, or a season-based default when *THM* is missing.

    THM: the value to keep when it is present (may be NaN/None).
    S_i: season index; 1, 2 and 3 map to 10, 20 and 30, anything else to 40.
    """
    # pd.isnull is a function: it must be called with (), not indexed with [].
    if not pd.isnull(THM):
        return THM
    season_defaults = {1: 10, 2: 20, 3: 30}
    return season_defaults.get(S_i, 40)
And apply with:
# fill_thm returns one scalar per row, so the row-wise apply already yields a
# Series aligned with maple's index; wrapping it in pd.Series would turn the
# result into a one-column DataFrame.
maple['THM'] = maple[['THM', 'Season_index']].apply(
    lambda row: fill_thm(row['THM'], row['Season_index']), axis=1)
Try this code:
def fill(cols):
    """Return the age from *cols*, or a class-based default when it is missing.

    cols: a row whose first two elements are (Age, Pclass), e.g. one row of
    train[['Age', 'Pclass']]. A missing age becomes 37 for Pclass 1, 30 for
    Pclass 2 and 28 otherwise.
    """
    # Unpack by position so the lookup does not depend on the row's labels.
    age, pclass, *_ = cols
    # Test the value itself: pd.isnull['Age'] subscripted the function with a
    # string literal instead of calling it.
    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 37
    if pclass == 2:
        return 30
    return 28
# Assign through a plain column key; train[:, 'Age'] does not address the 'Age' column.
train['Age'] = train[['Age', 'Pclass']].apply(fill, axis=1)
First of all, when you use apply on a single column (a Series), you do not need to specify axis=1; it is only needed when a function is applied row by row to a DataFrame.
Second, if you are using pandas 0.22, just upgrade to 0.24. It solves all the issues with apply on DataFrames.

pandas histogram plot error: ValueError: num must be 1 <= num <= 0, not 1

I am drawing a histogram of a column from pandas data frame:
%matplotlib notebook
import matplotlib.pyplot as plt
import matplotlib
df.hist(column='column_A', bins = 100)
but got the following errors:
62 raise ValueError(
63 "num must be 1 <= num <= {maxn}, not {num}".format(
---> 64 maxn=rows*cols, num=num))
65 self._subplotspec = GridSpec(rows, cols)[int(num) - 1]
66 # num - 1 for converting from MATLAB to python indexing
ValueError: num must be 1 <= num <= 0, not 1
Does anyone know what this error means? Thanks!
Problem
The problem you encounter arises when column_A does not contain numeric data. As you can see in the excerpt from pandas.plotting._core below, the numeric data is essential to make the function hist_frame (which you call by DataFrame.hist()) work correctly.
def hist_frame(data, column=None, by=None, grid=True, xlabelsize=None,
xrot=None, ylabelsize=None, yrot=None, ax=None, sharex=False,
sharey=False, figsize=None, layout=None, bins=10, **kwds):
# skipping part of the code
# ...
if column is not None:
if not isinstance(column, (list, np.ndarray, Index)):
column = [column]
data = data[column]
data = data._get_numeric_data() # there is no numeric data in the column
naxes = len(data.columns) # so the number of axes becomes 0
# naxes is passed to the subplot generating function as 0 and later determines the number of columns as 0
fig, axes = _subplots(naxes=naxes, ax=ax, squeeze=False,
sharex=sharex, sharey=sharey, figsize=figsize,
layout=layout)
# skipping the rest of the code
# ...
Solution
If your problem is to represent numeric data (but not of numeric dtype yet) with a histogram, you need to cast your data to numeric, either with pd.to_numeric or df.astype(a_selected_numeric_dtype), e.g. 'float64', and then proceed with your code.
If your problem is to represent non-numeric data in one column with a histogram, you can call the function hist_series with the following line: df['column_A'].hist(bins=100).
If your problem is to represent non-numeric data in many columns with a histogram, you may resort to a handful of options:
Use matplotlib and create subplots and histograms directly
Update pandas at least to version 0.25
usually is 0
mta['penn'] = [mta_bystation[mta_bystation.STATION == "34 ST-PENN STA"], 'Penn Station']
mta['grdcntrl'] = [mta_bystation[mta_bystation.STATION == "GRD CNTRL-42 ST"], 'Grand Central']
mta['heraldsq'] = [mta_bystation[mta_bystation.STATION == "34 ST-HERALD SQ"], 'Herald Sq']
mta['23rd'] = [mta_bystation[mta_bystation.STATION == "23 ST"], '23rd St']
#mta['portauth'] = [mta_bystation[mta_bystation.STATION == "42 ST-PORT AUTH"], 'Port Auth']
#mta['unionsq'] = [mta_bystation[mta_bystation.STATION == "14 ST-UNION SQ"], 'Union Sq']
mta['timessq'] = [mta_bystation[mta_bystation.STATION == "TIMES SQ-42 ST"], 'Ti

Pandas Apply(), Transform() ERROR = invalid dtype determination in get_concat_dtype

Flowing on from this question, which i link as background, but question is standalone.
4 questions:
I cannot understand the error I see when using apply or transform:
"invalid dtype determination in get_concat_dtype"
Why does ClipNetMean work but the other 2 methods not?
Unsure if or why i need the .copy(deep=True)
Why the slightly different syntax needed to call the InnerFoo function
The DataFrame:
cost
section item
11 1 25
2 100
3 77
4 10
12 5 50
1 39
2 7
3 32
13 4 19
1 21
2 27
The code:
import pandas as pd
import numpy as np
df = pd.DataFrame(data = {'section' : [11,11,11,11,12,12,12,12,13,13,13]
,'item' : [1,2,3,4,5,1,2,3,4,1,2]
,'cost' : [25.,100.,77.,10.,50.,39.,7.,32.,19.,21.,27.]
})
df.set_index(['section','item'],inplace=True)
upper =50
lower = 10
def ClipAndNetMean(cost, upper, lower):
    """Centre *cost* on its own mean, then clip the result to [lower, upper]."""
    centred = cost - cost.mean()
    return centred.clip(lower=lower, upper=upper)
def MiniMean(cost, upper, lower):
    """Average each row of *cost* together with the two bounds.

    Works on a deep copy, so *cost* itself is not modified. The result is a
    one-column frame whose column keeps the default (unnamed) label.
    """
    padded = cost.copy(deep=True)
    for label, bound in (('A', lower), ('B', upper)):
        padded[label] = bound
    return padded.apply(np.mean, axis=1).to_frame()
def InnerFoo(lower, upper):
    """Build a per-group function that averages each row with *lower* and *upper*.

    The returned callable takes a group, works on a deep copy of it and
    returns the row means as a one-column frame with the default label.
    """
    def inner(group):
        padded = group.copy(deep=True)
        for label, bound in (('lwr', lower), ('upr', upper)):
            padded[label] = bound
        row_means = padded.apply(np.mean, axis=1)
        return row_means.to_frame()

    return inner
#These 2 work fine.
print df.groupby(level = 'section').apply(ClipAndNetMean,lower,upper)
print df.groupby(level = 'section').transform(ClipAndNetMean,lower,upper)
#apply works but not transform
print df.groupby(level = 'section').apply(MiniMean,lower,upper)
print df.groupby(level = 'section').transform(MiniMean,lower,upper)
#apply works but not transform
print df.groupby(level = 'section').apply(InnerFoo(lower,upper))
print df.groupby(level = 'section').transform(InnerFoo(lower,upper))
exit()
So to Chris's answer, note that if I add back the column header the methods will work in a Transform call.
see v.columns = ['cost']
def MiniMean(cost, upper, lower):
    """Average each row of *cost* with the two bounds; the result column is named 'cost'.

    Works on a deep copy, so *cost* itself is not modified.
    """
    widened = cost.copy(deep=True)
    widened['A'], widened['B'] = lower, upper
    return widened.apply(np.mean, axis=1).to_frame(name='cost')
def InnerFoo(lower, upper):
    """Build a per-group function that averages each row with *lower* and *upper*.

    The returned callable works on a deep copy of the group and returns the
    row means as a one-column frame whose column is named 'cost'.
    """
    def inner(group):
        widened = group.copy(deep=True)
        widened['lwr'], widened['upr'] = lower, upper
        return widened.apply(np.mean, axis=1).to_frame(name='cost')

    return inner
1 & 2) transform expects something "like-indexed", while apply is flexible. The two failing functions are adding additional columns.
3) In some cases, (e.g. if you're passing a whole DataFrame into a function) it can be necessary to copy to avoid mutating the original. It should not be necessary here.
4) The first two functions take a DataFrame with two parameters and returns data. InnerFoo actually returns another function, so it needs to be called before being passed into apply.