New column with word at nth position of string from other column pandas

import numpy as np
import pandas as pd
d = {'ABSTRACT_ID': [14145090,1900667, 8157202,6784974],
'TEXT': [
"velvet antlers vas are commonly used in tradit",
"we have taken a basic biologic RPA to elucidat4",
"ceftobiprole bpr is an investigational cephalo",
"lipoperoxidationderived aldehydes for example",],
'LOCATION': [1, 4, 2, 1]}
df = pd.DataFrame(data=d)
df
def word_at_pos(x, y):
    # x: 1-based word position, y: the text to scan
    pos = x
    string = y
    count = 0
    res = ""
    for char in string:  # walk the string character by character
        if char == ' ':
            count = count + 1
            if count == pos:
                break
            res = ""  # not the word we want, discard and keep scanning
        else:
            res = res + char
    print(res)
word_at_pos(df.iloc[0,2],df.iloc[0,1])
For this df I want to create a new column WORD that contains the word from TEXT at the position indicated by LOCATION, e.g. the first line would be "velvet".
I can do this for a single line with the isolated function word_at_pos(x, y), but I can't work out how to apply it to the whole column. I have created new columns with lambda functions before, but can't work out how to fit this function into a lambda.

Looping over TEXT and LOCATION is probably the best approach here: splitting each string produces a jagged array, so filtering with NumPy advanced indexing won't be possible.
df["WORDS"] = [txt.split()[loc] for txt, loc in zip(df["TEXT"], df["LOCATION"]-1)]
print(df)
ABSTRACT_ID ... WORDS
0 14145090 ... velvet
1 1900667 ... a
2 8157202 ... bpr
3 6784974 ... lipoperoxidationderived
[4 rows x 4 columns]
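If you prefer the lambda style mentioned in the question, the same extraction can be written with apply (a sketch; it assumes LOCATION is 1-based, as in the example data):
# row-wise apply: split TEXT and take the (LOCATION - 1)-th word
df["WORDS"] = df.apply(lambda r: r["TEXT"].split()[r["LOCATION"] - 1], axis=1)
That said, the zip-based list comprehension above avoids the per-row overhead of apply, so it is usually the faster option.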

Recursively update the dataframe

I have a dataframe called datafe in which I want to combine the hyphenated words.
For example, the input dataframe looks like this:
,author_ex
0,Marios
1,Christodoulou
2,Intro-
3,duction
4,Simone
5,Speziale
6,Exper-
7,iment
And the output dataframe should look like this:
,author_ex
0,Marios
1,Christodoulou
2,Introduction
3,Simone
4,Speziale
5,Experiment
I have written some sample code to achieve this, but I am not able to get out of the recursion safely.
def rm_actual(datafe, index):
    stem1 = datafe.iloc[index]['author_ex']
    stem2 = datafe.iloc[index + 1]['author_ex']
    fixed_token = stem1[:-1] + stem2
    datafe.drop(index=index + 1, inplace=True, axis=0)
    newdf = datafe.reset_index(drop=True)
    newdf.iloc[index]['author_ex'] = fixed_token
    return newdf
def remove_hyphens(datafe):
    for index, row in datafe.iterrows():
        flag = False
        token = row['author_ex']
        if token[-1:] == '-':
            datafe = rm_actual(datafe, index)
            flag = True
            break
    if flag == True:
        datafe = remove_hyphens(datafe)
    if flag == False:
        return datafe
datafe = remove_hyphens(datafe)
print(datafe)
Is there any possibility of getting out of this recursion with the expected output?
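One way to sidestep the recursion entirely is a single pass that builds the merged tokens as it goes (a sketch, assuming the author_ex column from the question):
import pandas as pd
datafe = pd.DataFrame({'author_ex': ['Marios', 'Christodoulou', 'Intro-',
                                     'duction', 'Simone', 'Speziale',
                                     'Exper-', 'iment']})
tokens = []
for token in datafe['author_ex']:
    if tokens and tokens[-1].endswith('-'):
        # the previous token is a hyphenated fragment: glue this one onto it
        tokens[-1] = tokens[-1][:-1] + token
    else:
        tokens.append(token)
datafe = pd.DataFrame({'author_ex': tokens})
print(datafe)
Because a merged token can itself still end with '-', this also handles a word split across three or more rows.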
Another option:
Given/Input:
author_ex
0 Marios
1 Christodoulou
2 Intro-
3 duction
4 Simone
5 Speziale
6 Exper-
7 iment
Code:
import pandas as pd
# read/open file or create dataframe
df = pd.DataFrame({'author_ex': ['Marios', 'Christodoulou', 'Intro-',
                                 'duction', 'Simone', 'Speziale', 'Exper-', 'iment']})
# check input format
print(df)
# flag rows whose previous 'author_ex' value ends with '-',
# i.e. rows holding the second half of a hyphenated word
df['Ending'] = df['author_ex'].shift(1).str.contains('-$', na=False, regex=True)
# remove the trailing '-' from the 'author_ex' column
df['author_ex'] = df['author_ex'].str.replace('-$', '', regex=True)
# create new column with values of 'author_ex' and shifted 'author_ex' concatenated together
df['author_ex_combined'] = df['author_ex'] + df.shift(-1)['author_ex']
# shift the flags up one row: True now marks rows that begin a hyphenated word
index = (df['Ending'] == True).shift(-1)
# set the last row to 'False' after it was shifted
index.iloc[-1] = False
# replace 'author_ex' with 'author_ex_combined' based on true/false of index series
df.loc[index,'author_ex'] = df['author_ex_combined']
# remove rows that have the 2nd part of the 'author_ex' string and are no longer required
df = df[~df.Ending]
# remove the extra columns
df.drop(['Ending', 'author_ex_combined'], axis = 1, inplace=True)
# output final dataframe
print('\n\n')
print(df)
# notice index 3 and 6 are missing
Outputs:
author_ex
0 Marios
1 Christodoulou
2 Introduction
4 Simone
5 Speziale
6 Experiment
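Note that the shift-based version merges only pairs of adjacent rows; a word split across three or more rows (two consecutive trailing hyphens) would not be fully recombined. The single-pass loop sketched above handles that case.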

Pandas get row if column is a substring of string

I can do the following if I want to extract rows whose column "A" contains the substring "hello".
df[df['A'].str.contains("hello")]
How can I select rows whose column value is a substring of another word? e.g.
df["hello".contains(df['A'].str)]
Here's an example dataframe
df = pd.DataFrame.from_dict({"A":["hel"]})
df["hello".contains(df['A'].str)]
IIUC, you could apply str.find:
import pandas as pd
df = pd.DataFrame(['hell', 'world', 'hello'], columns=['A'])
res = df[df['A'].apply("hello".find).ne(-1)]
print(res)
Output
A
0 hell
2 hello
As an alternative, use __contains__:
res = df[df['A'].apply("hello".__contains__)]
print(res)
Output
A
0 hell
2 hello
Or simply:
res = df[df['A'].apply(lambda x: x in "hello")]
print(res)
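One caveat that applies to all three variants: in Python an empty string is a substring of everything, so empty values in A will always be kept.
print("hello".find(""))  # 0, so .ne(-1) keeps empty strings
print("" in "hello")     # True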

Pandas dataframe append to column containing list

I have a pandas dataframe with one column that contains an empty list in each cell.
I need to duplicate the dataframe, and append it at the bottom of the original dataframe, but with additional information in the list.
Here is a minimal code example:
df_main = pd.DataFrame([['a', []], ['b', []]], columns=['letter', 'mylist'])
> df_main
letter mylist
0 a []
1 b []
df_copy = df_main.copy()
for index, row in df_copy.iterrows():
    row.mylist = row.mylist.append(1)
pd.concat([ df_copy,df_main], ignore_index=True)
> result:
letter mylist
0 a None
1 b None
2 a [1]
3 b [1]
As you can see, there is a problem: the empty [] lists were replaced by None.
Just to make sure, this is what I would like to have:
letter mylist
0 a []
1 b []
2 a [1]
3 b [1]
How can I achieve that?
The append method on a list returns None, which is why None appears in the final dataframe. You can use the + operator and reassign instead, like this:
import pandas as pd
df_main = pd.DataFrame([['a', []], ['b', []]], columns=['letter', 'mylist'])
df_copy = df_main.copy()
for index, row in df_copy.iterrows():
    row.mylist = row.mylist + list([1])
pd.concat([df_main, df_copy], ignore_index=True).head()
Output of this block of code:
letter mylist
0 a []
1 b []
2 a [1]
3 b [1]
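The root cause is easy to see in plain Python, without pandas:
mylist = []
print(mylist.append(1))  # None: append mutates in place and returns None
print(mylist)            # [1]
print(mylist + [2])      # [1, 2]: + builds and returns a new list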
A workaround would be to create a temporary column mylist2 filled with empty lists via np.empty((len(df), 0)).tolist(), use np.where() to change the None values of mylist to an empty list, and then drop the temporary column.
import pandas as pd, numpy as np
df_main = pd.DataFrame([['a', []], ['b', []]], columns=['letter', 'mylist'])
df_copy = df_main.copy()
for index, row in df_copy.iterrows():
    row.mylist = row.mylist.append(1)
df = (pd.concat([df_copy, df_main], ignore_index=True)
      .assign(mylist2=lambda d: np.empty((len(d), 0)).tolist()))
df['mylist'] = np.where((df['mylist'].isnull()), df['mylist2'], df['mylist'])
df= df.drop('mylist2', axis=1)
df
Out[1]:
letter mylist
0 a []
1 b []
2 a [1]
3 b [1]
Not only does the append method on a list return None, as noted in the first answer, but df_main and df_copy also contain pointers to the same list objects. So after:
for index, row in df_copy.iterrows():
    row.mylist.append(1)
both dataframes end up with the updated one-element lists. For your code to work as expected, create new lists after you copy the dataframe:
df_copy = df_main.copy()
for index, row in df_copy.iterrows():
    row.mylist = []
This question is another great example of why we should not put mutable objects in a dataframe.
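The shared-pointer behaviour is easy to demonstrate: df.copy() deep-copies the frame's values, but for object columns those values are references to the same Python lists.
df_main = pd.DataFrame([['a', []], ['b', []]], columns=['letter', 'mylist'])
df_copy = df_main.copy()           # copies the frame, not the lists inside it
df_copy.loc[0, 'mylist'].append(99)
print(df_main.loc[0, 'mylist'])    # [99]: the original sees the change too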

Iterating over columns in data frame by skipping first column and drawing multiple plots

I have a data frame like the following:
df.head()
ID AS_FP AC_FP RP11_FP RP11_be AC_be AS_be Info
AE02 0.060233 0 0.682884 0.817115 0.591182 0.129252 SAP
AE03 0 0 0 0.889181 0.670113 0.766243 SAP
AE04 0 0 0.033256 0.726193 0.171861 0.103839 others
AE05 0 0 0.034988 0.451329 0.431836 0.219843 others
What I am aiming for is to plot each column from AS_FP through RP11_be with lmplot, where each x axis is a column ending with FP and the y axis is its corresponding column ending with be.
I wanted to save each plot as a separate file, so I started iterating through the columns, skipping the first column ID, like this:
for ind, column in enumerate(df.columns):
    if column.split('_')[0] == column.split('_')[0]:
But I got lost on how to continue. I need to plot:
sns.lmplot(x, y, data=df, hue='Info', palette=colors, fit_reg=False,
           size=10, scatter_kws={"s": 700}, markers=["o", "v"])
and save each image as a separate file.
Straightforward solution:
1) Toy data:
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns
dct = OrderedDict()
dct["ID"] = ["AE02", "AE03", "AE04", "AE05"]
dct["AS_FP"] = [0.060233, 0, 0, 0]
dct["AC_FP"] = [0, 0,0, 0]
dct["RP11_FP"] = [0.682884, 0, 0.033256, 0.034988]
dct["AS_be"] = [0.129252, 0.766243, 0.103839, 0.219843]
dct["AC_be"] = [0.591182, 0.670113, 0.171861, 0.431836]
dct["RP11_be"] = [0.817115, 0.889181, 0.726193, 0.451329]
dct["Info"] = ["SAP", "SAP", "others", "others"]
df = pd.DataFrame(dct)
2) Iterating through pairs, saving each figure with unique filename:
graph_cols = [col for col in df.columns if ("_FP" in col) or ("_be" in col)]
fps = sorted([col for col in graph_cols if "_FP" in col])
bes = sorted([col for col in graph_cols if "_be" in col])
for x, y in zip(fps, bes):
    snsplot = sns.lmplot(x, y, data=df, fit_reg=False, hue='Info',
                         size=10, scatter_kws={"s": 700})
    snsplot.savefig(x.split("_")[0] + ".png")
You can add whatever parameters you need to lmplot.
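Note that newer seaborn releases renamed lmplot's size parameter to height and take x and y as keyword arguments, so on a current install the call would look something like this:
snsplot = sns.lmplot(data=df, x=x, y=y, fit_reg=False, hue='Info',
                     height=10, scatter_kws={"s": 700})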

apply generic function in a vectorized fashion using numpy/pandas

I am trying to vectorize my code and, thanks in large part to some users (https://stackoverflow.com/users/3293881/divakar, https://stackoverflow.com/users/625914/behzad-nouri), I was able to make huge progress. Essentially, I am trying to apply a generic function (in this case max_dd_array_ret) to each of the bins I found (see vectorize complex slicing with pandas dataframe for details on the date vectorization and Start, End and Duration of Maximum Drawdown in Python for the rationale behind max_dd_array_ret). The problem is the following: I should be able to obtain the result df_2, and ranged_DD(asd_1.values, starts, ends+1) is close to what I am looking for, except that the first two bins appear to be merged and the last one is missing, as can be gauged by looking at the results.
Any explanation and fix is very welcome.
import pandas as pd
import numpy as np
from time import time
from scipy.stats import binned_statistic
def max_dd_array_ret(xs):
    xs = (xs + 1).cumprod()
    i = np.argmax(np.maximum.accumulate(xs) - xs)  # end of the period
    j = np.argmax(xs[:i])
    max_dd = abs(xs[j] / xs[i] - 1)
    return max_dd if max_dd is not None else 0
def get_ranges_arr(starts, ends):
    # Taken from https://stackoverflow.com/a/37626057/3293881
    counts = ends - starts
    counts_csum = counts.cumsum()
    id_arr = np.ones(counts_csum[-1], dtype=int)
    id_arr[0] = starts[0]
    id_arr[counts_csum[:-1]] = starts[1:] - ends[:-1] + 1
    return id_arr.cumsum()
def ranged_DD(arr, starts, ends):
    # Get all indices and the IDs corresponding to same groups
    idx = get_ranges_arr(starts, ends)
    id_arr = np.repeat(np.arange(starts.size), ends - starts)
    slice_arr = arr[idx]
    return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret)[0]
asd_1 = pd.Series(0.01 * np.random.randn(500), index=pd.date_range('2011-1-1', periods=500)).pct_change()
index_1 = pd.to_datetime(['2011-2-2', '2011-4-3', '2011-5-1','2011-7-2', '2011-8-3', '2011-9-1','2011-10-2', '2011-11-3', '2011-12-1','2012-1-2', '2012-2-3', '2012-3-1',])
index_2 = pd.to_datetime(['2011-2-15', '2011-4-16', '2011-5-17','2011-7-17', '2011-8-17', '2011-9-17','2011-10-17', '2011-11-17', '2011-12-17','2012-1-17', '2012-2-17', '2012-3-17',])
starts = asd_1.index.searchsorted(index_1)
ends = asd_1.index.searchsorted(index_2)
df_2 = pd.DataFrame([max_dd_array_ret(asd_1.loc[i:j]) for i, j in zip(index_1, index_2)], index=index_1)
print(df_2[0].values)
print(ranged_DD(asd_1.values, starts, ends+1))
results:
df_2
[ 1.75893509 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085
1.43863472 1.85322338 1.84767224 1.32605754 1.48688414 5.44786663]
ranged_DD(asd_1.values, starts, ends+1)
[ 6.08002911 2.60131797 1.55631781 1.8770067 2.50709085 1.43863472
1.85322338 1.84767224 1.32605754 1.48688414]
which are identical except for the first two:
[ 1.75893509 6.08002911 vs [ 6.08002911
and the last two
1.48688414 5.44786663] vs 1.48688414]
P.S.: while looking in more detail at the docs (http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.binned_statistic.html), I found that this might be the problem:
"All but the last (righthand-most) bin is half-open. In other words,
if bins is [1, 2, 3, 4], then the first bin is [1, 2) (including 1,
but excluding 2) and the second [2, 3). The last bin, however, is [3,
4], which includes 4. New in version 0.11.0."
The problem is I don't know how to change it.
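Beyond the bin-edge note, the likely culprit is that binned_statistic defaults to 10 equal-width bins spanning the range of the data, so the 12 integer group IDs get squeezed into 10 bins: IDs 0 and 1 share the first bin and IDs 10 and 11 share the last, which matches the merged first result and the missing last one. A sketch of the fix, passing one explicit bin per integer ID:
# inside ranged_DD, give each integer group ID its own bin
# (edges at -0.5, 0.5, ..., n_groups - 0.5)
n_groups = starts.size
return binned_statistic(id_arr, slice_arr, statistic=max_dd_array_ret,
                        bins=n_groups, range=(-0.5, n_groups - 0.5))[0]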