Convert string to column - pandas

I have a simple dataframe and I am running a sentiment analysis on it.
This is the code, with a reproducible example:
import pandas as pd
import transformers
from pysentimiento import SentimentAnalyzer
from pysentimiento import EmotionAnalyzer

analyzer = SentimentAnalyzer(lang="en")
emotion_analyzer = EmotionAnalyzer(lang="en")

data = [['Hello world'], ['I am the best'], ['Nice jacket!']]
df2 = pd.DataFrame(data, columns=['Tweet'])

df2["sentiment"] = df2.apply(lambda row: analyzer.predict(row["Tweet"]), axis=1)
The output of the code above:
Tweet         | sentiment
--------------|-------------------------------------------------------------------------
Hello world   | SentimentOutput(output=POS, probas={POS: 0.999, NEG: 0.001, NEU: 0.000})
I am the best | SentimentOutput(output=POS, probas={POS: 0.999, NEG: 0.001, NEU: 0.000})
Nice jacket!  | SentimentOutput(output=POS, probas={POS: 0.999, NEG: 0.001, NEU: 0.000})
I would like to split the sentiment column and have something like this:
Tweet         | sentiment | Prob_Pos | Prob_Neg | Prob_Neu
--------------|-----------|----------|----------|---------
Hello world   | POS       | 0.99     | 0.001    | 0.000
I am the best | POS       | 0.99     | 0.001    | 0.000
Nice jacket!  | POS       | 0.99     | 0.001    | 0.000

The results must be converted into a pd.Series and then joined back to the DataFrame. This is easiest to do with a function, since the result objects cannot easily be unpacked inline:
analyzer = SentimentAnalyzer(lang="en")

def process(row):
    res = analyzer.predict(row["Tweet"])
    return pd.Series({'sentiment': res.output, **res.probas})

df2 = df2.join(df2.apply(process, axis=1))
df2:
Tweet sentiment NEG NEU POS
0 Hello world NEU 0.000446 0.548691 0.450863
1 I am the best POS 0.000660 0.001529 0.997811
2 Nice jacket! POS 0.000224 0.051520 0.948256
This can also be written so that the analyzer is passed in as a parameter:
def process_with(predictor):
    def process_(row):
        res = predictor.predict(row["Tweet"])
        return pd.Series({'sentiment': res.output, **res.probas})
    return process_

analyzer = SentimentAnalyzer(lang="en")
df2 = df2.join(df2.apply(process_with(analyzer), axis=1))
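If you prefer to avoid apply, here is a minimal alternative sketch. It assumes, as above, that analyzer.predict accepts a single string and returns an object with output and probas attributes:
results = [analyzer.predict(t) for t in df2["Tweet"]]
# Expand the probability dicts into one column per label.
probs = pd.DataFrame([r.probas for r in results], index=df2.index)
df2 = df2.assign(sentiment=[r.output for r in results]).join(probs)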

Related

Create a bar plot in plt when given the bins and heights

I have the following ranges of bins and their desired heights
Range | Height
-------------------------
0.0-0.0905 | 0.02601
0.0905-0.1811| 0.13678
0.1811-0.2716| 0.22647
0.2716-0.3621| 0.31481
0.3621-0.4527| 0.40681
0.4527-0.5432| 0.50200
0.5432-0.6337| 0.58746
0.6337-0.7243| 0.68153
0.7243-0.8148| 0.76208
0.8148-0.9053| 0.86030
0.9053-0.9958| 0.95027
0.9958-1 | 0.99584
The desired outcome is a histogram/bar plot with the edges according to Range and the heights according to Height.
You can split your Range and explode to get the edges of the bins:
import pandas as pd
from io import StringIO
data = StringIO("""Range | Height
-------------------------
0.0-0.0905 | 0.02601
0.0905-0.1811| 0.13678
0.1811-0.2716| 0.22647
0.2716-0.3621| 0.31481
0.3621-0.4527| 0.40681
0.4527-0.5432| 0.50200
0.5432-0.6337| 0.58746
0.6337-0.7243| 0.68153
0.7243-0.8148| 0.76208
0.8148-0.9053| 0.86030
0.9053-0.9958| 0.95027
0.9958-1 | 0.99584""")
df = pd.read_csv(data, sep=r"\s*\|\s*", engine="python", skiprows=[1])
df['Range'] = df['Range'].str.split('-')
df = df.explode('Range').drop_duplicates('Range').astype(float)
This will give you:
Range Height
0 0.0000 0.02601
0 0.0905 0.02601
1 0.1811 0.13678
2 0.2716 0.22647
3 0.3621 0.31481
4 0.4527 0.40681
5 0.5432 0.50200
6 0.6337 0.58746
7 0.7243 0.68153
8 0.8148 0.76208
9 0.9053 0.86030
10 0.9958 0.95027
11 1.0000 0.99584
Then use plt.stairs:
import matplotlib.pyplot as plt
plt.stairs(df['Height'].iloc[1:], edges=df['Range'].values, fill=True)
plt.show()
Output: a filled step plot with bin edges taken from Range and bar heights from Height.
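For comparison, the same plot can be drawn with plt.bar, computing each bar's position and width from consecutive edges; a small sketch using the exploded df from above:
import matplotlib.pyplot as plt
import numpy as np

edges = df['Range'].to_numpy()
heights = df['Height'].iloc[1:].to_numpy()
# align='edge' anchors each bar at its left bin edge; widths come from the gaps.
plt.bar(edges[:-1], heights, width=np.diff(edges), align='edge', edgecolor='black')
plt.show()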

How to compute hypotenuses with a pandas udf, pyspark

I want to write a pandas udf which will take two arguments, cathetus1 and cathetus2, from another dataframe and return the hypotenuse.
from pyspark.sql.types import StructType, StructField, DoubleType

# this data is a list of cathetus pairs.
data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1", DoubleType(), True),
                     StructField("cathetus2", DoubleType(), True)])
df = spark.createDataFrame(data=data, schema=schema)
df.show()
# this creates a dataframe where only the cathetuses are shown.
This is the function I have written so far:
def pandaUdf(cat1, cat2):
    leg1 = []
    leg2 = []
    for i in data:
        x = 0
        leg1.append(i[x])
        leg2.append(i[x+1])
    hypoData.append(np.hypot(leg1, leg2))
    return np.hypot(leg1, leg2)
    #example_series = pd.Series(data)
and I am trying to create a new column in df whose values will be the hypotenuses:
df.withColumn(col('Hypo'), pandaUdf(example_df.cathetus1, example_df.cathetus2)).show()
but this gives me an error --> col should be Column.
I don't understand how to fix this error or why it's even there.
You can apply np.hypot to the two cathetus columns directly, without extracting individual values.
import numpy as np
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import *

data = [(3.0, 4.0), (6.0, 8.0), (3.3, 5.6)]
schema = StructType([StructField("cathetus1", DoubleType(), True),
                     StructField("cathetus2", DoubleType(), True)])
df = spark.createDataFrame(data=data, schema=schema)
df.show()
"""
+---------+---------+
|cathetus1|cathetus2|
+---------+---------+
| 3.0| 4.0|
| 6.0| 8.0|
| 3.3| 5.6|
+---------+---------+
"""
def hypot(cat1: pd.Series, cat2: pd.Series) -> pd.Series:
    return np.hypot(cat1, cat2)

hypot_pandas_df = F.pandas_udf(hypot, returnType=FloatType())
df.withColumn("Hypo", hypot_pandas_df("cathetus1", "cathetus2")).show()
"""
+---------+---------+----+
|cathetus1|cathetus2|Hypo|
+---------+---------+----+
| 3.0| 4.0| 5.0|
| 6.0| 8.0|10.0|
| 3.3| 5.6| 6.5|
+---------+---------+----+
"""

ImageDataGenerator for multi task output in Keras using flow_from_directory

I am creating a multi-task CNN model with two different classification properties (one with 10 classes, the second with 5 classes), and my directory structure looks like this:
- Train
    - image1.jpg
    ...
    - imageN.jpg
- Test
    - image1.jpg
    ...
    - imageN.jpg
- Vald
    - image1.jpg
    ...
    - imageN.jpg
trainLabel is a dataframe containing the Image, PFRType, and FuelType columns.
I am trying to use flow_from_dataframe and my generators are:
trainGen = ImageDataGenerator()
trainGenDf = trainGen.flow_from_dataframe(trainLabel,
                                          directory='../MTLData/train/',
                                          x_col="Image", y_col=["PFRType", "FuelType"],
                                          class_mode='multi_ouput',
                                          target_size=(224,224),
                                          batch_size=32)
The error I get is:
Error when checking target: expected PFR to have shape (10,) but got array with shape (1,)
PFR is a subtask layer with a 10-class output. Here is the model diagram.
You can use flow_from_dataframe.
You just need to parse your csv files containing the labels into a pandas dataframe which maps the filenames to their corresponding labels.
For instance, if the dataframe looks like:
| image_path | label_task_a | label_task_b | subset |
|------------|--------------|--------------|--------|
| image1.jpg | foo | bla | Train |
| ... | ... | ... | ... |
| imageN.jpg | baz | whatever | Vald |
You can create one generator for each subset:
train_generator_task_a = datagen.flow_from_dataframe(
    dataframe=df[df.subset == 'Train'],
    directory='data/Train',
    x_col='image_path',
    y_col=['label_task_a', 'label_task_b'],  # outputs for both tasks
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode='categorical')
Edit 1:
Regarding your error: if you set class_mode='sparse', Keras expects the labels to be 1-D numpy arrays of integer labels. Have you tried setting class_mode='multi_output'?
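A hedged sketch of that suggestion, assuming the labels in trainLabel are integer class indices and that the model's output layers are named 'PFR' (as in the question) and 'FT' (a hypothetical name for the fuel-type head):
# class_mode='multi_output' yields the raw y_col values, one array per task,
# so the model must accept integer labels, i.e. use sparse losses.
trainGenDf = trainGen.flow_from_dataframe(trainLabel,
                                          directory='../MTLData/train/',
                                          x_col='Image',
                                          y_col=['PFRType', 'FuelType'],
                                          class_mode='multi_output',
                                          target_size=(224, 224),
                                          batch_size=32)
model.compile(optimizer='adam',
              loss={'PFR': 'sparse_categorical_crossentropy',
                    'FT': 'sparse_categorical_crossentropy'})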
I have used a custom function for the generator; it doesn't support shuffle so far!
import numpy as np
from PIL import Image
from tensorflow.keras.utils import to_categorical

def get_data_generator(data, split, batch_size=16):
    imagePath = ''
    df = ''
    if split == 'train':
        imagePath = '../MTLData/train/'
        df = data[data.dir == 'train']
    elif split == 'test':
        imagePath = '../MTLData/test/'
        df = data[data.dir == 'test']
    elif split == 'vald':
        imagePath = '../MTLData/vald/'
        df = data[data.dir == 'vald']
    pfrID = len(data.PFRType.unique())
    ftID = len(data.FuelType.unique())
    images, pfrs, fts = [], [], []
    while True:
        for i in range(0, df.shape[0]):
            r = df.iloc[i]
            file, pfr, ft = r['Image'], r['PFRType'], r['FuelType']
            im = Image.open(imagePath + file)
            im = im.resize((224, 224))
            im = np.array(im) / 255.0
            images.append(im)
            pfrs.append(to_categorical(pfr, pfrID))
            fts.append(to_categorical(ft, ftID))
            if len(images) >= batch_size:
                yield np.array(images), [np.array(pfrs), np.array(fts)]
                images, pfrs, fts = [], [], []
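A possible usage sketch, assuming model is the two-output Keras model from the question and data is the label dataframe:
# steps_per_epoch is required because the generator loops forever.
train_gen = get_data_generator(data, 'train', batch_size=16)
steps = len(data[data.dir == 'train']) // 16
model.fit(train_gen, steps_per_epoch=steps, epochs=10)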

Pandas accumulate data for linear regression

I am trying to adjust my data so that total_gross per day is accumulated, e.g.
`Created` `total_gross` `total_gross_accumulated`
Day 1 100 100
Day 2 100 200
Day 3 100 300
Day 4 100 400
Any idea how I have to change my code to have total_gross_accumulated available?
Here is my data.
My code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

def load_event_data():
    df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
    df['created'] = pd.to_datetime(df.created)
    return df.set_index('created').resample('D').sum().fillna(0)

event_data = load_event_data()
X = event_data.index
y = event_data.total_gross
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()
A list comprehension is a Pythonic way to do this.
SHORT answer:
This should give you the new column that you want:
n = event_data.shape[0]
# skip row 0 and accumulate from row 1 through the end
total_gross_accumulated = [event_data['total_gross'][:i].sum() for i in range(1, n+1)]
# add the new variable to the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
Or, faster:
event_data['total_gross_accumulated'] = event_data['total_gross'].cumsum()
LONG answer:
Full code using your data:
import pandas as pd
import matplotlib.pyplot as plt

def load_event_data():
    df = pd.read_csv('sample-data.csv', usecols=['created', 'total_gross'])
    df['created'] = pd.to_datetime(df.created)
    return df.set_index('created').resample('D').sum().fillna(0)

event_data = load_event_data()
n = event_data.shape[0]
# skip row 0 and accumulate from row 1 through the end
total_gross_accumulated = [event_data['total_gross'][:i].sum() for i in range(1, n+1)]
# add the new variable to the initial pandas dataframe
event_data['total_gross_accumulated'] = total_gross_accumulated
Results:
event_data.head(6)
# total_gross total_gross_accumulated
#created
#2019-03-01 3481810 3481810
#2019-03-02 4690 3486500
#2019-03-03 0 3486500
#2019-03-04 0 3486500
#2019-03-05 0 3486500
#2019-03-06 0 3486500
X = event_data.index
y = event_data.total_gross_accumulated
plt.xticks(rotation=90)
plt.plot(X, y)
plt.show()
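The question also imports sklearn's linear_model but never fits anything; a minimal, hedged sketch of a regression on the accumulated totals, assuming days-since-start as the single feature:
import numpy as np
from sklearn import linear_model

# Use the day index as the regressor; datetime indices can't be passed to fit() directly.
X_days = np.arange(len(event_data)).reshape(-1, 1)
reg = linear_model.LinearRegression()
reg.fit(X_days, event_data['total_gross_accumulated'])
trend = reg.predict(X_days)  # fitted straight line over the same days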

Difference between np.linalg.lstsq and linear regression in scikit-learn

comb1 is a pandas dataframe with the following values:
yearID teamID salary W
408 ANA 51464167 82
409 ARI 81027833 85
When I use np.linalg.lstsq, I am able to print the dfg dataframe:
dfg = pd.DataFrame()
comb1 = combined[combined['yearID'] == 2000]
x1 = comb1['salary'].values / 1000000
y1 = comb1['W'].values
A1 = np.array([x1, np.ones(len(x1))])
w1 = np.linalg.lstsq(A1.T, y1)[0]
yq = w1[0]*x1 + w1[1]
dfg['New val'] = y1 - yq
When I use the scikit-learn library for the linear regression and do the same operation, I get a ValueError:
from sklearn.linear_model import LinearRegression

fg = pd.DataFrame()
x2 = comb1['salary'].values / 1000000
y2 = comb1['W'].values
x2_reshape = x2.reshape(-1, 1)
y2_reshape = y2.reshape(-1, 1)
clf1 = LinearRegression()
clf1.fit(x2_reshape, y2_reshape)
predicted_train = clf1.predict(x2_reshape)
x_pre = y2 - predicted_train
fg['New val'] = x_pre
What is the difference between these two? Kindly help me!
They should be the same. From the notes in the scikit-learn documentation for LinearRegression:
From the implementation point of view, this is just plain Ordinary Least Squares (scipy.linalg.lstsq) wrapped as a predictor object.
If you are getting an error, it's probably because of the way you set up your data. In the code above, y2 is 1-D while predicted_train has shape (n, 1), so y2 - predicted_train broadcasts to an (n, n) array, which cannot be assigned to a single dataframe column.
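A minimal sketch, assuming x2 and y2 from the question are 1-D NumPy arrays, that checks the two fits agree and sidesteps the shape mismatch:
import numpy as np
from sklearn.linear_model import LinearRegression

# Least-squares fit as in the question (rcond=None silences a NumPy warning).
A = np.array([x2, np.ones(len(x2))])
w = np.linalg.lstsq(A.T, y2, rcond=None)[0]            # [slope, intercept]

# scikit-learn fit on the same data, with 1-D targets.
clf = LinearRegression().fit(x2.reshape(-1, 1), y2)
print(np.allclose(w, [clf.coef_[0], clf.intercept_]))  # expect: True

# Flatten the (n, 1) prediction before subtracting from 1-D y2; otherwise
# broadcasting produces an (n, n) array.
x_pre = y2 - clf.predict(x2.reshape(-1, 1)).ravel()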