How to use multiprocess.Pool with .apply function? - python-multiprocessing

I have a function that processes a string and I am applying it on a dataframe column
import pandas as pd
import numpy as np
def test_upper(d):
return d.upper()
def mainfunc():
df = pd.read_csv("file.csv", sep='\t', encoding='utf-8')
print(df.head())
lambdafunc = lambda x: test_upper(x)
df['upper_cols'] = df['cols'].apply(lambdafunc)
print(df.head())
mainfunc()
Now, I want to do the same but with multiprocessing.Pool. I have searched for how to do this here on Stack Overflow, and this is what I came up with:
import pandas as pd
import numpy as np
import multiprocessing as mp
def test_upper(d):
return d.upper()
def mainfunc():
df = pd.read_csv("file.csv", sep='\t', encoding='utf-8')
print(df.head())
lambdafunc = lambda x: test_upper(x)
list_results = pd.Series()
def log_result(result):
list_results.append(result)
pool = mp.Pool(processes=4)
pool.apply_async(lambdafunc, (df['cols'], ), callback=log_result)
pool.close()
pool.join()
print(list_results)
mainfunc()
The result is an empty Series (or an empty list; I've tried both). What am I doing wrong here?
Thank you!

Finally figured it out
def test_upper(d):
output = d.apply(lambda x: x:upper())
return output
def mainfunc():
    """Upper-case the 'cols' column of file.csv in a worker process and print it.

    Reads the tab-separated ``file.csv``, prints its head, hands the whole
    'cols' column to ``test_upper`` through a 4-process pool, and prints the
    Series that comes back.
    """
    df = pd.read_csv("file.csv", sep='\t', encoding='utf-8')
    print(df.head())
    # The context manager releases the worker processes even if the task raises.
    with mp.Pool(processes=4) as pool:
        # A single task carries the entire column, so one worker does the work.
        result = pool.apply_async(test_upper, (df['cols'], ))
        # get() blocks until the task finishes and re-raises any exception
        # raised in the worker; it must run before the pool is torn down.
        upper_cols = result.get()
    print(upper_cols)


# Worker processes may re-import this module (spawn start method); the guard
# keeps them from calling mainfunc() again and creating pools recursively.
if __name__ == '__main__':
    mainfunc()

Related

ThreadPoolExecutor DataFrame

I am dealing with a simple loop.
I have a slightly larger dataframe and I would like to make better use of the processor (CPU usage is currently about 2%).
I tried this:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor
scan = pd.DataFrame([[0,2,3,5],[4,2,7,7], [5,6,2,3]], columns=['st1','nd1','st2','nd2'])
# Builds one frame from the module-level `scan`: for the column pairs (0, 1) and (2, 3)
# it copies both columns, adds their sum as a 'th' column, and concatenates the pieces
# side by side (axis=1) into `calc_all`, which is returned.
# NOTE(review): `value` is never used, so every call does the same work.
# NOTE(review): indentation was lost in this listing; the loop body presumably runs
# through the second pd.concat, with the sleep and return after the loop — confirm.
def task(value):
calc_all = pd.DataFrame()
for i in range(0,3,2): # i takes the values 0 and 2
j=i+1
calc = pd.concat([pd.DataFrame(scan.iloc[:,i]), pd.DataFrame(scan.iloc[:,j])],axis=1)
calc['th'] = calc.iloc[:,0] + calc.iloc[:,1]
calc_all = pd.concat([calc_all, calc], axis=1)
time.sleep(1) #tested time
return calc_all
if __name__ == '__main__':
with ThreadPoolExecutor(2) as exe:
for result in exe.map(task, range(2)):
print(result)
It's not faster. What did I do wrong?

Read web content into a dataframe without writing to a file

I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
# Parse the response one line at a time, starting at index 1
# (index 0 is skipped — presumably the CSV header line; confirm).
for ii in range(1,len(str1)):
if len(str1[ii])>0:
df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each non-empty line into a one-row dataframe
if not df1.empty:
df2 = df1.iloc[:,0:3] #Get the first three columns (named Station, Date, Temp below)
if df2.iloc[0,-1] != 'M': #Don't append rows whose last kept column is 'M' (the marker requested via missing=M in the URL)
dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
# Download the CSV text in memory; nothing is written to disk.
with requests.Session() as request:
    response = request.get(url, timeout=30)
    # raise_for_status() returns None on success and raises requests.HTTPError
    # on a 4xx/5xx reply, so call it directly instead of printing its result.
    response.raise_for_status()
# Let pandas parse the whole payload at once from an in-memory text buffer.
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)

What kind of plot this is? [duplicate]

I'm trying to get the errorbars to show at the confidence interval's limits, and not in the center.
What I want is this:
but what I'm getting is this:
To plot the bar chart I used this:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12345)
df = pd.DataFrame([np.random.normal(32000,200000,3650),
np.random.normal(43000,100000,3650),
np.random.normal(43500,140000,3650),
np.random.normal(48000,70000,3650)],
index=[1992,1993,1994,1995])
df1 = df.T
df1.columns = ['1992', '1993','1994','1995']
a = df1.describe()
means = a.loc['mean'].values.tolist()
stdevs = a.loc['std'].values.tolist()
counts = a.loc['count'].values.tolist()
index = np.arange(len(df1.columns))
# Half-width of each year's confidence interval: 1.96 * std / sqrt(count),
# pairing every standard deviation with its sample count.
CI = [1.96 * sd / (n ** 0.5) for sd, n in zip(stdevs, counts)]
#print(means, CI)
plt.figure()
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xticks(index)
ax.set_xticklabels(df1.columns)
plt.bar(index, means, xerr = 0.1, yerr=CI)
plt.tight_layout()
plt.show()
The error bars are showing as expected. You have set a 0.1 value for the x error, however in your expected result image, there is no x errorbar so we can remove that. Secondly, we can increase the capsize of your error bars so that they are actually visible by using the capsize= in the call to plt.bar():
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(12345)
df = pd.DataFrame([np.random.normal(32000,200000,3650),
np.random.normal(43000,100000,3650),
np.random.normal(43500,140000,3650),
np.random.normal(48000,70000,3650)],
index=[1992,1993,1994,1995])
df1 = df.T
df1.columns = ['1992', '1993','1994','1995']
a = df1.describe()
means = a.loc['mean'].values.tolist()
stdevs = a.loc['std'].values.tolist()
counts = a.loc['count'].values.tolist()
index = np.arange(len(df1.columns))
# Half-width of each year's confidence interval: 1.96 * std / sqrt(count),
# pairing every standard deviation with its sample count.
CI = [1.96 * sd / (n ** 0.5) for sd, n in zip(stdevs, counts)]
fig, ax = plt.subplots(figsize=(10,10))
ax.set_xticks(index)
ax.set_xticklabels(df1.columns)
plt.bar(index, means, yerr=CI, capsize=10)
plt.tight_layout()
plt.show()

when read data from file: output {'y': array([-0.21]), 'x': array([1.63])}

I am reading data from a file that contains only two numbers (1.63, -0.21). The output is:
{'y': array([-0.21]), 'x': array([1.63])}
I need the output like this:
position = {'x': 1.63 , 'y' : -0.21}
this my code:
import pandas as pd
import numpy as np
def read():
data = pd.read_csv('distance.csv', skipinitialspace=True, header=None)
x0, y0 = np.array(data.ix[:,0]), np.array(data.ix[:,1])
position = {'x': x0 , 'y' : y0}
print position
if __name__ == '__main__':
try:
read()
except KeyboardInterrupt:
rospy.loginfo('Shutting down')
please help me
Thank you in advance
Change this
x0, y0 = np.array(data.ix[:,0]), np.array(data.ix[:,1]) to x0, y0 = data.ix[:,0], data.ix[:,1]
Essentially, you need to remove the np.array wrapping, which is converting your float to an ndarray.
try this
import numpy as np
data = np.genfromtxt('distance.csv', dtype=list).tolist()
x0,y0 = float(data[0]), float(data[1])
position = {'x': x0 , 'y' : y0}
print position
the output is:
{'y': -0.7, 'x': 1.7}
This way worked, but it is long:
import pandas as pd
import numpy as np
import csv
def read():
    """Print the position stored on the first row of ``distance.csv``.

    The first row is expected to hold two comma-separated numbers, ``x, y``.
    Both are converted to plain Python floats so the printed dict shows bare
    numbers rather than numpy arrays or numpy scalars.

    Raises:
        ValueError: if the file has no rows, or a field is not a number.
        IndexError: if the first row has fewer than two fields.
    """
    data_path = 'distance.csv'
    # newline='' lets the csv module handle line endings itself.
    with open(data_path, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # Only the first row is needed; None signals an empty file.
        first_row = next(reader, None)
    if first_row is None:
        raise ValueError('%s is empty' % data_path)
    # float() ignores the padding around fields such as "1.63 , -0.21".
    x0, y0 = float(first_row[0]), float(first_row[1])
    position = {'x': x0, 'y': y0}
    print(position)
if __name__ == '__main__':
try:
read()
except KeyboardInterrupt:
rospy.loginfo('Shutting down')
output:
{'y': -0.21, 'x': 1.63}
also this way is good:
import pandas as pd
import numpy as np
def read():
data = np.genfromtxt('distance.csv', dtype=str, delimiter=',')
x0, y0 = (data[0]), (data[1])
position = {'x': x0 , 'y' : y0}
print position
if __name__ == '__main__':
try:
read()
except KeyboardInterrupt:
rospy.loginfo('Shutting down')
output:
{'y': '-0.21', 'x': '1.63'}

Figures names in Pandas Boxplots

I created 2 boxplots using pandas.
Then each figure gets referenced with plt.gcf()
When trying to show the plots, only the last boxplot gets shown. It's like fig1 is getting overwritten.
What is the correct way of showing both boxplots?
This is the sample code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dates = pd.date_range('20000101', periods=10)
df = pd.DataFrame(index=dates)
df['A'] = np.cumsum(np.random.randn(10))
df['B'] = np.random.randint(-1,2,size=10)
df['C'] = range(1,11)
df['D'] = range(12,22)
# first figure
ax_boxplt1 = df[['A','B']].boxplot()
fig1 = plt.gcf()
# second figure
ax_boxplt2 = df[['C','D']].boxplot()
fig2 = plt.gcf()
# print figures
figures = [fig1,fig2]
for fig in figures:
print(fig)
Create a figure with two axes and plot to each of them separately
fig, axes = plt.subplots(2)
dates = pd.date_range('20000101', periods=10)
df = pd.DataFrame(index=dates)
df['A'] = np.cumsum(np.random.randn(10))
df['B'] = np.random.randint(-1,2,size=10)
df['C'] = range(1,11)
df['D'] = range(12,22)
# first figure
df[['A','B']].boxplot(ax=axes[0]) # Added `ax` parameter
# second figure
df[['C','D']].boxplot(ax=axes[1]) # Added `ax` parameter
plt.show()
In order to get two figures, define the figure before plotting to it. You can use a number to enumerate the figures.
plt.figure(1)
# do something with the first figure
plt.figure(2)
# do something with the second figure
Complete example:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
dates = pd.date_range('20000101', periods=10)
df = pd.DataFrame(index=dates)
df['A'] = np.cumsum(np.random.randn(10))
df['B'] = np.random.randint(-1,2,size=10)
df['C'] = range(1,11)
df['D'] = range(12,22)
# first figure
fig1=plt.figure(1)
ax_boxplt1 = df[['A','B']].boxplot()
# second figure
fig2=plt.figure(2)
ax_boxplt2 = df[['C','D']].boxplot()
plt.show()