Printing the JSON result of a request - pyspark - sql

I would appreciate some help with this. Given the API call below, how could I print the JSON results?
import requests
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, StringType
from pyspark.sql.functions import udf

class Test:
    def Test1():
        r = requests.get("https://mylink")
        if r.status_code != 200:
            print("Error calling geo-cache: {} - {}".format(r.status_code, r.text))
        else:
            return r.json()
Thanks in advance!
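A minimal sketch of one way to print the returned JSON, assuming the method is kept as written (no self) and called on the class; json.dumps is only used here for pretty-printing:

import json

result = Test.Test1()  # returns the parsed JSON (a dict or list) on success, None otherwise
if result is not None:
    print(json.dumps(result, indent=2))  # pretty-print the payload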

Related

ThreadPoolExecutor DataFrame

I am dealing with a simple loop.
I have a slightly larger dataframe and I would like to make better use of the processor (currently at about 2% utilization).
I tried this:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor

scan = pd.DataFrame([[0,2,3,5],[4,2,7,7], [5,6,2,3]], columns=['st1','nd1','st2','nd2'])

def task(value):
    calc_all = pd.DataFrame()
    for i in range(0,3,2):
        j=i+1
        calc = pd.concat([pd.DataFrame(scan.iloc[:,i]), pd.DataFrame(scan.iloc[:,j])],axis=1)
        calc['th'] = calc.iloc[:,0] + calc.iloc[:,1]
        calc_all = pd.concat([calc_all, calc], axis=1)
        time.sleep(1) #tested time
    return calc_all

if __name__ == '__main__':
    with ThreadPoolExecutor(2) as exe:
        for result in exe.map(task, range(2)):
            print(result)
It's not faster. What did I do wrong?
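One thing worth noting (a sketch, not a verified fix): if the real workload is CPU-bound pandas code rather than the time.sleep used for testing, Python's GIL keeps ThreadPoolExecutor threads from running it in parallel; also, task ignores its value argument, so both workers repeat identical work on scan. A process pool sidesteps the GIL for CPU-bound code, for example:

from concurrent.futures import ProcessPoolExecutor

if __name__ == '__main__':
    # separate processes are not serialized by the GIL for CPU-bound work
    with ProcessPoolExecutor(2) as exe:
        for result in exe.map(task, range(2)):
            print(result)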

Read web content into a dataframe without writing to a file

I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime

uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1,len(str1)):
    if len(str1[ii])>0:
        df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each line into a dataframe
        if not df1.empty:
            df2 = df1.iloc[:,0:3] #Keep the first three columns
            if df2.iloc[0,-1] != 'M': #Don't append the rows with missing data
                dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO

import pandas as pd
import requests

url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)

with requests.Session() as session:
    response = session.get(url, timeout=30)
    response.raise_for_status()  # raises an exception for any non-2xx status
    df = pd.read_csv(StringIO(response.text), sep=",")

print(df)
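As an aside (a sketch, assuming the server accepts a plain GET without extra headers), pandas can also read the CSV straight from the url defined above, which skips the explicit download step entirely:

import pandas as pd

# read_csv accepts a URL directly; the download happens in memory
df = pd.read_csv(url)
print(df)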

NameError: name 'sparklines' is not defined

Hello,
I import the data frame:
# Loading a Sample Pandas DataFrame
import pandas as pd
import numpy as np
df = pd.read_csv('https://raw.githubusercontent.com/datagy/data/main/sales.csv', parse_dates=['date'])
The code is:
def percentile_90(x):
    return x.quantile(.9)

from scipy.stats import trim_mean
def trim_mean_10(x):
    return trim_mean(x, 0.1)

def largest(x):
    return x.nlargest(1)

import matplotlib.pyplot as plt
import base64
from r-ltxsparklines import sparklines

def sparkline_str(x):
    bins=np.histogram(x)[0]
    sl = ''.join(sparklines(bins))
    return sl

# Here they all are, put together:
agg_func_largest = {
    'fare': [percentile_90, trim_mean_10, largest, sparkline_str]
}
df.groupby(['class', 'embark_town']).agg(agg_func_largest)
that produces:
Input In [82]
from r-ltxsparklines import sparklines
^
SyntaxError: invalid syntax
After other modifications, the error is:
NameError: name 'sparklines' is not defined
The question is: how do I define 'sparklines', or which library should I import so that the 'sparklines' function is recognized?
Regards,
Atapalou
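For reference, a sketch of what the import could look like if the goal was the PyPI sparklines package (an assumption: r-ltxsparklines appears to be the conda name of an R/LaTeX package, not a Python module), after pip install sparklines:

import numpy as np
from sparklines import sparklines  # pip install sparklines

def sparkline_str(x):
    bins = np.histogram(x)[0]          # histogram counts of the values
    return ''.join(sparklines(bins))   # unicode bar-character sparkline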

Why are the sparklines outside of the table instead of inside it as expected?

Hello,
"sparkline" does not work in my code.
I did not manage to install it, so I found a function that I call "sparkline_test". Nevertheless, the images that should be embedded in the table end up outside of it. Something is wrong.
import pandas as pd
import numpy as np          # needed for np.histogram below
import seaborn as sns
import matplotlib.pyplot as plt
from io import BytesIO
from itertools import islice
import base64
I cannot import sparklines:
#import sparklines
df = sns.load_dataset('titanic')

def percentile_90(x):
    return x.quantile(.9)

from scipy.stats import trim_mean
def trim_mean_10(x):
    return trim_mean(x, 0.1)

def largest(x):
    return x.nlargest(1)

def sparkline_str(x):
    bins=np.histogram(x)[0]
    sl = ''.join(sparklines(bins))
    return sl

def sparkline_test(data, figsize=(4,0.25),**kwags):
    data = list(data)
    fig,ax = plt.subplots(1,1,figsize=figsize,**kwags)
    ax.plot(data)
    for k,v in ax.spines.items():
        v.set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    plt.plot(len(data)-1, data[len(data)-1], 'r.')
    ax.fill_between(range(len(data)), data, len(data)*[min(data)], alpha=0.1)
    img = BytesIO()
    plt.savefig(img, transparent=True, bbox_inches='tight')
    img.seek(0)
    plt.show()
    # plt.close()
    return base64.b64encode(img.read()).decode("utf-8")

def sparkline_str(x):
    bins=np.histogram(x)[0]
    sl = ''.join(sparkline_test(bins))
    return sl

agg_func_largest = {
    'fare': [percentile_90, trim_mean_10, largest, sparkline_test]
    #'fare': [percentile_90, trim_mean_10, largest]
}
df.groupby(['class', 'embark_town']).agg(agg_func_largest)
that produces:
What is expected is:
Something is wrong... but what?
Do you have any idea?
Regards,
Atapalou
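A possible direction (a sketch, assuming this runs in a Jupyter notebook): sparkline_test returns a base64 string but also calls plt.show(), so every figure is displayed as a separate plot; to get the images inside the table, the base64 strings would need to be wrapped in <img> tags and the aggregated frame rendered as HTML with escaping disabled, for example:

from IPython.display import HTML

def sparkline_img(x):
    # wrap the base64 PNG returned by sparkline_test in an <img> tag
    return '<img src="data:image/png;base64,{}"/>'.format(sparkline_test(x))

agg_func_img = {'fare': [percentile_90, trim_mean_10, largest, sparkline_img]}
out = df.groupby(['class', 'embark_town']).agg(agg_func_img)
HTML(out.to_html(escape=False))  # escape=False keeps the <img> tags as raw HTML

It may also help to replace plt.show() with plt.close(fig) inside sparkline_test so the intermediate figures are not shown separately.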

Invoke Sagemaker Endpoint using Spark (EMR Cluster)

I am developing a Spark application on an EMR cluster. The flow of the project goes like this:
The dataframe is repartitioned based on an ID.
A SageMaker endpoint needs to be invoked on each partition to get the result.
But when doing that, I am getting this error:
cPickle.PicklingError: Could not serialize object: TypeError: can't pickle thread.lock objects
The code is as follows:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SparkConf
import itertools
import json
import boto3
import time
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from io import BytesIO as StringIO

client=boto3.client('sagemaker-runtime')

def invoke_endpoint(json_data):
    ansJson=json.dumps(json_data)
    response=client.invoke_endpoint(EndpointName="<EndpointName>",Body=ansJson,ContentType='text/csv',Accept='Accept')
    resultJson=json.loads(str(response['Body'].read().decode('ascii')))
    return resultJson

def execute(list_of_url):
    final_iterator=[]
    urlist=[]
    json_data={}
    for url in list_of_url:
        final_iterator.append((url.ID,url.Prediction))
        urlist.append(url.ID)
    json_data['URL']=urlist
    ressultjson=invoke_endpoint(json_data)
    return iter(final_iterator)

### Attributes to be added to the Spark conf
conf = (SparkConf().set("spark.executor.extraJavaOptions","-Dcom.amazonaws.services.s3.enableV4=true").set("spark.driver.extraJavaOptions","-Dcom.amazonaws.services.s3.enableV4=true"))
scT=SparkContext(conf=conf)
scT.setSystemProperty("com.amazonaws.services.s3.enableV4","true")

hadoopConf=scT._jsc.hadoopConfiguration()
hadoopConf.set("fs.s3a.awsAccessKeyId","<AccessKeyId>")
hadoopConf.set("fs.s3a.awsSecretAccessKeyId","<SecretAccessKeyId>")
hadoopConf.set("fs.s3a.endpoint","s3-us-east-1.amazonaws.com")
hadoopConf.set("com.amazonaws.services.s3.enableV4","true")
hadoopConf.set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")

sql=SparkSession(scT)
csv_df=sql.read.csv('s3 path to my csv file',header =True)
#print('Total count is',csv_df.count())

csv_dup_df=csv_df.dropDuplicates(['ID'])
print('Total count is',csv_dup_df.count())

windowSpec=Window.orderBy("ID")
result_df=csv_dup_df.withColumn("ImageID",F.row_number().over(windowSpec)%80)
final_df=result_df.withColumn("Prediction",lit(str("UNKOWN")))

df2 = final_df.repartition("ImageID")
df3=df2.rdd.mapPartitions(lambda url: execute(url)).toDF()
df3.coalesce(1).write.mode("overwrite").save("s3 path to save the results in csv format",format="csv")
print(df3.rdd.glom().collect())
##Ok
print("Work is Done")
Can you tell me how to rectify this issue?
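A hedged sketch of one common way around this kind of pickling error (not a verified fix for this exact setup): the boto3 client is created on the driver and captured by the functions shipped to the executors, and boto3 clients hold thread locks that cannot be pickled. Creating the client inside the partition function keeps it out of the serialized closure, for example:

def execute(list_of_url):
    # create the boto3 client on the executor instead of pickling it from the driver
    client = boto3.client('sagemaker-runtime')
    final_iterator = []
    urlist = []
    for url in list_of_url:
        final_iterator.append((url.ID, url.Prediction))
        urlist.append(url.ID)
    ansJson = json.dumps({'URL': urlist})
    client.invoke_endpoint(EndpointName="<EndpointName>", Body=ansJson,
                           ContentType='text/csv', Accept='Accept')
    return iter(final_iterator)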