airflow ingest data from json api doesn't work - pandas

I am using Google Cloud Composer to build a pipeline that ingests data from an API:
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
from datetime import timedelta
import pymysql.cursors
import pandas as pd
import requests

def get_data_from_api():
    url = "https://de-training-2020-7au6fmnprq-de.a.run.app/currency_gbp/all"
    response = requests.get(url)
    result_conversion_rate = response.json()
    conversion_rate = pd.DataFrame.from_dict(result_conversion_rate)
    conversion_rate = conversion_rate.reset_index().rename(columns={"index": "date"})
    conversion_rate['date'] = pd.to_datetime(conversion_rate['date']).dt.date
    conversion_rate.to_csv("/home/airflow/gcs/data/conversion_rate_from_api.csv", index=False)

def covid_api():
    url = "https://covid19.th-stat.com/json/covid19v2/getTimeline.json"
    response = requests.get(url)
    df = response.json()
    df = pd.DataFrame.from_dict(df['Data'])
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    df.to_csv("/home/airflow/gcs/data/result.csv", index=False)

default_args = {
    'owner': 'datath',
    'depends_on_past': False,
    'start_date': days_ago(2),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    'schedule_interval': '@once'
}

dag = DAG(
    'Retail_pipeline',
    default_args=default_args,
    description='Pipeline for ETL online_retail data',
    schedule_interval=timedelta(days=1),
)

t1 = PythonOperator(
    task_id='api_call',
    python_callable=get_data_from_api,
    dag=dag,
)

t2 = PythonOperator(
    task_id='api_covid',
    python_callable=covid_api,
    dag=dag,
)

t1 >> t2
The first task works fine, but the second task fails. When I run the second task's code in Jupyter it works fine. Please help, I don't know what to do.

It appears it is failing on the response.json() step.
There are a couple things you can do to troubleshoot:
Output the raw result of the response, e.g. with response.text. I think this will show you where the error is.
If you are still uncertain where the error is after step 1, load the raw result and try to deserialise it with Python's native json module.
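A minimal sketch of both steps, using the same URL as the covid_api task (run it locally or drop the prints into the task temporarily):
import json

import requests

url = "https://covid19.th-stat.com/json/covid19v2/getTimeline.json"
response = requests.get(url)

# step 1: look at the raw body instead of calling response.json() straight away;
# an HTML error page or an empty body will show up here
print(response.status_code)
print(response.text[:500])

# step 2: deserialise with the standard-library json module to get a clearer error
try:
    data = json.loads(response.text)
except json.JSONDecodeError as exc:
    print("Body is not valid JSON:", exc)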

Related

Read web content into a dataframe without writing to a file

I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime

uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1, len(str1)):
    if len(str1[ii]) > 0:
        df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None)  # Read each line into a dataframe
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Keep only the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the rows with missing data
                dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
ax1 = df.plot(x=1, y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO

import pandas as pd
import requests

url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)

with requests.Session() as request:
    response = request.get(url, timeout=30)
    if response.status_code != 200:
        print(response.raise_for_status())

df = pd.read_csv(StringIO(response.text), sep=",")
print(df)
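Since this endpoint returns a plain CSV body, pandas can also fetch it directly, assuming you don't need custom headers or a session; a minimal sketch:
import pandas as pd

url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
# read_csv accepts http(s) URLs, so nothing is written to a local file
df = pd.read_csv(url)
print(df)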

choropleth plotly map displaying a white background

I am trying to create a choropleth map of the UK using Plotly, but every time I try, it either outputs an empty page or the JSON doesn't match the dataframe. This is where I obtained the URL for the dataframe. Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json

with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
    geojson = json.load(response)

url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new = df.replace("areaName", "NAME_2")

from plotly import graph_objects as go

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        featureidkey="properties.NAME_2",
        locations=df["areaCode"],
        z=df['cumCasesBySpecimenDate'],
        zauto=True,
        colorscale='Reds',
        showscale=True,
    )
)
fig.show()
A few things to fix this up:
uk-counties.json is in TopoJSON format, but Plotly needs GeoJSON; you can convert it with the topojson module, for example (or geopandas)
there is no need to replace "areaName"; you want locations=df["areaName"]
you need to specify a mapbox_style; centering and zooming help as well
for a good result you should use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23']
the covid data and the topojson don't match up well by districts, so there are gaps in the map
code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
urlretrieve(URL_JSON, TOPO_DATA)
with open(TOPO_DATA, 'r') as data:
topoJSON = json.load(StringIO(data.read()))
topo = tp.Topology(topoJSON, object_name='GBR_adm2')
# convert to geojson, store in GEO_DATA
topo.to_geojson(GEO_DATA)
df = pd.read_csv(URL_DATA)
df.to_csv(CSV_DATA)
def make_map():
df = pd.read_csv(CSV_DATA)
with open(GEO_DATA, 'r') as data:
geojson = json.load(StringIO(data.read()))
# one day at a time
df = df[df['date'] == '2022-11-23']
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaName"], # <=== not areaCode
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True
)
)
# need a mapbox_style
fig.update_layout(mapbox_style='carto-positron',
mapbox_zoom=5,
mapbox_center_lon=-2.057852,
mapbox_center_lat=53.404854,
height=700,
width=700)
fig.show()
if 0: # only needed once
download()
make_map()

Trying to get players data from nbcsport

I've been trying to scrape this site:
import pandas as pd
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.nbcsports.com/edge/basketball/nba/injury-report")
soup = BeautifulSoup(r.content,"lxml")
st1 = soup.find("div", attrs={"class":"page-wrapper--sidebar page-wrapper--sidebar-initial container clearfix page-wrapper"})
st2 = st1.find("div",attrs={"class":"content content--main cols-8"})
st3 = st2.find("div", attrs={"class":"block__content"})
st4 = st3.find("div",attrs={"id":"injury-report-page-wrapper"})
st4.find("div",attrs={"class":"injury-report-wall"})
Nothing returns. I am trying to get the injury data, but it doesn't work at all. I've tried BeautifulSoup and pandas but couldn't make it work. It looks like this data comes from an API, but I'm kind of stuck. Open to advice.
import requests
import pandas as pd

def main(url):
    params = {
        "sort": "-start_date",
        "filter[player.team.meta.drupal_internal__id]": 176,
        "filter[player.status.active]": 1,
        "filter[active]": 1,
        "include": "injury_type,player,player.status,player.position"
    }
    r = requests.get(url, params=params)
    data = []
    for item in r.json()['included']:
        data.append(item['attributes'])
    df = pd.DataFrame().from_dict(data)
    print(df)
    # df.to_csv('data.csv', index=False)

main('https://www.nbcsports.com/edge/api/injury')

How to construct a "text/csv" payload when invoking a sagemaker endpoint

My training data looks like
df = pd.DataFrame({'A' : [2, 5], 'B' : [1, 7]})
I have trained a model in AWS Sagemaker and I deployed the model behind an endpoint.
The endpoint accepts the payload as "text/csv".
to invoke the endpoint using boto3 you can do:
import boto3

client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName="my-sagemaker-endpoint-name",
    Body=my_payload_as_csv,
    ContentType='text/csv')
How do i construct the payload "my_payload_as_csv" from my Dataframe in order to invoke the Sagemaker Endpoint correctly?
if you start from the dataframe example
df = pd.DataFrame({'A' : [2, 5], 'B' : [1, 7]})
you take a row
df_1_record = df[:1]
and convert df_1_record to a csv like this:
import io
from io import StringIO
csv_file = io.StringIO()
# by default sagemaker expects comma separated
df_1_record.to_csv(csv_file, sep=",", header=False, index=False)
my_payload_as_csv = csv_file.getvalue()
my_payload_as_csv looks like
'2,1\n'
then you can invoke the sagemaker endpoint
import boto3

client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName="my-sagemaker-endpoint-name",
    Body=my_payload_as_csv,
    ContentType='text/csv')
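If the model behind the endpoint accepts several records per request (many SageMaker built-in algorithms take one CSV line per record), the same pattern covers the whole dataframe; a minimal sketch reusing df from above:
import io

import pandas as pd

df = pd.DataFrame({'A': [2, 5], 'B': [1, 7]})

csv_file = io.StringIO()
# every row becomes one CSV line, still without header or index
df.to_csv(csv_file, sep=",", header=False, index=False)
my_payload_as_csv = csv_file.getvalue()
# my_payload_as_csv is now '2,1\n5,7\n'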
@VincentCleas's answer was good. But if you want to construct the CSV payload without installing pandas, do this:
import boto3

csv_buffer = open('<file-name>.csv')
my_payload_as_csv = csv_buffer.read()

client = boto3.client('sagemaker-runtime')
response = client.invoke_endpoint(
    EndpointName="my-sagemaker-endpoint-name",
    Body=my_payload_as_csv,
    ContentType='text/csv')

Pandasql table not found error on AWS Lambda Function which works on local machine

I am reading a pandas DataFrame from AWS S3, trying to run some pre-processing SQL on it and save it as a CSV again, using the pandasql library. The challenge is that it works perfectly fine on my local machine, but on AWS Lambda it fails with the following error:
"An error occured: (sqlite3.OperationalError) no such table: TblV\n[SQL: SELECT * from TblV;]\n(Background on this error at: http://sqlalche.me/e/e3q8)"
Note: I've built a deployment package of pandas and pandasql on an Amazon Linux AMI EC2 instance, zipped it together with the lambda_function code, pushed it to AWS S3, and referenced it in the Lambda function by passing the path.
My local code, which works perfectly fine:
import pandas as pd
from pandasql import sqldf
from time import time
t1 = time()
TblV = pd.read_csv(r"C:\Users\ab\Documents\test.csv")
query = """SELECT * from TblV"""
df = sqldf(query, globals())
print(df.columns)
print(df.shape)
print(df.head(5))
t2 = time()
print('Time taken: ', t2 - t1)
My code in the AWS Lambda function, which throws the above error no matter what I do:
import json
import boto3
import datetime
import pandas as pd
from pandasql import sqldf
import sys
from io import StringIO

def lambda_handler(event, context):
    try:
        client = boto3.client('s3')
        bucket_name = 'bucket'
        object_key = 'test/Vol/test.csv'
        csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
        body = csv_obj['Body']
        csv_string = body.read().decode('utf-8')
        TblV = pd.read_csv(StringIO(csv_string))
        print(TblV.head(5))  # This print works perfectly
        query = """SELECT * from TblV;"""
        df = sqldf(query, globals())
        print(df.columns)
        print(df.shape)
        print(df.head(5))
    except Exception as e:
        err = "An error occured: " + str(e)
        return err
You need to download the file into the /tmp folder. AWS Lambda has temporary storage of 500 MB.
client.download_file(bucket_name, object_key, '/tmp/file_name.extension')
Then, to read the data:
TblV = pd.read_csv(r"/tmp/file_name.extension")
query = """SELECT * from TblV"""
Try using 'df = sqldf(query, locals())' instead of 'df = sqldf(query, globals())'.
The variable 'TblV' is defined inside a function, hence it can't be referred to as a global variable.
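A minimal sketch of that change with the S3 boilerplate trimmed away; run_query is just an illustrative helper name:
from io import StringIO

import pandas as pd
from pandasql import sqldf

def run_query(csv_string):
    # TblV only exists in this function's local scope,
    # so pass locals() (not globals()) to sqldf
    TblV = pd.read_csv(StringIO(csv_string))
    query = "SELECT * FROM TblV;"
    return sqldf(query, locals())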