Read web content into a dataframe without writing to a file - pandas

I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'

# Fetch the whole response into memory; nothing is written to disk.
# The context manager closes the HTTP response once it has been read.
with urlopen(uri, timeout=300) as resp:
    data = resp.read().decode("utf-8")


def dateparse(x):
    """Parse a 'YYYY-MM-DD HH:MM' timestamp, ignoring surrounding whitespace."""
    return datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')


str1 = data.split('\n')
dfList = []
# Skip the first line of the response and parse every remaining non-empty
# line on its own.
for line in str1[1:]:
    if len(line) > 0:
        # Read each string into a dataframe
        df1 = pd.read_csv(StringIO(line), parse_dates=[1], date_parser=dateparse, header=None)
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Get the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the ones with missing data
                dfList.append(df2)

df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
# x and y are given by column position (Date and Temp).
ax1 = df.plot(x=1, y=2)
ax1.get_figure().autofmt_xdate()

Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)

# The session is a context manager, so its connections are released on exit.
with requests.Session() as request:
    response = request.get(url, timeout=30)
    # Raises requests.HTTPError for 4xx/5xx responses and is a no-op
    # otherwise, so no explicit status-code comparison is needed.
    response.raise_for_status()

# Parse the CSV text straight from memory; nothing is written to disk.
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)

Related

compare 2 pandas dataframes

import glob
import pandas as pd
import numpy as np
import os
import fnmatch
import zipfile
# NOTE(review): the sample CSV shown below starts with a "FileName" line; with
# names= and no header= argument that line is read as a data row -- confirm
# whether header=0 is wanted.
df1 = pd.read_csv("2016Q12ExactTargetE1.csv", names=['FileName'])
print("\nRead ", df1.shape[0], "Records")

# accessing and printing files in directory and subdirectory
# NOTE(review): recursive=True only has an effect when the pattern contains
# "**"; as written only c:\temp itself is searched -- confirm intent.
for filename in glob.glob('c:\\temp\\*.zip', recursive=True):
    #print(filename)
    myzip = filename
    # Open the archive in a with-block so its file handle is always closed.
    with zipfile.ZipFile(myzip) as zf:
        zfl = zf.namelist()
    # Keep only the .eml members of the archive.
    eml_files = fnmatch.filter(zfl, "*.eml")
    df2 = pd.DataFrame(eml_files)
    print("\nRead2 ", df2.shape[0], "Records")
The csv file
FileName
F0B1F7B371C427E6FDDE1078287A3C71.eml
E107A8CADF8F87B05599A3AAF03D5BA1.eml
30B54778C0B912F2516F6C390A137E91.eml
D06DD3162620490F7E9F8ADD1AE0F621.eml
10E3BAFB831EA97615DBBBF18D601EC1.eml
the eml_files looks like
['00E6E77CE9890A3F34343997BCA33791.eml',
'109E4F29239EA8259707B2E3D0D00351.eml',
'403EBEC70C1F305B72EFAA3822D75871.eml',
'30B54778C0B912F2516F6C390A137E91.eml',
'E107A8CADF8F87B05599A3AAF03D5BA1.eml',
'F0B1F7B371C427E6FDDE1078287A3C71.eml',
'00654E78278B0BBDFBF29BAEA3F61051.eml',
'10E3BAFB831EA97615DBBBF18D601EC1.eml',
'30295A4958D6787060A9BD30ABA3BD81.eml',
'712FE30B1D680ACF5F5194E05E7AFCC1.eml',
'80E928FB95A365F85AE1A99DC8418061.eml',
'91681F0020EAC9AC7F010E917CD72F51.eml',
'C0542641286DE272AB1FAEF954BA1951.eml',
'D06DD3162620490F7E9F8ADD1AE0F621.eml',
'214C558DD0ABCAC2EA3BE06DE95E0811.eml',
'4101E93C02FBA028CEA078B9A3542B01.eml',
'51159C8E5965890AE7356E92BC1C6921.eml',
'50775947EFD5010C3D5EA799F36029A1.eml']
How can I compare the two dataframes df1 and df2
Thank you
I tried
df3=df1.compare(df2, keep_equal=True)
but I get an error
Can only compare identically-labeled DataFrame objects
because df2 is created from zipfile.namelist(), which is different from df1, which is read from a CSV file

choropleth plotly map displaying a white background

I am trying to create a choropleth map of the UK using plotly, but every time I try, it outputs an empty page, or the JSON doesn't match the dataframe. This is where I obtained the URL for the dataframe. Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
geojson = json.load(response)
url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new=df.replace("areaName", "NAME_2")
from plotly import graph_objects as go
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaCode"],
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True,
)
)
fig.show()
a few things to fix this up:
uk-counties.json is in topojson format, plotly needs a geojson. can fix with the topojson module, for example (or geopandas)
no need to replace "areaName", you want this: locations=df["areaName"]
you need to specify a mapbox_style. centering and zooming help as well
for good result you need to use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23']
the covid data and the topojson don't match up well by districts, so there are gaps in the map
code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
    """Fetch the map and COVID data and cache both on disk.

    Downloads URL_JSON to TOPO_DATA, converts that TopoJSON to GeoJSON
    (written to GEO_DATA), then reads the CSV at URL_DATA and stores it in
    CSV_DATA.
    """
    urlretrieve(URL_JSON, TOPO_DATA)
    # json.load reads directly from the file object; an explicit encoding
    # avoids depending on the platform default.
    with open(TOPO_DATA, 'r', encoding='utf-8') as data:
        topoJSON = json.load(data)
    topo = tp.Topology(topoJSON, object_name='GBR_adm2')
    # convert to geojson, store in GEO_DATA
    topo.to_geojson(GEO_DATA)
    df = pd.read_csv(URL_DATA)
    # index=False keeps the row index out of the cached file, so reading it
    # back does not produce an extra unnamed column.
    df.to_csv(CSV_DATA, index=False)
def make_map(date='2022-11-23'):
    """Show a choropleth of cumulative COVID cases per area for one day.

    Reads the cached CSV_DATA and GEO_DATA files, so ``download()`` has to
    have been run at least once beforehand.

    Args:
        date: Value of the ``date`` column to plot. Only rows for this single
            day are drawn; the default matches the original hard-coded day.
    """
    df = pd.read_csv(CSV_DATA)
    with open(GEO_DATA, 'r', encoding='utf-8') as data:
        geojson = json.load(data)
    # one day at a time
    df = df[df['date'] == date]
    fig = go.Figure(
        go.Choroplethmapbox(
            geojson=geojson,
            featureidkey="properties.NAME_2",
            locations=df["areaName"],  # <=== not areaCode
            z=df['cumCasesBySpecimenDate'],
            zauto=True,
            colorscale='Reds',
            showscale=True
        )
    )
    # need a mapbox_style
    fig.update_layout(mapbox_style='carto-positron',
                      mapbox_zoom=5,
                      mapbox_center_lon=-2.057852,
                      mapbox_center_lat=53.404854,
                      height=700,
                      width=700)
    fig.show()
import os

# The download is only needed once: fetch and convert the data when either
# cached file is missing, instead of toggling a hard-coded flag by hand.
if not (os.path.exists(CSV_DATA) and os.path.exists(GEO_DATA)):
    download()
make_map()

read csv file from buffer got EmptyDataError?

I need to read a string of CSV content with pandas, but pandas raises an error. I don't know what happened; can anyone help me?
import pandas as pd
import io
# CSV text held in memory: a header row followed by six data rows.
s = ',测试项,信息,结果\r\n0,软件测试机型805,软件测试机型805,PASS\r\n1,软件当前版本1,软件当前版本1,FAIL\r\n2,软件测试机型805,软件测试机型805,PASS\r\n3,软件当前版本1,软件当前版本1,FAIL\r\n4,软件测试机型805,软件测试机型805,PASS\r\n5,软件当前版本1,软件当前版本1,FAIL\r\n'
buf = io.StringIO()
# write() leaves the stream position at the end of the written text.
buf.write(s)
# NOTE: the buffer is not rewound before this call, so read_csv starts at the
# end of the stream and finds nothing to parse (the EmptyDataError reported
# below).
df = pd.read_csv(buf)
got error, EmptyDataError: No columns to parse from file
Here you go, friend: rewind the buffer with buf.seek(0) before reading it.
import pandas as pd
import io
# CSV text held in memory, one source line per CSV row (CRLF terminated).
# The header's first field is empty because that column holds the row index.
s = (
    ',测试项,信息,结果\r\n'
    '0,软件测试机型805,软件测试机型805,PASS\r\n'
    '1,软件当前版本1,软件当前版本1,FAIL\r\n'
    '2,软件测试机型805,软件测试机型805,PASS\r\n'
    '3,软件当前版本1,软件当前版本1,FAIL\r\n'
    '4,软件测试机型805,软件测试机型805,PASS\r\n'
    '5,软件当前版本1,软件当前版本1,FAIL\r\n'
)

buf = io.StringIO()
buf.write(s)
# Writing leaves the position at the end of the stream; rewind to the start
# so that read_csv sees the text that was just written.
buf.seek(0)
df = pd.read_csv(buf)
``

How to use pd.DataFrame method to manually create a dataframe from info scraped using beautifulsoup4

I made it to the point where all the tr data has been scraped and I am able to get a nice printout. But when I go to implement pd.DataFrame, as in df = pd.DataFrame({"A": a}) etc., I get a syntax error.
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source,'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
print(row)
texas_info = pd.DataFrame({
"title": Texas
"Zip Code" : [Zip Code],
"City" :[City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being the 'Zip Code' and the other the 'Cities'
If you want to create manually, with bs4 4.7.1 you can use :not, :contains and :nth-of-type pseudo classes to isolate the two columns of interest, then construct a dict then convert to df
import pandas as pd
# Import the submodule explicitly: a bare "import urllib" does not guarantee
# that urllib.request has been loaded.
import urllib.request
from bs4 import BeautifulSoup as bs

source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source, 'lxml')
# First and second cell of each row inside the table that mentions Texas;
# cells carrying a colspan attribute are excluded.
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips, 'Cities': cities}
df = pd.DataFrame(d)
# Drop the first selected row and renumber the remaining rows from 0.
df = df[1:].reset_index(drop=True)
You could combine selectors into one line:
import pandas as pd
# Import the submodule explicitly: a bare "import urllib" does not guarantee
# that urllib.request has been loaded.
import urllib.request
from bs4 import BeautifulSoup as bs

source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source, 'lxml')
# One selector list matches both the first and the second cell of each row,
# so the matched texts alternate between the two columns.
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
# Even positions go to 'Zips', odd positions to 'Cities'.
d = {'Zips': items[0::2], 'Cities': items[1::2]}
df = pd.DataFrame(d)
# Drop the first selected row and renumber the remaining rows from 0.
df = df[1:].reset_index(drop=True)
print(df)
I note you want to create manually but worth knowing for future readers that you could just use pandas read_html
import pandas as pd
# Take the second table that read_html finds on the page.
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
# Promote row 1 to be the column labels.
table.columns = table.iloc[1]
# Keep only the rows from position 2 onwards.
table = table.iloc[2:]
# Remove the two unwanted columns and renumber the rows from 0.
table = table.drop(columns=['Zip Code Map', 'County']).reset_index(drop=True)
print(table)
Try creating the DataFrame and perform the for loop to append each row in the table into the DataFrame.
# Collect one record per table row and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
records = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
    # Rows with fewer than two <td> cells cannot supply both values; skip
    # them instead of failing with IndexError.
    if len(row) < 2:
        continue
    zipCode = row[0]  # assuming first column
    city = row[1]  # assuming second column
    records.append({"Zip Code": zipCode, "City": city})
# Passing columns= keeps both columns present even when no row qualified.
df = pd.DataFrame(records, columns=["Zip Code", "City"])
If you only need these two columns, you should not include title in the DataFrame (that will create another column); that line also happened to be where the syntax error occurred because of the missing comma.

How to get column header in excel generated via python ExcelWriter

I am fetching Excel data from a Django database via a raw query. The Excel file is generated, but the column header is missing.
Please suggest some way to get that header.
import pandas as pd
from pandas import ExcelWriter
df1 = pd.DataFrame(row1)
try:
from StringIO import StringIO
except:
from io import StringIO
import xlwt
wb = Workbook()
writer = ExcelWriter("XYZ.xlsx",options={'remove_timezone': True})
xl_out = StringIO()
writer.path = xl_out
ws1 = wb.add_sheet("abc")
for col_num, value in enumerate(df1.columns.values):
ws1.write(1,col_num + 1, 'value')
df1.to_excel(writer,"abc", index= True, header=True)
writer.save()