How to use pd.DataFrame method to manually create a dataframe from info scraped using beautifulsoup4 - pandas

I made it to the point where all tr data data has been scraped and I am able to get a nice printout. But when I go to implement the pd.DataFrame as in df= pd.DataFrame({"A": a}) etc, I get a syntax error
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source,'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
print(row)
texas_info = pd.DataFrame({
"title": Texas
"Zip Code" : [Zip Code],
"City" :[City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being the 'Zip Code' and the other the 'Cities'

If you want to create manually, with bs4 4.7.1 you can use :not, :contains and :nth-of-type pseudo classes to isolate the two columns of interest, then construct a dict then convert to df
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips,'Cities': cities}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
You could combine selectors into one line:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': items[0::2],'Cities': items[1::2]}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
print(df)
I note you want to create manually but worth knowing for future readers that you could just use pandas read_html
import pandas as pd
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
table.columns = table.iloc[1]
table = table[2:]
table = table.drop(['Zip Code Map', 'County'], axis=1).reset_index(drop=True)
print(table)

Try creating the DataFrame and perform the for loop to append each row in the table into the DataFrame.
df = pd.DataFrame()
for tr in table_rows:
td = tr.find_all('td')
row = [i.text for i in td]
print(row)
zipCode = row[0] # assuming first column
city = row[1] # assuming second column
df = df.append({"Zip Code": zipCode, "City" : city}, ignore_index=True)
If you only need these two columns, you should not include title in the DataFrame (that will create another column); that line also happened to be where the syntax error occurred because of the missing comma.

Related

Read web content into a dataframe without writing to a file

I am trying to read data from the following link to a data frame without saving locally (this is important). I figured out a way (below), but is there an efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime
uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1,len(str1)):
if len(str1[ii])>0:
df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None) #Read each string into a dataframe
if not df1.empty:
df2 = df1.iloc[:,0:3] #Get the first five columns
if df2.iloc[0,-1] != 'M': #Don't append the ones with missing data
dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station','Date','Temp']
ax1 = df.plot(x=1,y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
"https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
"station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
"month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
"elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as request:
response = request.get(url, timeout=30)
if response.status_code != 200:
print(response.raise_for_status())
df = pd.read_csv(StringIO(response.text), sep=",")
print(df)

choropleth plotly map displaying a white background

I am trying to create a choropleth map of the uk using plotly, but every time I try, it outputs an empty page, or the json doesn't match with the dataframe.this is where i obtained the url for the dataframe Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
geojson = json.load(response)
url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new=df.replace("areaName", "NAME_2")
from plotly import graph_objects as go
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaCode"],
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True,
)
)
fig.show()
a few things to fix this up:
uk-counties.json is in topojson format, plotly needs a geojson. can fix with the topojson module, for example (or geopandas)
no need to replace "areaName", you want this: locations=df["areaName"]
you need to specify a marker_style. centering and zooming help as well
for good result you need to use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23']
the covid data and the topojson don't match up well by districts, so there are gaps in the map
code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
urlretrieve(URL_JSON, TOPO_DATA)
with open(TOPO_DATA, 'r') as data:
topoJSON = json.load(StringIO(data.read()))
topo = tp.Topology(topoJSON, object_name='GBR_adm2')
# convert to geojson, store in GEO_DATA
topo.to_geojson(GEO_DATA)
df = pd.read_csv(URL_DATA)
df.to_csv(CSV_DATA)
def make_map():
df = pd.read_csv(CSV_DATA)
with open(GEO_DATA, 'r') as data:
geojson = json.load(StringIO(data.read()))
# one day at a time
df = df[df['date'] == '2022-11-23']
fig = go.Figure(
go.Choroplethmapbox(
geojson=geojson,
featureidkey="properties.NAME_2",
locations=df["areaName"], # <=== not areaCode
z=df['cumCasesBySpecimenDate'],
zauto=True,
colorscale='Reds',
showscale=True
)
)
# need a mapbox_style
fig.update_layout(mapbox_style='carto-positron',
mapbox_zoom=5,
mapbox_center_lon=-2.057852,
mapbox_center_lat=53.404854,
height=700,
width=700)
fig.show()
if 0: # only needed once
download()
make_map()

KeyError: 0 when converting bs4 xml to pandas df

I am trying to import xml to pandas using bs4.
The bs4 import works, but getting pandas to recognise the xml is problematic.
import requests
import bs4
import pandas as pd
url = 'https://www.federalreserve.gov/data.xml'
geturl = requests.get(url).text
data = bs4.BeautifulSoup(geturl, 'lxml')
df = pd.DataFrame(data)
print(df.head())
I am expecting the df to show the first 5 rows of data, but instead i get the following error:
KeyError: 0
Why is pandas producing this KeyError: 0?
Many thanks!
There are five different charts in the xml file. Which one do you want? This is an example using the first chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# list comprehension to get the observation index and value of the first chart (i.e, charts[0])
data = [[ob['index'], ob['value']] for ob in charts[0].findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
df.head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00
Update
You can iterate through all the charts and append to a dict. You will then call each DataFrame by the title of the chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# empty dict
df_list = {}
for chart in charts:
# list comprehension to get the observation index and value
data = [[ob['index'], ob['value']] for ob in chart.findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
# create key from the the chart title and append df
df_list[chart['title']] = []
df_list[chart['title']].append(df)
# calling the second chart
df_list['Selected Assets of the Federal Reserve'][0].head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00

using ipywidgets SelectMultiple on a dataframe

import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
a = ['Banking', 'Auto', 'Life', 'Electric', 'Technology', 'Airlines',
'Healthcare']
df = pd.DataFrame(np.random.randn(7, 4), columns = list('ABCD'))
df.index = a
df.head(7)
dropdown = widgets.SelectMultiple(
options=df.index,
description='Sector',
disabled=False,
layout={'height':'100px', 'width':'40%'})
display(dropdown)
I want to create a function where I can filter the df by Sector. i.e say I select Airlines, Banking and Electric from the display(dropdown) and it returns a dataframe of the selected sectors only.
Try something like this, I have used a global variable to demonstrate in this case, but I would normally wrap up the functionality in a class so you always have access to the filtered dataframe.
Rather than use interact I have used .observe on the Selection widget.
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display, clear_output
a = ['Banking', 'Auto', 'Life', 'Electric', 'Technology', 'Airlines',
'Healthcare']
df = pd.DataFrame(np.random.randn(7, 4), columns = list('ABCD'), index=a)
filtered_df = None
dropdown = widgets.SelectMultiple(
options=df.index,
description='Sector',
disabled=False,
layout={'height':'100px', 'width':'40%'})
def filter_dataframe(widget):
global filtered_df
selection = list(widget['new'])
with out:
clear_output()
display(df.loc[selection])
filtered_df = df.loc[selection]
out = widgets.Output()
dropdown.observe(filter_dataframe, names='value')
display(dropdown)
display(out)

How to get column header in excel generated via python ExcelWriter

I am fetching excel data from django database via raw query. excel is generated but column header is missing .
please suggest some way to get that header.
import pandas as pd
from pandas import ExcelWriter
df1 = pd.DataFrame(row1)
try:
from StringIO import StringIO
except:
from io import StringIO
import xlwt
wb = Workbook()
writer = ExcelWriter("XYZ.xlsx",options={'remove_timezone': True})
xl_out = StringIO()
writer.path = xl_out
ws1 = wb.add_sheet("abc")
for col_num, value in enumerate(df1.columns.values):
ws1.write(1,col_num + 1, 'value')
df1.to_excel(writer,"abc", index= True, header=True)
writer.save()