Scraping a website with Beautiful Soup but getting an error

from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
#Generate csv file
outfile = open('scrape.csv','w',newline='')
writer = csv.writer(outfile)
#List all urls
urls = ['https://www.rpsgroup.com/services/','https://www.rpsgroup.com']
#Create a blank data frame
df = pd.DataFrame(columns=['Header1','Image Source', 'Alt Tag'])
#Html parser
for i in urls:
    Web_page = requests.get(urls)
    soup = BeautifulSoup(Web_page.content, 'html.parser')
    #Print headings
    h1 = soup.find_all('h1')
    print(h1.get_text())
    header1 = h1.get_text()
    #Find all images in that class
    image = soup.find('div', class_='organism-3-col')
    image_link = image.find_all('img')
    #Find image and source attribute
    for image in image_link:
        if image.has_attr('src'):
            print(image['src'])
            image_source = image('src')
            print(image['alt'])
            image_alt = image('alt')
        else:
            print('Source not found')
    #Create a new data frame and add columns
    df2 = pd.DataFrame([[header1, image_source, image_link]], columns=['Header1', 'Image source', 'Image link'])
    #Append the previous data frame with the extracted information
    df = df.append(df2, ignore_index=True)
#save to csv
df.to_csv('scrape.csv')
outfile.close()
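For reference, here is a minimal sketch of what the loop could look like with the most obvious errors fixed: requests.get is called with the single URL i rather than the whole list, the heading is taken with find('h1') instead of calling get_text() on the find_all() list, attributes are read with image['src'] rather than image('src'), and the unused csv.writer/outfile is dropped since to_csv writes the file itself. This assumes the page structure from the question and is only an illustration, not a tested answer:

from bs4 import BeautifulSoup
import requests
import pandas as pd

urls = ['https://www.rpsgroup.com/services/', 'https://www.rpsgroup.com']
df = pd.DataFrame(columns=['Header1', 'Image Source', 'Alt Tag'])

for i in urls:
    web_page = requests.get(i)                      # fetch one URL at a time, not the whole list
    soup = BeautifulSoup(web_page.content, 'html.parser')

    h1 = soup.find('h1')                            # find_all() returns a list; find() gives one tag
    header1 = h1.get_text(strip=True) if h1 else ''

    container = soup.find('div', class_='organism-3-col')
    images = container.find_all('img') if container else []
    for image in images:
        if image.has_attr('src'):
            image_source = image['src']             # subscript, not call, to read an attribute
            image_alt = image.get('alt', '')
            row = pd.DataFrame([[header1, image_source, image_alt]],
                               columns=['Header1', 'Image Source', 'Alt Tag'])
            df = pd.concat([df, row], ignore_index=True)   # append() is deprecated in newer pandas
        else:
            print('Source not found')

df.to_csv('scrape.csv', index=False)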

Related

Choropleth plotly map displaying a white background

I am trying to create a choropleth map of the UK using Plotly, but every time I try it outputs an empty page, or the JSON doesn't match up with the dataframe. The URL for the dataframe comes from the UK government's coronavirus data API. Here's my code so far:
import pandas as pd
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json') as response:
    geojson = json.load(response)

url3 = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
df = pd.read_csv(url3)
df_new = df.replace("areaName", "NAME_2")

from plotly import graph_objects as go

fig = go.Figure(
    go.Choroplethmapbox(
        geojson=geojson,
        featureidkey="properties.NAME_2",
        locations=df["areaCode"],
        z=df['cumCasesBySpecimenDate'],
        zauto=True,
        colorscale='Reds',
        showscale=True,
    )
)
fig.show()
A few things to fix this up:
uk-counties.json is in TopoJSON format, but Plotly needs GeoJSON. You can convert it with the topojson module, for example (or geopandas).
There is no need to replace "areaName"; you want locations=df["areaName"].
You need to specify a mapbox_style. Centering and zooming help as well.
For a good result you need to use only one day's worth of data per choropleth, hence the df = df[df['date'] == '2022-11-23'] filter.
The COVID data and the TopoJSON don't match up well by district, so there are gaps in the map.
Code:
"""
https://stackoverflow.com/questions/71828342/choropleth-plotly-map-displaying-a-white-background
"""
from urllib.request import urlretrieve
import json
from io import StringIO
from plotly import graph_objects as go
import pandas as pd
import topojson as tp
URL_JSON = 'https://raw.githubusercontent.com/deldersveld/topojson/master/countries/united-kingdom/uk-counties.json'
URL_DATA = 'https://api.coronavirus.data.gov.uk/v2/data?areaType=utla&metric=cumCasesBySpecimenDate&metric=cumPeopleVaccinatedFirstDoseByVaccinationDate&metric=cumPeopleVaccinatedSecondDoseByVaccinationDate&metric=newCasesBySpecimenDate&metric=cumPeopleVaccinatedThirdInjectionByVaccinationDate&format=csv'
CSV_DATA = 'uk_covid.csv'
TOPO_DATA = 'topojson.json'
GEO_DATA = 'geojson.json'
def download():
    urlretrieve(URL_JSON, TOPO_DATA)
    with open(TOPO_DATA, 'r') as data:
        topoJSON = json.load(StringIO(data.read()))
    topo = tp.Topology(topoJSON, object_name='GBR_adm2')
    # convert to geojson, store in GEO_DATA
    topo.to_geojson(GEO_DATA)
    df = pd.read_csv(URL_DATA)
    df.to_csv(CSV_DATA)

def make_map():
    df = pd.read_csv(CSV_DATA)
    with open(GEO_DATA, 'r') as data:
        geojson = json.load(StringIO(data.read()))
    # one day at a time
    df = df[df['date'] == '2022-11-23']
    fig = go.Figure(
        go.Choroplethmapbox(
            geojson=geojson,
            featureidkey="properties.NAME_2",
            locations=df["areaName"],  # <=== not areaCode
            z=df['cumCasesBySpecimenDate'],
            zauto=True,
            colorscale='Reds',
            showscale=True
        )
    )
    # need a mapbox_style
    fig.update_layout(mapbox_style='carto-positron',
                      mapbox_zoom=5,
                      mapbox_center_lon=-2.057852,
                      mapbox_center_lat=53.404854,
                      height=700,
                      width=700)
    fig.show()

if 0:  # only needed once
    download()

make_map()

How can I add each new dataframe to the csv that is created?

My problem is that only the most recent URL request is saved. How can I save all the responses? I tried using df.to_csv('complete.csv', 'a') but that creates a jumbled file.
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        # save to csv
        df.to_csv('complete.csv')
        print(df)
list.txt
https://www.rockauto.com/en/catalog/ford,2010,f-150,6.2l+v8,1447337,brake+&+wheel+hub,brake+pad,1684
https://www.rockauto.com/en/catalog/ford,2015,f-150,5.0l+v8,3308775,brake+&+wheel+hub,brake+pad,1684
You are saving the dataframe after each iteration, which just overwrites the previous save. Instead, append the dataframes on each iteration and, after the loop completes, save that final dataframe. Something like:
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    final_df = pd.DataFrame()
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        print(df)
        final_df = final_df.append(df, sort=False).reset_index(drop=True)

# save to csv
final_df.to_csv('complete.csv')
print(final_df)
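One caveat about the snippet above: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so on a current pandas the usual pattern is to collect the per-URL frames in a plain list and concatenate once after the loop. A rough, hedged sketch of just that accumulation step (the dummy frames below stand in for the ones built by the scraping loop):

import pandas as pd

# hypothetical stand-ins for the per-URL frames built inside the loop above
per_url_frames = [
    pd.DataFrame({'linecode': ['A'], 'partno': ['1']}),
    pd.DataFrame({'linecode': ['B'], 'partno': ['2']}),
]

# accumulate in a list, concatenate once, then save
final_df = pd.concat(per_url_frames, sort=False).reset_index(drop=True)
final_df.to_csv('complete.csv')
print(final_df)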

Combining CSV of different shapes into one CSV

I have CSVs with different numbers of rows and columns. I would like to create one large CSV where all the CSV data are stacked directly on top of each other, aligned by the first column. I tried the script below with limited success; b, which is an empty array, does not hold the data from the previous loops.
from os import walk
import sys
import numpy as np
filenames= []
dirpath = []
filtered = []
original = []
f = []
b = np.empty([2, 2])
for (dirpath, dirnames, filenames) in walk("C:\\Users\\dkim1\\Python Scripts\\output"):
    f.extend(dirnames)
print(f)

for names in f:
    print(names)
    df = np.genfromtxt('C:\\Users\\dkim1\\Python Scripts\\output\\' + names + '\\replies.csv', dtype=None, delimiter=',', skip_header=1, names=True)
    b = np.column_stack(df)
print(b)
Have you tried pd.concat()?
import os
import pandas as pd
# just used a single dir for example simplicity, rather than os.walk()
root_dir = "your directory path here"
file_names = os.listdir(root_dir)
cat_list=[]
for names in file_names:
    df = pd.read_csv(os.path.join(root_dir, names), delimiter=',', header=None)
    cat_list.append(df)
concatted_df = pd.concat(cat_list)
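If you then want the stacked result written back out as the single large CSV the question describes, one extra line does it; the output filename here is only an example:

# write the stacked frames out as one file ("combined.csv" is an illustrative name)
concatted_df.to_csv("combined.csv", index=False, header=False)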

KeyError: 0 when converting bs4 xml to pandas df

I am trying to import XML into pandas using bs4.
The bs4 import works, but getting pandas to recognise the XML is problematic.
import requests
import bs4
import pandas as pd
url = 'https://www.federalreserve.gov/data.xml'
geturl = requests.get(url).text
data = bs4.BeautifulSoup(geturl, 'lxml')
df = pd.DataFrame(data)
print(df.head())
I am expecting the df to show the first 5 rows of data, but instead I get the following error:
KeyError: 0
Why is pandas producing this KeyError: 0?
Many thanks!
There are five different charts in the xml file. Which one do you want? This is an example using the first chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# list comprehension to get the observation index and value of the first chart (i.e, charts[0])
data = [[ob['index'], ob['value']] for ob in charts[0].findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
df.head()
        Date      Value
0   1-Aug-07  870261.00
1   8-Aug-07  865453.00
2  15-Aug-07  864931.00
3  22-Aug-07  862775.00
4  29-Aug-07  872873.00
Update
You can iterate through all the charts and append to a dict. You will then call each DataFrame by the title of the chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# empty dict
df_list = {}
for chart in charts:
    # list comprehension to get the observation index and value
    data = [[ob['index'], ob['value']] for ob in chart.findAll('observation')]
    # create DataFrame
    df = pd.DataFrame(data, columns=['Date', 'Value'])
    # create key from the chart title and append df
    df_list[chart['title']] = []
    df_list[chart['title']].append(df)
# calling the second chart
df_list['Selected Assets of the Federal Reserve'][0].head()
        Date      Value
0   1-Aug-07  870261.00
1   8-Aug-07  865453.00
2  15-Aug-07  864931.00
3  22-Aug-07  862775.00
4  29-Aug-07  872873.00
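As an aside, recent pandas versions (1.3+) can also read XML directly, which may be enough if you don't need the observations grouped by chart title. A hedged sketch, assuming every data point is an <observation> element carrying index/value attributes as in the soup code above; note that this flattens all five charts into one frame:

import pandas as pd

# read every <observation> element in the document into one flat DataFrame
df = pd.read_xml('https://www.federalreserve.gov/data.xml', xpath='//observation')
print(df.head())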

How to read multiple files and save a single output with pandas using command line arguments

I have multiple files in TXT format. How do I get all the values into a single output, merging them into one file, using command line arguments with pandas?
Like this:
python3 file1.txt file2.txt file3.txt
Code:
import pandas as pd
import socket, struct
import os
import glob
import sys
try:
    file = sys.argv[1]
except Exception:
    print("Usage: python3 {} [file]".format(sys.argv[0]))
    sys.exit()

os.chdir('/Users/roc/Desktop/js/projj')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False, skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'], low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)

pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
print(result_df.to_csv("/Users/roc/Desktop/js/projj/delegated2.txt", sep=' ', index=False, header=False))
Use the code below to iterate through a folder and append all the files into a single dataframe:
import os
import glob
import pandas as pd

os.chdir('C:\\path_to_folder\\')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False, skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'], low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
Once you have the df combining the data from all the files, use the next part of the code:
pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
result_df.to_csv("/Users/roc/Desktop/output.txt", sep=' ', index=False, header=False)
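The question also asks for the file list to come from command line arguments rather than a hard-coded folder. A minimal sketch of that variant, reusing the same read-and-filter logic; merge_files.py is a hypothetical script name and this is an illustration rather than part of the original answer:

import sys
import pandas as pd

# usage: python3 merge_files.py file1.txt file2.txt file3.txt
if len(sys.argv) < 2:
    print("Usage: python3 {} file1.txt [file2.txt ...]".format(sys.argv[0]))
    sys.exit(1)

appended_data = []
for file in sys.argv[1:]:
    # same parsing as above: pipe-separated, skip the header row
    pdd = pd.read_csv(file, header=None, sep='|', skiprows=[0],
                      names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'],
                      low_memory=False)
    # keep only IPv4 rows with a numeric Ip field
    appended_data.append(pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')])

df = pd.concat(appended_data)
print(df.head())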