How can I add each new dataframe to the csv that is created? - pandas

My problem is that only the most recent URL request is saved. How can I save all the responses? I tried using df.to_csv('complete.csv', 'a'), but that creates a jumbled file.
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        # save to csv
        df.to_csv('complete.csv')
        print(df)
list.txt
https://www.rockauto.com/en/catalog/ford,2010,f-150,6.2l+v8,1447337,brake+&+wheel+hub,brake+pad,1684
https://www.rockauto.com/en/catalog/ford,2015,f-150,5.0l+v8,3308775,brake+&+wheel+hub,brake+pad,1684

You are saving the dataframe on each iteration, which just overwrites the previous save. Instead, append the dataframes across iterations and, once the loop completes, save that final dataframe. So something like:
# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd
# main code
with open('list.txt', 'r') as f_in:
    final_df = pd.DataFrame()
    for line in map(str.strip, f_in):
        if not line:
            continue
        response = requests.get(line)
        data = response.text
        soup = BeautifulSoup(data, 'html.parser')
        linecodes = []
        partnos = []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        # create dataframe
        df = pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno'])
        print(df)
        final_df = final_df.append(df, sort=False).reset_index(drop=True)

# save to csv
final_df.to_csv('complete.csv')
print(final_df)
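Note that DataFrame.append was deprecated and has been removed in pandas 2.0. A minimal sketch of the same idea for newer pandas: collect one dataframe per URL in a list and concatenate once at the end with pd.concat.

# imports
import requests
from bs4 import BeautifulSoup
import pandas as pd

# main code
frames = []  # one dataframe per url
with open('list.txt', 'r') as f_in:
    for line in map(str.strip, f_in):
        if not line:
            continue
        soup = BeautifulSoup(requests.get(line).text, 'html.parser')
        linecodes, partnos = [], []
        for tbody in soup.select('tbody[id^="listingcontainer"]'):
            tmp = tbody.find('span', class_='listing-final-manufacturer')
            linecodes.append(tmp.text if tmp else '-')
            tmp = tbody.find('span', class_='listing-final-partnumber as-link-if-js buyers-guide-color')
            partnos.append(tmp.text if tmp else '-')
        frames.append(pd.DataFrame(zip(linecodes, partnos), columns=['linecode', 'partno']))

# concatenate once, then save
final_df = pd.concat(frames, ignore_index=True)
final_df.to_csv('complete.csv', index=False)
print(final_df)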

Related

Read web content into a dataframe without writing to a file

I am trying to read data from the following link into a data frame without saving it locally (this is important). I figured out a way (below), but is there a more efficient way to do this?
from urllib.request import urlopen
import pandas as pd
from io import StringIO
from matplotlib.dates import DateFormatter
from datetime import datetime

uri = 'https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4'
data = urlopen(uri, timeout=300).read().decode("utf-8")
dateparse = lambda x: datetime.strptime(x.strip(), '%Y-%m-%d %H:%M')
str1 = data.split('\n')
dfList = []
for ii in range(1, len(str1)):
    if len(str1[ii]) > 0:
        df1 = pd.read_csv(StringIO(str1[ii]), parse_dates=[1], date_parser=dateparse, header=None)  # Read each string into a dataframe
        if not df1.empty:
            df2 = df1.iloc[:, 0:3]  # Get the first three columns
            if df2.iloc[0, -1] != 'M':  # Don't append the ones with missing data
                dfList.append(df2)
df = pd.concat(dfList, axis=0, ignore_index=True)
df.columns = ['Station', 'Date', 'Temp']
ax1 = df.plot(x=1, y=2)
ax1.get_figure().autofmt_xdate()
Using requests, pandas and io:
from io import StringIO
import pandas as pd
import requests
url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
with requests.Session() as request:
    response = request.get(url, timeout=30)
    if response.status_code != 200:
        print(response.raise_for_status())
    df = pd.read_csv(StringIO(response.text), sep=",")
    print(df)
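Since the endpoint returns plain CSV, pandas can also read it straight from the URL, so nothing is written locally either way. A minimal sketch, assuming the server accepts pandas' default request headers:

import pandas as pd

url = (
    "https://mesonet.agron.iastate.edu/cgi-bin/request/asos.py?"
    "station=AXA&data=all&year1=2022&month1=12&day1=1&year2=2022&"
    "month2=12&day2=1&tz=Etc%2FUTC&format=onlycomma&latlon=no&"
    "elev=no&missing=M&trace=T&direct=no&report_type=3&report_type=4"
)
# read_csv accepts a URL and parses the response in memory
df = pd.read_csv(url)
print(df.head())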

Scraping website with Beautiful soup but getting an error

from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
#Generate csv file
outfile = open('scrape.csv','w',newline='')
writer = csv.writer(outfile)
#List all urls
urls = ['https://www.rpsgroup.com/services/','https://www.rpsgroup.com']
#Create a blank data frame
df = pd.DataFrame(columns=['Header1','Image Source', 'Alt Tag'])
#Html parser
for i in urls:
    Web_page = requests.get(urls)
    soup = BeautifulSoup(Web_page.content, 'html.parser')
    #Print headings
    h1 = soup.find_all('h1')
    print(h1.get_text())
    header1 = h1.get_text()
    #Find all images in that class
    image = soup.find('div', class_='organism-3-col')
    image_link = image.find_all('img')
    #Find image and source attribute
    for image in image_link:
        if image.has_attr('src'):
            print(image['src'])
            image_source = image('src')
            print(image['alt'])
            image_alt = image('alt')
        else:
            print('Source not found')
    #Create a new data frame and add columns
    df2 = pd.DataFrame([[header1, image_source, image_link]], columns=['Header1', 'Image source', 'Image link'])
    #Append the previous data frame with the extracted information
    df = df.append(df2, ignore_index=True)
#save to csv
df.to_csv('scrape.csv')
outfile.close()
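No answer is attached to this question, but two likely culprits are visible in the code: requests.get(urls) is called with the whole list instead of the current URL i, and h1 is a ResultSet returned by find_all(), which has no get_text(). A hedged sketch of the loop with those calls corrected, keeping the question's 'organism-3-col' class and letting to_csv write the file (so the separate csv.writer is not needed):

from bs4 import BeautifulSoup
import requests
import pandas as pd

urls = ['https://www.rpsgroup.com/services/', 'https://www.rpsgroup.com']
df = pd.DataFrame(columns=['Header1', 'Image Source', 'Alt Tag'])
for i in urls:
    web_page = requests.get(i)  # request the single url, not the whole list
    soup = BeautifulSoup(web_page.content, 'html.parser')
    h1 = soup.find('h1')  # find() returns one tag, so get_text() works
    header1 = h1.get_text(strip=True) if h1 else ''
    container = soup.find('div', class_='organism-3-col')
    images = container.find_all('img') if container else []
    for image in images:
        if image.has_attr('src'):
            # attribute access image['src'], not image('src')
            row = pd.DataFrame([[header1, image['src'], image.get('alt', '')]],
                               columns=['Header1', 'Image Source', 'Alt Tag'])
            df = pd.concat([df, row], ignore_index=True)
        else:
            print('Source not found')
df.to_csv('scrape.csv', index=False)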

KeyError: 0 when converting bs4 xml to pandas df

I am trying to import xml to pandas using bs4.
The bs4 import works, but getting pandas to recognise the xml is problematic.
import requests
import bs4
import pandas as pd
url = 'https://www.federalreserve.gov/data.xml'
geturl = requests.get(url).text
data = bs4.BeautifulSoup(geturl, 'lxml')
df = pd.DataFrame(data)
print(df.head())
I am expecting the df to show the first 5 rows of data, but instead I get the following error:
KeyError: 0
Why is pandas producing this KeyError: 0?
Many thanks!
There are five different charts in the xml file. Which one do you want? This is an example using the first chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# list comprehension to get the observation index and value of the first chart (i.e, charts[0])
data = [[ob['index'], ob['value']] for ob in charts[0].findAll('observation')]
# create DataFrame
df = pd.DataFrame(data, columns=['Date', 'Value'])
df.head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00
Update
You can iterate through all the charts and append to a dict. You will then call each DataFrame by the title of the chart:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# xml url
xml = 'https://www.federalreserve.gov/data.xml'
# GET request and create soup
r = requests.get(xml)
soup = BeautifulSoup(r.text, 'xml')
# list comprehension to create a list of all the charts in the xml file
charts = [chart for chart in soup.findAll('chart')]
# empty dict
df_list = {}
for chart in charts:
    # list comprehension to get the observation index and value
    data = [[ob['index'], ob['value']] for ob in chart.findAll('observation')]
    # create DataFrame
    df = pd.DataFrame(data, columns=['Date', 'Value'])
    # create key from the chart title and append df
    df_list[chart['title']] = []
    df_list[chart['title']].append(df)

# calling the second chart
df_list['Selected Assets of the Federal Reserve'][0].head()
Date Value
0 1-Aug-07 870261.00
1 8-Aug-07 865453.00
2 15-Aug-07 864931.00
3 22-Aug-07 862775.00
4 29-Aug-07 872873.00
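Since each chart title maps to exactly one DataFrame, a slightly simpler variant of the same idea (a sketch reusing charts and pd from the snippet above) stores the frame directly instead of wrapping it in a one-element list, so no [0] indexing is needed:

df_dict = {}
for chart in charts:
    data = [[ob['index'], ob['value']] for ob in chart.findAll('observation')]
    # key the DataFrame directly by the chart title
    df_dict[chart['title']] = pd.DataFrame(data, columns=['Date', 'Value'])

df_dict['Selected Assets of the Federal Reserve'].head()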

How to use pd.DataFrame method to manually create a dataframe from info scraped using beautifulsoup4

I made it to the point where all the tr data has been scraped and I am able to get a nice printout. But when I go to implement pd.DataFrame, as in df = pd.DataFrame({"A": a}) etc., I get a syntax error.
Here is a list of my imported libraries in the Jupyter Notebook:
import pandas as pd
import numpy as np
import bs4 as bs
import requests
import urllib.request
import csv
import html5lib
from pandas.io.html import read_html
import re
Here is my code:
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs.BeautifulSoup(source,'html.parser')
table_rows = soup.find_all('tr')
table_rows
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
texas_info = pd.DataFrame({
    "title": Texas
    "Zip Code" : [Zip Code],
    "City" :[City],
})
texas_info.head()
I expect to get a dataframe with two columns, one being the 'Zip Code' and the other the 'Cities'
If you want to create it manually, with bs4 4.7.1 you can use the :not, :contains and :nth-of-type pseudo classes to isolate the two columns of interest, then construct a dict and convert it to a DataFrame:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
zips = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan])')]
cities = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': zips,'Cities': cities}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
You could combine selectors into one line:
import pandas as pd
import urllib
from bs4 import BeautifulSoup as bs
source = urllib.request.urlopen('https://www.zipcodestogo.com/Texas/').read()
soup = bs(source,'lxml')
items = [item.text for item in soup.select('.inner_table:contains(Texas) td:nth-of-type(1):not([colspan]), .inner_table:contains(Texas) td:nth-of-type(2):not([colspan])')]
d = {'Zips': items[0::2],'Cities': items[1::2]}
df = pd.DataFrame(d)
df = df[1:].reset_index(drop = True)
print(df)
I note you want to create the DataFrame manually, but it is worth knowing for future readers that you could just use pandas read_html:
import pandas as pd
table = pd.read_html('https://www.zipcodestogo.com/Texas/')[1]
table.columns = table.iloc[1]
table = table[2:]
table = table.drop(['Zip Code Map', 'County'], axis=1).reset_index(drop=True)
print(table)
Try creating an empty DataFrame first, then use the for loop to append each row of the table to it.
df = pd.DataFrame()
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)
    zipCode = row[0]  # assuming first column
    city = row[1]  # assuming second column
    df = df.append({"Zip Code": zipCode, "City": city}, ignore_index=True)
If you only need these two columns, you should not include title in the DataFrame (that will create another column); that line also happened to be where the syntax error occurred because of the missing comma.
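As noted earlier, DataFrame.append has been removed in pandas 2.0. A hedged sketch of the same loop for newer pandas (reusing table_rows and pd from the question's code): collect plain dicts and build the DataFrame once, skipping header rows that contain no td cells.

rows = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    if len(row) >= 2:  # skip header/blank rows with no td cells
        rows.append({"Zip Code": row[0], "City": row[1]})
df = pd.DataFrame(rows)
print(df)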

How to read multiple files and save a single output with pandas use command line arguments

I have multiple files in TXT format. How can I use pandas to merge all of their values into a single output file, passing the files as command line arguments like this:
python3 file1.txt file2.txt file3.txt
Code:
import pandas as pd
import socket, struct
import os
import glob
import sys
try:
    file = sys.argv[1]
except Exception:
    print("Usage: python3 {} [file]".format(sys.argv[0]))
    sys.exit()

os.chdir('/Users/roc/Desktop/js/projj')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False, skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'], low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)

pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
print(result_df.to_csv("/Users/roc/Desktop/js/projj/delegated2.txt", sep=' ', index=False, header=False))
Use the code below to iterate through a folder and append all the files to a single dataframe:
import os
import glob
import pandas as pd

os.chdir('C:\\path_to_folder\\')
fileList = glob.glob('*.txt')
appended_data = []
for file in fileList:
    pdd = pd.read_csv(file, header=None, sep='|', error_bad_lines=False, warn_bad_lines=False, skiprows=[0], names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'], low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)
appended_data = pd.concat(appended_data)
df = pd.DataFrame(appended_data)
Once you have the df that combines the data from all the files, use the next part of the code:
pd.options.mode.chained_assignment = None

def ip2int(ip):
    packedIP = socket.inet_aton(ip)
    return struct.unpack("!L", packedIP)[0]

df['Ip'] = df.Ip.apply(ip2int)
df['Range'] = df.groupby(['Code'])['Range'].transform('sum').fillna(0).astype(int)
k = df[['Ip', 'Range', 'Code']].dropna()
df2 = k.drop_duplicates(subset=['Range'])
result_df = df2.sort_values('Range', ascending=True)
result_df.to_csv("/Users/roc/Desktop/output.txt", sep=' ', index=False, header=False)
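To honour the command line arguments in the question (python3 script file1.txt file2.txt ...), a minimal sketch that reads the filenames from sys.argv instead of glob; the processing of the combined frame then stays the same as above. On newer pandas, on_bad_lines='skip' replaces the deprecated error_bad_lines/warn_bad_lines flags.

import sys
import pandas as pd

if len(sys.argv) < 2:
    print("Usage: python3 {} file1.txt [file2.txt ...]".format(sys.argv[0]))
    sys.exit(1)

appended_data = []
for file in sys.argv[1:]:  # every filename passed on the command line
    pdd = pd.read_csv(file, header=None, sep='|', on_bad_lines='skip',
                      skiprows=[0],
                      names=['Name', 'Code', 'Ipv', 'Ip', 'Range', 'Date', 'Category'],
                      low_memory=False)
    df = pdd[pdd['Ipv'].str.contains("ipv4") & pdd['Ip'].str.contains('[0-9]')]
    appended_data.append(df)

df = pd.concat(appended_data, ignore_index=True)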