Web Scraping Table with class from CoinMarketCap with BeautifulSoup

I'm trying to scrape an entire table of data from:
https://coinmarketcap.com/currencies/ethereum/historical-data/
I'm trying to extract the table:
<table class="h7vnx2-2 jNaLNi cmc-table ">
with
table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
and it returns: None
here's the full code:
import requests
from bs4 import BeautifulSoup

def main():
    URL = "https://coinmarketcap.com/currencies/ethereum/historical-data/"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
    print(table)

if __name__ == "__main__":
    main()
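Two things are worth checking here (an observation based on the snippets above, not a definitive diagnosis): the class string passed to find() ("h7nvx2-2 ...") doesn't match the one shown in the markup ("h7vnx2-2 ..."), and, more importantly, CoinMarketCap renders this table client-side with JavaScript, so it usually isn't present in the HTML that requests downloads. A minimal diagnostic sketch:
import requests

URL = "https://coinmarketcap.com/currencies/ethereum/historical-data/"
html = requests.get(URL).text
# If this prints False, the table is injected by JavaScript and you need
# the API approach shown in the answers below.
print("cmc-table" in html)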

import pandas as pd
from urllib.parse import urlencode

def main(url):
    params = {
        'id': '1027',
        'convertId': '2781',
        'timeStart': '1623283200',
        'timeEnd': '1628553600'
    }
    df = pd.DataFrame(pd.read_json(
        url + urlencode(params))['data']['quotes'])
    df = pd.DataFrame.from_records(df['quote'])
    print(df)

main('https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?')

As stated, the data is dynamically rendered. Go to the API to get the data directly:
import requests
import pandas as pd
url = 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
payload = {
    'id': '1027',
    'convertId': '2781',
    'timeStart': '1623283200',
    'timeEnd': '1628553600'
}
jsonData = requests.get(url, params=payload).json()
df = pd.json_normalize(jsonData, record_path=['data','quotes'])
#rows = [i['quote'] for i in jsonData['data']['quotes']]
#df = pd.DataFrame(rows)
Output of print(df):
open high ... marketCap timestamp
0 2611.142652 2619.957793 ... 2.872870e+11 2021-06-10T23:59:59.999Z
1 2472.858836 2495.414705 ... 2.736314e+11 2021-06-11T23:59:59.999Z
2 2354.752218 2447.227868 ... 2.758389e+11 2021-06-12T23:59:59.999Z
3 2372.690096 2547.367910 ... 2.916739e+11 2021-06-13T23:59:59.999Z
4 2508.770462 2606.432929 ... 2.951126e+11 2021-06-14T23:59:59.999Z
.. ... ... ... ... ...
56 2725.669632 2840.430748 ... 3.307572e+11 2021-08-05T23:59:59.999Z
57 2827.503486 2944.903352 ... 3.382378e+11 2021-08-06T23:59:59.999Z
58 2891.707469 3170.229727 ... 3.694372e+11 2021-08-07T23:59:59.999Z
59 3161.232779 3184.603971 ... 3.526859e+11 2021-08-08T23:59:59.999Z
60 3012.885711 3185.701187 ... 3.707654e+11 2021-08-09T23:59:59.999Z
[61 rows x 7 columns]
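As an optional follow-up (assuming the column names shown above), the timestamp column can be parsed into proper datetimes and used as the index:
# Optional post-processing of the API result
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df.set_index("timestamp").sort_index()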


ValueError: NaTType does not support timetuple when converting a dataframe to dictionary using to_dict('records')

I'm running this Flask app:
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS, cross_origin
import json
import pandas as pd

# Create the app object
app = Flask(__name__)
cors = CORS(app, resources={r"/*": {'origins': "*"}})

# importing function for calculations
from Record_Matching import Matching

@app.route("/query", methods=['GET'])
@cross_origin()
def query():
    # service_account_creds = request.json
    query1 = request.args.get('query1', type=str)
    query2 = request.args.get('query2', type=str)
    querycolumns = request.args.get('querycolumns')
    project_id = request.args.get('project_id', type=str)
    service_account_creds = request.args.get('service_account')
    SS = request.args.get('SS', type=float)
    TT = request.args.get('TT', type=float)
    result = Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns)
    return result

if __name__ == "__main__":
    app.run(host="localhost", port=8080, debug=True)
and I'm importing the matching function from this Python script:
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import recordlinkage
from recordlinkage.preprocessing import phonetic
from pandas.io.json import json_normalize
import uuid
from uuid import uuid4
import random
import string
import json
import ast

# Results to data frame function
def gcp2df(sql, client):
    query = client.query(sql)
    results = query.result()
    return results.to_dataframe()

# Exporting df to bigquery - table parameter example: "dataset.tablename"
# def insert(df, table):
#     client = bigquery.Client()
#     job_config = bigquery.LoadJobConfig(write_disposition=bigquery.job.WriteDisposition.WRITE_TRUNCATE)
#     return client.load_table_from_dataframe(df, table, job_config=job_config)

def pair(df1, df2, TT, querycolumns):
    # function to take pair from list and compare:
    L = querycolumns
    l = len(querycolumns)
    p1 = 0
    p2 = 1
    # To generate phonetics we need to make sure all names are in english.
    # thus we'll replace non-english words by random english strings
    df1[L[p1]] = df1[L[p1]].astype(str)
    df2[L[p2]] = df2[L[p2]].astype(str)
    for i in range(0, len(df1)):
        if df1[L[p1]][i].isascii() == False:
            df1[L[p1]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    for i in range(0, len(df2)):
        if df2[L[p2]][i].isascii() == False:
            df2[L[p2]][i] = ''.join(random.choices(string.ascii_lowercase, k=5))
    compare = recordlinkage.Compare()
    df1["phonetic_given_name"] = phonetic(df1[L[p1]], "soundex")
    df2["phonetic_given_name"] = phonetic(df2[L[p2]], "soundex")
    df1["initials"] = (df1[L[p1]].str[0] + df1[L[p1]].str[-1])
    df2["initials"] = (df2[L[p2]].str[0] + df2[L[p2]].str[-1])
    indexer = recordlinkage.Index()
    indexer.block('initials')
    candidate_links = indexer.index(df1, df2)
    compare.exact('phonetic_given_name', 'phonetic_given_name', label="phonetic_given_name")
    # O(n) a function that uses two pointers to track consecutive pairs for the input list
    while p2 <= l:
        compare.string(L[p1], L[p2], method='jarowinkler', threshold=TT, label=L[p1])
        p1 += 2
        p2 += 2
    features = compare.compute(candidate_links, df1, df2)
    return features

def Matching(query1, query2, SS, TT, service_account_creds, project_id, querycolumns):
    service_account_creds = ast.literal_eval(service_account_creds)
    credentials = service_account.Credentials(service_account_creds, service_account_creds['client_email'],
                                              service_account_creds['token_uri'])
    job_config = bigquery.LoadJobConfig()
    client = bigquery.Client(project=project_id)
    SS = int(SS)
    TT = float(TT)
    df1 = gcp2df("""{}""".format(query1), client)
    df2 = gcp2df("""{}""".format(query2), client)
    querycolumns = json.loads(querycolumns)
    querycolumns = list(querycolumns.values())
    features = pair(df1, df2, TT, querycolumns)
    features['Similarity_score'] = features.sum(axis=1)
    features = features[features['Similarity_score'] >= SS].reset_index()
    final = features[['level_0', 'level_1']]
    final.rename(columns={'level_0': 'df1_index', 'level_1': 'df2_index'}, inplace=True)
    final['Unique_ID'] = [uuid.uuid4() for _ in range(len(final.index))]
    final['Unique_ID'] = final['Unique_ID'].astype(str)
    final['Similarity_Score'] = SS
    final_duplicates = final['df1_index'].value_counts().max()
    # insert(final, "test-ahmed-project.Record_Linkage.Matching_Indices")
    message = "Mission accomplished!, your highest number of duplicates is " + str(final_duplicates)
    return {'message': message, 'final': final.to_dict('records'), 'df1': df1.to_dict('records')}
I'm not sure why returning df1 as a dictionary raises the ValueError when I call the function through the Flask app, while running it in a Jupyter notebook with the same dataframe taken from BigQuery works just fine. Why doesn't it work in the Flask app?
I used to_dict('records') to convert the dataframe to a dictionary.
Looking online, many resources suggest the error occurs because the data contains missing values, but that shouldn't be the problem here, because converting the same dataframe to a dictionary in a Jupyter notebook works just fine.
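For reference, the workaround those resources describe is typically a one-liner like the following (a sketch only, assuming df1 really does come back from BigQuery with NaT/NaN in some column): replace missing values with plain None before serializing, so the JSON encoder never has to format a NaT.
# Illustrative only: convert pandas missing values to None before to_dict
df1 = df1.astype(object).where(df1.notnull(), None)
result = df1.to_dict('records')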

Extracting URLs from an HTML page

I am working with the following code:
import requests, pandas as pd
from bs4 import BeautifulSoup
if __name__ == '__main__':
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    soup = BeautifulSoup(requests.get(url).text, "lxml").find_all("div", class_="mpi_info")
I am trying to get all the URLs like "/homedetail/30729-mcguinness-dr-spring-tx-77386/5204857" into a dataframe, but I'm not sure how to go about this.
The addresses are under the class "address". Create a list containing all the hrefs and pass it to a DataFrame:
import requests, pandas as pd
from bs4 import BeautifulSoup
url = "https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1"
soup = BeautifulSoup(requests.get(url).text, "lxml")
address_links = [tag["href"] for tag in soup.find_all("a", class_="address")]
df = pd.DataFrame(address_links)
print(df.to_string())
Output:
0
0 /homedetail/30729-mcguinness-dr-spring-tx-77386/5204857
1 /homedetail/11-dovecote-spring-tx-77382/5323232
2 /homedetail/9934-crestwater-cir-magnolia-tx-77354/11567525
3 /homedetail/3-shanewood-ct-spring-tx-77382/5325643
4 /homedetail/22-solebrook-path-tomball-tx-77375/12190176
5 /homedetail/24-snowdrop-lily-dr-tomball-tx-77375/14652805
6 /homedetail/26-freestone-pl-spring-tx-77382/9791228
7 /homedetail/8557-alford-point-dr-magnolia-tx-77354/13580284?lid=6218369
8 /homedetail/210-spyglass-park-loop-montgomery-tx-77316/12783261
9 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
10 /homedetail/51-lenox-hill-dr-spring-tx-77382/5331072
11 /homedetail/19-s-garnet-bnd-spring-tx-77382/9164284
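If absolute URLs are more convenient than the relative paths, they can be joined against the site root (a small optional extension to the answer above; the column name full_url is just illustrative):
from urllib.parse import urljoin

# Optional: turn the relative /homedetail/... paths into absolute URLs
df["full_url"] = [urljoin("https://www.har.com", href) for href in address_links]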
import requests, pandas as pd
from bs4 import BeautifulSoup

def scraper():
    lst = []
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    soup = BeautifulSoup(requests.get(url).text)
    for i in soup.find_all('a'):
        if i.get('href') and i.get('href').startswith('/homedetail/'):
            lst.append(i['href'])
    return lst

if __name__ == '__main__':
    urls = scraper()
    df = pd.DataFrame(urls)
    print(df)
Output:
0 /homedetail/30729-mcguinness-dr-spring-tx-7738...
1 /homedetail/30729-mcguinness-dr-spring-tx-7738...
2 /homedetail/11-dovecote-spring-tx-77382/5323232
3 /homedetail/11-dovecote-spring-tx-77382/5323232
4 /homedetail/9934-crestwater-cir-magnolia-tx-77...
5 /homedetail/9934-crestwater-cir-magnolia-tx-77...
6 /homedetail/3-shanewood-ct-spring-tx-77382/532...
7 /homedetail/3-shanewood-ct-spring-tx-77382/532...
8 /homedetail/22-solebrook-path-tomball-tx-77375...
9 /homedetail/22-solebrook-path-tomball-tx-77375...
10 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
11 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
12 /homedetail/26-freestone-pl-spring-tx-77382/97...
13 /homedetail/26-freestone-pl-spring-tx-77382/97...
14 /homedetail/8557-alford-point-dr-magnolia-tx-7...
15 /homedetail/8557-alford-point-dr-magnolia-tx-7...
16 /homedetail/210-spyglass-park-loop-montgomery-...
17 /homedetail/210-spyglass-park-loop-montgomery-...
18 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
19 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
20 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
21 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
22 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
23 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
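Each listing apparently exposes two matching <a> tags, which is why every href shows up twice in this output. If the duplicates are unwanted, they can be dropped while preserving order before building the DataFrame (a small optional addition):
# De-duplicate while keeping first-occurrence order
urls = list(dict.fromkeys(urls))
df = pd.DataFrame(urls)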

Only able to crawl the first page and save detailed contents as a dataframe in Python

I'm trying to loop over the pages, crawl them, and save the detailed contents from this link:
Based on the code from here, I've modified the code to:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

updated_df = pd.DataFrame()

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        # print(f"Fetching data for {key}...")
        dfs = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4")
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        for df in dfs:
            # df = dfs[0].T
            df = dfs[0].T.iloc[1:, :].copy()
            updated_df = updated_df.append(df)
            print(updated_df)

cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)',
        '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']
updated_df.columns = cols
updated_df.to_excel('./data.xlsx', index=False)
But it only successfully crawls the first page. How could I crawl all the pages and also add a url column? Thanks.
Is this what you're looking for? This processes all the urls and dumps a list of dataframes to a single excel file.
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
COLUMNS = [
    '项目编号', '转让/出租标的名称', '转让方/出租方名称',
    '转让标的评估价/年租金评估价(元)', '转让底价/年租金底价(元)',
    '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期', 'URL'
]

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls:
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame():
    temp_df = list_of_dataframes[0]
    temp_df = temp_df.append(
        pd.Series(["URL", source_url], index=temp_df.columns),
        ignore_index=True,
    )
    return temp_df.T.iloc[1:, :].copy()

def dump_to_excel(post_processed_dfs: list):
    df = pd.concat(post_processed_dfs)
    df.columns = COLUMNS
    df.to_excel("scraped_data.xlsx", index=False)

processed_dfs = []

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        df_list = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        processed_dfs.append(post_process(df_list, follow_url))

dump_to_excel(processed_dfs)
Output:

Why is BeautifulSoup not parsing a simple Wikipedia table

To help fight COVID-19 here in the Philippines, I'm trying to do data analysis. My data source is a table of incidences on Wikipedia. See https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines
I tried to get the table in Python with BeautifulSoup, but I cannot seem to get the content of the columns [Facility of admission or consultation, Had recent travel history abroad]. See screenshot:
What am I doing wrong?
Here's my code: (can also be found here https://github.com/gio888/covid19_ph2/blob/master/covid_import_from_wikipedia.ipynb)
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')

n_columns = 0
n_rows = 0
column_names = []

for row in table.find_all('tr'):
    td_tags = row.find_all('td')
    if len(td_tags) > 0:
        n_rows += 1
        if n_columns == 0:
            n_columns = len(td_tags)
    th_tags = row.find_all('th')
    if len(th_tags) > 0 and len(column_names) == 0:
        for th in th_tags:
            column_names.append(th.get_text())

columns = column_names if len(column_names) > 0 else range(0, n_columns)
df = pd.DataFrame(columns=columns, index=range(0, n_rows))

row_marker = 0
for row in table.find_all('tr'):
    column_marker = 0
    columns = row.find_all('td')
    for column in columns:
        df.iat[row_marker, column_marker] = column.get_text()
        column_marker += 1
    if len(columns) > 0:
        row_marker += 1

for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass

df
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Firefox(options=options)
driver.get(
    "https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines")

# The Yes/No/status cells carry their value as a CSS class instead of text,
# so write the corresponding label into each cell before parsing the page.
items = [["yes", "Yes"], ["no", "No"], ["TBA", "TBA"],
         ["status-d", "Died"], ["status-r", "Recovered"], ["status-a", "Admitted"]]
for item in items:
    script = (
        "document.querySelectorAll('.{}').forEach((element) => element.innerHTML = '{}')".format(*item))
    driver.execute_script(script)

df = pd.read_html(driver.page_source)[2]
df.to_csv("data.csv", index=False)
driver.quit()
Output: View Online
import pandas as pd
import requests
from bs4 import BeautifulSoup

# url = "https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_pandemic_data/Philippines_medical_cases_summary"
css_content = {
    'status-a': 'Admitted',
    'status-r': 'Recovered',
    'status-d': 'Died',
    'yes': 'Yes',
    'no': 'No',
    'tba': 'TBA',
    "covid-sticky": 'skip_header'
}

def Check_att(source, value, attribute='class'):
    # <tag att='value'>  e.g. <td class='x'>
    # note: col_value is the module-level variable set inside the loop below;
    # once a class match has been found for the current cell it is passed through unchanged
    if col_value:
        return col_value
    if value in source.attrs.get(attribute, []):
        return css_content.get(value, '')
    return ''

url = 'https://en.wikipedia.org/wiki/2020_coronavirus_pandemic_in_the_Philippines'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
table = soup.find('table', class_='wikitable')

column_names = [col_name.text.rstrip('\n').strip() for col_name in table.select('tr.covid-sticky > th')]
n_rows = len(table.select('tr > td'))
df = pd.DataFrame(columns=column_names, index=range(0, n_rows))

for row_index, row in enumerate(table.find_all('tr')[1:], 0):
    # if Check_att(row, "covid-sticky"): continue
    columns = row.find_all('td')
    for col_index, column in enumerate(columns, 0):
        col_value = ''
        # the Yes/No/status cells have no text content; their value is encoded
        # in the CSS class, so map the class back to a label via css_content
        col_value = Check_att(column, 'status-a')
        col_value = Check_att(column, 'status-r')
        col_value = Check_att(column, 'status-d')
        col_value = Check_att(column, 'yes')
        col_value = Check_att(column, 'no')
        col_value = Check_att(column, 'tba')
        if not col_value:
            col_value = column.get_text().rstrip('\n').strip()
        df.iat[row_index, col_index] = col_value

for col in df:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        pass

print(df)

Pandas DataFrame - How to extract string patterns with hidden characters

I am scraping names, prices and images from this website. There are 8 items in total, but in the DF I would like to filter only the items that contain the pattern "Original Zaino Antifurto". When I try to apply the bp_filter to the DF I get an error, probably due to hidden characters.
Does anyone know how to filter for this pattern avoiding the error?
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = []
prices_xd = []
picts_xd = []

for container in con_xd:
    name = container.find("a", class_="product-item-link").text
    names_xd.append(name)

for container in con_xd:
    price = container.find("span", class_="price").text
    prices_xd.append(price)

for container in con_xd:
    pict = container.find("a").get("href")
    picts_xd.append(pict)

bp_xd = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                      'Item_Price_EUR': prices_xd,
                      'Link_to_Pict': picts_xd})

bp_xd['Item_Price_EUR'] = bp_xd['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
bp_xd['(XD-Design) Item_Name'] = bp_xd['(XD-Design) Item_Name'].str.strip()
bp_filter = bp_xd['(XD-Design) Item_Name'][bp_xd['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto')]
# bp_xd[bp_filter]
Here is the fixed working code:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url_xd = 'https://www.xd-design.com/it-it/catalogsearch/result/?q=Bobby+Original+Zaino+Antifurto'
req_xd = requests.get(url_xd)
pars_xd = BeautifulSoup(req_xd.content, 'html.parser')
con_xd = pars_xd.find_all('div', class_='product details product-item-details')

names_xd = [c.find("a", class_="product-item-link").text for c in con_xd]
prices_xd = [c.find("span", class_="price").text for c in con_xd]
picts_xd = [c.find("a").get("href") for c in con_xd]

df = pd.DataFrame({'(XD-Design) Item_Name': names_xd,
                   'Item_Price_EUR': prices_xd,
                   'Link_to_Pict': picts_xd})

df['Item_Price_EUR'] = df['Item_Price_EUR'].str.replace('€', '').str.replace(',', '.').astype(float)
df['(XD-Design) Item_Name'] = df['(XD-Design) Item_Name'].str.strip()
df = df.loc[df['(XD-Design) Item_Name'].apply(lambda x: 1 if 'Original Zaino Antifurto' in x else 0) == 1]
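For reference, the same filter can also be written as a plain boolean mask; the original attempt most likely failed because bp_filter was a Series of item names rather than a boolean mask, so bp_xd[bp_filter] tried to interpret those names as column labels:
# Equivalent filter: regex=False treats the pattern as a literal substring,
# which also sidesteps any unusual characters in the names
mask = df['(XD-Design) Item_Name'].str.contains('Original Zaino Antifurto', regex=False)
df = df[mask]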