Extracting URLs from an HTML page - BeautifulSoup

I am working with the following code:
import requests, pandas as pd
from bs4 import BeautifulSoup

if __name__ == '__main__':
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    soup = BeautifulSoup(requests.get(url).text, "lxml").find_all("div", class_="mpi_info")
I am trying to get all the URLs like "/homedetail/30729-mcguinness-dr-spring-tx-77386/5204857" into a DataFrame, but I am not sure how to go about this.

The addresses are under the class "address". Create a list containing all the hrefs and pass it to a DataFrame:
import requests, pandas as pd
from bs4 import BeautifulSoup
url = "https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1"
soup = BeautifulSoup(requests.get(url).text, "lxml")
address_links = [tag["href"] for tag in soup.find_all("a", class_="address")]
df = pd.DataFrame(address_links)
print(df.to_string())
Output:
0
0 /homedetail/30729-mcguinness-dr-spring-tx-77386/5204857
1 /homedetail/11-dovecote-spring-tx-77382/5323232
2 /homedetail/9934-crestwater-cir-magnolia-tx-77354/11567525
3 /homedetail/3-shanewood-ct-spring-tx-77382/5325643
4 /homedetail/22-solebrook-path-tomball-tx-77375/12190176
5 /homedetail/24-snowdrop-lily-dr-tomball-tx-77375/14652805
6 /homedetail/26-freestone-pl-spring-tx-77382/9791228
7 /homedetail/8557-alford-point-dr-magnolia-tx-77354/13580284?lid=6218369
8 /homedetail/210-spyglass-park-loop-montgomery-tx-77316/12783261
9 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
10 /homedetail/51-lenox-hill-dr-spring-tx-77382/5331072
11 /homedetail/19-s-garnet-bnd-spring-tx-77382/9164284
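A small follow-up in case absolute links are needed later: a minimal sketch, assuming the address_links list from the snippet above and https://www.har.com as the base domain.
from urllib.parse import urljoin

full_links = [urljoin("https://www.har.com", href) for href in address_links]
df = pd.DataFrame({"href": address_links, "url": full_links})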

import requests, pandas as pd
from bs4 import BeautifulSoup

def scraper():
    lst = []
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    # pass an explicit parser to avoid BeautifulSoup's "guessed at parser" warning
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    for i in soup.find_all('a'):
        if i.get('href') and i.get('href').startswith('/homedetail/'):
            lst.append(i['href'])
    return lst

if __name__ == '__main__':
    urls = scraper()
    df = pd.DataFrame(urls)
    print(df)
Output:
0 /homedetail/30729-mcguinness-dr-spring-tx-7738...
1 /homedetail/30729-mcguinness-dr-spring-tx-7738...
2 /homedetail/11-dovecote-spring-tx-77382/5323232
3 /homedetail/11-dovecote-spring-tx-77382/5323232
4 /homedetail/9934-crestwater-cir-magnolia-tx-77...
5 /homedetail/9934-crestwater-cir-magnolia-tx-77...
6 /homedetail/3-shanewood-ct-spring-tx-77382/532...
7 /homedetail/3-shanewood-ct-spring-tx-77382/532...
8 /homedetail/22-solebrook-path-tomball-tx-77375...
9 /homedetail/22-solebrook-path-tomball-tx-77375...
10 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
11 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
12 /homedetail/26-freestone-pl-spring-tx-77382/97...
13 /homedetail/26-freestone-pl-spring-tx-77382/97...
14 /homedetail/8557-alford-point-dr-magnolia-tx-7...
15 /homedetail/8557-alford-point-dr-magnolia-tx-7...
16 /homedetail/210-spyglass-park-loop-montgomery-...
17 /homedetail/210-spyglass-park-loop-montgomery-...
18 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
19 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
20 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
21 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
22 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
23 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
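Note that this approach returns every link twice, presumably because each listing card contains more than one anchor pointing at the same /homedetail/ URL. A quick way to deduplicate while preserving order, assuming the urls list returned by scraper():
unique_urls = list(dict.fromkeys(urls))  # dict keys keep insertion order and drop duplicates
df = pd.DataFrame(unique_urls)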

Related

Web Scraping Table with class from CoinMarketCap w

I'm trying to scrape an entire table of data from:
https://coinmarketcap.com/currencies/ethereum/historical-data/
I'm trying to extract the table:
<table class="h7vnx2-2 jNaLNi cmc-table ">
with
table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
and it returns None. Here is the full code:
import requests
from bs4 import BeautifulSoup

def main():
    URL = "https://coinmarketcap.com/currencies/ethereum/historical-data/"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
    print(table)

if __name__ == "__main__":
    main()
import pandas as pd
from urllib.parse import urlencode

def main(url):
    params = {
        'id': '1027',
        'convertId': '2781',
        'timeStart': '1623283200',
        'timeEnd': '1628553600'
    }
    df = pd.DataFrame(pd.read_json(
        url + urlencode(params))['data']['quotes'])
    df = pd.DataFrame.from_records(df['quote'])
    print(df)

main('https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?')
As stated, the data is dynamically rendered. Go to the API to get the data directly:
import requests
import pandas as pd

url = 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
payload = {
    'id': '1027',
    'convertId': '2781',
    'timeStart': '1623283200',
    'timeEnd': '1628553600'}
jsonData = requests.get(url, params=payload).json()
df = pd.json_normalize(jsonData, record_path=['data','quotes'])
#rows = [i['quote'] for i in jsonData['data']['quotes']]
#df = pd.DataFrame(rows)
Output:
print(df)
open high ... marketCap timestamp
0 2611.142652 2619.957793 ... 2.872870e+11 2021-06-10T23:59:59.999Z
1 2472.858836 2495.414705 ... 2.736314e+11 2021-06-11T23:59:59.999Z
2 2354.752218 2447.227868 ... 2.758389e+11 2021-06-12T23:59:59.999Z
3 2372.690096 2547.367910 ... 2.916739e+11 2021-06-13T23:59:59.999Z
4 2508.770462 2606.432929 ... 2.951126e+11 2021-06-14T23:59:59.999Z
.. ... ... ... ... ...
56 2725.669632 2840.430748 ... 3.307572e+11 2021-08-05T23:59:59.999Z
57 2827.503486 2944.903352 ... 3.382378e+11 2021-08-06T23:59:59.999Z
58 2891.707469 3170.229727 ... 3.694372e+11 2021-08-07T23:59:59.999Z
59 3161.232779 3184.603971 ... 3.526859e+11 2021-08-08T23:59:59.999Z
60 3012.885711 3185.701187 ... 3.707654e+11 2021-08-09T23:59:59.999Z
[61 rows x 7 columns]
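A possible follow-up, assuming a frame with the timestamp column shown in the printed output: parse the ISO strings into real datetimes and index by them so the frame behaves like a time series.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.set_index('timestamp').sort_index()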

Selenium/Webscrape this field

My code runs fine and prints the title for all rows except the rows with dropdowns.
For example, row 4 has a dropdown if clicked. I implemented a try block which would, in theory, trigger the dropdown and then pull the titles.
But when I execute click() and try to print, the titles for the rows with these dropdowns are not printed.
Expected output: print all titles, including the ones in the dropdowns.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('div', class_='card item-container session')
for property in productlist:
    sessiontitle = property.find('h4', class_='session-title card-title').text
    print(sessiontitle)
    try:
        ifDropdown = driver.find_elements_by_class_name('item-expand-action expand')
        ifDropdown.click()
        time.sleep(4)
        newTitle = driver.find_element_by_class_name('card-title').text
        print(newTitle)
    except:
        newTitle = 'none'
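For comparison, a minimal sketch of driving the dropdowns with Selenium itself. It assumes the expand buttons carry the classes item-expand-action and expand (as in the question), that the expanded titles are injected into the DOM after the click, and it uses the Selenium 4 find_elements API; it has not been verified against the live site:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)

# find_elements returns a list, so click each expand button individually
for button in driver.find_elements(By.CSS_SELECTOR, '.item-expand-action.expand'):
    driver.execute_script("arguments[0].click();", button)

time.sleep(2)
soup = BeautifulSoup(driver.page_source, 'html.parser')
for card in soup.find_all('div', class_='card item-container session'):
    print(card.find('h4', class_='session-title card-title').text.strip())
The answer below avoids the browser entirely and fetches each session's expand link with requests: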
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_soup(content):
    return BeautifulSoup(content, 'lxml')

def my_filter(req, content):
    try:
        r = req.get(content['href'])
        soup = get_soup(r.text)
        return [x.text for x in soup.select('.card-title')[1:]]
    except TypeError:
        return 'N/A'

def main(url):
    with requests.Session() as req:
        for page in range(1, 2):
            print(f"Extracting Page# {page}\n")
            params = {
                "p": page
            }
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            goal = {x.select_one('.session-title').text: my_filter(
                req, x.select_one('.item-expand-action')) for x in soup.select('.card')}
            df = pd.DataFrame(goal.items(), columns=['Title', 'Menu'])
            print(df)

main('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
Output:
Title Menu
0 Educational sessions on-demand N/A
1 Special Symposia on-demand N/A
2 Multidisciplinary sessions on-demand N/A
3 Illumina - Diagnosing Non-Small Cell Lung Canc... [Illumina gives an update on their IVD road ma...
4 MSD - Homologous Recombination Deficiency: BRC... [Welcome and Introductions, Homologous Recombi...
5 Servier - The clinical value of IDH inhibition... [Isocitric dehydrogenase: an actionable geneti...
6 AstraZeneca - Redefining Breast Cancer – Biolo... [Welcome and Opening, Redefining Breast Cancer...
7 ITM Isotopen Technologien München AG - A Globa... [Welcome & Introduction, Changes in the Incide...
8 MSD - The Role of Biomarkers in Patient Manage... [Welcome and Introductions, The Role of Pd-L1 ...
9 AstraZeneca - Re-evaluating the role of gBRCA ... [Welcome and introduction, What do we know abo...
10 Novartis - Unmet needs in oncogene-driven NSCL... [Welcome and introduction, Unmet needs in onco...
11 Opening session N/A
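Since the Menu column holds a list of sub-session titles per row (or 'N/A' where there is no dropdown), a small follow-up sketch flattens it into one row per sub-title, assuming the df built above:
flat = df.explode('Menu').reset_index(drop=True)
print(flat.head(15).to_string())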

Creating a table from the 247sports website

I am trying to create a pandas DataFrame of the top 1000 recruits from the 2022 football recruiting class on the 247sports website, working in a Google Colab notebook. This is the code I am using so far:
#Importing all necessary packages
import pandas as pd
import time
import datetime as dt
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import twofourseven
from bs4 import BeautifulSoup
from splinter import Browser
from kora.selenium import wd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import requests
from geopy.geocoders import Nominatim
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

year = '2022'
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeRecruitRankings?InstitutionGroup=HighSchool'
# Add the `user-agent` otherwise we will get blocked when sending the request
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}
response = requests.get(url, headers = headers).content
soup = BeautifulSoup(response, "html.parser")

data = []
for tag in soup.find_all("li", class_="rankings-page__list-item"): # `[1:]` Since the first result is a table header
    # meta = tag.find_all("span", class_="meta")
    rank = tag.find_next("div", class_="primary").text
    TwoFourSeven_rank = tag.find_next("div", class_="other").text
    name = tag.find_next("a", class_="rankings-page__name-link").text
    school = tag.find_next("span", class_="meta").text
    position = tag.find_next("div", class_="position").text
    height_weight = tag.find_next("div", class_="metrics").text
    rating = tag.find_next("span", class_="score").text
    nat_rank = tag.find_next("a", class_="natrank").text
    state_rank = tag.find_next("a", class_="sttrank").text
    pos_rank = tag.find_next("a", class_="posrank").text
    data.append(
        {
            "Rank": rank,
            "247 Rank": TwoFourSeven_rank,
            "Name": name,
            "School": school,
            "Class of": year,
            "Position": position,
            "Height & Weight": height_weight,
            "Rating": rating,
            "National Rank": nat_rank,
            "State Rank": state_rank,
            "Position Rank": pos_rank,
            # "School": ???,
        }
    )
    print(rank)

df = pd.DataFrame(data)
data
Ideally, I would also like to grab the school name the recruit chose from the logo on the table, but I am not sure how to go about that. For example, I would like to print out "Florida State" for the school column from this "row" of data.
Along with that, the ranks do print, but afterwards I get the following error, which prevents me from collecting and/or printing the rest of the data:
AttributeError Traceback (most recent call last)
<ipython-input-11-56f4779601f8> in <module>()
16 # meta = tag.find_all("span", class_="meta")
17
---> 18 rank = tag.find_next("div", class_="primary").text
19 # TwoFourSeven_rank = tag.find_next("div", class_="other").text
20 name = tag.find_next("a", class_="rankings-page__name-link").text
AttributeError: 'NoneType' object has no attribute 'text'
Lastly, I do understand that this webpage only displays 50 recruits unless my Python code clicks the "Load more" tab via Selenium, but I am not 100% sure how to incorporate that in the most efficient and legible way possible. If anyone knows a good way to do all this, I'd greatly appreciate it. Thanks in advance.
Use try/except, as some of the elements will not be present. Also, there is no need to use Selenium; simple requests will do.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://247sports.com/Season/2022-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}

rows = []
page = 0
while True:
    page += 1
    print('Page: %s' % page)
    payload = {'Page': '%s' % page}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    athletes = soup.find_all('li', {'class': 'rankings-page__list-item'})
    if len(athletes) == 0:
        break

    continue_loop = True
    while continue_loop == True:
        for athlete in athletes:
            if athlete.text.strip() == 'Load More':
                continue_loop = False
                continue
            primary_rank = athlete.find('div', {'class': 'rank-column'}).find('div', {'class': 'primary'}).text.strip()
            try:
                other_rank = athlete.find('div', {'class': 'rank-column'}).find('div', {'class': 'other'}).text.strip()
            except:
                other_rank = ''
            name = athlete.find('div', {'class': 'recruit'}).find('a').text.strip()
            link = 'https://247sports.com' + athlete.find('div', {'class': 'recruit'}).find('a')['href']
            highschool = ' '.join([x.strip() for x in athlete.find('div', {'class': 'recruit'}).find('span', {'class': 'meta'}).text.strip().split('\n')])
            pos = athlete.find('div', {'class': 'position'}).text.strip()
            ht = athlete.find('div', {'class': 'metrics'}).text.split('/')[0].strip()
            wt = athlete.find('div', {'class': 'metrics'}).text.split('/')[1].strip()
            rating = athlete.find('span', {'class': 'score'}).text.strip()
            nat_rank = athlete.find('a', {'class': 'natrank'}).text.strip()
            pos_rank = athlete.find('a', {'class': 'posrank'}).text.strip()
            st_rank = athlete.find('a', {'class': 'sttrank'}).text.strip()
            try:
                team = athlete.find('div', {'class': 'status'}).find('img')['title']
            except:
                team = ''
            row = {'Primary Rank': primary_rank,
                   'Other Rank': other_rank,
                   'Name': name,
                   'Link': link,
                   'Highschool': highschool,
                   'Position': pos,
                   'Height': ht,
                   'weight': wt,
                   'Rating': rating,
                   'National Rank': nat_rank,
                   'Position Rank': pos_rank,
                   'State Rank': st_rank,
                   'Team': team}
            rows.append(row)

df = pd.DataFrame(rows)
Output: first 10 rows of 1321 rows -
print(df.head(10).to_string())
Primary Rank Other Rank Name Link Highschool Position Height weight Rating National Rank Position Rank State Rank Team
0 1 1 Quinn Ewers https://247sports.com/Player/Quinn-Ewers-45572600 Southlake Carroll (Southlake, TX) QB 6-3 206 1.0000 1 1 1 Ohio State
1 2 3 Travis Hunter https://247sports.com/Player/Travis-Hunter-46084728 Collins Hill (Suwanee, GA) CB 6-1 165 0.9993 2 1 1 Florida State
2 3 2 Walter Nolen https://247sports.com/Player/Walter-Nolen-46083769 St. Benedict at Auburndale (Cordova, TN) DL 6-4 300 0.9991 3 1 1
3 4 14 Domani Jackson https://247sports.com/Player/Domani-Jackson-46057101 Mater Dei (Santa Ana, CA) CB 6-1 185 0.9966 4 2 1 USC
4 5 10 Zach Rice https://247sports.com/Player/Zach-Rice-46086346 Liberty Christian Academy (Lynchburg, VA) OT 6-6 282 0.9951 5 1 1
5 6 4 Gabriel Brownlow-Dindy https://247sports.com/Player/Gabriel-Brownlow-Dindy-46084792 Lakeland (Lakeland, FL) DL 6-3 275 0.9946 6 2 1
6 7 5 Shemar Stewart https://247sports.com/Player/Shemar-Stewart-46080267 Monsignor Pace (Opa Locka, FL) DL 6-5 260 0.9946 7 3 2
7 8 20 Denver Harris https://247sports.com/Player/Denver-Harris-46081216 North Shore (Houston, TX) CB 6-1 180 0.9944 8 3 2
8 9 33 Travis Shaw https://247sports.com/Player/Travis-Shaw-46057330 Grimsley (Greensboro, NC) DL 6-5 310 0.9939 9 4 1
9 10 23 Devon Campbell https://247sports.com/Player/Devon-Campbell-46093947 Bowie (Arlington, TX) IOL 6-3 310 0.9937 10 1 3
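A small hedged follow-up, assuming the df assembled above: the scraped Rating values arrive as strings, so converting them to numbers before saving keeps sorting and filtering sane (the output file name is just an example):
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df = df.sort_values('Rating', ascending=False)
df.to_csv('recruits_2022.csv', index=False)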

Duplicating the input URL in an appended DataFrame after scraping

The scraping is working (a website with a list of 6 products); however, I would also like to append the base URL to the DataFrame, ideally as the last column. I do not care about the header name. In the future I will have multiple URLs in "urls".
How can I do this?
Before:
0 1 2
1 product1 h
2 product2 t
3 product3 t
4 product4 p
5 product5 s
6 product6 :
After:
0 1 2
1 product1 baseurl
2 product2 baseurl
3 product3 baseurl
4 product4 baseurl
5 product5 baseurl
6 product6 baseurl
Code:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from selenium import webdriver
from pandas import DataFrame

urls = [
    'this is where the site url is'
]
final = []
with requests.Session() as s:
    for url in urls:
        driver = webdriver.Chrome('/Users/Documents/python/Selenium/bin/chromedriver')
        driver.get(url)
        soup = bs(driver.page_source, 'lxml')
        items = soup.select('.grid-item-content')
        titles = [item.text.strip() for item in items]
        baseurl = url
        results = list(zip(titles, baseurl))
        final.append([results])
        df = pd.DataFrame(results)
        df.to_csv(r'output.csv', sep=',', encoding='utf-8', index=False)
        driver.quit()
print(titles):
[product1,product2,product3,product4,product5,product6]
print(baseurl):
[url] <--- only has one result
You need to change
list(zip(titles, baseurl))
to
list(zip(titles, [baseurl] * len(titles)))
so that every title gets paired with its own copy of baseurl during zipping. Zipping a string iterates over its characters, which is why the second column ended up holding h, t, t, p, s, : one character per row.
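Alternatively, build the DataFrame from the titles alone and attach the URL as a column afterwards; pandas broadcasts a scalar across every row. A minimal sketch with placeholder data (the titles and URL below are illustrative, not from the original site):
import pandas as pd

titles = ['product1', 'product2', 'product3', 'product4', 'product5', 'product6']
baseurl = 'https://example.com/products'  # placeholder for the real site URL

df = pd.DataFrame({'title': titles})
df['url'] = baseurl  # the scalar is repeated for every row
print(df)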

'int' object has no attribute 'replace' error in python3.x

I don't get why this error occurs. From my point of view the three columns 'WWBO', 'IBO', and 'DBO' have exactly the same structure, but when I apply 'replace' only WWBO works. Does it have something to do with fillna?
I need your help!
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text
#parse url
soup = bs(data, "html.parser")
#find the tables you want
table = soup.findAll("table")[1:]
#read it into pandas
df = pd.read_html(str(table))
#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank': 'Rank',
                        'Movie': 'Title',
                        'Worldwide Box Office': 'WWBO',
                        'Domestic Box Office': 'DBO',
                        'International Box Office': 'IBO',
                        'DomesticShare': 'Share'})
#drop columns
market = df.drop(columns=['Rank', 'Share'])
market = market.fillna(0)
#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$', ''))
market['IBO'] = market['IBO'].map(lambda s: s.replace('$', ''))
market['DBO'] = market['DBO'].map(lambda s: s.replace('$', ''))
market
The error is:
AttributeError: 'int' object has no attribute 'replace'
This is pandas auto-casting at work: fillna(0) fills the missing cells with the integer 0, so those columns end up holding a mix of strings and ints. To solve this, either eliminate the 0 values or cast the columns to string, as below:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

#Read url
URL = "https://www.the-numbers.com/box-office-records/worldwide/all-movies/cumulative/released-in-2019"
data = requests.get(URL).text
#parse url
soup = bs(data, "html.parser")
#find the tables you want
table = soup.findAll("table")[1:]
#read it into pandas
df = pd.read_html(str(table))
#concat both the tables
df = pd.concat([df[0], df[1]])
df = df.rename(columns={'Rank': 'Rank',
                        'Movie': 'Title',
                        'Worldwide Box Office': 'WWBO',
                        'Domestic Box Office': 'DBO',
                        'International Box Office': 'IBO',
                        'DomesticShare': 'Share'})
#drop columns
market = df.drop(columns=['Rank', 'Share'])
market = market.fillna(0)
#replace $ -> ''
market['WWBO'] = market['WWBO'].map(lambda s: s.replace('$', ''))
market['IBO'] = market['IBO'].astype(str)
market['IBO'] = market['IBO'].map(lambda s: s.replace('$', ''))
market['DBO'] = market['DBO'].astype(str)
market['DBO'] = market['DBO'].map(lambda s: s.replace('$', ''))
>> market[['WWBO','IBO','DBO']]
WWBO IBO DBO
0 2,622,240,021 1,842,814,023 779,425,998
1 1,121,905,659 696,535,598 425,370,061
2 692,163,684 692,163,684 0
3 518,883,574 358,491,094 160,392,480
4 402,976,036 317,265,826 85,710,210
5 358,234,705 220,034,625 138,200,080
6 342,904,508 231,276,537 111,627,971
7 326,150,303 326,150,303 0
8 293,766,097 192,548,368 101,217,729
9 255,832,826 255,832,826 0
10 253,940,650 79,203,380 174,737,270
11 245,303,505 134,268,500 111,035,005
12 190,454,964 84,648,456 105,806,508
13 155,313,390 98,312,634 57,000,756
Clearly, one or more of these fields (market['WWBO'], market['IBO'], market['DBO']) contain integer values, and you are trying to perform a string operation, i.e. replace, on them; that is why it throws
AttributeError: 'int' object has no attribute 'replace'
You could first print those values and see what they are, or, if there are many, it is better to perform a type check first, like:
if market['WWBO'].dtype == object:
    market['WWBO'].map(lambda s: s.replace('$', ''))
else:
    pass
Let me know if this works for you.
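As a side note, a hedged alternative that sidesteps the mixed types entirely is pandas' vectorized string accessor, casting each column to str first. A minimal sketch, assuming the market DataFrame built above:
for col in ['WWBO', 'IBO', 'DBO']:
    market[col] = market[col].astype(str).str.replace('$', '', regex=False)
If numeric values are needed afterwards, stripping the thousands separators and calling pd.to_numeric would be the natural next step.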