Creating a table off of 247sports website - pandas

I am trying to create a pandas dataframe of the top 1,000 recruits from the 2022 football recruiting class on the 247sports website, in a Google Colab notebook. I am currently using the following code:
#Importing all necessary packages
import pandas as pd
import time
import datetime as dt
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import twofourseven
from bs4 import BeautifulSoup
from splinter import Browser
from kora.selenium import wd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import requests
from geopy.geocoders import Nominatim
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
year = '2022'
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeRecruitRankings?InstitutionGroup=HighSchool'
# Add the `user-agent` otherwise we will get blocked when sending the request
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}
response = requests.get(url, headers = headers).content
soup = BeautifulSoup(response, "html.parser")
data = []
for tag in soup.find_all("li", class_="rankings-page__list-item"): # `[1:]` Since the first result is a table header
    # meta = tag.find_all("span", class_="meta")
    rank = tag.find_next("div", class_="primary").text
    TwoFourSeven_rank = tag.find_next("div", class_="other").text
    name = tag.find_next("a", class_="rankings-page__name-link").text
    school = tag.find_next("span", class_="meta").text
    position = tag.find_next("div", class_="position").text
    height_weight = tag.find_next("div", class_="metrics").text
    rating = tag.find_next("span", class_="score").text
    nat_rank = tag.find_next("a", class_="natrank").text
    state_rank = tag.find_next("a", class_="sttrank").text
    pos_rank = tag.find_next("a", class_="posrank").text

    data.append(
        {
            "Rank": rank,
            "247 Rank": TwoFourSeven_rank,
            "Name": name,
            "School": school,
            "Class of": year,
            "Position": position,
            "Height & Weight": height_weight,
            "Rating": rating,
            "National Rank": nat_rank,
            "State Rank": state_rank,
            "Position Rank": pos_rank,
            # "School": ???,
        }
    )
    print(rank)

df = pd.DataFrame(data)
data
Ideally, I would also like to grab the school name the recruit chose from the logo on the table, but I am not sure how to go about that. For example, I would like to print out "Florida State" for the school column from this "row" of data.
Along with that, the ranks do print, but partway through the loop I get the following error, which prevents me from collecting or printing any additional data:
AttributeError Traceback (most recent call last)
<ipython-input-11-56f4779601f8> in <module>()
16 # meta = tag.find_all("span", class_="meta")
17
---> 18 rank = tag.find_next("div", class_="primary").text
19 # TwoFourSeven_rank = tag.find_next("div", class_="other").text
20 name = tag.find_next("a", class_="rankings-page__name-link").text
AttributeError: 'NoneType' object has no attribute 'text'
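The traceback means find_next("div", class_="primary") returned None for one of the list items, most likely the "Load More" item at the end of the list, which has none of these child tags (the answer below filters it out explicitly). A minimal sketch of guarding against that, assuming the same selectors as above:

# Hypothetical helper: return a tag's text if found, otherwise a default,
# so sparse list items don't raise AttributeError.
def safe_text(tag, name, class_, default=""):
    found = tag.find_next(name, class_=class_)
    return found.text.strip() if found is not None else default

# e.g. inside the loop: rank = safe_text(tag, "div", "primary")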
Lastly, I understand that this webpage only displays 50 recruits unless my Python code clicks the "Load more" tab via Selenium, but I am not sure how to incorporate that in the most efficient and legible way. If anyone knows a good way to do all this, I'd greatly appreciate it. Thanks in advance.

Use try/except, as some of the elements will not be present. Also, there is no need for Selenium; simple requests will do.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://247sports.com/Season/2022-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}

rows = []
page = 0
while True:
    page += 1
    print('Page: %s' % page)
    payload = {'Page': '%s' % page}
    response = requests.get(url, headers=headers, params=payload)
    soup = BeautifulSoup(response.text, 'html.parser')
    athletes = soup.find_all('li', {'class': 'rankings-page__list-item'})
    if len(athletes) == 0:
        break

    continue_loop = True
    while continue_loop == True:
        for athlete in athletes:
            if athlete.text.strip() == 'Load More':
                continue_loop = False
                continue

            primary_rank = athlete.find('div', {'class': 'rank-column'}).find('div', {'class': 'primary'}).text.strip()
            try:
                other_rank = athlete.find('div', {'class': 'rank-column'}).find('div', {'class': 'other'}).text.strip()
            except:
                other_rank = ''
            name = athlete.find('div', {'class': 'recruit'}).find('a').text.strip()
            link = 'https://247sports.com' + athlete.find('div', {'class': 'recruit'}).find('a')['href']
            highschool = ' '.join([x.strip() for x in athlete.find('div', {'class': 'recruit'}).find('span', {'class': 'meta'}).text.strip().split('\n')])
            pos = athlete.find('div', {'class': 'position'}).text.strip()
            ht = athlete.find('div', {'class': 'metrics'}).text.split('/')[0].strip()
            wt = athlete.find('div', {'class': 'metrics'}).text.split('/')[1].strip()
            rating = athlete.find('span', {'class': 'score'}).text.strip()
            nat_rank = athlete.find('a', {'class': 'natrank'}).text.strip()
            pos_rank = athlete.find('a', {'class': 'posrank'}).text.strip()
            st_rank = athlete.find('a', {'class': 'sttrank'}).text.strip()
            try:
                # The committed school is in the title attribute of the team logo
                team = athlete.find('div', {'class': 'status'}).find('img')['title']
            except:
                team = ''

            row = {'Primary Rank': primary_rank,
                   'Other Rank': other_rank,
                   'Name': name,
                   'Link': link,
                   'Highschool': highschool,
                   'Position': pos,
                   'Height': ht,
                   'weight': wt,
                   'Rating': rating,
                   'National Rank': nat_rank,
                   'Position Rank': pos_rank,
                   'State Rank': st_rank,
                   'Team': team}
            rows.append(row)

df = pd.DataFrame(rows)
Output - first 10 of 1,321 rows:
print(df.head(10).to_string())
Primary Rank Other Rank Name Link Highschool Position Height weight Rating National Rank Position Rank State Rank Team
0 1 1 Quinn Ewers https://247sports.com/Player/Quinn-Ewers-45572600 Southlake Carroll (Southlake, TX) QB 6-3 206 1.0000 1 1 1 Ohio State
1 2 3 Travis Hunter https://247sports.com/Player/Travis-Hunter-46084728 Collins Hill (Suwanee, GA) CB 6-1 165 0.9993 2 1 1 Florida State
2 3 2 Walter Nolen https://247sports.com/Player/Walter-Nolen-46083769 St. Benedict at Auburndale (Cordova, TN) DL 6-4 300 0.9991 3 1 1
3 4 14 Domani Jackson https://247sports.com/Player/Domani-Jackson-46057101 Mater Dei (Santa Ana, CA) CB 6-1 185 0.9966 4 2 1 USC
4 5 10 Zach Rice https://247sports.com/Player/Zach-Rice-46086346 Liberty Christian Academy (Lynchburg, VA) OT 6-6 282 0.9951 5 1 1
5 6 4 Gabriel Brownlow-Dindy https://247sports.com/Player/Gabriel-Brownlow-Dindy-46084792 Lakeland (Lakeland, FL) DL 6-3 275 0.9946 6 2 1
6 7 5 Shemar Stewart https://247sports.com/Player/Shemar-Stewart-46080267 Monsignor Pace (Opa Locka, FL) DL 6-5 260 0.9946 7 3 2
7 8 20 Denver Harris https://247sports.com/Player/Denver-Harris-46081216 North Shore (Houston, TX) CB 6-1 180 0.9944 8 3 2
8 9 33 Travis Shaw https://247sports.com/Player/Travis-Shaw-46057330 Grimsley (Greensboro, NC) DL 6-5 310 0.9939 9 4 1
9 10 23 Devon Campbell https://247sports.com/Player/Devon-Campbell-46093947 Bowie (Arlington, TX) IOL 6-3 310 0.9937 10 1 3


Using a for loop to create a new column in pandas dataframe

I have been trying to create a web crawler to scrape data from a website called Baseball Reference. When defining my crawler, I realized that each player has a unique id at the end of their URL, made up of the first 6 letters of their last name, three zeroes, and the first 3 letters of their first name.
I already have a pandas dataframe containing columns 'first' and 'last' with each player's first and last names, along with a lot of other data that I downloaded from this same website.
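For illustration, the id scheme described there would compose roughly like this (a sketch only; judging by the ids in the answer's output further down, short last names get padded with '-', and plenty of ids don't follow the pattern at all, which is why the answer below scrapes them instead):

# Hypothetical composition of the described id scheme for a single player.
first, last = 'Zach', 'Aaker'
player_id = last.lower()[:6].ljust(6, '-') + '000' + first.lower()[:3]
print(player_id)   # aaker-000zac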
my def for my crawler function is as follows so far:
def bbref_crawler(ID):
    url = 'https://www.baseball-reference.com/register/player.fcgi?id=' + str(ID)
    source_code = requests.get(url)
    page_soup = soup(source_code.text, features='lxml')
And the code that I have so far trying to obtain the player id's is as follows:
for x in nwl_offense:
    while len(nwl_offense['last']) > 6:
        id_last = len(nwl_offense['last']) - 1
    while len(nwl_offense['first']) > 3:
        id_first = len(nwl_offense['first']) - 1
    nwl_offense['player_id'] = (str(id_first) + '000' + str(id_last))
When I run the for/while loop, it just never stops running, and I am not sure how else to achieve my goal of automating the player id into another column of that dataframe, so I can easily use the crawler to obtain more information on the players for a project.
This is what the first 5 rows of the dataframe, nwl_offense look like:
print(nwl_offense.head())
   Rk            Name   Age     G  ...        WRC+       WRC      WSB    OWins
0 1.0 Brian Baker 20.0 14.0 ... 733.107636 2.007068 0.099775 0.189913
1 2.0 Drew Beazley 21.0 46.0 ... 112.669541 29.920766 -0.456988 2.655892
2 3.0 Jarrett Bickel 21.0 33.0 ... 85.017293 15.245547 1.419822 1.502232
3 4.0 Nate Boyle 23.0 21.0 ... 1127.591556 1.543534 0.000000 0.139136
4 5.0 Seth Brewer* 22.0 12.0 ... 243.655365 1.667671 0.099775 0.159319
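As an aside, the loop above never terminates because len(nwl_offense['last']) is the number of rows in the column, not the length of any one name, so the while conditions never change. A vectorized sketch of what it was presumably aiming for (same assumptions as the sketch above):

# Hypothetical vectorized construction of the described id column.
nwl_offense['player_id'] = (
    nwl_offense['last'].str.lower().str[:6].str.ljust(6, '-')
    + '000'
    + nwl_offense['first'].str.lower().str[:3]
)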
As stated in the comments, I wouldn't try to create a function to make the ids, as there will likely be some "quirky" ones in there that don't follow that logic.
Instead, I would just go through each letter-search page that the site divides players into and get the id directly from each player's URL.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.baseball-reference.com/register/player.fcgi'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

player_register_search = {}
searchLinks = soup.find('div', {'id': 'div_players'}).find_all('li')
for each in searchLinks:
    links = each.find_all('a', href=True)
    for link in links:
        print(link)
        player_register_search[link.text] = 'https://www.baseball-reference.com/' + link['href']

tot = len(player_register_search)
playerIds = {}
for count, (k, link) in enumerate(player_register_search.items(), start=1):
    print(f'{count} of {tot} - {link}')
    response = requests.get(link)
    soup = BeautifulSoup(response.text, 'html.parser')

    kLower = k.lower()
    playerSection = soup.find('div', {'id': f'all_players_{kLower}'})
    h2 = playerSection.find('h2').text
    #print('\t', h2)
    player_links = playerSection.find_all('a', href=True)
    for player in player_links:
        playerName = player.text.strip()
        playerId = player['href'].split('id=')[-1].strip()

        if playerName not in playerIds.keys():
            playerIds[playerName] = []
        #print(f'\t{playerName}: {playerId}')
        playerIds[playerName].append(playerId)

df = pd.DataFrame({'Player': list(playerIds.keys()),
                   'id': list(playerIds.values())})
Output:
print(df)
Player id
0 Scott A'Hara [ahara-000sco]
1 A'Heasy [ahease001---]
2 Al Aaberg [aaberg001alf]
3 Kirk Aadland [aadlan001kir]
4 Zach Aaker [aaker-000zac]
... ...
323628 Mike Zywica [zywica001mic]
323629 Joseph Zywiciel [zywici000jos]
323630 Bobby Zywicki [zywick000bob]
323631 Brandon Zywicki [zywick000bra]
323632 Nate Zyzda [zyzda-000nat]
[323633 rows x 2 columns]
To get just the players from your dataframe:

# Sample of the dataframe (just an example - do not include this in your code)
nwl_offense = pd.DataFrame({'first': ['Evan', 'Kelby'],
                            'last': ['Albrecht', 'Golladay']})

Use this:

# YOUR DATAFRAME - GET LIST OF FULL NAMES
player_interest_list = list(nwl_offense['first'] + ' ' + nwl_offense['last'])
nwl_players = df.loc[df['Player'].isin(player_interest_list)]
Output:
print(nwl_players)
Player id
3095 Evan Albrecht [albrec001eva, albrec000eva]
108083 Kelby Golladay [gollad000kel]
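From there, the matched ids could be fed back into the bbref_crawler function from the question, e.g. (a sketch; note the 'id' column holds a list because some names map to more than one id):

# Hypothetical usage: crawl the register page of every matched player.
for _, player_row in nwl_players.iterrows():
    for pid in player_row['id']:
        bbref_crawler(pid)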

Web Scraping Table with class from CoinMarketCap

I'm trying to scrape an entire table of data from:
https://coinmarketcap.com/currencies/ethereum/historical-data/
I'm trying to extract the table:
<table class="h7vnx2-2 jNaLNi cmc-table ">
with
table = soup.find('table', {"class":"h7nvx2-2 jNaLNi cmc-table"})
and it returns: None
here's the full code:
import requests
from bs4 import BeautifulSoup
def main():
    URL = "https://coinmarketcap.com/currencies/ethereum/historical-data/"
    page = requests.get(URL)
    soup = BeautifulSoup(page.content, "html.parser")
    table = soup.find('table', {"class": "h7nvx2-2 jNaLNi cmc-table"})
    print(table)

if __name__ == "__main__":
    main()
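Two things are going on here: the class string passed to find ("h7nvx2-2 ...") doesn't match the one shown in the HTML snippet ("h7vnx2-2 ..."), and, more importantly, a matching class wouldn't help anyway because the table is rendered by JavaScript and isn't in the HTML that requests receives. A quick way to check that (a sketch, same URL as above):

# Hypothetical check: the server-rendered HTML likely contains no historical-data table at all.
import requests
from bs4 import BeautifulSoup

html = requests.get("https://coinmarketcap.com/currencies/ethereum/historical-data/").text
print(BeautifulSoup(html, "html.parser").find("table"))   # likely None if the table is injected client-side

Both answers below therefore go straight to the API that feeds the page.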
import pandas as pd
from urllib.parse import urlencode

def main(url):
    params = {
        'id': '1027',
        'convertId': '2781',
        'timeStart': '1623283200',
        'timeEnd': '1628553600'
    }
    df = pd.DataFrame(pd.read_json(
        url + urlencode(params))['data']['quotes'])
    df = pd.DataFrame.from_records(df['quote'])
    print(df)

main('https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical?')
As stated, the data is dynamically rendered. Go to the api to get the data directly:
import requests
import pandas as pd

url = 'https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical'
payload = {
    'id': '1027',
    'convertId': '2781',
    'timeStart': '1623283200',
    'timeEnd': '1628553600'}

jsonData = requests.get(url, params=payload).json()
df = pd.json_normalize(jsonData, record_path=['data', 'quotes'])

#rows = [i['quote'] for i in jsonData['data']['quotes']]
#df = pd.DataFrame(rows)
Output:
print(df)
open high ... marketCap timestamp
0 2611.142652 2619.957793 ... 2.872870e+11 2021-06-10T23:59:59.999Z
1 2472.858836 2495.414705 ... 2.736314e+11 2021-06-11T23:59:59.999Z
2 2354.752218 2447.227868 ... 2.758389e+11 2021-06-12T23:59:59.999Z
3 2372.690096 2547.367910 ... 2.916739e+11 2021-06-13T23:59:59.999Z
4 2508.770462 2606.432929 ... 2.951126e+11 2021-06-14T23:59:59.999Z
.. ... ... ... ... ...
56 2725.669632 2840.430748 ... 3.307572e+11 2021-08-05T23:59:59.999Z
57 2827.503486 2944.903352 ... 3.382378e+11 2021-08-06T23:59:59.999Z
58 2891.707469 3170.229727 ... 3.694372e+11 2021-08-07T23:59:59.999Z
59 3161.232779 3184.603971 ... 3.526859e+11 2021-08-08T23:59:59.999Z
60 3012.885711 3185.701187 ... 3.707654e+11 2021-08-09T23:59:59.999Z
[61 rows x 7 columns]
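The timeStart/timeEnd values are Unix timestamps, so a different window can be generated rather than hard-coded; a small sketch with pandas (the dates here are assumptions matching the values used above):

# Hypothetical helper for building the query window from dates.
import pandas as pd

time_start = int(pd.Timestamp('2021-06-10', tz='UTC').timestamp())   # 1623283200
time_end = int(pd.Timestamp('2021-08-10', tz='UTC').timestamp())     # 1628553600
payload = {'id': '1027', 'convertId': '2781',
           'timeStart': str(time_start), 'timeEnd': str(time_end)}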

Python Selenium/soup not scrolling and printing entire job containers on LinkedIn

Here's the problem statement: the base_site link below takes us to a job-search URL.
There are small containers that show jobs in the left pane of the webpage.
The problem is that with this code I can only see 7 containers in the output.
For example, it shows the first seven job-result locations, whereas I am expecting all of them to be shown. For this I am using scrollIntoView, but that doesn't seem to help either.
What is it that I'm missing?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep

def get_driver():
    options = Options()
    options.add_argument("user-data-dir=C:\\Users\\abc\\AppData\\Local\\Google\\Chrome\\User Data")
    path = 'C:\\Program Files (x86)\\Google\\chromedriver.exe'
    options.add_experimental_option("detach", True)
    driver = webdriver.Chrome(path, options=options)

    text_search = 'Product Development Engineer'
    location_search = 'california'
    # base_site = 'https://www.linkedin.com/jobs'
    base_site = 'https://www.linkedin.com/jobs/search/?currentJobId=2638809245&f_E=3%2C4&f_JT=F&f_SB2=3&f_TPR=r60' \
                '4800&geoId=102095887&keywords=product%20development%20engineer&location=California%2C%20United%20States&sortBy=R'
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
               "70.0.3538.102 Safari/537.36 Edge/18.19582"}
    driver.get(base_site)
    parsing_job_data(driver, base_site, headers)

def parsing_job_data(driver, base_site, headers):
    try:
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        soup = BeautifulSoup(driver.page_source, 'lxml')
        results = soup.find_all('div', class_="job-card-container relative job-card-list job-card-container--clickable "
                                              "job-card-list--underline-title-on-hover jobs-search-results-list__list-"
                                              "item--active jobs-search-two-pane__job-card-container--viewport-tracking"
                                              "-0")
        sleep(1)
        each_container = soup.select('[class*="occludable-update"]', limit=20)
        for container in each_container:
            element = driver.find_element_by_class_name("artdeco-entity-lockup__caption")
            driver.execute_script("arguments[0].scrollIntoView(true);", element)
            element.click()
            job_title = container.find('a', class_='disabled ember-view job-card-container__link job-card-list__title').text
            location = container.find('li', class_='job-card-container__metadata-item').text
            job_title = job_title.strip()
            location = location.strip()
            print(job_title, ', ', location)
    except Exception as e:
        print(e)

if __name__ == "__main__":
    get_driver()
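For what it's worth, the left-hand job cards are lazy-loaded as that pane is scrolled, so driver.page_source only ever contains the cards that have rendered; scrolling one element into view, as above, isn't enough. A sketch of scrolling the results pane itself before parsing (the pane selector is an assumption about LinkedIn's markup and may need adjusting):

# Hypothetical helper: keep scrolling the results pane until no new cards load.
from time import sleep

def scroll_results_pane(driver, pause=1.0, max_scrolls=15):
    pane = driver.find_element_by_css_selector('.jobs-search-results-list')   # assumed selector
    last_height = 0
    for _ in range(max_scrolls):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight;', pane)
        sleep(pause)
        new_height = driver.execute_script('return arguments[0].scrollHeight;', pane)
        if new_height == last_height:   # nothing new loaded
            break
        last_height = new_height

The answer below sidesteps the problem entirely by paging LinkedIn's public jobs-guest endpoint instead of driving the browser.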
import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}

async def get_soup(content):
    return BeautifulSoup(content, 'lxml')

allin = []

async def worker(channel):
    async with channel:
        async for num in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(headers)
                params = {
                    "currentJobId": "2638809245",
                    "f_E": "3,4",
                    "f_JT": "F",
                    "f_SB2": "3",
                    "f_TPR": "r604800",
                    "geoId": "102095887",
                    "keywords": "product development engineer",
                    "location": "California, United States",
                    "sortBy": "R",
                    "position": "1",
                    "pageNum": "0",
                    "start": num
                }
                r = await client.get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search', params=params)
                soup = await get_soup(r.text)
                goal = [(x.h3.get_text(strip=True), x.select_one('.job-search-card__location').get_text(strip=True))
                        for x in soup.select('.base-search-card__info')]
                allin.extend(goal)

async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            for _ in range(2):
                nurse.start_soon(worker, receiver.clone())

        async with sender:
            for num in range(0, 450, 25):
                await sender.send(num)

    df = pd.DataFrame(allin, columns=["Title", "Location"])
    print(df)
    #df.to_csv('result.csv', index=False)

if __name__ == "__main__":
    trio.run(main)
Output:
Title Location
0 Packaging Process Engineer Fremont, CA
1 Project Engineer Oakland, CA
2 Process Engineer- Materials and Fibers Santa Clarita, CA
3 Senior Product Design Engineer Carson, CA
4 Design Engineer Sacramento, CA
.. ... ...
436 Software Development Engineer Irvine, CA
437 Software Development Engineer Sunnyvale, CA
438 Software Development Engineer San Luis Obispo, CA
439 Software Development Engineer - Luna Irvine, CA
440 Software Development Engineer Irvine, CA
[441 rows x 2 columns]
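If trio/httpx feels heavy, the same jobs-guest endpoint can be paged synchronously; a minimal sketch with requests, reusing the parameters from the answer above (start moves in steps of 25, matching the answer's pagination):

import requests
from bs4 import BeautifulSoup

url = 'https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"}
jobs = []
for start in range(0, 450, 25):
    params = {"keywords": "product development engineer",
              "location": "California, United States",
              "geoId": "102095887",
              "f_TPR": "r604800",
              "start": start}
    soup = BeautifulSoup(requests.get(url, params=params, headers=headers).text, 'lxml')
    for card in soup.select('.base-search-card__info'):
        jobs.append((card.h3.get_text(strip=True),
                     card.select_one('.job-search-card__location').get_text(strip=True)))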

Selenium/Webscrape this field

My code runs fine and prints the title for all rows except the rows with dropdowns.
For example, row 4 has a dropdown if clicked. I implemented a try block which would, in theory, trigger the dropdown and then pull the titles.
But when I execute click() and try to print, the rows with these dropdowns are not printed.
Expected output: print all titles, including the ones in the dropdowns.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)

page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')
productlist = soup.find_all('div', class_='card item-container session')
for property in productlist:
    sessiontitle = property.find('h4', class_='session-title card-title').text
    print(sessiontitle)
    try:
        ifDropdown = driver.find_elements_by_class_name('item-expand-action expand')
        ifDropdown.click()
        time.sleep(4)
        newTitle = driver.find_element_by_class_name('card-title').text
        print(newTitle)
    except:
        newTitle = 'none'
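Two problems hide in that try block, and the bare except swallows both: find_elements_by_class_name can't take a compound class like 'item-expand-action expand' (a CSS selector is needed for that), and find_elements returns a list, so calling .click() on it raises an AttributeError. A sketch of what it was presumably trying to do, using the selectors from the snippet above:

# Hypothetical fix for the clicking approach.
dropdowns = driver.find_elements_by_css_selector('.item-expand-action.expand')
for dd in dropdowns:
    driver.execute_script('arguments[0].click();', dd)   # JS click avoids "element not clickable" issues
    time.sleep(1)

The answer below skips Selenium entirely and follows each dropdown's link with plain requests instead.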
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_soup(content):
    return BeautifulSoup(content, 'lxml')

def my_filter(req, content):
    try:
        r = req.get(content['href'])
        soup = get_soup(r.text)
        return [x.text for x in soup.select('.card-title')[1:]]
    except TypeError:
        return 'N/A'

def main(url):
    with requests.Session() as req:
        for page in range(1, 2):
            print(f"Extracting Page# {page}\n")
            params = {
                "p": page
            }
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            goal = {x.select_one('.session-title').text: my_filter(
                req, x.select_one('.item-expand-action')) for x in soup.select('.card')}
            df = pd.DataFrame(goal.items(), columns=['Title', 'Menu'])
            print(df)

main('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
Output:
Title Menu
0 Educational sessions on-demand N/A
1 Special Symposia on-demand N/A
2 Multidisciplinary sessions on-demand N/A
3 Illumina - Diagnosing Non-Small Cell Lung Canc... [Illumina gives an update on their IVD road ma...
4 MSD - Homologous Recombination Deficiency: BRC... [Welcome and Introductions, Homologous Recombi...
5 Servier - The clinical value of IDH inhibition... [Isocitric dehydrogenase: an actionable geneti...
6 AstraZeneca - Redefining Breast Cancer – Biolo... [Welcome and Opening, Redefining Breast Cancer...
7 ITM Isotopen Technologien München AG - A Globa... [Welcome & Introduction, Changes in the Incide...
8 MSD - The Role of Biomarkers in Patient Manage... [Welcome and Introductions, The Role of Pd-L1 ...
9 AstraZeneca - Re-evaluating the role of gBRCA ... [Welcome and introduction, What do we know abo...
10 Novartis - Unmet needs in oncogene-driven NSCL... [Welcome and introduction, Unmet needs in onco...
11 Opening session N/A

Extracting URLs from an HTML page

I am working with the following code:
import requests, pandas as pd
from bs4 import BeautifulSoup
if __name__ == '__main__':
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    soup = BeautifulSoup(requests.get(url).text, "lxml").find_all("div", class_="mpi_info")
I am trying to get all the urls like "/homedetail/30729-mcguinness-dr-spring-tx-77386/5204857" in a dataframe but not sure how to go about this.
The addresses are under the class "address". Create a list containing all the href values and pass it to a DataFrame:
import requests, pandas as pd
from bs4 import BeautifulSoup
url = "https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1"
soup = BeautifulSoup(requests.get(url).text, "lxml")
address_links = [tag["href"] for tag in soup.find_all("a", class_="address")]
df = pd.DataFrame(address_links)
print(df.to_string())
Output:
0
0 /homedetail/30729-mcguinness-dr-spring-tx-77386/5204857
1 /homedetail/11-dovecote-spring-tx-77382/5323232
2 /homedetail/9934-crestwater-cir-magnolia-tx-77354/11567525
3 /homedetail/3-shanewood-ct-spring-tx-77382/5325643
4 /homedetail/22-solebrook-path-tomball-tx-77375/12190176
5 /homedetail/24-snowdrop-lily-dr-tomball-tx-77375/14652805
6 /homedetail/26-freestone-pl-spring-tx-77382/9791228
7 /homedetail/8557-alford-point-dr-magnolia-tx-77354/13580284?lid=6218369
8 /homedetail/210-spyglass-park-loop-montgomery-tx-77316/12783261
9 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
10 /homedetail/51-lenox-hill-dr-spring-tx-77382/5331072
11 /homedetail/19-s-garnet-bnd-spring-tx-77382/9164284
import requests, pandas as pd
from bs4 import BeautifulSoup

def scraper():
    lst = []
    url = r'https://www.har.com/search/dosearch?map_tools_nwlat=30.285067156744077&map_tools_nwlng=-95.67872179656118&map_tools_selat=30.106228394674915&map_tools_selng=-95.37032501917425&for_sale=1&property_status=A&listing_price_min=450000&listing_price_max=1000000&bedroom_min=4&lotsize_min=8500&garage_num=3&private_pool=1'
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    for i in soup.find_all('a'):
        if i.get('href') and i.get('href').startswith('/homedetail/'):
            lst.append(i['href'])
    return lst

if __name__ == '__main__':
    urls = scraper()
    df = pd.DataFrame(urls)
    print(df)
Output:
0 /homedetail/30729-mcguinness-dr-spring-tx-7738...
1 /homedetail/30729-mcguinness-dr-spring-tx-7738...
2 /homedetail/11-dovecote-spring-tx-77382/5323232
3 /homedetail/11-dovecote-spring-tx-77382/5323232
4 /homedetail/9934-crestwater-cir-magnolia-tx-77...
5 /homedetail/9934-crestwater-cir-magnolia-tx-77...
6 /homedetail/3-shanewood-ct-spring-tx-77382/532...
7 /homedetail/3-shanewood-ct-spring-tx-77382/532...
8 /homedetail/22-solebrook-path-tomball-tx-77375...
9 /homedetail/22-solebrook-path-tomball-tx-77375...
10 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
11 /homedetail/24-snowdrop-lily-dr-tomball-tx-773...
12 /homedetail/26-freestone-pl-spring-tx-77382/97...
13 /homedetail/26-freestone-pl-spring-tx-77382/97...
14 /homedetail/8557-alford-point-dr-magnolia-tx-7...
15 /homedetail/8557-alford-point-dr-magnolia-tx-7...
16 /homedetail/210-spyglass-park-loop-montgomery-...
17 /homedetail/210-spyglass-park-loop-montgomery-...
18 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
19 /homedetail/6-rosedown-pl-spring-tx-77382/5329545
20 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
21 /homedetail/51-lenox-hill-dr-spring-tx-77382/5...
22 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
23 /homedetail/19-s-garnet-bnd-spring-tx-77382/91...
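Note that this second approach returns every link twice (presumably because each listing links the address from two elements); if assuming the duplicates aren't wanted, they're easy to drop:

# Deduplicate the collected links before working with the dataframe.
df = pd.DataFrame(urls).drop_duplicates().reset_index(drop=True)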