python selenium/soup not scrolling and printing entire job containers in linkedined - selenium

Here's the problem statement: The base_site link below takes us to a job search URL.
There are small containers that show jobs on the left pane of the webpage.
The problem is that with this code I can only see 7 containers as output.
For example, it shows the 1st seven job result locations in the output whereas I am expecting all of them to be shown in the output. For this, I am using scrolltoview but that doesn't seem to help as well.
What is it that I'm missing?
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from time import sleep
def get_driver():
options = Options()
options.add_argument("user-data-dir=C:\\Users\\abc\\AppData\\Local\\Google\\Chrome\\User Data")
path = 'C:\\Program Files (x86)\\Google\\chromedriver.exe'
options.add_experimental_option("detach", True)
driver = webdriver.Chrome(path, options=options)
text_search = 'Product Development Engineer'
location_search = 'california'
# base_site = 'https://www.linkedin.com/jobs'
base_site = 'https://www.linkedin.com/jobs/search/?currentJobId=2638809245&f_E=3%2C4&f_JT=F&f_SB2=3&f_TPR=r60' \
'4800&geoId=102095887&keywords=product%20development%20engineer&location=California%2C%20United%20States&sortBy=R'
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
"70.0.3538.102 Safari/537.36 Edge/18.19582"}
driver.get(base_site)
parsing_job_data(driver, base_site, headers)
def parsing_job_data(driver, base_site, headers):
try:
html = driver.find_element_by_tag_name('html')
html.send_keys(Keys.END)
soup = BeautifulSoup(driver.page_source, 'lxml')
results = soup.find_all('div', class_="job-card-container relative job-card-list job-card-container--clickable "
"job-card-list--underline-title-on-hover jobs-search-results-list__list-"
"item--active jobs-search-two-pane__job-card-container--viewport-tracking"
"-0")
sleep(1)
each_container = soup.select('[class*="occludable-update"]', limit=20)
for container in each_container:
element = driver.find_element_by_class_name("artdeco-entity-lockup__caption")
driver.execute_script("arguments[0].scrollIntoView(true);", element)
element.click()
job_title = container.find('a', class_='disabled ember-view job-card-container__link job-card-list__title').text
location = container.find('li', class_='job-card-container__metadata-item').text
job_title = job_title.strip()
location = location.strip()
print(job_title, ', ', location)
except Exception as e:
print(e)
if __name__ == "__main__":
get_driver()

import trio
import httpx
from bs4 import BeautifulSoup
import pandas as pd
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}
async def get_soup(content):
return BeautifulSoup(content, 'lxml')
allin = []
async def worker(channel):
async with channel:
async for num in channel:
async with httpx.AsyncClient(timeout=None) as client:
client.headers.update(headers)
params = {
"currentJobId": "2638809245",
"f_E": "3,4",
"f_JT": "F",
"f_SB2": "3",
"f_TPR": "r604800",
"geoId": "102095887",
"keywords": "product development engineer",
"location": "California, United States",
"sortBy": "R",
"position": "1",
"pageNum": "0",
"start": num
}
r = await client.get('https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search', params=params)
soup = await get_soup(r.text)
goal = [(x.h3.get_text(strip=True), x.select_one('.job-search-card__location').get_text(strip=True))
for x in soup.select('.base-search-card__info')]
allin.extend(goal)
async def main():
async with trio.open_nursery() as nurse:
sender, receiver = trio.open_memory_channel(0)
async with receiver:
for _ in range(2):
nurse.start_soon(worker, receiver.clone())
async with sender:
for num in range(0, 450, 25):
await sender.send(num)
df = pd.DataFrame(allin, columns=["Title", "Location"])
print(df)
#df.to_csv('result.csv', index=False)
if __name__ == "__main__":
trio.run(main)
Output:
Title Location
0 Packaging Process Engineer Fremont, CA
1 Project Engineer Oakland, CA
2 Process Engineer- Materials and Fibers Santa Clarita, CA
3 Senior Product Design Engineer Carson, CA
4 Design Engineer Sacramento, CA
.. ... ...
436 Software Development Engineer Irvine, CA
437 Software Development Engineer Sunnyvale, CA
438 Software Development Engineer San Luis Obispo, CA
439 Software Development Engineer - Luna Irvine, CA
440 Software Development Engineer Irvine, CA
[441 rows x 2 columns]

Related

How to crawl start URLs only?

I'm trying to make the spider crawl just the provided start URLs without following any extracted links. I've tried setting rules = (Rule (follow=False),) but it still follows links. Does anyone know how to download the start URLs only?
EDIT:
Here's some code
class Spider(CrawlSpider):
name = 'spider'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
def __init__(self, mode, *args, **kwargs):
if mode == 'scan':
self.start_urls = ['https://www.example.com/']
self.rules = (Rule (callback="parse_obj", follow=False),
)
self.custom_settings = {
'COMPRESSION_ENABLED': True,
'URLLENGTH_LIMIT': 100,
'DOWNLOAD_DELAY': 1
}
elif mode == 'crawl':
# something else
super(Spider, self).__init__(*args, **kwargs)

Why am I seeing "no connection adapters found" when trying to use results as a variable, but not while trying to print?

Hope I am asking this the right way - just confused with what's going on: I have my working script (below). I'm trying to take the URLs from a spreadsheet, rather than copy and paste them in - basically, creating urlsA from column N on the sheet connected.
I've tested it out - I can print urlsA to terminal no problem, so I know the Sheet connection is working. I just can't seem to use them when I try to run the full script. I'm receiving this error:
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=2&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=6&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=7&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=17&stats_player_seq=-100',
'https://stats.ncaa.org/player/game_by_game?game_sport_year_ctl_id=15881&id=15881&org_id=23&stats_player_seq=-100']
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
time.sleep(5)
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in soup.select('.smtext > a[href^="/contests/"]'):
profileA = 'https://stats.ncaa.org'+profileA.get('href')
profilesA.append(profileA)
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in soup.select('a[href^="/game/play_by_play/"]'):
profileB = 'https://stats.ncaa.org'+profileB.get('href')
profilesB.append(profileB)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
urls = profilesB
s = requests.Session()
s.headers.update(headers)
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
As you might be pulling a list of lists, hence, this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error as a list is not a valid URL.
Needed to flatten urlsA after seeing it was an array of arrays. Using this, then calling flatten fixed the issue:
def flatten(l):
fl = []
for sublist in l:
for item in sublist:
fl.append(item)
return fl

I cant get text from atribute span in bs4

when i try to get text i have a output like:
price = item.find('span').text
AttributeError: 'NoneType' object has no attribute 'text'
code:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url= 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers = {'User-
Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first
inlineblock')[0]
for item in table:
price = item.find('span').text
print(price)
Try:
#___IMPORTS_____
from datetime import date
import calendar
import requests
from bs4 import BeautifulSoup
#_______________
url= 'https://www.investing.com/currencies/eur-usd'
page = requests.get(url, headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'})
#print(f'Status code is: {page.status_code}')
soup = BeautifulSoup(page.text, 'html.parser')
table = soup.find_all('div', class_='first inlineblock')
for item in table:
price = item.find('span', class_='float_lang_base_2')
print(price.text)
1.1753
1.1752
- 0.4
Or if you require the field:
for item in table:
field = item.find('span', class_='float_lang_base_1')
price = item.find('span', class_='float_lang_base_2')
print(field.text, ':', price.text)
Prev. Close : 1.1753
Open : 1.1752
1-Year Change : - 0.4

Creating a table off of 247sports website

I am trying to create a pandas dataframe based off of the top 1000 recruits from the 2022 football recruiting class from the 247sports website in a google colab notebook. I currently am using the following code so far:
#Importing all necessary packages
import pandas as pd
import time
import datetime as dt
import os
import re
import requests
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#import twofourseven
from bs4 import BeautifulSoup
from splinter import Browser
from kora.selenium import wd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import requests
from geopy.geocoders import Nominatim
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
year = '2022'
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeRecruitRankings?InstitutionGroup=HighSchool'
# Add the `user-agent` otherwise we will get blocked when sending the request
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}
response = requests.get(url, headers = headers).content
soup = BeautifulSoup(response, "html.parser")
data = []
for tag in soup.find_all("li", class_="rankings-page__list-item"): # `[1:]` Since the first result is a table header
# meta = tag.find_all("span", class_="meta")
rank = tag.find_next("div", class_="primary").text
TwoFourSeven_rank = tag.find_next("div", class_="other").text
name = tag.find_next("a", class_="rankings-page__name-link").text
school = tag.find_next("span", class_="meta").text
position = tag.find_next("div", class_="position").text
height_weight = tag.find_next("div", class_="metrics").text
rating = tag.find_next("span", class_="score").text
nat_rank = tag.find_next("a", class_="natrank").text
state_rank = tag.find_next("a", class_="sttrank").text
pos_rank = tag.find_next("a", class_="posrank").text
data.append(
{
"Rank": rank,
"247 Rank": TwoFourSeven_rank,
"Name": name,
"School": school,
"Class of": year,
"Position": position,
"Height & Weight": height_weight,
"Rating": rating,
"National Rank": nat_rank,
"State Rank": state_rank,
"Position Rank": pos_rank,
# "School": ???,
}
)
print(rank)
df = pd.DataFrame(data)
data
Ideally, I would also like to grab the school name the recruit chose from the logo on the table, but I am not sure how to go about that. For example, I would like to print out "Florida State" for the school column from this "row" of data.
Along with that, I do get an output of printing ranks, but afterwards, I get the following error that won't allow me to collect and/or print out additional data:
AttributeError Traceback (most recent call last)
<ipython-input-11-56f4779601f8> in <module>()
16 # meta = tag.find_all("span", class_="meta")
17
---> 18 rank = tag.find_next("div", class_="primary").text
19 # TwoFourSeven_rank = tag.find_next("div", class_="other").text
20 name = tag.find_next("a", class_="rankings-page__name-link").text
AttributeError: 'NoneType' object has no attribute 'text'
Lastly, I do understand that this webpage only displays 50 recruits without having my python code click the "Load more" tab via selenium, but I am not 100% sure how to incorporate that in the most efficient and legible way possible. If anyone knows a good way to do all this, I'd greatly appreciate it. Thanks in advance.
Use try/except as some of the elements will not be present. Also no need to use Selenium. Simple requests will do.
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = 'https://247sports.com/Season/2022-Football/CompositeRecruitRankings/?InstitutionGroup=HighSchool'
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}
rows = []
page = 0
while True:
page +=1
print('Page: %s' %page)
payload = {'Page': '%s' %page}
response = requests.get(url, headers=headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
athletes = soup.find_all('li',{'class':'rankings-page__list-item'})
if len(athletes) == 0:
break
continue_loop = True
while continue_loop == True:
for athlete in athletes:
if athlete.text.strip() == 'Load More':
continue_loop = False
continue
primary_rank = athlete.find('div',{'class':'rank-column'}).find('div',{'class':'primary'}).text.strip()
try:
other_rank = athlete.find('div',{'class':'rank-column'}).find('div',{'class':'other'}).text.strip()
except:
other_rank = ''
name = athlete.find('div',{'class':'recruit'}).find('a').text.strip()
link = 'https://247sports.com' + athlete.find('div',{'class':'recruit'}).find('a')['href']
highschool = ' '.join([x.strip() for x in athlete.find('div',{'class':'recruit'}).find('span',{'class':'meta'}).text.strip().split('\n')])
pos = athlete.find('div',{'class':'position'}).text.strip()
ht = athlete.find('div',{'class':'metrics'}).text.split('/')[0].strip()
wt = athlete.find('div',{'class':'metrics'}).text.split('/')[1].strip()
rating = athlete.find('span',{'class':'score'}).text.strip()
nat_rank = athlete.find('a',{'class':'natrank'}).text.strip()
pos_rank = athlete.find('a',{'class':'posrank'}).text.strip()
st_rank = athlete.find('a',{'class':'sttrank'}).text.strip()
try:
team = athlete.find('div',{'class':'status'}).find('img')['title']
except:
team = ''
row = {'Primary Rank':primary_rank,
'Other Rank':other_rank,
'Name':name,
'Link':link,
'Highschool':highschool,
'Position':pos,
'Height':ht,
'weight':wt,
'Rating':rating,
'National Rank':nat_rank,
'Position Rank':pos_rank,
'State Rank':st_rank,
'Team':team}
rows.append(row)
df = pd.DataFrame(rows)
**Output: first 10 rows of 1321 rows - **
print(df.head(10).to_string())
Primary Rank Other Rank Name Link Highschool Position Height weight Rating National Rank Position Rank State Rank Team
0 1 1 Quinn Ewers https://247sports.com/Player/Quinn-Ewers-45572600 Southlake Carroll (Southlake, TX) QB 6-3 206 1.0000 1 1 1 Ohio State
1 2 3 Travis Hunter https://247sports.com/Player/Travis-Hunter-46084728 Collins Hill (Suwanee, GA) CB 6-1 165 0.9993 2 1 1 Florida State
2 3 2 Walter Nolen https://247sports.com/Player/Walter-Nolen-46083769 St. Benedict at Auburndale (Cordova, TN) DL 6-4 300 0.9991 3 1 1
3 4 14 Domani Jackson https://247sports.com/Player/Domani-Jackson-46057101 Mater Dei (Santa Ana, CA) CB 6-1 185 0.9966 4 2 1 USC
4 5 10 Zach Rice https://247sports.com/Player/Zach-Rice-46086346 Liberty Christian Academy (Lynchburg, VA) OT 6-6 282 0.9951 5 1 1
5 6 4 Gabriel Brownlow-Dindy https://247sports.com/Player/Gabriel-Brownlow-Dindy-46084792 Lakeland (Lakeland, FL) DL 6-3 275 0.9946 6 2 1
6 7 5 Shemar Stewart https://247sports.com/Player/Shemar-Stewart-46080267 Monsignor Pace (Opa Locka, FL) DL 6-5 260 0.9946 7 3 2
7 8 20 Denver Harris https://247sports.com/Player/Denver-Harris-46081216 North Shore (Houston, TX) CB 6-1 180 0.9944 8 3 2
8 9 33 Travis Shaw https://247sports.com/Player/Travis-Shaw-46057330 Grimsley (Greensboro, NC) DL 6-5 310 0.9939 9 4 1
9 10 23 Devon Campbell https://247sports.com/Player/Devon-Campbell-46093947 Bowie (Arlington, TX) IOL 6-3 310 0.9937 10 1 3

Why not all of the coordinates are generated while running Geopy to scrape yellow pages?

The output is a csv file with a list of businesses including name, address, telephone and coordinates, for some reason only partial coordinates are generated, the ones that aren't generated and ran in a single run with geopy will find the coordinates, so potentially geopy can find the coordinates for all of them but for some reason it skips sometimes, I thought it might be needing some time to call the api and added threading but it didn't solve the issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ypscraper#gmail.com")
main_list = []
def extract(url):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, 'html.parser')
return soup.find_all('div', class_ = 'listing__content__wrap--flexed jsGoToMp')
def transform(articles):
for item in articles:
name = item.find('a', class_ ='listing__name--link listing__link jsListingName').text
try:
street = item.find('span', {'itemprop':'streetAddress'}).text
except:
street = ''
try:
city = item.find('span', {'itemprop':'addressLocality'}).text
except:
city = ''
try:
province = item.find('span', {'itemprop':'addressRegion'}).text
except:
province = ''
try:
postCode = item.find('span', {'itemprop':'postalCode'}).text
except:
postCode = ''
try:
phone = item.find('li', class_ = 'mlr__submenu__item').text.strip()
except:
phone = ''
try:
def search_geo():
global location
location = geolocator.geocode(street + ' ' + city)
print(street + ' ' + city)
thread = threading.Thread(target=search_geo)
thread.start()
thread.join()
slatitude = location.latitude
except:
slatitude = ''
try:
thread = threading.Thread(target=search_geo)
thread.start()
thread.join()
slongitude = location.longitude
except:
slongitude = ''
business = {
'name': name,
'street': street,
'city': city,
'province': province,
'postCode': postCode,
'phone': phone,
'slongitude': slongitude,
'slatitude': slatitude
}
main_list.append(business)
return
def load():
df = pd.DataFrame(main_list)
df.to_csv('repairshopsbc', index=False)
for x in range(1,2):
print(f'Getting page {x}')
articles = extract(f'https://www.yellowpages.ca/search/si/{x}/car+repair/British+Columbia+BC')
transform(articles)
time.sleep(5)
load()
print('Saved to CSV')