I have one more issue with my BeautifulSoup scraper and was hoping you'd be able to help me. Since I added the code to extract information from the details pages, the scraper only scrapes the first page and no longer works through the multiple pages listed.
I assume it has to do with the loop, but I'm not sure how to structure it so that it doesn't cause issues.
Is there a better way to handle pagination?
I've posted the full code and marked the new section that's causing problems with comments.
Full code:
from bs4 import BeautifulSoup
import scrapy
import requests
import csv
import time
import os
pages = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
with open(r'csv', 'a', encoding='utf-8', newline='') as f_output:
    csv_print = csv.writer(f_output)
    file_is_empty = os.stat(r'C:\csv').st_size == 0
    if file_is_empty:
        csv_print.writerow(['Title', 'Company', 'Location', 'Salary', 'Summary', 'Link', 'Description', 'URL'])
    for page in pages:
        source = requests.get('https://www.indeed.com/jobs?q=work+from+home&l=United+States&fromage=1&start={}'.format(page)).text
        soup = BeautifulSoup(source, 'lxml')
        results = soup.findAll("div", {"class": "result"})
    for jobs in soup.find_all(class_='result'):
        try:
            title = jobs.h2.text.strip()
        except Exception as e:
            title = None
        print('Title:', title)
        try:
            company = jobs.span.text.strip()
        except Exception as e:
            company = None
        print('Company:', company)
        try:
            location = jobs.find('span', class_='location').text.strip()
        except Exception as e:
            location = None
        print('Location:', location)
        try:
            salary = jobs.find('span', class_='no-wrap').text.strip()
        except Exception as e:
            salary = None
        print('Salary:', salary)
        try:
            summary = soup.find('div', class_='summary').text.strip()
        except Exception as e:
            summary = None
        print('Summary:', summary)
        link = jobs.a['href']
        if 'http' not in link:
            link = ("https://www.indeed.com" + link)
        print('Link:', link)
        # --- new section: scrape the details page (this is the part causing problems) ---
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        job_description = soup.find('div', id='jobDescriptionText').decode_contents(formatter="html")
        print('job_description:', job_description)
        try:
            url = soup.find('div', class_='icl-u-lg-hide').attrs['href']
        except Exception as e:
            url = None
        print('url:', url)
        # --- end of new section ---
        csv_print.writerow((title, company, location, salary, summary, link, job_description, url))
        print('--------')
        time.sleep(0.5)
Thank you so much :)
You guys are the best!
A couple of things:
BeautifulSoup parses HTML; it has nothing to do with implementing the pagination process.
The way your code is written, it actually doesn't scrape the first page; it scrapes the last page. You iterate through the list of pages, and only then parse the HTML (which at that point is the very last request made). You need to parse the results for each page, meaning for jobs in soup.find_all(class_='result'): needs to sit inside your for page in pages: loop.
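For example, keeping your csv approach, the corrected nesting looks roughly like this. This is a trimmed-down sketch, not your full field list; the output filename is just a placeholder, and the details-page response gets its own names so it doesn't overwrite page and soup:
from bs4 import BeautifulSoup
import requests
import csv
import time

pages = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

with open('jobs.csv', 'a', encoding='utf-8', newline='') as f_output:
    csv_print = csv.writer(f_output)
    for page in pages:
        source = requests.get('https://www.indeed.com/jobs?q=work+from+home&l=United+States&fromage=1&start={}'.format(page)).text
        soup = BeautifulSoup(source, 'lxml')
        # parse THIS page's results before requesting the next listing page
        for jobs in soup.find_all(class_='result'):
            title = jobs.h2.text.strip() if jobs.h2 else None
            link = jobs.a['href']
            if 'http' not in link:
                link = 'https://www.indeed.com' + link
            # details page: new names, so the listing-page soup stays intact
            detail = requests.get(link)
            detail_soup = BeautifulSoup(detail.content, 'html.parser')
            desc_div = detail_soup.find('div', id='jobDescriptionText')
            job_description = desc_div.decode_contents(formatter='html') if desc_div else None
            csv_print.writerow((title, link, job_description))
        time.sleep(0.5)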
This is just my preference, but I like using pandas, so I used that instead of csv to write to the file.
Code:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
pages = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
titleList = []
companyList = []
locList = []
salList = []
summaryList = []
linkList = []
descList = []
urlList = []
for page in pages:
    source = requests.get('https://www.indeed.com/jobs?q=work+from+home&l=United+States&fromage=1&start={}'.format(page)).text
    soup = BeautifulSoup(source, 'lxml')
    print('Page: %s' % page)
    results = soup.findAll("div", {"class": "result"})
    for jobs in soup.find_all(class_='result'):
        try:
            title = jobs.h2.text.strip()
        except Exception as e:
            title = None
        print('Title:', title)
        try:
            company = jobs.span.text.strip()
        except Exception as e:
            company = None
        #print('Company:', company)
        try:
            location = jobs.find('span', class_='location').text.strip()
        except Exception as e:
            location = None
        #print('Location:', location)
        try:
            salary = jobs.find('span', class_='no-wrap').text.strip()
        except Exception as e:
            salary = None
        #print('Salary:', salary)
        try:
            summary = soup.find('div', class_='summary').text.strip()
        except Exception as e:
            summary = None
        #print('Summary:', summary)
        link = jobs.a['href']
        if 'http' not in link:
            link = ("https://www.indeed.com" + link)
        #print('Link:', link)
        page = requests.get(link)
        soup = BeautifulSoup(page.content, 'html.parser')
        job_description = soup.find('div', id='jobDescriptionText').decode_contents(formatter="html")
        #print('job_description:', job_description)
        try:
            url = soup.find('div', class_='icl-u-lg-hide').attrs['href']
        except Exception as e:
            url = None
        #print('url:', url)
        titleList.append(title)
        companyList.append(company)
        locList.append(location)
        salList.append(salary)
        summaryList.append(summary)
        linkList.append(link)
        descList.append(job_description)
        urlList.append(url)
        print('--------')
        time.sleep(0.5)

df = pd.DataFrame({
    'Title': titleList,
    'Company': companyList,
    'Location': locList,
    'Salary': salList,
    'Summary': summaryList,
    'Link': linkList,
    'Description': descList,
    'URL': urlList})

df.to_csv('file.csv', index=False)
Output:
print (df.head(15).to_string())
Title Company Location Salary Summary Link Description URL
0 Medical Transcription Documentation Specialist... new None None Extract electronic medical record data includi... https://www.indeed.com/rc/clk?jk=b728723376dae... Dane Street is looking for highly motivated ca... None
1 Company Partnerships Data Contractor\nnew new New York, NY 10001 (Flatiron District area) $30 an hour None https://www.indeed.com/rc/clk?jk=188b5c89acc25... <div><div>Do you love gathering, analyzing and... None
2 Customer Experience Associate - work from home... new Baltimore, MD 21230 (Spring Garden Industrial ... $16 - $17 an hour None https://www.indeed.com/rc/clk?jk=bdf4e0b474852... <div><div><div><b>About Hungry Harvest</b></di... None
3 Customer Care Representative I - Houston, TX -... new Houston, TX 77036 (Bellaire area) None None https://www.indeed.com/rc/clk?jk=b144f9b3db19a... <div><p><b>Description</b>\n</p>SHIFT: Day Job... None
4 Talent Acquisition Specialist\nnew new None None None https://www.indeed.com/rc/clk?jk=f9c860e3dea80... <p></p><div><p><b>The Position</b></p><p>\nTal... None
5 Talent Acquisition Specialist\nnew new None None None https://www.indeed.com/rc/clk?jk=5b1609f84a81b... <div><div><div><div>Job Details</div>\n</div><... None
6 Success Consultant\nnew new None None None https://www.indeed.com/rc/clk?jk=e2b62a52550b8... <div><div>WORK FROM HOME</div><div><h6 class="... None
7 Work At Home Customer Service Specialist (Bell... new Belle Glade, FL $13.25 an hour None https://www.indeed.com/rc/clk?jk=0bd3274a26a1f... US55505\n<br/><br/>\nJob Description Details\n... None
8 Data Associate\nnew new None None None https://www.indeed.com/rc/clk?jk=88d65d8102b17... <div></div><div><div><div><div>The DNC Coordin... None
9 States Data Analyst\nnew new None None None https://www.indeed.com/rc/clk?jk=8c84a472e6119... <div></div><div><div><div><div>The DNC Coordin... None
10 Coder Franciscan Health None None Understanding of payer relationships, requirem... https://www.indeed.com/rc/clk?jk=cc0748fa0c360... <div><div>Ambulatory Coding | Franciscan Allia... None
11 Data Associate Democratic National Committee None None None https://www.indeed.com/rc/clk?jk=88d65d8102b17... <div></div><div><div><div><div>The DNC Coordin... None
12 Work At Home Customer Service Specialist (Boyn... HSN Boynton Beach, FL $13.25 an hour None https://www.indeed.com/rc/clk?jk=a59f7532662bc... <p></p><div>US55505\n<p></p><p><b>Job Descript... None
13 Associate Project Manager ConvergeOne None None None https://www.indeed.com/rc/clk?jk=cbc2520fc153b... <div>C1 Company Overview:<div><b>\nConvergeOne... None
14 Customer Solutions Specialist Webstaurant Store, Inc. Albany, GA $19 an hour None https://www.indeed.com/rc/clk?jk=8f5bd656508fd... <div><p>Customer Solutions Specialist</p>\n<p>... None
....
Related
I'm having some issues with scraping fish images off a website.
species_with_foto = ['/fangster/aborre-perca-fluviatilis/1',
                     '/fangster/almindelig-tangnaal-syngnathus-typhle/155',
                     '/fangster/ansjos-engraulis-encrasicholus/66',
                     '/fangster/atlantisk-tun-blaafinnet-tun-thunnus-thynnus-/137']

titles = []
species = []

for x in species_with_foto:
    specie_page = 'https://www.fiskefoto.dk' + x
    driver.get(specie_page)
    content = driver.page_source
    soup = BeautifulSoup(content)
    brutto = soup.find_all('img', attrs={'class':'rapportBillede'})
    species.append(brutto)
    #print(brutto)
    titles.append(x)
    try:
        driver.find_element(by=By.XPATH, value='/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div').click()
        print('CLicked next', x)
    except NoSuchElementException:
        print('Succesfully finished - :', x)
    time.sleep(2)
This returns a list of lists with the sublist looking like this:
[<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg" style="width:50%;"/>,
<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" style="width:calc(50% - 6px);margin-bottom:7px;"/>]
How can I clean up the list and keep only the src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" part? I have tried other variables in soup.find_all but can't get it to work.
(The Selenium part is also not functioning properly, but I'll get to that afterwards.)
EDIT:
This is my code now; I'm really getting close :) One issue is that my photos are no longer saved in a list of lists, just in one flat list, and for the love of god I don't understand why that happens.
Help to fix and understand this would be greatly appreciated!
titles = []
fish_photos = []

for x in species_with_foto_mini:
    site = "https://www.fiskefoto.dk/" + x
    html = urlopen(site)
    bs = BeautifulSoup(html, 'html.parser')
    titles.append(x)
    try:
        images = bs.find_all('img', attrs={'class':'rapportBillede'})
        for img in images:
            if img.has_attr('src'):
                #print(img['src'])
                a = (img['src'])
                fish_photos.append(a)
    except KeyError:
        print('No src')
    # navigate pages
    try:
        driver.find_element(by=By.XPATH, value='/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div').click()
        print('CLicked next', x)
    except NoSuchElementException:
        print('Succesfully finished -', x)
    time.sleep(2)
EDIT:
I need the end result to be a list of lists looking something like this:
fish_photos = [
    ['/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg',
     '/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg'],
    ['/images/400/tungehvarre-arnoglossus-laterna-medefiskeri-6650-2523403.jpg',
     '/images/400/ulk-myoxocephalus-scorpius-medefiskeri-bundrig-koebenhavner-koebenhavner-torsk-mole-sild-boersteorm-pigge-351-18-48-9-680-2013-6-4.jpg'],
    ['/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-5.02kg-77cm-6436-7486.jpg',
     '/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-10.38kg-96cm-6337-4823146.jpg']]
EDIT:
My output now is a list of identical lists. I need it to put every species in its own list, like this: fish_photo_list = [[trout1, trout2, trout3], [other fish1, other fish2, other fish3], [salmon1, salmon2]]
My initial code did this, but it doesn't anymore.
Here is an example you can adapt:
from urllib.request import urlopen
from bs4 import BeautifulSoup

site = "[insert name of the site]"
html = urlopen(site)
bs = BeautifulSoup(html, 'html.parser')

try:
    images = bs.find_all('img')
    for img in images:
        if img.has_attr('src'):
            print(img['src'])
except KeyError:
    print('No src')
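As for getting one sub-list per species (the follow-up edits above): collect each page's src values into a fresh list and append that whole list once per species, instead of appending every string to one shared list. A minimal sketch of that idea, reusing the names from your edited code (species_with_foto_mini and the surrounding Selenium navigation are assumed to exist as in your snippet):
from urllib.request import urlopen
from bs4 import BeautifulSoup

fish_photos = []
for x in species_with_foto_mini:
    bs = BeautifulSoup(urlopen('https://www.fiskefoto.dk/' + x), 'html.parser')
    # one fresh list per species page
    srcs = [img['src'] for img in bs.find_all('img', attrs={'class': 'rapportBillede'}) if img.has_attr('src')]
    # append the whole sub-list, not the individual strings
    fish_photos.append(srcs)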
My code runs fine and prints the title for all rows except the rows with dropdowns.
For example, row 4 has a dropdown that expands when clicked. I implemented a try block which, in theory, should trigger the dropdown and then pull the titles.
But when I execute click() and try to print, the rows with these dropdowns are not printed.
Expected output: print all titles, including the ones inside the dropdowns.
from selenium import webdriver
from bs4 import BeautifulSoup
import time

driver = webdriver.Chrome()
driver.get('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
time.sleep(4)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

productlist = soup.find_all('div', class_='card item-container session')

for property in productlist:
    sessiontitle = property.find('h4', class_='session-title card-title').text
    print(sessiontitle)
    try:
        ifDropdown = driver.find_elements_by_class_name('item-expand-action expand')
        ifDropdown.click()
        time.sleep(4)
        newTitle = driver.find_element_by_class_name('card-title').text
        print(newTitle)
    except:
        newTitle = 'none'
You can do this without Selenium: each card's .item-expand-action link returns the expanded content, so you can follow it with requests and read the nested card titles from the response.
import requests
from bs4 import BeautifulSoup
import pandas as pd


def get_soup(content):
    return BeautifulSoup(content, 'lxml')


def my_filter(req, content):
    try:
        r = req.get(content['href'])
        soup = get_soup(r.text)
        return [x.text for x in soup.select('.card-title')[1:]]
    except TypeError:
        return 'N/A'


def main(url):
    with requests.Session() as req:
        for page in range(1, 2):
            print(f"Extracting Page# {page}\n")
            params = {
                "p": page
            }
            r = req.get(url, params=params)
            soup = get_soup(r.text)
            goal = {x.select_one('.session-title').text: my_filter(
                req, x.select_one('.item-expand-action')) for x in soup.select('.card')}
            df = pd.DataFrame(goal.items(), columns=['Title', 'Menu'])
            print(df)


main('https://cslide.ctimeetingtech.com/esmo2021/attendee/confcal/session/list')
Output:
Title Menu
0 Educational sessions on-demand N/A
1 Special Symposia on-demand N/A
2 Multidisciplinary sessions on-demand N/A
3 Illumina - Diagnosing Non-Small Cell Lung Canc... [Illumina gives an update on their IVD road ma...
4 MSD - Homologous Recombination Deficiency: BRC... [Welcome and Introductions, Homologous Recombi...
5 Servier - The clinical value of IDH inhibition... [Isocitric dehydrogenase: an actionable geneti...
6 AstraZeneca - Redefining Breast Cancer – Biolo... [Welcome and Opening, Redefining Breast Cancer...
7 ITM Isotopen Technologien München AG - A Globa... [Welcome & Introduction, Changes in the Incide...
8 MSD - The Role of Biomarkers in Patient Manage... [Welcome and Introductions, The Role of Pd-L1 ...
9 AstraZeneca - Re-evaluating the role of gBRCA ... [Welcome and introduction, What do we know abo...
10 Novartis - Unmet needs in oncogene-driven NSCL... [Welcome and introduction, Unmet needs in onco...
11 Opening session N/A
I am trying to find certain words within a website. Right now my code can only check for one word, but I want it to check for multiple words (say, instead of just checking for 'dog', I want it to check for ["dog", "cat", "adult"]).
#Import Packages
import requests
from bs4 import BeautifulSoup

def count_words(url, the_word):
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    words = soup.find(text=lambda text: text and the_word in text)
    print(words)

def main():
    url = 'https://patch.com/illinois/alsip-crestwood/pet-adoption-alsip-crestwood-area-see-latest-dogs-cats-more'
    word = 'dog'
    count = count_words(url, word)
    print(url, count, word)

if __name__ == '__main__':
    main()
Basically, I do not know how to pass in a list of words instead of a single string!
I believe you're making it a bit more complicated than necessary. Try something like this:
url = "https://patch.com/illinois/alsip-crestwood/pet-adoption-alsip-crestwood-area-see-latest-dogs-cats-more"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
pets = ["dog", "cat"]
for pet in pets:
    print(pet, len(soup.find_all(text=lambda text: text and pet in text)))
Output:
dog 13
cat 76
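If you'd rather keep your count_words/main structure, the same idea can be folded back into the function so it accepts a list and returns one count per word. A sketch; the dict-returning signature is my own choice, not from the original code:
import requests
from bs4 import BeautifulSoup


def count_words(url, words):
    # map each word to the number of text nodes that contain it
    r = requests.get(url, allow_redirects=False)
    soup = BeautifulSoup(r.content, 'lxml')
    return {w: len(soup.find_all(text=lambda t, w=w: t and w in t)) for w in words}


def main():
    url = 'https://patch.com/illinois/alsip-crestwood/pet-adoption-alsip-crestwood-area-see-latest-dogs-cats-more'
    counts = count_words(url, ["dog", "cat", "adult"])
    print(url, counts)


if __name__ == '__main__':
    main()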
I made a web scraping program using Python and webdriver, and I want to extract the ASIN from 2 different pages. I would like one XPath to work for these 2 links at the same time.
These are the Amazon pages: https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes (id, classes). How can I make this program work for both links at the same time?
So the problem is in these lines of code (lines 36 and 41 of my script):
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
I have to change these lines so that the CSV contains the ASINs for both products. For the first link it prints the wrong information, and for the second it prints the ASIN.
I have attached the code below. I will appreciate any help.
from selenium import webdriver
import csv
import io

# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
driver = webdriver.Chrome(executable_path="C:\\Users\Andrei-PC\Downloads\webdriver\chromedriver.exe",
                          chrome_options=chrome_options)

header = ['Product title', 'ASIN']
with open('csv/bot_1.csv', "w") as output:
    writer = csv.writer(output)
    writer.writerow(header)

links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]

for i in range(len(links)):
    driver.get(links[i])
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except:
        print('no ASIN template one')
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except:
        print('no ASIN template two')
    try:
        data = [prod_title[0], asin]
    except:
        print('no items v3 ')
    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get the required element on both pages.
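For instance, the two template-specific lookups inside the loop could be replaced with something along these lines (a sketch, untested against the live pages; it assumes the li's text still carries the "ASIN:" label, so that label is stripped off):
asin = None
try:
    # one XPath that matches the ASIN bullet in both detail-bullets layouts
    raw = driver.find_element_by_xpath('//li[b="ASIN:"]').text
    asin = raw.replace('ASIN:', '').strip()
except Exception:
    print('no ASIN found')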
I have the following IPython Notebook, in which I am trying to access a database of movies from the Rotten Tomatoes website.
But Rotten Tomatoes limits me to 10,000 API requests a day,
so I don't want to re-run this function every time I restart the notebook; instead, I am trying to save and reload the data as a CSV file. When I convert the data to a CSV file, I just see the processing symbol [*] in the IPython Notebook, and after some time I get the following error:
ConnectionError: HTTPConnectionPool(host='api.rottentomatoes.com', port=80): Max retries exceeded with url: /api/public/v1.0/movie_alias.json?apikey=5xr26r2qtgf9h3kcq5kt6y4v&type=imdb&id=0113845 (Caused by <class 'socket.gaierror'>: [Errno 11002] getaddrinfo failed)
Is this problem due to a slow internet connection? Should I make some changes to my code? Kindly help me with this.
The code for the file is shown below:
%matplotlib inline
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

api_key = '5xr26r2qtgf9h3kcq5kt6y4v'
movie_id = '770672122'  # toy story 3
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id

#these are "get parameters"
options = {'review_type': 'top_critic', 'page_limit': 20, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data)  # load a json string into a collection of lists and dicts
print json.dumps(data['reviews'][0], indent=2)  # dump an object into a json string

from io import StringIO
movie_txt = requests.get('https://raw.github.com/cs109/cs109_data/master/movies.dat').text
movie_file = StringIO(movie_txt)  # treat a string like a file
movies = pd.read_csv(movie_file, delimiter='\t')
movies

#print the first row
movies[['id', 'title', 'imdbID', 'year']]

def base_url():
    return 'http://api.rottentomatoes.com/api/public/v1.0/'

def rt_id_by_imdb(imdb):
    """
    Queries the RT movie_alias API. Returns the RT id associated with an IMDB ID,
    or raises a KeyError if no match was found
    """
    url = base_url() + 'movie_alias.json'
    imdb = "%7.7i" % imdb
    params = dict(id=imdb, type='imdb', apikey=api_key)
    r = requests.get(url, params=params).text
    r = json.loads(r)
    return r['id']

def _imdb_review(imdb):
    """
    Query the RT reviews API, to return the first page of reviews
    for a movie specified by its IMDB ID
    Returns a list of dicts
    """
    rtid = rt_id_by_imdb(imdb)
    url = base_url() + 'movies/{0}/reviews.json'.format(rtid)
    params = dict(review_type='top_critic',
                  page_limit=20,
                  page=1,
                  country='us',
                  apikey=api_key)
    data = json.loads(requests.get(url, params=params).text)
    data = data['reviews']
    data = [dict(fresh=r['freshness'],
                 quote=r['quote'],
                 critic=r['critic'],
                 publication=r['publication'],
                 review_date=r['date'],
                 imdb=imdb, rtid=rtid
                 ) for r in data]
    return data

def fetch_reviews(movies, row):
    m = movies.irow(row)
    try:
        result = pd.DataFrame(_imdb_review(m['imdbID']))
        result['title'] = m['title']
    except KeyError:
        return None
    return result

def build_table(movies, rows):
    dfs = [fetch_reviews(movies, r) for r in range(rows)]
    dfs = [d for d in dfs if d is not None]
    return pd.concat(dfs, ignore_index=True)

critics = build_table(movies, 3000)
critics.to_csv('critics.csv', index=False)
critics = pd.read_csv('critics.csv')
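One way to make sure the API is only hit once per dataset, in line with the goal of saving and reloading the data, is to guard the expensive call on whether the cached CSV already exists. A minimal sketch of that idea, reusing the names and the critics.csv filename from the code above:
import os
import pandas as pd

if os.path.exists('critics.csv'):
    # reuse the previously fetched reviews instead of calling the API again
    critics = pd.read_csv('critics.csv')
else:
    # hit the API once, then cache the result for later sessions
    critics = build_table(movies, 3000)
    critics.to_csv('critics.csv', index=False)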