Extract part of string(/soup element) within a list of lists - pandas

I'm having some issues with scraping fish images off a website.
# Relative URLs of the species pages that have photos.
species_with_foto = [
    '/fangster/aborre-perca-fluviatilis/1',
    '/fangster/almindelig-tangnaal-syngnathus-typhle/155',
    '/fangster/ansjos-engraulis-encrasicholus/66',
    '/fangster/atlantisk-tun-blaafinnet-tun-thunnus-thynnus-/137',
]
titles = []
species = []
next_button_xpath = '/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div'
for x in species_with_foto:
    # Load the species page in the browser and parse whatever it rendered.
    driver.get('https://www.fiskefoto.dk' + x)
    soup = BeautifulSoup(driver.page_source)
    # find_all() returns Tag objects, so every entry appended to `species`
    # is a list of complete <img> tags (not just their src values).
    species.append(soup.find_all('img', attrs={'class': 'rapportBillede'}))
    titles.append(x)
    # Try to click the "next" control; its absence is reported as the end.
    try:
        driver.find_element(by=By.XPATH, value=next_button_xpath).click()
        print('CLicked next', x)
    except NoSuchElementException:
        print('Succesfully finished - :', x)
    time.sleep(2)
This returns a list of lists with the sublist looking like this:
[<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg" style="width:50%;"/>,
<img alt="Aborre (Perca fluviatilis) aborrefiskeri, striber, rygfinne, regnorm, majs, spinner, " class="rapportBillede" src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" style="width:calc(50% - 6px);margin-bottom:7px;"/>]
How can I clean up the list and keep only the src="/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg" part? I have tried other arguments to soup.find_all but can't get it to work.
(The selenium part is also not functioning properly, but I'll get to that after......)
EDIT:
This is my code now, I'm really getting close :) One issue is that my photos are now saved in a single flat list instead of a list of lists - I really don't understand why this happens.
Help to fix and understand would be greatly appreciated!
titles = []
fish_photos = []
for x in species_with_foto_mini:
    # Download and parse the species page.
    bs = BeautifulSoup(urlopen("https://www.fiskefoto.dk/" + x), 'html.parser')
    titles.append(x)
    try:
        for img in bs.find_all('img', attrs={'class': 'rapportBillede'}):
            if not img.has_attr('src'):
                continue
            # Each src string is appended directly to the one shared list,
            # so `fish_photos` ends up flat rather than grouped per species.
            fish_photos.append(img['src'])
    except KeyError:
        print('No src')
    # navigate pages
    try:
        driver.find_element(
            by=By.XPATH,
            value='/html/body/form/div[4]/div[1]/div/div[13]/div[2]/div/div',
        ).click()
        print('CLicked next', x)
    except NoSuchElementException:
        print('Succesfully finished -', x)
    time.sleep(2)
EDIT:
I need the end result to be a list of lists looking something like this:
fish_photos =
[['/images/400/aborre-perca-fluviatilis-medefiskeri-bundrig-0,220kg-24cm-striber-rygfinne-regnorm-majs-spinner-358-22-29-14-740-2013-21-4.jpg',
'/images/400/aborre-perca-fluviatilis-medefiskeri-prop-flaad-med-levende-skalle-paa-enkeltkrog-1.6kg-46cm-6604-1724617.jpg',['/images/400/tungehvarre-arnoglossus-laterna-medefiskeri-6650-2523403.jpg', '/images/400/ulk-myoxocephalus-scorpius-medefiskeri-bundrig-koebenhavner-koebenhavner-torsk-mole-sild-boersteorm-pigge-351-18-48-9-680-2013-6-4.jpg'],[ '/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-5.02kg-77cm-6436-7486.jpg','/images/400/graeskarpe-ctenopharyngodon-idella-medefiskeri-bobleflaad-med-toastbroed-paa-enkeltkrog-10.38kg-96cm-6337-4823146.jpg']
EDIT:
My output now is a list with identical lists. I need it to put every specie in its own list, like this: fish_photo_list = [[trout1, trout2, trout3], [other fish1, other fish 2, other], [salmon1, salmon2]]
My initial code did this, but the current version does not.

Here is an example, you can change:
from urllib.request import urlopen
from bs4 import BeautifulSoup
site = "[insert name of the site]"
# Use the response as a context manager so the connection is closed as soon
# as the document has been parsed.
with urlopen(site) as html:
    bs = BeautifulSoup(html, 'html.parser')
# Keep only the src attribute of every <img> that actually has one; the
# has_attr() guard means img['src'] can never raise KeyError here.
image_sources = [img['src'] for img in bs.find_all('img') if img.has_attr('src')]
for src in image_sources:
    print(src)

Related

Pandas: Creating multiple new columns from function with multiple output values

I'm trying to scrape a website for multiple values regarding a list of books. The links to the book pages are stored in a dataframe. Now I need a function that iterates over those links and adds the book values to new columns in the dataframe. I don't want to request the page again every time I'm scraping a new book value, so I want to do it all in one function.
The problem is the function then returns multiple values (e.g. book_title and book_rating) which I don't know how to best add to the dataframe.
I tried the following, which I know can't work but I'm stuck:
import requests as rq
from bs4 import BeautifulSoup as bs
import pandas as pd
#function to get the book page
def get_book_page(page):
    """Fetch *page* and return it parsed as a BeautifulSoup document.

    Prints the status code and raises ``Exception`` when the response status
    is anything other than 200.
    """
    # NOTE(review): `headers` is not defined in this function; it is assumed
    # to be a module-level dict of request headers - confirm.
    response = rq.get(page, headers = headers)
    status = response.status_code
    if status == 200:
        return bs(response.content, "html.parser")
    print('Status code:', status)
    raise Exception('Failed to fetch web page ' + page)
#function to scrape the book title
def scrape_book_title(book_content):
    """Return the stripped text of the title <h1> inside *book_content*.

    Returns the placeholder "fehlt" when *book_content* is None or when no
    matching <h1> element is found.
    """
    try:
        title_tag = book_content.find(
            "h1", class_="bc-heading bc-color-base bc-size-large bc-text-bold"
        ).text.strip()
    except AttributeError:
        # find() returned None (or book_content itself is None), so there is
        # no .text to read. Only this case is treated as "missing"; other
        # errors are no longer silently swallowed.
        title_tag = "fehlt"
    return title_tag
#function to scrape the book rating
def scrape_book_rating(book_content):
    """Return the stripped rating text found inside *book_content*.

    Looks up the ratings <li> and, inside it, the off-screen <span> holding
    the rating text. Returns the placeholder "fehlt" when *book_content* is
    None or when either element is missing.
    """
    try:
        # Both lookups are inside the try so that a missing container
        # (book_content is None) is handled exactly like a missing element,
        # matching scrape_book_title's behaviour.
        star_tag = book_content.find("li", class_="bc-list-item ratingsLabel")
        rating_tag = star_tag.find("span", class_="bc-text bc-pub-offscreen").text.strip()
    except AttributeError:
        rating_tag = "fehlt"
    return rating_tag
#function I'm trying to fix
def get_book_title(links):
    """Fetch the book page at *links* once and return ``(title, rating)``.

    The page is downloaded a single time; both values are scraped from the
    same details list element.
    """
    details = get_book_page(links).find(
        "ul", class_="bc-list bc-spacing-s2 bc-color-secondary bc-list-nostyle"
    )
    return scrape_book_title(details), scrape_book_rating(details)
# Here I would like to add the columns "A_Titel" and "A_Rating" with the
# values of "book_title" and "book_rating".
# NOTE: df.apply(..., axis=1) returns a single Series whose elements are the
# (title, rating) tuples returned by get_book_title, so the tuple unpacking
# on the left iterates over the ROWS of that Series rather than over the two
# values - this is the statement that does not work as intended.
# NOTE(review): passing result_type='expand' to apply and assigning to
# df[['A_Titel', 'A_Rating']] is presumably what is wanted - confirm.
df['A_Titel'], df['A_Rating'] = df.apply(lambda x: get_book_title(x.Link), axis=1)

Scraping Glassdoor returns duplicate entries

So I am trying to scrape job posts from Glassdoor using Requests, Beautiful Soup and Selenium. The entire code works except that, even after scraping data from 30 pages, most entries turn out to be duplicates (almost 80% of them!). It's not a headless scraper, so I know it is going to each new page. What could be the reason for so many duplicate entries? Could it be some sort of anti-scraping tool being used by Glassdoor, or is something off in my code?
The result turns out to be 870 entries of which a whopping 690 are duplicates!
My code:
def glassdoor_scraper(url):
    """Search Glassdoor for 'Data Analyst' jobs and dump the listings to CSV.

    Opens *url* in a Chrome WebDriver, fills in the search form, then walks
    through result pages, appending company, position, location, salary and
    rating text to the corresponding ``*_list`` lists. Finally builds the
    global ``glassdoor_dataset`` DataFrame and writes it to
    'glassdoor_jobs_scraped.csv'.

    NOTE(review): companies_list, positions_list, locations_list,
    salaries_list and ratings_list are not created here; they are assumed to
    be module-level lists - confirm.
    """
    driver = webdriver.Chrome()
    driver.get(url)
    time.sleep(10)
    # Getting to the page where we want to start scraping
    jobs_search_title = driver.find_element(By.ID, 'KeywordSearch')
    jobs_search_title.send_keys('Data Analyst')
    jobs_search_location = driver.find_element(By.ID, 'LocationSearch')
    time.sleep(1)
    jobs_search_location.clear()
    jobs_search_location.send_keys('United States')
    click_search = driver.find_element(By.ID, 'HeroSearchButton')
    click_search.click()
    # One iteration per result page (nine iterations in total).
    for page_num in range(1,10):
        time.sleep(10)
        # The page is downloaded again with requests from the browser's
        # current URL; the HTML parsed below is that response, not the
        # document rendered inside the Selenium browser.
        res = requests.get(driver.current_url)
        soup = BeautifulSoup(res.text,'html.parser')
        time.sleep(2)
        # Each field is selected page-wide and appended independently, so the
        # lists only stay aligned if every listing yields exactly one value
        # per field.
        companies = soup.select('.css-l2wjgv.e1n63ojh0.jobLink')
        for company in companies:
            companies_list.append(company.text)
        positions = soup.select('.jobLink.css-1rd3saf.eigr9kq2')
        for position in positions:
            positions_list.append(position.text)
        locations = soup.select('.css-l2fjlt.pr-xxsm.css-iii9i8.e1rrn5ka0')
        for location in locations:
            locations_list.append(location.text)
        job_post = soup.select('.eigr9kq3')
        for job in job_post:
            salary_info = job.select('.e1wijj242')
            if len(salary_info) > 0:
                # Appends one entry per salary element found in this job.
                for salary in salary_info:
                    salaries_list.append(salary.text)
            else:
                salaries_list.append('Salary Not Found')
        ratings = soup.select('.e1rrn5ka3')
        for index, rating in enumerate(ratings):
            if len(rating.text) > 0:
                ratings_list.append(rating.text)
            else:
                ratings_list.append('Rating Not Found')
        # Click the second element carrying this class name.
        # NOTE(review): presumably the "next page" button - confirm.
        next_page = driver.find_elements(By.CLASS_NAME, 'e13qs2073')[1]
        next_page.click()
        time.sleep(5)
        # Close the modal if one is present; any lookup failure is ignored.
        try:
            close_jobalert_popup = driver.find_element(By.CLASS_NAME, 'modal_closeIcon')
        except:
            pass
        else:
            time.sleep(1)
            close_jobalert_popup.click()
            continue
    #driver.close()
    print(f'{len(companies_list)} jobs found for you!')
    # The resulting frame is published as a module-level global.
    global glassdoor_dataset
    glassdoor_dataset = pd.DataFrame(
        {'Company Name': companies_list,
         'Company Rating': ratings_list,
         'Position Title': positions_list,
         'Location' : locations_list,
         'Est. Salary' : salaries_list
         })
    glassdoor_dataset.to_csv(r'glassdoor_jobs_scraped.csv')
You're going way too fast; you need to add some waits.
I see you are relying on fixed waits (time.sleep). Try using explicit waits instead.
Something like this:
(put your own conditions. you can try invisibility element too. like if something is invisible and then visible to ensure you are on next page now)
if not then increase your time.sleep()
# Block for up to 40 seconds until the element matched by the XPath is
# visible. XPath selects attributes with "@" (e.g. @id), not "#".
WebDriverWait(driver, 40).until(expected_conditions.visibility_of_element_located(
    (By.XPATH, '//*[@id="wrapper"]/section/div/div/div[2]/button[2]')))
I don't think the repetition is due to a code issue - I think glassdoor just starts cycling results after a while. [If interested, see this gist for some stats - basically, from the 7th page or so, most of the 1st page results seem to be shown on every page onwards. I did a small test manually - with only 5 listings, by id, and even directly on an un-automated browser, they started repeating after a while....]
My suggestion would be to just filter them before looping to the next page - there's a data-id attribute for each li wrapped around the listings which seems to be a unique identifier. If we add that to the other columns' lists, we can start collecting only un-collected listings; if you just edit the for page_num loop to:
def _text_or(node, selector, default=None):
    """Return the stripped text of the first *selector* match under *node*, else *default*."""
    match = node.select_one(selector)
    return match.get_text(strip=True) if match is not None else default


# Ids collected so far; a set gives O(1) membership tests and also lets us
# drop listings that are repeated within a single page.
seen_ids = set(datId_list)
for page_num in range(1, 10):
    time.sleep(10)
    scrapedUrls.append(driver.current_url)
    res = requests.get(driver.current_url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(driver.page_source, 'html.parser')  # no noticeable improvement
    time.sleep(2)

    # Keep only the listings whose data-id has not been collected yet.
    filteredListings = []
    for di in soup.select('li[data-id]'):
        data_id = di.get('data-id')
        if data_id not in seen_ids:
            seen_ids.add(data_id)
            filteredListings.append(di)

    # extend() mutates the existing lists in place, so this works whether the
    # lists are local or module-level (``+=`` would need a ``global``
    # declaration when run inside a function).
    datId_list.extend(di.get('data-id') for di in filteredListings)
    companies_list.extend(
        _text_or(t, '.css-l2wjgv.e1n63ojh0.jobLink') for t in filteredListings)
    positions_list.extend(
        _text_or(t, '.jobLink.css-1rd3saf.eigr9kq2') for t in filteredListings)
    locations_list.extend(
        _text_or(t, '.css-l2fjlt.pr-xxsm.css-iii9i8.e1rrn5ka0')
        for t in filteredListings)

    # Exactly one salary entry per listing: a placeholder, the single salary
    # text, or a list of texts when several salary elements are present.
    for t in filteredListings:
        salaries = [s.text for s in t.select('.eigr9kq3 .e1wijj242')]
        if not salaries:
            salaries_list.append('Salary Not Found')
        elif len(salaries) == 1:
            salaries_list.append(salaries[0])
        else:
            salaries_list.append(salaries)

    ratings_list.extend(
        _text_or(t, '.e1rrn5ka3', 'Rating Not Found') for t in filteredListings)
    # NOTE(review): the next-page click / popup handling from the original
    # loop is presumably meant to follow here unchanged - confirm.
and, if you added datId_list to the dataframe, it could serve as a meaningful index
dfDict = {
    'Data-Id': datId_list,
    'Company Name': companies_list,
    'Company Rating': ratings_list,
    'Position Title': positions_list,
    'Location': locations_list,
    'Est. Salary': salaries_list,
}
# Print each column's length first: pd.DataFrame needs all columns to have
# the same length, so a mismatch is easy to spot here.
for column, values in dfDict.items():
    print(column, len(values))
glassdoor_dataset = pd.DataFrame(dfDict)
# set_index returns a NEW frame (it is not in-place by default), so the
# result has to be assigned back for the index to take effect.
glassdoor_dataset = glassdoor_dataset.set_index('Data-Id', drop=True)
glassdoor_dataset.to_csv(r'glassdoor_jobs_scraped.csv')

List values inside for loop in python beautifulsoup

I am doing some scraping in beautifulsoup. While scraping values from next pages I am using for loop. Everything is fine but when I make a list of the scraped values, I got only values of last page. Below is my code.
from bs4 import BeautifulSoup as bs
import requests
# Result offsets for the first four pages (10 results per page).
params = [page_number * 10 for page_number in range(0, 4)]
print(params)
gymname_list = []
gymratings_list = []
gymnumreviews_list = []
gymcat_list = []
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9'}
listing_class = "mainAttributes__09f24__26-vh arrange-unit__09f24__3IxLD arrange-unit-fill__09f24__1v_h4 border-color--default__09f24__1eOdn"
for i in params:
    url = f'https://www.yelp.com.au/search?find_desc=gyms&find_loc=Berlin%2C%20Germany&start={i}'
    response = requests.get(url, headers=headers)
    page_soup = bs(response.content, 'lxml')
    # `mains` is overwritten on every page.
    mains = page_soup.find_all("div", {"class": listing_class})
# NOTE(review): the original indentation was lost; this loop is placed after
# the page loop because the question reports that only the last page's
# results are printed - confirm.
for main in mains:
    try:
        gymname = main.find("a", {"class": "css-166la90"}).text
        print(gymname)
    except:
        print(None)
    gymname_list.append(gymname)
In the code above, as you can see, I am trying to scrape the first four pages but when I print "gymname" all I got is the gym name on the last i.e. the fourth page results. I want all results into my list. gymname_list. Please help
The problem is the indentation around your last for loop; it should look like this:
# NOTE(review): to collect every page, this loop presumably has to run inside
# the `for i in params:` loop, once per fetched page - confirm.
for main in mains:
    link = main.find("a", {"class": "css-166la90"})
    # Record None when a listing has no name link, instead of re-appending
    # the previous listing's name (or failing on an undefined variable).
    gymname = link.text if link is not None else None
    print(gymname)
    gymname_list.append(gymname)

How to get value of a cell in html page when click to a link in list link?

I have a list of about 5000 links.
For example, 2 of the 5000 links:
https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019
...
I want to get the value in the Time of Day column of the Finish row for each link.
Ex:
09:51:07 AM - https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019
07:50:55 AM - https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019
I was able to scrape user info from another website because its elements have ids and classes (code below). But the table in https://racevietnam.com/runner/ngocsondknb/ecopark-marathon-2019 has no id or class, so I can't select it the same way.
#!/usr/bin/python
from urllib.request import urlopen
from bs4 import BeautifulSoup
list_user = []
# Open the output file once for the whole run instead of once per user.
with open("result.txt", "a") as myfile:
    for userID in range(1, 100000):
        link = "https://example.com/member.php?u=" + str(userID)
        # Close each HTTP response as soon as the page has been parsed.
        with urlopen(link) as html:
            bsObj = BeautifulSoup(html, "lxml")
        user_name = bsObj.find("div", {"id":"main_userinfo"}).h1.get_text()
        list_user.append(user_name)
        print("username", userID, "is: ", user_name)
        myfile.write(user_name)
Please help me.
Thank you.
Using bs4 4.7.1.
There is only one table and you want the second column (td) of the last row. You can use the :last-child pseudo-class to select the last row; it should be used in conjunction with the tbody type selector and the > child combinator so as not to get the header row. You can use :nth-of-type to specify which td cell to return.
Now you may wish to develop this in at least two ways:
Handle cases where not found e.g.
name = getattr(soup.select_one('title'), 'text', 'N/A')
timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
Add items to lists/data structure, which can be output as a dataframe at end and written out as csv. Or you may wish to stick with your current method
Python:
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://racevietnam.com/runner/buiducninh/ecopark-marathon-2019', 'https://racevietnam.com/runner/drtungnguyen83/ecopark-marathon-2019']
# A single Session reuses the connection across all requests.
with requests.Session() as s:
    for url in urls:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        # select_one returns None when nothing matches; getattr with a
        # default keeps one incomplete page from aborting the whole run.
        name = getattr(soup.select_one('title'), 'text', 'N/A')
        # Second cell of the last body row.
        timing = getattr(soup.select_one('tbody > tr:last-child td:nth-of-type(2)'), 'text', 'N/A')
        print(name, timing)
This is my code.
It's working Ok.
import requests
from bs4 import BeautifulSoup
# Read all URLs up front; the context manager closes the file even on error.
with open("input.ecopark", "r") as f:
    f_content = f.readlines()
for line in f_content:
    url = line.rstrip()
    if not url:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        continue
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    # Walk the table cells in document order; the cell immediately after the
    # one whose text is "Finish" holds the value to report.
    cells = iter(soup.select("table tbody tr td"))
    for cell in cells:
        if cell.get_text() == "Finish":
            following = next(cells, None)
            if following is not None:
                print(url + " " + following.get_text())
            break

How to get ASINs XPATH from 2 different Amazon pages that have the same parent nodes?

I made a web scraping program using python and webdriver and I want to extract the ASIN from 2 different pages. I would like the XPath to work for both of these links at the same time.
These are the amazon pages:https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds and
https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1. They have the same parent nodes(id, classes). How can I make this program work for these 2 links at the same time?
So the problem is on these lines of code: 36, 41
asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
and
asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text. I have to change these lines so that the CSV output contains the ASINs for both products. For the first link it prints the wrong information and for the second it prints the ASIN.
I attached the code. I will appreciate any help.
from selenium import webdriver
import csv
import io
from selenium.common.exceptions import NoSuchElementException

# set the proxies to hide actual IP
proxies = {
    'http': 'http://5.189.133.231:80',
    'https': 'https://27.111.43.178:8080'
}
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server="%s"' % ';'.join(['%s=%s' % (k, v) for k, v in proxies.items()]))
# Raw string: same path value, without relying on invalid escape sequences.
driver = webdriver.Chrome(executable_path=r"C:\Users\Andrei-PC\Downloads\webdriver\chromedriver.exe",
                          chrome_options=chrome_options)
header = ['Product title', 'ASIN']
# Write the header with the same newline/encoding settings as the rows below.
with open('csv/bot_1.csv', "w", newline="", encoding="utf-8") as output:
    writer = csv.writer(output)
    writer.writerow(header)
links = ['https://www.amazon.com/Hydro-Flask-Wide-Mouth-Flip/dp/B01ACATW7E/ref=sr_1_3?s=kitchen&ie=UTF8&qid=1520348607&sr=1-3&keywords=-gfds',
         'https://www.amazon.com/Ubbi-Saving-Special-Required-Locking/dp/B00821FLSU/ref=sr_1_1?s=baby-products&ie=UTF8&qid=1520265799&sr=1-1&keywords=-hgfd&th=1'
         ]
for link in links:
    driver.get(link)
    product_title = driver.find_elements_by_xpath('//*[@id="productTitle"][1]')
    prod_title = [x.text for x in product_title]
    # Reset per product so a failed lookup can never reuse the previous
    # product's ASIN.
    asin = None
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[4]').text
    except NoSuchElementException:
        print('no ASIN template one')
    # The second template is tried as well and, when it matches, its value
    # replaces the first one.
    try:
        asin = driver.find_element_by_xpath('//div[@id="detail-bullets_feature_div"]/div[@id="detail-bullets"]/table/tbody/tr/td/div/ul/li[5]').text
    except NoSuchElementException:
        print('no ASIN template two')
    if not prod_title or asin is None:
        # Nothing usable for this product: report it and do not write a row
        # (previously a stale or undefined `data` would be written).
        print('no items v3 ')
        continue
    data = [prod_title[0], asin]
    with io.open('csv/bot_1.csv', "a", newline="", encoding="utf-8") as output:
        writer = csv.writer(output)
        writer.writerow(data)
You can simply use
//li[b="ASIN:"]
to get required element on both pages