Unable to parse an element from eBay via BeautifulSoup - beautifulsoup

I am trying to scrape "number of items sold" on eBay but for some reason I cannot. I already have the title, price, and all I need is total_sold_price which I am unable to attain. Every time I run my code, I just get a blank for total_sold_price.
try:
title_selenium = driver.find_element_by_xpath('//*[#id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[#id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
My entire code: https://pastebin.com/bu8HgCDZ
Thank you so much.

Fixed it for you. You need to make the soup call inside your loop.
Note: I am using this path '../chromedriver', please change it to your path before running the code.
Code
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
driver = webdriver.Chrome('../chromedriver')
driver.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=watches&_sacat=0&_pgn=1')
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.maximize_window()
tempList = []
for link in soup.find_all('a', href=True):
if 'itm' in link['href']:
print(link['href'])
tempList.append(link['href'])
array_length = len(tempList)
for i in range(array_length):
driver.get(tempList[i])
timeout = 5
try:
element_present = EC.presence_of_element_located((By.XPATH, '//*[#id="itemTitle"]'))
WebDriverWait(driver, timeout).until(element_present)
except TimeoutException:
print("Timed out waiting for page to load")
try:
title_selenium = driver.find_element_by_xpath('//*[#id="itemTitle"]').text
except:
title_selenium = ""
try:
price_selenium = driver.find_element_by_xpath('//*[#id="prcIsum"]').text.strip().split()
except:
price_selenium = ""
#you need to call soup here due to your loop structure
soup = BeautifulSoup(driver.page_source, 'lxml')
try:
total_sold_price_BeautifulSoup = soup.find('span', {'class': 'vi-qtyS-hot-red'}).text
except:
total_sold_price_BeautifulSoup = ""
print("title: ", title_selenium)
print("price: ", price_selenium)
print("total_sold_price: ", total_sold_price_BeautifulSoup)
print("\n")
i+=1
driver.close()

Related

How to attach screenshot in pytest-html report with coftest.py?

I want to attach a screenshot to my HTML report but I haven't found any good resource on how to use the conftest.py file. I created the coftest.py file inside the pytest folder with the following code:
import pytest
#pytest.hookimpl(hookwrapper=True)
def pytest_runtest_makereport(item, call):
pytest_html = item.config.pluginmanager.getplugin("html")
outcome = yield
report = outcome.get_result()
extra = getattr(report, "extra", [])
image="D:/Selenium/Insights/2022-11-02_00-13-18/error_page.png"
if report.when == "call":
# always add url to report
extra.append(pytest_html.extras.url("http://www.example.com/"))
extra.append(pytest_html.extra.image(image))
xfail = hasattr(report, "wasxfail")
if (report.skipped and xfail) or (report.failed and not xfail):
# only add additional html on failure
# extra.append(pytest_html.extras.html("<div>Additional HTML</div>"))
extra.append(pytest_html.extra.image(image))
report.extra = extra
And my test.py file is:
import time
from os import getenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from dotenv import load_dotenv
from Login_actions import Login_activities
from Insights_actions import Insights_activities
from Locators import Locators
import pytest, os
from datetime import datetime
class Test_Insights():
#pytest.fixture
def test_setup(self):
#make new directory for downloads
new_dir = r"D:\Selenium\Insights\{timestamp}".format(timestamp=datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
# print(new_dir)
if not os.path.exists(new_dir):
os.makedirs(new_dir)
self.saved_dir=new_dir
prefs = {"download.default_directory": new_dir, "download.directory_upgrade": True, "download.prompt_for_download": False}
#intiating chrome browser instance
options=Options()
options.add_argument('--start-maximized')
# options.add_argument('--headless')
options.add_experimental_option("prefs", prefs)
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()),options=options)
#load credentials
load_dotenv()
self.username = getenv("TOP_USERNAME")
self.password = getenv("TOP_PWD")
#exiting ceremonies
yield
self.driver.close()
self.driver.quit()
print("Test executed")
def test_check_in(self, test_setup):
driver=self.driver
# login_url="https://tilt-sso.preprod.crto.in/" separate login page
# url="https://tilt-orange360.preprod.crto.in/insights/home"
url="https://tilt-sso.preprod.crto.in/auth?code=5515f8b0-4b64-4da4-b506-e6a6a3f81b23&scope=cn%20dn%20mail%20uid%20umsId&state=eyJyZWRpcmVjdF91cmkiOiJcL2hvbWUiLCJub25jZSI6IktaTFBxczU5T3lQUWJaRUp0OFhBQWZvZDNueDhPaENDbGlJWVRqZ08ifQ%3D%3D"
driver.get(url)
try:
welcome_text = driver.find_element(by=By.XPATH, value="//div[contains(text(),'Criteo')]")
assert welcome_text
login_actions = Login_activities(driver)
login_actions.enter_username(test_setup.username)
login_actions.enter_password(test_setup.password)
login_actions.login()
page_load_wait = WebDriverWait(driver, timeout=30).until(
EC.url_to_be("https://tilt-orange360.preprod.crto.in/insights/home"))
if (page_load_wait):
WebDriverWait(driver, timeout=20).until(
EC.visibility_of_element_located((By.XPATH, Locators.welcome_text)))
WebDriverWait(driver, timeout=20).until(EC.element_to_be_clickable((By.XPATH, Locators.run_insight)))
insights_actions = Insights_activities(driver)
insights_actions.insights_search("Check-In")
insights_actions.search_partners("BOOKINGIT")
insights_actions.smart_date_30days()
insights_actions.submit_insights()
WebDriverWait(driver, 20).until(EC.visibility_of_element_located((By.XPATH, Locators.success_mesg)))
# submit_verify = driver.find_element(by=By.XPATH, value=Locators.success_mesg)
# assert(submit_verify)
print("Submission successful")
insights_actions.download_file()
time.sleep(20)
print(self.saved_dir)
arr=[]
arr+=[file for file in os.listdir(self.saved_dir) if file.endswith('.pptx')]
print("File in the directory: " + arr[0])
while not arr:
time.sleep(5)
if arr:
print("Insights completed. File downloaded successfully")
else:
print("File not available")
raise NoSuchElementException
except:
if driver.find_element(by=By.XPATH,value=Locators.error_page):
driver.get_screenshot_as_file('{dir}/error_page.png'.format(dir=self.saved_dir))
print("500 Internal server error")
Error_page=driver.current_url
print("The error page: "+Error_page)
raise NoSuchElementException
I do not know why is it not working. The document: https://pytest-html.readthedocs.io/en/latest/user_guide.html#enhancing-reports does not have much information. I really need help here, please.

beautiful soup unable to scrape website contents

Hi I am trying to do a simple web scrape on this website https://www.sayurbox.com/p/Swallow%20Tepung%20Agar%20Agar%20Tinggi%20Serat%207%20gram
where my code is this:
def userAgent(URL):
ua = UserAgent()
USER_AGENT = ua.random
headers = {"User-Agent" : str(USER_AGENT),"Accept-Encoding": "*","Connection": "keep-alive"}
resp = requests.get(URL, headers=headers)
if resp.status_code == 200:
soup = BeautifulSoup(resp.content, "html.parser")
print(f'{URL}')
else:
print(f'error 200:{URL}')
urlError = pd.DataFrame({'url':[URL],
'date':[dateNow]
})
urlError.to_csv('errorUrl/errorUrl.csv', mode='a', index=False, header=False)
return soup
soup = userAgent(url)
productTitle = soup.find_all('div', {"class":"InfoProductDetail__shortDesc"})
However it is unable to do so, is there something wrong with my code? I tried adding time.sleep to wait for the page to load, however it still does not work. Help will be greatly appreciated
Your code is fine but the url is dynamic meaning data is generated by JavaScript and requests,BeautifulSoup can't mimic that's you need automation tool something like selenium.Now you can run the code.
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
url = 'https://www.sayurbox.com/p/Swallow%20Tepung%20Agar%20Agar%20Tinggi%20Serat%207%20gram'
driver = webdriver.Chrome(ChromeDriverManager().install())
driver.maximize_window()
driver.get(url)
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
driver.close()
title=soup.select_one('.InfoProductDetail__shortDesc').text
price= soup.select_one('span.InfoProductDetail__price').text
print(title)
print(price)
Output:
Swallow Tepung Agar Agar Tinggi Serat 7 gram
7.900

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

I am trying to get data using scrapy-selenium but there is some issue with the pagination. I have tried my level best to use different selectors and methods but nothing changes. It can only able to scrape the 1st page. I have also checked the other solutions but still, I am unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
class Basic2Spider(scrapy.Spider):
name = 'basic2'
def start_requests(self):
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1920, 1080)
driver.get("https://www.gumtree.com/property-for-sale/london")
time.sleep(2)
property_xpath = driver.find_elements(By.XPATH, "(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
href= detail.get_attribute('href')
time.sleep(2)
yield SeleniumRequest(
url = href,
)
driver.quit()
return super().start_requests()
def parse(self, response):
yield {
'Title': response.xpath("//div[#class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
'Price': response.xpath("//h3[#itemprop='price']/text()").get(),
'Add Posted': response.xpath("//*[#id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
'Links': response.url
}
next_page = response.xpath("//li[#class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
if next_page:
abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
yield SeleniumRequest(
url= abs_url,
wait_time=5,
callback=self.parse
)
Your code seem to be correct but getting tcp ip block. I also tried alternative way where code is correct and pagination is working and this type of pagination is two times faster than others but gives me sometimes strange result and sometimes getting ip block.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest
class Basic2Spider(scrapy.Spider):
name = 'basic2'
responses = []
def start_requests(self):
url='https://www.gumtree.com/property-for-sale/london/page{page}'
for page in range(1,6):
print(page)
yield SeleniumRequest(
url=url.format(page=page),
callback=self.parse,
wait_time=5
)
def parse(self, response):
driver = response.meta['driver']
intial_page = driver.page_source
self.responses.append(intial_page)
for resp in self.responses:
r = Selector(text=resp)
property_xpath = r.xpath("(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
yield {
'Title': detail.xpath('.//*[#class="listing-title"]/text()').get().strip(),
'Price': detail.xpath('.//*[#class="listing-price"]/strong/text()').get(),
'Add Posted': detail.xpath('.//*[#class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
'Links': response.url
}

Extract Title Tags BeautifulSoup

I need help because I wanted to write code for finding out the title tags on a website. Although I used the code from another question and applied it to this scenario, there are no title tags whenever I print 'Beschreibung'.
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib.parse
webseite = 'https://www.entega.de/sitemap/'
response = requests.get(webseite)
response.status_code
soup = BeautifulSoup (response.content, 'html.parser')
result_container = soup.find_all('div', {'class':'clearfix'})
url_part_1 = 'https://www.entega.de/sitemap/'
url_part_2 = []
for item in result_container:
for link in item.find_all ('a', {'class':'modSitemap__lvl1Link ui-link' }):
url_part_2.append (link.get ('href'))
url_joined = []
for i in url_part_2:
url_joined.append (urllib.parse.urljoin(url_part_1,i))
Überschrift = []
Beschreibung = []
Verlinkungen = []
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', title=True, class_='modSitemap__lvl1Link ui-link'))
You are getting nothing because these links don't have an <a class="modSitemap__lvl1Link ui-link"> tag. They do have classes that start with that string though. You could expand to that. Or you can simply just get the <a> tags that have a title attribute.
So change your loop to either:
import re
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', {'class':re.compile("^modSitemap__lvl1Link ui-link")}, title=True, ))
or
for link in url_joined:
response = requests.get (link)
soup = BeautifulSoup (response.content, 'html.parser')
Beschreibung.append(soup.find_all('a', title=True))

Scrape url link in table by BS4

I tried to scrape the hyperlinks in the tag (a herf) of the table. However, it doesn't work. Can you help to improve this code?
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
dfs = pd.DataFrame()
for i in range(1,11):
driver = webdriver.Chrome()
driver.get('https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo='+str(i)+'')
res = driver.execute_script('return document.documentElement.outerHTML')
time.sleep(3)
driver.quit()
soup = BeautifulSoup(res, 'lxml')
h_table = soup.find('table', {'class':'table_bd f_tac f_fs13'})
def tableDataText(h_table):
rows = []
trs = h_table.find_all('tr')
headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
if headerow: # if there is a header row include first
rows.append(headerow)
trs = trs[1:]
for tr in trs: # for every table row
rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
return rows
result_table = tableDataText(h_table)
df = pd.DataFrame(result_table[1:], columns=result_table[0])
dfs = pd.concat([dfs, df], ignore_index=True)
Your question and the expected result is not that clear and should be improved - If just wanna grab all the urls from the href you can go with:
from bs4 import BeautifulSoup
from selenium import webdriver
linkList = []
for i in range(1,11):
driver = webdriver.Chrome()
driver.get('https://racing.hkjc.com/racing/information/English/racing/RaceCard.aspx?RaceDate=2021/02/06&Racecourse=ST&RaceNo='+str(i)+'')
time.sleep(6)
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
for a in soup.select('table#racecardlist table a'):
linkList.append('https://racing.hkjc.com'+a['href'])
linkList