Web-scraping code using Selenium and BeautifulSoup not working properly

I wrote Python code to scrape the Sydney Morning Herald newspaper. The code first clicks all the "Show more" buttons and then scrapes all the articles. The Selenium part works correctly, but I think there is a problem in the scraping part: after scraping the desired fields (date, title, and content) for a few articles (5-6), it returns only the date and title, with no content.
import time
import csv
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

base = 'https://www.smh.com.au'
browser = webdriver.Safari(executable_path='/usr/bin/safaridriver')
wait = WebDriverWait(browser, 10)
browser.get('https://www.smh.com.au/search?text=cybersecurity')

# Click the "Show more" button until it is no longer clickable
while True:
    try:
        time.sleep(2)
        show_more = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_3we9i')))
        show_more.click()
    except Exception as e:
        print(e)
        break

# Collect all article links from the expanded results page
soup = BeautifulSoup(browser.page_source, 'lxml')
anchors = soup.find_all('a', {'tabindex': '-1'})

for anchor in anchors:
    browser.get(base + anchor['href'])
    sub_soup = BeautifulSoup(browser.page_source, 'html.parser')
    dateTag = sub_soup.find('time', {'class': '_2_zR-'})
    titleTag = sub_soup.find('h1', {'itemprop': 'headline'})
    contentTag = sub_soup.find_all('div', {'class': '_1665V undefined'})
    date = None
    title = None
    content = None
    if isinstance(dateTag, Tag):
        date = dateTag.get_text().strip()
    if isinstance(titleTag, Tag):
        title = titleTag.get_text().strip()
    if isinstance(contentTag, list):
        content = []
        for c in contentTag:
            content.append(c.get_text().strip())
        content = ' '.join(content)
    print(f'{date}\n {title}\n {content}\n')
    time.sleep(3)

browser.close()
Why does this code stop returning the content after a few articles? I don't understand it.
Thank you.

It's because the site shows "You've reached your monthly free access limit".
That is the message displayed on the webpage after a few pages have been viewed.
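A minimal sketch of one way to confirm and handle this, assuming the paywall banner contains that exact text (the string and the check are assumptions, not taken from the site's markup):
# Hypothetical guard: stop scraping once the paywall message shows up.
PAYWALL_TEXT = "You've reached your monthly free access limit"

for anchor in anchors:
    browser.get(base + anchor['href'])
    page = browser.page_source
    if PAYWALL_TEXT in page:
        print('Paywall reached; stopping early.')
        break
    sub_soup = BeautifulSoup(page, 'html.parser')
    # ... extract date, title, and content as before ...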

Related

Does Selenium only work on JavaScript sites? (BeautifulSoup works but Selenium does not)

I am trying to scrape data from Google Finance at the following link:
https://www.google.com/finance/quote/ACN:NYSE
The section I am trying to fetch is on the right side, containing information like market cap, P/E ratio, etc.
Earlier I thought it was rendered by JavaScript and wrote the following snippet:
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import presence_of_element_located

base_url = 'https://www.google.com/finance/quote/'  # assumed from the URL above; not shown in the original snippet
suffix = 'ACN:NYSE'
class_name = 'gyFHrc'

options = Options()
options.headless = True
service = Service('/usr/local/bin/geckodriver')
browser = Firefox(service=service, options=options)
browser.get(base_url + suffix)

wait = WebDriverWait(browser, 15)
wait.until(presence_of_element_located((By.CLASS_NAME, class_name)))  # <-- line 58
stuff = browser.find_elements(By.CLASS_NAME, class_name)
print(f'stuff-->{stuff}')
for elem in stuff:
    html = elem.get_attribute("outerHTML")
    # print(f'html:{html}')
I get the following error:
File "scraping_google_finance_js.py", line 58, in <module>
wait.until(presence_of_element_located((By.CLASS_NAME, class_name)))
File "/Users/me/opt/anaconda3/envs/scraping/lib/python3.10/site-packages/selenium/webdriver/support/wait.py", line 90, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Stacktrace:
WebDriverError#chrome://remote/content/shared/webdriver/Errors.jsm:183:5
NoSuchElementError#chrome://remote/content/shared/webdriver/Errors.jsm:395:5
element.find/</<#chrome://remote/content/marionette/element.js:300:16
Later, I realised that this was plain HTML and I could use BeautifulSoup as follows:
import requests
from bs4 import BeautifulSoup

class_name = 'gyFHrc'
# The page source was fetched without a browser (assumed; the original snippet did not show how html was obtained)
html = requests.get('https://www.google.com/finance/quote/ACN:NYSE').text
soup = BeautifulSoup(html, 'html.parser')
box_rows = soup.find_all("div", class_name)
print(box_rows)
for row in box_rows:
    print(type(row), str(row.contents[1].contents))
This worked with following output:
<class 'bs4.element.Tag'> ['$295.14']
<class 'bs4.element.Tag'> ['$289.67 - $298.00']
<class 'bs4.element.Tag'> ['$261.77 - $417.37']
.....
The question is, why did it not work with Selenium? Did I do something wrong, or does Selenium only work with JavaScript sites?
Clearly, time to load the page was not the problem, as BeautifulSoup could fetch and parse the page.
The selenium.common.exceptions.TimeoutException error means the element you are trying to find was not located within the given time.
Probably your connection is too slow to load the page in time. Increase the wait time to get the result.
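For example, a longer timeout (the 60-second value is just an illustration):
# Give the page more time to load before giving up
wait = WebDriverWait(browser, 60)
wait.until(presence_of_element_located((By.CLASS_NAME, class_name)))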
This error usually happens when Selenium can't find the desired tag or element, but in your case the element was there.
I checked the code with a few changes and it worked for me, so it's probably an issue with the element not loading in time.
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome(ChromeDriverManager().install())
class_name = "gyFHrc"
driver.get("https://www.google.com/finance/quote/ACN:NYSE")

wait = WebDriverWait(driver, 15)
wait.until(EC.presence_of_element_located((By.CLASS_NAME, class_name)))  # <-- line 58
stuff = driver.find_elements(By.CLASS_NAME, class_name)
print(f"stuff-->{stuff}")
for elem in stuff:
    html = elem.get_attribute("outerHTML")
    print(f"html:{html}")

How to find href links from multiple browser tabs

I wrote this code to fetch the href links of images on a webpage, but I am unable to get the links inside the loop. I want the href links of each newly opened tab to be printed in the shell, and there are multiple URLs.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import urllib.request
import os

# driver = webdriver.Chrome(executable_path=r"C:\Users\umesh.kumar\Downloads\Codes\chromedriver.exe")
username = "ABCD.kumar"
password = "X0XX"

driver = webdriver.Chrome()
driver.get("http://propadmin.99acres.com/propadmin/index.htm")
urls = ["https://propadmin.99acres.com/do/seller/ProcessSellerForms/getDeletedPhotos?prop_id=A18056415",
        "https://propadmin.xxxxxxx.com/do/seller/ProcessSellerForms/getDeletedPhotos?prop_id=A56063622"]

driver.maximize_window()
driver.find_element(By.ID, "username").send_keys(username)
driver.find_element(By.ID, "password").send_keys(password)
driver.find_element(By.NAME, "login").click()

# Open each URL; all but the last one get a fresh tab
for posts in range(len(urls)):
    print(posts)
    driver.get(urls[posts])
    if posts != len(urls) - 1:
        driver.execute_script("window.open('');")
        chwd = driver.window_handles
        driver.switch_to.window(chwd[-1])

# Collect anchors from the currently focused tab only
elems = driver.find_elements(By.TAG_NAME, 'a')
for elem in elems:
    href = elem.get_attribute('href')
    if href is not None:
        print(href)
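Since the goal is to print the href links from each newly opened tab, a minimal sketch of one way to do it is to iterate over the window handles after all the tabs are open (an assumption about the intended behaviour, not taken from an answer):
# Visit each open tab in turn and print every href found there
for handle in driver.window_handles:
    driver.switch_to.window(handle)
    for elem in driver.find_elements(By.TAG_NAME, 'a'):
        href = elem.get_attribute('href')
        if href is not None:
            print(href)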

How to scrape a specific itemprop from a web page with XPath and Selenium?

I'm trying to use Python (Selenium, BeautifulSoup, and XPath) to scrape a span with an itemprop equal to "description", but every time I run the code, the "try" fails and it prints out the "except" error.
I do see the element in the code when I inspect elements on the page.
Line that isn't getting the desired response:
quick_overview = soup.find_element_by_xpath("//span[contains(@itemprop, 'description')]")
Personally, I think you should just keep working with Selenium:
quick_overview = driver.find_element_by_xpath("//span[contains(@itemprop, 'description')]")
for the element, and add .text at the end to get the text content.
To actually use soup to parse this out, you would likely need a wait condition from Selenium first anyway, so there is no real point.
However, should you decide to integrate bs4, you need to change your function to work with the actual HTML from driver.page_source and parse that, then switch to select_one to grab your item. Then ensure you are returning from the function and assigning to a new soup object.
from bs4 import BeautifulSoup
from selenium import webdriver  # links w/ browser and carries out actions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

PATH = r"C:\Program Files (x86)\chromedriver_win32\chromedriver.exe"
baseurl = "http://www.waytekwire.com"
skus_to_find_test = ['WL16-8', 'WG18-12']

driver = webdriver.Chrome(PATH)
driver.get(baseurl)

def use_driver_current_html(driver):
    soup = BeautifulSoup(driver.page_source, 'lxml')
    return soup

for sku in skus_to_find_test:
    search_bar = driver.find_element_by_id('themeSearchText')
    search_bar.send_keys(sku)
    search_bar.send_keys(Keys.RETURN)
    try:
        # interpolate the sku into the XPath; a bare "sku" inside the string would be taken literally
        product_url = driver.find_elements_by_xpath(f"//div[contains(@class, 'itemDescription')]//h3//a[contains(text(), '{sku}')]")[0]
        product_url.click()
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//span[contains(@itemprop, 'description')]")))
        soup = use_driver_current_html(driver)
        try:
            quick_overview = soup.select_one("span[itemprop=description]").text
            print(quick_overview)
        except:
            print('No Quick Overview Found.')
    except:
        print('Product not found.')

I am trying to extract the names of the reviewers on Steam, but I am getting an empty list. Where am I wrong?

from bs4 import BeautifulSoup
from selenium import webdriver
import requests
import lxml
import openpyxl as op
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

html_text = 'https://store.steampowered.com/app/354400/Tadpole_Treble/'
# wb = op.load_workbook(
#     'https://1drv.ms/x/s!Aiw1ZpHhhvdugd0Z_eEBU_KzDlxiuA?e=cWevHn')

driver = webdriver.Chrome(
    executable_path='C:/Users/atif/Downloads/chromedriver.exe')
driver.get(html_text)
driver.implicitly_wait(10)

names = driver.find_elements_by_xpath('//div[@class="persona_name"]/a')
print(len(names))
for name in names:
    print(name.text)

# body = soup.body
# titles = headers.find_all('a', class_='title fw500 ellipsis')
# for h in headers:
#     # title = h.find('a', class_='title fw500 ellipsis').text
#     print(h.a['href'])
# a_links = body.find_all("a")

driver.close()
This is the output I am getting:
[20380:14344:0416/154733.584:ERROR:device_event_log_impl.cc(214)] [15:47:33.584] Bluetooth: bluetooth_adapter_winrt.cc:1072 Getting Default Adapter failed.
0   <= the result showing zero
It seems your code is correct, but the element is not loaded at that point; that's why you are getting an empty list. The reviews load once you scroll down to the end of the page. I have tried the code below and it works for me. Do let me know whether it works for you.
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)
driver.get('https://store.steampowered.com/app/354400/Tadpole_Treble')

# Scroll to the bottom of the page so the review section gets loaded
driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
sleep(5)

ReviewersNames = driver.find_elements_by_xpath("//div[@class='persona_name']/a")
for reviewer in ReviewersNames:
    print(reviewer.text)
Note - you can also use an explicit wait instead of the sleep above.
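A minimal sketch of that explicit-wait variant, assuming the same persona_name markup:
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Wait until at least one reviewer name is present instead of sleeping
wait.until(EC.presence_of_all_elements_located((By.XPATH, "//div[@class='persona_name']/a")))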
Please mark this as the answer if it resolves your problem.

Selenium: How to use a while loop to click a link if it exists?

I am trying to write a Python program that uses Selenium to click a button to go to the next page if the button is clickable. This is because I am web scraping from varying numbers of pages.
I have tried to use a while loop that checks the href attribute, but the code doesn't click the button, nor does it return an error. If I simply write button.click() without a while loop or a conditional check on the href attribute, the program clicks the button correctly.
My code also has a while loop condition of "variable is not None". Is this a valid usage of "is not"? My logic is for the program to click the button to go to the next page as long as there is an href available on the button to click.
Code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import numpy as np
import pandas as pd

PATH = r"C:\Program Files (x86)\chromedriver.exe"
wd = webdriver.Chrome(PATH)
wd.get("https://profiles.ucr.edu/app/home/search;name=;org=Physics%20and%20Astronomy;title=;phone=;affiliation=Faculty")
time.sleep(1)

button = wd.find_element_by_xpath("//a[@aria-label='Next page']")
# <a tabindex="0" aria-label="Next page" class="ng-star-inserted" style=""> Next <span class="show-for-sr">page</span></a>
href_data = button.get_attribute('href')
while href_data is not None:
    time.sleep(0.5)
    button.click()
    href_data = button.get_attribute('href')
Would anyone here be willing to assist me with this? I understand that Selenium requires the user to download a webdriver, so I apologize for any difficulties with testing my code.
Thank you, ExactPlace441
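On the side question: yes, "href_data is not None" is a valid use of "is not". It tests object identity against the None singleton, and get_attribute('href') returns None when the attribute is absent, so the loop condition itself is fine. A minimal illustration:
href_data = button.get_attribute('href')
if href_data is not None:  # identity test against the None singleton
    print('the button still carries an href')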
To loop until all pages have been clicked:
wd.get('https://profiles.ucr.edu/app/home/search;name=;org=Physics%20and%20Astronomy;title=;phone=;affiliation=Faculty')
wait = WebDriverWait(wd, 10)
while True:
    try:
        wait.until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']"))).click()
        time.sleep(5)
    except:
        break
Imports:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
I faced the same problem, and then I used geckodriver (Selenium Firefox) instead of Chrome. My code worked perfectly in Selenium Firefox, but the same code was not working in Selenium Chrome. Without the while loop I had no problem clicking the button in the Selenium Chrome browser, but it stopped working when the while loop was added. After switching to geckodriver (Selenium Firefox), my problem was solved. Here is an example of a while loop that you can use; it keeps clicking the button until the button disappears or the last page is reached.
i = 1
try:
    while i < 2:
        button_element = driver.find_element_by_xpath("give your button xpath")
        button_element.click()  # the loop keeps clicking until the button xpath disappears from the web page
except:
    pass  # when the button xpath disappears, the error is ignored and execution jumps to the next section of the code
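A slightly safer variant of the same pattern catches only the exception Selenium raises when the element is missing, so unrelated failures still surface:
from selenium.common.exceptions import NoSuchElementException

while True:
    try:
        driver.find_element_by_xpath("give your button xpath").click()
    except NoSuchElementException:
        break  # no next-page button left, so we are on the last page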
Here I modified your code:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import numpy as np
import pandas as pd

driver = webdriver.Firefox()
driver.maximize_window()
url = "https://profiles.ucr.edu/app/home/search;name=;org=Physics%20and%20Astronomy;title=;phone=;affiliation=Faculty"
driver.get(url)
timeout = 20

# This container collects data from the first page
containers = WebDriverWait(driver, timeout).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="column ng-star-inserted"]')))
for container in containers:
    name = container.find_element_by_css_selector('.header-details h5')  # scraping the name from each page
    print(name.text)

i = 1
try:
    while i < 2:  # it looks for the "next page" button on every page and keeps clicking it until the last page is reached
        next_page_button = driver.find_element_by_xpath("//li[@class='pagination-next ng-star-inserted']")
        next_page_button.click()
        # this second container collects data from the second page up to the last page
        containers = WebDriverWait(driver, timeout).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="column ng-star-inserted"]')))
        for container in containers:
            name = container.find_element_by_css_selector('.header-details h5')  # scraping the name from each page
            print(name.text)
        time.sleep(3)
except:
    pass  # if a page doesn't have a "next page" button, the code ends without an error