Python scraping script returns None - BeautifulSoup

I'm trying to scrape data from Amazon, in particular the product title, but running my script only returns None.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
The expected result is the element containing the product title, but instead the output is None.

I am unable to comment, but I wanted to leave a note on what @Fozoro said in case someone runs into the same issue I did. pip install lxml ran successfully, yet when I tried to use lxml as the parser for my application it still raised errors about not finding the requested feature. Running:
python3 -m pip install lxml
instead let me work with the lxml parser.
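As a quick sanity check (a minimal sketch, unrelated to Amazon), you can confirm that BeautifulSoup can actually see the lxml parser from the interpreter you are running:
# raises bs4.FeatureNotFound if lxml is not visible to this interpreter
from bs4 import BeautifulSoup
soup = BeautifulSoup("<html><body><p>ok</p></body></html>", "lxml")
print(soup.p.text)  # prints: ok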

Change your parser:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id="productTitle")
print(title.text)
You can also extract the title from the content attribute of one of the meta tags:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.select_one('[name=description]')['content']
print(title)
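Either way, Amazon sometimes serves a robot-check page instead of the product page, in which case the element is simply missing. A small guard (a sketch reusing the variables above) avoids the bare None:
title_tag = soup.find(id="productTitle")
if title_tag is None:
    # likely a CAPTCHA/robot-check response; inspect page.status_code and page.text
    print("productTitle not found")
else:
    print(title_tag.get_text(strip=True))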

You should start by installing lxml (if you don't have it already), you can do so using the following pip command:
pip install lxml
Once installed, replace this:
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
with:
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id="productTitle")
print(title.getText().strip())
Hope this helps!

Related

undetected_chromedriver working in full mode but failing in headless mode (Cloudflare)

I am trying to open a website using undetected_chromedriver in headless mode. The website uses Cloudflare. Without headless mode the script works, but when I set headless = True it shows a captcha.
import ssl
import time
ssl._create_default_https_context = ssl._create_unverified_context
import undetected_chromedriver as uc
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"')
driver = uc.Chrome(options=options)
# driver = webdriver.Chrome()
driver.get('website_url')
# import time
time.sleep(10)
driver.save_screenshot('sample.png')
Now if I set headless = True, it shows a captcha:
import ssl
import time
ssl._create_default_https_context = ssl._create_unverified_context
import undetected_chromedriver as uc
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"')
options.headless = True
driver = uc.Chrome(options=options)
driver.get('website_url')
# import time
time.sleep(10)
driver.save_screenshot('sample.png')
How can I make it undetectable by Cloudflare?
Add this Chrome option as well:
options.add_argument('--disable-blink-features=AutomationControlled')
This should prevent detection.
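For completeness, a minimal sketch of the question's headless setup with that flag added (the URL stays a placeholder, and whether Cloudflare still detects you can vary):
import undetected_chromedriver as uc
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"')
options.add_argument('--disable-blink-features=AutomationControlled')
options.headless = True
driver = uc.Chrome(options=options)
driver.get('website_url')  # placeholder from the question
driver.save_screenshot('sample.png')
driver.quit()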

Trying to scrape an email using BeautifulSoup

I am trying to scrape an email address, but it gives me None. This is the page link: https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
base_url = 'https://www.avocats-lille.com/'
url = 'https://www.avocats-lille.com/fr/annuaire/avocats-du-tableau-au-barreau-de-lille?view=entries'
driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")
tra = soup.find_all('h2', class_='title')
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        comp = base_url + link['href']
        productlinks.append(comp)
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    sleep(5)
    details = soup.find_all("div", class_="item col-5")
    for detail in details:
        email = soup.find('a[href^="mailto"]')
        print(email)
The links you are looking for are not inside the tra (title) elements. You should change the code as follows to make it work:
tra = soup.find_all('div',class_='item')
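With that change in place, the link-collection loop from the question then picks up the correct anchors (a consolidated sketch reusing the question's variables):
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        productlinks.append(base_url + link['href'])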
The email address is within the following element:
kamelabbas2002@yahoo.fr
Solution
Using Selenium, to print the email address, i.e. the innerText attribute, you can use either of the following locator strategies:
Using css_selector:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("css selector", 'a[href^="mailto"]').text)
Using xpath:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("xpath", '//a[starts-with(#href, "mailto")]').text)
Console Output:
kamelabbas2002@yahoo.fr
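If you need every address on the page rather than just the first, here is a short hedged sketch over the rendered source (assuming the driver has already loaded the entry page):
from bs4 import BeautifulSoup

soup = BeautifulSoup(driver.page_source, 'html.parser')
for a in soup.select('a[href^="mailto"]'):
    # strip the scheme to keep only the address itself
    print(a['href'].replace('mailto:', ''))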

BeautifulSoup Yahoo Finance Extract EPS

I am trying to extract EPS from Yahoo Finance but come out with a different figure than the one I want. Can someone point out where I went wrong?
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 OPR/79.0.4143.50'}
url = 'https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text,'html.parser')
print(soup.title.text)
Apple Inc. (AAPL) Stock Price, News, Quote & History - Yahoo Finance
price = soup.find('span', {'class': 'Trsdu(0.3s) '}).text
print(price)
146.92
I am expecting the EPS value, which is 5.11.
Hi, I managed to get the solution via a Selenium XPath. Thanks to Google search and everyone's help, I got the answer I want, which is 5.11.
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver',chrome_options=chrome_options)
wd.get("https://finance.yahoo.com/quote/AAPL?p=AAPL")
EPS = wd.find_element_by_xpath('//*[@id="quote-summary"]/div[2]/table/tbody/tr[4]/td[2]/span').text
print(EPS)
I personally prefer to use "Copy selector" from the browser dev tools.
You can use this code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 OPR/79.0.4143.50'}
url = 'https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text,'lxml')
result = soup.select('#quote-summary > div.D\(ib\).W\(1\/2\).Bxz\(bb\).Pstart\(12px\).Va\(t\).ie-7_D\(i\).ie-7_Pos\(a\).smartphone_D\(b\).smartphone_W\(100\%\).smartphone_Pstart\(0px\).smartphone_BdB.smartphone_Bdc\(\$seperatorColor\) > table > tbody > tr:nth-child(4) > td.Ta\(end\).Fw\(600\).Lh\(14px\) > span')[0].text
print(result)
Output:
5.11
Using class as a CSS selector isn't very robust as there are many span elements with the class Trsdu(0.3s).
To get EPS specifically, use this:
price = soup.find('span', {'data-reactid': '153'})
print(price)
Which prints:
<span class="Trsdu(0.3s)" data-reactid="153">5.11</span>
Then you can simply:
price.text
Which will give you your required result:
5.11
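Since data-reactid values also change whenever Yahoo rebuilds the page, a label-driven sketch (assuming the summary table still labels the row "EPS (TTM)") may age better:
label = soup.find('span', string='EPS (TTM)')
if label is not None:
    row = label.find_parent('tr')
    # the second cell of the row holds the value
    print(row.find_all('td')[1].text)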

Webscraping call in python returns empty values

I'm trying to get the last traded prices (LTP) of different commodities from the MCX website https://www.mcxindia.com/market-data/market-watch in Python 2. The following is the code I'm using.
import requests
from bs4 import BeautifulSoup
url = 'https://www.mcxindia.com/market-data/market-watch'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
soup.findAll('div',attrs={'class':'ltp green ltpcenter'})
But when I run the code, I get empty values. I suspect the website queries some other web server for the values, because when I look at the source of the web page, I do not see the last traded prices there. Can anyone please help me get the price data into Python?
The code below gets all the market data displayed on that page; extract whatever you want from the JSON response.
import requests
url = "https://www.mcxindia.com/backpage.aspx/GetMarketWatch"
headers = {
    "Host": "www.mcxindia.com",
    "Origin": "https://www.mcxindia.com",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    "Content-Type": "application/json",
    "Referer": "https://www.mcxindia.com/market-data/market-watch",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
resp = requests.post(url, headers=headers)
market_data = resp.json()
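From there you can pull out the fields you need. The key names below ("d", "Data", "Symbol", "LTP") are assumptions about the response shape, so inspect market_data once before relying on them:
# hedged sketch: print symbol and last traded price for every row
for row in market_data.get("d", {}).get("Data", []):
    print(row.get("Symbol"), row.get("LTP"))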
You have to process the JavaScript. You can use Selenium to load it; see the code below.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from bs4 import BeautifulSoup
driver = webdriver.Chrome()
driver.get("https://www.mcxindia.com/market-data/market-watch")
wait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@class="symbol chnge-perc right5"]')))
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
soup.findAll('div',attrs={'class':'ltp green ltpcenter'})
print soup
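To print just the prices instead of the whole soup, iterate over the matches (same selector as above):
for div in soup.findAll('div', attrs={'class': 'ltp green ltpcenter'}):
    print(div.text)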

Beautiful Soup: getting the text of an <a> element within a div

I'm using Beautiful Soup for the first time, and I'm trying to get the value of a specific element on the webpage.
For example, in this code snippet:
<div class="otg-vendor-name"><a class="otg-vendor-name-link" href="http://www.3brotherskitchen.com" target="_blank">3 Brothers Kitchen</a></div>
I wish to get "3 Brothers Kitchen" from the <a> tag within the <div>.
So far, I've tried something which doesn't seem to work:
import urllib2
from bs4 import BeautifulSoup

url = "http://someurl"

def get_all_vendors():
    try:
        web_page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(web_page)
        c = []
        c.append(soup.findAll("div", {"class": 'otg-vendor-name'}).contents)
        print c
    except urllib2.HTTPError:
        print("HTTPERROR!")
    except urllib2.URLError:
        print("URLERROR!")
    return c
You can get it by a CSS selector:
soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
Or, via find():
soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
Update (Using requests and providing User-Agent header):
from bs4 import BeautifulSoup
import requests
url = 'http://offthegridsf.com/vendors#food'
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    session.get(url)
    response = session.get(url)
    soup = BeautifulSoup(response.content)
    print soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
    print soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
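And if you need every vendor on the page rather than just the first, the same selector works in a loop (a short sketch):
for link in soup.select('div.otg-vendor-name > a.otg-vendor-name-link'):
    print(link.text)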