I am trying to extract the EPS figure from Yahoo Finance, but I come out with a different figure than the one I want. Can someone point out where I went wrong?
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 OPR/79.0.4143.50'}
url = 'https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text,'html.parser')
print(soup.title.text)
Output:
Apple Inc. (AAPL) Stock Price, News, Quote & History - Yahoo Finance
price = soup.find('span', {'class': 'Trsdu(0.3s) '}).text
print(price)
Output:
146.92
I am expecting the answer to be 5.11, which is the EPS.
Hi, I managed to get the solution via a Selenium XPath, thanks to a Google search and everyone's help. I got the answer I wanted, which is 5.11.
!pip install selenium
!apt-get update # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
wd.get("https://finance.yahoo.com/quote/AAPL?p=AAPL")
EPS = wd.find_element_by_xpath('//*[@id="quote-summary"]/div[2]/table/tbody/tr[4]/td[2]/span').text
print(EPS)
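As a side note, newer Selenium releases (4.x) have removed the find_element_by_* helpers; a minimal equivalent on the newer API (assuming the same XPath still matches the page) is:
from selenium.webdriver.common.by import By

# Same lookup, expressed with the Selenium 4 locator API
EPS = wd.find_element(By.XPATH, '//*[@id="quote-summary"]/div[2]/table/tbody/tr[4]/td[2]/span').text
print(EPS)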
I personally prefer to use "Copy selector" in the browser DevTools.
You can use this code:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36 OPR/79.0.4143.50'}
url = 'https://finance.yahoo.com/quote/AAPL?p=AAPL&.tsrc=fin-srch'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text,'lxml')
result = soup.select('#quote-summary > div.D\(ib\).W\(1\/2\).Bxz\(bb\).Pstart\(12px\).Va\(t\).ie-7_D\(i\).ie-7_Pos\(a\).smartphone_D\(b\).smartphone_W\(100\%\).smartphone_Pstart\(0px\).smartphone_BdB.smartphone_Bdc\(\$seperatorColor\) > table > tbody > tr:nth-child(4) > td.Ta\(end\).Fw\(600\).Lh\(14px\) > span')[0].text
print(result)
Output:
5.11
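For what it's worth, a much shorter selector targeted the same cell at the time; it relies on Yahoo's data-test attribute, which is an assumption worth re-checking since the markup changes:
# Hypothetical shortcut: assumes the EPS cell carries data-test="EPS_RATIO-value"
result = soup.select_one('td[data-test="EPS_RATIO-value"]')
print(result.text if result is not None else 'EPS cell not found')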
Using class as a CSS selector isn't very robust as there are many span elements with the class Trsdu(0.3s).
To get EPS specifically, use this:
price = soup.find('span', {'data-reactid': '153'})
print(price)
Which prints:
<span class="Trsdu(0.3s)" data-reactid="153">5.11</span>
Then you can simply:
price.text
Which will give you your required result:
5.11
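Since data-reactid values can change between page builds, another option is to locate the value by its row label; a sketch, assuming the quote-summary table keeps each label span just before its value span:
import re

# Find the label span whose text starts with "EPS", then take the next span (the value)
label = soup.find('span', string=re.compile(r'^EPS'))
eps = label.find_next('span').text
print(eps)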
I am trying to open a website using undetected_chromedriver in headless mode. The website is behind Cloudflare. Without headless mode the script works, but when I use headless = True it shows a captcha.
import ssl
import time
ssl._create_default_https_context = ssl._create_unverified_context
import undetected_chromedriver as uc
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"')
driver = uc.Chrome(options=options)
# driver = webdriver.Chrome()
driver.get('website_url')
# import time
time.sleep(10)
driver.save_screenshot('sample.png')
Now if I set headless = True, it shows a captcha:
import ssl
import time
ssl._create_default_https_context = ssl._create_unverified_context
import undetected_chromedriver as uc
from selenium import webdriver
import os
options = webdriver.ChromeOptions()
options.add_argument('--user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"')
options.headless = True
driver = uc.Chrome(options=options)
driver.get('website_url')
# import time
time.sleep(10)
driver.save_screenshot('sample.png')
How can I make it undetectable by Cloudflare?
Add this Chrome option as well.
options.add_argument('--disable-blink-features=AutomationControlled')
This should prevent detection.
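For reference, a minimal sketch of where the flag fits into the question's headless setup (website_url is the question's placeholder):
import undetected_chromedriver as uc
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--disable-blink-features=AutomationControlled')  # hide the automation hint
options.headless = True
driver = uc.Chrome(options=options)
driver.get('website_url')  # placeholder URL from the question
driver.save_screenshot('sample.png')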
I am trying to scrape an email address, but it gives me None. This is the page link: https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
base_url='https://www.avocats-lille.com/'
url = 'https://www.avocats-lille.com/fr/annuaire/avocats-du-tableau-au-barreau-de-lille?view=entries'
driver = webdriver.Chrome("C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")
tra = soup.find_all('h2',class_='title')
productlinks=[]
for links in tra:
    for link in links.find_all('a', href=True):
        comp = base_url + link['href']
        productlinks.append(comp)

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    sleep(5)
    details = soup.find_all("div", class_="item col-5")
    for detail in details:
        email = soup.find('a[href^="mailto"]')
        print(email)
The links you are looking for are not inside the tra (title) elements. You should change the code as follows to make it work:
tra = soup.find_all('div',class_='item')
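A minimal sketch of the fix in context; note also that soup.find() does not accept CSS selectors, so select_one() is used for the mailto link:
tra = soup.find_all('div', class_='item')
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        productlinks.append(base_url + link['href'])

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    email = soup.select_one('a[href^="mailto"]')  # None when the page lists no email
    if email is not None:
        print(email.text)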
The email address shown on the page is: kamelabbas2002@yahoo.fr
Solution
Using Selenium, to print the email address (i.e. the innerText) you can use either of the following locator strategies:
Using css_selector:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("css selector", 'a[href^="mailto"]').text)
Using xpath:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("xpath", '//a[starts-with(#href, "mailto")]').text)
Console Output:
kamelabbas2002@yahoo.fr
I have code that opens a website and prints all the titles, dates and times for each session.
However, if you click on a session on the website, a list of sub-sessions drops down.
I want to print each sub-session's title.
Here is the code I have
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
driver = webdriver.Chrome()
session=[]
driver.get('https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source,'html.parser')
productlist=soup.find_all('div',class_='accordin_title')
for item in productlist:
    title = item.find('h4').text.strip()
    tim = item.find('span', class_='info_red').text.strip()
    dat = item.find('span', class_='info_blue').text.strip()
    dictionary = {"Title": title, "Time": tim, "Date": dat}
    session.append(dictionary)
print(session)
Try the following to get the required content.
To get each session title along with its sub-session titles:
import requests
from bs4 import BeautifulSoup
url = 'https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for items in soup.select("#accordin .accordin_details"):
        session_title = items.find_previous(class_="accordin_title").h4.get_text(strip=True)
        subsession_titles = [item.get_text(strip=True) for item in items.select(".accordin_list h4")]
        print(session_title, subsession_titles)
To get only the sub-session titles:
for item in soup.select("#accordin .accordin_details .accordin_list h4"):
    print(item.get_text(strip=True))
I'm trying to scrape data from Amazon, in particular the product title, but running my script only returns None.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
The expected result should be the element containing the product title, but instead the output is None.
I am unable to comment, but I wanted to leave a note on what @Fozoro said in case someone runs into the same issue I did. Running pip install lxml completed successfully, but when I tried to use lxml as the parser for my application, it still gave me errors about not finding the requested feature. However, running python3 -m pip install lxml allowed me to work with the lxml parser.
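As a quick sanity check (a sketch; run it with the same interpreter as your script), you can confirm the parser is importable before pointing BeautifulSoup at it:
# An ImportError here means this interpreter can't see lxml
from lxml import etree

print(etree.LXML_VERSION)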
Change your parser:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id="productTitle")
print(title.text)
You can also extract the title from the content attribute of one of the meta tags:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.select_one('[name=description]')['content']
print(title)
You should start by installing lxml (if you don't have it already); you can do so using the following pip command:
pip install lxml
Once installed, replace this:
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
with:
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id = "productTitle")
print(title.getText().strip())
Hope this helps!
I'm using Beautiful Soup for the first time, and I'm trying to get values of specific element in the webpage.
For example, in this code snippet:
<div class="otg-vendor-name"><a class="otg-vendor-name-link" href="http://www.3brotherskitchen.com" target="_blank">3 Brothers Kitchen</a></div>
I wish to get "3 Brothers Kitchen" from the a tag within the div.
So far, I've tried something which doesn't seem to work:
import urllib2
from bs4 import BeautifulSoup
url = "http://someurl"
def get_all_vendors():
    try:
        web_page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(web_page)
        c = []
        c.append(soup.findAll("div", {"class": 'otg-vendor-name'}).contents)
        print c
    except urllib2.HTTPError:
        print("HTTPERROR!")
    except urllib2.URLError:
        print("URLERROR!")
    return c
You can get it by a CSS selector:
soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
Or, via find():
soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
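If the element simply isn't in the downloaded HTML (for instance, when the page fills it in with JavaScript), find() returns None and chaining .text raises an AttributeError; a small defensive sketch:
# Guard against a missing element before taking .text
div = soup.find('div', class_='otg-vendor-name')
if div is not None:
    print(div.find('a', class_='otg-vendor-name-link').text)
else:
    print('vendor block not found in the fetched HTML')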
Update (Using requests and providing User-Agent header):
from bs4 import BeautifulSoup
import requests
url = 'http://offthegridsf.com/vendors#food'
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    session.get(url)
    response = session.get(url)
    soup = BeautifulSoup(response.content)
    print soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
    print soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text