Beautiful Soup: getting the text of an <a> element within a <div>

I'm using Beautiful Soup for the first time, and I'm trying to get the value of a specific element on a webpage.
For example, in this code snippet:
<div class="otg-vendor-name"><a class="otg-vendor-name-link" href="http://www.3brotherskitchen.com" target="_blank">3 Brothers Kitchen</a></div>
I wish to get "3 Brothers Kitchen" from the <a> tag within the <div>.
So far, I've tried something which doesn't seem to work:
import urllib2
from bs4 import BeautifulSoup

url = "http://someurl"

def get_all_vendors():
    try:
        web_page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(web_page)
        c = []
        c.append(soup.findAll("div", {"class": 'otg-vendor-name'}).contents)
        print c
    except urllib2.HTTPError:
        print("HTTPERROR!")
    except urllib2.URLError:
        print("URLERROR!")
    return c
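This fails because findAll() returns a ResultSet (a list of matching tags), and a ResultSet has no .contents attribute. You have to iterate over the matches and read each one's text, roughly like this (a minimal sketch based on the code above):

for div in soup.findAll("div", {"class": 'otg-vendor-name'}):
    print div.a.text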

You can get it by a CSS selector:
soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
Or, via find():
soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
Update (using requests and providing a User-Agent header):
from bs4 import BeautifulSoup
import requests

url = 'http://offthegridsf.com/vendors#food'
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    session.get(url)

    response = session.get(url)
    soup = BeautifulSoup(response.content)
    print soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
    print soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
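If you need every vendor rather than just the first, a small follow-up sketch (assuming the page lists each vendor in its own div.otg-vendor-name):

for link in soup.select('div.otg-vendor-name > a.otg-vendor-name-link'):
    print link.get_text(strip=True)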

Related

Trying to scrape an email address using BeautifulSoup

I am trying to scrape an email address, but it gives me None. This is the page link: https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
base_url = 'https://www.avocats-lille.com/'
url = 'https://www.avocats-lille.com/fr/annuaire/avocats-du-tableau-au-barreau-de-lille?view=entries'

driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")

tra = soup.find_all('h2', class_='title')
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        comp = base_url + link['href']
        productlinks.append(comp)

for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    sleep(5)
    details = soup.find_all("div", class_="item col-5")
    for detail in details:
        email = soup.find('a[href^="mailto"]')
        print(email)
The links you are looking for are not inside the tra (title) elements.
Change the code as follows to make it work:
tra = soup.find_all('div',class_='item')
The email address sits inside a mailto link element: kamelabbas2002@yahoo.fr
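Note also that soup.find('a[href^="mailto"]') in the question always returns None: find() expects a tag name, not a CSS selector. Use select_one() for selectors; a minimal sketch of the corrected inner loop:

email = soup.select_one('a[href^="mailto"]')
if email is not None:
    print(email['href'], email.get_text(strip=True))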
Solution
Using Selenium to print the email address, i.e. the innerText, you can use either of the following locator strategies:
Using css_selector:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("css selector", 'a[href^="mailto"]').text)
Using xpath:
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
print(driver.find_element("xpath", '//a[starts-with(#href, "mailto")]').text)
Console Output:
kamelabbas2002@yahoo.fr
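If the element may render late, it is safer to wait for it explicitly before reading its text; a short sketch using Selenium's standard explicit-wait API:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

element = WebDriverWait(driver, 10).until(
    EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[href^="mailto"]')))
print(element.text)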

Fetch HTML and image from URL in Python: HTTP Error 403 or Cloudflare-protected page with captcha

I want to get the HTML code from a URL with a Python script. When I use the urllib library, it works for many sites, but with my specific URL I get 'HTTP Error 403: Forbidden'. Here is an example I'm having problems with:
from urllib.request import Request, urlopen, urlretrieve

url = 'https://bama.ir/car/detail-szx9y9u-hyundai-sonata-hybrid-gls-plus-2017'
header = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.11',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
          'Accept-Encoding': 'none',
          'Accept-Language': 'en-US,en;q=0.8',
          'Connection': 'keep-alive'}
req = Request(url, headers=header)
web_byte = urlopen(req).read()
webpage = web_byte.decode('utf-8')
print(webpage)
Downloading an image directly with a Python script from this site gives the same error. An example of that:
link="https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/6d364e82-f39d-419a-b346-257014928907/CarImage9557638_2_thumb_900_600.jpeg"
name=link.rsplit("/",1)[1]
urlretrieve(link,name)
When I use the BeautifulSoup, cloudscraper, or urllib3 libraries, I receive the Cloudflare-protected page with a captcha instead. Here is an example of that:
url="https://bama.ir/car/detail-grxi644n-hyundai-genesis-coupe-2011"
from bs4 import BeautifulSoup as bs
import requests
soup = bs(requests.get(url).content, "html.parser")
print(soup.text)
import cloudscraper
scraper = cloudscraper.create_scraper()
print(scraper.get(url).text)
import urllib3
http = urllib3.PoolManager()
r = http.request("GET",url)
print(r.data)
When I use the selenium library, it sometimes works, but sometimes the Cloudflare-protected page with the captcha appears. To download the image I just have to use the screenshot function. Here is an example of that:
from selenium import webdriver
url="https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/a36114cd-1978-41a4-a558-cbe5f652faf1/CarImage9473771_0_1_thumb_900_600.jpg"
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
html = driver.page_source
driver.save_screenshot("screenshot.png")
driver.close()
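As a side note, when the image element itself does render, you can screenshot just that element rather than the whole page; a sketch using Selenium's element-level screenshot method (assuming a reasonably recent Selenium; locating the image by tag name is an assumption about the page):

from selenium.webdriver.common.by import By

img = driver.find_element(By.TAG_NAME, 'img')
img.screenshot('image.png')  # saves only the element's bounding box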

BeautifulSoup/Scraping - Python

I have a code that directs me to a website and prints all the titles, dates and times for each session.
However, if you click on each session on the website, there are a list of sub-sessions that drop down.
I want to print each title of the sub-session.
Here is the code I have
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests

driver = webdriver.Chrome()
session = []
driver.get('https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
time.sleep(3)
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

productlist = soup.find_all('div', class_='accordin_title')
for item in productlist:
    title = item.find('h4').text.strip()
    tim = item.find('span', class_='info_red').text.strip()
    dat = item.find('span', class_='info_blue').text.strip()
    dictionary = {"Title": title, "Time": tim, "Date": dat}
    session.append(dictionary)
print(session)
Try the following to get the required content.
To get each session title along with its sub-session titles:
import requests
from bs4 import BeautifulSoup

url = 'https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1'

with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")
    for items in soup.select("#accordin .accordin_details"):
        session_title = items.find_previous(class_="accordin_title").h4.get_text(strip=True)
        subsession_titles = [item.get_text(strip=True) for item in items.select(".accordin_list h4")]
        print(session_title, subsession_titles)
To get only the sub-session titles:
for item in soup.select("#accordin .accordin_details .accordin_list h4"):
    print(item.get_text(strip=True))
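If you want the sub-session titles folded into the list of dictionaries the question builds, a sketch reusing the info_red/info_blue classes from the question's code (assuming they sit inside each accordin_title element):

session = []
for items in soup.select("#accordin .accordin_details"):
    head = items.find_previous(class_="accordin_title")
    session.append({
        "Title": head.h4.get_text(strip=True),
        "Time": head.find(class_="info_red").get_text(strip=True),
        "Date": head.find(class_="info_blue").get_text(strip=True),
        "Subsessions": [h.get_text(strip=True) for h in items.select(".accordin_list h4")],
    })
print(session)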

Scraping Python script returns None

I'm trying to scrape data from Amazon, in particular the product title, but running my script only returns None.
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
The expected result should be the div containing the product title, but instead None is the output.
I am unable to comment, but I wanted to leave a note on what @Fozoro said in case someone in the future runs into the same issue I did. Running pip install lxml completed successfully, but when I attempted to use it as the parser for my application it still gave me errors about not finding the requested feature. However, running:
python3 -m pip install lxml
allowed me to work with the lxml parser.
Change your parser:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id="productTitle")
print(title.text)
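Be aware that Amazon intermittently serves a bot-check page, in which case soup.find(id="productTitle") returns None and title.text raises AttributeError; a defensive sketch:

title = soup.find(id="productTitle")
if title is not None:
    print(title.get_text(strip=True))
else:
    print('productTitle not found; possibly a captcha/bot-check page')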
You can also extract the title from the content attribute of one of the meta tags:
import requests
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Dell-Inspiron-5570-Touchscreen-Laptop/dp/B07FKRFTYW/ref=sxbs_sxwds-deals?keywords=laptops&pd_rd_i=B07FKRFTYW&pd_rd_r=38a464f1-5fc2-4e1e-91a3-c209f68e2b8c&pd_rd_w=IbLEX&pd_rd_wg=l5Ewu&pf_rd_p=8ea1b18a-72f9-4e02-9dad-007df8eca556&pf_rd_r=SWJJFWF3WM0ZQZGMN8XA&qid=1562328911&s=computers-intl-ship&smid=A19N59FKNWHX7C'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,like Gecko) Chrome/75.0.3770.100 Safari/537.36' }
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.select_one('[name=description]')['content']
print(title)
You should start by installing lxml (if you don't have it already); you can do so using the following pip command:
pip install lxml
Once installed, replace this:
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
with:
soup = BeautifulSoup(page.content, 'lxml')
title = soup.find(id="productTitle")
print(title.getText().strip())
Hope this helps.

Web scraping call in Python returns empty values

I'm trying to get the last traded prices (LTP) for different commodities from the MCX website https://www.mcxindia.com/market-data/market-watch in Python 2. The following is the code I'm using.
import requests
from bs4 import BeautifulSoup
url = 'https://www.mcxindia.com/market-data/market-watch'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
soup.findAll('div',attrs={'class':'ltp green ltpcenter'})
But when I run the code, I get empty values. I suspect the website queries some other web server for the values, because when I look at the page source I do not see the last traded prices there. Can anyone please help me get the price data into Python?
The code below gets all the market data displayed on that page; extract whatever you want from the JSON response.
import requests

url = "https://www.mcxindia.com/backpage.aspx/GetMarketWatch"
headers = {
    "Host": "www.mcxindia.com",
    "Origin": "https://www.mcxindia.com",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    "Content-Type": "application/json",
    "Referer": "https://www.mcxindia.com/market-data/market-watch",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
resp = requests.post(url, headers=headers)
market_data = resp.json()
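ASP.NET web methods such as this one typically wrap the payload in a "d" key; the record field names below ("Symbol", "LTP") are assumptions for illustration, so inspect the actual response first:

for row in market_data.get("d", []):
    print row.get("Symbol"), row.get("LTP")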
The page builds those values with JavaScript, so you have to let the JS run; you can use selenium for that, see the code below.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.mcxindia.com/market-data/market-watch")
wait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@class="symbol chnge-perc right5"]')))
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
print soup.findAll('div', attrs={'class': 'ltp green ltpcenter'})