Trying to scrape email using beautifulsoup - selenium

I am trying to scrape an email address, but it gives me None. This is the page link: https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
base_url = 'https://www.avocats-lille.com/'
url = 'https://www.avocats-lille.com/fr/annuaire/avocats-du-tableau-au-barreau-de-lille?view=entries'

# Raw string so "\P"/"\c" are not treated as escape sequences; Selenium 4
# expects the driver path to be wrapped in a Service object.
driver = webdriver.Chrome(service=Service(r"C:\Program Files (x86)\chromedriver.exe"))
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")

# Collect the per-lawyer detail-page links from the listing page.
tra = soup.find_all('h2', class_='title')
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        productlinks.append(base_url + link['href'])

# Visit each detail page and print the mailto link, if any.
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    sleep(5)
    details = soup.find_all("div", class_="item col-5")
    for detail in details:
        # BUG fix: find() expects a tag name, not a CSS selector, so the
        # original always returned None. Use select_one() with the CSS
        # selector, and search within the current detail element.
        email = detail.select_one('a[href^="mailto"]')
        print(email)

The links you are looking for are not inside the tra (title) elements.
You should change the code as follows to make it work:
tra = soup.find_all('div',class_='item')

The Email address is within the following element:
kamelabbas2002@yahoo.fr
Solution
Using Selenium to print the Email address i.e. the innertext attribute you can use either of the following locator strategies:
Using css_selector:
# Navigate with the public driver.get() API rather than the internal
# execute("get", {...}) command wrapper.
driver.get('https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry')
print(driver.find_element("css selector", 'a[href^="mailto"]').text)
Using xpath:
# Navigate with the public driver.get() API rather than the internal
# execute("get", {...}) command wrapper.
driver.get('https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry')
# BUG fix: "#href" is a Markdown-mangled "@href" — XPath's attribute axis.
print(driver.find_element("xpath", '//a[starts-with(@href, "mailto")]').text)
Console Output:
kamelabbas2002@yahoo.fr

Related

Fetch html and image from URL in python: HTTP Error 403 or Cloudflare-protected page with captcha

I want to get the HTML code from a URL with a Python script. When I use the 'urllib' library, it works for many sites, but for my specific URL I get 'HTTP Error 403: Forbidden'. Here is an example I am having problems with:
from urllib.request import Request, urlopen, urlretrieve

url = 'https://bama.ir/car/detail-szx9y9u-hyundai-sonata-hybrid-gls-plus-2017'
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}
req = Request(url, headers=header)
# Fix: use a context manager so the HTTP response is always closed,
# instead of leaking the connection returned by urlopen().
with urlopen(req) as resp:
    webpage = resp.read().decode('utf-8')
print(webpage)
Downloading the image directly with a Python script from this specific URL gives the same error. Example:
# Download the image, naming the local file after the URL's last path segment.
link = "https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/6d364e82-f39d-419a-b346-257014928907/CarImage9557638_2_thumb_900_600.jpeg"
filename = link.rsplit("/", 1)[1]
urlretrieve(link, filename)
When I use 'BeautifulSoup' or 'cloudscraper' or 'urllib3' library, Cloudflare-protected page with captcha is received. Here Example of that:
url = "https://bama.ir/car/detail-grxi644n-hyundai-genesis-coupe-2011"

# Attempt 1: plain requests + BeautifulSoup.
from bs4 import BeautifulSoup as bs
import requests
soup = bs(requests.get(url).content, "html.parser")
print(soup.text)

# Attempt 2: cloudscraper (tries to solve the Cloudflare challenge).
import cloudscraper
scraper = cloudscraper.create_scraper()
print(scraper.get(url).text)

# Attempt 3: raw urllib3 pool manager.
import urllib3
http = urllib3.PoolManager()
r = http.request("GET", url)
print(r.data)
When I use 'selenium' library, sometimes it is working but sometimes Cloudflare-protected page with captcha appearing. For downloading Image I just have to use screenshot function. Here Example of that:
from selenium import webdriver

url = "https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/a36114cd-1978-41a4-a558-cbe5f652faf1/CarImage9473771_0_1_thumb_900_600.jpg"
driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
html = driver.page_source
driver.save_screenshot("screenshot.png")
# BUG fix: close() only closes the current window and can leave the
# chromedriver process running; quit() shuts the whole session down.
driver.quit()

BeautifulSoup/Scraping- Python

I have a code that directs me to a website and prints all the titles, dates and times for each session.
However, if you click on each session on the website, there are a list of sub-sessions that drop down.
I want to print each title of the sub-session.
Here is the code I have
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests

driver = webdriver.Chrome()
session = []
driver.get('https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1')
time.sleep(3)  # give the JS-rendered accordion time to load
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'html.parser')

# Each accordion header holds a session title plus its time/date badges.
productlist = soup.find_all('div', class_='accordin_title')
for item in productlist:
    title = item.find('h4').text.strip()
    tim = item.find('span', class_='info_red').text.strip()
    dat = item.find('span', class_='info_blue').text.strip()
    session.append({"Title": title, "Time": tim, "Date": dat})
print(session)
Try the following to get the required content.
To get session title along with their sub-session titles:
import requests
from bs4 import BeautifulSoup

url = 'https://library.iaslc.org/conference-program?product_id=20&author=&category=&date=&session_type=&session=&presentation=&keyword=&available=&cme=&page=1'
with requests.Session() as s:
    s.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    r = s.get(url)
    soup = BeautifulSoup(r.text, "lxml")

    # Session title along with its sub-session titles.
    for items in soup.select("#accordin .accordin_details"):
        session_title = items.find_previous(class_="accordin_title").h4.get_text(strip=True)
        subsession_titles = [item.get_text(strip=True) for item in items.select(".accordin_list h4")]
        print(session_title, subsession_titles)

    # Only the sub-session titles.
    for item in soup.select("#accordin .accordin_details .accordin_list h4"):
        print(item.get_text(strip=True))

Selenium returns different html source than viewed in browser

I'm trying to use Selenium to load the next page of results by clicking the Load More button on this site.
However the source code of the html page loaded by selenium does not show(load) actual products which one can see when browsing.
Here is my code:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# browser = webdriver.Firefox()  # Chrome('./chromedriver.exe')
URL = "https://thekrazycouponlady.com/coupons-for/costco"
PATIENCE_TIME = 60
# BUG fix: "#class" is a Markdown-mangled "@class" — XPath's attribute axis.
LOAD_MORE_BUTTON_XPATH = '//button[@class = "kcl-btn ng-scope"]/span'

caps = DesiredCapabilities.PHANTOMJS
# driver = webdriver.Chrome(r'C:\Python3\selenium\webdriver\chromedriver_win32\chromedriver.exe')
caps["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
driver = webdriver.PhantomJS(r'C:\Python3\selenium\webdriver\phantomjs-2.1.1-windows\bin\phantomjs.exe', service_log_path=os.path.devnull, desired_capabilities=caps)
driver.get(URL)

# Keep clicking "Load More" until the button disappears (raises), then stop.
while True:
    try:
        time.sleep(20)
        html = driver.page_source.encode('utf-8')
        print(html)
        loadMoreButton = driver.find_element_by_xpath(LOAD_MORE_BUTTON_XPATH)
        loadMoreButton.click()
    except Exception as e:
        print(e)
        break
print("Complete")
driver.quit()
Not sure if I can attach sample html file here for reference.
Anyway, what is the problem and how do I load exactly the same page with selenium as i do via browser?
It might be due to the use of PhantomJS, it isn't maintained any more and deprecated from Selenium 3.8.1. Use Chrome headless instead.
options = Options()
# Fix for Selenium 4: the boolean `options.headless` attribute and the
# `chrome_options=` keyword were removed; pass a CLI switch and `options=`.
options.add_argument("--headless=new")
driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)

Webscraping call in python returns empty values

I'm trying to get the last traded prices (LTP) from different commodities from MCX website https://www.mcxindia.com/market-data/market-watch in python 2.0. Following is the code I'm using.
import requests
from bs4 import BeautifulSoup

url = 'https://www.mcxindia.com/market-data/market-watch'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# Look up the last-traded-price cells (empty here: prices are injected by JS).
soup.find_all('div', attrs={'class': 'ltp green ltpcenter'})
But when I run the code, I get empty values. I suspect the website queries some other web server for the values, because when I look at the source of the web page, I do not see the last traded prices there. Can anyone please help me how to get the price data into python?
the below code gets all the market data displayed on that page, extract whatever you want from the json response.
import requests

# POST to the site's own AJAX endpoint that backs the market-watch page;
# the JSON response contains all the displayed market data.
url = "https://www.mcxindia.com/backpage.aspx/GetMarketWatch"
headers = {
    "Host": "www.mcxindia.com",
    "Origin": "https://www.mcxindia.com",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    "Content-Type": "application/json",
    "Referer": "https://www.mcxindia.com/market-data/market-watch",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}
resp = requests.post(url, headers=headers)
market_data = resp.json()
You have to process the JS, you can use selenium to load JS, see the code below.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.mcxindia.com/market-data/market-watch")
# BUG fix: "#class" is a Markdown-mangled "@class" — XPath's attribute axis.
# Wait until the JS-rendered price table is visible before reading the page.
wait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@class="symbol chnge-perc right5"]')))
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
soup.findAll('div', attrs={'class': 'ltp green ltpcenter'})
# Fix: print() function form (the bare `print soup` statement is Python 2 only).
print(soup)

Beautiful soup getting name of <a> element within a div

I'm using Beautiful Soup for the first time, and I'm trying to get values of specific element in the webpage.
For example, in this code snippet:
<div class="otg-vendor-name"><a class="otg-vendor-name-link" href="http://www.3brotherskitchen.com" target="_blank">3 Brothers Kitchen</a></div>
I wish to get "3 Brothers Kitchen" from the `<a>` tag within the `<div>`.
So far, I've tried something which doesn't seem to work:
import urllib2
from bs4 import BeautifulSoup

url = "http://someurl"

def get_all_vendors():
    """Fetch `url` and return a list of vendor-name texts (empty on error)."""
    # Defined before the try so `return c` is safe on the error paths too.
    c = []
    try:
        web_page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(web_page)
        # BUG fix: findAll() returns a ResultSet, which has no `.contents`;
        # iterate the matches and collect each div's text instead.
        for div in soup.findAll("div", {"class": 'otg-vendor-name'}):
            c.append(div.get_text())
        print(c)
    except urllib2.HTTPError:
        print("HTTPERROR!")
    except urllib2.URLError:
        print("URLERROR!")
    return c
You can get it by a CSS selector:
soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
Or, via find():
soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
Update (Using requests and providing User-Agent header):
from bs4 import BeautifulSoup
import requests

url = 'http://offthegridsf.com/vendors#food'
with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    session.get(url)  # warm-up request (cookies); its response is discarded
    response = session.get(url)
    # Fix: name the parser explicitly and use print() function calls
    # (the bare `print x` statements are Python 2 only).
    soup = BeautifulSoup(response.content, "html.parser")
    print(soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text)
    print(soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text)