Web Scraping with Python: problem with BeautifulSoup - beautifulsoup

Please help me use BeautifulSoup to scrape financial values from investing.com with Python 3.
Whatever I do, I never get any value, and the class I filter on keeps changing on the web page because it is a live value.
import requests
from bs4 import BeautifulSoup

url = "https://es.investing.com/indices/spain-35-futures"

# Trigger price typed by the user, converted to a number.
precio_objetivo = input("Introduce el PRECIO del disparador:")
precio_objetivo = float(precio_objetivo)
print(precio_objetivo)

while True:
    # The page is requested with the default headers of the requests library.
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    # Look up the <span> holding the current value by class, id and dir.
    precio_actual = soup.find(
        'span',
        attrs={
            'class': 'arial_26 inlineblock pid-8828-last',
            'id': 'last_last',
            'dir': 'ltr',
        },
    )
    print(precio_actual)
    # Single pass for now: leave the loop right after the first fetch.
    break
When I don't apply any filter at soup.find (trying at least to get all the web page) I get this result:
<bound method Tag.find_all of
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>403 You are banned from this site. Please contact via a different client configuration if you believe that this is a mistake. </title>
</head>
<body>
<h1>Error 403 You are banned from this site. Please contact via a different client configuration if you believe that this is a mistake.</h1>
<p>You are banned from this site. Please contact via a different client configuration if you believe that this is a mistake.</p>
<h3>Guru Meditation:</h3>
<p>XID: 850285196</p>
<hr/>
<p>Varnish cache server</p>
</body>
</html>

It looks like that website detects where the request is coming from, so we need to 'fool' it into thinking we're on a browser.
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Build the request with a browser-like User-Agent header.
request = Request(
    "https://es.investing.com/indices/spain-35-futures",
    headers={"User-Agent": "Mozilla/5.0"},
)
raw_html = urlopen(request).read()
soup = BeautifulSoup(raw_html, "html.parser")
print(soup)

The web server detects the python script as a bot and hence blocks it.
By using headers you can prevent it and the following code does it:
import requests
from bs4 import BeautifulSoup

url = "https://es.investing.com/indices/spain-35-futures"
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36',
}
page = requests.get(url, headers=header)
soup = BeautifulSoup(page.content, 'html.parser')
# The element of interest looks like:
# <span class="arial_26 inlineblock pid-8828-last" dir="ltr" id="last_last">9.182,5</span>
price_tag = soup.find('span', attrs={'id': 'last_last'})
# get_text() extracts the text inside the tag.
result = price_tag.get_text()
print(result)

You can try using the Selenium web driver. Otherwise you will run into this problem more often when the number of requests is high. Sites that rely on JavaScript can also be a problem.
from selenium import webdriver

url = 'https://example.com/'

# Start Chrome with the 'headless' argument.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
driver = webdriver.Chrome(
    executable_path='/usr/local/bin/chromedriver',
    options=chrome_options,
)
driver.get(url)

Related

Trying to scrape email using beautifulsoup

I am trying to scrape the email address, but it gives me None. This is the page link: https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}
base_url = 'https://www.avocats-lille.com/'
url = 'https://www.avocats-lille.com/fr/annuaire/avocats-du-tableau-au-barreau-de-lille?view=entries'

# Raw string: the backslashes in the Windows path are literal characters,
# not escape sequences.
driver = webdriver.Chrome(r"C:\Program Files (x86)\chromedriver.exe")
driver.get(url)
soup = BeautifulSoup(driver.page_source, "html.parser")

# Collect the detail-page links found under the <h2 class="title"> headings.
tra = soup.find_all('h2', class_='title')
productlinks = []
for links in tra:
    for link in links.find_all('a', href=True):
        comp = base_url + link['href']
        productlinks.append(comp)

# Fetch every detail page and try to print its e-mail link.
for link in productlinks:
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    sleep(5)
    details = soup.find_all("div", class_="item col-5")
    for detail in details:
        # NOTE: find() expects a tag name, not a CSS selector, so this lookup
        # yields None -- the symptom described in the question.
        email = soup.find('a[href^="mailto"]')
        print(email)
The links you are looking for are not inside the tra (title) elements.
You should change the code as follows to make it work:
# Select <div> elements with class "item" instead of the <h2 class="title"> headings.
tra = soup.find_all('div',class_='item')
The Email address is within the following element:
kamelabbas2002@yahoo.fr
Solution
Using Selenium to print the Email address i.e. the innertext attribute you can use either of the following locator strategies:
Using css_selector:
# Navigate to the entry page, then read the text of the first mailto: link.
entry_url = 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'
driver.execute("get", {'url': entry_url})
mail_link = driver.find_element("css selector", 'a[href^="mailto"]')
print(mail_link.text)
Using xpath:
# Navigate to the entry page, then read the text of the first mailto: link.
driver.execute("get", {'url': 'https://www.avocats-lille.com//fr/annuaire/avocats-du-tableau-au-barreau-de-lille/2?view=entry'})
# In XPath an attribute is addressed with "@" (here: @href).
print(driver.find_element("xpath", '//a[starts-with(@href, "mailto")]').text)
Console Output:
kamelabbas2002@yahoo.fr

Fetch html and image from URL in python: HTTP Error 403 or Cloudflare-protected page with captcha

I want to get the HTML code from a URL with a Python script. When I use the 'urllib' library, it works for many sites, but for my specific URL I get 'HTTP Error 403: Forbidden'. Here is an example I am having the problem with:
from urllib.request import Request, urlopen, urlretrieve

url = 'https://bama.ir/car/detail-szx9y9u-hyundai-sonata-hybrid-gls-plus-2017'

# Browser-like headers sent along with the request.
header = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.11',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'Connection': 'keep-alive',
}

request = Request(url, headers=header)
raw_bytes = urlopen(request).read()
webpage = raw_bytes.decode('utf-8')
print(webpage)
Downloading an image directly with a Python script from this specific URL gives the same error. An example of that:
link = "https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/6d364e82-f39d-419a-b346-257014928907/CarImage9557638_2_thumb_900_600.jpeg"
# The local file name is the part of the link after its last "/".
name = link.rsplit("/", maxsplit=1)[1]
urlretrieve(link, name)
When I use the 'BeautifulSoup', 'cloudscraper' or 'urllib3' library, a Cloudflare-protected page with a captcha is received. Here is an example of that:
url = "https://bama.ir/car/detail-grxi644n-hyundai-genesis-coupe-2011"

# Attempt 1: requests + BeautifulSoup.
from bs4 import BeautifulSoup as bs
import requests
page_bytes = requests.get(url).content
soup = bs(page_bytes, "html.parser")
print(soup.text)

# Attempt 2: cloudscraper.
import cloudscraper
scraper = cloudscraper.create_scraper()
scraped = scraper.get(url)
print(scraped.text)

# Attempt 3: urllib3.
import urllib3
http = urllib3.PoolManager()
r = http.request("GET", url)
print(r.data)
When I use the 'selenium' library, sometimes it works, but sometimes the Cloudflare-protected page with a captcha appears. For downloading an image I just have to use the screenshot function. Here is an example of that:
from selenium import webdriver

url = "https://cdn.bama.ir/uploads/BamaImages/VehicleCarImages/a36114cd-1978-41a4-a558-cbe5f652faf1/CarImage9473771_0_1_thumb_900_600.jpg"

driver = webdriver.Chrome('chromedriver.exe')
driver.get(url)
# Keep the page source, then store what the browser shows as a PNG file.
html = driver.page_source
driver.save_screenshot("screenshot.png")
driver.close()

Webscraping call in python returns empty values

I'm trying to get the last traded prices (LTP) from different commodities from MCX website https://www.mcxindia.com/market-data/market-watch in python 2.0. Following is the code I'm using.
import requests
from bs4 import BeautifulSoup

url = 'https://www.mcxindia.com/market-data/market-watch'

page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')
# Search the fetched HTML for the <div> elements with the LTP classes.
soup.find_all('div', attrs={'class': 'ltp green ltpcenter'})
But when I run the code, I get empty values. I suspect the website queries some other web server for the values, because when I look at the source of the web page, I do not see the last traded prices there. Can anyone please help me how to get the price data into python?
The code below gets all the market data displayed on that page; extract whatever you want from the JSON response.
import requests

url = "https://www.mcxindia.com/backpage.aspx/GetMarketWatch"

# Headers sent with the POST request to the endpoint above.
headers = {
    "Host": "www.mcxindia.com",
    "Origin": "https://www.mcxindia.com",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    "Content-Type": "application/json",
    "Referer": "https://www.mcxindia.com/market-data/market-watch",
    "Accept": "application/json, text/javascript, */*; q=0.01",
}

resp = requests.post(url, headers=headers)
# Decode the JSON body of the response.
market_data = resp.json()
You have to process the JS; you can use Selenium to load it. See the code below.
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wait
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get("https://www.mcxindia.com/market-data/market-watch")
# Wait up to 10 seconds until an element with these classes is visible.
# In XPath an attribute is addressed with "@" (here: @class).
wait(driver, 10).until(EC.visibility_of_element_located(
    (By.XPATH, '//*[@class="symbol chnge-perc right5"]')))
source = driver.page_source
soup = BeautifulSoup(source, 'html.parser')
soup.findAll('div', attrs={'class': 'ltp green ltpcenter'})
# print() with one argument works on both Python 2 and Python 3.
print(soup)

Not able to get hidden contents of a website

I am trying to scrape a website with the help of BeautifulSoup. I am not able to get the contents of the website, even though they are in the source code when I inspect the site.
import requests
import urllib
from bs4 import BeautifulSoup

url1 = 'https://recruiting.ultipro.com/usg1006/JobBoard/dfc53730-57d1-3460-336f-ddafabd108f3/?q=&o=postedDateDesc'
# Only the requests module is imported, so get() has to be called through it.
response1 = requests.get(url1)
print(response1.text[:500])
html_soup1 = BeautifulSoup(response1.text, 'html.parser')
type(html_soup1)
# Look for the <div> whose data-bind attribute is "foreach: opportunities".
all_info1 = html_soup1.find("div", {"data-bind": "foreach: opportunities"})
all_info1
# Inside it, collect the <div data-automation="opportunity"> elements.
all_automation1 = all_info1.find_all("div", {"data-automation": "opportunity"})
all_automation1
In the source code there is "job-title", "location" and "description" and other details but I am not able to see the same details in the html contents.
You should try like this or anything similar to fetch the title from that page:
import time
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome()
driver.get('https://recruiting.ultipro.com/usg1006/JobBoard/dfc53730-57d1-3460-336f-ddafabd108f3/?q=&o=postedDateDesc')
time.sleep(3)  # let the browser load its content
soup = BeautifulSoup(driver.page_source, 'lxml')
# Print the text of every ".opportunity-link" element found under an <h3>.
for item in soup.select("h3 .opportunity-link"):
    print(item.text)
driver.quit()

Beautiful soup getting name of <a> element within a div

I'm using Beautiful Soup for the first time, and I'm trying to get values of specific element in the webpage.
For example, in this code snippet:
<div class="otg-vendor-name"><a class="otg-vendor-name-link" href="http://www.3brotherskitchen.com" target="_blank">3 Brothers Kitchen</a></div>
I wish to get "3 Brothers Kitchen" from the <a> tag within the <div>.
So far, I've tried something which doesn't seem to work:
import urllib2
from bs4 import BeautifulSoup

url = "http://someurl"


def get_all_vendors():
    """Fetch ``url`` and return the list of collected vendor entries.

    The list is returned empty when the page cannot be fetched.
    """
    # Created before the try block so that the final return always has a value.
    c = []
    try:
        web_page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(web_page)
        # NOTE: findAll() returns a list-like result set, which has no
        # .contents attribute -- this is the line that does not work.
        c.append(soup.findAll("div", {"class": 'otg-vendor-name'}).contents)
        print(c)
    except urllib2.HTTPError:
        print("HTTPERROR!")
    except urllib2.URLError:
        print("URLERROR!")
    return c
You can get it by a CSS selector:
# Text of the first <a class="otg-vendor-name-link"> that is a direct child of a <div class="otg-vendor-name">.
soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text
Or, via find():
# First <div class="otg-vendor-name">, then the first matching <a> inside it; .text is that link's text.
soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text
Update (Using requests and providing User-Agent header):
from bs4 import BeautifulSoup
import requests

url = 'http://offthegridsf.com/vendors#food'

with requests.Session() as session:
    session.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.115 Safari/537.36'}
    # NOTE(review): the page is requested twice; presumably the first call
    # only primes the session (e.g. cookies) -- confirm it is needed.
    session.get(url)
    response = session.get(url)
    # Name the parser explicitly so the result does not depend on which
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.content, 'html.parser')
    # print() with one argument works on both Python 2 and Python 3.
    print(soup.select('div.otg-vendor-name > a.otg-vendor-name-link')[0].text)
    print(soup.find('div', class_='otg-vendor-name').find('a', class_='otg-vendor-name-link').text)