bs4 image crawler: download a pile of images in Python - BeautifulSoup

import requests
import urllib
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

for idx in range(1, 17):
    url = "https://www.coupang.com/np/categories/311357?page=" + str(idx)
    print(url)
    result = requests.get(url, headers=headers)
    soup_obj = BeautifulSoup(result.content, "html.parser")
    div = soup_obj.findAll("div", {"class": "name"})
    lis = soup_obj.find("ul", {"id": "productList"}).findAll("li")
    for li in lis:
        name = li.find("div", {"class": "name"})
        img = li.find("dt", {"class": "image"}).find("img", {"src": ""})
        print("name: " + name.text.strip())
        urllib.request.urlretrieve(img, "./imagepile")
How can I fix the line urllib.request.urlretrieve(img, "./imagepile")? Please help.
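For what it's worth, the core problem with that line is that urlretrieve expects a URL string and a destination file path, while the code passes a bs4 Tag object and a bare directory. A minimal sketch of one way to fix it (the protocol-relative src handling matches the answers below; the helper name and output folder are my own):

import os
import urllib.request


def save_image(img_tag, out_dir="imagepile"):
    """Download the picture referenced by one <img> tag into out_dir (hypothetical helper)."""
    os.makedirs(out_dir, exist_ok=True)
    src = img_tag["src"]                                  # often a protocol-relative URL like //host/path/img.jpg
    url = "https:" + src if src.startswith("//") else src
    # assumes the URL path ends with a usable file name
    path = os.path.join(out_dir, os.path.basename(src.split("?")[0]))
    urllib.request.urlretrieve(url, path)
    return path

Inside the question's inner loop it would be called as save_image(li.find("dt", {"class": "image"}).find("img")). The answer below sidesteps urllib entirely and downloads the image bytes with the same requests session: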

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}


def main(url):
    with requests.Session() as req:
        for item in range(1, 18):
            print(f"Extracting Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            for item in soup.select("dt.image"):
                print(item.img['alt'], f"https:{item.img['src']}")


main("https://www.coupang.com/np/categories/311357?page={}")
Download Version:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}


def main(url):
    with requests.Session() as req:
        for item in range(1, 3):
            print(f"Extracting Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = [[item.img['alt'], f'https:{item.img["src"]}']
                      for item in soup.select("dt.image")]
            for el in target:
                print(f"Saving {el[0]}.jpg")
                r = req.get(el[1])
                with open(f"{el[0]}.jpg", 'wb') as f:
                    f.write(r.content)


main("https://www.coupang.com/np/categories/311357?page={}")

Related

How to avoid ModuleNotFoundError: No module named 'pandas' in VSCode

I'm trying to run code from a tutorial in VSCode and keep getting errors about there being no pandas module, but I know it is installed.
I've tried using "Select Interpreter" to swap between versions of Python, but then I have issues with the requests module. The code below does work if I comment out the pandas import, but I can't understand why this code doesn't work.
I tried using pip3 install pandas but the terminal tells me it is already installed.
The code is:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import smtplib
import csv
#import pandas as pd


def check_price():
    URL = "https://www.amazon.co.uk/Funny-Data-T-shirt-Mining-T-Shirt/dp/B0B68TSGCR/ref=sr_1_1?keywords=funny+data+mining&qid=1560000000&s=gateway&sr=8-1"
    html = requests.get(URL).text
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
    page = requests.get(URL, headers=headers)
    soup1 = BeautifulSoup(page.content, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    title = soup2.find(id="productTitle").get_text().strip()
    price_whole = soup2.find(attrs={"class": "a-price-whole"}).get_text()
    price_fraction = soup2.find(attrs={"class": "a-price-fraction"}).get_text()
    price = (f"{price_whole.split()[0]}.{price_fraction.split()[0]}")
    today = datetime.date.today()
    header = ['Title', 'Price', 'Date']
    data = [title, price, today]
    with open("amazonscraper2.csv", "a+", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerow(data)


while(True):
    check_price()
    time.sleep(5)
    print("running")
    #df = pd.read_csv("amazonscraper2.csv")
    #print(df)
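If it helps with the interpreter confusion, here is a tiny diagnostic you can drop at the top of the script (standard library only): it prints which Python executable VSCode actually runs, and that is the environment pandas has to be installed into.

import sys

# Show which interpreter VSCode is using to run this file.
print(sys.executable)
print(sys.version)

# Then install pandas into exactly that interpreter from a terminal, e.g.:
#   /path/printed/above -m pip install pandas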

Unable to extract Email addresses from Pop Up

I have a problem. I am trying to extract the email addresses from a website.
To extract an email address, I first have to click on the email icon for it to appear. Once I click on the icon, a new "popup" appears.
I have tried using Selenium's get_attribute for data-mailto-token and data-mailto-vector, but without any success. How can I extract the email addresses with Python from these so-called "popups"? Any help would be greatly appreciated!
Kind regards
Linus
I have tried using Selenium and looked into further libraries for cross-platform access, but without any success.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import requests
import re

#card_small = driver.find_elements_by_class_name("Card small")
i_num = 1
list_links = []
list_links_all = []
num_inc = 1
for i_p in range(0, 14):
    url = "https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-" + str(num_inc) + "?filterValues=QWN0aXZlLEluYWN0aXZlOzs7OzQsMzs7Ozs7OzQ5LDEzLDUsNDU7&cHash=30901b0e3080a928cd0ad32522e81b3f"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    time.sleep(5)
    driver.find_element_by_css_selector("body > div.cc-window.cc-banner.cc-type-info.cc-theme-block.cc-bottom.cc-visible > div > div.cc-actions > a.cc-btn.cc-allow").click()
    try:
        driver.execute_script("window.scrollTo(0,2150)")
        target = driver.find_elements_by_tag_name("a")
        for i in target:
            list_links.append(i.get_attribute("href"))
        for i in range(10, 22):
            url_new = list_links[i]
            print(url_new)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
            page = requests.get(url_new, headers=headers)
            soup = BeautifulSoup(page.text, 'html.parser')
            name = soup.find('span', class_="Avatar--name")
            address = soup.find_all('span', class_="Button--label")
            phone = soup.find_all('span', class_="Button--label")
            if name != None:
                name_text = soup.find('span', class_="Avatar--name").text
                #print(name_text)
            if address != None:
                for i in address:
                    search = i.select("span p")
                    if search != []:
                        print(search[0].text)
            if phone != None:
                for i in phone:
                    match = re.search("[+]\d{2} \d{2} \d{3} \d{2} \d{2}", i.text)
                    if match != None:
                        print(match.group())
            time.sleep(5)
            driver.get(url_new)
            try:
                driver.execute_script("window.scrollTo(0,900)")
                time.sleep(5)
                element = driver.find_element_by_link_text("E-Mail")
                info = element.get_attribute("data-mailto-token")
                print(info)
                element.click()
            except NoSuchElementException:
                pass
        list_links = []
        num_inc = num_inc + 1
        i_num = i_num + 1
        driver.close()
        """
        driver.find_element_by_css_selector("#main-content > section.CardGrid > nav > a.Button.nolabel.primary.Pagination--button.Pagination--next").click()
        time.sleep(5)
        print("This is the end of page: "+str(i_num))
        i_num = i_num + 1
        time.sleep(5)
        """
    except ElementClickInterceptedException:
        break
The email address can be obtained by decrypting the combination of data-mailto-token and data-mailto-vector values found in the button.
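As a starting point, here is a small sketch of collecting those two attributes with Selenium 4 selectors (the attribute names come from the question; the decryption of the token/vector pair is done by the site's own JavaScript and is not reproduced here):

from selenium import webdriver
from selenium.webdriver.common.by import By


def read_mailto_attributes(driver, detail_url):
    """Fetch the encrypted mailto attributes from one hotel detail page."""
    driver.get(detail_url)
    button = driver.find_element(By.CSS_SELECTOR, "[data-mailto-token]")
    token = button.get_attribute("data-mailto-token")
    vector = button.get_attribute("data-mailto-vector")
    # Turning these into a plain address requires the site's own decryption
    # routine, which is not shown here.
    return token, vector


# usage, with a detail-page URL taken from the question's list_links:
#   driver = webdriver.Chrome()
#   token, vector = read_mailto_attributes(driver, url_new)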

Extend scrapy settings value per spider

Assume we want to add a specific item pipeline for a particular spider. To comply with the DRY principle, I just want to read the current pipelines from the settings, add my specific pipeline, and set the result back on the settings for that spider.
We cannot accomplish this via the custom_settings class attribute. Even setting it via from_crawler does not work:
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    crawler.settings.setdict({'ITEM_PIPELINES':
        {**dict(crawler.settings.getdict('ITEM_PIPELINES')),
         'myscrapers.pipelines.CustomPipeline': 11}
    }, priority='spider')
    return super().from_crawler(cls, crawler, *args, **kwargs)
That causes this error:
TypeError: Trying to modify an immutable Settings object
How can we correctly extend a settings value in scrapy at spider level?
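Before the full answers below, here is a compact sketch of the merge the question describes, using the update_settings hook that the second answer relies on; it runs before the Settings object is frozen, so the immutability error does not occur (the pipeline path is taken from the question):

import scrapy


class MySpider(scrapy.Spider):
    name = "myspider"

    @classmethod
    def update_settings(cls, settings):
        # Apply custom_settings (and whatever else the base class does) first.
        super().update_settings(settings)
        # Merge instead of replace: keep the project-level pipelines, add ours.
        pipelines = dict(settings.getdict("ITEM_PIPELINES"))
        pipelines["myscrapers.pipelines.CustomPipeline"] = 11
        settings.set("ITEM_PIPELINES", pipelines, priority="spider")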
You can set the settings for the process:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spidername)
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spidername)
    process.start()
But if you really want to do all this inside the spider, you can override the update_settings method:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(cls, 'custom_settings1' if getattr(cls, 'is_pipeline_1', True) else 'custom_settings2', None) or {}, priority='spider')

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)
    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)
    process.start()
But honestly I think the first way is better...

For loop returning error when trying to loop through list with BeautifulSoup

I am trying to create a loop that will go through a list of locations, extract the necessary data for each, and append it to the data from the other locations.
I feel that the code I have written is good, but I keep getting this error:
AttributeError: 'NoneType' object has no attribute 'find_all'
but I know that shouldn't be the case.
Any help would be appreciated. Here is my code:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests

locations = ['las-vegas-nv', 'denver-co']
for location in locations:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    url = 'https://www.apartments.com/{location}/1-bedrooms/'
    page = requests.get(url, headers=headers)
    soup = bs(page.text, 'lxml')
    table = soup.find("table", class_="rentTrendGrid")
    rows = []
    for tr in table.find_all('tr'):
        rows.append([td.text for td in tr.find_all(['th', 'td'])])
    #header_row = rows[0]
    rows = list(zip(*rows[0:]))  # transpose the table
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df['City'] = location
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
print(df)
Andrej was right, super simple: I just had to put the 'f' in front of the URL string.
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests

locations = ['las-vegas-nv', 'denver-co']
dfs = []  # collect one DataFrame per location
for location in locations:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    url = f'https://www.apartments.com/{location}/1-bedrooms/'
    page = requests.get(url, headers=headers)
    soup = bs(page.text, 'lxml')
    table = soup.find("table", class_="rentTrendGrid")
    rows = []
    for tr in table.find_all('tr'):
        rows.append([td.text for td in tr.find_all(['th', 'td'])])
    #header_row = rows[0]
    rows = list(zip(*rows[0:]))  # transpose the table
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df['City'] = location
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
print(df)
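Even with the f-string fix, the same AttributeError can come back whenever a page has no rentTrendGrid table (blocked request or changed markup). A small defensive sketch under that assumption, reusing the parsing logic from the answer above:

from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup


def rent_trend_frame(soup: BeautifulSoup, location: str) -> Optional[pd.DataFrame]:
    """Return the rent-trend table as a DataFrame, or None if it is missing."""
    table = soup.find("table", class_="rentTrendGrid")
    if table is None:
        print(f"No rentTrendGrid table found for {location}")
        return None
    rows = [[cell.text for cell in tr.find_all(["th", "td"])] for tr in table.find_all("tr")]
    rows = list(zip(*rows))  # transpose, as in the answer above
    return pd.DataFrame(rows[1:], columns=rows[0])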

FinViz stock scraping giving error: AMZN not found, 'NoneType' object has no attribute 'find_next'

I am new to the BeautifulSoup package.
I am playing with code that I got from some website, and I am stuck with the above error. Please help.
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import requests


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            url = ("http://finviz.com/quote.ashx?t=" + symbol.lower())
            soup = bs(requests.get(url).content)
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
stock_list = ['AMZN', 'GOOG', 'PG', 'KO', 'IBM', 'DG', 'XOM',
              'KO', 'PEP', 'MT', 'NL', 'ALDW', 'DCM', 'GSB', 'LPL']
metric = ['P/B',
          'P/E',
          'Forward P/E'
          ]
df = pd.DataFrame(index=stock_list, columns=metric)
df = get_fundamental_data(df)
df.head()
I was able to fix my own code based on this thread.
BeautifulSoup Scraping ERROR: AttributeError: 'NoneType' object has no attribute
My code after fix:
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
           'Upgrade-Insecure-Requests': '1', 'Cookie': 'v2=1495343816.182.19.234.142', 'Accept-Encoding': 'gzip, deflate, sdch',
           'Referer': "http://finviz.com/quote.ashx?t="}


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            #url = ("http://finviz.com/quote.ashx?t=" + symbol.lower())
            r = requests.get("http://finviz.com/quote.ashx?t=" + symbol.lower(), headers=headers)
            soup = bs(r.content, 'html.parser')
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
stock_list = ['AMZN', 'GOOG', 'ABC']
metric = ['P/B',
          'P/E',
          'Forward P/E'
          ]
df = pd.DataFrame(index=stock_list, columns=metric)
df = get_fundamental_data(df)
df.head()
I ran your code and when you print(soup) in the for loop, you get this:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>403 - Forbidden: Access is denied.</title>
<style type="text/css">
<!--
body{margin:0;font-size:.7em;font-family:Verdana, Arial, Helvetica, sans-serif;background:#EEEEEE;}
fieldset{padding:0 15px 10px 15px;}
h1{font-size:2.4em;margin:0;color:#FFF;}
h2{font-size:1.7em;margin:0;color:#CC0000;}
h3{font-size:1.2em;margin:10px 0 0 0;color:#000000;}
#header{width:96%;margin:0 0 0 0;padding:6px 2% 6px 2%;font-family:"trebuchet MS", Verdana, sans-serif;color:#FFF;
background-color:#555555;}
#content{margin:0 0 0 2%;position:relative;}
.content-container{background:#FFF;width:96%;margin-top:8px;padding:10px;position:relative;}
-->
</style>
</head>
<body>
<div id="header"><h1>Server Error</h1></div>
<div id="content">
<div class="content-container"><fieldset>
<h2>403 - Forbidden: Access is denied.</h2>
<h3>You do not have permission to view this directory or page using the credentials that you supplied.</h3>
</fieldset></div>
</div>
</body>
</html>
@murali
Your code is very good. I have a question: it would be better for me if I could import the symbols from an Excel table. With your functions and the following code, I get no data. In test.xlsx I have only one column, Symbol:
import pandas as pd
from pandas_datareader import data as pdr
import os
from pandas import ExcelWriter
from bs4 import BeautifulSoup as bs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
           'Upgrade-Insecure-Requests': '1', 'Cookie': 'v2=1495343816.182.19.234.142', 'Accept-Encoding': 'gzip, deflate, sdch',
           'Referer': "http://finviz.com/quote.ashx?t="}


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            r = requests.get("http://finviz.com/quote.ashx?t=" + symbol.lower(), headers=headers)
            soup = bs(r.content, 'html.parser')
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
#stock_list = ['AMZN', 'GOOG','ABC']
filein = "test.xlsx"
stocklist = pd.read_excel(filein)
metric = ['Symbol', 'Company', 'Sector', 'Market Cap', 'Shs Float', 'Insider Own', 'Market Cap',
          'Shs Float''Insider Own', 'P/B', 'P/E', 'Forward P/E', 'PEG', 'Debt/Eq', 'EPS (ttm)', 'Dividend %',
          'ROE', 'ROI', 'EPS Q/Q', 'Price', 'Prev Close', 'SMA20', 'SMA50', 'SMA200', 'Perf Half Y', 'Perf Month',
          'Perf Quarter', 'Perf Week', 'Perf YTD', 'Perf Year']
df = pd.DataFrame(data=stocklist, columns=metric)
df = get_fundamental_data(df)
df.head()
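A likely reason for the empty result, offered as a guess from the code above: get_fundamental_data iterates over df.index, but pd.DataFrame(data=stocklist, columns=metric) leaves a default integer index, so no ticker symbol is ever looked up. A minimal sketch of the adjustment (the column name Symbol is taken from the description of test.xlsx):

import pandas as pd

# Read the tickers and use them as the index, so that df.index yields symbols.
stocklist = pd.read_excel("test.xlsx")           # one column named "Symbol"
metric = ['P/B', 'P/E', 'Forward P/E']           # metric names become the columns
df = pd.DataFrame(index=stocklist['Symbol'], columns=metric)
# df = get_fundamental_data(df)                  # then scrape as before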