bs4 image crawler: download a pile of images in Python - BeautifulSoup

import requests
import urllib
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

for idx in range(1, 17):
    url = "https://www.coupang.com/np/categories/311357?page=" + str(idx)
    print(url)
    result = requests.get(url, headers=headers)
    soup_obj = BeautifulSoup(result.content, "html.parser")
    div = soup_obj.findAll("div", {"class": "name"})
    lis = soup_obj.find("ul", {"id": "productList"}).findAll("li")
    for li in lis:
        name = li.find("div", {"class": "name"})
        img = li.find("dt", {"class": "image"}).find("img", {"src": ""})
        print("name: " + name.text.strip())
        urllib.request.urlretrieve(img, "./imagepile")
How can I fix the line urllib.request.urlretrieve(img, "./imagepile")? Please help.
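For what it's worth, the core problem with that line is that urlretrieve expects a URL string and a destination file path, while the code passes a bs4 Tag object and a bare directory. A minimal sketch of one way to fix it (the protocol-relative src handling matches the answers below; the helper name and output folder are my own):

import os
import urllib.request


def save_image(img_tag, out_dir="imagepile"):
    """Download the picture referenced by one <img> tag into out_dir (hypothetical helper)."""
    os.makedirs(out_dir, exist_ok=True)
    src = img_tag["src"]                                  # often a protocol-relative URL like //host/path/img.jpg
    url = "https:" + src if src.startswith("//") else src
    # assumes the URL path ends with a usable file name
    path = os.path.join(out_dir, os.path.basename(src.split("?")[0]))
    urllib.request.urlretrieve(url, path)
    return path

Inside the question's inner loop it would be called as save_image(li.find("dt", {"class": "image"}).find("img")). The answer below sidesteps urllib entirely and downloads the image bytes with the same requests session: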

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}


def main(url):
    with requests.Session() as req:
        for item in range(1, 18):
            print(f"Extracting Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            for item in soup.select("dt.image"):
                print(item.img['alt'], f"https:{item.img['src']}")


main("https://www.coupang.com/np/categories/311357?page={}")
Download Version:
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'}


def main(url):
    with requests.Session() as req:
        for item in range(1, 3):
            print(f"Extracting Page# {item}")
            r = req.get(url.format(item), headers=headers)
            soup = BeautifulSoup(r.content, 'html.parser')
            target = [[item.img['alt'], f'https:{item.img["src"]}']
                      for item in soup.select("dt.image")]
            for el in target:
                print(f"Saving {el[0]}.jpg")
                r = req.get(el[1])
                with open(f"{el[0]}.jpg", 'wb') as f:
                    f.write(r.content)


main("https://www.coupang.com/np/categories/311357?page={}")

Related

How to avoid ModuleNotFoundError: No module named 'pandas' in VSCode

I'm trying to run code from a tutorial in VSCode and keep getting errors about there being no pandas module, but I know it is installed.
I've tried using "Select Interpreter" to swap between versions of Python, but then I have issues with the requests module. The code below does work if I comment out the pandas import, but I can't understand why this code doesn't work.
I tried using pip3 install pandas but the terminal tells me it is already installed.
The code is:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import smtplib
import csv
#import pandas as pd


def check_price():
    URL = "https://www.amazon.co.uk/Funny-Data-T-shirt-Mining-T-Shirt/dp/B0B68TSGCR/ref=sr_1_1?keywords=funny+data+mining&qid=1560000000&s=gateway&sr=8-1"
    html = requests.get(URL).text
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"}
    page = requests.get(URL, headers=headers)
    soup1 = BeautifulSoup(page.content, "html.parser")
    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    title = soup2.find(id="productTitle").get_text().strip()
    price_whole = soup2.find(attrs={"class": "a-price-whole"}).get_text()
    price_fraction = soup2.find(attrs={"class": "a-price-fraction"}).get_text()
    price = (f"{price_whole.split()[0]}.{price_fraction.split()[0]}")
    today = datetime.date.today()
    header = ['Title', 'Price', 'Date']
    data = [title, price, today]
    with open("amazonscraper2.csv", "a+", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerow(data)


while(True):
    check_price()
    time.sleep(5)
    print("running")
    #df = pd.read_csv("amazonscraper2.csv")
    #print(df)
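If it helps with the interpreter confusion, here is a tiny diagnostic you can drop at the top of the script (standard library only): it prints which Python executable VSCode actually runs, and that is the environment pandas has to be installed into.

import sys

# Show which interpreter VSCode is using to run this file.
print(sys.executable)
print(sys.version)

# Then install pandas into exactly that interpreter from a terminal, e.g.:
#   /path/printed/above -m pip install pandas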

Unable to extract Email addresses from Pop Up

I have a problem. I am trying to extract the email addresses from a website.
To extract an email address, I first have to click on the email icon for it to appear. Once I click on the icon, a new "popup" appears.
I have tried using Selenium's get_attribute for data-mailto-token and data-mailto-vector, but without any success. How can I extract the email addresses with Python from these so-called "popups"? Any help would be greatly appreciated!
Kind regards
Linus
I have tried using Selenium and looked into further libraries for cross-platform access, but without any success.
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import requests
import re

#card_small = driver.find_elements_by_class_name("Card small")
i_num = 1
list_links = []
list_links_all = []
num_inc = 1
for i_p in range(0, 14):
    url = "https://www.hotelleriesuisse.ch/de/branche-und-politik/branchenverzeichnis/hotel-page-" + str(num_inc) + "?filterValues=QWN0aXZlLEluYWN0aXZlOzs7OzQsMzs7Ozs7OzQ5LDEzLDUsNDU7&cHash=30901b0e3080a928cd0ad32522e81b3f"
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.get(url)
    time.sleep(5)
    driver.find_element_by_css_selector("body > div.cc-window.cc-banner.cc-type-info.cc-theme-block.cc-bottom.cc-visible > div > div.cc-actions > a.cc-btn.cc-allow").click()
    try:
        driver.execute_script("window.scrollTo(0,2150)")
        target = driver.find_elements_by_tag_name("a")
        for i in target:
            list_links.append(i.get_attribute("href"))
        for i in range(10, 22):
            url_new = list_links[i]
            print(url_new)
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'}
            page = requests.get(url_new, headers=headers)
            soup = BeautifulSoup(page.text, 'html.parser')
            name = soup.find('span', class_="Avatar--name")
            address = soup.find_all('span', class_="Button--label")
            phone = soup.find_all('span', class_="Button--label")
            if name != None:
                name_text = soup.find('span', class_="Avatar--name").text
                #print(name_text)
            if address != None:
                for i in address:
                    search = i.select("span p")
                    if search != []:
                        print(search[0].text)
            if phone != None:
                for i in phone:
                    match = re.search("[+]\d{2} \d{2} \d{3} \d{2} \d{2}", i.text)
                    if match != None:
                        print(match.group())
            time.sleep(5)
            driver.get(url_new)
            try:
                driver.execute_script("window.scrollTo(0,900)")
                time.sleep(5)
                element = driver.find_element_by_link_text("E-Mail")
                info = element.get_attribute("data-mailto-token")
                print(info)
                element.click()
            except NoSuchElementException:
                pass
        list_links = []
        num_inc = num_inc + 1
        i_num = i_num + 1
        driver.close()
        """
        driver.find_element_by_css_selector("#main-content > section.CardGrid > nav > a.Button.nolabel.primary.Pagination--button.Pagination--next").click()
        time.sleep(5)
        print("This is the end of page: "+str(i_num))
        i_num = i_num + 1
        time.sleep(5)
        """
    except ElementClickInterceptedException:
        break
The email address can be obtained by decrypting the combination of data-mailto-token and data-mailto-vector values found in the button.
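As a starting point, here is a small sketch of collecting those two attributes with Selenium 4 selectors (the attribute names come from the question; the decryption of the token/vector pair is done by the site's own JavaScript and is not reproduced here):

from selenium import webdriver
from selenium.webdriver.common.by import By


def read_mailto_attributes(driver, detail_url):
    """Fetch the encrypted mailto attributes from one hotel detail page."""
    driver.get(detail_url)
    button = driver.find_element(By.CSS_SELECTOR, "[data-mailto-token]")
    token = button.get_attribute("data-mailto-token")
    vector = button.get_attribute("data-mailto-vector")
    # Turning these into a plain address requires the site's own decryption
    # routine, which is not shown here.
    return token, vector


# usage, with a detail-page URL taken from the question's list_links:
#   driver = webdriver.Chrome()
#   token, vector = read_mailto_attributes(driver, url_new)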

Extend scrapy settings value per spider

Assume we want to add a specific item pipeline for a particular spider. To comply with the DRY principle, I just want to read the current pipelines from the settings, add my specific pipeline, and set the result back on the settings for that spider.
We cannot accomplish this via the custom_settings class attribute. Even setting it via from_crawler does not work:
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    crawler.settings.setdict({'ITEM_PIPELINES':
        {**dict(crawler.settings.getdict('ITEM_PIPELINES')),
         'myscrapers.pipelines.CustomPipeline': 11}
    }, priority='spider')
    return super().from_crawler(cls, crawler, *args, **kwargs)
That causes this error:
TypeError: Trying to modify an immutable Settings object
How can we correctly extend a settings value in scrapy at spider level?
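Before the full answers below, here is a compact sketch of the merge the question describes, using the update_settings hook that the second answer relies on; it runs before the Settings object is frozen, so the immutability error does not occur (the pipeline path is taken from the question):

import scrapy


class MySpider(scrapy.Spider):
    name = "myspider"

    @classmethod
    def update_settings(cls, settings):
        # Apply custom_settings (and whatever else the base class does) first.
        super().update_settings(settings)
        # Merge instead of replace: keep the project-level pipelines, add ours.
        pipelines = dict(settings.getdict("ITEM_PIPELINES"))
        pipelines["myscrapers.pipelines.CustomPipeline"] = 11
        settings.set("ITEM_PIPELINES", pipelines, priority="spider")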
You can set the settings for the process:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spidername)
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spidername)
    process.start()
But if you really want to do all this inside the spider, you can override the update_settings method:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(cls, 'custom_settings1' if getattr(cls, 'is_pipeline_1', True) else 'custom_settings2', None) or {}, priority='spider')

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
                return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)
    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)
    process.start()
But honestly I think the first way is better...

For loop returning error when trying to loop through list with BeautifulSoup

I am trying to create a loop that will go through a list of locations, extract the necessary data for each, and append it to the data from the other locations.
I feel that the code I have written is good, but I keep getting this error:
AttributeError: 'NoneType' object has no attribute 'find_all'
but I know that shouldn't be the case.
Any help would be appreciated. Here is my code:
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests

locations = ['las-vegas-nv', 'denver-co']
for location in locations:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    url = 'https://www.apartments.com/{location}/1-bedrooms/'
    page = requests.get(url, headers=headers)
    soup = bs(page.text, 'lxml')
    table = soup.find("table", class_="rentTrendGrid")
    rows = []
    for tr in table.find_all('tr'):
        rows.append([td.text for td in tr.find_all(['th', 'td'])])
    #header_row = rows[0]
    rows = list(zip(*rows[0:]))  # transpose the table
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df['City'] = location
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
print(df)
Andrej was right, super simple: I just had to put the 'f' in front of the URL string.
import urllib.request
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import requests

locations = ['las-vegas-nv', 'denver-co']
dfs = []  # collect one DataFrame per location
for location in locations:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'
    }
    url = f'https://www.apartments.com/{location}/1-bedrooms/'
    page = requests.get(url, headers=headers)
    soup = bs(page.text, 'lxml')
    table = soup.find("table", class_="rentTrendGrid")
    rows = []
    for tr in table.find_all('tr'):
        rows.append([td.text for td in tr.find_all(['th', 'td'])])
    #header_row = rows[0]
    rows = list(zip(*rows[0:]))  # transpose the table
    df = pd.DataFrame(rows[1:], columns=rows[0])
    df['City'] = location
    dfs.append(df)
df = pd.concat(dfs).reset_index(drop=True)
print(df)
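Even with the f-string fix, the same AttributeError can come back whenever a page has no rentTrendGrid table (blocked request or changed markup). A small defensive sketch under that assumption, reusing the parsing logic from the answer above:

from typing import Optional

import pandas as pd
from bs4 import BeautifulSoup


def rent_trend_frame(soup: BeautifulSoup, location: str) -> Optional[pd.DataFrame]:
    """Return the rent-trend table as a DataFrame, or None if it is missing."""
    table = soup.find("table", class_="rentTrendGrid")
    if table is None:
        print(f"No rentTrendGrid table found for {location}")
        return None
    rows = [[cell.text for cell in tr.find_all(["th", "td"])] for tr in table.find_all("tr")]
    rows = list(zip(*rows))  # transpose, as in the answer above
    return pd.DataFrame(rows[1:], columns=rows[0])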

FinViz stock scraping giving error: AMZN not found, 'NoneType' object has no attribute 'find_next'

I am new to the BeautifulSoup package.
I am playing with code that I got from some website, and I am stuck with the above error. Please help.
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import requests


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            url = ("http://finviz.com/quote.ashx?t=" + symbol.lower())
            soup = bs(requests.get(url).content)
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
stock_list = ['AMZN', 'GOOG', 'PG', 'KO', 'IBM', 'DG', 'XOM',
              'KO', 'PEP', 'MT', 'NL', 'ALDW', 'DCM', 'GSB', 'LPL']
metric = ['P/B',
          'P/E',
          'Forward P/E'
          ]
df = pd.DataFrame(index=stock_list, columns=metric)
df = get_fundamental_data(df)
df.head()
I was able to fix my own code based on this thread.
BeautifulSoup Scraping ERROR: AttributeError: 'NoneType' object has no attribute
My code after fix:
import pandas as pd
import re
from bs4 import BeautifulSoup as bs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
           'Upgrade-Insecure-Requests': '1', 'Cookie': 'v2=1495343816.182.19.234.142', 'Accept-Encoding': 'gzip, deflate, sdch',
           'Referer': "http://finviz.com/quote.ashx?t="}


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            #url = ("http://finviz.com/quote.ashx?t=" + symbol.lower())
            r = requests.get("http://finviz.com/quote.ashx?t=" + symbol.lower(), headers=headers)
            soup = bs(r.content, 'html.parser')
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
stock_list = ['AMZN', 'GOOG', 'ABC']
metric = ['P/B',
          'P/E',
          'Forward P/E'
          ]
df = pd.DataFrame(index=stock_list, columns=metric)
df = get_fundamental_data(df)
df.head()
I ran your code and when you print(soup) in the for loop, you get this:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>403 - Forbidden: Access is denied.</title>
<style type="text/css">
<!--
body{margin:0;font-size:.7em;font-family:Verdana, Arial, Helvetica, sans-serif;background:#EEEEEE;}
fieldset{padding:0 15px 10px 15px;}
h1{font-size:2.4em;margin:0;color:#FFF;}
h2{font-size:1.7em;margin:0;color:#CC0000;}
h3{font-size:1.2em;margin:10px 0 0 0;color:#000000;}
#header{width:96%;margin:0 0 0 0;padding:6px 2% 6px 2%;font-family:"trebuchet MS", Verdana, sans-serif;color:#FFF;
background-color:#555555;}
#content{margin:0 0 0 2%;position:relative;}
.content-container{background:#FFF;width:96%;margin-top:8px;padding:10px;position:relative;}
-->
</style>
</head>
<body>
<div id="header"><h1>Server Error</h1></div>
<div id="content">
<div class="content-container"><fieldset>
<h2>403 - Forbidden: Access is denied.</h2>
<h3>You do not have permission to view this directory or page using the credentials that you supplied.</h3>
</fieldset></div>
</div>
</body>
</html>
@murali
Your code is very good. I have a question: it would be better for me if I could import the symbols from an Excel table. With your functions and the following code, I get no data. In test.xlsx I have only one column, Symbol:
import pandas as pd
from pandas_datareader import data as pdr
import os
from pandas import ExcelWriter
from bs4 import BeautifulSoup as bs
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
           'Upgrade-Insecure-Requests': '1', 'Cookie': 'v2=1495343816.182.19.234.142', 'Accept-Encoding': 'gzip, deflate, sdch',
           'Referer': "http://finviz.com/quote.ashx?t="}


def get_fundamental_data(df):
    for symbol in df.index:
        try:
            r = requests.get("http://finviz.com/quote.ashx?t=" + symbol.lower(), headers=headers)
            soup = bs(r.content, 'html.parser')
            for m in df.columns:
                df.loc[symbol, m] = fundamental_metric(soup, m)
        except Exception as e:
            print(symbol, 'not found')
            print(e)
    return df


def fundamental_metric(soup, metric):
    return soup.find(text=metric).find_next(class_='snapshot-td2').text


# Define A List Of Stocks And The Fundamental Metrics
#stock_list = ['AMZN', 'GOOG','ABC']
filein = "test.xlsx"
stocklist = pd.read_excel(filein)
metric = ['Symbol', 'Company', 'Sector', 'Market Cap', 'Shs Float', 'Insider Own', 'Market Cap',
          'Shs Float''Insider Own', 'P/B', 'P/E', 'Forward P/E', 'PEG', 'Debt/Eq', 'EPS (ttm)', 'Dividend %',
          'ROE', 'ROI', 'EPS Q/Q', 'Price', 'Prev Close', 'SMA20', 'SMA50', 'SMA200', 'Perf Half Y', 'Perf Month',
          'Perf Quarter', 'Perf Week', 'Perf YTD', 'Perf Year']
df = pd.DataFrame(data=stocklist, columns=metric)
df = get_fundamental_data(df)
df.head()
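A likely reason for the empty result, offered as a guess from the code above: get_fundamental_data iterates over df.index, but pd.DataFrame(data=stocklist, columns=metric) leaves a default integer index, so no ticker symbol is ever looked up. A minimal sketch of the adjustment (the column name Symbol is taken from the description of test.xlsx):

import pandas as pd

# Read the tickers and use them as the index, so that df.index yields symbols.
stocklist = pd.read_excel("test.xlsx")           # one column named "Symbol"
metric = ['P/B', 'P/E', 'Forward P/E']           # metric names become the columns
df = pd.DataFrame(index=stocklist['Symbol'], columns=metric)
# df = get_fundamental_data(df)                  # then scrape as before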