this is my script. It works fine, but it's not getting all the data :
import scrapy
from scrapy_splash import SplashRequest
from concurrent.futures import process
from scrapy.crawler import CrawlerProcess
from datetime import datetime
import os
if os.path.exists('jfs_ninos.csv'):
os.remove('jfs_ninos.csv')
print("The file has been deleted successfully")
else:
print("The file does not exist!")
class JfsSpider_ninos(scrapy.Spider):
name = 'jfs_ninos'
#allowed_domains=["https://www.justforsport.com.ar"]
start_urls = ["https://www.justforsport.com.ar/ninos?page=1"]
def parse(self,response):
total_products=int(int(response.css('div.vtex-search-result-3-x-totalProducts--layout.pv5.ph9.bn-ns.bt-s.b--muted-5.tc-s.tl.t-action--small span::text').get())/32) + 1
for count in range(1, total_products):
yield SplashRequest(url=f'https://www.justforsport.com.ar/ninos?page={count}',
callback=self.parse_links)
#Extrae links de cada pagina de la seccion
def parse_links(self,response):
links=response.css('a.vtex-product-summary-2-x-clearLink.vtex-product-summary-2-x-clearLink--shelf-product.h-100.flex.flex-column::attr(href)').getall()
for link in links:
yield SplashRequest(response.urljoin('https://www.justforsport.com.ar' + link), self.parse_article_detail)
def parse_article_detail(self, response):
precio0=response.css('span.vtex-product-price-1-x-currencyContainer.vtex-product-price-1-x-currencyContainer--product')[0]
yield {
'Casa':'Just_For_Sports',
'Sku' :response.css('span.vtex-product-identifier-0-x-product-identifier__value::text').get(),
'Name':response.css('span.vtex-store-components-3-x-productBrand::text').get() ,
'precio':''.join(precio0.css('span.vtex-product-price-1-x-currencyInteger.vtex-product-price-1-x-currencyInteger--product::text').getall()),
'Link':response.url,
'Date':datetime.today().strftime('%Y-%m-%d')
}
process= CrawlerProcess(
settings = {
'FEED_URI':'jfs_ninos.csv' ,
'FEED_FORMAT': 'csv',
'FEED_EXPORT_ENCODING':'utf-8',
'CONCURRENT_REQUESTS': 16,
'AUTOTHROTTLE_ENABLED': True,
'AUTOTHROTTLE_START_DELAY': 1,
'AUTOTHROTTLE_MAX_DELAY' : 1,
'USER_AGENT' : 'Googlebot/2.1 (+http://www.google.com/bot.html)'
} )
process.crawl(JfsSpider_ninos)
process.start()
i don´t understand why I'm getting about 70 items, when tehere are 158 in total...
what's wrong? I can´t find an answer. I would like to set the random user agent, but all topics related, show the way via the settings.py, but not via the process= CrawlerProcess(
settings = { }
I saw this post : How to add random user agent to scrapy spider when calling spider from script? , but via that way, I don´t know hot to export the.csv.Line :
'FEED_URI':'jfs_ninos.csv' ,'FEED_FORMAT': 'csv'
if someone could help me, it would be great!!. Thanks in advance
Related
I am trying to get within several urls of a webpage and follow the response to the next parser to grab another set of urls on a page. However, from this page I need to grab the next page urls but I wanted to try this by manipulating the page string by parsing it and then passing this as the next page. However, the scraper crawls but it returns nothing not even the output on the final parser when I load item.
Note: I know that I can grab the next page rather simply with an if-statement on the href. However, I wanted to try something different in case I had to face a situation where I would have to do this.
Here's my scraper:
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
stuff = Field()
class ZooplasSpider(scrapy.Spider):
name = 'zooplas'
start_urls = ['https://www.zoopla.co.uk/overseas/']
def start_request(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback = self.parse, )
def parse(self, response):
container = response.xpath("//ul[#class='list-inline list-unstyled']//li")
for links in container:
urls = links.xpath(".//a/#href").get()
yield response.follow(
urls, callback = self.parse_places
)
def parse_places(self, response):
container = response.xpath("//ul[#class='listing-results clearfix js-gtm-list']//li")
for links in container:
urls = links.xpath('(//div[#class="listing-results-right clearfix"]//a)[position() mod 3=1]//#href').get()
yield response.follow(
urls, callback = self.parse_listings
)
if response.xpath("//div[#id='content']//div//h1//text()").extract_first():
page_on = response.xpath("//div[#id='content']//div//h1//text()").extract_first()
name_of_page = page_on.split()[-1]
else:
pass
if response.xpath("(//div[#class='paginate bg-muted'])//a[last()-1]//href").extract_first():
url_link = response.xpath("(//div[#class='paginate bg-muted'])//a[last()-1]//href").extract_first()
url_link = url_link.split('/')
last_page = url_link[-1].split('=')[-1]
else:
pass
all_pages = []
for index, n in enumerate(url_link):
for page_name, page_num in zip(name_of_page, last_page):
if index == 5:
url_link[index] = page_name
testit='/'.join(url_link)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, page_num+1):
equal_split[another_i] = str(2)
all_pages.append('='.join(equal_split))
for urls in all_pages:
yield response.follow(
urls, callback = self.parse.places
)
def parse_listings(self, response):
loader = ItemLoader(ZooplasItem(), response=response)
loader.default.output_processor = TakeFirst()
loader.add_xpath("//article[#class='dp-sidebar-wrapper__summary']//h1//text()")
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'zoopla.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ZooplasSpider)
process.start()
I know the way of grabbing the urls works as I have tried it on a single url using the following:
url = "https://www.zoopla.co.uk/overseas/property/ireland/?new_homes=include&include_sold=false&pn=16"
list_of_stuff = ['Ireland', 'Germany','France']
pages_of_stuff = [5, 7, 6]
test = []
all_pages = []
j=0
for index, n in enumerate(a):
for l_stuff, p_stuff in zip(list_of_stuff,pages_of_stuff):
if index == 5:
a[index] = l_stuff
testit='/'.join(a)
equal_split = testit.split('=')
for another_i, n2 in enumerate(equal_split):
if another_i == 3:
for range_val in range(1, p_stuff+1):
equal_split[another_i] = str(range_val)
print('='.join(equal_split))
Which is the same as the one used above just a change of variables. this outputs the following links and they work:
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=2
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=3
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=4
https://www.zoopla.co.uk/overseas/property/Ireland/?new_homes=include&include_sold=false&pn=5
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=1
https://www.zoopla.co.uk/overseas/property/Germany/?new_homes=include&include_sold=false&pn=2
...
Your use case is suited for using scrapy crawl spider. You can write rules on how to extract links to the properties and how to extract links to the next pages. I have changed your code to use a crawl spider class and I have changed your FEEDS settings to use the recommended settings. FEED_URI and FEED_FORMAT are deprecated in newer versions of scrapy.
Read more about the crawl spider from the docs
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.loader import ItemLoader
class ZooplasItem(scrapy.Item):
stuff = Field()
country = Field()
class ZooplasSpider(CrawlSpider):
name = 'zooplas'
allowed_domains = ['zoopla.co.uk']
start_urls = ['https://www.zoopla.co.uk/overseas/']
rules = (
Rule(LinkExtractor(restrict_css='a.link-novisit'), follow=True), # follow the countries links
Rule(LinkExtractor(restrict_css='div.paginate'), follow=True), # follow pagination links
Rule(LinkExtractor(restrict_xpaths="//a[contains(#class,'listing-result')]"), callback='parse_item', follow=True), # follow the link to actual property listing
)
def parse_item(self, response):
# here you are on the details page for each property
loader = ItemLoader(ZooplasItem(), response=response)
loader.default_output_processor = TakeFirst()
loader.add_xpath("stuff", "//article[#class='dp-sidebar-wrapper__summary']//h1//text()")
loader.add_xpath("country","//li[#class='ui-breadcrumbs__item'][3]/a/text()")
yield loader.load_item()
if __name__ == '__main__':
process = CrawlerProcess(
settings = {
'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36',
'FEEDS': {
'zoopla.jl': {
'format': 'jsonlines'
}
}
}
)
process.crawl(ZooplasSpider)
process.start()
Can anyone help me? I'm practicing and I can't understand what I did wrong on pagination! It only returns the first page to me and sometimes an error comes up. When it works, it just returns the first page.
"The source list for the Content Security Policy directive 'frame-src' contains an invalid source '*trackcmp.net' It will be ignored", source: https://naturaldaterra.com.br/hortifruti.html?page=2"
import scrapy
from scrapy_selenium import SeleniumRequest
class ComputerdealsSpider(scrapy.Spider):
name = 'produtos'
def start_requests(self):
yield SeleniumRequest(
url='https://naturaldaterra.com.br/hortifruti.html?page=1',
wait_time=3,
callback=self.parse
)
def parse(self, response):
for produto in response.xpath("//div[#class='gallery-items-1IC']/div"):
yield {
'nome_produto': produto.xpath(".//div[#class='item-nameContainer-1kz']/span/text()").get(),
'valor_produto': produto.xpath(".//span[#class='itemPrice-price-1R-']/text()").getall(),
}
next_page = response.xpath("//button[#class='tile-root-1uO'][1]/text()").get()
if next_page:
absolute_url = f"https://naturaldaterra.com.br/hortifruti.html?page={next_page}"
yield SeleniumRequest(
url=absolute_url,
wait_time=3,
callback=self.parse
)
The problem is that your xpath selector returns None instead of the next page number. Consider changing it from
next_page = response.xpath("//button[#class='tile-root-1uO'][1]/text()").get()
to
next_page = response.xpath("//button[#class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
For your future projects consider using scrapy-playwright to scrape js rendered websites. It is faster and simple to use. See a sample implementation of your scraper using scrapy-playwright
import scrapy
from scrapy.crawler import CrawlerProcess
class ComputerdealsSpider(scrapy.Spider):
name = 'produtos'
def start_requests(self):
yield scrapy.Request(
url='https://naturaldaterra.com.br/hortifruti.html?page=1',
meta={"playwright": True}
)
def parse(self, response):
for produto in response.xpath("//div[#class='gallery-items-1IC']/div"):
yield {
'nome_produto': produto.xpath(".//div[#class='item-nameContainer-1kz']/span/text()").get(),
'valor_produto': produto.xpath(".//span[#class='itemPrice-price-1R-']/text()").getall(),
}
# scrape next page
next_page = response.xpath(
"//button[#class='tile-root_active-TUl tile-root-1uO']/following-sibling::button[1]/text()").get()
yield scrapy.Request(
url='https://naturaldaterra.com.br/hortifruti.html?page=' + next_page,
meta={"playwright": True}
)
if __name__ == "__main__":
process = CrawlerProcess(settings={
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}, })
process.crawl(ComputerdealsSpider)
process.start()
I'm trying to make a crawler that will crawl an entire website and output a list of all the domains that the said websites links to (without duplicates).
I have come up with the following code :
import scrapy
from crawler.items import CrawlerItem
from crawler.functions import urlToDomain
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class domainSpider(CrawlSpider):
global allDomains
allDomains = []
name = "domainSpider"
allowed_domains = ["example.com"]
start_urls = [
"https://example.com/"
]
rules = (
Rule(LinkExtractor(), callback='parse', follow=True),
)
def parse(self, response):
urls = response.xpath("//a/#href").extract()
# formating all URL formats to the same one (https://url.com)
urlsOk = []
for elt in urls :
if elt [:2] == "//" : # link is external, append http
urlsOk.append(elt)
elif elt[:4] == "http" :
urlsOk.append(elt)
domaines = list(set([urlToDomain(x) for x in urlsOk]))
item = CrawlerItem()
item["domaines"] = []
item["url"] = response.url
for elt in domaines:
if elt not in allDomains :
item['domaines'].append(elt)
allDomains.append(elt)
yield({
'domaines':elt
})
This is workinge exactly like expected in retrieving the domains, but it stops crawling (finished) after the very first page.
I was overriding a built-in CrawlSpider method (parse) which caused the bug...
The solution was to change the callback method's name from parse to anything else.
Spider code is wrong. I created a demon project, but it does not work, kindly check my vs code shot cut & I have no idea for all my spider code and problems.
import scrapy
class EmailSpider(scrapy.Spider):
name='Email'
start_url = [
'http://jsjy.114chn.com/'
]
def parse(self,response):
for Email in response.xpath("//span[#id='lblEmail']"):
yiel{
'email_text': Email.xpath(".//span[#id='lblEmail_text']/p").extract_first()
}
next_page= response.xpath("//li[#class='next']/a/#href").extract_first()
if next_page is not None:
next_page_link= response.urljoin(next_page)
yield scrapy.Request(url=next_page_link, callback=self.parse)
You have problems with indentation and yield function. Also made some code-style corrections:
import scrapy
class EmailSpider(scrapy.Spider):
name = 'Email'
start_url = ['http://jsjy.114chn.com/']
def parse(self, response):
for email in response.xpath("//span[#id='lblEmail']"):
yield {
'email_text': email.xpath(".//span[#id='lblEmail_text']/p").get()
}
next_page = response.xpath("//li[#class='next']/a/#href").get()
if next_page:
yield scrapy.Request(response.urljoin(next_page))
But since you don't have any #lblEmail elements on page, this spider will not output anything.
I have a program to download photos on various websites. Each url is formed at the end of the address by codes, which are accessed in a dataframe. In a dataframe of 8,583 lines
The sites have javascript, so I use selenium to access the src of the photos. And I download it with urllib.request.urlretrieve
Example of a photo site: http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/PB/150000608817
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import urllib.request, urllib.parse, urllib.error
# Root URL of the site that is accessed to fetch the photo link
url_raiz = 'http://divulgacandcontas.tse.jus.br/divulga/#/candidato/2018/2022802018/'
# Accesses the dataframe that has the "sequencial" type codes
candidatos = pd.read_excel('candidatos_2018.xlsx',sheet_name='Sheet1', converters={'sequencial': lambda x: str(x), 'cpf': lambda x: str(x),'numero_urna': lambda x: str(x)})
# Function that opens each page and takes the link from the photo
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
time.sleep(10)
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
return link
# Function that downloads the photo and saves it with the code name "cpf"
def baixa_foto(nome, url):
urllib.request.urlretrieve(url, nome)
# Iteration in the dataframe
for num, row in candidatos.iterrows():
cpf = (row['cpf']).strip()
uf = (row['uf']).strip()
print(cpf)
print("-/-")
sequencial = (row['sequencial']).strip()
# Creates full page address
url = url_raiz + uf + '/' + sequencial
link_foto = pegalink(url)
baixa_foto(cpf, link_foto)
Please I look guidance for:
Put a try-Exception type to wait for the page to load (I'm having errors reading the src - after many hits the site takes more than ten seconds to load)
And I would like to record all possible errors - in a file or dataframe - to write down the "sequencial" code that gave error and continue the program
Would anyone know how to do it? The guidelines below were very useful, but I was unable to move forward
I put in a folder a part of the data I use and the program, if you want to look: https://drive.google.com/drive/folders/1lAnODBgC5ZUDINzGWMcvXKTzU7tVZXsj?usp=sharing
put your code within :
try:
WebDriverWait(browser, 30).until(wait_for(page_has_loaded))
# here goes your code
except: Exception
print "This is an unexpected condition!"
For waitForPageToLoad :
def page_has_loaded():
page_state = browser.execute_script(
'return document.readyState;'
)
return page_state == 'complete'
30 above is time in seconds. You can adjust it as per your need.
Approach 2 :
class wait_for_page_load(object):
def __init__(self, browser):
self.browser = browser
def __enter__(self):
self.old_page = self.browser.find_element_by_tag_name('html')
def page_has_loaded(self):
new_page = self.browser.find_element_by_tag_name('html')
return new_page.id != self.old_page.id
def __exit__(self, *_):
wait_for(self.page_has_loaded)
def pegalink(url):
profile = webdriver.FirefoxProfile()
browser = webdriver.Firefox(profile)
browser.get(url)
try:
with wait_for_page_load(browser):
html = browser.page_source
soup = BeautifulSoup(html, "html.parser")
browser.close()
link = soup.find("img", {"class": "img-thumbnail img-responsive dvg-cand-foto"})['src']
except Exception:
print ("This is an unexpected condition!")
print("Erro em: ", url)
link = "Erro"
return link