Exporting the results from multiple spiders - scrapy

I want to set up multiple spiders and run them, where URL_LIST contains the main URLs and DATA_LIST contains URLs embedded in those main pages. The example below does not represent this case exactly, because I am using the quotes URLs, but that is the purpose of the set-up. I then want to crawl with both spiders and store their results. However, I am unsure how to start the crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
# Pages crawled by the spider named 'val'.
URL_LIST = [
    'https://quotes.toscrape.com/tag/love/',
    'https://quotes.toscrape.com/tag/inspirational/',
]

# Pages crawled by the spider named 'valb'.
DATA_LIST = [
    'https://quotes.toscrape.com/tag/life/',
    'https://quotes.toscrape.com/tag/humor/',
]
def store_url(*args, **kwargs):
    """Signal receiver that appends the scraped item to ``URL_LIST``.

    The item is read from the ``item`` keyword argument; positional
    arguments are accepted and ignored.

    Raises:
        KeyError: if no ``item`` keyword argument is supplied.
    """
    URL_LIST.append(kwargs['item'])
def store_data(*args, **kwargs):
    """Signal receiver that appends the scraped item to ``DATA_LIST``.

    The item is read from the ``item`` keyword argument; positional
    arguments are accepted and ignored.

    Raises:
        KeyError: if no ``item`` keyword argument is supplied.
    """
    DATA_LIST.append(kwargs['item'])
class QuotesSpiderWebsiteA(scrapy.Spider):
    """Spider ``val``: request every URL in ``URL_LIST`` and emit one link per row.

    Items are exported as JSON lines to ``quotes.jl`` through the ``FEEDS``
    setting.
    """

    name = 'val'
    # Same list object as the module-level URL_LIST (not a copy).
    start_urls = URL_LIST
    custom_settings = {
        'FEEDS': {
            'quotes.jl': {
                'format': 'jsonlines',
            },
        },
    }

    def start_requests(self):
        """Yield one request per start URL, handled by :meth:`parse`."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Yield ``{'some_items_links': href}`` for each ``div`` whose class is ``row``.

        ``href`` is the first ``href`` attribute found under an ``a`` element
        inside the row, or ``None`` when the row contains none.
        """
        # XPath selects attributes with '@'; '#class' is not valid XPath.
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
class QuotesSpiderWebsiteB(scrapy.Spider):
    """Spider ``valb``: request every URL in ``DATA_LIST`` and emit one link per row.

    Items are exported as JSON lines to ``quotes.jl`` through the ``FEEDS``
    setting.
    """

    name = 'valb'
    # Same list object as the module-level DATA_LIST (not a copy).
    start_urls = DATA_LIST
    custom_settings = {
        'FEEDS': {
            'quotes.jl': {
                'format': 'jsonlines',
            },
        },
    }

    def start_requests(self):
        """Yield one request per start URL, handled by :meth:`parse`."""
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def parse(self, response):
        """Yield ``{'some_items_links': href}`` for each ``div`` whose class is ``row``.

        ``href`` is the first ``href`` attribute found under an ``a`` element
        inside the row, or ``None`` when the row contains none.
        """
        # XPath selects attributes with '@'; '#class' is not valid XPath.
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    # inlineCallbacks lets each ``yield`` wait for the Deferred returned by
    # runner.crawl(), so the two spiders run one after the other.
    @defer.inlineCallbacks
    def crawl():
        """Run both spiders sequentially, then stop the reactor."""
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        # Collect every scraped item of each crawler in its module-level list.
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()

    # Schedule the crawls, then start the event loop; run() blocks until
    # crawl() calls reactor.stop().
    crawl()
    reactor.run()

Related

import cookies from firefox to scrapy

I used Selenium with Firefox to successfully call a specific API and saved the session cookies using pickle. Where I am stuck now is loading those cookies into a Scrapy spider so that the request returns a 200 status.
Below is the unsuccessful approach I used:
import scrapy
import os
import json
import pickle
class ProductsSpider(scrapy.Spider):
    """Spider ``Products``: request a products API URL with cookies loaded from a pickle file."""

    name = "Products"
    start_urls = ["https://www.woolworths.com.au/apis/ui/products/305224,221667,305223,317793,341058,201689,221228,230414,201688,221029"]
    params = {"excludeUnavailable": "true", "source": "RR-Best Sellers"}

    # with open("./woolworths.json", 'r') as inputfile:
    #     cookie = json.load(inputfile)

    # Executed once, when the class body runs; ``cookies`` becomes a class attribute.
    # NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
    with open("./woolworths.pkl", 'rb') as f:
        cookies = pickle.load(f)

    def start_requests(self):
        """Yield a single request for the first start URL, carrying the loaded cookies."""
        url = self.start_urls[0]
        # NOTE(review): ``meta`` is per-request metadata kept inside Scrapy; if
        # ``params`` is meant to be the query string it has to be encoded into
        # the URL instead - confirm the intent.
        yield scrapy.Request(url=url, cookies=self.cookies, meta=self.params, callback=self.parse)

    def parse(self, response):
        """Print every entry of the JSON response and yield its ``Name`` as an item."""
        data = response.json()
        for a in data:
            print(a)
            # A callback must yield items (dict/Item), Requests or None;
            # a bare string is rejected by Scrapy, so wrap it in a dict.
            yield {'Name': a['Name']}

Passing authenticated session from selenium to scrapy

I am trying to log in to a website using Selenium and then pass the authenticated session to Scrapy to extract data.
The issue is that after I pass the session to scrapy I am still not logged in.
class LoginSpider(scrapy.Spider):
    """Spider ``login``: log in through a Selenium-driven Firefox, then issue a Scrapy request."""

    name = 'login'
    allowed_domains = ['*****']
    start_urls = ['*****']

    def __init__(self):
        self.driver = webdriver.Firefox()

    def start_requests(self):
        """Fill in the login form with Selenium and yield one request with a browser cookie."""
        # driver = webdriver.Firefox()
        self.driver.get('*****')
        time.sleep(5)
        # XPath selects attributes with '@'; '#id' is not valid XPath.
        portalButton = self.driver.find_element_by_xpath('//*[@id="fb_submit"]')
        portalButton.click()
        time.sleep(2)
        self.driver.find_element_by_xpath('//*[@id="email"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="password"]').send_keys('******')
        self.driver.find_element_by_xpath('//*[@id="btn-login"]').click()
        time.sleep(5)
        for cookie in self.driver.get_cookies():
            # NOTE: ``c`` is rebound on every iteration, so after the loop it
            # holds only the last cookie - this is the behaviour the question
            # is asking about.
            c = {cookie['name']: cookie['value']}
        yield Request(url="****", cookies=c, callback=self.parse)

    def parse(self, response):
        """Open the response in a browser and write a marker line to the log."""
        # self.log("->>>>>>>>>>>>")
        open_in_browser(response)
        # view(response)
        self.log("->>>>>>>>>>>>")
I would suggest changing that step a bit:
for cookie in self.driver.get_cookies():
c = {cookie['name']: cookie['value']}
to something like that:
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url="****",cookies=_cookies,callback=self.parse)
In each iteration you re-create c with a new {cookie['name']: cookie['value']}, so only the last cookie is kept.
my code examples:
import time
import scrapy
from scrapy import Request
from scrapy.utils.response import open_in_browser
from selenium import webdriver
from selenium.webdriver.common.by import By
class LoginSpider(scrapy.Spider):
    """Spider ``login``: log in through a Selenium-driven Chrome and reuse all its cookies in Scrapy."""

    name = 'login'
    start_urls = ['URL']

    def __init__(self):
        super().__init__()
        self.driver = webdriver.Chrome()

    def start_requests(self):
        """Submit the login form with Selenium, then yield one request carrying every browser cookie."""
        self.driver.get('URL')
        time.sleep(5)
        self.driver.find_element(By.ID, 'email').send_keys('EMAIL')
        self.driver.find_element(By.ID, 'passwd').send_keys('PASSWORD')
        self.driver.find_element(By.ID, 'SubmitLogin').click()
        # Collect all cookies into one dict so the whole session is sent along.
        _cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
        yield Request(url='URL',
                      cookies=_cookies,
                      callback=self.parse)
        # Reached when the generator is resumed after the request was taken.
        self.driver.quit()

    def parse(self, response, **kwargs):
        """Open the response in a browser and log it."""
        open_in_browser(response)
        self.log(response)

Scrapy getting coincident (duplicated) data

import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider ``quotes``: collect review fields from a TripAdvisor restaurant page and follow pagination."""

    name = "quotes"
    start_urls = [
        'https://www.tripadvisor.com/Restaurant_Review-g298006-d740275-Reviews-Deniz_Restaurant-Izmir_Izmir_Province_Turkish_Aegean_Coast.html',
    ]

    def parse(self, response):
        """Yield one dict per ``div.quote`` element, then follow the "next" link if there is one."""
        for quote in response.css('div.quote'):
            # NOTE(review): every query below starts from ``response`` with an
            # absolute '//' path and ``quote`` itself is never used, so each
            # iteration yields the same page-wide lists.
            # XPath selects attributes with '@'; '#class' is not valid XPath.
            yield {
                'author': response.xpath("//div[contains(@class, 'member_info')]//div/text()").extract(),
                'rating': response.xpath("//span[contains(@class,'ui_bubble_rating')]/@alt").extract(),
                'comment_tag': response.xpath("//span[contains(@class, 'noQuotes')]/text()").extract(),
                'comment': response.xpath('//div[@class="entry"]/p/text()').extract(),
            }
        next_page = response.xpath("//div[contains(@class, 'unified')]/a[contains(@class, 'next')]/@href").extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
This is my spider code. My problem is that when I run the crawl with JSON output, there is a lot of repeated data from the website.

How to crawl multiple URLs from csv using Selenium and Scrapy

I am currently trying to crawl multiple sites from https://blogabet.com/
At the moment, I have a "urls.txt" file which includes two URLs: 1. http://sabobic.blogabet.com 2. http://dedi22.blogabet.com
The problem I have is the following: Selenium opens each of the two URLs one after the other in the same tab. As a result, it crawls the content of the second URL in my "urls.txt" file twice and does not crawl any content from the first URL.
I think there is a problem with the for-loop and how the "parse_tip"-function is called. This is my code:
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import re
import csv
from time import sleep
class AlltipsSpider(Spider):
    """Spider ``alltips``: open each URL from ``urls.txt`` in Chrome and scrape the feed posts."""

    name = 'alltips'
    allowed_domains = ['blogabet.com']

    # No start_urls are defined, so the work begins in start_requests, which
    # Scrapy looks for by default. It has to return or yield Requests.
    def start_requests(self):
        """Drive Chrome through every URL in ``urls.txt`` and yield a request for each."""
        # Raw string: '\w' and '\c' are not valid escape sequences.
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [url.strip() for url in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            # XPath selects attributes with '@'; '#id' is not valid XPath.
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            # NOTE: parse_tip reads self.driver.page_source when the callback
            # runs, not the page that was loaded when this request was yielded.
            yield Request(url, callback=self.parse_tip)

    def parse_tip(self, response):
        """Yield username and publish date of every post in the driver's current page source."""
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {
                'Username': username,
                'Publish date': publish_date,
            }
Why are you making another request with yield Request(url, callback=self.parse_tip) when you already have the page from Selenium?
Just pass that page source to parse_tip and parse the text there:
class AlltipsSpider(Spider):
    """Spider ``alltips``: open each URL from ``urls.txt`` in Chrome and parse the rendered page directly."""

    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        """Drive Chrome through every URL in ``urls.txt`` and yield the posts found on each page."""
        # Raw string: '\w' and '\c' are not valid escape sequences.
        self.driver = webdriver.Chrome(r'C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [url.strip() for url in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            # XPath selects attributes with '@'; '#id' is not valid XPath.
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            # Parse the page while it is still the one loaded in the browser.
            # NOTE(review): start_requests is documented as yielding Request
            # objects; confirm the Scrapy version in use accepts items here.
            yield from self.parse_tip(text=self.driver.page_source)

    def parse_tip(self, text):
        """Yield username and publish date of every post found in the HTML ``text``."""
        sel = Selector(text=text)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {
                'Username': username,
                'Publish date': publish_date,
            }

Scrapy won't get results from first page

here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
    """Crawl spider ``vrisko``: follow ``?page=N`` links and extract names and addresses."""

    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        # Raw string so the regex escapes \? and \d are passed through unchanged.
        Rule(SgmlLinkExtractor(allow=(r'\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        """Return a ``VriskoItem`` holding every name and address found on the page."""
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        # XPath selects attributes with '@'; '#itemprop' is not valid XPath.
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        # Parenthesised single argument: prints the same text on Python 2 and 3.
        print(' '.join(vriskoit['eponimia']).join(vriskoit['address']))
        return vriskoit
The pages i try to crawl have the format http://www.blabla.com/blabla/bla?page=x
where x = any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why does this happen ?
Thank you in advance!
If you look at the Scrapy documentation, the responses for start_urls are passed to the **parse** method,
so you can change your rule like this:
rules = (
    # Raw string so the regex escapes \? and \d are passed through unchanged.
    Rule(SgmlLinkExtractor(allow=(r'\?page=\d')), callback='parse'),
)
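and rename the method from def parse_vrisko(self, response): to def parse(self, response): (note that the Scrapy documentation warns against using parse as a rule callback in a CrawlSpider, because CrawlSpider implements its own logic in parse; overriding parse_start_url is the documented hook for handling the start_urls responses),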
or you can remove start_urls and start your spider with def start_requests(self): with callback to parse_vrisko