import cookies from firefox to scrapy - selenium

i used selenium firefox to successfully get a specific API and saved the session cookies using pickle, where i am stuck at now is loading the cookies to scrapy spider to get 200 request status.
below is the unsuccessful approach i used :
import scrapy
import os
import json
import pickle
class ProductsSpider(scrapy.Spider):
name = "Products"
start_urls = ["https://www.woolworths.com.au/apis/ui/products/305224,221667,305223,317793,341058,201689,221228,230414,201688,221029"]
params = {"excludeUnavailable": "true", "source":"RR-Best Sellers"}
#with open("./woolworths.json", 'r') as inputfile:
# cookie = json.load(inputfile)
with open("./woolworths.pkl", 'rb') as f:
cookies = pickle.load(f)
def start_requests(self):
url = self.start_urls[0]
yield scrapy.Request(url=url, cookies=self.cookies, meta=self.params, callback=self.parse)
def parse(self, response):
data = response.json()
for a in data:
print(a)
yield a['Name']

Related

Exporting the results from multiple spiders

I wanted to set up multiple spiders and run these, where URL_LINK contains links to the main url, and DATA_LINK are urls embedded in the main urls. The example I am using does not represent this case as I am using quotes urls, however this is the purpose of the set-up. I then wanted to crawl the spiders and store these results. However, I am unsure on how I can call the spider to crawl it because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
URL_LIST = ['https://quotes.toscrape.com/tag/love/',
'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
'https://quotes.toscrape.com/tag/humor/']
def store_url(*args, **kwargs):
URL_LIST.append(kwargs['item'])
def store_data(*args, **kwargs):
DATA_LIST.append(kwargs['item'])
class QuotesSpiderWebsiteA(scrapy.Spider):
name='val'
start_urls = URL_LIST
custom_settings = {'FEEDS':{
'quotes.jl':{
'format':'jsonlines'
}
}}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
self.parse
)
def parse(self, response):
content = response.xpath('//div[#class = "row"]')
for items in content:
yield {
'some_items_links':items.xpath(".//a//#href").get(),
}
class QuotesSpiderWebsiteB(scrapy.Spider):
name='valb'
start_urls = DATA_LIST
custom_settings = {'FEEDS':{
'quotes.jl':{
'format':'jsonlines'
}
}}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
self.parse
)
def parse(self, response):
content = response.xpath('//div[#class = "row"]')
for items in content:
yield {
'some_items_links':items.xpath(".//a//#href").get(),
}
if __name__ == '__main__':
configure_logging()
runner = CrawlerRunner()
#defer.inlineCallbacks
def crawl():
crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
crawler1.signals.connect(store_url, signals.item_scraped)
crawler2.signals.connect(store_data, signals.item_scraped)
yield runner.crawl(crawler1)
yield runner.crawl(crawler2)
reactor.stop()

Scrapy Selenium: Why pagination is not working for scrapy-selenium?

I am trying to get data using scrapy-selenium but there is some issue with the pagination. I have tried my level best to use different selectors and methods but nothing changes. It can only able to scrape the 1st page. I have also checked the other solutions but still, I am unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
class Basic2Spider(scrapy.Spider):
name = 'basic2'
def start_requests(self):
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.set_window_size(1920, 1080)
driver.get("https://www.gumtree.com/property-for-sale/london")
time.sleep(2)
property_xpath = driver.find_elements(By.XPATH, "(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
href= detail.get_attribute('href')
time.sleep(2)
yield SeleniumRequest(
url = href,
)
driver.quit()
return super().start_requests()
def parse(self, response):
yield {
'Title': response.xpath("//div[#class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
'Price': response.xpath("//h3[#itemprop='price']/text()").get(),
'Add Posted': response.xpath("//*[#id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
'Links': response.url
}
next_page = response.xpath("//li[#class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
if next_page:
abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
yield SeleniumRequest(
url= abs_url,
wait_time=5,
callback=self.parse
)
Your code seem to be correct but getting tcp ip block. I also tried alternative way where code is correct and pagination is working and this type of pagination is two times faster than others but gives me sometimes strange result and sometimes getting ip block.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest
class Basic2Spider(scrapy.Spider):
name = 'basic2'
responses = []
def start_requests(self):
url='https://www.gumtree.com/property-for-sale/london/page{page}'
for page in range(1,6):
print(page)
yield SeleniumRequest(
url=url.format(page=page),
callback=self.parse,
wait_time=5
)
def parse(self, response):
driver = response.meta['driver']
intial_page = driver.page_source
self.responses.append(intial_page)
for resp in self.responses:
r = Selector(text=resp)
property_xpath = r.xpath("(//article[#class='listing-maxi']/a)[position()>=2 and position()<30]")
for detail in property_xpath:
yield {
'Title': detail.xpath('.//*[#class="listing-title"]/text()').get().strip(),
'Price': detail.xpath('.//*[#class="listing-price"]/strong/text()').get(),
'Add Posted': detail.xpath('.//*[#class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
'Links': response.url
}

Passing authenticated session from selenium to scrapy

I am trying to login in a website using selenium then pass the authenticated session to scrapy to extract stuff.
The issue is that after I pass the session to scrapy I am still not logged in.
class LoginSpider(scrapy.Spider):
name = 'login'
allowed_domains = ['*****']
start_urls = ['*****']
def __init__(self):
self.driver = webdriver.Firefox()
def start_requests(self):
# driver = webdriver.Firefox()
self.driver.get('*****')
time.sleep(5)
portalButton = self.driver.find_element_by_xpath('//*[#id="fb_submit"]')
portalButton.click()
time.sleep(2)
self.driver.find_element_by_xpath('//*[#id="email"]').send_keys('******')
self.driver.find_element_by_xpath('//*[#id="password"]').send_keys('******')
self.driver.find_element_by_xpath('//*[#id="btn-login"]').click()
time.sleep(5)
for cookie in self.driver.get_cookies():
c = {cookie['name']: cookie['value']}
yield Request(url="****",cookies=c,callback=self.parse)
def parse(self,response):
# self.log("->>>>>>>>>>>>")
open_in_browser(response)
# view(response)
self.log("->>>>>>>>>>>>")
I would suggest changing that step a bit:
for cookie in self.driver.get_cookies():
c = {cookie['name']: cookie['value']}
to something like that:
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url="****",cookies=_cookies,callback=self.parse)
in each iteration you re-create c with new {cookie['name']: cookie['value']}
my code examples:
import time
import scrapy
from scrapy import Request
from scrapy.utils.response import open_in_browser
from selenium import webdriver
from selenium.webdriver.common.by import By
class LoginSpider(scrapy.Spider):
name = 'login'
start_urls = ['URL']
def __init__(self):
super().__init__()
self.driver = webdriver.Chrome()
def start_requests(self):
self.driver.get('URL')
time.sleep(5)
self.driver.find_element(By.ID, ('email')).send_keys('EMAIL')
self.driver.find_element(By.ID, ('passwd')).send_keys('PASSWORD')
self.driver.find_element(By.ID, ('SubmitLogin')).click()
_cookies = {cookie['name']: cookie['value'] for cookie in self.driver.get_cookies()}
yield Request(url='URL',
cookies=_cookies,
callback=self.parse)
self.driver.quit()
def parse(self, response, **kwargs):
open_in_browser(response)
self.log(response)

How to crawl multiple URLs from csv using Selenium and Scrapy

I am currently trying to crawl multiple sites from https://blogabet.com/
At the moment, I have a "ursl.txt"-file which inlcudes two URLs: 1. http://sabobic.blogabet.com 2. http://dedi22.blogabet.com
The problem I have is the following: Selenium opens each of the two URLs one after the other in the same Tab. Thereby, it is just crawling the content of the second ULR in my "ursl.txt"-file twice. It is not crawling any content from the first URL.
I think there is a problem with the for-loop and how the "parse_tip"-function is called. This is my code:
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import re
import csv
from time import sleep
class AlltipsSpider(Spider):
name = 'alltips'
allowed_domains = ['blogabet.com']
# We are not using the response parameter in this function because the start urls are not defined
# Our class Spider is searching for the function start_requests by default
# Request has to returned or yield
def start_requests(self):
self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
with open("urls.txt", "rt") as f:
start_urls = [url.strip() for url in f.readlines()]
for url in start_urls:
self.driver.get(url)
self.driver.find_element_by_id('currentTab').click()
sleep(3)
self.logger.info('Sleeping for 5 sec.')
self.driver.find_element_by_xpath('//*[#id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
sleep(7)
self.logger.info('Sleeping for 7 sec.')
yield Request(url, callback=self.parse_tip)
def parse_tip(self, response):
sel = Selector(text=self.driver.page_source)
allposts = sel.xpath('//*[#class="block media _feedPick feed-pick"]')
for post in allposts:
username = post.xpath('.//div[#class="col-sm-7 col-lg-6 no-padding"]/a/#title').extract()
publish_date = post.xpath('.//*[#class="bet-age text-muted"]/text()').extract()
yield{'Username': username,
'Publish date': publish_date
}
Why are you doing another request yield Request(url, callback=self.parse_tip) when you already have a response from Selenium.
Just pass that response text to parse_tip and use text inside that
class AlltipsSpider(Spider):
name = 'alltips'
allowed_domains = ['blogabet.com']
def start_requests(self):
self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
with open("urls.txt", "rt") as f:
start_urls = [url.strip() for url in f.readlines()]
for url in start_urls:
self.driver.get(url)
self.driver.find_element_by_id('currentTab').click()
sleep(3)
self.logger.info('Sleeping for 5 sec.')
self.driver.find_element_by_xpath('//*[#id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
sleep(7)
self.logger.info('Sleeping for 7 sec.')
for item in self.parse_tip(text= self.driver.page_source):
yield item
def parse_tip(self, text):
sel = Selector(text=text)
allposts = sel.xpath('//*[#class="block media _feedPick feed-pick"]')
for post in allposts:
username = post.xpath('.//div[#class="col-sm-7 col-lg-6 no-padding"]/a/#title').extract()
publish_date = post.xpath('.//*[#class="bet-age text-muted"]/text()').extract()
yield{'Username': username,
'Publish date': publish_date
}

scrapy list of links from parse result

Here's my current code:
#scrap all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector
class DengaSpider(scrapy.Spider):
name = 'cafes'
allowed_domains = ['example.com']
start_urls = [
'http://example.com/archives/8136.html',
]
cafeOnlyLink = []
def parse(self, response):
cafelink = response.xpath('//li/a[contains(#href, "archives")]/#href').extract()
twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
cafeOnlyLink = [ s for s in cafelink if twoHourRegex.match(s) ]
So how should I continue to parse content from each url containing in the [cafeOnlyLink] list? and I want to save all the result from each page in a csv file.
You can use something like this:
for url in cafeOnlyLink:
yield scrapy.Request(url=url, callback=self.parse_save_to_csv)
def parse_save_to_csv(self, response):
# The content is in response.body, so you have to select what information
# you want to sent to the csv file.