How to add multiple variables in one string (URL) - scrapy

My spider starts off with the start_urls, being:
https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
Based on a keywords.csv file located in my resources folder, the keywordsID (the number 20035386) changes. Once the number changes, the spider fetches the data for another product.
I also have a chunk of code that checks each page for isTruncated = true; if that's the case, it increments the page number in the URL by 1. The only problem I have right now is that I don't know how to put a second variable into one string (URL). When isTruncated = true, the code needs to adjust both the URL's page number AND its keywordsID. Currently, I have only managed to add a variable for the page number.
Currently the chunk of code is:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
However, it should become something like:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/ {keywordsid} ?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
When I run the spider, it will crawl all the pages of the product with keywordsID 20035386, but it will only crawl the first page of all the other products listed in the keywords.csv file.
FULL CODE
./krc/spiders/krc_spider.py
# -*- coding: utf-8 -*-
import scrapy
from krc.items import KrcItem
import json
import os
import csv
import time
import datetime


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def start_requests(self):
        """Read keywords from keywords file and construct the search URL"""
        with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
            for keyword in csv.DictReader(search_keywords):
                search_text = keyword["keyword"]
                category = keyword["keywordtype"]
                url = "https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)
                # The meta is used to send our search text into the parser as metadata
                yield scrapy.Request(url, callback=self.parse, meta={"search_text": search_text, "category": category})

    def parse(self, response):
        category = response.meta["category"]
        current_page = response.meta.get("page", 1)
        next_page = current_page + 1

        # Printing the timestamp when fetching the data, using the default timezone of the requesting machine
        ts = time.time()
        timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')

        # Defining the items
        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["category"] = category
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"].replace("\u20ac", "").strip()
            item["timestamp"] = timestamp
            yield item

        # Checking whether "isTruncated" is true (boolean); if so, the next page will be triggered
        if data["isTruncated"]:
            yield scrapy.Request(
                url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
                callback=self.parse,
                meta={'page': next_page, "category": category},
            )
./krc/resources/keywords.csv
keyword,keywordtype
20035386,Hogedrukreiniger
20035424,Window Vacs
Current Output
When I run the spider, it fetches the data from all the pages of the product with keywordsID 20035386. For all the other products with a different keywordsID, only the data from the first page is fetched.

Use response.meta for this:
def start_requests(self):
    """Read keywords from keywords file and construct the search URL"""
    with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
        for keyword in csv.DictReader(search_keywords):
            product_id = keyword["keyword"]
            category = keyword["keywordtype"]
            url = "https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(product_id)
            # The meta is used to send our search text into the parser as metadata
            yield scrapy.Request(url, callback=self.parse, meta={"category": category, "product_id": product_id})

def parse(self, response):
    category = response.meta["category"]
    product_id = response.meta["product_id"]
    current_page = response.meta.get("page", 1)
    next_page = current_page + 1

    # Printing the timestamp when fetching the data, using the default timezone of the requesting machine
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')

    # Defining the items
    item = KrcItem()
    data = json.loads(response.text)
    for company in data.get('products', []):
        item["productid"] = company["id"]
        item["category"] = category
        item["name"] = company["name"]
        item["description"] = company["description"]
        item["price"] = company["priceFormatted"].replace("\u20ac", "").strip()
        item["timestamp"] = timestamp
        yield item

    # Checking whether "isTruncated" is true (boolean); if so, the next page will be triggered
    if data["isTruncated"]:
        yield scrapy.Request(
            url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{product_id}?page={page}&size=8&isocode=nl-NL".format(page=next_page, product_id=product_id),
            callback=self.parse,
            meta={'page': next_page, "category": category, "product_id": product_id},
        )
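As a side note, on Scrapy 1.7+ cb_kwargs is an alternative to meta for passing values like product_id into the callback. A minimal sketch of just the request/callback pair, under that assumption:

    yield scrapy.Request(
        url,
        callback=self.parse,
        cb_kwargs={"product_id": product_id, "category": category, "page": 1},
    )

    def parse(self, response, product_id, category, page):
        # cb_kwargs arrive as plain keyword arguments instead of via response.meta
        next_page = page + 1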

I believe you need a nested for loop for when your search_text changes.
for [first iterating variable] in [outer loop]:  # Outer loop
    [do something]  # Optional
    for [second iterating variable] in [nested loop]:  # Nested loop
        [do something]
Check this out, it might help you.
For Loops
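Applied to the keywords above, a concrete (if simplified) nested loop would look like this; note it assumes a fixed page range, which is exactly what the isTruncated check in the meta-based answer above avoids having to know:

    keywords = ["20035386", "20035424"]   # the keyword IDs from keywords.csv
    for keyword in keywords:              # outer loop: one product per keyword ID
        for page in range(1, 4):          # nested loop: pages 1-3 for that keyword
            print("keyword {0}, page {1}".format(keyword, page))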

I think adding the keyword to the URL would look like the following. It may or may not need the + signs before and after search_text; my knowledge is limited.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/" + search_text + "?page={page}&size=8&isocode=nl-NL".format(page=next_page),
though I'm not really following what this line is doing, at least the format(search_text) portion of it.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)

Related

How to store scraped links in Scrapy

I did a lot of searching on the web but couldn't find anything related; maybe it has to do with the wording I used.
Basically, I would like to write a spider that is able to save the scraped links and check whether some other links have already been scraped. Is there any built-in function in Scrapy to do so?
Many thanks
You can write your own method for this purpose. I have written one in my project and you can take reference from it: a dictionary called already_parsed_urls, which I update in every callback.
You can look at the code snippet below and take reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest


class Spider(CrawlSpider):
    name = 'test'
    allowed_domains = []
    web_url = ''
    start_urls = ['']
    counter = 0
    already_parsed_urls = {}
    wait_time = 3
    timeout = '90'

    def start_requests(self):
        for start_url in self.start_urls:
            yield SplashRequest(start_url, callback=self.parse_courses,
                                args={'wait': self.wait_time, 'timeout': self.timeout})

    def parse_courses(self, response):
        course_urls = []
        yield SplashRequest(course_urls[0], callback=self.parse_items, args={'wait': self.wait_time})

    def parse_items(self, response):
        if not self.already_parsed_urls.get(response.url):
            # Get Program URL
            program_url = response.url
            self.already_parsed_urls[response.url] = 1
        else:
            return {}
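For reference, the same bookkeeping in a plain Scrapy spider (no Splash), using a set instead of a dict. Note that Scrapy's scheduler already filters duplicate requests by default through its dupefilter (unless a request is made with dont_filter=True), so manual tracking is mainly useful when you need logic beyond simple request deduplication. A minimal sketch with a hypothetical spider name and URL:

    import scrapy


    class DedupeSpider(scrapy.Spider):
        name = "dedupe_example"                          # hypothetical
        start_urls = ["http://quotes.toscrape.com/page/1/"]

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.seen_urls = set()                       # responses we have already processed

        def parse(self, response):
            if response.url in self.seen_urls:
                return                                   # already handled this page
            self.seen_urls.add(response.url)
            yield {"url": response.url}
            for href in response.css("li.next a::attr(href)").getall():
                yield response.follow(href, callback=self.parse)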

Count scraped items from scrapy

Looking to just count the number of things scraped. I'm new to Python and scraping, just following the example, and want to know how to count the number of times Albert Einstein shows up and print it to a JSON file. I just cannot get it to print to a file using print, yield, or return.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        i = 0
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                i += 1

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals


class CountSpider(scrapy.Spider):
    name = 'count'
    start_urls = ['https://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        numcount = str(stats['item_scraped_count'])
Here I can create a csv file with the stats
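A minimal sketch of what that could look like, assuming the standard csv module and a hypothetical output path stats.csv:

    import csv

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        # Write the final item count (and any other stats of interest) to a CSV file
        with open('stats.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['stat', 'value'])
            writer.writerow(['item_scraped_count', stats.get('item_scraped_count', 0)])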
In Scrapy, requests are made asynchronously, and each request calls back to the parse function independently. Your i variable is not an instance variable, so its scope is limited to each function call.
Even if that weren't the case, the recursion would reset your counter to 0 in each callback.
I would suggest you take a look at Scrapy items; at the end of the Scrapy process it will report a counter with the number of scraped items. Although that may be overkill if you don't want to store any information other than the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution: make your counter an instance variable and have the parse method increment it, like this:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    counter = 0

    def parse(self, response):
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                self.counter += 1

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
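The counter still has to be reported somewhere once the crawl finishes. A minimal sketch of one way to do that, assuming writing a hypothetical counts.json file is enough, using the spider's closed() hook (which Scrapy calls when the spider finishes):

    import json

    # added to the QuotesSpider class above
    def closed(self, reason):
        with open("counts.json", "w") as f:
            json.dump({"albert_einstein_count": self.counter}, f)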

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I managed to extract information from static websites but am now trying my hand at following URLs and extracting data (which of course involves some JavaScript). I have installed scrapy-splash for this, and it is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button at the top right takes you to the next page (which is JavaScript, hence Splash). I want to scrape some basic data (like company name, sectors, etc.) on all the pages up to the last one. This is what I have done so far, and I need help correcting it so that it executes successfully.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()

        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team
        }

        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is not None:
            yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page), callback=self.parse, args={"wait": 3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
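One detail worth checking in the snippet above: .extract() returns a list, so next_page is never None (an empty list is not None), and urljoin() then gets called with a list rather than a string. A minimal sketch of that last part using .extract_first(), which returns a single string or None:

    next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract_first()
    if next_page is not None:
        yield SplashRequest(urlparse.urljoin('https://www.ta.com', next_page),
                            callback=self.parse, args={"wait": 3})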
Update: the issue was the order in which I was scraping the websites. Below is the updated code, which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse


class TAFolio(scrapy.Spider):
    name = 'Portfolio'
    start_urls = [
        'https://www.ta.com/portfolio/business-services',
        'https://www.ta.com/portfolio/consumer',
        'https://www.ta.com/portfolio/financial-services',
        'https://www.ta.com/portfolio/healthcare',
        'https://www.ta.com/portfolio/technology'
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
        for i in companylink:
            yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)

    def parse1(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        about_company = response.css('h2.item_detail-main-content-heading::text').extract()
        about_company_detail = response.css('div.markdown p::text').extract()

        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team,
            'about_company': about_company,
            'about_company_detail': about_company_detail
        }

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URLs from a database and also need the IDs associated with each URL so that I can pass them to the items pipeline and store them in the table along with the items.
I am using make_requests_from_url(row[1]) to pass the start URLs into start_urls = [], which forms the list of starting URLs. The ID, row[0], is what I need to pass to the items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem


class AmzProductinfoSpiderSpider(scrapy.Spider):
    name = 'amz_ProductInfo_Spider'
    nextPageNumber = 2
    allowed_domains = ['amazon.in']
    start_urls = []
    url_fid = []

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def start_requests(self):
        sql_get_StartUrl = 'SELECT * FROM database.table'
        self.cursor.execute(sql_get_StartUrl)
        rows = self.cursor.fetchall()
        for row in rows:
            yield self.make_requests_from_url(row[1])
I have tried comparing response.url in the parse method, but that changes as the spider moves on to the next page.
Not sure how I can achieve this; any direction is appreciated.
It's not clear why you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
    sql_get_StartUrl = 'SELECT * FROM database.table'
    self.cursor.execute(sql_get_StartUrl)
    rows = self.cursor.fetchall()
    for row in rows:
        yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)

def parse(self, response):
    url_id = response.meta["url_id"]
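From there the ID can travel with the scraped data into the pipeline; a minimal sketch, assuming a hypothetical url_id field declared on AmzProductinfoItem:

    def parse(self, response):
        url_id = response.meta["url_id"]
        item = AmzProductinfoItem()
        item["url_id"] = url_id          # hypothetical field; declare it on the item class
        # ... fill the remaining fields from the response ...
        yield item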

change response in scrapy itemloader

Is there a way to change the response used in an ItemLoader? For example, I am parsing two pages and I pass my_item in meta; the response in the item loader is the first one (where I initially defined it).
I am currently using my_item.add_value('price', response.xpath('//p[@id="price"]').extract_first()) to get past this, since I can't use my_item.add_xpath('price', '//p[@id="price"]') because the response is from the initial page.
my_item = ItemLoader(item=MyItem(), response=response)
# fill my_item
yield Request(My_url, callback=self.parse_item, meta={'my_item': my_item})
You need something like this:
def parse(self, response):
    l = ItemLoader(item=YourItem(), response=response)
    l.add_xpath('Field1', '...')
    l.add_value('Field2', '...')
    item = l.load_item()
    yield scrapy.Request(
        url=another_url,
        callback=self.second,
        meta={'item': item}
    )

def second(self, response):
    l = ItemLoader(item=response.meta["item"], response=response)
    l.add_xpath("Field3", '...')
    yield l.load_item()
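The key point is that second() builds a fresh ItemLoader around the partially populated item together with the new response, so add_xpath there runs against the second page while the fields collected on the first page are preserved.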