Is there a way to get the ID of the starting URL from the database in Scrapy with some function such as make_requests_from_url?

I am pulling start URLs from a database, and I also need the IDs associated with each URL so that I can pass them to the items pipeline and store them in the table along with the items.
I am using make_requests_from_url(row[1]) to feed the start URLs into start_urls = [], which forms the list of starting URLs. The IDs, row[0], are what I need to pass to the items when the respective pages are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem


class AmzProductinfoSpiderSpider(scrapy.Spider):
    name = 'amz_ProductInfo_Spider'
    nextPageNumber = 2
    allowed_domains = ['amazon.in']
    start_urls = []
    url_fid = []

    def __init__(self):
        self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()

    def start_requests(self):
        sql_get_StartUrl = 'SELECT * FROM database.table'
        self.cursor.execute(sql_get_StartUrl)
        rows = self.cursor.fetchall()
        for row in rows:
            yield self.make_requests_from_url(row[1])
I have tried comparing response.url in the parse method, but that changes as the spider moves on to the next page.
Not sure how I can achieve this; any direction is appreciated.

It's not clear why you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
    sql_get_StartUrl = 'SELECT * FROM database.table'
    self.cursor.execute(sql_get_StartUrl)
    rows = self.cursor.fetchall()
    for row in rows:
        yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)

def parse(self, response):
    url_id = response.meta["url_id"]
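If the goal is to store the ID next to the scraped fields, a minimal sketch of how to hand it to the pipeline could look like this; it assumes a url_id field has been added to AmzProductinfoItem (not in the original items.py), the CSS selectors are hypothetical, and the meta is copied onto the pagination request so the ID survives page changes:

def parse(self, response):
    url_id = response.meta["url_id"]

    for product in response.css("div.s-result-item"):  # hypothetical selector
        item = AmzProductinfoItem()
        item["url_id"] = url_id  # assumes a url_id Field was added to items.py
        # ... fill in the remaining product fields here ...
        yield item

    next_page = response.css("li.a-last a::attr(href)").get()  # hypothetical selector
    if next_page:
        # pass the same meta along so the ID is still available on the next page
        yield response.follow(next_page, callback=self.parse, meta={"url_id": url_id})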

Related

Can't yield parallel requests conducted by items pipeline

In my Scrapy code I'm trying to yield the following figures from the parliament's website, where all the members of parliament (MPs) are listed. Opening the link for each MP, I make parallel requests to get the figures I'm trying to count. I didn't use meta here because my code doesn't just make consecutive requests; rather, it makes parallel requests for the figures after the individual page of the MP is requested. So I thought item containers would fit my purpose better.
Here are the figures I'm trying to scrape:
How many bill proposals each MP has their signature on
How many question proposals each MP has their signature on
How many times each MP has spoken in parliament
To count and yield how many bills each member of parliament has their signature on, I'm trying to write a scraper for the members of parliament that works in 3 layers:
Starting with the link where all MPs are listed
From (1), accessing the individual page of each MP where the three pieces of information defined above are displayed
3a) Requesting the page with bill proposals and counting them with the len function
3b) Requesting the page with question proposals and counting them with the len function
3c) Requesting the page with speeches and counting them with the len function
What I want: I want to yield the results of 3a, 3b and 3c together with the name and the party of the MP.
Problem: My code below just doesn't yield anything but empty dictionaries for each request.
Note: Because my parse functions don't work like parse => parse2 => parse3, but rather branch into 3 parallel parse functions after parse2, I failed to use meta because I'm not yielding all the values in a single final parse. Therefore I preferred using the item pipeline, which apparently doesn't work.
Main code:
from scrapy import Spider
from scrapy.http import Request
from ..items import MeclisItem
import logging


class MvSpider(Spider):
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        items = MeclisItem()
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")  # taking all MPs listed
        for mv in mv_list:
            items['name'] = mv.xpath("./li/div/div/a/text()").get()  # MP's name taken
            items['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip()  # MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback=self.mv_analysis)
        pass

    def mv_analysis(self, response):
        items = MeclisItem()
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link, callback=self.bill_prop_counter)  # number of bill proposals to be requested
        yield Request(questionprop_link, callback=self.quest_prop_counter)  # number of question proposals to be requested
        yield Request(speech_link, callback=self.speech_counter)  # number of speeches to be requested
        yield items

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response):
        items = MeclisItem()
        billproposals = response.xpath("//tr[@valign='TOP']")
        items['bill_prop_count'] = len(billproposals)
        pass

    def quest_prop_counter(self, response):
        items = MeclisItem()
        questionproposals = response.xpath("//tr[@valign='TOP']")
        items['res_prop_count'] = len(questionproposals)
        pass

    def speech_counter(self, response):
        items = MeclisItem()
        speeches = response.xpath("//tr[@valign='TOP']")
        items['speech_count'] = len(speeches)
        pass
items.py code:
import scrapy


class MeclisItem(scrapy.Item):
    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()
What Scrapy displays (screenshot omitted) is just an empty dictionary for each request.
I checked many questions on Stack Overflow but still couldn't figure a way out. Thanks in advance.
ps: I spent ten minutes separately trying to colour the code above and couldn't manage that either :(
Note (quoted from the question): Because my parse functions don't work like parse => parse2 => parse3, but rather branch into 3 parallel parse functions after parse2, I failed to use meta because I'm not yielding all the values in a single final parse.
You can do it like this, chaining the three requests and passing the item along with cb_kwargs:
import scrapy
from scrapy import Spider
from scrapy.http import Request
# from ..items import MeclisItem
import logging


class MeclisItem(scrapy.Item):
    name = scrapy.Field()
    party = scrapy.Field()
    bill_prop_count = scrapy.Field()
    res_prop_count = scrapy.Field()
    speech_count = scrapy.Field()


class MvSpider(Spider):
    name = 'mv'
    allowed_domains = ['tbmm.gov.tr']
    start_urls = ['https://www.tbmm.gov.tr/Milletvekilleri/liste']

    def parse(self, response):
        mv_list = response.xpath("//ul[@class='list-group list-group-flush']")  # taking all MPs listed
        for mv in mv_list:
            item = MeclisItem()
            item['name'] = mv.xpath("./li/div/div/a/text()").get()  # MP's name taken
            item['party'] = mv.xpath("./li/div/div[@class='col-md-4 text-right']/text()").get().strip()  # MP's party name taken
            partial_link = mv.xpath('.//div[@class="col-md-8"]/a/@href').get()
            full_link = response.urljoin(partial_link)
            yield Request(full_link, callback=self.mv_analysis, cb_kwargs={'item': item})

    def mv_analysis(self, response, item):
        billprop_link_path = response.xpath(".//a[contains(text(),'İmzası Bulunan Kanun Teklifleri')]/@href").get()
        billprop_link = response.urljoin(billprop_link_path)
        questionprop_link_path = response.xpath(".//a[contains(text(),'Sahibi Olduğu Yazılı Soru Önergeleri')]/@href").get()
        questionprop_link = response.urljoin(questionprop_link_path)
        speech_link_path = response.xpath(".//a[contains(text(),'Genel Kurul Konuşmaları')]/@href").get()
        speech_link = response.urljoin(speech_link_path)
        yield Request(billprop_link,
                      callback=self.bill_prop_counter,
                      cb_kwargs={'item': item, 'questionprop_link': questionprop_link, 'speech_link': speech_link})  # number of bill proposals to be requested

    # COUNTING FUNCTIONS
    def bill_prop_counter(self, response, item, questionprop_link, speech_link):
        billproposals = response.xpath("//tr[@valign='TOP']")
        item['bill_prop_count'] = len(billproposals)
        yield Request(questionprop_link,
                      callback=self.quest_prop_counter,
                      cb_kwargs={'item': item, 'speech_link': speech_link})  # number of question proposals to be requested

    def quest_prop_counter(self, response, item, speech_link):
        questionproposals = response.xpath("//tr[@valign='TOP']")
        item['res_prop_count'] = len(questionproposals)
        yield Request(speech_link,
                      callback=self.speech_counter,
                      cb_kwargs={'item': item})  # number of speeches to be requested

    def speech_counter(self, response, item):
        speeches = response.xpath("//tr[@valign='TOP']")
        item['speech_count'] = len(speeches)
        yield item
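The chaining is deliberate: the three counting requests run one after another, so the item is only yielded once all three counts are filled in. Note that cb_kwargs needs a reasonably recent Scrapy (1.7+); on older versions the same chaining can be done through request.meta. A rough sketch of the first link of the chain under that assumption (Request imported from scrapy.http as above):

# same pattern with meta instead of cb_kwargs (for Scrapy versions without cb_kwargs)
def parse(self, response):
    for mv in response.xpath("//ul[@class='list-group list-group-flush']"):
        item = MeclisItem()
        item['name'] = mv.xpath("./li/div/div/a/text()").get()
        full_link = response.urljoin(mv.xpath('.//div[@class="col-md-8"]/a/@href').get())
        yield Request(full_link, callback=self.mv_analysis, meta={'item': item})

def mv_analysis(self, response):
    item = response.meta['item']  # the partially filled item travels with the request
    # ... build the bill/question/speech links as above and keep passing meta={'item': item} ...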

How to store scraped links in Scrapy

I did a lot of searches on the web but I couldn't find anything related, or maybe it has to do with the wording I used.
Basically, I would like to write a spider that is able to save the scraped links and to check whether some other links have already been scraped. Is there any built-in function in Scrapy to do so?
Many thanks
You can write your own method for this purpose. I have done this in my project, and you can use it as a reference: a dictionary called already_parsed_urls is kept on the spider, and every callback updates it.
You can look at the code snippet below and take it as a reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest


class Spider(CrawlSpider):
    name = 'test'
    allowed_domains = []
    web_url = ''
    start_urls = ['']
    counter = 0
    already_parsed_urls = {}  # record of URLs that have already been parsed
    wait_time = 3
    timeout = '90'

    def start_requests(self):
        for start_url in self.start_urls:
            yield SplashRequest(start_url, callback=self.parse_courses,
                                args={'wait': self.wait_time, 'timeout': self.timeout})

    def parse_courses(self, response):
        course_urls = []  # placeholder: extract the course URLs from the response here
        yield SplashRequest(course_urls[0], callback=self.parse_items, args={'wait': self.wait_time})

    def parse_items(self, response):
        if not self.already_parsed_urls.get(response.url):
            # Get Program URL
            program_url = response.url
            self.already_parsed_urls[response.url] = 1
        else:
            return {}
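As for a built-in: Scrapy's scheduler already deduplicates requests by default (through its duplicate request filter, RFPDupeFilter), so the same URL is normally not fetched twice unless you pass dont_filter=True. If you just want your own record of visited links without Splash, a plain set on the spider does the same job as the dictionary above; a minimal sketch against a demo site:

import scrapy


class LinkTrackingSpider(scrapy.Spider):
    # hypothetical spider, only to show the bookkeeping
    name = 'link_tracker'
    start_urls = ['http://quotes.toscrape.com/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.seen_urls = set()  # our own record of already-scraped links

    def parse(self, response):
        if response.url in self.seen_urls:
            return  # this page was already handled
        self.seen_urls.add(response.url)

        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)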

Count scraped items from scrapy

Looking to just count the number of things scraped. I'm new to Python and scraping, just following the example, and want to know how to count the number of times Albert Einstein shows up and print that to a JSON file. I just can't get it to print to a file using print, yield, or return.
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        i = 0
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                i += 1
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals


class CountSpider(scrapy.Spider):
    name = 'count'
    start_urls = ['https://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        stats = spider.crawler.stats.get_stats()
        numcount = str(stats['item_scraped_count'])
From here I can create a CSV file with the stats.
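For completeness, a minimal sketch of what that last step might look like; the output file name and the two-column layout are assumptions:

import csv

# a possible spider_closed that dumps every stat to a CSV file
def spider_closed(self, spider):
    stats = spider.crawler.stats.get_stats()
    with open('spider_stats.csv', 'w', newline='') as f:  # hypothetical file name
        writer = csv.writer(f)
        writer.writerow(['stat', 'value'])
        for key, value in stats.items():
            writer.writerow([key, value])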
In Scrapy, requests are made asynchronously, and each request calls back to the parse function independently. Your i variable is not an instance variable, so its scope is limited to each function call.
Even if that weren't the case, the recursion would reset your counter to 0 in each callback.
I would suggest you take a look at Scrapy items; at the end of the crawl, Scrapy reports a counter with the number of scraped items. That may be overkill, though, if you don't want to store any more information than the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution: make your counter an instance variable and have the parse method increment it, like this:
import scrapy


class QuotesSpider(scrapy.Spider):
    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    counter = 0

    def parse(self, response):
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                self.counter += 1
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
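Since the original goal was to get the number into a JSON file, one way to finish the job is the spider's closed() hook, which Scrapy calls when the spider finishes; a minimal sketch (the file name is an assumption):

import json

# add this method to the spider above
def closed(self, reason):
    # Scrapy calls closed() automatically when the spider finishes
    with open('einstein_count.json', 'w') as f:  # hypothetical output file
        json.dump({'albert_einstein_count': self.counter}, f)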

How can I make scrapy check for a field and ignore searching the link

import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request


class SunBizSpider(scrapy.Spider):
    name = 'sunbiz'
    start_urls = ['http://search.sunbiz.org/Inquiry/CorporationSearch/SearchResults?inquiryType=EntityName&searchNameOrder=A&searchTerm=a']

    def parse(self, response):
        leurl = 'http://search.sunbiz.org'
        next_plis = response.xpath("//div[@class='navigationBar'][1]//a[@title='Next List']/@href").extract()
        next_lis = (leurl + ', '.join(next_plis))
        yield scrapy.Request(next_lis, callback=self.parse)
        for href in response.css('.large-width a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_biz)

    def parse_biz(self, response):
        re1 = '((?:[0]?[1-9]|[1][012])[-:\\/.](?:(?:[0-2]?\\d{1})|(?:[3][01]{1}))[-:\\/.](?:(?:[1]{1}\\d{1}\\d{1}\\d{1})|(?:[2]{1}\\d{3})))(?![\\d])'  # MMDDYYYY
        date = response.xpath('//span').re_first(re1)
        yield {
            'Name': response.css('.corporationName span::text').extract()[1],
            'Date': date,
            'Link': response.url,
        }
A regular expression would most likely find words such as inact and cross rf.
As you can see in the screenshot (not included here), I highlighted status values such as inact, name hs and cross rf, which are the values I want the crawler to check for and, if present, skip the link without doing anything.
You could use an XPath selector to check the inner text. For example, to get all the td elements whose inner text is Active, use something like:
response.xpath('//td[text()="Active"]')
The same goes for other strings. You could also use:
response.xpath('//td[contains(text(), "Activ")]')
if you only want to match part of the string.
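Putting that together with the spider from the question, a rough sketch of how parse_biz could skip a record whose status contains one of the unwanted words; the td location of the status and the exact status strings are assumptions about the page layout:

def parse_biz(self, response):
    # skip records whose status text contains one of the unwanted values
    skip_words = ['INACT', 'NAME HS', 'CROSS RF']  # values mentioned in the question
    status_texts = response.xpath('//td/text()').extract()  # assumed location of the status field
    if any(word in text for text in status_texts for word in skip_words):
        return  # ignore this link and yield nothing

    yield {
        'Name': response.css('.corporationName span::text').extract()[1],
        'Link': response.url,
    }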

Scrape multiple URLs with Scrapy

How can I scrape multiple URLs with Scrapy?
Am I forced to make multiple crawlers?
class TravelSpider(BaseSpider):
    name = "speedy"
    allowed_domains = ["example.com"]
    start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4), "http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = TravelItem()
        item['url'] = hxs.select('//a[@class="out"]/@href').extract()
        out = "\n".join(str(e) for e in item['url'])
        print out
Python says:
NameError: name 'i' is not defined
But when I use one URL it works fine!
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)"]
Your Python syntax is incorrect; try:
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)] + \
             ["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
If you need to write code to generate start requests, you can define a start_requests() method instead of using start_urls.
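For example, a rough sketch of that approach for the two URL ranges in the question, kept in the same Python 2 / BaseSpider style as the original code:

# inside TravelSpider (Request comes from scrapy.http)
def start_requests(self):
    # build both URL ranges at request time instead of in the class body
    for i in xrange(4):
        yield Request("http://example.com/category/top/page-%d/" % i, callback=self.parse)
    for i in xrange(55):
        yield Request("http://example.com/superurl/top/page-%d/" % i, callback=self.parse)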
You can initialize start_urls in the __init__ method:
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider


class TravelItem(Item):
    url = Field()


class TravelSpider(BaseSpider):
    name = "speedy"
    allowed_domains = ["example.com"]

    def __init__(self, name=None, **kwargs):
        self.start_urls = []
        self.start_urls.extend(["http://example.com/category/top/page-%d/" % i for i in xrange(4)])
        self.start_urls.extend(["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)])
        super(TravelSpider, self).__init__(name, **kwargs)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        item = TravelItem()
        item['url'] = hxs.select('//a[@class="out"]/@href').extract()
        out = "\n".join(str(e) for e in item['url'])
        print out
Hope that helps.
There are only four scopes in Python (LEGB). The local scope of the class definition and the local scope of the list comprehension are not nested functions, so they do not form an enclosing scope for one another; they are two separate local scopes that cannot access each other's names.
So don't use a 'for' comprehension together with class variables in the class body.
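A minimal demonstration of that scope rule (shown with Python 3 semantics, where only the comprehension's outermost iterable is evaluated in the class scope):

class Demo:
    n = 4
    ok = [i for i in range(n)]         # works: range(n) is evaluated in the class scope
    # broken = [n for i in range(4)]   # NameError: n is not visible inside the comprehension's own scope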