Scrapy crawl spider only crawls as if DEPTH = 1 and stops with reason = finished - scrapy

I've got a rather simple spider that loads URLs from files (that part works) and should then start crawling and archive the HTML responses.
It was working nicely before, and for days I haven't been able to figure out what I changed to make it stop working.
Now the spider only crawls the first page of every URL and then stops:
'finish_reason': 'finished',
Spider:
class TesterSpider(CrawlSpider):
    name = 'tester'
    allowed_domains = []

    rules = (
        Rule(LinkExtractor(allow=(), deny=(r'.*Zahlung.*', r'.*Cookies.*', r'.*Login.*', r'.*Datenschutz.*', r'.*Registrieren.*', r'.*Kontaktformular.*')), callback='parse_item'),
    )

    def __init__(self, *a, **kw):
        super(CrawlSpider, self).__init__(*a, **kw)

    def start_requests(self):
        logging.log(logging.INFO, "======== Starting with start_requests")
        self._compile_rules()
        smgt = Sourcemanagement()
        rootdir = smgt.get_root_dir()
        file_list = smgt.list_all_files(rootdir + "/sources")
        links = smgt.get_all_domains()
        links = list(set(links))

        request_list = []
        for link in links:
            o = urlparse(link)
            result = '{uri.netloc}'.format(uri=o)
            self.allowed_domains.append(result)
            request_list.append(Request(url=link, callback=self.parse_item))
        return request_list

    def parse_item(self, response):
        item = {}
        self.write_html_file(response)
        return item
And the settings:
BOT_NAME = 'crawlerscrapy'
SPIDER_MODULES = ['crawlerscrapy.spiders']
NEWSPIDER_MODULE = 'crawlerscrapy.spiders'
USER_AGENT_LIST = "useragents.txt"
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 150
DOWNLOAD_DELAY = 43
CONCURRENT_REQUESTS_PER_DOMAIN = 1
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept-Language': 'de',
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'random_useragent.RandomUserAgentMiddleware': 400,
}
AUTOTHROTTLE_ENABLED = False
SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue'
REACTOR_THREADPOOL_MAXSIZE = 20
LOG_LEVEL = 'DEBUG'
DEPTH_LIMIT = 0
DOWNLOAD_TIMEOUT = 15
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
Any idea what I'm doing wrong?
EDIT:
I found the answer:
request_list.append(Request(url=link, callback=self.parse_item))
# to be replaced by:
request_list.append(Request(url=link, callback=self.parse))
But I don't really understand why.
https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
So I can return an empty dict in parse_item but I shouldn't because it would break the flow of things?

CrawlSpider.parse is the method that takes care of applying your rules to a response. Only responses you send to CrawlSpider.parse will get your rules applied, generating additional responses.
By yielding a request with a different callback, you are specifying that you don’t want rules to be applied to the response to that request.
The right place to put your parse_item callback when using a CrawlSpider subclass (as opposed to a Spider) is your rules. You already did that.
If what you want is to have responses to your start requests be handled both by rules and by a different callback, you might be better off using a regular spider. CrawlSpider is a very specialized spider, with a limited set of use cases; as soon as you need to do something it doesn’t support, you need to switch to a regular spider.
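For illustration, a minimal sketch of the start_requests from the question with the callback pointing at CrawlSpider.parse (or simply omitted, which defaults to it), so the rules get applied and parse_item is reached through them; Sourcemanagement is the helper from the question and the rest of the spider stays unchanged:
def start_requests(self):
    smgt = Sourcemanagement()
    links = list(set(smgt.get_all_domains()))

    for link in links:
        self.allowed_domains.append(urlparse(link).netloc)
        # self.parse is CrawlSpider.parse: it applies the rules to the
        # response and schedules the follow-up requests that end up in
        # parse_item. An explicit callback=self.parse_item would bypass that.
        yield Request(url=link, callback=self.parse)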

Related

How do I connect items from one parse method to another?

import scrapy
from ..items import GooddealItem

class FarmtoolsSpider(scrapy.Spider):
    name = 'farmtools'
    allowed_domains = ['www.gooddeal.com']
    start_urls = ['https://www.gooddeal.com/all?source=private&sort=publishdate%20desc']

    def parse(self, response):
        items = GooddealItem()
        rows = response.xpath('//ul[@class="card-collection"]/li')
        for row in rows:
            link = row.xpath('.//a/@href').get()  # this is the full link.
            link_split = link.split('/')[-1]  # this splits the url link the first time.
            linkid = link_split.split('?')[0]  # this splits it the second time.
            title = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
            county = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/following-sibling::node()/text()').get()
            price = row.xpath('.//p[@class="card__price"]/span[1]/text()').get()
            subcat = row.xpath('.//a/div/div[2]/div[1]/p[2]/text()[2]').get()
            zero = row.xpath('.//a/div/div[2]/div[1]/ul[@class="card__body-keyinfo"]/li[contains(text(),"min")]/text()').get()
            if zero == '0 min':
                items['linkid'] = linkid
                items['title'] = title
                items['county'] = county
                items['price'] = price
                items['subcat'] = subcat
                items['zero'] = zero
                items['link'] = link
                yield response.follow(url=link, callback=self.parse_item_page)

    def parse_item_page(self, response):
        items = GooddealItem()
        rows = response.xpath('/html/body[1]')
        for row in rows:
            category = row.xpath('.//main/div/div[1]/div/div[1]/div/nav/span/a[1]/span/text()').get(),
            views = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[1]/div[3]/div[1]/div/div[1]/div/div/span[2]/text()').get(),
            seller_id = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[1]/div[1]/div[2]/a/@href').get(),
            seller_ads = row.xpath('.//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[3]/dd/text()').get(),
            lifetime_ads = row.xpath('//main/div/div[1]/div/div[2]/div[2]/div[2]/div[3]/div/div[2]/div/dl[4]/dd/text()').get()

            items['category'] = category
            items['views'] = views
            items['seller_id'] = seller_id
            items['seller_ads'] = seller_ads
            items['lifetime_ads'] = lifetime_ads
            yield items
I'm stuck on this as it's my first attempt. When I run the code I'm just getting back:
2020-07-12 22:53:21 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.gooddeal.com/dogs-for-sale/dachshunds/25348559>
{'category': (None,),
'lifetime_ads': None,
'seller_ads': (None,),
'seller_id': (None,),
'views': (None,)}
Any help will be appreciated, thanks
I'm assuming you want the data scraped in the parse method to be joined with the data scraped in parse_item_page.
If you are using Scrapy v1.7+ you can use cb_kwargs when building the request.
This parameter receives a dict with arbitrary data that will be passed as keyword arguments to the callback function. So you would have to do something like this in your request:
...
yield response.follow(url = link, callback=self.parse_item_page, cb_kwargs={'scraped_item': items})
For this to work, you also need to change the callback function to receive this parameter, like this:
def parse_item_page(self, response, scraped_item):
    ...
Scrapy will take care of passing the scraped_item when it calls parse_item_page.
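Putting it together, a minimal sketch of the two callbacks with cb_kwargs; the field names follow the question, but the selectors are simplified placeholders and the merge of listing data with detail-page data is an assumption about the desired result:
def parse(self, response):
    for row in response.xpath('//ul[@class="card-collection"]/li'):
        items = GooddealItem()
        items['link'] = row.xpath('.//a/@href').get()
        items['title'] = row.xpath('.//div[1]/p[@class="card__body-title"]/text()').get()
        # Pass the partially filled item on to the detail-page callback.
        yield response.follow(
            url=items['link'],
            callback=self.parse_item_page,
            cb_kwargs={'scraped_item': items},
        )

def parse_item_page(self, response, scraped_item):
    # Complete the item that was started in parse() and yield it once.
    scraped_item['views'] = response.xpath('//span[@class="views"]/text()').get()  # hypothetical selector
    yield scraped_item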
If you are using Scrapy v1.6 or older:
You will need to use the meta parameter. This method still works in more recent versions, but cb_kwargs (the solution above) is preferable.
When building the request you will use the meta parameter to include some arbitrary data in the request. The data will be accessible in the response object that the callback function receives. Your request should look like this:
...
yield response.follow(url = link, callback=self.parse_item_page, meta={'scraped_item': items})
In this case you will access the data by calling response.meta:
def parse_item_page(self, response):
    items = response.meta.get('scraped_item')  # response.meta is a dict
    ...

Need to return Scrapy callback method data to calling function

In the code below I am trying to collect email IDs from a website. They can be on the contact or about-us page.
From the parse method I follow to the extemail method for all those pages.
From every page I collect a few email IDs.
Now I need to print them together with the original record passed to the __init__ method.
For example:
record = "https://www.wockenfusscandies.com/"
I want to print the output as:
https://www.wockenfusscandies.com/|abc@gmail.com|def@outlook.com
I am not able to store them in self.emails and deliver them back to the __init__ method.
Please help.
import scrapy
from scrapy.crawler import CrawlerProcess

class EmailSpider(scrapy.Spider):
    def __init__(self, record):
        self.record = record
        self.emails = []
        url = record.split("|")[4]
        if not url.startswith("http"):
            url = "http://{}".format(url)
        if url:
            self.start_urls = ["https://www.wockenfusscandies.com/"]
        else:
            self.start_urls = []

    def parse(self, response):
        contact_list = [a.attrib['href'] for a in response.css('a') if 'contact' in a.attrib['href'] or 'about' in a.attrib['href']]
        contact_list.append(response.request.url)
        for fllink in contact_list:
            yield response.follow(fllink, self.extemail)

    def extemail(self, response):
        emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        yield {
            'emails': emails
        }

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

f = open("/Users/kalpesh/work/data/test.csv")
for rec in f:
    process.crawl(EmailSpider, record=rec)
f.close()
process.start()
If I understand your intent correctly, you could try the following approach:
a) Collect the email IDs in self.emails, like:
def extemail(self, response):
    emails = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
    self.emails.extend(emails)  # accumulate the addresses found on every page
    yield {
        'emails': emails
    }
(or however else you extract the email IDs from emails)
b) Add a close(self, reason) method, as in this GitHub example, which is called when the spider has finished:
def close(self, reason):
    mails_for_record = ""
    for mail in self.emails:
        mails_for_record += mail + "|"
    print(self.record + mails_for_record)
Please also note: I read somewhere that for some versions of Scrapy it is def close(self, reason), for others it is def closed(self, reason).
Hope this approach helps you.
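Putting a) and b) together, a minimal self-contained sketch, assuming the record format from the question (URL in the fifth |-separated field); the spider name and the de-duplication with set() are additions for illustration:
import scrapy

class EmailSpider(scrapy.Spider):
    name = "email"  # the original snippet has no name attribute; added for completeness

    def __init__(self, record, **kwargs):
        super().__init__(**kwargs)
        self.record = record
        self.emails = []
        url = record.split("|")[4].strip()
        if not url.startswith("http"):
            url = "http://{}".format(url)
        self.start_urls = [url]

    def parse(self, response):
        for href in response.css('a::attr(href)').getall():
            if 'contact' in href or 'about' in href:
                yield response.follow(href, self.extemail)

    def extemail(self, response):
        found = response.css('body').re(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')
        self.emails.extend(found)  # accumulate across all visited pages

    def closed(self, reason):
        # Called once when the spider finishes: print the record plus all emails.
        print(self.record.strip() + "|" + "|".join(sorted(set(self.emails))))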
You should visit all of the site's pages before yielding the result for that site.
This means you need a queue of pages to visit and somewhere to store the results.
It can be done using meta.
Some pseudocode:
def parse(self, response):
    meta = response.meta
    if not meta.get('seen'):
        # -- finding urls of contact and about us pages --
        # -- putting them into meta['queue'] --
        # -- setting meta['seen'] = True --

    page_emails_found = ...getting emails here...

    # --- extending already discovered emails
    # --- from other pages/initial empty list with new ones
    meta['emails'].extend(page_emails_found)

    # if queue isn't empty - yielding new request
    if meta['queue']:
        next_url = meta['queue'].pop()
        yield Request(next_url, callback=self.parse, meta=copy(meta))
    # if queue is empty - yielding result from meta
    else:
        yield {'url': current_domain, 'emails': meta['emails']}
Something like this..
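For reference, a more concrete sketch of that idea; the spider name, start URL, email regex, and the contact/about link filter are assumptions for illustration, not the answerer's exact code:
import scrapy
from urllib.parse import urlparse

EMAIL_RE = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'

class SequentialEmailSpider(scrapy.Spider):
    name = "sequential_email"
    start_urls = ["https://www.wockenfusscandies.com/"]

    def parse(self, response):
        meta = dict(response.meta)
        if not meta.get('seen'):
            # First visit: build the queue of contact/about pages once.
            queue = [
                response.urljoin(href)
                for href in response.css('a::attr(href)').getall()
                if 'contact' in href or 'about' in href
            ]
            meta.update(seen=True, queue=queue, emails=[])

        # Collect the emails found on the current page.
        meta['emails'].extend(response.css('body').re(EMAIL_RE))

        if meta['queue']:
            # Visit the next queued page, carrying the state along in meta.
            yield scrapy.Request(meta['queue'].pop(), callback=self.parse,
                                 meta=meta, dont_filter=True)
        else:
            # Queue exhausted: yield a single result for the whole site.
            yield {'url': urlparse(response.url).netloc, 'emails': meta['emails']}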

Scrapy Spider Does Not Scrape Page 1

I want my spider to scrape the listings on every page of a website. I used CrawlSpider and LinkExtractor, but when I looked at the CSV file, nothing from the first page (i.e. the start URL) was scraped. The scraped items start from page 2. I tested my crawler in the Scrapy shell and it seemed fine. I can't figure out where the problem lies. Below is my spider code. Please help. Thanks a lot!
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from shputuo.items_shputuo import ShputuoItem

class Shputuo(CrawlSpider):
    name = "shputuo"
    allowed_domains = ["shpt.gov.cn"]  # DO NOT use www in allowed domains
    start_urls = ["http://www.shpt.gov.cn/gb/n6132/n6134/n6156/n7110/n7120/index.html"]

    rules = (
        Rule(LinkExtractor(allow=(), restrict_xpaths=("//div[@class = 'page']/ul/li[5]/a",)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath("//div[@class = 'neirong']/ul/li"):
            item = ShputuoItem()
            word = sel.xpath("a/text()").extract()[0]
            item['id'] = word[3:11]
            item['title'] = word[11:len(word)]
            item['link'] = "http://www.shpt.gov.cn" + sel.xpath("a/@href").extract()[0]
            item['time2'] = sel.xpath("span/text()").extract()[0][1:11]

            request = scrapy.Request(item['link'], callback=self.parse_content)
            request.meta['item'] = item
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        item['question'] = response.xpath("//div[@id = 'ivs_content']/p[2]/text()").extract()[0]
        item['question'] = "".join(map(unicode.strip, item['question']))  # get rid of unwanted spaces and other characters
        item['reply'] = response.xpath("//div[@id = 'ivs_content']/p[3]/text()").extract()[0]
        item['reply'] = "".join(map(unicode.strip, item['reply']))
        item['agency'] = item['reply'][6:10]
        item['time1'] = "2015-" + item['question'][0] + "-" + item['question'][2]
        yield item
It looks like what you really need is to parse the responses to the start_urls requests as well, not only to follow the rules.
For that, override the parse_start_url method, which is the default callback for the start_urls requests.
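A minimal sketch of that, reusing the parse_items callback from the question so the first page is parsed the same way as the pages reached through the rules:
class Shputuo(CrawlSpider):
    # ... name, allowed_domains, start_urls and rules stay as in the question ...

    def parse_start_url(self, response):
        # CrawlSpider uses this callback for responses to start_urls;
        # delegate to the same logic the rule's callback uses so page 1
        # is scraped like every other listing page.
        return self.parse_items(response)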

Scrapy FormRequest return 400 error code

I am trying to scrape the following website, in which the pagination is done through AJAX requests:
http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak
I am sending a FormRequest to access the different pages, but I am getting the following error:
Retrying http://studiegids.uva.nl/xmlpages/plspub/uva_search.courses_pls> (failed 1 times): 400 Bad Request
I am not able to understand what is wrong. Following is the code:
class Spider(BaseSpider):
    name = "zoek"
    allowed_domains = ["studiegids.uva.nl"]
    start_urls = ["http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak"]

    def parse(self, response):
        base_url = "http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak"
        for i in range(1, 10):
            data = {'p_fetch_size': unicode(20),
                    'p_page:': unicode(i),
                    'p_searchpagetype': u'courses',
                    'p_site_lang': u'nl',
                    'p_strip': u'/2014-2015',
                    'p_ctxparam': u'/xmlpages/page/2014-2015/',
                    'p_rsrcpath': u'/xmlpages/resources/TXP/studiegidswebsite/'}
            yield FormRequest.from_response(response,
                                            formdata=data,
                                            callback=self.fetch_details,
                                            dont_click=True)
            # yield FormRequest(base_url,
            #                   formdata=data,
            #                   callback=self.fetch_details)

    def fetch_details(self, response):
        # print response.body
        hxs = HtmlXPathSelector(response)
        item = ZoekItem()
        Studiegidsnummer = hxs.select("//div[@class='item-info']//tr[1]/td[2]/p/text()")
        Studielast = hxs.select("//div[@class='item-info']//tr[2]/td[2]/p/text()")
        Voertaal = hxs.select("//div[@class='item-info']//tr[3]/td[2]/p/text()")
        Ingangseis = hxs.select("//div[@class='item-info']//tr[4]/td[2]/p/text()")
        Studiejaar = hxs.select("//div[@class='item-info']//tr[5]/td[2]/p/text()")
        Onderwijsinstituut = hxs.select("//div[@class='item-info']//tr[6]/td[2]/p/text()")
        for i in range(20):
            item['Studiegidsnummer'] = Studiegidsnummer
            item['Studielast'] = Studielast
            item['Voertaal'] = Voertaal
            yield item
Also try checking the headers using Firebug.
400 Bad Request usually means that your request does not fully match the expected request format. Common causes include missing or invalid cookies, headers or parameters.
On your web browser, open the Network tab of the Developer Tools and trigger the request. When you see the request in the Network tab, inspect it fully (parameters, headers, etc.). Try to match such a request in your code.
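For example, once you have copied the parameters and headers from the browser's Network tab, you can mirror them in the request. This is only a sketch: the header values are placeholders to be replaced by whatever the browser actually sends, and the comment about the form key is an observation based on the question's dict, not a confirmed fix:
import scrapy
from scrapy import FormRequest

class ZoekSpider(scrapy.Spider):
    name = "zoek"
    start_urls = ["http://studiegids.uva.nl/xmlpages/page/2014-2015/zoek-vak"]

    def parse(self, response):
        for page in range(1, 10):
            # Form fields as seen in the browser's Network tab; verify each one.
            formdata = {
                'p_fetch_size': '20',
                'p_page': str(page),  # note: the question's dict uses the key 'p_page:' with a stray colon
                'p_searchpagetype': 'courses',
                'p_site_lang': 'nl',
                'p_strip': '/2014-2015',
                'p_ctxparam': '/xmlpages/page/2014-2015/',
                'p_rsrcpath': '/xmlpages/resources/TXP/studiegidswebsite/',
            }
            # Headers the browser sends for the AJAX call; adjust to match.
            headers = {
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': response.url,
            }
            yield FormRequest.from_response(response,
                                            formdata=formdata,
                                            headers=headers,
                                            dont_click=True,
                                            callback=self.fetch_details)

    def fetch_details(self, response):
        self.logger.info("Got %s for %s", response.status, response.url)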

Sequentially crawl website using scrapy

Is there a way to tell Scrapy to stop crawling based upon a condition found on a second-level page? I am doing the following:
I have a start_url to begin with (first-level page)
I have a set of URLs extracted from the start_url using parse(self, response)
Then I queue the links using Request with parseDetailPage(self, response) as the callback
In parseDetailPage (second-level page) I find out whether I should stop crawling or not
Right now I am using CloseSpider() to accomplish this, but the problem is that the URLs to be parsed are already queued by the time I start crawling the second-level pages, and I do not know how to remove them from the queue. Is there a way to sequentially crawl the list of links and then be able to stop in parseDetailPage?
global job_in_range
start_urls = []
start_urls.append("http://sfbay.craigslist.org/sof/")

def __init__(self):
    self.job_in_range = True

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    results = hxs.select('//blockquote[@id="toc_rows"]')
    items = []
    if results:
        links = results.select('.//p[@class="row"]/a/@href')
        for link in links:
            if link is self.end_url:
                break
            nextUrl = link.extract()
            isValid = WPUtil.validateUrl(nextUrl)
            if isValid:
                item = WoodPeckerItem()
                item['url'] = nextUrl
                item = Request(nextUrl, meta={'item': item}, callback=self.parseDetailPage)
                items.append(item)
    else:
        self.error.log('Could not parse the document')
    return items

def parseDetailPage(self, response):
    if self.job_in_range is False:
        raise CloseSpider('End date reached - No more crawling for ' + self.name)
    hxs = HtmlXPathSelector(response)
    print response
    body = hxs.select('//article[@id="pagecontainer"]/section[@class="body"]')
    item = response.meta['item']
    item['postDate'] = body.select('.//section[@class="userbody"]/div[@class="postinginfos"]/p')[1].select('.//date/text()')[0].extract()
    if item['jobTitle'] is 'Admin':
        self.job_in_range = False
        raise CloseSpider('Stop crawling')
    item['jobTitle'] = body.select('.//h2[@class="postingtitle"]/text()')[0].extract()
    item['description'] = body.select(str('.//section[@class="userbody"]/section[@id="postingbody"]')).extract()
    return item
Do you mean that you would like to stop the spider and resume it later without re-parsing the URLs that have already been parsed?
If so, you may try the JOBDIR setting. It persists the request queue (and other crawl state) in a specified directory on disk.
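For example, in the project settings (the directory name is arbitrary and is created on first run):
# settings.py - persist the scheduler queue and dupefilter state on disk
JOBDIR = 'crawls/woodpecker-run1'
Equivalently, you can pass it on the command line with scrapy crawl <spider> -s JOBDIR=crawls/woodpecker-run1. Stopping the spider (for example via CloseSpider or Ctrl-C) and running the same command again resumes from the persisted queue instead of re-queuing requests that were already processed.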