Scrapy CrawlSpider: Getting data before extracting link

In CrawlSpider, how can I scrape the marked field "4 days ago" in the image before extracting each link?
The CrawlSpider below works fine, but in parse_item I want to add a new field named 'Add posted' that captures the value marked in the image (e.g. "4 days ago").
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': ?
        }

When using a Rule in a Scrapy CrawlSpider, the extracted link's text is saved in the request's meta under the key link_text. You can read that value in parse_item and pull out the time information with a regex. You can read more about it in the docs. See the example below.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class PropertySpider(CrawlSpider):
    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x) for x in range(0, 5, 20)
    ]

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        link_text = response.request.meta.get("link_text")
        m = re.search(r"(Last Updated.*ago)", link_text)
        if m:
            posted = m.group(1).replace("\xa0", " ")
        else:
            posted = None
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': posted
        }

Alternatively, on the results page you can loop over the listings and read that data point with the following XPath:
x = response.xpath('//div[@class="timeStamp"]')
for i in x:
    yield {'result': i.xpath("./i/following-sibling::text()").get().strip()}
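If you would rather collect that listing-page value yourself and carry it into each property request instead of relying on link_text, a sketch along these lines could work. The pairing of each link with its timeStamp element, and the use of a plain Spider with cb_kwargs, are assumptions here, not part of the original answers:

import scrapy


class PropertyListingSpider(scrapy.Spider):
    name = 'property_listing'
    allowed_domains = ['www.openrent.co.uk']
    start_urls = ['https://www.openrent.co.uk/properties-to-rent/london?term=London']

    def parse(self, response):
        # Assumption: each listing anchor under the property-data div contains its
        # own timeStamp element; adjust the relative XPath if the markup differs.
        for card in response.xpath("//div[@id='property-data']/a"):
            posted = card.xpath(".//div[@class='timeStamp']/i/following-sibling::text()").get()
            yield response.follow(
                card,
                callback=self.parse_item,
                cb_kwargs={'posted': posted.strip() if posted else None},
            )

    def parse_item(self, response, posted):
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            'Add posted': posted,
        }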

Related

Removing duplicates from response

I am trying to remove duplicate timestamps when I scrape the following site for BTC data. I want the duplicates removed after each request is sent, so that Scrapy drops them as they come in.
However, I cannot work out how duplicates get removed when the data comes from a JSON response. I thought they would be removed when I put the JSON into a DataFrame, but that is not happening.
Here's the items pipeline:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for time in adapter['data']['timestamp']:
            if time in self.ids_seen:
                raise DropItem(f"Duplicate item found: {item!r}")
            else:
                self.ids_seen.add(time)
        return item
The pipeline does not seem to produce any errors, but it is not removing the duplicate timestamps, so it is not working.
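Note that DropItem discards the whole item as soon as one repeated timestamp is found, so with one big item per response you either keep or lose every row at once. Below is a sketch of a pipeline that strips only the duplicate rows instead, assuming the item is a plain dict of parallel lists as produced by storeBitcoin (that structure is an assumption, not confirmed by the question):

from itemadapter import ItemAdapter


class DuplicateTimestampFilterPipeline:
    # Sketch: keep the item but drop rows whose timestamp has already been seen.
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        keep = [i for i, ts in enumerate(adapter['timestamp']) if ts not in self.seen]
        self.seen.update(adapter['timestamp'])
        for field in ('timestamp', 'open', 'closed', 'high', 'low'):
            adapter[field] = [adapter[field][i] for i in keep]
        return item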
Here's the script that I am using to grab the data.
import scrapy
import numpy as np
from collections import defaultdict
import pandas as pd
import time


def storeBitcoin(response):
    bitcoin = defaultdict(list)
    resp = response.json()['data']['KdataInfo']
    for row in range(0, len(resp)):
        bitcoin['timestamp'].append(resp[row]['T'])
        bitcoin['open'].append(resp[row]['O'])
        bitcoin['closed'].append(resp[row]['C'])
        bitcoin['high'].append(resp[row]['H'])
        bitcoin['low'].append(resp[row]['L'])
    return bitcoin


sec_begin = [55, 75]
sec_end = [15, 35]


class BtcSpider(scrapy.Spider):
    name = 'btcc2'
    start_urls = ['https://www.btcc.com/quot/history?']
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2
    }

    def start_requests(self):
        for urls in self.start_urls:
            for begin, end in zip(sec_begin, sec_end):
                yield scrapy.FormRequest(
                    url=urls,
                    method="GET",
                    formdata={
                        'codeid': '3223607',
                        'token': 'm19JU98eIFQjRgwsf9b3eXXI1jmDSW9N',
                        'interval': '35',
                        'from': f'16517697{begin}',
                        'to': f'16518562{end}',
                    },
                    callback=self.parse,
                )

    def parse(self, response):
        data = pd.DataFrame(storeBitcoin(response))
        data = data.drop_duplicates(subset=['timestamp'])
        yield data
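Another angle, sketched under the assumption that one item per candle is acceptable downstream: yield each row as its own dict from parse. Scrapy's feed exports and item pipelines work on dict items, and a per-item duplicate check then only needs to read adapter['timestamp']. A possible replacement for BtcSpider.parse:

    def parse(self, response):
        # One dict per candle; duplicate timestamps can then be dropped item by item.
        for row in response.json()['data']['KdataInfo']:
            yield {
                'timestamp': row['T'],
                'open': row['O'],
                'closed': row['C'],
                'high': row['H'],
                'low': row['L'],
            }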

scrapy list of links from parse result

Here's my current code:
# Scrape all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector


class DengaSpider(scrapy.Spider):
    name = 'cafes'
    allowed_domains = ['example.com']
    start_urls = [
        'http://example.com/archives/8136.html',
    ]
    cafeOnlyLink = []

    def parse(self, response):
        cafelink = response.xpath('//li/a[contains(@href, "archives")]/@href').extract()
        twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
        cafeOnlyLink = [s for s in cafelink if twoHourRegex.match(s)]
So how should I continue to parse the content from each URL contained in the cafeOnlyLink list? And I want to save all the results from each page to a CSV file.
You can use something like this:
for url in cafeOnlyLink:
    yield scrapy.Request(url=url, callback=self.parse_save_to_csv)

def parse_save_to_csv(self, response):
    # The content is in response.body, so you have to select what information
    # you want to send to the csv file.
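For the CSV side, a common pattern is to yield plain dicts from the callback and let Scrapy's feed export write the file. The fields below are placeholders, not taken from the original question:

def parse_save_to_csv(self, response):
    # Placeholder fields; swap in the XPaths for the data you actually need.
    yield {
        'url': response.url,
        'title': response.xpath('//title/text()').get(),
    }

Running the spider with scrapy crawl cafes -o cafes.csv then writes every yielded dict as a CSV row.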

scrapy yield Request not working

I wrote the following scrapy spider but it's not continuing the crawling process after the initial request, although I've yielded more scrapy.Requests for scrapy to follow.
import regex as re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Spider


class myspider(Spider):
    name = 'haha'
    allowed_domains = ['https://blog.scrapinghub.com/']
    start_urls = ['https://blog.scrapinghub.com/']
    extractor = LinkExtractor(allow=allowed_domains)

    def parse(self, response):
        # To extract all the links on this page
        links_in_page = self.extractor.extract_links(response)
        for link in links_in_page:
            yield scrapy.Request(link.url, callback=self.parse)
allowed_domains needs to be a list of domains, not a list of URLs.
So it should be:
allowed_domains = ['blog.scrapinghub.com']

Scrapy crawl spider returns broken urls

I am trying to create a simple crawl spider, but the response.url seems to be broken.
The code I am currently running is:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from teatrorecur.items import TeatrorecurItem


class Teatrorecurspider(CrawlSpider):
    name = "teatrorecurspider"
    allowed_domains = ["cartelera.com.uy"]
    start_urls = (
        'http://www.cartelera.com.uy/apeliculafunciones.aspx?,,PELICULAS,OBRA,0,26',
    )

    rules = (
        Rule(LinkExtractor(allow=('CINE&OBRA&-1&29', )), callback='parse_item', follow=False),
        #Rule(LinkExtractor(restrict_xpaths='//a[@href="CINE%2COBRA%2C-1%2C29"]'), follow=False, callback='parse_item'),
        #Rule(LinkExtractor(allow=('CINE&OBRA&-1&29$', )), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        item = TeatrorecurItem()
        item['url'] = response.url
        yield item
A sample URL I'm getting from this code is
<200 http://www.cartelera.com.uy/apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=>
but the corresponding element in the page has the following href value
<a href="http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29">
As you can see, the query string after .aspx? is mangled, and I have no clue what is wrong.
LinkExtractor has an option named canonicalize that defaults to True.
Set it to False like so:
rules = (
    Rule(LinkExtractor(allow=('CINE&OBRA&-1&29',), canonicalize=False), callback='parse_item', follow=False),
)
This prevents LinkExtractor from applying the URL changes described in the definition of canonicalize_url.
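For reference, the canonicalization applied here is w3lib's canonicalize_url, which among other things sorts the query arguments and appends = to valueless parameters, which is exactly the mangling shown above. A quick way to check it (w3lib is installed wherever Scrapy is):

from w3lib.url import canonicalize_url

url = "http://www.cartelera.com.uy/apeliculafunciones.aspx?12415&&CINE&OBRA&-1&29"
print(canonicalize_url(url))
# Expected to print the reordered form seen in the crawl output, e.g.
# http://www.cartelera.com.uy/apeliculafunciones.aspx?-1=&12415=&29=&CINE=&OBRA=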

Scrapy won't get results from first page

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']

    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I am trying to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls every page except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method.
So you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):.
Or you can remove start_urls and start your spider with def start_requests(self):, using parse_vrisko as the callback.
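A related hook worth knowing (not mentioned in the answer above, so treat this as a sketch): CrawlSpider calls parse_start_url for the start_urls responses while still applying the rules, so you can keep parse_vrisko and simply delegate to it for the first page:

def parse_start_url(self, response):
    # Called by CrawlSpider for the start_urls responses; the rules still run,
    # so page 1 is scraped and the ?page=N links are still followed.
    return self.parse_vrisko(response)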