I am trying to remove duplicate timestamps when I scrape the following site for BTC data. I want the duplicates to be removed each time a batch of requests is sent, so that Scrapy itself filters them out.
However, I do not understand how duplicates can be removed when the data comes from a JSON response. I had thought the duplicates would be removed once I put the JSON into a DataFrame, but this does not happen.
Here's the items pipeline:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
class DuplicatesPipeline:
    """Drop items whose timestamps were already seen earlier in the crawl.

    Each item is expected to expose ``item['data']['timestamp']`` as an
    iterable of hashable timestamp values.
    """

    def __init__(self):
        # Timestamps of every item accepted so far.
        self.ids_seen = set()

    def process_item(self, item, spider):
        """Return *item* unchanged when none of its timestamps is a duplicate.

        Args:
            item: The scraped item; read through ``ItemAdapter``.
            spider: The spider that produced the item (unused).

        Raises:
            DropItem: If a timestamp repeats inside the item or was already
                recorded from a previously accepted item.
        """
        adapter = ItemAdapter(item)
        timestamps = list(adapter['data']['timestamp'])
        unique = set(timestamps)
        # Validate the whole item before touching ids_seen: recording the
        # timestamps of an item that ends up being dropped would make later
        # items carrying the same timestamps look like duplicates as well.
        repeats_within_item = len(unique) != len(timestamps)
        if repeats_within_item or not self.ids_seen.isdisjoint(unique):
            raise DropItem(f"Duplicate item found: {item!r}")
        self.ids_seen |= unique
        return item
The pipeline does not produce any errors; however, it does not remove the duplicate timestamps, so it is not working.
Here's the script that I am using to grab the data.
import scrapy
import numpy as np
from collections import defaultdict
import pandas as pd
import time
def storeBitcoin(response):
    """Collect the rows of a history response into per-field columns.

    Args:
        response: Object whose ``json()`` method returns a mapping holding
            the rows under ``['data']['KdataInfo']``. Every row must carry
            the keys ``T``, ``O``, ``C``, ``H`` and ``L``.

    Returns:
        A ``defaultdict(list)`` with the columns ``timestamp``, ``open``,
        ``closed``, ``high`` and ``low``, each holding one value per row in
        response order. It has no keys at all when there are no rows.
    """
    # Output column name -> key of that value in a response row.
    fields = (
        ('timestamp', 'T'),
        ('open', 'O'),
        ('closed', 'C'),
        ('high', 'H'),
        ('low', 'L'),
    )
    bitcoin = defaultdict(list)
    for row in response.json()['data']['KdataInfo']:
        for column, key in fields:
            bitcoin[column].append(row[key])
    return bitcoin
# Two-digit suffixes spliced into the 'from' query parameter by
# BtcSpider.start_requests (f'16517697{begin}').
# NOTE(review): these look like the trailing digits of Unix epoch seconds - confirm.
sec_begin = [55, 75]
# Two-digit suffixes spliced into the 'to' query parameter (f'16518562{end}');
# paired element-wise with sec_begin via zip().
# NOTE(review): the two resulting from/to windows presumably overlap almost
# entirely, which would explain the duplicate timestamps - confirm.
sec_end = [15, 35]
class BtcSpider(scrapy.Spider):
    """Request BTC price history from btcc.com, one request per time window."""

    name = 'btcc2'
    start_urls = ['https://www.btcc.com/quot/history?']
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
    }

    def start_requests(self):
        """Yield one GET request per (sec_begin, sec_end) pair and start URL."""
        for urls in self.start_urls:
            for begin, end in zip(sec_begin, sec_end):
                yield scrapy.FormRequest(
                    url=urls,
                    method="GET",
                    formdata={
                        'codeid': '3223607',
                        'token': 'm19JU98eIFQjRgwsf9b3eXXI1jmDSW9N',
                        'interval': '35',
                        'from': f'16517697{begin}',
                        'to': f'16518562{end}',
                    },
                    callback=self.parse,
                )

    def parse(self, response):
        """Yield the rows of one response, de-duplicated by timestamp.

        The rows are emitted as a single item of the form
        ``{'data': {column: [values, ...]}}``, which is the layout
        ``DuplicatesPipeline`` reads (``item['data']['timestamp']``).
        """
        columns = storeBitcoin(response)
        if not columns:
            # No rows: there is no 'timestamp' column to de-duplicate on.
            return
        data = pd.DataFrame(columns)
        data = data.drop_duplicates(subset=['timestamp'])
        # Scrapy only accepts item objects (dict, Item, dataclass, attrs).
        # A DataFrame is rejected by the engine, so item pipelines would
        # never see it; convert it to plain lists wrapped in a dict.
        yield {'data': data.to_dict('list')}
Related
I used Selenium with Firefox to successfully call a specific API and saved the session cookies using pickle. Where I am stuck now is loading those cookies into a Scrapy spider so that the request returns a 200 status.
Below is the unsuccessful approach I used:
import scrapy
import os
import json
import pickle
class ProductsSpider(scrapy.Spider):
    """Fetch product records from the Woolworths products API using saved cookies."""

    name = "Products"
    start_urls = ["https://www.woolworths.com.au/apis/ui/products/305224,221667,305223,317793,341058,201689,221228,230414,201688,221029"]
    # Query-string parameters appended to the API URL in start_requests().
    params = {"excludeUnavailable": "true", "source": "RR-Best Sellers"}
    #with open("./woolworths.json", 'r') as inputfile:
    # cookie = json.load(inputfile)
    # SECURITY: pickle.load() can execute arbitrary code stored in the file;
    # only load a cookie file that you created yourself.
    with open("./woolworths.pkl", 'rb') as f:
        cookies = pickle.load(f)

    def start_requests(self):
        """Request the product API with the saved cookies and query parameters."""
        from urllib.parse import urlencode

        # Request.meta is only passed along to the callback and is never sent
        # to the server, so the parameters have to be part of the URL itself.
        # NOTE(review): assumes `params` are meant as query-string
        # parameters - confirm against the API.
        url = f"{self.start_urls[0]}?{urlencode(self.params)}"
        yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse)

    def parse(self, response):
        """Yield one ``{'Name': ...}`` item per record of the JSON response."""
        data = response.json()
        for a in data:
            print(a)
            # A bare string is not a valid Scrapy item; wrap it in a dict.
            yield {'Name': a['Name']}
I want to set up multiple spiders and run them together, where URL_LIST contains the main URLs and DATA_LIST contains URLs embedded in those main pages. The example below does not represent this case exactly, because it uses the quotes site, but this is the purpose of the set-up. I then want to crawl with both spiders and store the results. However, I am unsure how to start the crawl, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
# Start URLs of QuotesSpiderWebsiteA; store_url() also appends scraped items to this list.
URL_LIST = ['https://quotes.toscrape.com/tag/love/',
'https://quotes.toscrape.com/tag/inspirational/']
# Start URLs of QuotesSpiderWebsiteB; store_data() also appends scraped items to this list.
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
'https://quotes.toscrape.com/tag/humor/']
def store_url(*args, **kwargs):
    """Signal handler: append the scraped item (``kwargs['item']``) to URL_LIST."""
    URL_LIST.append(kwargs['item'])
def store_data(*args, **kwargs):
    """Signal handler: append the scraped item (``kwargs['item']``) to DATA_LIST."""
    DATA_LIST.append(kwargs['item'])
class QuotesSpiderWebsiteA(scrapy.Spider):
    """Crawl the pages in URL_LIST and emit the first link of every row div."""

    name = 'val'
    start_urls = URL_LIST
    custom_settings = {
        'FEEDS': {
            'quotes.jl': {
                'format': 'jsonlines',
            },
        },
    }

    def start_requests(self):
        """Yield one request per start URL, handled by parse()."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse,
            )

    def parse(self, response):
        """Yield ``{'some_items_links': href}`` for each ``div`` of class "row"."""
        # XPath addresses attributes with '@' (@class, @href); '#' is not
        # valid XPath syntax.
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
class QuotesSpiderWebsiteB(scrapy.Spider):
    """Crawl the pages in DATA_LIST and emit the first link of every row div."""

    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {
        'FEEDS': {
            'quotes.jl': {
                'format': 'jsonlines',
            },
        },
    }

    def start_requests(self):
        """Yield one request per start URL, handled by parse()."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse,
            )

    def parse(self, response):
        """Yield ``{'some_items_links': href}`` for each ``div`` of class "row"."""
        # XPath addresses attributes with '@' (@class, @href); '#' is not
        # valid XPath syntax.
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }
if __name__ == '__main__':
    # NOTE(review): this script is meant to be started with `python <file>`.
    # The module-level `from twisted.internet import reactor` is presumably
    # what triggers ReactorAlreadyInstalledError under `scrapy crawl val` -
    # confirm, and consider moving that import into this block.
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        """Run both spiders one after the other, then stop the reactor."""
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        # Collect every scraped item through the module-level handlers.
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()

    # Schedule the crawls, then block until crawl() stops the reactor.
    crawl()
    reactor.run()
In CrawlSpider, how can I scrape the marked field "4 days ago" in the image before extracting each link?
The below-mentioned CrawlSpider is working fine. But in 'parse_item' I want to add a new field named 'Add posted' where I want to get the field marked on the image.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class PropertySpider(CrawlSpider):
    """Follow OpenRent listing links and emit title, price and URL per property."""

    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    # range(0, 5, 20) produces only 0, so a single listing page is requested.
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x)
        for x in range(0, 5, 20)
    ]
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield the scraped fields of one property page."""
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            # TODO: placeholder - the "posted ... ago" value shown on the
            # listing page still has to be extracted.
            'Add posted': None,
        }
When using the Rule object of the scrapy crawl spider, the extracted link's text is saved in a meta field of the request named link_text. You can obtain this value in the parse_item method and extract the time information using regex. You can read more about it from the docs. See below example.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
class PropertySpider(CrawlSpider):
    """Follow OpenRent listing links and emit title, price, URL and posting age."""

    name = 'property'
    allowed_domains = ['www.openrent.co.uk']
    # range(0, 5, 20) produces only 0, so a single listing page is requested.
    start_urls = [
        'https://www.openrent.co.uk/properties-to-rent/london?term=London&skip=' + str(x)
        for x in range(0, 5, 20)
    ]
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths="//div[@id='property-data']/a"),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield the scraped fields of one property page.

        ``Add posted`` is taken from the text of the link that led to this
        page and is ``None`` when that text is missing or does not contain
        a "Last Updated ... ago" phrase.
        """
        # The link text may be absent from meta; fall back to an empty
        # string so re.search() never receives None.
        link_text = response.request.meta.get("link_text") or ""
        m = re.search(r"(Last Updated.*ago)", link_text)
        # Always bind `posted`; previously it was undefined on a regex miss.
        posted = m.group(1).replace("\xa0", " ") if m else None
        yield {
            'Title': response.xpath("//h1[@class='property-title']/text()").get(),
            'Price': response.xpath("//h3[@class='perMonthPrice price-title']/text()").get(),
            'Links': response.url,
            "Add posted": posted,
        }
To show in a loop, you can use the following xpath to receive that data point:
x = response.xpath('//div[#class="timeStamp"]')
for i in x:
yield {'result': i.xpath("./i/following-sibling::text()").get().strip() }
Scrapy newbie here. I'm currently trying to extend my crawlspider so it can take in multiple arguments from a text document (instead of manually typing in each arguments into the command line like scrapy crawl crawl5 -a start_url="argument"). Currently, I can input one argument and that generates a few items. But I'd like some guidance on two problems:
How can I create an item for each argument?
How can I use that item as a container for the items I generate from each argument?
My goal here is to sort of imitate running my crawlspider multiple times, while keeping the items returned from each argument separate.
EDIT.. here's my code- as you can see it's a scraper for thesaurus.com
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from thesaurus.items import ThesaurusItem
class MySpider(CrawlSpider):
    """Crawl thesaurus.com synonym pages for the word given as ``-a start_url=<word>``."""

    name = 'crawl5'

    def __init__(self, *args, **kwargs):
        """Build start URL and crawl rules from the ``start_url`` argument."""
        word = kwargs.get('start_url')
        self.start_urls = ["http://www.thesaurus.com/browse/%s" % word]
        self.allowed_domains = ["thesaurus.com"]
        # The rules are assigned before the parent constructor runs.
        self.rules = (
            # restrict_xpaths must select elements (regions to search for
            # links), not attribute values, and the id test needs '@id'.
            Rule(LinkExtractor(restrict_xpaths=("//div[@id='paginator']//a",))),
            Rule(
                LinkExtractor(allow=(
                    'http://www.thesaurus.com/browse/%s/.$' % word,
                    'http://www.thesaurus.com/browse/%s/..$' % word,
                )),
                callback='parse_item',
                follow=True,
            ),
        )
        super(MySpider, self).__init__(*args, **kwargs)

    def _parse_synonyms(self, response):
        """Yield one ThesaurusItem per 'syn_of_syns' block of the page."""
        for sel in response.xpath("//div[contains(@class, 'syn_of_syns')]"):
            print(sel)
            item = ThesaurusItem()
            item['mainsynonym'] = sel.xpath("div/div/div/a/text()").extract()
            item['definition'] = sel.xpath("div/div/div[@class='def']/text()").extract()
            item['secondarysynonym'] = sel.xpath('div/div/ul/li/a/text()').extract()
            yield item

    def parse_start_url(self, response):
        """Extract items from the start page itself."""
        return self._parse_synonyms(response)

    def parse_item(self, response):
        """Extract items from pages reached through the crawl rules."""
        return self._parse_synonyms(response)
Here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request
class vriskoSpider(CrawlSpider):
    """Crawl vrisko.gr search results and attach each entry's category.

    Written against the legacy ``scrapy.contrib`` API.
    """

    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    # Raw string so the regex escapes (\? and \d) reach the regex engine as written.
    rules = (Rule(SgmlLinkExtractor(allow=(r'\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        """Yield one detail-page request per search result.

        Each request carries a partially filled VriskoItem (``eponimia`` and
        ``address`` as single strings) in ``request.meta['vriskoit']``.
        """
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        # zip() pairs the three result lists entry by entry.
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        """Complete the item from ``response.meta`` with its category and yield it.

        ``category`` is the list returned by ``extract()``, unlike the
        single-string ``eponimia`` and ``address`` fields.
        """
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
And here is my pipeline:
import csv
class myExporter(object):
    """Item pipeline that writes every item to ``brandCategoryTable.csv``."""

    def __init__(self):
        # Keep the file object so that close_spider() can flush and close it.
        self._file = open('brandCategoryTable.csv', 'wb')
        self.brandCategoryCsv = csv.writer(self._file)
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        """Write the fields of *item* to the CSV file and return the item."""
        # NOTE: 'eponimia' and 'address' are single strings, so zip() walks
        # them character by character - this is the behaviour the question
        # is about, and it is deliberately left unchanged here.
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item

    def close_spider(self, spider):
        """Close the CSV file when the spider finishes so buffered rows are written."""
        self._file.close()
My problem is that for the first two fields (eponimia, address), only the first character is written to the output CSV file, and I can't find out why.
Any help would be much appreciated; I am out of ideas.
Remove zip function from myExporter.process_item
def process_item(self, item, spider):
    """Write one CSV row (eponimia, address, category) for *item* and return it.

    ``eponimia`` and ``address`` are single strings. ``category`` may be a
    single string or a list of strings (the spider stores the result of
    ``extract()``); a list is joined with ", " into one cell.
    """
    category = item['category']
    if isinstance(category, (list, tuple)):
        # A list has no .encode(); collapse it into one string first.
        category = u', '.join(category)
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    category.encode('utf-8')])
    return item
You already converted items list to individual items in vriskoSpider.parse_start_url.
zip iterates your strings:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
...: print x, y
...:
t t
e e
s s
t t
1 2