Scrapy issue with CSV output

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv

class myExporter(object):

    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output CSV file, and I can't figure out why.
Any help would be much appreciated; I am out of ideas.

Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    item['category'].encode('utf-8')])
    return item
You already converted the extracted lists into individual items in vriskoSpider.parse_start_url, so item['eponimia'] and item['address'] are already single strings.
zip iterates over those strings character by character:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2

Related

Removing duplicates from response

I am trying to remove duplicate timestamps when I scrape the following site for data on BTC. I want to remove the duplicates after each round of requests is sent, so that Scrapy drops them as they arrive.
However, I cannot understand how the duplicates are supposed to be removed when the response is JSON. I thought they would be removed when I put the JSON into a DataFrame, but that does not happen.
Here's the items pipeline:
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DuplicatesPipeline:

    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        for time in adapter['data']['timestamp']:
            if time in self.ids_seen:
                raise DropItem(f"Duplicate item found: {item!r}")
            else:
                self.ids_seen.add(time)
        return item
The pipeline does not produce any errors, but it is not removing the duplicate timestamps, so it is not working.
Here's the script that I am using to grab the data.
import scrapy
import numpy as np
from collections import defaultdict
import pandas as pd
import time

def storeBitcoin(response):
    bitcoin = defaultdict(list)
    resp = response.json()['data']['KdataInfo']
    for row in range(0, len(resp)):
        bitcoin['timestamp'].append(resp[row]['T'])
        bitcoin['open'].append(resp[row]['O'])
        bitcoin['closed'].append(resp[row]['C'])
        bitcoin['high'].append(resp[row]['H'])
        bitcoin['low'].append(resp[row]['L'])
    return bitcoin

sec_begin = [55, 75]
sec_end = [15, 35]

class BtcSpider(scrapy.Spider):
    name = 'btcc2'
    start_urls = ['https://www.btcc.com/quot/history?']
    custom_settings = {
        'DOWNLOAD_DELAY': 0.2
    }

    def start_requests(self):
        for urls in self.start_urls:
            for begin, end in zip(sec_begin, sec_end):
                yield scrapy.FormRequest(
                    url=urls,
                    method="GET",
                    formdata={
                        'codeid': '3223607',
                        'token': 'm19JU98eIFQjRgwsf9b3eXXI1jmDSW9N',
                        'interval': '35',
                        'from': f'16517697{begin}',
                        'to': f'16518562{end}',
                    },
                    callback=self.parse,
                )

    def parse(self, response):
        data = pd.DataFrame(storeBitcoin(response))
        data = data.drop_duplicates(subset=['timestamp'])
        yield data
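No fix is recorded in this thread, but one way to make a deduplication pipeline work with this kind of nested data is to filter the repeated timestamps inside process_item rather than dropping whole items. The following is only a sketch, not the original code: it assumes the spider yields the plain dict built by storeBitcoin (parallel lists keyed by 'timestamp', 'open', 'closed', 'high', 'low') instead of a pandas DataFrame, and the DedupTimestampsPipeline name is made up for illustration.
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

class DedupTimestampsPipeline:

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        keep = []
        for i, ts in enumerate(adapter['timestamp']):
            if ts not in self.seen:  # keep only timestamps not seen in earlier items
                self.seen.add(ts)
                keep.append(i)
        if not keep:
            raise DropItem("all timestamps in this item are duplicates")
        # trim every parallel list to the surviving row indices
        for key in ('timestamp', 'open', 'closed', 'high', 'low'):
            adapter[key] = [adapter[key][i] for i in keep]
        return item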

Scrapy fetch Start_URLs from MySQL table

I am having issues fetching the start_urls for Scrapy from a table in MySQL.
MySQL has a database called "scrapy" and a table called "urls" with a single column, "url", which holds one URL to scrape per row.
This is the code I have currently, but I feel like I am missing a concept somewhere:
# -*- coding: utf-8 -*-
import scrapy
import datetime
import urlparse
import socket
import MySQLdb
from scrapy.loader import ItemLoader
from example.items import exampleitem

class instantdbSpider(scrapy.Spider):
    name = 'instantdb'
    allowed_domains = ['example.com']

    def start_requests(self):
        conn = MySQLdb.connect(
            user='root',
            passwd='Password!',
            db='scrapy',
            host='localhost',
            charset="utf8",
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM urls')
        rows = cursor.fetchall()
        for row in rows:
            url = row["url"]
        yield Request(url=url)

    def parse(self, response):
        l = ItemLoader(item=exampleitem(), response=response)
        # Scrape fields
        l.add_xpath('title', '//html/head/title/text()')
        l.add_xpath('sku', '//*[@id="js-zoom-image-container"]/div[5]/h2/strong/text()')
        l.add_xpath('price', '//*[@id="main-content"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/p/text()[1]')
        l.add_xpath('product_title', '//html/body/div[1]/span[4]/text()')
        l.add_xpath('image_url', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/a/img/@src')
        l.add_xpath('description', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[7]')
        # Administration fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()
Any help would be greatly appreciated as I seem to be going around in circles. Thanks.
You have an indentation issue:
for row in rows:
    url = row["url"]
    yield Request(url=url)
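For context, here is a sketch of how the corrected loop sits inside start_requests; it assumes Request is imported from scrapy.http, which the snippet in the question does not show:
from scrapy.http import Request  # assumed import; the question only shows "import scrapy"

def start_requests(self):
    conn = MySQLdb.connect(user='root', passwd='Password!', db='scrapy',
                           host='localhost', charset="utf8", use_unicode=True,
                           cursorclass=MySQLdb.cursors.DictCursor)
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM urls')
    rows = cursor.fetchall()
    for row in rows:
        yield Request(url=row["url"])  # yield inside the loop so every URL is scheduled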

How to scrape multiple pages with scrapy?

I'm trying to scrape a table that spans multiple pages. With the following code I can print the data from the first page:
import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
I have written the following code to download all the pages, based on other posts that I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I try to print all the pages I don't get anything. Can anyone help me find the mistake?
Scrapy needs a parse callback first; see the Scrapy docs.
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Or just rewrite the start_requests method with a different callback:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Here is code to crawl all of the pages:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html
from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        # read the total page count, then request the data handler endpoint for each page
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
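To run any of these variants and collect the items, Scrapy's built-in feed export can be used from the command line (the output filename here is just an example):
scrapy crawl education3 -o ner_pages.csv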

Relative path in scrapyd

import scrapy
import csv
from series.items import SeriesItem

class EpisodeScraperSpider(scrapy.Spider):
    name = "episode_scraper"
    allowed_domains = ["imdb.com"]
    start_urls = []

    def __init__(self, id=None, series=None, *args, **kwargs):
        super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
        if id is not None:
            self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
        else:
            with open('series_episode.csv') as f:
                f_csv = csv.DictReader(f)
                for row in f_csv:
                    self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))

    def parse(self, response):
        episodes = response.xpath('//div[contains(@class, "list_item")]')
        title = response.xpath('//h3/a/text()').extract()[0]
        for episode in episodes:
            global title
            item = SeriesItem()
            item['series_episode'] = episode.xpath('div/a/div[contains(@data-const,"tt")]/div/text()').extract()
            item['title'] = '{!s}: {!s}'.format(title, episode.xpath('div[@class="info"]/strong/a/text()').extract())
            item['imdb_id'] = episode.xpath('div[@class="image"]/a/div/@data-const').extract()
            item['airdate'] = [x.strip() for x in episode.xpath('div/div[@class="airdate"]/text()').extract()]
            yield item
When I run this spider in scrapyd I get no results, although it does produce results when run with scrapy directly. I think the problem is in this line:
with open('series_episode.csv') as f:
I don't know where to put my csv file.
Please help me!
Thanks
One option would be to save it in /tmp:
with open('/tmp/series_episode.csv') as f:
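Adapted to the spider from the question, the __init__ would then read as follows (a sketch, assuming the CSV has been copied to /tmp on the machine where scrapyd runs):
def __init__(self, id=None, series=None, *args, **kwargs):
    super(EpisodeScraperSpider, self).__init__(*args, **kwargs)
    if id is not None:
        self.start_urls = ['http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(id, series)]
    else:
        # absolute path, so the file is found regardless of scrapyd's working directory
        with open('/tmp/series_episode.csv') as f:
            f_csv = csv.DictReader(f)
            for row in f_csv:
                self.start_urls.append('http://www.imdb.com/title/{!s}/episodes?season={!s}'.format(row["id"], row["series"]))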

Scrapy won't get results from first page

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem

class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the responses for start_urls go to the parse method,
so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and rename the method from def parse_vrisko(self, response): to def parse(self, response):
Or you can remove start_urls and start your spider with def start_requests(self), using parse_vrisko as the callback; a minimal sketch is below.
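The following is only a sketch of that second option (it is not spelled out in the original answer), and it assumes Request is imported from scrapy.http, which the spider above does not do:
from scrapy.http import Request  # assumed import

def start_requests(self):
    # request the first page explicitly and send it to parse_vrisko,
    # so it is no longer routed to the default parse handling
    yield Request('http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF',
                  callback=self.parse_vrisko)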