Scrapy: export the data to files by URL path

How can I change Scrapy's source code in order to save files by URL when I export the data from HTML pages?
For example:
this page (http://example/big/ppp) has lots of page links:
http://example/big/ppp/a
http://example/big/ppp/b
http://example/big/ppp/c
......
and I want to save the data from
http://example/big/ppp/a in d:/ppp/a.csv
http://example/big/ppp/b in d:/ppp/b.csv
http://example/big/ppp/c in d:/ppp/c.csv
because this page (http://example/big/ppp) has lots of links like
http://example/big/ppp/a, http://example/big/ppp/b.
So could you help me, kind person!

You can use a Scrapy pipeline to do this job. Add a field to the item you are going to export, for example one named 'source' (e.g. http://example/big/ppp/a), to record where the item came from:
from scrapy import signals
from scrapy.exporters import CsvItemExporter  # scrapy.contrib.exporter in old Scrapy versions

class MyCsvPipeline(object):
    def __init__(self):
        self.csvfiles = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_closed(self, spider):
        # finish every exporter and close every file when the spider stops
        for e in self.exporter.values():
            e.finish_exporting()
        for f in self.csvfiles.values():
            f.close()

    def process_item(self, item, spider):
        # the last part of the source URL becomes the file name, e.g. ".../ppp/a" -> "a.csv"
        csv = item['source'].split('/')[-1] + '.csv'
        if csv not in self.csvfiles:
            newfile = open('d:/ppp/' + csv, 'wb')
            self.csvfiles[csv] = newfile
            self.exporter[csv] = CsvItemExporter(newfile)
            self.exporter[csv].start_exporting()
        self.exporter[csv].export_item(item)
        return item
Apply this pipeline in settings.py:
ITEM_PIPELINES = {
    'xxxx.pipelines.MyCsvPipeline': 300,
}
Another option:
use scrapy crawl xxx -t csv -o all.csv --loglevel=INFO to export all items to a single CSV, then use another script to split it into smaller CSVs according to 'source'; a sketch follows.
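A minimal sketch of such a splitting script, assuming all.csv contains a 'source' column and the output should go to d:/ppp as above:
# split_by_source.py -- a rough sketch, not part of the Scrapy project itself
import csv
import os

OUTPUT_DIR = 'd:/ppp'

with open('all.csv', newline='', encoding='utf-8') as source_file:
    reader = csv.DictReader(source_file)
    outputs = {}  # maps output file name -> (file handle, csv.DictWriter)
    for row in reader:
        # the last path segment of the source URL becomes the file name, e.g. ".../ppp/a" -> "a.csv"
        name = row['source'].rstrip('/').split('/')[-1] + '.csv'
        if name not in outputs:
            handle = open(os.path.join(OUTPUT_DIR, name), 'w', newline='', encoding='utf-8')
            writer = csv.DictWriter(handle, fieldnames=reader.fieldnames)
            writer.writeheader()
            outputs[name] = (handle, writer)
        outputs[name][1].writerow(row)
    for handle, _ in outputs.values():
        handle.close()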

Related

Scrapy register a custom clean line by line item exporter

So, this should have been simpler, and in the end it was quite simple, but the Scrapy documentation does leave some finding out to do... So anyway, this is a Q + A:
How to write Scrapy items "as is" line by line to a text file?
Basically you need to register an item exporter, and then tell Scrapy that you want to use it from the command line:
Create a file named lines_exporter.py:
from scrapy.exporters import BaseItemExporter
from scrapy.utils.python import to_bytes

class LinesExporter(BaseItemExporter):
    def __init__(self, file, **kwargs):
        super().__init__(dont_fail=True, **kwargs)
        self.file = file
        self._kwargs.setdefault('ensure_ascii', not self.encoding)

    def export_item(self, item):
        # serialize the item's fields and write them as one comma-separated line
        itemdict = dict(self._get_serialized_fields(item))
        data = ', '.join(str(v) for v in itemdict.values()) + '\n'  # str() so non-string values don't break the join
        self.file.write(to_bytes(data, self.encoding))
Add/edit the following in your settings.py:
FEED_EXPORTERS = {
    'lines': 'project_name.lines_exporter.LinesExporter',
}
When invoking Scrapy from the command line, specify the output format flag:
-t lines
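For example, the full command could look like this (the spider name and output file name are placeholders):
scrapy crawl my_spider -o items.txt -t lines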
Enjoy!

Trying to scrape a website with scrapy - Not receiving any data

For an assignment I have to fetch data from a Kaercher webshop. The data I need to fetch is the product title, description and price.
Additionally I need to be able to fetch multiple products (high pressure cleaners, vacuum cleaners, ...) with the same script. So I probably need to make a .csv keyword file or something to adjust the URL accordingly.
However, I can't seem to fetch the data with my current script.
Info: I will add my entire file structure and current code. I only adjusted the actual spider file (karcher_crawler.py), the other files are mostly default.
My folder structure:
scrapy_karcher/                  # Project root directory
    scrapy.cfg                   # Contains the configuration information to deploy the spider
    scrapy_karcher/              # Project's python module
        __init__.py
        items.py                 # Describes the definition of each item that we’re scraping
        middlewares.py           # Project middlewares
        pipelines.py             # Project pipelines file
        settings.py              # Project settings file
        spiders/                 # All the spider code goes into this directory
            __init__.py
            karcher_crawler.py   # The spider
My "karcher_crawler.py" code
import scrapy

class KarcherCrawlerSpider(scrapy.Spider):
    name = 'karcher_crawler'
    start_urls = [
        'https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html'
    ]

    def parse(self, response):
        products = response.xpath("//div[@class='col-sm-3 col-xs-6 fg-products-item']")
        # iterating over search results
        for product in products:
            # Defining the XPaths
            XPATH_PRODUCT_NAME = ".//div[@class='product-info']//h6[contains(@class,'product-label')]//a/text()"
            XPATH_PRODUCT_PRICE = ".//div[@class='product-info']//div[@class='product-price']//span/text()"
            XPATH_PRODUCT_DESCRIPTION = ".//div[@class='product-info']//div[@class='product-description']//a/text()"
            raw_product_name = product.xpath(XPATH_PRODUCT_NAME).extract()
            raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
            raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
            # cleaning the data
            product_name = ''.join(raw_product_name).strip() if raw_product_name else None
            product_price = ''.join(raw_product_price).strip() if raw_product_price else None
            product_description = ''.join(raw_product_description).strip() if raw_product_description else None
            yield {
                'product_name': product_name,
                'product_price': product_price,
                'product_description': product_description,
            }
My "items.py" code:
import scrapy

class ScrapyKarcherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
My "pipelines.py" code:
class ScrapyKarcherPipeline(object):
    def process_item(self, item, spider):
        return item
my "scrapy.cfg" code:
[settings]
default = scrapy_karcher.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_karcher
I managed to request the required data using the following code:
Spider file (.py):
import scrapy
from krc.items import KrcItem
import json

class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def parse(self, response):
        # the endpoint returns JSON, so parse it instead of using XPath
        data = json.loads(response.text)
        for company in data.get('products', []):
            item = KrcItem()  # create a fresh item for every product
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
Items file (.py):
import scrapy

class KrcItem(scrapy.Item):
    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
Thanks to @gangabass I managed to locate the URLs which contain the data I needed to extract. (You can find them in the "Network" tab when you are inspecting a webpage: press F12 or right-click anywhere and choose Inspect.)
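To cover multiple product categories from a keyword file (as mentioned in the question), one rough sketch is to read category ids from a CSV and build the API URLs in start_requests. The file name, its layout, and the assumption that the number in the API path identifies a product category are mine, not confirmed by the site:
import csv
import json
import scrapy
from krc.items import KrcItem

class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]

    def start_requests(self):
        # keywords.csv is a hypothetical file with one category id per row, e.g. "20035386"
        with open("keywords.csv", newline="") as f:
            for row in csv.reader(f):
                url = ("https://www.kaercher.com/api/v1/products/search/"
                       "shoppableproducts/partial/{}?page=1&size=8&isocode=nl-NL").format(row[0])
                yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        data = json.loads(response.text)
        for company in data.get("products", []):
            item = KrcItem()
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item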

Scrapy image pipeline: How to drop image on checksum?

I am scraping some images using the Scrapy image pipeline and want to drop images that match a certain hash from the import.
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
item['images'] = response.xpath('//meta[@property="og:image"][not(contains(@content, "Demo_600x600"))]/@content').extract()[0:self.max_pix]
Images:
url "https://www.example.de…212-B726-757P-A20D-1.jpg"
path "full/56de72acb6c1e12ffa8644c1bb96df4edf421438.jpg"
checksum "e206446c40c22cfd5f94966c337b56cc"
How can I make sure this image is excluded from the import?
You can try overriding the get_images method of ImagesPipeline; the image will not be stored if its checksum is in the drop list.
import logging
from io import BytesIO

from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum

logger = logging.getLogger(__name__)

class MyImagesPipeline(ImagesPipeline):
    # ... keep your get_media_requests / item_completed from above ...

    def get_images(self, response, request, info):
        checksum = md5sum(BytesIO(response.body))
        drop_list = ['hash1', 'hash2']  # e.g. 'e206446c40c22cfd5f94966c337b56cc'
        logger.debug('Verifying Checksum: {}'.format(checksum))
        if checksum in drop_list:
            logger.debug('Skipping Checksum: {}'.format(checksum))
            raise Exception('Dropping Image')
        return super(MyImagesPipeline, self).get_images(response, request, info)
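For this to run, the pipeline also has to be enabled in settings.py; a minimal sketch (the project name and storage path are placeholders):
ITEM_PIPELINES = {
    'myproject.pipelines.MyImagesPipeline': 1,
}
IMAGES_STORE = '/path/to/images'  # where the pipeline stores the downloaded files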

Empty .json file

I have written this short spider code to extract titles from the Hacker News front page (http://news.ycombinator.com/).
import scrapy

class HackerItem(scrapy.Item):  # declaring the item
    hackertitle = scrapy.Field()

class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared up
        # xpath of the titles
        item['hackertitle'] = sel.xpath("//tr[@class='athing']/td[3]/a[@href]/text()").extract()
        # printing titles using print statement.
        print(item['hackertitle'])
However, when I run the code with scrapy crawl hackernewscrawler -o hntitles.json -t json,
I get an empty .json file that does not have any content in it.
You should change the print statement to yield:
import scrapy

class HackerItem(scrapy.Item):  # declaring the item
    hackertitle = scrapy.Field()

class HackerSpider(scrapy.Spider):
    name = 'hackernewscrawler'
    allowed_domains = ['news.ycombinator.com']  # website we chose
    start_urls = ['http://news.ycombinator.com/']

    def parse(self, response):
        sel = scrapy.Selector(response)  # selector to help us extract the titles
        item = HackerItem()  # the item declared up
        # xpath of the titles
        item['hackertitle'] = sel.xpath("//tr[@class='athing']/td[3]/a[@href]/text()").extract()
        # yield the item instead of printing it
        yield item
Then run:
scrapy crawl hackernewscrawler -o hntitles.json -t json

scrapy only crawl 1 picture

I want to crawl the images of this link: "http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html", but the code only crawls one picture on my computer while it crawls all the pictures on my friend's computer. Please help me.
import scrapy
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors import LinkExtractor
from imgur.items import ImgurItem

class ImgurSpider(CrawlSpider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']
    # rules = [Rule(LinkExtractor(allow=['/*']), 'parse123')]

    def parse(self, response):
        image = ImgurItem()
        # image['title'] = response.xpath(
        #     "//img[data-notes-url='']").extract()
        rel = response.xpath("//div[@id='article_content']//img/@src").extract()
        image['image_urls'] = [rel[0]]
        return image
rel = response.xpath("//div[@id='article_content']//img/@src").extract()
image['image_urls'] = [rel[0]]
You take only one link by specifying the [0] index.
Try
image['image_urls'] = rel
You can also split your code into a URL-parsing function and a callback for downloading the images.
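A rough sketch of that, assuming the standard ImagesPipeline is enabled in settings.py (ITEM_PIPELINES plus IMAGES_STORE), so that the pipeline downloads the files while parse only collects the URLs:
import scrapy
from imgur.items import ImgurItem

class ImgurSpider(scrapy.Spider):
    name = 'imgur'
    allowed_domains = ['vnexpress.net']
    start_urls = ['http://vnexpress.net/photo/cuoc-song-do-day/nguoi-trung-quoc-ra-be-boi-danh-mat-chuoc-tranh-nong-3445592.html']

    def parse(self, response):
        image = ImgurItem()
        # hand every image URL on the page to the images pipeline instead of only rel[0]
        image['image_urls'] = response.xpath("//div[@id='article_content']//img/@src").extract()
        yield image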