Scrapy image pipeline: How to drop image on checksum?

I am scraping some images using the Scrapy image pipeline and want to drop images whose checksum matches a certain hash.
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
item['images'] = response.xpath('//meta[@property="og:image"][not(contains(@content, "Demo_600x600"))]/@content').extract()[0:self.max_pix]
Images:
url "https://www.example.de…212-B726-757P-A20D-1.jpg"
path "full/56de72acb6c1e12ffa8644c1bb96df4edf421438.jpg"
checksum "e206446c40c22cfd5f94966c337b56cc"
How can I make sure this image will be excluded within the import?

You can try overriding the get_images method of ImagesPipeline. The image will not be stored if its checksum matches one in your drop list.
import logging
from io import BytesIO

from scrapy.utils.misc import md5sum

logger = logging.getLogger(__name__)


# add this method to MyImagesPipeline
def get_images(self, response, request, info):
    checksum = md5sum(BytesIO(response.body))
    drop_list = ['hash1', 'hash2']  # checksums of the images you want to exclude
    logger.debug('Verifying Checksum: {}'.format(checksum))
    if checksum in drop_list:
        logger.debug('Skipping Checksum: {}'.format(checksum))
        raise Exception('Dropping Image')
    return super(MyImagesPipeline, self).get_images(response, request, info)
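To wire this up, the custom pipeline still has to be enabled in the project settings. A minimal sketch follows, assuming the class lives in myproject/pipelines.py (the module path and the store directory are placeholders). As a side note, raising scrapy.pipelines.images.ImageException instead of a bare Exception would also work; it is the exception the built-in pipeline itself raises for images it skips.

# settings.py (sketch; 'myproject' and the store path are placeholders)
ITEM_PIPELINES = {
    'myproject.pipelines.MyImagesPipeline': 1,
}
IMAGES_STORE = 'downloaded_images'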

Related

Trying to scrape a website with scrapy - Not receiving any data

For an assignment I have to fetch data from a Kaercher webshop. The data I need to fetch is the product title, description and price.
Additionally I need to be able to fetch multiple products (high pressure cleaners, vacuum cleaners, ...) with the same script. So I probably need to make a .csv keyword file or something to adjust the URL accordingly.
However, I can't seem to fetch the data with my current script.
Info: I will add my entire file structure and current code. I only adjusted the actual spider file (karcher_crawler.py), the other files are mostly default.
My folder structure:
scrapy_karcher/                 # Project root directory
    scrapy.cfg                  # Contains the configuration information to deploy the spider
    scrapy_karcher/             # Project's Python module
        __init__.py
        items.py                # Describes the definition of each item that we're scraping
        middlewares.py          # Project middlewares
        pipelines.py            # Project pipelines file
        settings.py             # Project settings file
        spiders/                # All the spider code goes into this directory
            __init__.py
            karcher_crawler.py  # The spider
My "karcher_crawler.py" code
import scrapy


class KarcherCrawlerSpider(scrapy.Spider):
    name = 'karcher_crawler'
    start_urls = [
        'https://www.kaercher.com/nl/webshop/hogedrukreinigers-resultaten.html'
    ]

    def parse(self, response):
        products = response.xpath("//div[@class='col-sm-3 col-xs-6 fg-products-item']")
        # iterating over search results
        for product in products:
            # Defining the XPaths
            XPATH_PRODUCT_NAME = ".//div[@class='product-info']//h6[contains(@class,'product-label')]//a/text()"
            XPATH_PRODUCT_PRICE = ".//div[@class='product-info']//div[@class='product-price']//span/text()"
            XPATH_PRODUCT_DESCRIPTION = ".//div[@class='product-info']//div[@class='product-description']//a/text()"
            raw_product_name = product.xpath(XPATH_PRODUCT_NAME).extract()
            raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
            raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
            # cleaning the data
            product_name = ''.join(raw_product_name).strip() if raw_product_name else None
            product_price = ''.join(raw_product_price).strip() if raw_product_price else None
            product_description = ''.join(raw_product_description).strip() if raw_product_description else None
            yield {
                'product_name': product_name,
                'product_price': product_price,
                'product_description': product_description,
            }
My "items.py" code:
import scrapy


class ScrapyKarcherItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
My "pipelines.py" code:
class ScrapyKarcherPipeline(object):
    def process_item(self, item, spider):
        return item
my "scrapy.cfg" code:
[settings]
default = scrapy_karcher.settings
[deploy]
#url = http://localhost:6800/
project = scrapy_karcher
I managed to request the required data using the following code:
Spider file (.py)
import scrapy
from krc.items import KrcItem
import json


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']

    def parse(self, response):
        item = KrcItem()
        data = json.loads(response.text)
        for company in data.get('products', []):
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
Items file (.py):
import scrapy


class KrcItem(scrapy.Item):
    productid = scrapy.Field()
    name = scrapy.Field()
    description = scrapy.Field()
    price = scrapy.Field()
Thanks to @gangabass I managed to locate the URLs which contain the data I needed to extract (you can find them in the "Network" tab when you are inspecting a webpage; press F12 or right-click anywhere and choose Inspect).
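If you need more than the first 8 results, or want to reuse the spider for other categories, here is a hedged sketch of how the same spider could walk the paginated API. The page/size parameters and the category id 20035386 come from the URL above; the stop condition (keep paging until the API returns an empty products list) is my assumption and may need adjusting.

import json

import scrapy
from krc.items import KrcItem


class KRCSpider(scrapy.Spider):
    name = "krc_spider"
    allowed_domains = ["kaercher.com"]
    # 20035386 is the category id from the URL above; other categories would use other ids
    base_url = 'https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL'
    start_urls = [base_url.format(page=1)]

    def parse(self, response):
        data = json.loads(response.text)
        products = data.get('products', [])
        for company in products:
            item = KrcItem()
            item["productid"] = company["id"]
            item["name"] = company["name"]
            item["description"] = company["description"]
            item["price"] = company["priceFormatted"]
            yield item
        # assumption: keep requesting the next page until the API returns no products
        if products:
            current_page = int(response.url.split('page=')[1].split('&')[0])
            yield scrapy.Request(self.base_url.format(page=current_page + 1), callback=self.parse)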

How to load a zip file (containing shp) from s3 bucket to Geopandas?

I zipped the name.shp, name.shx and name.dbf files and uploaded them into an AWS S3 bucket. Now I want to load this zip file and convert the contained shapefile into a geopandas GeoDataFrame.
I can do it perfectly if the file is a zipped GeoJSON instead of a zipped shapefile.
import io
import boto3
import geopandas as gpd
import zipfile

cliente = boto3.client("s3", aws_access_key_id=ak, aws_secret_access_key=sk)
bucket_name = 'bucketname'
object_key = 'myfolder/locations.zip'

bytes_buffer = io.BytesIO()
cliente.download_fileobj(Bucket=bucket_name, Key=object_key, Fileobj=bytes_buffer)
geojson = bytes_buffer.getvalue()

with zipfile.ZipFile(bytes_buffer) as zi:
    with zi.open("locations.shp") as file:
        print(gpd.read_file(file.read().decode('ISO-8859-9')))
I got this error:
ç­¤íEÀ¡ËÆ3À: No such file or directory
The geopandas package can read files directly from S3, and as mentioned in the other answer it can read zip files as well. The line below reads a zip file from S3 without downloading it first: start the path with zip+s3://, then add the path within S3.
geopandas.read_file(f'zip+s3://bucket-name/file.zip')
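A hedged usage sketch of that one-liner with the bucket and key from the question. It assumes fiona/GDAL resolve the S3 credentials from the standard AWS environment variables (or ~/.aws/credentials), since read_file does not take the boto3 client.

import os

import geopandas as gpd

# assumption: credentials come from the environment; ak/sk are the variables from the question
os.environ['AWS_ACCESS_KEY_ID'] = ak
os.environ['AWS_SECRET_ACCESS_KEY'] = sk

gdf = gpd.read_file('zip+s3://bucketname/myfolder/locations.zip')
print(gdf.head())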
You can read the zip directly; there is no need to use zipfile. You need all parts of the shapefile, not just the .shp itself, which is why it works with GeoJSON. You just need to pass the path with zip:///. So instead of
gpd.read_file('path/file.shp')
You go with
gpd.read_file('zip:///path/file.zip')
I am not familiar enough with boto3 to know at which point you actually have this path, but I think it will help.
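If you prefer to keep the boto3 download from the question, here is a sketch under the assumption that writing the buffer to a temporary file is acceptable; geopandas can then open the archive through the zip:// scheme described above.

import io
import tempfile

import boto3
import geopandas as gpd

cliente = boto3.client("s3", aws_access_key_id=ak, aws_secret_access_key=sk)

bytes_buffer = io.BytesIO()
cliente.download_fileobj(Bucket='bucketname', Key='myfolder/locations.zip', Fileobj=bytes_buffer)

# write the downloaded archive to disk so fiona/GDAL can open it via zip://
with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
    tmp.write(bytes_buffer.getvalue())
    tmp_path = tmp.name

gdf = gpd.read_file(f'zip://{tmp_path}')
print(gdf.head())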
I do not know if it can be of any help, but I faced a similar problem recently, though I only wanted to read the .shp with fiona. Like others, I ended up zipping the relevant .shp, .dbf, .cpg and .shx files on the bucket.
And to read from the bucket, I do it like so:
from io import BytesIO
from pathlib import Path
from typing import List
from typing import Union

import boto3
import fiona
from fiona.io import ZipMemoryFile
from pydantic import BaseSettings
from shapely.geometry import Point
from shapely.geometry import Polygon


class S3Configuration(BaseSettings):
    """
    S3 configuration class
    """
    s3_access_key_id: str = ''
    s3_secret_access_key: str = ''
    s3_region_name: str = ''
    s3_endpoint_url: str = ''
    s3_bucket_name: str = ''
    s3_use: bool = False


S3_CONF = S3Configuration()
S3_STR = 's3'
S3_SESSION = boto3.session.Session()
S3 = S3_SESSION.resource(
    service_name=S3_STR,
    aws_access_key_id=S3_CONF.s3_access_key_id,
    aws_secret_access_key=S3_CONF.s3_secret_access_key,
    endpoint_url=S3_CONF.s3_endpoint_url,
    region_name=S3_CONF.s3_region_name,
    use_ssl=True,
    verify=True,
)
BUCKET = S3_CONF.s3_bucket_name

CordexShape = Union[Polygon, List[Polygon], List[Point]]

ZIP_EXT = '.zip'


def get_shapefile_data(file_path: Path, s3_use: bool = S3_CONF.s3_use) -> CordexShape:
    """
    Retrieve the shapefile content associated with the passed file_path (either on disk or on S3).
    file_path is a .shp file.
    """
    if s3_use:
        return load_zipped_shp(get_s3_object(file_path.with_suffix(ZIP_EXT)), file_path)
    return load_shp(file_path)


def get_s3_object(file_path: Path) -> bytes:
    """
    Retrieve as bytes the content associated with the passed file_path
    """
    return S3.Object(bucket_name=BUCKET, key=forge_key(file_path)).get()['Body'].read()


def forge_key(file_path: Path) -> str:
    """
    Edit this code at your convenience to forge the bucket key out of the passed file_path
    """
    return str(file_path.relative_to(*file_path.parts[:2]))


def load_shp(file_path: Path) -> CordexShape:
    """
    Retrieve a list of Polygons stored at the file_path location
    """
    with fiona.open(file_path) as shape:
        parsed_shape = list(shape)
    return parsed_shape


def load_zipped_shp(zipped_data: bytes, file_path: Path) -> CordexShape:
    """
    Retrieve a list of Polygons stored at the file_path location
    """
    with ZipMemoryFile(BytesIO(zipped_data)) as zip_memory_file:
        with zip_memory_file.open(file_path.name) as shape:
            parsed_shape = list(shape)
    return parsed_shape
There is quite a lot of code, but the first part is very helpful for easily using a MinIO proxy for local development (you just have to change the .env).
The key to solving the issue for me was fiona's ZipMemoryFile, which is not so well documented (in my opinion) but a lifesaver (in my case :)).
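A short usage example of the helpers above; the path is hypothetical, and forge_key decides which bucket key it actually maps to.

from pathlib import Path

# hypothetical local-style path; forge_key derives the bucket key from it
polygons = get_shapefile_data(Path('some/prefix/regions.shp'), s3_use=True)
print(f'{len(polygons)} features loaded')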

Read and parse CSV file in S3 without downloading the entire file using Python

So, I want to read a large CSV file from an S3 bucket, but I don't want the file to be completely downloaded into memory; what I want to do is somehow stream the file in chunks and then process it.
So far this is what I have done, but I don't think this is going to solve the problem.
import logging
import boto3
import codecs
import os
import csv

LOGGER = logging.getLogger()
LOGGER.setLevel(logging.INFO)

s3 = boto3.client('s3')


def lambda_handler(event, context):
    # retrieve bucket name and file_key from the S3 event
    bucket_name = event['Records'][0]['s3']['bucket']['name']
    file_key = event['Records'][0]['s3']['object']['key']
    chunk, chunksize = [], 1000
    if file_key.endswith('.csv'):
        LOGGER.info('Reading {} from {}'.format(file_key, bucket_name))
        # get the object
        obj = s3.get_object(Bucket=bucket_name, Key=file_key)
        file_object = obj['Body']
        count = 0
        for i, line in enumerate(file_object):
            count += 1
            if (i % chunksize == 0 and i > 0):
                process_chunk(chunk)
                del chunk[:]
            chunk.append(line)


def process_chunk(chuck):
    print(len(chuck))
This will do what you want to achieve. It won't download the whole file into memory; instead, it will download it in chunks, process them and proceed:
from smart_open import smart_open
import csv


def get_s3_file_stream(s3_path):
    """
    This function will return a stream of the s3 file.
    The s3_path should be of the format: '<bucket_name>/<file_path_inside_the_bucket>'
    """
    # This is the full path with credentials:
    complete_s3_path = 's3://' + aws_access_key_id + ':' + aws_secret_access_key + '@' + s3_path
    return smart_open(complete_s3_path, encoding='utf8')


def download_and_process_csv(s3_path):
    datareader = csv.DictReader(get_s3_file_stream(s3_path))
    for row in datareader:
        yield process_csv(row)  # write a function to do whatever you want to do with the CSV
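If you would rather not add a dependency, here is a rough boto3-only sketch of the same idea: get_object returns a streaming body, and codecs.getreader (already imported in your handler) wraps it so csv.DictReader can consume it line by line without loading the whole file. The chunk size and the way chunks are consumed are placeholders.

import codecs
import csv

import boto3

s3 = boto3.client('s3')


def stream_csv_rows(bucket_name, file_key, chunksize=1000):
    """Yield lists of up to `chunksize` parsed CSV rows without loading the whole file."""
    body = s3.get_object(Bucket=bucket_name, Key=file_key)['Body']
    reader = csv.DictReader(codecs.getreader('utf-8')(body))
    chunk = []
    for row in reader:
        chunk.append(row)
        if len(chunk) >= chunksize:
            yield chunk
            chunk = []
    if chunk:
        yield chunk


# usage inside the lambda handler:
# for chunk in stream_csv_rows(bucket_name, file_key):
#     process_chunk(chunk)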
Did you try AWS Athena (https://aws.amazon.com/athena/)?
It's extremely good: serverless and pay-as-you-go. It does everything you want without downloading the file.
BlazingSQL is open source and is also useful for big-data problems.

scrapy export the data to files by urls'path

How can I change Scrapy's source code in order to save the data into files named by their URL paths when I export data from HTML pages?
For example:
this page (http://example/big/ppp) has lots of page links
http://example/big/ppp/a
http://example/big/ppp/b
http://example/big/ppp/c
......
and I want to save the data from
http://example/big/ppp/a in d:/ppp/a.csv
http://example/big/ppp/b in d:/ppp/b.csv
http://example/big/ppp/c in d:/ppp/c.csv
because this page (http://example/big/ppp) has lots of links like
http://example/big/ppp/a and http://example/big/ppp/b.
So could you help me, kind person!
You can use a Scrapy pipeline to do this job. Add a field to the item you are going to export, for example one named 'source' (e.g. http://example/big/ppp/a), to record where the item comes from:
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter  # in newer Scrapy: from scrapy.exporters import CsvItemExporter


class MyCsvPipeline(object):
    def __init__(self):
        self.csvfiles = {}
        self.exporter = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.close_spider, signals.spider_closed)
        return pipeline

    def close_spider(self, spider):
        for e in self.exporter.values():
            e.finish_exporting()
        for f in self.csvfiles.values():
            f.close()

    def process_item(self, item, spider):
        csv = item['source'].split('/')[-1] + '.csv'
        if csv not in self.csvfiles:
            newfile = open('d:/ppp/' + csv, 'wb')
            self.csvfiles[csv] = newfile
            self.exporter[csv] = CsvItemExporter(newfile)
            self.exporter[csv].start_exporting()
        self.exporter[csv].export_item(item)
        return item
Apply this pipeline in settings.py:
ITEM_PIPELINES = {
    'xxxx.pipelines.MyCsvPipeline': 300,
}
Another option:
Use scrapy crawl xxx -t csv -o all.csv --loglevel=INFO to export all items to one CSV, then use another script to split it into small CSVs according to 'source'.
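For the first option, the spider has to populate the 'source' field that MyCsvPipeline keys on. A minimal sketch follows; the spider name, link extraction and the 'data' field are hypothetical placeholders, and only 'source' matters for routing items to the right CSV.

import scrapy


class PppSpider(scrapy.Spider):
    name = 'ppp'  # hypothetical spider name
    start_urls = ['http://example/big/ppp']

    def parse(self, response):
        # follow every sub-page link such as /big/ppp/a, /big/ppp/b, ...
        for href in response.xpath("//a/@href").extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_page)

    def parse_page(self, response):
        yield {
            'source': response.url,  # MyCsvPipeline uses this to pick the CSV file (e.g. a.csv)
            'data': response.xpath('//title/text()').extract_first(),  # placeholder field
        }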

apollo-upload-client and graphene-django

I have a question about using apollo-upload-client and graphene-django. Here I've discovered that apollo-upload-client adds the operations to formData, but graphene-django only tries to get the query parameter. The question is, where and how should this be fixed?
If you're referring to the data that has a header like this (when viewing the HTTP request in Chrome's dev tools):
Content-Disposition: form-data; name="operations"
and data like
{"operationName":"MyMutation","variables":{"myData"....}, "query":"mutation MyMutation"...},
the graphene-python library interprets this and assembles it into a query for you, inserting the variables and removing the file data from the query. If you are using Django, you can find all of the uploaded files in info.context.FILES when writing a mutation.
Here's my solution to support the latest apollo-upload-client (8.1). I recently had to revisit my Django code when I upgraded from apollo-upload-client 5.x to 8.x. Hope this helps.
Sorry I'm using an older graphene-django but hopefully you can update the mutation syntax to the latest.
Upload scalar type (passthrough, basically):
from graphene import Scalar


class Upload(Scalar):
    '''A file upload'''

    @staticmethod
    def serialize(value):
        raise Exception('File upload cannot be serialized')

    @staticmethod
    def parse_literal(node):
        raise Exception('No such thing as a file upload literal')

    @staticmethod
    def parse_value(value):
        return value
My upload mutation:
from tempfile import NamedTemporaryFile

import graphene
from graphene import relay


class UploadImage(relay.ClientIDMutation):
    class Input:
        image = graphene.Field(Upload, required=True)

    success = graphene.Field(graphene.Boolean)

    @classmethod
    def mutate_and_get_payload(cls, input, context, info):
        with NamedTemporaryFile(delete=False) as tmp:
            for chunk in input['image'].chunks():
                tmp.write(chunk)
            image_file = tmp.name
        # do something with image_file
        return UploadImage(success=True)
The heavy lifting happens in a custom GraphQL view. Basically it injects the file object into the appropriate places in the variables map.
import json
import logging

from django.http import HttpResponseBadRequest
from graphene_django.views import GraphQLView, HttpError

logger = logging.getLogger(__name__)


def maybe_int(s):
    try:
        return int(s)
    except ValueError:
        return s


class CustomGraphqlView(GraphQLView):
    def parse_request_json(self, json_string):
        try:
            request_json = json.loads(json_string)
            if self.batch:
                assert isinstance(request_json, list), (
                    'Batch requests should receive a list, but received {}.'
                ).format(repr(request_json))
                assert len(request_json) > 0, 'Received an empty list in the batch request.'
            else:
                assert isinstance(request_json, dict), 'The received data is not a valid JSON query.'
            return request_json
        except AssertionError as e:
            raise HttpError(HttpResponseBadRequest(str(e)))
        except BaseException:
            logger.exception('Invalid JSON')
            raise HttpError(HttpResponseBadRequest('POST body sent invalid JSON.'))

    def parse_body(self, request):
        content_type = self.get_content_type(request)
        if content_type == 'application/graphql':
            return {'query': request.body.decode()}
        elif content_type == 'application/json':
            return self.parse_request_json(request.body.decode('utf-8'))
        elif content_type in ['application/x-www-form-urlencoded', 'multipart/form-data']:
            operations_json = request.POST.get('operations')
            map_json = request.POST.get('map')
            if operations_json and map_json:
                operations = self.parse_request_json(operations_json)
                map = self.parse_request_json(map_json)
                for file_id, f in request.FILES.items():
                    # walk the dotted path from the map (e.g. "variables.input.image")
                    # and replace the placeholder with the uploaded file object
                    for name in map[file_id]:
                        segments = [maybe_int(s) for s in name.split('.')]
                        cur = operations
                        while len(segments) > 1:
                            cur = cur[segments.pop(0)]
                        cur[segments.pop(0)] = f
                logger.info('parse_body %s', operations)
                return operations
            else:
                return request.POST
        return {}
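To actually use the view, it has to replace the stock GraphQLView in your URL configuration. A minimal sketch follows, assuming Django 2.0+ and a project-level schema module; the 'myproject' module paths are placeholders.

# urls.py (sketch; module paths are hypothetical)
from django.urls import path
from django.views.decorators.csrf import csrf_exempt

from myproject.schema import schema            # hypothetical schema module
from myproject.views import CustomGraphqlView  # the view defined above

urlpatterns = [
    path('graphql/', csrf_exempt(CustomGraphqlView.as_view(schema=schema, graphiql=True))),
]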