Spider to download images does not seem to work though I have PIL installed - Scrapy

I am a newbie to Scrapy. I am trying to write a spider to download images. For using the image pipeline, is installing PIL sufficient? My PIL is located in
/usr/lib/python2.7/dist-packages/PIL
How do I include it in my Scrapy project?
Settings file:
BOT_NAME = 'paulsmith'
BOT_VERSION = '1.0'
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGE_STORE = '/home/jay/Scrapy/paulsmith/images'
SPIDER_MODULES = ['paulsmith.spiders']
NEWSPIDER_MODULE = 'paulsmith.spiders'
DEFAULT_ITEM_CLASS = 'paulsmith.items.PaulsmithItem'
USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION)
Items file:
from scrapy.item import Item, Field

class PaulsmithItem(Item):
    image_urls = Field()
    image = Field()
    pass
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from paulsmith.items import PaulsmithItem

class PaulSmithSpider(BaseSpider):
    name = "Paul"
    allowed_domains = ["http://www.paulsmith.co.uk/uk-en/shop/mens"]
    start_urls = ["http://www.paulsmith.co.uk/uk-en/shop/mens/jeans"]

    def parse(self, response):
        item = PaulsmithItem()
        #open('paulsmith.html','wb').write(response.body)
        hxs = HtmlXPathSelector(response)
        #sites = hxs.select('//div[@class="category-products"]')
        item['image_urls'] = hxs.select("//div[@class='category-products']//a/img/@src").extract()
        #for site in sites:
        #    print site.extract()
        #    image = site.select('//a/img/@src').extract()
        return item

SPIDER = PaulSmithSpider()

You may not have set IMAGES_STORE = '/path/to/valid/dir' (note the setting name: your settings file uses IMAGE_STORE, but it must be IMAGES_STORE).
Moreover, try using a custom images pipeline like this:
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class MyImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
You can check whether the image_urls are actually being requested from the get_media_requests method.
reference: http://doc.scrapy.org/en/latest/topics/images.html
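For the custom pipeline to take effect it also has to be registered in settings.py in place of the stock one, and the storage setting has to use the exact name IMAGES_STORE. A minimal sketch, assuming the class above is saved in paulsmith/pipelines.py (adjust the module path to wherever you put it):
# settings.py sketch
ITEM_PIPELINES = ['paulsmith.pipelines.MyImagesPipeline']
# Note the setting name: IMAGES_STORE, not IMAGE_STORE
IMAGES_STORE = '/home/jay/Scrapy/paulsmith/images'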

Related

Scrapy Selenium: why is pagination not working with scrapy-selenium?

I am trying to get data using scrapy-selenium, but there is some issue with the pagination. I have tried my best with different selectors and methods, but nothing changes: it can only scrape the first page. I have also checked the other solutions, but I am still unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)
        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)
        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
        for detail in property_xpath:
            href = detail.get_attribute('href')
            time.sleep(2)
            yield SeleniumRequest(
                url=href,
            )
        driver.quit()
        return super().start_requests()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url
        }
        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url=abs_url,
                wait_time=5,
                callback=self.parse
            )
Your code seems to be correct, but you are getting a TCP/IP block. I also tried an alternative approach where the code is correct and the pagination works (and is about twice as fast as the other methods), but it sometimes gives strange results and sometimes also gets IP-blocked.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest

class Basic2Spider(scrapy.Spider):
    name = 'basic2'
    responses = []

    def start_requests(self):
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            print(page)
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                wait_time=5
            )

    def parse(self, response):
        driver = response.meta['driver']
        initial_page = driver.page_source
        self.responses.append(initial_page)
        for resp in self.responses:
            r = Selector(text=resp)
            property_xpath = r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
            for detail in property_xpath:
                yield {
                    'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                    'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                    'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                    'Links': response.url
                }
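As a side note, SeleniumRequest only does anything if the scrapy-selenium middleware is enabled, and slowing the crawl down can reduce the chance of the IP block mentioned above. A minimal settings sketch (the chromedriver path is an assumption, point it at your own binary):
# settings.py sketch: enable scrapy-selenium and throttle the crawl
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_EXECUTABLE_PATH = '/usr/local/bin/chromedriver'  # assumed path
SELENIUM_DRIVER_ARGUMENTS = ['--headless']

DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}

# Be gentler with the server to reduce the chance of an IP block
DOWNLOAD_DELAY = 2
AUTOTHROTTLE_ENABLED = True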

Unable to connect spider_closed when yielding to an S3 bucket

# -*- coding: utf-8 -*-
import scrapy
from scrapy.utils.response import open_in_browser
from pydispatch import dispatcher
from scrapy.signalmanager import SignalManager
#from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals

class ExampleSpider(scrapy.Spider):
    name = 'forever'
    allowed_domains = ['example.com']
    kohlind = max_kohls = 0
    total_products = 0
    colected = 0
    items = []
    #AWS_ACCESS_KEY_ID = 'id'
    #AWS_SECRET_ACCESS_KEY = 'pass'
    start_urls = ['https://www.example.com/']
    custom_settings = {
        'FEED_URI': f's3://example-products/fulltest.csv',
        'FEED_EXPORT_FIELDS': ['ITEM_ID','URL','SELLER','PRICE','SALE_PRICE','MAIN_IMAGE','OTHER_IMAGE','SKU','PRODUCT_NAME']
    }

    def __init__(self):
        SignalManager(dispatcher.Any).connect(receiver=self._close, signal=signals.spider_closed)

    #spider code

    def _close(self):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")
        for i in self.items:
            if "URL" in i.keys():
                yield item
        print("Done")
The program never reaches the _close function, no error is shown, and the items yielded in the spider code are uploaded normally (everything works except _close).
When I removed the S3 feed from the settings, it worked fine (i.e. it entered the _close function).
How can I fix this?
Try the code below; it should work:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy.xlib.pydispatch import dispatcher

class ExampleSpider(scrapy.Spider):
    name = 'forever'

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        print(f"\n\nClosing Spider with {len(self.items)} New Items")

Scrapy: fetch start_urls from a MySQL table

I am having issues fetching the start_urls for Scrapy from a table in MySQL.
MySQL has a database called "scrapy" and a table called "urls" with a single column, "url", which holds one URL to scrape per row.
This is the code I have currently, but I feel like I am missing a concept somewhere:
# -*- coding: utf-8 -*-
import scrapy
import datetime
import urlparse
import socket
import MySQLdb
from scrapy.loader import ItemLoader
from example.items import exampleitem

class instantdbSpider(scrapy.Spider):
    name = 'instantdb'
    allowed_domains = ['example.com']

    def start_requests(self):
        conn = MySQLdb.connect(
            user='root',
            passwd='Password!',
            db='scrapy',
            host='localhost',
            charset="utf8",
            use_unicode=True,
            cursorclass=MySQLdb.cursors.DictCursor
        )
        cursor = conn.cursor()
        cursor.execute('SELECT * FROM urls')
        rows = cursor.fetchall()
        for row in rows:
            url = row["url"]
        yield Request(url=url)

    def parse(self, response):
        l = ItemLoader(item=exampleitem(), response=response)
        # Scrape Fields
        l.add_xpath('title', '//html/head/title/text()')
        l.add_xpath('sku', '//*[@id="js-zoom-image-container"]/div[5]/h2/strong/text()')
        l.add_xpath('price', '//*[@id="main-content"]/div/div[1]/div[1]/div[1]/div/div[1]/div[2]/p/text()[1]')
        l.add_xpath('product_title', '//html/body/div[1]/span[4]/text()')
        l.add_xpath('image_url', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/a/img/@src')
        l.add_xpath('description', '//*[@id="main-content"]/div/div[1]/div[1]/div[2]/div[7]')
        # Administration Fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())
        return l.load_item()
Any help would be greatly appreciated as I seem to be going around in circles. Thanks.
You have an indentation issue:
for row in rows:
    url = row["url"]
    yield Request(url=url)
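One more thing worth checking in the question's code: Request is used in start_requests but never imported, which would raise a NameError regardless of indentation. Assuming the standard Scrapy Request class is intended:
from scrapy import Request  # or: from scrapy.http import Request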

How to scrape multiple pages with scrapy?

I'm trying to scrape a table that spans multiple pages. With the following code I can print the first page's data:
import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
I have written the following code to download all the pages. It is based on other posts that I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I try to print all the pages I don't obtain anything. Can anyone help me find the mistake?
Scrapy needs a parse callback for the start URLs first; see the Scrapy docs on CrawlSpider.
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
or just override the start_requests method with another callback:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Here is code to crawl all the pages:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html
from w3lib.url import add_or_replace_parameter

class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item

How can I schedule a Scrapy spider to crawl again after a certain time?

I want to schedule my spider to run again one hour after crawling has finished. In my code, the spider_closed method is called after the crawl ends. How can I run the spider again from this method? Or are there any settings available to schedule the Scrapy spider?
Here is my basic spider code.
import scrapy
import codecs
from a2i.items import A2iItem
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.conf import settings
from scrapy.crawler import CrawlerProcess
from scrapy import signals
from scrapy.utils.project import get_project_settings
from scrapy.xlib.pydispatch import dispatcher

class A2iSpider(scrapy.Spider):
    name = "notice"
    f = open("urls.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()
    allowed_domains = ["prothom-alo.com"]

    def __init__(self):
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def parse(self, response):
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            print "*"*70
            print url
            print "\n\n"
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': 2, 'url': url})

    def parse_page(self, response):
        filename = "response.txt"
        depth = response.meta['depth']
        with open(filename, 'a') as f:
            f.write(str(depth))
            f.write("\n")
            f.write(response.meta['url'])
            f.write("\n")
        for href in response.css("a::attr('href')"):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_page, meta={'depth': depth+1, 'url': url})

    def spider_closed(self, spider):
        print "$"*2000
You can use cron.
crontab -e to create a schedule and run scripts as root, or
crontab -u [user] -e to run as a specific user.
At the bottom you can add:
0 * * * * cd /path/to/your/scrapy && scrapy crawl [yourScrapy] >> /path/to/log/scrapy_log.log
0 * * * * makes the script run hourly; you can find more details about the settings online.
You can run your spider with the JOBDIR setting; it will persist the requests loaded in the scheduler:
scrapy crawl somespider -s JOBDIR=crawls/somespider-1
https://doc.scrapy.org/en/latest/topics/jobs.html
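If the re-run has to happen from inside the same process rather than via cron, one option is to drive the crawl with CrawlerRunner and Twisted's reactor and sleep an hour between runs. This is only a rough sketch under that assumption (the spider name 'notice' is taken from the question):
from twisted.internet import reactor, defer, task
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

configure_logging()
runner = CrawlerRunner(get_project_settings())

@defer.inlineCallbacks
def crawl_forever():
    while True:
        yield runner.crawl('notice')                        # run the spider once
        yield task.deferLater(reactor, 3600, lambda: None)  # wait one hour

crawl_forever()
reactor.run()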