How to link the items.py and my spider file? - scrapy

I'm new to Scrapy and am trying to scrape a page that contains several links. I want to follow each link and scrape the content of the linked page as well, and each of those pages has another link that I also want to scrape.
I tried this XPath in the Scrapy shell and it worked, but I don't know what I am missing here. I want to be able to crawl through two levels of pages by following the links.
I tried reading through tutorials, but I still don't understand what I am missing.
This is my items.py file.
import scrapy
# item class included here
class ScriptsItem(scrapy.Item):
    """Scrapy item with two declared fields: ``link`` and ``attr``."""

    # Both fields are plain, untyped scrapy.Field() declarations.
    link = scrapy.Field()
    attr = scrapy.Field()
And here is my scripts.py file.
import scrapy
import ScriptsItem
class ScriptsSpider(scrapy.Spider):
    """Follow the links on the Futurama index page and scrape each target page.

    ``parse`` handles the start page and schedules one request per link;
    ``parse_attr`` turns every followed page into a ``ScriptsItem``.
    """

    name = 'scripts'
    # allowed_domains takes bare domain names, not full URLs; with a URL here
    # the offsite filter would not recognise the followed links as on-site.
    allowed_domains = ['www.imsdb.com']
    # The original value had a doubled scheme ("http://https://") and a
    # trailing slash after ".html", which is not a fetchable URL.
    start_urls = ['https://www.imsdb.com/TV/Futurama.html']
    # Kept so existing references to the attribute keep working; links are now
    # resolved with response.urljoin() instead of string concatenation.
    BASE_URL = 'https://www.imsdb.com/TV/Futurama.html'

    def parse(self, response):
        """Yield a request for every link found in the table paragraphs."""
        for link in response.xpath('//table//td//p//a//@href').extract():
            # urljoin resolves relative hrefs against the page URL; appending
            # the href to BASE_URL produced ".../Futurama.html/<href>".
            yield scrapy.Request(response.urljoin(link), callback=self.parse_attr)

    def parse_attr(self, response):
        """Return an item holding the page URL and the joined link text
        from row 2, cell 2 of the ``script-details`` table."""
        item = ScriptsItem()
        item["link"] = response.url
        item["attr"] = "".join(
            response.xpath(
                "//table[@class = 'script-details']//tr[2]//td[2]//a//text()"
            ).extract()
        )
        return item

Replace
import ScriptsItem
with
from your_project_name.items import ScriptsItem
where your_project_name is the name of your Scrapy project.

Related

Extract text from table with Scrapy

I'm trying to extract Job title insides a table from this page: http://www.chalmers.se/en/about-chalmers/Working-at-Chalmers/Vacancies/Pages/default.aspx
This is the code, but it always returns empty. Any idea how to fix this?
import os
from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
class mySpider(CrawlSpider):
    """Print the job titles found in ``table#jobsTable`` on the start page."""

    name = "myspider"
    allowed_domains = ["www.chalmers.se"]
    start_urls = [
        "http://www.chalmers.se/en/about-chalmers/Working-at-Chalmers/Vacancies/Pages/default.aspx",
    ]

    def parse(self, response):
        """Print the link text of every ``td.jobitem`` cell, one table row at a time."""
        # response.css() is a shortcut for response.selector.css().
        for tr in response.css("table#jobsTable>tbody>tr"):
            # XPath attribute tests use "@class"; "#class" is not valid XPath.
            my_title = tr.xpath('td[@class="jobitem"]/a/text()').extract()
            # Single-argument print works on both Python 2 and Python 3.
            print('================ %s' % (my_title,))
I also tried giving the absolute HTML path, like below, but still got an empty title:
my_title = response.xpath('/html/body/div/div[1]/div/div[11]/div/table/tbody/tr[1]/td[2]/a/text()').extract()
The website loads the jobs table above from another source (via an AJAX call), so the table is not part of the HTML returned for the page you requested.
You just need to start from that other URL instead:
start_urls = ['https://web103.reachmee.com/ext/I003/304/main?site=5&validator=a72aeedd63ec10de71e46f8d91d0d57c&lang=UK&ref=&ihelper=http://www.chalmers.se/en/about-chalmers/Working-at-Chalmers/Vacancies/Pages/default.aspx']

Scrapy spider not following pagination

I am using the code from this link (https://github.com/eloyz/reddit/blob/master/reddit/spiders/pic.py), but somehow I am unable to visit the paginated pages.
I am using Scrapy 1.3.0.
You don't have any mechanism for processing the next page; all you do is gather images.
Here is what you should be doing. I wrote some selectors but didn't test them.
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
import urlparse
class xxx_spider(Spider):
    """Collect picture items from each listing page and follow the "next" link."""

    name = "xxx"
    allowed_domains = ["xxx.com"]

    def start_requests(self):
        """Yield the request for the first listing page, tagged as page 1."""
        url = 'first page url'
        yield Request(url=url, callback=self.parse, meta={"page": 1})

    def parse(self, response):
        """Yield one ``PicItem`` per ``div.thing`` and, if present, the next-page request."""
        for selector in response.css('div.thing'):
            item = PicItem()
            # XPath attribute access is "@href"; "#href" is not valid XPath.
            item['image_urls'] = selector.xpath('a/@href').extract()
            item['title'] = selector.xpath('div/p/a/text()').extract()
            item['url'] = selector.xpath('a/@href').extract()
            yield item

        # .css() returns a (possibly empty) selector list, never None, so the
        # href string itself has to be extracted and tested.
        next_link = response.css("span.next-button a::attr(href)").extract_first()
        if next_link:
            yield Request(
                url=response.urljoin(next_link),
                callback=self.parse,
                meta={"page": response.meta["page"] + 1},
            )
This is similar to what you did, but after collecting the images I check for a next-page link; if it exists, I yield another request for it.
Mehmet

How to download image using Scrapy?

I am newbie to scrapy. I am trying to download an image from here. I was following Official-Doc and this article.
My settings.py looks like:
# Scrapy project settings for the "shopclues" bot.
BOT_NAME = 'shopclues'

SPIDER_MODULES = ['shopclues.spiders']
NEWSPIDER_MODULE = 'shopclues.spiders'

ROBOTSTXT_OBEY = True

# Enable the built-in images pipeline. "scrapy.contrib.pipeline.images" is the
# pre-1.0 module path; since Scrapy 1.0 the pipeline lives in scrapy.pipelines.
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}

# NOTE(review): this is a relative path (no leading "/"), so it is resolved
# against the current working directory -- confirm that is intended.
IMAGES_STORE = "home/pr.singh/Projects"
and items.py looks like:
import scrapy
from scrapy.item import Item
class ShopcluesItem(scrapy.Item):
    """Empty item template; declare fields here, e.g. ``name = scrapy.Field()``."""
class ImgData(Item):
    """Item with two declared fields, ``image_urls`` and ``images``."""

    # NOTE(review): presumably read and filled by the images pipeline enabled
    # in settings.py -- confirm against the Scrapy media-pipeline docs.
    image_urls = scrapy.Field()
    images = scrapy.Field()
I think both of these files are fine, but I am unable to write a correct spider for getting the image. I am able to grab the image URL but don't know how to store the image using the ImagesPipeline.
My spider looks like:
from shopclues.items import ImgData
import scrapy
import datetime
class DownloadFirstImg(scrapy.Spider):
    """Locate the product image link on the start page and emit it as an ``ImgData`` item."""

    name = "DownloadfirstImg"
    start_urls = [
        'http://www.shopclues.com/canon-powershot-sx410-is-2.html',
    ]

    # CSS path of the anchor whose href is read in both callbacks; kept in one
    # place so the two callbacks cannot drift apart.
    _IMAGE_LINK_CSS = (
        "body div.site-container div#container div.ml_containermain "
        "div.content-helper div.aside-site-content div.product "
        "form#product_form_83013851 div.product-gallery "
        "div#product_images_83013851_update div.slide "
        "a#det_img_link_83013851_25781870"
    )

    def parse(self, response):
        """Request the page the image anchor points to, if the anchor exists."""
        # extract_first() returns one string (or None); Request needs a single
        # URL string, not the list that extract() returns.
        href = response.css(self._IMAGE_LINK_CSS).xpath('@href').extract_first()
        if href is None:
            return
        yield scrapy.Request(response.urljoin(href), self.parse_page)

    def parse_page(self, response):
        """Yield an ``ImgData`` item carrying the matched href values.

        NOTE(review): if the followed link is the image file itself rather than
        an HTML page, there is nothing for this selector to match -- confirm.
        """
        img_urls = response.css(self._IMAGE_LINK_CSS + "::attr(href)").extract()
        # Yield the item itself (not a set wrapping it), and pass the flat
        # list of URLs rather than a list nested inside another list.
        yield ImgData(image_urls=img_urls)
I wrote the spider following this article, but I am not getting anything. I run my spider with scrapy crawl DownloadfirstImg -o img5.json
but I get neither JSON output nor an image. Any help on how to grab an image when its URL is known? I have never worked with Python either, so things seem quite complicated to me. Links to any good tutorial would help. TIA
I don't understand why you yield a request for the image: you just need to store its URL on the item and the images pipeline will do the rest. This is all you need:
def parse(self, response):
    """Yield an ``ImgData`` item whose ``image_urls`` holds the image link's href."""
    url = response.css("body div.site-container div#container div.ml_containermain div.content-helper div.aside-site-content div.product form#product_form_83013851 div.product-gallery div#product_images_83013851_update div.slide a#det_img_link_83013851_25781870")
    # XPath attribute access is "@href"; "#href" is not valid XPath.
    image_url = url.xpath('@href').extract_first()
    # extract_first() returns None when nothing matched; do not emit [None].
    if image_url:
        yield ImgData(image_urls=[response.urljoin(image_url)])

Exporting unique items from CrawlSpider

I am using Scrapy's CrawlSpider class to iterate over the list of start_urls and crawl each site's internal pages to fetch e-mail addresses. I would like to export a file with a single (unique) item for each start_url, containing the list of matched e-mails. For that purpose I needed to override the make_requests_from_url and parse methods so that I can pass each start_url item to the internal pages through the response's meta dict (see code). The output from running this code is:
www.a.com,['webmaster@a.com']
www.a.com,['webmaster@a.com','info@a.com']
www.a.com,['webmaster@a.com','info@a.com','admin@a.com']
However, I only want the export file to contain the last entry from the above output
(www.a.com,['admin@a.com,webmaster@a.com, info@a.com'])
Is that possible?
Code:
class MySpider(CrawlSpider):
start_urls = [... urls list ...]
def parse(self, response):
for request_or_item in CrawlSpider.parse(self, response):
if isinstance(request_or_item, Request):
request_or_item.meta.update(dict(url_item=response.meta['url_item']))
yield request_or_item
def make_requests_from_url(self, url):
# Create a unique item for each url. Append email to this item from internal pages
url_item = MyItem()
url_item["url"] = url
url_item["emais"] = []
return Request(url, dont_filter=True, meta = {'url_item': url_item})
def parse_page(self, response):
url_item = response.meta["url_item"]
url_item["emails"].append(** some regex of emails from the response object **)
return url_item
You could use an item pipeline to process the items.
See the duplicates filter example in the Scrapy documentation.

Scrapy HtmlXPathSelector

Just trying out scrapy and trying to get a basic spider working. I know this is just probably something I'm missing but I've tried everything I can think of.
The error I get is:
line 11, in JustASpider
sites = hxs.select('//title/text()')
NameError: name 'hxs' is not defined
My code is very basic at the moment, but I still can't seem to find where I'm going wrong. Thanks for any help!
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class JustASpider(BaseSpider):
    """Spider that prints the text of every ``<title>`` node on the start page."""

    name = "google.com"
    start_urls = ["http://www.google.com/search?hl=en&q=search"]

    def parse(self, response):
        """Print each extracted title text on its own line."""
        selector = HtmlXPathSelector(response)
        for title_node in selector.select('//title/text()'):
            print(title_node.extract())


# Module-level spider instance.
SPIDER = JustASpider()
The code looks like it was written for quite an old version of Scrapy. I recommend using this code instead:
from scrapy.spider import Spider
from scrapy.selector import Selector
class JustASpider(Spider):
    """Spider that prints the list of ``<title>`` texts found on the start page."""

    name = "googlespider"
    allowed_domains = ["google.com"]
    start_urls = ["http://www.google.com/search?hl=en&q=search"]

    def parse(self, response):
        """Extract all title texts in one call and print the resulting list."""
        # No per-node loop here: the author saw no reason to loop just to
        # extract the text of the title element.
        print(Selector(response).xpath('//title/text()').extract())
hope it helps and here is a good example to follow.
I removed the SPIDER call at the end and removed the for loop. There was only one title tag (as one would expect) and it seems that was throwing off the loop. The code I have working is as follows:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class JustASpider(BaseSpider):
    """Spider that extracts the ``<title>`` text of the start page."""

    name = "google.com"
    start_urls = ["http://www.google.com/search?hl=en&q=search"]

    def parse(self, response):
        """Extract the title text list into a local; nothing is returned or printed."""
        final = HtmlXPathSelector(response).select('//title/text()').extract()
I had a similar problem (NameError: name 'hxs' is not defined), and it was caused by mixing spaces and tabs: my IDE used spaces instead of tabs, so you should check your indentation.
Code looks correct.
In latest versions of Scrapy
HtmlXPathSelector is deprecated.
Use Selector:
hxs = Selector(response)
sites = hxs.xpath('//title/text()')
Be sure you are running the code you are showing us.
Try deleting *.pyc files in your project.
This works for me:
Save the file as test.py
Use the command scrapy runspider <filename.py>
For example:
scrapy runspider test.py
You should change
from scrapy.selector import HtmlXPathSelector
into
from scrapy.selector import Selector
And use hxs=Selector(response) instead.
I use Scrapy with BeautifulSoup4.0. For me, Soup is easy to read and understand. This is an option if you don't have to use HtmlXPathSelector. Hope this helps!
import scrapy
from bs4 import BeautifulSoup
import Item
def parse(self, response):
    """For every ``<a href>`` on the page, yield a follow-up request and an item with its URL."""
    soup = BeautifulSoup(response.body, 'html.parser')
    print('Current url: %s' % response.url)
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue
        url = response.urljoin(href)
        # Build a fresh item per link: re-yielding one shared, mutated item
        # makes every previously yielded reference show the latest URL.
        item = Item()
        item['url'] = url
        yield scrapy.Request(url, callback=self.parse)
        yield item
This is just a demo, but it works. It needs to be customized, of course.
#!/usr/bin/env python
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class DmozSpider(BaseSpider):
    """Demo spider that prints title, link and text of every ``ul/li`` node."""

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Print the anchor text, anchor href and own text of each list entry."""
        hxs = HtmlXPathSelector(response)
        for site in hxs.select('//ul/li'):
            title = site.select('a/text()').extract()
            # XPath attribute access is "@href"; "#href" is not valid XPath.
            link = site.select('a/@href').extract()
            desc = site.select('text()').extract()
            # One formatted argument prints the same on Python 2 and 3.
            print('%s %s %s' % (title, link, desc))