How to use scrapy to crawl multi level pages? (two level) - scrapy

On the first page it correctly scrapes the text "test1" from the title tag, but it scrapes nothing from page two, "test2.html".
my script:
from scrapy.spider import Spider
from scrapy.selector import Selector
from testscrapy1.items import Website
class DmozSpider(Spider):
    # Spider from the question: extracts the <title> text of the start page and
    # then requests a second page handled by other_function.
    # Indentation is restored; the statements themselves are as posted.
    name = "bill"
    # NOTE(review): this entry is a full URL; Scrapy documents allowed_domains
    # as a list of domain names -- confirm whether off-site filtering then
    # drops the request for test1.html.
    allowed_domains = ["http://www.mywebsite.com"]
    start_urls = [
        "http://www.mywebsite.com/test.html"]
    def parse(self,response):
        # Build one Website item per <head> element of the start page.
        for site in response.xpath('//head'):
            item = Website()
            # '//title/text()' is an absolute XPath, so it searches the whole
            # document rather than only below `site`.
            item['title'] = site.xpath('//title/text()').extract()
            yield item
        # NOTE(review): the URL below has no scheme, and `scrapy` is not among
        # the imports shown with this snippet; either would stop the second
        # request. Its nesting relative to the loop was lost in the paste
        # (placed after the loop here).
        yield scrapy.Request(url="www.mywebsite.com/test1.html", callback=self.other_function)
    def other_function(self,response):
        # Same extraction as parse, applied to the second page.
        for other_thing in response.xpath('//head'):
            item = Website()
            item['title'] = other_thing.xpath('//title/text()').extract()
            yield item
Thank you in advance STEF

Try
yield scrapy.Request(url="www.mywebsite.com", callback=self.other_function)
instead of
yield scrapy.Request(url="www.mywebsite.com/test1.html", callback=self.other_function)

Related

How to store scraped links in Scrapy

I did a lot of searches on the web but I couldn't find anything related or maybe it has to do with the wording used.
Basically, I would like to write a spider that is able to save the scraped links and to check whether other links have already been scraped. Is there any built-in function in Scrapy to do so?
Many thanks
You can write your own method for this purpose. I have written one in my project that you can use as a reference: it keeps a dictionary called already_parsed_urls, which I update in every callback.
You can look at the below code snippet and take reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest
class Spider(CrawlSpider):
    """Skeleton spider that remembers which URLs it has already parsed.

    Start URLs are rendered through Splash and handed to ``parse_courses``,
    which schedules the course pages; ``parse_items`` records every page it
    handles in ``already_parsed_urls`` and skips pages seen before.
    """

    name = 'test'
    allowed_domains = []
    web_url = ''
    start_urls = ['']
    counter = 0
    # Maps each URL handled by parse_items to 1. Defined on the class, so it
    # is shared by every instance of this spider.
    already_parsed_urls = {}
    wait_time = 3
    timeout = '90'

    def start_requests(self):
        """Render every start URL through Splash."""
        for start_url in self.start_urls:
            yield SplashRequest(start_url, callback=self.parse_courses,
                                args={'wait': self.wait_time, 'timeout': self.timeout})

    def parse_courses(self, response):
        """Schedule a Splash request for each course URL found on the page."""
        # Placeholder from the original answer: fill this with the links
        # extracted from `response`.
        course_urls = []
        # Iterate instead of indexing [0], which raised IndexError whenever
        # the list was empty.
        for course_url in course_urls:
            yield SplashRequest(course_url, callback=self.parse_items,
                                args={'wait': self.wait_time})

    def parse_items(self, response):
        """Record the page as parsed; return ``{}`` if it was seen before."""
        if self.already_parsed_urls.get(response.url):
            return {}
        # Get Program URL
        program_url = response.url
        self.already_parsed_urls[response.url] = 1

Count scraped items from scrapy

I am looking to just count the number of things scraped. I am new to Python and scraping, and I am following the example; I want to know how to count the number of times Albert Einstein shows up and print that to a JSON file. I cannot get it to print to the file using print, yield, or return.
import scrapy
class QuotesSpider(scrapy.Spider):
    # Spider from the question: walks the quote listing pages and tries to
    # count quotes whose author is "Albert Einstein".
    # Indentation is restored; the statements themselves are as posted.
    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    def parse(self, response):
        # `i` is a local variable, so it starts again at 0 on every call to
        # parse and its value is never yielded, returned or stored.
        i=0
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                i+=1
        # Follow the "next" link, if there is one, with this same callback.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals
class CountSpider(scrapy.Spider):
    """Spider that reads Scrapy's ``item_scraped_count`` stat when it closes."""

    name = 'count'
    start_urls = ['https://example.com']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and subscribe it to the ``spider_closed`` signal."""
        spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Log the number of scraped items once the crawl has finished."""
        # The stat is absent when no item was scraped, so default to 0
        # instead of indexing the stats dict and risking a KeyError.
        numcount = str(spider.crawler.stats.get_value('item_scraped_count', 0))
        self.logger.info("item_scraped_count: %s", numcount)
Here I can create a csv file with the stats
In Scrapy, requests are made asynchronously, and each response calls back into the parse function independently. Your i variable is not an instance variable, so its scope is limited to each function call.
Even if that wasn't the case, the recursion would turn your counter to 0 in each callback.
I would suggest you take a look at Scrapy items: at the end of the crawl, Scrapy reports a counter with the number of scraped items. That may be overkill, though, if you don't want to store any information other than the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution, set your counter var to be a instance var and have parse method to increment it, like this:
import scrapy
class QuotesSpider(scrapy.Spider):
    """Count quotes attributed to Albert Einstein across all listing pages.

    The tally lives on the spider (``self.counter``) so that it survives from
    one ``parse`` callback to the next, and it is logged when the crawl ends.
    """

    name = "author"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]
    counter = 0

    def parse(self, response):
        """Add this page's matches to the tally and follow the "next" link."""
        for quote in response.css('div.quote'):
            author = quote.css("small.author::text").get()
            if author == "Albert Einstein":
                self.counter += 1

        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def closed(self, reason):
        """Report the final tally; Scrapy calls this hook when the spider closes."""
        self.logger.info("Albert Einstein quotes: %d", self.counter)

Following urls in javascript - Scrapy Splash

I am extremely new to web scraping. I manage to extract information from static websites but am now trying my hand following urls and extracting data (which ofcourse involves some javascript). I have installed scrapy-splash for the same which is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button to the top right side takes you to the next page (which is javascript, hence splash). I want to scrape some basic data (like company name, sectors etc) on all the pages till the last one. This is what I have done so far and I need help to correct this to successfully execute.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    # Spider from the question: renders a company detail page through Splash,
    # yields its fields, then tries to move on to the "next" company page.
    # Indentation is restored; the statements themselves are as posted.
    name = 'Portfolio'
    start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']
    def start_requests(self):
        # Send every start URL through Splash with a 3 second wait.
        for url in self.start_urls:
            yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
    def parse(self, response):
        companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
        # The same selector is queried five times; indexes [0]..[4] require at
        # least five matching text nodes on the page.
        sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
        investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
        status = response.css('.item_detail-main-info-group-item::text')[2].extract()
        location = response.css('.item_detail-main-info-group-item::text')[3].extract()
        region = response.css('.item_detail-main-info-group-item::text')[4].extract()
        team = response.css('div.item_detail-main-info-group a::text').extract()
        yield {
            'companyname': companyname,
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': team
        }
        # NOTE(review): .extract() on a selector list returns a list (possibly
        # empty), never None, so the check below always passes and a list is
        # handed to urljoin -- presumably why the next page is never reached;
        # confirm.
        next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
        if next_page is not None:
            yield SplashRequest(urlparse.urljoin('https://www.ta.com',next_page),callback=self.parse, args={"wait":3})
This gives me the correct information for the start_url but doesn't proceed to the next page.
Update. The issue was in the order in which I had the scraping of websites. Below is the updated code which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
    """Scrape company details from the TA portfolio category pages.

    Each category page is rendered through Splash; every company tile on it
    is then followed and its detail page turned into one dict item.
    """

    name = 'Portfolio'
    start_urls = [
        'https://www.ta.com/portfolio/business-services',
        'https://www.ta.com/portfolio/consumer',
        'https://www.ta.com/portfolio/financial-services',
        'https://www.ta.com/portfolio/healthcare',
        'https://www.ta.com/portfolio/technology',
    ]

    def start_requests(self):
        """Render each category page through Splash with a 3 second wait."""
        for url in self.start_urls:
            yield SplashRequest(url=url, callback=self.parse, args={"wait": 3})

    def parse(self, response):
        """Follow every company tile link found on a category page."""
        hrefs = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
        for href in hrefs:
            # urljoin copes with both relative and absolute hrefs; plain string
            # concatenation produced a broken URL for absolute ones.
            yield response.follow(urlparse.urljoin('https://www.ta.com', href),
                                  callback=self.parse1)

    def parse1(self, response):
        """Yield one item with the fields of a company detail page."""
        # Query the info list once and pad it, so that a page with fewer than
        # five entries yields None for the missing fields instead of raising
        # IndexError.
        info = response.css('.item_detail-main-info-group-item::text').extract()
        info += [None] * (5 - len(info))
        sectors, investmentyear, status, location, region = info[:5]

        yield {
            'companyname': response.css('h1.item_detail-main-info-heading::text').extract_first(),
            'sectors': sectors,
            'investmentyear': investmentyear,
            'status': status,
            'location': location,
            'region': region,
            'team': response.css('div.item_detail-main-info-group a::text').extract(),
            'about_company': response.css('h2.item_detail-main-content-heading::text').extract(),
            'about_company_detail': response.css('div.markdown p::text').extract(),
        }

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URL's from Database and also need ID's associated with the URL so that I can pass it in the ITEMS pipeline and store in the table along with items.
I am using "make_requests_from_url(row[1])" to create a request for each start URL instead of filling the "start_urls = []" list. The ID in row[0] is what I need to pass along to the items when the corresponding pages are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
    # Spider from the question: reads its start URLs from a MySQL table.
    # Indentation is restored; the statements themselves are as posted.
    name = 'amz_ProductInfo_Spider'
    nextPageNumber = 2
    allowed_domains = ['amazon.in']
    start_urls = []
    url_fid = []
    def __init__(self):
        # Open the database connection and cursor used by start_requests.
        # NOTE(review): the parent Spider.__init__ is not called here --
        # confirm that this is intended.
        self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
        self.cursor = self.connection.cursor()
    def start_requests(self):
        # Fetch all rows and create one request per row.
        sql_get_StartUrl = 'SELECT * FROM database.table'
        self.cursor.execute(sql_get_StartUrl)
        rows = self.cursor.fetchall()
        for row in rows:
            # Only the URL (row[1]) is handed over; the ID in row[0] is not
            # attached to the request, which is what the question asks about.
            yield self.make_requests_from_url(row[1])
I have tried with comparing "response.url" in parse method but that changes as spider moves on to next page.
Not sure how can I achieve this. any direction is appreciated.
It's not clear why do you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
    # Create one request per database row and attach the row ID to it.
    sql_get_StartUrl = 'SELECT * FROM database.table'
    self.cursor.execute(sql_get_StartUrl)
    rows = self.cursor.fetchall()
    for row in rows:
        # row[0] travels with the request in meta under the key 'url_id'.
        yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
    # Read back the ID that start_requests stored in the request's meta.
    # NOTE(review): requests created later (e.g. for a next page) presumably
    # need the same meta entry passed along to keep the ID -- confirm.
    url_id = response.meta["url_id"]

Scrape multiple URLs with Scrapy

How can I scrape multiple URLs with Scrapy?
Am I forced to make multiple crawlers?
class TravelSpider(BaseSpider):
name = "speedy"
allowed_domains = ["example.com"]
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4),"http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
def parse(self, response):
hxs = HtmlXPathSelector(response)
items = []
item = TravelItem()
item['url'] = hxs.select('//a[#class="out"]/#href').extract()
out = "\n".join(str(e) for e in item['url']);
print out
Python says:
NameError: name 'i' is not defined
But when I use one URL it works fine!
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)]
Your python syntax is incorrect, try:
start_urls = ["http://example.com/category/top/page-%d/" % i for i in xrange(4)] + \
["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)]
If you need to write code to generate start requests, you can define a start_requests() method instead of using start_urls.
You can initialize start_urls in the __init__ method:
from scrapy.item import Item, Field
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
class TravelItem(Item):
    """Item holding the outbound link URLs found on one page."""

    url = Field()


class TravelSpider(BaseSpider):
    """Spider whose start URLs are assembled in ``__init__`` from two page ranges."""

    name = "speedy"
    allowed_domains = ["example.com"]

    def __init__(self, name=None, **kwargs):
        """Build ``start_urls`` before handing over to the base initializer."""
        # Two separate comprehensions: each one has its own loop variable, so
        # there is no cross-reference between them.
        self.start_urls = []
        self.start_urls.extend(["http://example.com/category/top/page-%d/" % i for i in xrange(4)])
        self.start_urls.extend(["http://example.com/superurl/top/page-%d/" % i for i in xrange(55)])
        super(TravelSpider, self).__init__(name, **kwargs)

    def parse(self, response):
        """Collect the href of every <a class="out"> and print them line by line."""
        hxs = HtmlXPathSelector(response)
        item = TravelItem()
        # The XPath attribute axis is "@"; it had been garbled to "#".
        item['url'] = hxs.select('//a[@class="out"]/@href').extract()
        out = "\n".join(str(e) for e in item['url'])
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print(out)
Hope that helps.
Python resolves names through only four scopes (LEGB: Local, Enclosing, Global, Built-in). A class body and a list comprehension each have their own local scope, but a class body is not a function, so it does not act as an enclosing scope for the comprehension. They are therefore two separate local scopes that cannot see each other's names.
So don't rely on class-level variables inside a comprehension's 'for' clause in a class body.