scrapy gettin coincident data - scrapy

import scrapy
class QuotesSpider(scrapy.Spider):
name = "quotes"
start_urls = [
'https://www.tripadvisor.com/Restaurant_Review-g298006-d740275-Reviews-Deniz_Restaurant-Izmir_Izmir_Province_Turkish_Aegean_Coast.html',
]
def parse(self, response):
for quote in response.css('div.quote'):
yield {
'author': response.xpath("//div[contains(#class, 'member_info')]//div/text()").extract(),
'rating': response.xpath("//span[contains(#class,'ui_bubble_rating')]/#alt").extract() ,
'comment_tag': response.xpath("//span[contains(#class, 'noQuotes')]/text()").extract(),
'comment': response.xpath('//div[#class="entry"]/p/text()').extract()
}
next_page = response.xpath("//div[contains(#class, 'unified')]/a[contains(#class, 'next')]/#href").extract_first()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
This is my spider code. My problem is when i used crawl with json. there are so many repeated data from website

Related

Exporting the results from multiple spiders

I wanted to set up multiple spiders and run these, where URL_LINK contains links to the main url, and DATA_LINK are urls embedded in the main urls. The example I am using does not represent this case as I am using quotes urls, however this is the purpose of the set-up. I then wanted to crawl the spiders and store these results. However, I am unsure on how I can call the spider to crawl it because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy
URL_LIST = ['https://quotes.toscrape.com/tag/love/',
'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
'https://quotes.toscrape.com/tag/humor/']
def store_url(*args, **kwargs):
URL_LIST.append(kwargs['item'])
def store_data(*args, **kwargs):
DATA_LIST.append(kwargs['item'])
class QuotesSpiderWebsiteA(scrapy.Spider):
name='val'
start_urls = URL_LIST
custom_settings = {'FEEDS':{
'quotes.jl':{
'format':'jsonlines'
}
}}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
self.parse
)
def parse(self, response):
content = response.xpath('//div[#class = "row"]')
for items in content:
yield {
'some_items_links':items.xpath(".//a//#href").get(),
}
class QuotesSpiderWebsiteB(scrapy.Spider):
name='valb'
start_urls = DATA_LIST
custom_settings = {'FEEDS':{
'quotes.jl':{
'format':'jsonlines'
}
}}
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
self.parse
)
def parse(self, response):
content = response.xpath('//div[#class = "row"]')
for items in content:
yield {
'some_items_links':items.xpath(".//a//#href").get(),
}
if __name__ == '__main__':
configure_logging()
runner = CrawlerRunner()
#defer.inlineCallbacks
def crawl():
crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
crawler1.signals.connect(store_url, signals.item_scraped)
crawler2.signals.connect(store_data, signals.item_scraped)
yield runner.crawl(crawler1)
yield runner.crawl(crawler2)
reactor.stop()

Need helpwith YellowPages spider

I'm new to scrapy, I've been able to create a few spiders so far. I would like to write a spider that will crawl Yellowpages, looking for websites that have a 404 response, the spider is working OK, however, the pagination is not working. Any help will be much appreciated. thanks in advance
# -*- coding: utf-8 -*-
import scrapy
class SpiderSpider(scrapy.Spider):
name = 'spider'
#allowed_domains = ['www.yellowpages.com']
start_urls = ['https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL']
def parse(self, response):
for listing in response.css('div.search-results.organic div.srp-listing'):
url = listing.css('a.track-visit-website::attr(href)').extract_first()
yield scrapy.Request(url=url, callback=self.parse_details)
# follow pagination links
next_page_url = response.css('a.next.ajax-page::attr(href)').extract_first()
next_page_url = response.urljoin(next_page_url)
if next_page_url:
yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse_details(self,response):
yield{'Response': response,}
I ran your code and found out that there are some errors. In the first loop, you don't check the value of url and sometimes it is None. This error stops the execution, that's why you thought the pagination didn't work.
Here is a working code:
# -*- coding: utf-8 -*-
import scrapy
class SpiderSpider(scrapy.Spider):
name = 'spider'
#allowed_domains = ['www.yellowpages.com']
start_urls = ['https://www.yellowpages.com/search?search_terms=handyman&geo_location_terms=Miami%2C+FL']
def parse(self, response):
for listing in response.css('div.search-results.organic div.srp-listing'):
url = listing.css('a.track-visit-website::attr(href)').extract_first()
if url:
yield scrapy.Request(url=url, callback=self.parse_details)
next_page_url = response.css('a.next.ajax-page::attr(href)').extract_first()
next_page_url = response.urljoin(next_page_url)
if next_page_url:
yield scrapy.Request(url=next_page_url, callback=self.parse)
def parse_details(self,response):
yield{'Response': response,}

scrapy list of links from parse result

Here's my current code:
#scrap all the cafe links from example.com
import scrapy, re
from scrapy.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy import Selector
class DengaSpider(scrapy.Spider):
name = 'cafes'
allowed_domains = ['example.com']
start_urls = [
'http://example.com/archives/8136.html',
]
cafeOnlyLink = []
def parse(self, response):
cafelink = response.xpath('//li/a[contains(#href, "archives")]/#href').extract()
twoHourRegex = re.compile(r'^http://example\.com/archives/\d+.html$')
cafeOnlyLink = [ s for s in cafelink if twoHourRegex.match(s) ]
So how should I continue to parse content from each url containing in the [cafeOnlyLink] list? and I want to save all the result from each page in a csv file.
You can use something like this:
for url in cafeOnlyLink:
yield scrapy.Request(url=url, callback=self.parse_save_to_csv)
def parse_save_to_csv(self, response):
# The content is in response.body, so you have to select what information
# you want to sent to the csv file.

scrapy code not working properly

… I have a scrapy Code that is running in shell, but when I try to export it to csv, it returns an empty file. It exports data when I do not go into a link and try to parse the description
import scrapy
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["dmozs.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
filename = response.url.split("/")[-2] + '.html'
with open(filename, 'wb') as f:
f.write(response.body)
import scrapy
class DmozSpider(scrapy.Spider):
name = "dmoz"
allowed_domains = ["dmoz.org"]
start_urls = [
"http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
"http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
]
def parse(self, response):
filename = response.url.split("/")[-2] + '.html'
with open(filename, 'wb') as f:
f.write(response.body)
just you have miss type
allowed_domains = ["dmozs.org"]
allowed_domains = ["dmoz.org"]
please change code allowed_domains = ["dmoz.org"]

Scrapy won't get results from first page

here is my spider:
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
class vriskoSpider(CrawlSpider):
name = 'vrisko'
allowed_domains = ['vrisko.gr']
start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
)
def parse_vrisko(self, response):
hxs = HtmlXPathSelector(response)
vriskoit = VriskoItem()
vriskoit['eponimia'] = hxs.select("//a[#itemprop='name']/text()").extract()
vriskoit['address'] = hxs.select("//div[#class='results_address_class']/text()").extract()
print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
return vriskoit
The pages i try to crawl have the format http://www.blabla.com/blabla/bla?page=x
where x = any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why does this happen ?
Thank you in advance!
if you look into scrapy doc , start_urls response goes to **
parse
** method
so you can change your rule like this
rules = (
Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and method name from def parse_vrisko(self, response): to def parse(self, response):
or you can remove start_urls and start your spider with def start_requests(self): with callback to parse_vrisko