Reviews from site duplicate - scrapy

I am scraping reviews from a website and the scraped reviews end up duplicated. I suspect my XPath expressions are the cause, but I cannot work out how to fix them.
Here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd


class CruisesItems(scrapy.Item):
    user_rating = Field(output_processor = TakeFirst())
    user = Field(output_processor = TakeFirst())
    location = Field(output_processor = TakeFirst())
    title = Field(output_processor = TakeFirst())
    reviews = Field(output_processor = Join())


class CruisesSpider(scrapy.Spider):
    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback = self.parse
            )

    def parse(self, response):
        container = response.xpath("//div[@class='ui_column is-12-desktop is-12-tablet is-12-mobile cEMcR']/div[2]//div")
        for reviews in container:
            loader = ItemLoader(CruisesItems(), selector = reviews)
            loader.add_xpath('user_rating', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='emWez F1']/span/@class")
            loader.add_xpath('user', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]")
            loader.add_xpath('location', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]")
            loader.add_xpath('title', ".//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='fpMxB MC _S b S6 H5 _a']//text()")
            loader.add_xpath('reviews', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='pIRBV _T']//span//text()")
            yield loader.load_item()


process = CrawlerProcess(
    settings = {
        'FEEDS': {
            'cruise_reviews.jl': {
                'format': 'jsonlines'
            }
        }
    }
)

process.crawl(CruisesSpider)
process.start()

You need to use relative XPaths.
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd


class CruisesItems(scrapy.Item):
    user_rating = Field(output_processor = TakeFirst())
    user = Field(output_processor = TakeFirst())
    location = Field(output_processor = TakeFirst())
    title = Field(output_processor = TakeFirst())
    reviews = Field(output_processor = Join())


class CruisesSpider(scrapy.Spider):
    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']
    custom_settings = {
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        # container = response.xpath("//div[@class='ui_column is-12-desktop is-12-tablet is-12-mobile cEMcR']/div[2]//div")
        container = response.xpath('//div[@id="ship_reviews"]//div[@class="eVykL Gi z cPeBe MD cwpFC"]')
        for reviews in container:
            loader = ItemLoader(CruisesItems(), selector=reviews)
            loader.add_xpath('user_rating', ".//div[@class='emWez F1']/span/@class")
            loader.add_xpath('user', "(.//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]")
            loader.add_xpath('location', "(.//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]")
            loader.add_xpath('title', ".//div[@class='fpMxB MC _S b S6 H5 _a']//text()")
            loader.add_xpath('reviews', ".//div[@class='pIRBV _T']//span//text()")
            yield loader.load_item()


process = CrawlerProcess(
    settings = {
        'FEEDS': {
            'cruise_reviews.jl': {
                'format': 'jsonlines'
            }
        }
    }
)

process.crawl(CruisesSpider)
process.start()
Notice that I moved //div[@class="eVykL Gi z cPeBe MD cwpFC"] into the container XPath, because it appears in every one of the item XPaths; inside the loop, each field then uses a relative XPath (starting with .) so it only matches content within its own review.
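To see why the leading dot matters, here is a minimal sketch (the markup and class names below are made up for illustration, not taken from TripAdvisor): inside a loop over selectors, an absolute //... expression searches the whole page again for every review, so each item keeps picking up the same matches, while a relative .//... expression only looks inside the current review.
from scrapy.selector import Selector

html = """
<div class="review"><span class="user">alice</span></div>
<div class="review"><span class="user">bob</span></div>
"""
sel = Selector(text=html)

for review in sel.xpath("//div[@class='review']"):
    # Absolute path: matches every user on the page, regardless of the current node -> duplicates.
    print(review.xpath("//span[@class='user']/text()").getall())   # ['alice', 'bob'] both times
    # Relative path: restricted to the current review -> one user per item.
    print(review.xpath(".//span[@class='user']/text()").getall())  # ['alice'], then ['bob']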

Related

How to crawl multiple URLs from csv using Selenium and Scrapy

I am currently trying to crawl multiple sites from https://blogabet.com/
At the moment, I have a "urls.txt" file which includes two URLs: 1. http://sabobic.blogabet.com 2. http://dedi22.blogabet.com
The problem I have is the following: Selenium opens the two URLs one after the other in the same tab, but the spider only crawls the content of the second URL in my "urls.txt" file, and it crawls it twice. It does not crawl any content from the first URL.
I think there is a problem with the for loop and how the parse_tip function is called. This is my code:
import scrapy
from scrapy import Spider
from scrapy.selector import Selector
from scrapy.http import Request
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from time import sleep
import re
import csv


class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    # We are not using the response parameter here because no start URLs are defined;
    # the Spider class falls back to calling start_requests by default.
    # Requests have to be returned or yielded.
    def start_requests(self):
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [url.strip() for url in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            yield Request(url, callback=self.parse_tip)

    def parse_tip(self, response):
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {'Username': username,
                   'Publish date': publish_date
                   }
Why are you doing another request with yield Request(url, callback=self.parse_tip) when you already have the response from Selenium? Just pass the Selenium page source to parse_tip and work with that text inside it:
class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [url.strip() for url in f.readlines()]
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            for item in self.parse_tip(text=self.driver.page_source):
                yield item

    def parse_tip(self, text):
        sel = Selector(text=text)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {'Username': username,
                   'Publish date': publish_date
                   }

Selenium & Scrapy: Last URL overwrites other URLs

I am currently trying to crawl data from three websites (three different URLs), so I am using a text file to load the different URLs as start URLs.
At the moment, there are three URLs in my file, but the script only saves the data from the last URL, overwriting the data from the two URLs before it.
This is my code:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from time import sleep
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import re
import csv


class AlltipsSpider(Spider):
    name = 'alltips'
    allowed_domains = ['blogabet.com']

    def start_requests(self):
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        with open("urls.txt", "rt") as f:
            start_urls = [l.strip() for l in f.readlines()]
        self.driver = webdriver.Chrome('C:\webdrivers\chromedriver.exe')
        for url in start_urls:
            self.driver.get(url)
            self.driver.find_element_by_id('currentTab').click()
            sleep(3)
            self.logger.info('Sleeping for 5 sec.')
            self.driver.find_element_by_xpath('//*[@id="_blog-menu"]/div[2]/div/div[2]/a[3]').click()
            sleep(7)
            self.logger.info('Sleeping for 7 sec.')
            yield Request(self.driver.current_url, callback=self.crawltips)

    def crawltips(self, response):
        sel = Selector(text=self.driver.page_source)
        allposts = sel.xpath('//*[@class="block media _feedPick feed-pick"]')
        for post in allposts:
            username = post.xpath('.//div[@class="col-sm-7 col-lg-6 no-padding"]/a/@title').extract()
            publish_date = post.xpath('.//*[@class="bet-age text-muted"]/text()').extract()
            yield {'Username': username,
                   'Publish date': publish_date
                   }

How to scrape multiple pages with scrapy?

I'm trying to scrape a table that spans multiple pages. With the following code I can print the first page's data:
import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators


class mySpider(scrapy.Spider):
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    def parse(self, response):
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
I have written the following code to download all the pages, based on other posts I have read:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def parse_table(self, response):
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract()
            print(item)
            yield item
When I run it to print all the pages, I don't obtain anything. Can anyone help me find the mistake?
Scrapy needs the parse callback first (see the Scrapy docs):
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow=True),)

    def parse(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
or just override the start_requests method with a different callback:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow=True),)

    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
Here is code to crawl all the pages:
import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider, Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html
from w3lib.url import add_or_replace_parameter


class mySpider(CrawlSpider):
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
    )

    api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

    def parse(self, response):
        max_page = int(response.xpath('//*[@id="spanPageCountB"]/text()').re_first(r'\d+', '0'))
        for page in range(1, max_page + 1):
            yield Request(
                url=add_or_replace_parameter(self.api_url, 'Page', page),
                callback=self.parse_table)

    def parse_table(self, response):
        for tr in response.xpath('//table/tr'):
            item = EducationIndicators()
            item['country'] = tr.xpath('./td[1]/text()').extract_first()
            item['years'] = tr.xpath('./td[position()>1]/text()').extract()
            print(item)
            yield item
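For clarity, a small standalone sketch of what w3lib's add_or_replace_parameter does with the paging parameter (using the same handler URL as api_url above):
from w3lib.url import add_or_replace_parameter

api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'
# Replaces the value of the Page query parameter and keeps the other parameters in place.
print(add_or_replace_parameter(api_url, 'Page', '7'))
This is why the parse method above can simply generate one request per page number, from 1 up to max_page, all pointing at the same data handler.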

Scrapy issue with csv output

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem
from scrapy.http import Request


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (Rule(SgmlLinkExtractor(allow=('\?page=\d')), 'parse_start_url', follow=True),)

    def parse_start_url(self, response):
        hxs = HtmlXPathSelector(response)
        subpages = hxs.select('//a[@class="detailsHyper_class"]/@href').extract()
        ep = hxs.select('//a[@itemprop="name"]/text()').extract()
        ad = hxs.select('//div[@class="results_address_class"]/text()').extract()
        for eponimia, address, subpage in zip(ep, ad, subpages):
            vriskoit = VriskoItem()
            vriskoit['eponimia'] = eponimia
            vriskoit['address'] = address
            request = Request(subpage, callback=self.subPage)
            request.meta['vriskoit'] = vriskoit
            yield request

    def subPage(self, response):
        vriskoit = response.meta['vriskoit']
        hxs = HtmlXPathSelector(response)
        vriskoit['category'] = hxs.select('//div[@class="category_class"]/span/text()').extract()
        yield vriskoit
and here is my pipeline:
import csv


class myExporter(object):

    def __init__(self):
        self.brandCategoryCsv = csv.writer(open('brandCategoryTable.csv', 'wb'))
        self.brandCategoryCsv.writerow(['eponimia', 'address', 'category'])

    def process_item(self, item, spider):
        for e, a, c in zip(item['eponimia'], item['address'], item['category']):
            self.brandCategoryCsv.writerow([e.encode('utf-8'), a.encode('utf-8'), c.encode('utf-8')])
        return item
My problem is that for the first two fields (eponimia, address), only the first character is written to the output CSV file, and I can't find out why.
Any help would be much appreciated; I am out of ideas.
Remove the zip call from myExporter.process_item:
def process_item(self, item, spider):
    self.brandCategoryCsv.writerow([item['eponimia'].encode('utf-8'),
                                    item['address'].encode('utf-8'),
                                    item['category'].encode('utf-8')])
    return item
You already split the scraped lists into individual items in vriskoSpider.parse_start_url, so each field here is a plain string.
zip then iterates over those strings character by character:
In [1]: a = 'test1'
In [2]: b = 'test2'
In [3]: for x, y in zip(a, b):
   ...:     print x, y
   ...:
t t
e e
s s
t t
1 2

Scrapy won't get results from first page

Here is my spider:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from vrisko.items import VriskoItem


class vriskoSpider(CrawlSpider):
    name = 'vrisko'
    allowed_domains = ['vrisko.gr']
    start_urls = ['http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF']
    rules = (
        Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse_vrisko'),
    )

    def parse_vrisko(self, response):
        hxs = HtmlXPathSelector(response)
        vriskoit = VriskoItem()
        vriskoit['eponimia'] = hxs.select("//a[@itemprop='name']/text()").extract()
        vriskoit['address'] = hxs.select("//div[@class='results_address_class']/text()").extract()
        print ' '.join(vriskoit['eponimia']).join(vriskoit['address'])
        return vriskoit
The pages I try to crawl have the format http://www.blabla.com/blabla/bla?page=x, where x is any integer.
My problem is that my spider crawls all pages except the first one!
Any ideas why this happens?
Thank you in advance!
If you look into the Scrapy docs, the response for start_urls goes to the parse method, so you can change your rule like this:
rules = (
    Rule(SgmlLinkExtractor(allow=('\?page=\d')), callback='parse'),
)
and change the method name from def parse_vrisko(self, response): to def parse(self, response):
Or you can remove start_urls and start your spider with def start_requests(self), yielding each request with a callback to parse_vrisko, as sketched below.
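A minimal sketch of that second option, assuming the rest of the spider stays as posted (it also needs from scrapy.http import Request at the top):
    def start_requests(self):
        # Send the first search page straight to parse_vrisko instead of relying on start_urls.
        yield Request(
            'http://www.vrisko.gr/search/%CE%B3%CE%B9%CE%B1%CF%84%CF%81%CE%BF%CF%82/%CE%BA%CE%BF%CF%81%CE%B4%CE%B5%CE%BB%CE%B9%CE%BF',
            callback=self.parse_vrisko,
        )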