Getting the text corresponding to each tag - scrapy

I'm trying to grab some data from the left-side column of a webpage. The aim is to click all of the "show more" buttons using scrapy_playwright and grab the title of each element belonging to the expanded lists. However, when I run my scraper it repeats the same header, Make, for every list. I need the header to be unique for each set of lists.
Here's my scraper:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine


class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())


class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
    custom_settings = {
        'User_Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            for i in range(0, 11):
                yield scrapy.Request(
                    url=url,
                    callback=self.parse,
                    meta=dict(
                        playwright=True,
                        playwright_include_page=True,
                        playwright_page_coroutines=[
                            PageCoroutine("click", selector=f"(//div[@class='toggle-bottom-filter'])[{i}]"),
                            PageCoroutine("wait_for_timeout", 5000),
                        ]
                    ),
                )

    def parse(self, response):
        container = response.xpath("(//div[@id]//ul[@class='list-filter disp-bloc list-model1'])//li")
        test = response.xpath("(//div[@class='elem-filter id_marque clearfix'])")
        for items in container:
            for values in test:
                loader = ItemLoader(ConfusedItem(), selector=items)
                loader.add_xpath('clicks', './/@onclick')
                loader.add_value('category', values.xpath("(//h2[@class=' select-load select-off'])//text()").getall())
                yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'json_data.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)

process.crawl(ConfusedSpider)
process.start()
Output:
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'Make',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}
Expected output:
{'category': 'SELLER TYPE',
'clicks': "javascript:ctrl.set_criteria('id_vendeur',2,'Dealer')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',30,'less than 30 day')"}
2022-01-27 15:17:04 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.theparking.eu/used-cars/?_escaped_fragment_=%2Fused-cars%2F%253Fid_categorie%253D0>
{'category': 'FIRST LISTING DATE',
'clicks': "javascript:ctrl.set_criteria('id_fraicheur',31,'more than 30 day')"}

Your code has two issues. First, your XPath selectors are not correct, and second, you are not actually using scrapy-playwright, so the clicks are never performed. Looping over the item index is also not correct, because once you click an item it is removed from the DOM, and the next item then sits at the first index. Also, to enable scrapy-playwright you need at least these additional settings:
'DOWNLOAD_HANDLERS': {
    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
I have corrected those two issues in the code below. You will still need to add some error handling and find a better way of determining how many clicks to issue (see the sketch after the sample output below).
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from scrapy.loader import ItemLoader
from scrapy_playwright.page import PageCoroutine


class ConfusedItem(scrapy.Item):
    clicks = Field(output_processor=TakeFirst())
    category = Field(output_processor=TakeFirst())


class ConfusedSpider(scrapy.Spider):
    name = 'confused'
    allowed_domains = ['x']
    start_urls = ['https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0']
    custom_settings = {
        'User_Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta=dict(
                    playwright=True,
                    playwright_page_coroutines=[
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                        PageCoroutine("click", "//div[@class='toggle-bottom-filter']"),
                    ]
                ),
            )

    def parse(self, response):
        for category in response.xpath("//div[@id='face_links']/div"):
            name = category.xpath("./h2/text()").get()
            for item in category.xpath("./ul/li"):
                loader = ItemLoader(ConfusedItem(), selector=item)
                loader.add_xpath('clicks', './@onclick')
                loader.add_value("category", name)
                yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'json_data.jl',
        'FEED_FORMAT': 'jsonlines',
        'DOWNLOAD_HANDLERS': {
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 60000,
        "PLAYWRIGHT_BROWSER_TYPE": "webkit"
    }
)

process.crawl(ConfusedSpider)
process.start()
Sample output is as below
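On that last point, about knowing how many clicks to enable: one option is to hand the live Playwright page to the callback with playwright_include_page and let the callback find and click every toggle itself, however many there are. The sketch below is untested against the live site (the spider name is made up for illustration, and it assumes the same scrapy-playwright settings shown above); it reuses the XPaths from the corrected spider:
import scrapy
from scrapy.selector import Selector


class CountingSpider(scrapy.Spider):
    # Hypothetical spider name; only illustrates counting/clicking toggles at runtime.
    name = "confused_counting"
    start_urls = ["https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_categorie%3D0"]

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,  # expose the Playwright page to the callback
                ),
            )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        # Grab a handle to every "show more" toggle first, then click each one.
        # Handles to the remaining toggles stay valid even if a clicked one is
        # removed from the DOM afterwards.
        toggles = await page.query_selector_all("//div[@class='toggle-bottom-filter']")
        for toggle in toggles:
            await toggle.click()
        html = await page.content()
        await page.close()
        selector = Selector(text=html)
        for category in selector.xpath("//div[@id='face_links']/div"):
            name = category.xpath("./h2/text()").get()
            for item in category.xpath("./ul/li"):
                yield {
                    "category": name,
                    "clicks": item.xpath("./@onclick").get(),
                }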

Related

Exporting the results from multiple spiders

I wanted to set up multiple spiders and run these, where URL_LIST contains links to the main URLs and DATA_LIST contains URLs embedded in the main URLs. The example I am using does not represent this case, since I am using the quotes URLs, but that is the purpose of the set-up. I then wanted to crawl the spiders and store the results. However, I am unsure how to call the spiders to crawl them, because there are two separate spiders.
For example, if I run scrapy crawl val in the terminal I get:
raise error.ReactorAlreadyInstalledError("reactor already installed")
twisted.internet.error.ReactorAlreadyInstalledError: reactor already installed
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy import signals
import scrapy

URL_LIST = ['https://quotes.toscrape.com/tag/love/',
            'https://quotes.toscrape.com/tag/inspirational/']
DATA_LIST = ['https://quotes.toscrape.com/tag/life/',
             'https://quotes.toscrape.com/tag/humor/']


def store_url(*args, **kwargs):
    URL_LIST.append(kwargs['item'])


def store_data(*args, **kwargs):
    DATA_LIST.append(kwargs['item'])


class QuotesSpiderWebsiteA(scrapy.Spider):
    name = 'val'
    start_urls = URL_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


class QuotesSpiderWebsiteB(scrapy.Spider):
    name = 'valb'
    start_urls = DATA_LIST
    custom_settings = {'FEEDS': {
        'quotes.jl': {
            'format': 'jsonlines'
        }
    }}

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                self.parse
            )

    def parse(self, response):
        content = response.xpath('//div[@class = "row"]')
        for items in content:
            yield {
                'some_items_links': items.xpath(".//a//@href").get(),
            }


if __name__ == '__main__':
    configure_logging()
    runner = CrawlerRunner()

    @defer.inlineCallbacks
    def crawl():
        crawler1 = runner.create_crawler(QuotesSpiderWebsiteA)
        crawler2 = runner.create_crawler(QuotesSpiderWebsiteB)
        crawler1.signals.connect(store_url, signals.item_scraped)
        crawler2.signals.connect(store_data, signals.item_scraped)
        yield runner.crawl(crawler1)
        yield runner.crawl(crawler2)
        reactor.stop()
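Note that the snippet ends without ever calling crawl() or starting the reactor, so running the file directly would do nothing. The CrawlerRunner example in the Scrapy docs finishes the if __name__ == '__main__': block with roughly these two extra lines (a sketch of the missing tail, assuming the script above is otherwise unchanged):
    # still inside the if __name__ == '__main__': block, after the crawl() definition
    crawl()
    reactor.run()  # the script blocks here until both crawls have finished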

Scrapy Selenium: Why is pagination not working for scrapy-selenium?

I am trying to get data using scrapy-selenium but there is some issue with the pagination. I have tried my level best to use different selectors and methods, but nothing changes; it is only able to scrape the first page. I have also checked the other solutions, but I am still unable to make it work. Looking forward to experts' advice.
Source: https://www.gumtree.com/property-for-sale/london
import scrapy
from urllib.parse import urljoin
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


class Basic2Spider(scrapy.Spider):
    name = 'basic2'

    def start_requests(self):
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
        driver.set_window_size(1920, 1080)
        driver.get("https://www.gumtree.com/property-for-sale/london")
        time.sleep(2)
        property_xpath = driver.find_elements(By.XPATH, "(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
        for detail in property_xpath:
            href = detail.get_attribute('href')
            time.sleep(2)
            yield SeleniumRequest(
                url=href,
            )
        driver.quit()
        return super().start_requests()

    def parse(self, response):
        yield {
            'Title': response.xpath("//div[@class='css-w50tn5 e1pt9h6u11']/h1/text()").get(),
            'Price': response.xpath("//h3[@itemprop='price']/text()").get(),
            'Add Posted': response.xpath("//*[@id='content']/div[1]/div/main/div[5]/section/div[1]/dl[1]/dd/text()").get(),
            'Links': response.url
        }
        next_page = response.xpath("//li[@class='pagination-currentpage']/following-sibling::li[1]/a/text()").get()
        if next_page:
            abs_url = f'https://www.gumtree.com/property-for-sale/london/page{next_page}'
            yield SeleniumRequest(
                url=abs_url,
                wait_time=5,
                callback=self.parse
            )
Your code seems to be correct, but you are getting blocked at the TCP/IP level. I also tried an alternative approach where the code is correct and the pagination works; this kind of pagination is about twice as fast as the others, but it sometimes gives strange results and sometimes also gets IP blocked.
import scrapy
from scrapy import Selector
from scrapy_selenium import SeleniumRequest


class Basic2Spider(scrapy.Spider):
    name = 'basic2'
    responses = []

    def start_requests(self):
        url = 'https://www.gumtree.com/property-for-sale/london/page{page}'
        for page in range(1, 6):
            print(page)
            yield SeleniumRequest(
                url=url.format(page=page),
                callback=self.parse,
                wait_time=5
            )

    def parse(self, response):
        driver = response.meta['driver']
        intial_page = driver.page_source
        self.responses.append(intial_page)
        for resp in self.responses:
            r = Selector(text=resp)
            property_xpath = r.xpath("(//article[@class='listing-maxi']/a)[position()>=2 and position()<30]")
            for detail in property_xpath:
                yield {
                    'Title': detail.xpath('.//*[@class="listing-title"]/text()').get().strip(),
                    'Price': detail.xpath('.//*[@class="listing-price"]/strong/text()').get(),
                    'Add Posted': detail.xpath('.//*[@class="listing-posted-date txt-sub"]/span//text()').getall()[2].strip(),
                    'Links': response.url
                }

Extend scrapy settings value per spider

Assume we want to add a specific item pipeline for a particular spider. To comply with the DRY principle, I just want to read the current pipelines from the settings, add my specific pipeline, and set the result back on the settings for that spider.
We cannot accomplish this via the custom_settings class attribute. Even setting it via from_crawler does not work:
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    crawler.settings.setdict({'ITEM_PIPELINES':
        {**dict(crawler.settings.getdict('ITEM_PIPELINES')),
         'myscrapers.pipelines.CustomPipeline': 11}
    }, priority='spider')
    return super().from_crawler(cls, crawler, *args, **kwargs)
That causes this error:
TypeError: Trying to modify an immutable Settings object
How can we correctly extend a settings value in scrapy at spider level?
You can set the settings for the process:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}
    process = CrawlerProcess(settings)
    process.crawl(spidername)
    settings['ITEM_PIPELINES'] = {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}
    process.crawl(spidername)
    process.start()
But if you really want to do all this inside the spider, you can override the update_settings method:
import scrapy
from itemadapter import ItemAdapter
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
import re


class ExampleSpider(scrapy.Spider):
    name = 'exampleSpider'
    allowed_domains = ['scrapingclub.com']
    start_urls = ['https://scrapingclub.com/exercise/detail_basic/']
    custom_settings1 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline1': 300}}
    custom_settings2 = {'ITEM_PIPELINES': {'tempbuffer.spiders.yetanotherspider.ItemPipeline2': 300}}

    @classmethod
    def update_settings(cls, settings):
        settings.setdict(getattr(cls, 'custom_settings1' if getattr(cls, 'is_pipeline_1', True) else 'custom_settings2', None) or {}, priority='spider')

    def parse(self, response):
        item = ExampleItem()
        item['title'] = response.xpath('//h3/text()').get()
        item['price'] = response.xpath('//div[@class="card-body"]/h4/text()').get()
        yield item


class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()


class ItemPipeline1:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price < 15:
                print('Cheap enough')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


class ItemPipeline2:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        price = adapter['price']
        if price:
            price = float(re.findall(r'\d+\.\d+', price)[0])
            if price > 10:
                print('Too expensive')
            return item
        else:
            raise DropItem(f"Missing price in {item}")


if __name__ == "__main__":
    spidername = 'exampleSpider'
    settings = get_project_settings()
    settings['USER_AGENT'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    process = CrawlerProcess(settings)
    ExampleSpider.is_pipeline_1 = True
    process.crawl(ExampleSpider)
    ExampleSpider.is_pipeline_1 = False
    process.crawl(ExampleSpider)
    process.start()
But honestly I think the first way is better...
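If the goal from the original question was specifically to extend the existing ITEM_PIPELINES rather than replace them, the same update_settings override can merge into whatever the project settings already define. A sketch along those lines (the CustomPipeline path is the hypothetical one from the question, and the spider name is made up):
import scrapy


class MySpider(scrapy.Spider):
    name = "myspider"  # placeholder name for illustration

    @classmethod
    def update_settings(cls, settings):
        # Apply custom_settings as usual, then extend ITEM_PIPELINES instead of
        # replacing the dict the project settings already contain. update_settings
        # runs before the settings are frozen, so mutation is still allowed here.
        super().update_settings(settings)
        pipelines = settings.getdict("ITEM_PIPELINES")
        pipelines["myscrapers.pipelines.CustomPipeline"] = 11  # path from the question
        settings.set("ITEM_PIPELINES", pipelines, priority="spider")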

Scrapy-Selenium: Unable to get data

I am trying to get some data using scrapy-selenium. The script is working fine except for the "URL", "Name of the share" and "Code of the share" fields; it returns empty values for these. The "URL" field does contain a link, but it is not the full URL.
What am I actually missing here?
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from datetime import datetime
from scrapy.selector import Selector
from markets_selenium.settings import *


class PriceSelSpider(scrapy.Spider):
    name = 'price_sel'

    def start_requests(self):
        yield SeleniumRequest(
            url="https://markets.ft.com",
            wait_time=10,
            callback=self.parse
        )

    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_path = SELENIUM_DRIVER_EXECUTABLE_PATH
        driver = webdriver.Chrome(executable_path=chrome_path, options=chrome_options)
        driver.set_window_size(1920, 1080)
        driver.get('https://markets.ft.com/data/funds/tearsheet/historical?s=GB00B4NXY349:GBP')
        time.sleep(2)
        # self.html = driver.page_source
        driver.quit()

    def parse(self, response):
        # resp = Selector(text=self.html)
        now = datetime.now()
        dt_string = now.strftime("%Y/%m/%d %H:%M:%S")
        infos = response.xpath("//div[@class='mod-ui-table--freeze-pane__scroll-container']/table/tbody/tr")
        for info in infos:
            yield {
                'Time': dt_string,
                'URL': response.url,
                'Name of the share': response.xpath(".//h1[@class='mod-tearsheet-overview__header__name mod-tearsheet-overview__header__name--large']/text()").get(),
                'Code of the share': response.xpath(".//div[@class='mod-tearsheet-overview__header__symbol']/span/text()").get(),
                'Date of share price ': info.xpath(".//td/span[1]/text()").get(),
                'Opening price': info.xpath(".//td[2]/text()").get(),
                'Highest price': info.xpath(".//td[3]/text()").get(),
                'Lowest price': info.xpath(".//td[4]/text()").get(),
                'Closing price': info.xpath(".//td[5]/text()").get(),
                'Volume': info.xpath("(.//td/span)[3]/text()").get()
            }
Output
{'Time': '2021/11/30 01:47:55', 'URL': 'https://markets.ft.com/data', 'Name of the share': None, 'Code of the share': None, 'Date of share price ': 'Friday, November 26, 2021', 'Opening price': '178.73', 'Highest price': '178.73', 'Lowest price': '178.73', 'Closing price': '178.73', 'Volume': '--'}
2021-11-30 01:47:55 [scrapy.core.scraper] DEBUG: Scraped from <200 https://markets.ft.com/data>

scrapy getting coincident data

import scrapy


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'https://www.tripadvisor.com/Restaurant_Review-g298006-d740275-Reviews-Deniz_Restaurant-Izmir_Izmir_Province_Turkish_Aegean_Coast.html',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'author': response.xpath("//div[contains(@class, 'member_info')]//div/text()").extract(),
                'rating': response.xpath("//span[contains(@class,'ui_bubble_rating')]/@alt").extract(),
                'comment_tag': response.xpath("//span[contains(@class, 'noQuotes')]/text()").extract(),
                'comment': response.xpath('//div[@class="entry"]/p/text()').extract()
            }
        next_page = response.xpath("//div[contains(@class, 'unified')]/a[contains(@class, 'next')]/@href").extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)
This is my spider code. My problem is that when I crawl and export to JSON, there is a lot of repeated data from the website.
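One likely cause of the duplicates: every field inside the per-quote loop uses an absolute XPath starting with //, which is evaluated against the whole page rather than the current quote, so each iteration extracts the same page-wide lists again. A minimal sketch of the relative-selector pattern (the review-container XPath and spider name are placeholder assumptions, not verified against TripAdvisor's current markup):
import scrapy


class QuotesRelativeSpider(scrapy.Spider):
    # Hypothetical spider; only illustrates scoping queries with ".//".
    name = "quotes_relative"
    start_urls = [
        'https://www.tripadvisor.com/Restaurant_Review-g298006-d740275-Reviews-Deniz_Restaurant-Izmir_Izmir_Province_Turkish_Aegean_Coast.html',
    ]

    def parse(self, response):
        # Placeholder container selector -- adjust to whatever wraps a single review.
        for review in response.xpath("//div[contains(@class, 'review-container')]"):
            yield {
                # ".//" keeps each query scoped to the current review only.
                'author': review.xpath(".//div[contains(@class, 'member_info')]//div/text()").get(),
                'rating': review.xpath(".//span[contains(@class, 'ui_bubble_rating')]/@alt").get(),
                'comment_tag': review.xpath(".//span[contains(@class, 'noQuotes')]/text()").get(),
                'comment': review.xpath(".//div[@class='entry']/p/text()").get(),
            }
        next_page = response.xpath("//div[contains(@class, 'unified')]/a[contains(@class, 'next')]/@href").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)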