Following urls in javascript - Scrapy Splash - scrapy

I am extremely new to web scraping. I manage to extract information from static websites but am now trying my hand following urls and extracting data (which ofcourse involves some javascript). I have installed scrapy-splash for the same which is running perfectly fine.
The website I am trying to scrape is https://www.ta.com/portfolio/investments/ari-network-services-inc and the button to the top right side takes you to the next page (which is javascript, hence splash). I want to scrape some basic data (like company name, sectors etc) on all the pages till the last one. This is what I have done so far and I need help to correct this to successfully execute.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = ['https://www.ta.com/portfolio/investments/ari-network-services-inc']
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team
}
next_page = response.css('li.item_detail-nav-item--next a::attr(href)').extract()
if next_page is not None:
yield SplashRequest(urlparse.urljoin('https://www.ta.com',next_page),callback=self.parse, args={"wait":3})
This gives me the correct information for the start_url but doesn't proceed to the next page.

Update. The issue was in the order in which I had the scraping of websites. Below is the updated code which worked well.
import scrapy
from scrapy_splash import SplashRequest
import urllib.parse as urlparse
class TAFolio(scrapy.Spider):
name = 'Portfolio'
start_urls = [
'https://www.ta.com/portfolio/business-services',
'https://www.ta.com/portfolio/consumer',
'https://www.ta.com/portfolio/financial-services',
'https://www.ta.com/portfolio/healthcare',
'https://www.ta.com/portfolio/technology'
]
def start_requests(self):
for url in self.start_urls:
yield SplashRequest(url=url, callback = self.parse, args={"wait" : 3})
def parse(self, response):
companylink = response.css('div.tiles.js-portfolio-tiles a::attr(href)').extract()
for i in companylink:
yield response.follow('https://www.ta.com' + str(i), callback=self.parse1)
def parse1(self, response):
companyname = response.css('h1.item_detail-main-info-heading::text').extract_first()
sectors = response.css('.item_detail-main-info-group-item::text')[0].extract()
investmentyear = response.css('.item_detail-main-info-group-item::text')[1].extract()
status = response.css('.item_detail-main-info-group-item::text')[2].extract()
location = response.css('.item_detail-main-info-group-item::text')[3].extract()
region = response.css('.item_detail-main-info-group-item::text')[4].extract()
team = response.css('div.item_detail-main-info-group a::text').extract()
about_company = response.css('h2.item_detail-main-content-heading::text').extract()
about_company_detail = response.css('div.markdown p::text').extract()
yield {
'companyname': companyname,
'sectors': sectors,
'investmentyear': investmentyear,
'status': status,
'location': location,
'region': region,
'team': team,
'about_company': about_company,
'about_company_detail' : about_company_detail
}

Related

scrapy-playwright returning nothing but an error

I'm learning Scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using the CrawlSpider with a rule including a process_request that triggers the request to run through playwright. In my callback def I can print a value found on the page, but not return or yield anything. I've attempted to cache the data into an item, and return/yield a dict, all of which produces the error.
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response
def set_playwright_true(request, response):
request.meta["playwright"] = True
request.meta["playwright_include_page"] = True
request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
return request
class StoreSpider(CrawlSpider):
name = "retailer"
allowed_domains = ['retailer.com']
start_urls = ['https://www.retailer.com/store/0000-city-ak']
custom_settings = {
'ROBOTSTXT_OBEY': True ,
#'DOWNLOAD_DELAY': .5 ,
#'CONCURRENT_REQUESTS_PER_DOMAIN': 3 ,
'DOWNLOAD_HANDLERS': {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
} ,
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ,
}
rules = (
Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')), callback='parse_item', follow=False, process_request=set_playwright_true),
)
async def parse_item(self, response):
items = []
item = StoreItem()
self.logger.info('***** Start processing ' + response.url + '. *****')
Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
print(Name)
item['Name'] = Name
item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
item['Address1'] = response.css('span.store-address-line-1::text').get()
item['City'] = response.css('span.store-address-city::text').get()
item['State'] = response.css('span.store-address-state::text').get()
item['Zip'] = response.css('span.store-address-postal::text').get()
item['Phone'] = response.css('div.store-phone::text').get()
item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
items.append(item)
return(items)
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):

scrapy pagination not working on tripadvisor

i'm trying to scrape the restaurant pages on tripadvisor (just to learn how it works)
However, i only get the first page.
What am I missing?
here is the code, thanks!
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = ['https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name' : stores.css('a.bHGqj::text').extract(),
'link' : stores.css('a.bHGqj').xpath("#href").extract()
}
next_page = ('http://tripadvisor.com' + response.css('a.nav').attrib['href']).extract()
##next_page = response.xpath('//a[contains(text(), "Next")]/#href).extract())
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
#djmystica, Now it's working fine
import scrapy
class TripadvSpider(scrapy.Spider):
name = 'tripadv'
allowed_domains = ['tripadvisor.com']
start_urls = [
'https://www.tripadvisor.com/Restaurants-g60795-oa0-Philadelphia_Pennsylvania.html#EATERY_LIST_CONTENTS']
def parse(self, response):
for stores in response.css('div.emrzT'):
yield {
'name': stores.css('a.bHGqj::text').extract_first(),
'link': stores.css('a.bHGqj').xpath("#href").extract_first()}
#next_page = ('http://tripadvisor.com' +response.css('a.nav').attrib['href']).extract()
next_page = response.xpath('//a[contains(text(), "Next")]/#href').extract_first()
abs_next_page = f'https://www.tripadvisor.com{next_page}'
#next_page = ('http://tripadvisor.com' + response.css('a:contains("Next")').attrib['href'].extract())
if abs_next_page is not None:
yield response.follow(abs_next_page, callback=self.parse)

How to store scraped links in Scrapy

I did a lot of searches on the web but I couldn't find anything related or maybe it has to do with the wording used.
Basically, I would like to write a spider that would able to save the scraped links and to check if some other links have been already scraped. Is there any build in function in scrapy to do so?
Many thanks
You can write your own method for this purpose. I have written in my project and you can take reference from this. A dictionary called already_parsed_urls and for every callback, I am updating this dictionary.
You can look at the below code snippet and take reference.
from scrapy.spiders import CrawlSpider
from scrapy_splash import SplashRequest
class Spider(CrawlSpider):
name = 'test'
allowed_domains = []
web_url = ''
start_urls = ['']
counter = 0
already_parsed_urls = {}
wait_time = 3
timeout = '90'
def start_requests(self):
for start_url in self.start_urls:
yield SplashRequest(start_url, callback=self.parse_courses,
args={'wait': self.wait_time, 'timeout': self.timeout})
def parse_courses(self, response):
course_urls = []
yield SplashRequest(course_urls[0], callback=self.parse_items, args={'wait': self.wait_time})
def parse_items(self, response):
if not self.already_parsed_urls.get(response.url):
# Get Program URL
program_url = response.url
self.already_parsed_urls[response.url] = 1
else:
return {}

How to add multiple variables in one string (URL)

My spider starts off with the start_urls, being:
https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL
Based on a keywords.csv file, located in my resource folder, the keywordsID (number 20035386) will change. Once the number changed, the spider will fetch the data from another product.
I also have a chunk of code which constantly checks the page if isTruncated = true, if that's the case, it will change the page number in the URL to +1. The only problem I am having right now, is that I don't know how to set a second variable in one string (URL). When isTruncated = true the code need to adjust the URL's page number AND keywordsID accordingly. Currently, I only managed to add a variable for the page number.
Currently the chunk of code is:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
However, it should become something like:
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/ {keywordsid} ?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
When I run the spider, it will crawl all the pages of the product with keywordsID 20035386, but it will only crawl the first page of all the other products listed in the keywords.csv file.
FULL CODE
./krc/spiders/krc_spider.py
# -*- coding: utf-8 -*-
import scrapy
from krc.items import KrcItem
import json
import os
import csv
import time
import datetime
class KRCSpider(scrapy.Spider):
name = "krc_spider"
allowed_domains = ["kaercher.com"]
start_urls = ['https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page=1&size=8&isocode=nl-NL']
def start_requests(self):
"""Read keywords from keywords file amd construct the search URL"""
with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
for keyword in csv.DictReader(search_keywords):
search_text=keyword["keyword"]
category = keyword["keywordtype"]
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)
# The meta is used to send our search text into the parser as metadata
yield scrapy.Request(url, callback = self.parse, meta = {"search_text": search_text, "category": category})
def parse(self, response):
category = response.meta["category"]
current_page = response.meta.get("page", 1)
next_page = current_page + 1
#Printing the timestamp when fetching the data, using default timezone from the requesting machine
ts = time.time()
timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')
#Defining the items
item = KrcItem()
data = json.loads(response.text)
for company in data.get('products', []):
item["productid"] = company["id"]
item["category"] = category
item["name"] = company["name"]
item["description"] = company["description"]
item["price"] = company["priceFormatted"].replace("\u20ac","").strip()
item["timestamp"] = timestamp
yield item
#Checking whether "isTruncated" is true (boolean), if so, next page will be triggered
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/20035386?page={page}&size=8&isocode=nl-NL".format(page=next_page),
callback=self.parse,
meta={'page': next_page, "category": category},
)
./krc/resources/keywords.csv
keyword,keywordtype
20035386,Hogedrukreiniger
20035424,Window Vacs
Current Output
When I run the spider it fetches the data from all the page's of the product with keywordsID 20035386. From all the other products with a different keywordsID, only the data from the first page will be fetched.
Use response.meta for this:
def start_requests(self):
"""Read keywords from keywords file amd construct the search URL"""
with open(os.path.join(os.path.dirname(__file__), "../resources/keywords.csv")) as search_keywords:
for keyword in csv.DictReader(search_keywords):
product_id = keyword["keyword"]
category = keyword["keywordtype"]
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(product_id)
# The meta is used to send our search text into the parser as metadata
yield scrapy.Request(url, callback = self.parse, meta = {"category": category, "product_id": product_id})
def parse(self, response):
category = response.meta["category"]
product_id = response.meta["product_id"]
current_page = response.meta.get("page", 1)
next_page = current_page + 1
#Printing the timestamp when fetching the data, using default timezone from the requesting machine
ts = time.time()
timestamp = datetime.datetime.fromtimestamp(ts).strftime('%d-%m-%Y %H:%M:%S')
#Defining the items
item = KrcItem()
data = json.loads(response.text)
for company in data.get('products', []):
item["productid"] = company["id"]
item["category"] = category
item["name"] = company["name"]
item["description"] = company["description"]
item["price"] = company["priceFormatted"].replace("\u20ac","").strip()
item["timestamp"] = timestamp
yield item
#Checking whether "isTruncated" is true (boolean), if so, next page will be triggered
if data["isTruncated"]:
yield scrapy.Request(
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{product_id}?page={page}&size=8&isocode=nl-NL".format(page=next_page, product_id=product_id),
callback=self.parse,
meta={'page': next_page, "category": category, "product_id": product_id},
)
I believe you need a nested for when your search_text change.
for [first iterating variable] in [outer loop]: # Outer loop
[do something] # Optional
for [second iterating variable] in [nested loop]: # Nested loop
[do something]
Checks this out, it might help you.
For Loops
I think adding the keyword to the url would be the following. It may or may not need + signs before and after search_text, my knowledge is limited.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/"search_text"?page={page}&size=8&isocode=nl-NL".format(page=next_page),
though I'm not really following what this line is doing, at least the format(search_text) portion of it.
url="https://www.kaercher.com/api/v1/products/search/shoppableproducts/partial/{0}?page=1&size=8&isocode=nl-NL".format(search_text)

Is there a way to get ID of the starting URL from database in scrapy with some function, make_requests_from_url

I am pulling start URL's from Database and also need ID's associated with the URL so that I can pass it in the ITEMS pipeline and store in the table along with items.
I am using "make_requests_from_url(row[1])" to pass the start URL's "start_urls = []" which forms the list of starting URL's. The id's row[0] is what I need to pass to Items when the respective items are crawled.
Below is my spider code:
import scrapy
import mysql.connector
from ..items import AmzProductinfoItem
class AmzProductinfoSpiderSpider(scrapy.Spider):
name = 'amz_ProductInfo_Spider'
nextPageNumber = 2
allowed_domains = ['amazon.in']
start_urls = []
url_fid = []
def __init__(self):
self.connection = mysql.connector.connect(host='localhost', database='datacollecter', user='root', password='', charset="utf8", use_unicode=True)
self.cursor = self.connection.cursor()
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield self.make_requests_from_url(row[1])
I have tried with comparing "response.url" in parse method but that changes as spider moves on to next page.
Not sure how can I achieve this. any direction is appreciated.
It's not clear why do you need self.make_requests_from_url. You can yield your requests directly:
def start_requests(self):
sql_get_StartUrl = 'SELECT * FROM database.table'
self.cursor.execute(sql_get_StartUrl)
rows = self.cursor.fetchall()
for row in rows:
yield scrapy.Request(url=row[1], meta={'url_id': row[0]}, callback=self.parse)
def parse(self, response):
url_id = response.meta["url_id"]