Getting missing url scheme when downloading csv files from a webpage - scrapy

I am fairly new to scraping and have been trying to directly download .csv files from a website. I managed to fix my last issue with the edit; however, I get a new error when trying to download the files. The error is:
raise ValueError(f'Missing scheme in request url: {self._url}')
ValueError: Missing scheme in request url: h
I am not sure what is triggering this error because the links follow properly to the next function.
For example, here is what I have tried:
import scrapy
from nhs.items import DownfilesItem


class NhsScapeSpider(scrapy.Spider):
    name = 'nhs_scape'
    #allowed_domains = ['nh']
    start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        side_panel = response.xpath("//aside[@class='subnav group minimal_nav desktop-only']//ul[@class='children']//li")
        for years in side_panel:
            year_links = years.xpath('.//a/@href').get()
            yield response.follow(year_links, callback=self.download_files)

    def download_files(self, response):
        test_files = response.xpath("//article[@class='rich-text']//p")
        month_files = response.xpath("//article[@class='rich-text']//h3")
        for files, mn in zip(test_files, month_files):
            all_files = files.xpath('.//a//@href').getall()
            all_file_names = files.xpath('.//a//text()').getall()
            month_year = mn.xpath('.//text()').get()
            for ind_files, ind_text in zip(all_files, all_file_names):
                item = DownfilesItem()
                if '.xls' in ind_files and 'Monthly' in ind_text:
                    item['file_urls'] = ind_files
                    item['original_file_name'] = ind_text
                    yield item
                elif '.xls' in ind_files and 'Week' in ind_text:
                    item['file_urls'] = ind_files
                    item['original_file_name'] = ind_text
                    yield item
Items.py:
import scrapy


class DownfilesItem(scrapy.Item):
    # define the fields for your item here like:
    file_urls = scrapy.Field()
    original_file_name = scrapy.Field()
Pipelines.py:
from scrapy.pipelines.files import FilesPipeline


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[1]
        return file_name
Settings.py:
ITEM_PIPELINES = {'nhs.pipelines.DownfilesPipeline': 150}
FILES_STORE = "Files"
Updated error after @supersuers' answer:
IsADirectoryError: [Errno 21] Is a directory: 'Files/'
It seems this is caused by FILES_STORE = "Files"; however, when I remove this I do not get an error, but no files are downloaded either.

item['file_urls'] should be a list. FilesPipeline iterates over file_urls, so a plain string is iterated character by character, which is why the error reports only the "h" of the URL:
if '.xls' in ind_files and 'Monthly' in ind_text:
    item['file_urls'] = [ind_files]
    item['original_file_name'] = ind_text
    yield item
elif '.xls' in ind_files and 'Week' in ind_text:
    item['file_urls'] = [ind_files]
    item['original_file_name'] = ind_text
    yield item
Edit:
The second error comes from the pipeline: file_name is an empty string, because splitting the URL on "/" makes index 1 the empty segment right after "https:". You can change it, for example, to:
file_name: str = request.url.split("/")[-1]
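For reference, a minimal sketch of the corrected pipeline might look like this (keeping the DownfilesPipeline name from the question; the item keyword-only argument is only passed by newer Scrapy versions and is optional here):

from scrapy.pipelines.files import FilesPipeline


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # split("/")[1] is always '' because the URL starts with "https://",
        # so use the last segment, which is the actual file name
        return request.url.split("/")[-1]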
Edit 2:
I think the problem is in the xpath selectors; try this and tweak it to your needs:
import scrapy
from tempbuffer.items import DownfilesItem


class NhsScapeSpider(scrapy.Spider):
    name = 'nhs_scape'
    #allowed_domains = ['nh']
    start_urls = ['https://www.england.nhs.uk/statistics/statistical-work-areas/ae-waiting-times-and-activity/ae-attendances-and-emergency-admissions-2021-22/']
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36',
    }

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse
            )

    def parse(self, response):
        side_panel = response.xpath("//aside[@class='subnav group minimal_nav desktop-only']//ul[@class='children']//li")
        for years in side_panel:
            year_links = years.xpath('.//a/@href').get()
            yield response.follow(year_links, callback=self.download_files)

    def download_files(self, response):
        # test_files = response.xpath("//article[@class='rich-text']//p")
        test_files = response.xpath("//article[@class='rich-text']//p[a[contains(@href, '.xls')]]")
        # month_files = response.xpath("//article[@class='rich-text']//h3")
        # couldn't make a prettier xpath selector
        month_files = response.xpath("//article[@class='rich-text']//h3[starts-with(text(), 'January') or starts-with(text(), 'February') or starts-with(text(), 'March') or starts-with(text(), 'April') or starts-with(text(), 'May') or starts-with(text(), 'June') or starts-with(text(), 'July') or starts-with(text(), 'August') or starts-with(text(), 'September') or starts-with(text(), 'October') or starts-with(text(), 'November') or starts-with(text(), 'December')]")
        for files, mn in zip(test_files, month_files):
            all_files = files.xpath('.//a//@href').getall()
            all_file_names = files.xpath('.//a//text()').getall()
            month_year = mn.xpath('.//text()').get()
            for ind_files, ind_text in zip(all_files, all_file_names):
                item = DownfilesItem()
                if '.xls' in ind_files and 'Monthly' in ind_text:
                    item['file_urls'] = [ind_files]
                    item['original_file_name'] = ind_text
                    yield item
                elif '.xls' in ind_files and 'Week' in ind_text:
                    item['file_urls'] = [ind_files]
                    item['original_file_name'] = ind_text
                    yield item

Related

Spider grabs 1 item from each page and not all items

My spider seems to only grab about one job listing from each webpage. When I remove parse_jobs and load_item() in parse, I can extract all the job listings for each page. So the issue likely arises when it goes to parse_jobs and loads items; however, I cannot seem to figure out the cause.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest

headers = {
    'authority': 'api2.branch.io',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'content-type': 'application/x-www-form-urlencoded',
    'accept': '*/*',
    'origin': 'https://www.reed.co.uk',
    'sec-fetch-site': 'cross-site',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    'referer': 'https://www.reed.co.uk/',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}


class ReedItem(scrapy.Item):
    category = Field(output_processor=TakeFirst())
    salary = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    organisation = Field(output_processor=TakeFirst())
    region = Field(input_processor=MapCompose(str.strip),
                   output_processor=TakeFirst())
    items = Field(output_processor=TakeFirst())
    post = Field(input_processor=MapCompose(),
                 output_processor=Join(" "))
    page_no = Field(output_processor=TakeFirst())


class ReedSpider(scrapy.Spider):
    name = 'reed'
    degrees = {'upper': ['Finance', 'Accounting', 'Aeronautical-Engineering', 'Manufacturing-Engineering'],
               'degrees_entry': ['degree-finance-entry', 'degree-accounting-entry', 'degree-aeronautical-engineering-entry', 'degree-manufacturing-engineering-entry'],
               'degrees_graduate': ['degree-finance-graduate', 'degree-accounting-graduate', 'degree-aeronautical-engineering-graduate', 'degree-manufacturing-engineering-graduate'],
               'degrees': ['degree-finance', 'degree-accounting', 'degree-aeronautical-engineering', 'degree-manufacturing-engineering'],
               'graduate_entry': ['graduate-finance-entry', 'graduate-accounting-entry', 'graduate-aeronautical-engineering-entry', 'graduate-manufacturing-engineering-entry'],
               'graduate': ['graduate-finance', 'graduate-accounting', 'graduate-aeronautical-engineering', 'graduate-manufacturing-engineering'],
               'sector': ['Accountancy_Finance', 'Accountancy_Finance', 'Engineering_Manufacturing', 'Engineering_Manufacturing'],
               'degree_type': ['Accountancy_finance', 'Accountancy_finance', 'Aeronautical_Engineering', 'Manufacturing_Engineering']}
    degree = pd.DataFrame(degrees)
    start_urls = defaultdict(list)
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
        'DOWNLOAD_DELAY': 0.5,
        #'LOG_LEVEL': 'INFO',
    }

    def start_requests(self):
        for degrees, degrees_entry, degrees_graduate, graduate_entry, graduate, sector in zip(self.degree.degrees, self.degree.degrees_entry, self.degree.degrees_graduate, self.degree.graduate_entry, self.degree.graduate, self.degree.sector):
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate_entry}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_graduate}-jobs')
            self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate}-jobs')
        for items, urls in self.start_urls.items():
            for url in urls:
                yield scrapy.Request(
                    url=url,
                    headers=headers,
                    callback=self.parse,
                    cb_kwargs={
                        'items': items,
                        'page_no': 0
                    }
                )

    def parse(self, response, items, page_no):
        container = response.xpath("//div[@class='row search-results']")
        for lists in container:
            page_no += 1
            loader = ItemLoader(ReedItem(), selector=lists)
            loader.add_value('items', items)
            loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
            loader.add_value('page_no', page_no)
            loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
            loader.add_xpath('title', './/h3[@class="title"]/a/@title')
            loader.add_xpath('salary', './/li[@class="salary"]/text()')
            loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
            links = response.xpath('.//h3[@class="title"]/a/@href').get()
            yield response.follow(
                response.urljoin(links),
                callback=self.parse_jobs,
                cb_kwargs={
                    'loader': loader
                })
        next_page = response.xpath('//a[@id="nextPage"]/@href').get()
        if next_page:
            yield response.follow(
                url=next_page,
                callback=self.parse,
                headers=headers,
                cb_kwargs={
                    'items': items,
                    'page_no': page_no
                })

    def parse_jobs(self, response, loader):
        loader.add_value('post', response.xpath('(//span[@itemprop="description"]/p/text()) | (//span[@itemprop="description"]/p//text()) | (//span[@itemprop="description"]/ul//li/text())').getall())
        yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'reed_jobs_post.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(ReedSpider)
process.start()
I commented out the lines I changed.
def parse(self, response, items, page_no):
    # container = response.xpath("//div[@class='row search-results']")
    container = response.xpath("//div[@class='row search-results']//article")
    page_no += 1
    for lists in container:
        # page_no += 1
        loader = ItemLoader(ReedItem(), selector=lists)
        loader.add_value('items', items)
        loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
        loader.add_value('page_no', page_no)
        loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
        loader.add_xpath('title', './/h3[@class="title"]/a/@title')
        loader.add_xpath('salary', './/li[@class="salary"]/text()')
        loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
        # links = response.xpath('.//h3[@class="title"]/a/@href').get()
        links = lists.xpath('.//h3[@class="title"]/a/@href').get()
        yield response.follow(
            response.urljoin(links),
            callback=self.parse_jobs,
            cb_kwargs={
                'loader': loader
            }
        )
    next_page = response.xpath('//a[@id="nextPage"]/@href').get()
    if next_page:
        yield response.follow(
            url=next_page,
            callback=self.parse,
            headers=headers,
            cb_kwargs={
                'items': items,
                'page_no': page_no
            })

Remove duplicates based on a unique ID

I want to remove duplicate data based on unique ids. Each listing from the site has a unique id, so I want to filter the data to remove any duplicates.
After looking at the documentation for scrapy: Filter duplicates
I have tried to implement a similar pipeline to remove the duplicates; however, I am unsure how to get it to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

headers = {
    'authority': 'www.theparking.eu',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'accept': '*/*',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'origin': 'https://www.theparking.eu',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    #'referer': 'https://www.theparking.eu/used-cars/used-cars/',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}


class DuplicatesPipeline:
    def __init__(self):
        # self.ids_seen = set()
        self.titles_seen = set()

    def process_item(self, unique_id, spider):
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        else:
            self.titles_seen.add(unique_id)
            return unique_id


class Countryitem(scrapy.Item):
    make = Field(output_processor=TakeFirst())
    unique_id = Field(output_processor=TakeFirst())
    page_number = Field(output_processor=TakeFirst())


class CountrySpider(scrapy.Spider):
    name = "country"
    test_dict = {'country_id': [4, 5, 109, 7, 6, 8, 87],
                 'country': ['australia', 'austria', 'bahamas', 'belarus', 'belgium', 'bosnia and herzegovina', 'brasil'],
                 'make': [20, 13, 131, 113, 32, 62, 104],
                 'model': [1108, 4655, 687, 492, 499, 702, 6143],
                 'engine': [5, 11, 10, 7, 14, 21, 170]}

    #for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):
    def start_requests(self):
        for id_ in zip(self.test_dict['country_id']):
            for id_marque in self.test_dict['make']:
                for models in self.test_dict['model']:
                    for engine in self.test_dict['engine']:
                        for page in range(1, 10000):
                            yield scrapy.FormRequest(
                                url=f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
                                method="POST",
                                callback=self.parse,
                                formdata={
                                    'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page, id_, id_marque, models, engine),
                                    'tabs': '["t0"]'
                                },
                                headers=headers,
                                cb_kwargs={
                                    'page_number': page
                                }
                            )

    def parse(self, response, page_number):
        container = json.loads(response.text)
        test = container['#lists']
        soup = BeautifulSoup(test, 'lxml')
        for i in soup:
            carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
            carUnique = i.select('li[tref]')
            for make, unique in zip_longest(
                carMake, carUnique
            ):
                loader = ItemLoader(Countryitem())
                # loader.add_value('page_number', page_number)
                loader.add_value("unique_id", unique['tref'])
                loader.add_value("page_number", page_number)
                if make != None:
                    loader.add_value('make', make.text)
                else:
                    loader.add_value('make', "None")
                yield loader.load_item()


process = CrawlerProcess(
    settings={
        'FEED_URI': 'park.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(CountrySpider)
process.start()
class DuplicatesPipeline:
    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['unique_id'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item['unique_id'])
        else:
            self.titles_seen.add(item['unique_id'])
            return item
Also add to custom_settings:
custom_settings = {
    'ITEM_PIPELINES': {
        'myproject.path_to_your_file.DuplicatesPipeline': 300
    }
}

How to parse two different items in scrapy?

I am using scrapy 2.1 for parsing a category result page.
There are two different things I want to scrape from that site:
1. Category information, e.g. Title and URL
2. Product items within that category page
Number 2 works, but I am struggling with how to store the category info. My first attempt is to create another item class, CatItem:
class CatItem(scrapy.Item):
    title = scrapy.Field() # char -
    url = scrapy.Field() # char -
    level = scrapy.Field() # int -


class ProductItem(scrapy.Item):
    title = scrapy.Field() # char -
Let's parse the page:
def parse_item(self, response):
    # save category info
    category = CatItem()
    category['url'] = response.url
    category['title'] = response.url
    category['level'] = 1
    yield category

    # now let's parse all products within that category
    for selector in response.xpath("//article//ul/div[@data-qa-id='result-list-entry']"):
        product = ProductItem()
        product['title'] = selector.xpath(".//a/h2/text()").extract_first()
        yield product
My Pipeline:
class mysql_pipeline(object):
    def __init__(self):
        self.create_connection()

    def create_connection(self):
        settings = get_project_settings()

    def process_item(self, item, spider):
        self.store_db(item, spider)
        return item
Now here I don't know how to proceed. There is only one "item" within the process_item definition.
How can I pass the category information to the store_db method as well?
You can check item type in your pipeline:
from your_project.items import CatItem, ProductItem


class YourPipeline(object):
    ...

    def process_item(self, item, spider):
        if isinstance(item, CatItem):
            save_category(item)
        return item
UPDATE: Simple PoC code:
import scrapy
import csv
from scrapy.crawler import CrawlerProcess


class BooksPipeline(object):
    def process_item(self, item, spider):
        filename = None
        if isinstance(item, CategoryItem):
            filename = 'Categories.csv'
        elif isinstance(item, BookItem):
            filename = 'Books.csv'
        with open(filename, 'a', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Title', 'URL'], lineterminator="\n")
            writer.writerow(item)
        return item


class BooksSpider(scrapy.Spider):
    name = "books"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        for book_url in response.xpath('//ol/li//h3/a/@href').getall():
            yield scrapy.Request(
                url=response.urljoin(book_url),
                callback=self.parse_book,
            )

    def parse_book(self, response):
        category = CategoryItem()
        category['Title'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/text()').get()
        category['URL'] = response.xpath('//ul[@class="breadcrumb"]/li[last() - 1]/a/@href').get()
        yield category

        book = BookItem()
        book['Title'] = response.xpath('//h1/text()').get()
        book['URL'] = response.url
        yield book


class BookItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()


class CategoryItem(scrapy.Item):
    Title = scrapy.Field()
    URL = scrapy.Field()


if __name__ == "__main__":
    process = CrawlerProcess(
        {
            'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
            'DOWNLOAD_TIMEOUT': 100,
            'ITEM_PIPELINES': {
                '__main__.BooksPipeline': 300,
            }
        }
    )
    process.crawl(BooksSpider)
    process.start()

ItemLoader doesn't pass the loader context to input processors

my spider: autospd.py
class AutospdSpider(scrapy.Spider):
    name = 'autospd'
    start_urls = ['http://news.dayoo.com/guangzhou/150960_2.shtml']
    dt_ft = "%Y-%m-%d %H:%M"

    def parse(self, response):
        list_objs = response.css("div.dy-list>div")
        for li in list_objs:
            loader = AutopjtItemLoader(item=AutopjtItem(), selector=li, context=self.dt_ft)
            print(loader.context.items())  # please see print-1
            loader.nested_css("h2>a").add_css("title", "::text")
            loader.nested_css("h2>a").add_css("url", "::attr(href)")
            loader.nested_css("div.txt-area>div.news-time").add_xpath("pub_time", "string()")
            yield loader.load_item()
print-1: dict_items([('context', '%Y-%m-%d %H:%M'), ('selector', <Selector ...>), ('response', None), ('item', {})])
items.py
def func(value, loader_context):
    print(loader_context.items())  # please see print-2
    # ft = loader_context.get("context")
    # time_dt = datetime.strptime(value, ft)
    return value


class AutopjtItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    pub_time_in = MapCompose(func)


class AutopjtItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    pub_time = scrapy.Field()
print-2: [('selector', [<Selector ... '2019-06-12 08:59...'>]), ('response', None), ('item', {})]
Why is "context" not in loader_context?
def nested_xpath(self, xpath, **context):
    selector = self.selector.xpath(xpath)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader

def nested_css(self, css, **context):
    selector = self.selector.css(css)
    context.update(selector=selector)
    subloader = self.__class__(
        item=self.item, parent=self, **context
    )
    return subloader
From Scrapy's source code above: if you use nested_css or nested_xpath, you must pass your context explicitly, e.g.:
loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")
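As a minimal sketch of how the pieces fit together (reusing the AutopjtItemLoader, AutopjtItem and dt_ft names from the question; whatever keyword you pass to nested_css() is what shows up in loader_context of the input processor):

from datetime import datetime

from itemloaders.processors import MapCompose, TakeFirst
from scrapy.loader import ItemLoader


def func(value, loader_context):
    # "dt_ft" is the keyword passed to nested_css() below;
    # value may need extra cleanup depending on the page markup
    ft = loader_context.get("dt_ft")
    if ft:
        return datetime.strptime(value.strip(), ft)
    return value


class AutopjtItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    pub_time_in = MapCompose(func)


# inside the spider's parse():
#     loader = AutopjtItemLoader(item=AutopjtItem(), selector=li)
#     loader.nested_css("div.txt-area>div.news-time", dt_ft=self.dt_ft).add_xpath("pub_time", "string()")
#     yield loader.load_item()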

Getting repeated requests from same url with different values

I am trying to crawl some data as a side project but I am having trouble gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get a wrong token.
Second problem:
I have read the Scrapy docs and tried to implement a request to get the phone number, but in vain,
as well as this answer on
stackoverflow
Third problem:
How would I go about implementing the next page (commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate it if anyone could give me a direction.
My main goal is to crawl posts that have phone numbers.
I have tried to search stackoverflow but I couldn't find a suitable post.
Many Thanks
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector


def complete_link(string):
    return string


class MyItem(Item):
    token = Field()
    post_id = Field()
    post_url = Field()
    phone_num = Field()
    phone_url = Field()


class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            request = Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
            yield request
        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            item['post_id'] = post_id.group(1)
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN': item['token']}, callback=self.parse_phone, meta={'item': item})
            yield request

    def parse_phone(self, response):
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'


class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = CrawlerItem()
        pid = []
        arr_url = []
        for href in response.css('a.listing-link::attr(href)').extract():
            if len(href) > 0:
                post_id = u''.join(href).encode('utf-8').strip()
                post_id = re.match('.*?([0-9]+)$', post_id)
                if post_id:
                    pid.append(post_id.group(1))
                domain = gumtree + href
                arr_url.append(domain)
        i = 0
        while i < len(arr_url):
            url = u''.join(arr_url[i]).encode('utf-8').strip()
            request = Request(url, callback=self.parse_post, meta={'url': url, 'item': item, 'pid': pid[i]}, headers={'Referer': gumtree})
            i += 1
            yield request
        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            ref = item['post_url']
            req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref}, meta={'url': response.meta['url'], 'item': item})
            return req

    def parse_phone(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
        return item