My spider seems to grab only about one job listing from each webpage. When I remove parse_jobs and call load_item() in parse instead, I can extract all the job listings for each page. So the issue is likely in the hand-off to parse_jobs, where the items are loaded; however, I cannot figure out what is wrong.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest
headers = {
'authority': '',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'content-type': 'application/x-www-form-urlencoded',
'accept': '*/*',
'origin': '',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': '',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
class ReedItem(scrapy.Item):
category = Field(output_processor = TakeFirst())
salary = Field(output_processor = TakeFirst())
title = Field(output_processor = TakeFirst())
organisation = Field(output_processor = TakeFirst())
region = Field(input_processor = MapCompose(str.strip),
output_processor = TakeFirst())
items = Field(output_processor = TakeFirst())
post = Field(input_processor = MapCompose(),
output_processor = Join(" "))
page_no = Field(output_processor = TakeFirst())
class ReedSpider(scrapy.Spider):
name = 'reed'
degrees={'upper': ['Finance','Accounting','Aeronautical-Engineering','Manufacturing-Engineering'],
'degrees_entry': ['degree-finance-entry','degree-accounting-entry','degree-aeronautical-engineering-entry','degree-manufacturing-engineering-entry'],
'degrees_graduate': ['degree-finance-graduate','degree-accounting-graduate','degree-aeronautical-engineering-graduate','degree-manufacturing-engineering-graduate'],
'degrees': ['degree-finance','degree-accounting','degree-aeronautical-engineering','degree-manufacturing-engineering'],
'graduate_entry': ['graduate-finance-entry','graduate-accounting-entry','graduate-aeronautical-engineering-entry','graduate-manufacturing-engineering-entry'],
'graduate': ['graduate-finance','graduate-accounting','graduate-aeronautical-engineering','graduate-manufacturing-engineering'],
'sector': ['Accountancy_Finance','Accountancy_Finance','Engineering_Manufacturing','Engineering_Manufacturing'],
'degree_type': ['Accountancy_finance','Accountancy_finance','Aeronautical_Engineering','Manufacturing_Engineering']}
degree = pd.DataFrame(degrees)
start_urls = defaultdict(list)
custom_settings = {
'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
def start_requests(self):
for degrees, degrees_entry,degrees_graduate, graduate_entry, graduate,sector in zip(,,,,,
for items, urls in self.start_urls.items():
for url in urls:
yield scrapy.Request(
url = url,
callback = self.parse,
cb_kwargs = {
def parse(self, response, items, page_no):
container = response.xpath("//div[#class='row search-results']")
for lists in container:
page_no += 1
loader = ItemLoader(ReedItem(), selector = lists)
loader.add_value('items', items)
loader.add_xpath('region', ".//div[#class='metadata']//ul//li[#class='location']//text()")
loader.add_value('page_no', page_no)
loader.add_xpath('category', "//div[#class='col-sm-11 col-xs-12 page-title']//h1/text()")
loader.add_xpath('title', './/h3[#class="title"]/a/#title')
loader.add_xpath('salary', './/li[#class="salary"]/text()')
loader.add_xpath('organisation', './/a[#class="gtmJobListingPostedBy"]/text()')
links = response.xpath('.//h3[#class="title"]/a/#href').get()
yield response.follow(
callback = self.parse_jobs,
cb_kwargs = {
next_page = response.xpath('//a[#id="nextPage"]/#href').get()
if next_page:
yield response.follow(
url = next_page,
callback = self.parse,
cb_kwargs = {
def parse_jobs(self, response, loader):
loader.add_value('post',response.xpath('(//span[#itemprop="description"]/p/text()) | (//span[#itemprop="description"]/p//text()) | (//span[#itemprop="description"]/ul//li/text())').getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
I commented out the lines I changed.
def parse(self, response, items, page_no):
# container = response.xpath("//div[#class='row search-results']")
container = response.xpath("//div[#class='row search-results']//article")
page_no += 1
for lists in container:
# page_no += 1
loader = ItemLoader(ReedItem(), selector=lists)
loader.add_value('items', items)
loader.add_xpath('region', ".//div[#class='metadata']//ul//li[#class='location']//text()")
loader.add_value('page_no', page_no)
loader.add_xpath('category', "//div[#class='col-sm-11 col-xs-12 page-title']//h1/text()")
loader.add_xpath('title', './/h3[#class="title"]/a/#title')
loader.add_xpath('salary', './/li[#class="salary"]/text()')
loader.add_xpath('organisation', './/a[#class="gtmJobListingPostedBy"]/text()')
# links = response.xpath('.//h3[#class="title"]/a/#href').get()
links = lists.xpath('.//h3[#class="title"]/a/#href').get()
yield response.follow(
'loader': loader
next_page = response.xpath('//a[#id="nextPage"]/#href').get()
if next_page:
yield response.follow(
'items': items,
'page_no': page_no
I hope I am asking this the right way; I'm confused about what's going on. I have a working script (below). I'm trying to take the URLs from a spreadsheet rather than copying and pasting them in: basically, building urlsA from column N of the connected sheet.
I've tested it out: I can print urlsA to the terminal without any problem, so I know the Sheet connection is working. I just can't seem to use the URLs when I run the full script. I'm receiving a "No connection adapters were found" error.
Working code (before pulling links from Google Sheet):
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
profilesA = []
urlsA = ['',
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in'.smtext > a[href^="/contests/"]'):
profileA = ''+profileA.get('href')
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in'a[href^="/game/play_by_play/"]'):
profileB = ''+profileB.get('href')
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36"
urls = profilesB
s = requests.Session()
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
Broken code: "No connection adapters were found" error:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
import csv
from unittest import skip
import json
import gspread
gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1cEQlPB_ykJrucnbGgKhlKj49RdLNAzeO6fiO2gkQeNU')
wk = sh.worksheet("Team Select")
profilesA = []
ShUrls = wk.batch_get(('N3:N',))[0]
urlsA = ShUrls
for urlA in urlsA:
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileA in'.smtext > a[href^="/contests/"]'):
profileA = ''+profileA.get('href')
profilesB = []
urlsB = profilesA
for urlB in urlsB:
req = requests.get(urlB, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(req.text, 'html.parser')
for profileB in'a[href^="/game/play_by_play/"]'):
profileB = ''+profileB.get('href')
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36"
urls = profilesB
s = requests.Session()
for url in urls:
gameId = url.split('/')[-1]
r = s.get(url)
dfs = pd.read_html(r.text)
for df in dfs:
if len(df.columns) > 2:
if df.iloc[0, 2] == 'Score':
df[4] = df[3]
df[[2, 3]] = df[2].str.split('-', expand=True)
df.to_csv('2022test.csv', mode='a', index=False)
I'd inspect this line:
ShUrls = wk.batch_get(('N3:N',))[0]
You might be pulling a list of lists; if so, this line breaks
req = requests.get(urlA, headers={'User-Agent': 'Mozilla/5.0'})
with the No connection adapters were found error as a list is not a valid URL.
I needed to flatten urlsA after seeing that it was a list of lists. Defining the helper below and then calling flatten(urlsA) fixed the issue:
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable of iterables, e.g. rows of cells such as
            ``[["url1"], ["url2"]]``.

    Returns:
        A new list containing every element of every inner iterable, in
        order. An empty input yields an empty list.
    """
    # Each inner item is collected; without this the result would stay empty.
    return [item for sublist in l for item in sublist]
I am fairly new to scraping and have been trying to download .csv files directly from a website. I managed to fix my previous issue with the edit; however, I now get a new error when trying to download the files. The error is as follows:
raise ValueError(f'Missing scheme in request url: {self._url}')
ValueError: Missing scheme in request url: h
I am not sure what is triggering this error because the links follow properly to the next function.
For example, here is what I have tried:
import scrapy
from nhs.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
name = 'nhs_scape'
#allowed_domains = ['nh']
start_urls = ['']
custom_settings = {
'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36'
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url = url,
callback = self.parse
def parse(self, response):
side_panel = response.xpath("//aside[#class='subnav group minimal_nav desktop-only']//ul[#class='children']//li")
for years in side_panel:
year_links = years.xpath('.//a/#href').get()
yield response.follow(year_links, callback = self.download_files)
def download_files(self, response):
    """Yield a ``DownfilesItem`` for each monthly or weekly ``.xls`` link.

    Paragraphs and ``<h3>`` headings inside the ``rich-text`` article are
    paired by position; ``zip`` stops at the shorter of the two lists.

    Args:
        response: The response for one of the year pages followed from
            ``parse``.

    Yields:
        DownfilesItem: ``file_urls`` is a one-element list holding the
        absolute file URL and ``original_file_name`` is the link text.
    """
    # XPath attribute tests use "@"; "#class" / "#href" is not valid XPath.
    test_files = response.xpath("//article[@class='rich-text']//p")
    month_files = response.xpath("//article[@class='rich-text']//h3")
    for files, _heading in zip(test_files, month_files):
        # Walk the anchors themselves so each href stays paired with its
        # own text, even when an anchor holds several text nodes.
        for link in files.xpath('.//a[@href]'):
            href = link.xpath('@href').get()
            text = ''.join(link.xpath('.//text()').getall())
            if '.xls' not in href:
                continue
            if 'Monthly' in text or 'Week' in text:
                item = DownfilesItem()
                # FilesPipeline iterates file_urls: a bare string would be
                # walked character by character ("Missing scheme ... h").
                # urljoin also makes relative hrefs absolute.
                item['file_urls'] = [response.urljoin(href)]
                item['original_file_name'] = text
                yield item
import scrapy
class DownfilesItem(scrapy.Item):
# define the fields for your item here like:
file_urls = scrapy.Field()
original_file_name = scrapy.Field()
from scrapy.pipelines.files import FilesPipeline
class DownfilesPipeline(FilesPipeline):
    """Files pipeline that names each download after its URL's last segment."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for a download.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Unused.
            info: Unused.
            item: Unused; accepted so callers that pass the item by keyword
                still work.

        Returns:
            The text after the final ``/`` of the URL.
        """
        # For "scheme://host/dir/file.xls", split("/")[1] is the empty
        # string between the two slashes, so the file would be written to
        # the store directory itself. Take the last path segment instead.
        file_name: str = request.url.rstrip("/").split("/")[-1]
        return file_name
ITEM_PIPELINES = {'nhs.pipelines.DownfilesPipeline': 150}
Updated error after @supersuers' answer:
IsADirectoryError: [Errno 21] Is a directory: 'Files/'
It seems this is caused by FILES_STORE = "Files"; however, when I remove this setting I no longer get an error, but no files are downloaded either.
item['file_urls'] should be a list:
if '.xls' in ind_files and 'Monthly' in ind_text:
item['file_urls'] = [ind_files]
item['original_file_name'] = ind_text
yield item
elif '.xls' in ind_files and 'Week' in ind_text:
item['file_urls'] = [ind_files]
item['original_file_name'] = ind_text
yield item
The second error is caused by the pipeline: file_name is an empty string. You can change it, for example, to:
file_name: str = request.url.split("/")[-1]
Edit 2:
I think that the problem is in the xpath selectors, try this and tweak it to your needs:
import scrapy
from tempbuffer.items import DownfilesItem
class NhsScapeSpider(scrapy.Spider):
name = 'nhs_scape'
#allowed_domains = ['nh']
start_urls = ['']
custom_settings = {
'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/ Safari/537.36',
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
def parse(self, response):
side_panel = response.xpath("//aside[#class='subnav group minimal_nav desktop-only']//ul[#class='children']//li")
for years in side_panel:
year_links = years.xpath('.//a/#href').get()
yield response.follow(year_links, callback=self.download_files)
def download_files(self, response):
    """Yield a ``DownfilesItem`` for each monthly or weekly ``.xls`` link.

    Only paragraphs that directly contain an ``.xls`` link, and only
    ``<h3>`` headings whose text starts with a month name, are selected.
    The two lists are paired by position; ``zip`` stops at the shorter one.

    Args:
        response: The response for one of the year pages followed from
            ``parse``.

    Yields:
        DownfilesItem: ``file_urls`` is a one-element list holding the
        absolute file URL and ``original_file_name`` is the link text.
    """
    months = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )
    # Build the "starts-with(...) or starts-with(...)" chain from the month
    # names instead of spelling out all twelve clauses by hand.
    month_predicate = ' or '.join(
        f"starts-with(text(), '{month}')" for month in months
    )
    # XPath attribute tests use "@"; "#class" / "#href" is not valid XPath.
    test_files = response.xpath(
        "//article[@class='rich-text']//p[a[contains(@href, '.xls')]]"
    )
    month_files = response.xpath(
        f"//article[@class='rich-text']//h3[{month_predicate}]"
    )
    for files, _heading in zip(test_files, month_files):
        # Walk the anchors themselves so each href stays paired with its
        # own text, even when an anchor holds several text nodes.
        for link in files.xpath('.//a[@href]'):
            href = link.xpath('@href').get()
            text = ''.join(link.xpath('.//text()').getall())
            if '.xls' not in href:
                continue
            if 'Monthly' in text or 'Week' in text:
                item = DownfilesItem()
                # file_urls must be a list of absolute URLs.
                item['file_urls'] = [response.urljoin(href)]
                item['original_file_name'] = text
                yield item
I want to remove duplicate data based on unique id's. Each listing from the site has a unique id and so I want to filter the data to remove any duplicates.
After looking at the documentation for scrapy: Filter duplicates
I have tried to implement a similar pipeline to remove the duplicates; however, I am not sure how to get it to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
headers = {
'authority': '',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': '*/*',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'x-requested-with': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'origin': '',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
#'referer': '',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
class DuplicatesPipeline:
def __init__(self):
# self.ids_seen = set()
self.titles_seen = set()
def process_item(self, unique_id, spider):
if unique_id in self.titles_seen:
raise DropItem("Duplicate item title found: %s" % unique_id)
return unique_id
class Countryitem(scrapy.Item):
make = Field(output_processor = TakeFirst())
unique_id = Field(output_processor = TakeFirst())
page_number = Field(output_processor = TakeFirst())
class CountrySpider(scrapy.Spider):
name = "country"
test_dict={'country_id': [4,5,109,7,6,8,87],
'country': ['australia','austria','bahamas','belarus','belgium','bosnia and herzegovina','brasil'],
'make': [20, 13, 131, 113, 32, 62, 104],
'model': [1108, 4655, 687, 492, 499, 702, 6143],
'engine': [5, 11, 10, 7, 14, 21, 170]}
#for links, pages, id, country in zip(url_data.links, url_data.pages,,
def start_requests(self):
for id_ in zip(self.test_dict['country_id']):
for id_marque in self.test_dict['make']:
for models in self.test_dict['model']:
for engine in self.test_dict['engine']:
for page in range(1, 10000):
yield scrapy.FormRequest(
url = f'!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
callback = self.parse,
formdata = {
'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page,id_, id_marque, models, engine),
'tabs': '["t0"]'
cb_kwargs = {
def parse(self, response,page_number):
container = json.loads(response.text)
soup = BeautifulSoup(test, 'lxml')
for i in soup:
carMake ="a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
carUnique ='li[tref]')
for make, unique in zip_longest(
carMake, carUnique
loader = ItemLoader(Countryitem())
# loader.add_value('page_number', page_number)
loader.add_value("unique_id", unique['tref'])
if make != None:
loader.add_value('make', make.text)
loader.add_value('make', "None")
yield loader.load_item()
process = CrawlerProcess(
settings = {
class DuplicatesPipeline:
    """Item pipeline that drops items whose ``unique_id`` was already seen."""

    def __init__(self):
        # unique_id values of every item that has passed through so far.
        self.titles_seen = set()

    def process_item(self, item, spider):
        """Pass *item* through unless its ``unique_id`` is a repeat.

        Args:
            item: The scraped item; ``item['unique_id']`` is read.
            spider: The spider that produced the item (unused).

        Returns:
            The same item, when its ``unique_id`` has not been seen before.

        Raises:
            DropItem: If an item with the same ``unique_id`` was already
                processed by this pipeline instance.
        """
        unique_id = item['unique_id']
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        # Remember the ID; without this no later item could ever be
        # recognised as a duplicate.
        self.titles_seen.add(unique_id)
        return item
Also add to custom_settings:
custom_settings = {
'myproject.path_to_your_file.DuplicatesPipeline': 300
The output is a CSV file listing businesses with name, address, telephone and coordinates. For some reason, coordinates are generated for only some of the rows. When the addresses that were skipped are geocoded on their own in a separate run, geopy does find their coordinates, so geopy can potentially resolve all of them, but it sometimes skips. I thought it might need more time to call the API, so I added threading, but that didn't solve the issue.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import threading
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="")
main_list = []
def extract(url, timeout=30):
    """Fetch *url* and return the listing ``<div>`` elements found on it.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout the call can block indefinitely.

    Returns:
        The result of ``find_all`` for ``<div>`` elements whose class is
        ``listing__content__wrap--flexed jsGoToMp``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup.find_all('div', class_='listing__content__wrap--flexed jsGoToMp')
def transform(articles):
for item in articles:
name = item.find('a', class_ ='listing__name--link listing__link jsListingName').text
street = item.find('span', {'itemprop':'streetAddress'}).text
street = ''
city = item.find('span', {'itemprop':'addressLocality'}).text
city = ''
province = item.find('span', {'itemprop':'addressRegion'}).text
province = ''
postCode = item.find('span', {'itemprop':'postalCode'}).text
postCode = ''
phone = item.find('li', class_ = 'mlr__submenu__item').text.strip()
phone = ''
def search_geo():
global location
location = geolocator.geocode(street + ' ' + city)
print(street + ' ' + city)
thread = threading.Thread(target=search_geo)
slatitude = location.latitude
slatitude = ''
thread = threading.Thread(target=search_geo)
slongitude = location.longitude
slongitude = ''
business = {
'name': name,
'street': street,
'city': city,
'province': province,
'postCode': postCode,
'phone': phone,
'slongitude': slongitude,
'slatitude': slatitude
def load():
df = pd.DataFrame(main_list)
df.to_csv('repairshopsbc', index=False)
for x in range(1,2):
print(f'Getting page {x}')
articles = extract(f'{x}/car+repair/British+Columbia+BC')
print('Saved to CSV')
I am trying to crawl some data for a side project, but I am having trouble gathering it. I have been trying for two days without much luck.
First problem:
When I crawl a post from the main page, I get the wrong token.
Second problem:
I have read the Scrapy docs on requests and tried to apply them to get the phone number, but in vain.
I also tried following this answer.
Third problem:
How would I go about implementing the next page (see the commented-out code inside parse)?
Fourth problem:
I am now able to get the phone numbers but I am getting repeated requests to the same url with different values, [see results]
I would really appreciate if anyone could give me a direction.
My main goal is to crawl posts that have phone numbers.
I have searched Stack Overflow but couldn't find a relevant post.
Many Thanks
# Scrapy project settings for the "crawler" project.
BOT_NAME = 'crawler'

# Where Scrapy looks for spiders, and where new spiders are created.
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

# Browser-like User-Agent sent with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
return string
class MyItem(Item):
token = Field()
post_id = Field()
post_url = Field()
phone_num = Field()
phone_url = Field()
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = [""]
start_urls = [
def parse(self, response):
    """Schedule a ``parse_post`` request for every listing link on the page.

    Args:
        response: The response for a listing (search results) page.

    Yields:
        Request: One request per ``a.listing-link`` href, carrying the
        absolute post URL under ``meta['domain']`` and a new ``MyItem``
        under ``meta['item']``.
    """
    for href in response.css('a.listing-link::attr(href)').extract():
        # Resolve the href against the page URL so a relative link becomes
        # an absolute, requestable URL.
        domain = response.urljoin(href)
        # A new item per listing: one shared item would be overwritten by
        # whichever callback runs last, so every result would repeat the
        # same post_id and token.
        item = MyItem()
        yield Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
    # Pagination (currently disabled):
    # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
    # if next_page is not None:
    #     next_page = response.urljoin(next_page)
    #     yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_url'] = response.meta['domain']
post_id = re.match('.*?([0-9]+)$', item['post_url'])
if post_id:
item['post_id'] =
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
request = Request('' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
yield request
def parse_phone(self, response):
    """Store the phone number from the JSON response on the item.

    Args:
        response: Response whose body is a JSON object with a ``data``
            key; ``response.meta['item']`` is the item being filled in.

    Returns:
        The item from ``response.meta`` with ``phone_num`` set to the
        value of ``data``.
    """
    item = response.meta['item']
    # response.text is the decoded body; it replaces the deprecated
    # body_as_unicode() call.
    phone = json.loads(response.text)
    item['phone_num'] = phone['data']
    return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = ''
getphone = ''
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = [""]
start_urls = [
def parse(self, response):
item = CrawlerItem()
pid = []
arr_url = []
for href in response.css('a.listing-link::attr(href)').extract():
if len(href) > 0:
post_id = u''.join(href).encode('utf-8').strip()
post_id = re.match('.*?([0-9]+)$', post_id)
if post_id:
domain = gumtree + href
i = 0
while i < len(arr_url):
url = u''.join(arr_url[i]).encode('utf-8').strip()
request = Request(url, callback=self.parse_post, meta={'url':url,'item':item,'pid':pid[i]}, headers={'Referer':gumtree})
i += 1
yield request
next_page = response.css('li.pagination-next a::attr("href")').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_id'] = response.meta['pid']
item['post_url'] = response.meta['url']
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
ref = item['post_url']
req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN':item['token'], 'Referer':ref}, meta={'url':response.meta['url'],'item':item})
return req
def parse_phone(self, response):
item = response.meta['item']
item['post_url'] = response.meta['url']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
return item