Scrapy yield from a non-callback method - scrapy

I am trying to scrape an HTML file that has a JSON object with all the required test case data. Processing of the JSON happens in the "find" and "parseTestCaseDetails" methods, where I iteratively get the test case details, which I finally parse in "findInterestedFields". My requirement is to yield the details of the test cases to a JSON file from the last method called in the hierarchy, i.e. findInterestedFields. Is it possible to achieve this?
Thanks in advance!
import scrapy
import datetime
import json
import re
import collections
import time
import os
import js2xml
from scrapy.selector import HtmlXPathSelector

class AltiplanoAVInprogressTCSpiderTest(scrapy.Spider):
    name = "AltiplanoAVInprogressTCSpiderTest"
    buildNumber = []
    FormatedBuildTime = []
    keys = []
    testcaseName = ''
    testcaseSuit = ''
    testcaseDoc = ''

    def start_requests(self):
        print os.environ["jenkinsdomain"]
        urls = [
            os.environ["jenkinsdomain"] + "/job/InprogressFlakyAndBlockedTestCaseDetails/lastSuccessfulBuild/"
        ]
        for url in urls:
            print url
            yield scrapy.Request(url=url, callback=self.parse, errback=self.parseerror)

    def parseerror(self, failure):
        print failure

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        buildNumberString = hxs.select('normalize-space(//*[@id="main-panel"]/h1/text())').extract_first()
        self.buildNumber = buildNumberString.split("#")[-1].split("(")[0].strip()
        buildTimeUnformatd = buildNumberString.split("(")[-1].split(")")[0].strip().replace("PM", "").replace("AM", "")
        buildTimeUnformatd = buildTimeUnformatd.strip()
        t = time.strptime(buildTimeUnformatd, "%b %d, %Y %I:%M:%S")
        self.FormatedBuildTime = time.strftime('%d %b %y %H:%M IST', t)
        static_testDetailsUrl = os.environ["jenkinsdomain"] + "XX/Inprogress-ANV.html"
        yield scrapy.Request(url=static_testDetailsUrl, callback=self.parseTestDetails, errback=self.parseerror)

    def find(self, key, dictionary):
        for k, v in dictionary.iteritems():
            if k == key:
                yield v
                self.parseTestCaseDetails(v)
            elif isinstance(v, dict):
                for result in self.find(key, v):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    for result in self.find(key, d):
                        yield result

    def parseTestCaseDetails(self, testcases):
        # get the list of test cases and again parse them one by one to get the name and other fields
        for testcase in testcases:
            self.findInterestedFields(testcase)

    def findInterestedFields(self, dictionary):
        jsonLoad = json.dumps(dictionary, indent=4)
        loaded_json = json.loads(jsonLoad)
        readabledatetime = datetime.datetime.now().strftime("%d %b %y %H:%M IST")
        for k, v in loaded_json.iteritems():
            if k == "name":
                testcaseName = v
            if k == "fullName":
                testcaseSuit = v
            if k == "doc":
                testcaseDoc = v
        yield {"name": "avInprogresstestcases", 'avInprogresstestcases': {
            'testcaseName': testcaseName
        }}

    def parseTestDetails(self, response):
        data = response.xpath('//script[4]/text()').extract_first().strip()
        jstree = js2xml.parse(data)
        testDetailsInJson = js2xml.jsonlike.getall(jstree)
        jsonLoad = json.dumps(testDetailsInJson[0], indent=4)
        loaded_json = json.loads(jsonLoad)
        list(self.find('tests', loaded_json))
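As for whether it is possible: the usual pattern (a hedged sketch built from the code above, not a tested solution for this Jenkins page) is to make every helper in the chain a generator and have each caller re-yield its items, so that whatever findInterestedFields produces bubbles up to the callback that Scrapy actually consumes and ends up in the exported JSON feed:

    def parseTestDetails(self, response):
        data = response.xpath('//script[4]/text()').extract_first().strip()
        jstree = js2xml.parse(data)
        testDetailsInJson = js2xml.jsonlike.getall(jstree)
        loaded_json = json.loads(json.dumps(testDetailsInJson[0]))
        # the registered callback must itself yield the items coming from the helpers
        for tests in self.find('tests', loaded_json):
            for item in self.parseTestCaseDetails(tests):
                yield item

    def parseTestCaseDetails(self, testcases):
        for testcase in testcases:
            for item in self.findInterestedFields(testcase):
                yield item

    def findInterestedFields(self, testcase):
        # still a generator; its items reach the feed exporter only because
        # every caller above re-yields them
        yield {"name": "avInprogresstestcases",
               "avInprogresstestcases": {"testcaseName": testcase.get("name")}}

Run the spider with -o testcases.json (or a FEEDS setting) and the yielded dicts are written to the file.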

Related

No adapter found for objects of type: 'itemadapter.adapter.ItemAdapter'

I want to change the names of images downloaded from a webpage. I want to use the standard names given by the website, as opposed to cleaning the request URL for them.
I have the following pipelines.py:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline

class ScrapyExercisesPipeline:
    def process_item(self, item, spider):
        adapter = ItemAdapter(item)
        return adapter

class DownfilesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        adapter = ScrapyExercisesPipeline().process_item()[0]
        image_name: str = f'{adapter}.jpg'
        return image_name
This produces the following error:
raise TypeError(f"No adapter found for objects of type: {type(item)} ({item})")
TypeError: No adapter found for objects of type: <class 'itemadapter.adapter.ItemAdapter'> (<ItemAdapter for ScrapyExercisesItem(name='unknown267', images=['https://bl-web-assets.britishland.com/live/meadowhall/s3fs-public/styles/retailer_thumbnail/public/retailer/boots_1.jpg?qQ.NHRs04tdmGxoyZKerRHcrhCImB3JH&itok=PD5LxLmS&cb=1657061667-curtime&v=1657061667-curtime'])>)
scraper.py:
import scrapy
from scrapy_exercises.items import ScrapyExercisesItem

class TestSpider(scrapy.Spider):
    name = 'test'
    #allowed_domains = ['x']
    start_urls = ['https://www.meadowhall.co.uk/eatdrinkshop?page=1']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                cb_kwargs={'pg': 0}
            )

    def parse(self, response, pg):
        pg = 0
        content_page = response.xpath("//div[@class='view-content']//div")
        for cnt in content_page:
            image_url = cnt.xpath(".//img//@src").get()
            image_name = cnt.xpath(".//img//@alt").get()
            if image_url != None:
                pg += 1
                items = ScrapyExercisesItem()
                if image_name == '':
                    items['name'] = 'unknown' + f'{pg}'
                    items['images'] = [image_url]
                    yield items
                else:
                    items['name'] = image_name
                    items['images'] = [image_url]
                    yield items
settings.py
ITEM_PIPELINES = {
    #'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy_exercises.pipelines.ScrapyExercisesPipeline': 45,
    'scrapy_exercises.pipelines.DownfilesPipeline': 55
}

from pathlib import Path
import os

BASE_DIR = Path(__file__).resolve().parent.parent
IMAGES_STORE = os.path.join(BASE_DIR, 'images')
IMAGES_URLS_FIELD = 'images'
IMAGES_RESULT_FIELD = 'results'
You are calling one pipeline from inside another pipeline, while that pipeline is also registered in your settings to run on every item. It would be simpler to just extract the name field from the item passed to your DownfilesPipeline and return it.
Change your pipelines.py file to:
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline

class DownfilesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        return item['name'] + '.jpg'
You also need to turn off the ScrapyExercisesPipeline in your settings.
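For instance, the ITEM_PIPELINES block from the question would then look something like this (a sketch; only the DownfilesPipeline entry is kept):

ITEM_PIPELINES = {
    # 'scrapy_exercises.pipelines.ScrapyExercisesPipeline': 45,  # disabled
    'scrapy_exercises.pipelines.DownfilesPipeline': 55,
}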

scrapy-playwright returning nothing but an error

I'm learning scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using a CrawlSpider with a rule that includes a process_request which routes the request through Playwright. In my callback I can print a value found on the page, but not return or yield anything. I've attempted to cache the data into an item and to return/yield a dict, all of which produce the error:
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response

def set_playwright_true(request, response):
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
    return request

class StoreSpider(CrawlSpider):
    name = "retailer"
    allowed_domains = ['retailer.com']
    start_urls = ['https://www.retailer.com/store/0000-city-ak']
    custom_settings = {
        'ROBOTSTXT_OBEY': True,
        #'DOWNLOAD_DELAY': .5,
        #'CONCURRENT_REQUESTS_PER_DOMAIN': 3,
        'DOWNLOAD_HANDLERS': {
            "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
            "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        },
        'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
    rules = (
        Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
        Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')),
             callback='parse_item', follow=False, process_request=set_playwright_true),
    )

    async def parse_item(self, response):
        items = []
        item = StoreItem()
        self.logger.info('***** Start processing ' + response.url + '. *****')
        Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
        print(Name)
        item['Name'] = Name
        item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
        item['Address1'] = response.css('span.store-address-line-1::text').get()
        item['City'] = response.css('span.store-address-city::text').get()
        item['State'] = response.css('span.store-address-state::text').get()
        item['Zip'] = response.css('span.store-address-postal::text').get()
        item['Phone'] = response.css('div.store-phone::text').get()
        item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
        item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
        items.append(item)
        return items
Changing parse_item from an async def to a plain def resolved the issue.
async def parse_item(self, response):
changed to
def parse_item(self, response):
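As a side note (a hedged sketch, not part of the original answer): in scrapy-playwright the playwright_page_coroutines meta key is normally a list of PageCoroutine objects rather than a bare tuple of strings, so the wait in set_playwright_true would usually look something like this (selector taken from the question):

from scrapy_playwright.page import PageCoroutine

def set_playwright_true(request, response):
    request.meta["playwright"] = True
    request.meta["playwright_include_page"] = True
    # wait until the store name is rendered before the callback runs
    request.meta["playwright_page_coroutines"] = [
        PageCoroutine("wait_for_selector", "span.store-name-city"),
    ]
    return request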

Only crawls the first page when saving detailed contents as a dataframe in Python

I'm trying to loop over pages, crawl and save detailed contents from this link:
Based on the code from here, I've modified it to:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls[:1]:  # remove [:1] to scrape all the pages
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

updated_df = pd.DataFrame()

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        # print(f"Fetching data for {key}...")
        dfs = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4")
        # https://stackoverflow.com/questions/39710903/pd-read-html-imports-a-list-rather-than-a-dataframe
        for df in dfs:
            # df = dfs[0].T
            df = dfs[0].T.iloc[1:, :].copy()
            updated_df = updated_df.append(df)
            print(updated_df)

cols = ['项目编号', '转让/出租标的名称', '转让方/出租方名称', '转让标的评估价/年租金评估价(元)',
        '转让底价/年租金底价(元)', '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期']
updated_df.columns = cols
updated_df.to_excel('./data.xlsx', index=False)
But it only successfully crawls the first page. How could I crawl all the pages and also add a URL column? Thanks.
Is this what you're looking for? This processes all the URLs and dumps the list of dataframes to a single Excel file.
Here's how:
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.jscq.com.cn/dsf/zc/cjgg"
COLUMNS = [
    '项目编号', '转让/出租标的名称', '转让方/出租方名称',
    '转让标的评估价/年租金评估价(元)', '转让底价/年租金底价(元)',
    '受让方/承租方名称', '成交价/成交年租金(元)', '成交日期', 'URL'
]

def get_main_urls() -> list:
    start_url = f"{BASE_URL}/index.html"
    return [start_url] + [f"{BASE_URL}/index_{i}.html" for i in range(1, 6)]

def get_follow_urls(urls: list, session: requests.Session()) -> iter:
    for url in urls:
        body = session.get(url).content
        s = BeautifulSoup(body, "lxml").find_all("td", {"width": "60%"})
        yield from [f"{BASE_URL}{a.find('a')['href'][1:]}" for a in s]

def post_process(list_of_dataframes: list, source_url: str) -> pd.DataFrame():
    temp_df = list_of_dataframes[0]
    temp_df = temp_df.append(
        pd.Series(["URL", source_url], index=temp_df.columns),
        ignore_index=True,
    )
    return temp_df.T.iloc[1:, :].copy()

def dump_to_excel(post_processed_dfs: list):
    df = pd.concat(post_processed_dfs)
    df.columns = COLUMNS
    df.to_excel("scraped_data.xlsx", index=False)

processed_dfs = []

with requests.Session() as connection_session:  # reuse your connection!
    for follow_url in get_follow_urls(get_main_urls(), connection_session):
        key = follow_url.rsplit("/")[-1].replace(".html", "")
        print(f"Fetching data for {key}...")
        df_list = pd.read_html(
            connection_session.get(follow_url).content.decode("utf-8"),
            flavor="bs4",
        )
        processed_dfs.append(post_process(df_list, follow_url))

dump_to_excel(processed_dfs)
Output:

Scrapy does not work for pages with pagination page > 1, although the links given are correct

Scrapy does not work for pages with pagination page > 1, although the links given are correct.
Environment: Linux, Debian 9, Python 3.5, MongoDB, Scrapy, Scrapy-Splash
My code:
import scrapy
import copy
import datetime
import json
import pymongo
from webscrapy.items import WebscrapyItem
from scrapy.conf import settings
from bson.objectid import ObjectId

class YooxSpiderSpider(scrapy.Spider):
    name = 'yoox-spider'
    allowed_domains = ['yoox.com']
    base_url = 'https://www.yoox.com'
    job = {}
    start_url = ''
    splash_url = ''
    connection = pymongo.MongoClient(settings['MONGODB_SERVER'],
                                     settings['MONGODB_PORT'])
    db = connection[settings['MONGODB_DB']]

    def __init__(self, job_id):
        self.job = self.db.jobs.find_one({'_id': ObjectId(job_id)})
        self.start_url = self.job['start_url']
        self.splash_url = self.job['splash_url']

    def start_requests(self):
        # job['start_url'] - this is the starting link for the desired category, for example Yoox/Woman or Yoox/Men
        print("------------- start ---------")
        yield scrapy.Request(url=''.join((self.splash_url, self.start_url)), callback=self.parse)

    def parse(self, response):
        for cat in [response.xpath(
                "//div[@id='teleyooxCategories']/div[@class='teleyoox-section-content']/div[@class='teleyoox-section-padding']/ul[@class='text-size-default']/li")[0]]:
            #url_category = response.urljoin('/render.html?url=https://www.yoox.com' + cat.xpath('./a/@href').extract_first())
            sub_url_category = cat.xpath('./a/@href').extract_first()
            if sub_url_category:
                url_category = ''.join((self.base_url, cat.xpath('./a/@href').extract_first()))
                Item = WebscrapyItem()
                Item['job_id'] = self.job['_id']
                Item['basecat'] = self.job['basecat']
                Item['gender'] = self.job['gender']
                Item['category'] = cat.xpath('./a/text()').extract_first().strip()
                Item['url_category'] = url_category
                yield scrapy.Request(url=''.join((self.splash_url, url_category)), meta={'Item': Item}, callback=self.parse_data, dont_filter=True)

    def parse_data(self, response):
        print('')
        print('')
        print(' ++++++++ current page ++++++++ ', response.url)
        print('')
        print('')
        # Getting product references and product_id
        for product in response.xpath("//div[@id='itemsGrid']/div[@id='srpage1']/div[@class='col-8-24']"):
            sub_url_product = product.xpath('./div/div/a/@href').extract_first()
            #url_product = response.urljoin('/render.html?url=https://www.yoox.com' + product.xpath('./div/div/a/@href').extract_first())
            if sub_url_product:
                Item = copy.deepcopy(response.meta['Item'])
                product_id = product.xpath('./div/@id').extract_first()
                price = product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][2:]
                sizes = [size for size in product.xpath("./div/div/a[@class='itemlink']/div[@class='colorSize']/div[@class='size text-light']/span/text()").extract()]
                available_products = {
                    'basecat': Item['basecat'],
                    'category': Item['category'],
                    'job_id': Item['job_id'],
                    'product_id': product_id,
                }
                #if not self.db.data.find(available_products).count():
                #print('NEW product: ', product_id)
                cutout_images = [
                    product.xpath("./div/div/a/img/@data-original").extract_first(),
                    product.xpath("./div/div/a/img/@rel").extract_first(),
                ]
                Item['dt'] = datetime.datetime.utcnow()
                Item['product_id'] = product_id
                Item['url_product'] = ''.join((self.base_url, sub_url_product))
                Item['data'] = {
                    'sku': '',
                    'date': '',
                    'cutout_image': cutout_images,
                    'data-category': product.xpath("./div/@data-category").extract_first(),
                    'microcategory': product.xpath("./div/div/a[@class='itemlink']/div[@class='microcategory font-sans']/text()").extract_first().strip(),
                    'description': '',
                    'price': price,
                    # currency - taken from the first character of the product price
                    'currency': product.xpath("./div/div[@class='itemData text-center']/a[@class='itemlink']/div[@class='price']/span/text()").extract()[-1][0],
                    'brand': product.xpath("./div/div/a[@class='itemlink']/div[@class='brand font-bold text-uppercase']/text()").extract_first(),
                    'merchant': '',
                    'sizes': sizes,
                    #response.xpath().extract_first()
                }
                #yield scrapy.Request(url=''.join((self.splash_url, Item['url_product'])), meta={'Item': Item}, callback=self.parse_details, dont_filter=True)
                yield Item

        #next_page_url = response.xpath("//div[@id='navigation-bar-top']/div[@class='col-6-24']/div[@id='pagination-lite']/a[@class='pure-menu-item nextPage js-track-me']/@href").extract_first()
        next_page_url = response.xpath(
            "//div[@id='navigation-bar-bottom']/div[@class='col-16-24']/ul[@class='pagination list-inline pull-right text-center js-pagination']/li[@class='next-page']/a/@href").extract_first()
        if next_page_url:
            print('')
            print('')
            print(' ++++++++ next page ++++++++ ', next_page_url)
            print('')
            print('')
            yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data, dont_filter=True)
        else:
            print(' ++++++++ NEXT CATEGORY ++++++++ ')
            pass

    def parse_details(self, response):
        # Deep copy to avoid mixing data between requests
        Item = copy.deepcopy(response.meta['Item'])
        #other_data = json.loads(response.xpath('//section[@id="product"]/script[@type="application/ld+json"]//text()').extract_first())
        Item['details'] = {
            'header': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='itemTitle']/h1/a/text()").extract_first().strip(),
            'price': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[1]/text()").extract_first(),
            'priceCurrency': response.xpath("//div[@id='itemData']/div[@id='js-item-details']/div[@id='item-price']/span[@class='font-bold']/span[2]/@content").extract_first(),
            #'colorName': response.xpath("//div[@id='js-item-color-size']/div[@id='itemColors']/div[@class='dataTitleBox font-bold text-uppercase text-size-xs margin-bottom']/span[@class='select-color-size-label']/text()").extract_first(),
            #'reference': response.xpath("//div[@class='info-section']/div[@class='product-info-wrapper _product-info'][1]/p[@class='product-color']/span[2]/text()").extract_first(),
            'description': response.xpath("//div[@id='itemContent']/div[@class='row text-size-default info-2cols']/div[@class='info-col-1 item-info-column col-1-2']/ul/li[@id='itemDescription']/div[@class='info-body font-sans padding-half-top']/text()").extract_first(),
            #'sizeList': response.xpath("//div[@id='js-item-details']/div[@id='js-item-color-size']/div[@id='itemSizes']/ul").extract_first(),
            #'other_data': other_data,
        }
        print('')
        print('')
        print(Item)
        print('')
        print('')
        yield Item
Parsing works only for the first pages of all categories, although the code is there and refers to callback=self.parse_data:
if next_page_url:
    yield scrapy.Request(url=''.join((self.splash_url, next_page_url)), callback=self.parse_data)
else:
    pass
A message with the URL for the new page=2 is printed to the console, but every "current page" URL turns out to be a new category. Please help me.
++++++++ next page ++++++++ https://***/us/women/shoponline/underwear_mc/2#/dept=women&gender=D&page=2&attributes=%7b%27ctgr%27%3a%5b%27ntm%27%5d%7d&season=X
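One thing worth checking (an observation based on the URL above, not a confirmed fix): everything after the # in that next-page link is a URL fragment, and fragments are never sent over HTTP. When the link is glued onto splash_url by plain string concatenation, the part carrying page=2 and the category filter is dropped before Splash ever sees it, which would explain why the "next page" renders as a different, unfiltered listing. A hedged sketch of passing the target URL as a properly encoded parameter (assuming splash_url ends with something like render.html?url=):

from urllib.parse import quote

# ...inside parse_data, where the next-page request is built
if next_page_url:
    # percent-encode the whole target URL so the "#...&page=2" part survives
    # being embedded as the ?url= query parameter of the Splash endpoint
    yield scrapy.Request(url=self.splash_url + quote(next_page_url, safe=''),
                         callback=self.parse_data, dont_filter=True)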

Getting repeated requests from the same URL with different values

I am trying to crawl some data as a side project but I am having a problem gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page I get a wrong token.
Second problem:
I have read the Scrapy docs on Request and tried to implement them to get the phone number, but in vain; I also tried this answer on Stack Overflow.
Third problem:
How would I go about implementing the next page (see the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers, but I am getting repeated requests to the same URL with different values [see results].
I would really appreciate it if anyone could point me in the right direction.
My main goal is to crawl posts that have phone numbers.
I have tried searching Stack Overflow but couldn't find the proper post.
Many thanks.
settings.py
BOT_NAME = 'crawler'

SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'

USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"

TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector

def complete_link(string):
    return string

class MyItem(Item):
    token = Field()
    post_id = Field()
    post_url = Field()
    phone_num = Field()
    phone_url = Field()

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = MyItem()
        for href in response.css('a.listing-link::attr(href)').extract():
            domain = 'https://www.gumtree.com' + href
            request = Request(domain, callback=self.parse_post, meta={'domain': domain, 'item': item})
            yield request

        # next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['domain']
        post_id = re.match('.*?([0-9]+)$', item['post_url'])
        if post_id:
            item['post_id'] = post_id.group(1)
            token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
            arr_token = re.findall(r'"([^"]*)"', str(token))
            if len(arr_token) == 15:
                item['token'] = arr_token[-2]
            request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'],
                              headers={'X-GUMTREE-TOKEN': item['token']},
                              callback=self.parse_phone, meta={'item': item})
            yield request

    def parse_phone(self, response):
        item = response.meta['item']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = phone['data']
        return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
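A quick way to check that (a small sketch; parse_token here stands in for whichever callback actually receives the request):

def parse_token(self, response):
    item = response.meta.get('item')
    # log what arrived with the response before trying to use it
    self.logger.info('meta item for %s: %r', response.url, item)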
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector

gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'

class GumtreeSpider(scrapy.Spider):
    name = "gumtree"
    allowed_domains = ["gumtree.com"]
    start_urls = [
        'https://www.gumtree.com/search?search_category=cars',
    ]

    def parse(self, response):
        item = CrawlerItem()
        pid = []
        arr_url = []
        for href in response.css('a.listing-link::attr(href)').extract():
            if len(href) > 0:
                post_id = u''.join(href).encode('utf-8').strip()
                post_id = re.match('.*?([0-9]+)$', post_id)
                if post_id:
                    pid.append(post_id.group(1))
                domain = gumtree + href
                arr_url.append(domain)

        i = 0
        while i < len(arr_url):
            url = u''.join(arr_url[i]).encode('utf-8').strip()
            request = Request(url, callback=self.parse_post,
                              meta={'url': url, 'item': item, 'pid': pid[i]},
                              headers={'Referer': gumtree})
            i += 1
            yield request

        next_page = response.css('li.pagination-next a::attr("href")').extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield Request(next_page, callback=self.parse)

    def parse_post(self, response):
        item = response.meta['item']
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
        arr_token = re.findall(r'"([^"]*)"', str(token))
        if len(arr_token) == 15:
            item['token'] = arr_token[-2]
            ref = item['post_url']
            req = Request(getphone + item['post_id'], callback=self.parse_phone,
                          headers={'X-GUMTREE-TOKEN': item['token'], 'Referer': ref},
                          meta={'url': response.meta['url'], 'item': item})
            return req

    def parse_phone(self, response):
        item = response.meta['item']
        item['post_url'] = response.meta['url']
        phone = json.loads(response.body_as_unicode())
        item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
        return item
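One caveat (an observation, not part of the original answer): the single CrawlerItem created in parse is still shared by every request through meta, so concurrent callbacks can overwrite each other's fields, which is a classic cause of repeated items with mixed-up values. A minimal sketch of the safer pattern is to work on a copy per request:

import copy

class GumtreeSpider(scrapy.Spider):
    # ...same as above...

    def parse_post(self, response):
        # take a private copy so parallel responses cannot overwrite each other's fields
        item = copy.deepcopy(response.meta['item'])
        item['post_id'] = response.meta['pid']
        item['post_url'] = response.meta['url']
        # ...continue exactly as in the parse_post above...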