I'm new to Scrapy and I'm trying to scrape 300 URLs per spider for my project; for testing purposes I only scrape 2 URLs. With the proxy rotator the script takes 120 to 500 seconds to finish, but without a proxy it takes 2 to 20 seconds. Is there anything that would improve the overall performance of my script? I use the proxy rotator from proxyrotator.com.
I often get TimeoutError when using the proxy rotator. Any advice or answer would be appreciated.
settings.py
#settings.py
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 0.25
RETRY_TIMES = 3
CONNECTION_TIMEOUT = 30
COOKIES_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
"Accept": "application/json, text/javascript, */*; q=0.01",
"DNT": "1",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
}
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 100
AUTOTHROTTLE_ENABLED = False
DOWNLOAD_TIMEOUT = 30
DOWNLOADER_MIDDLEWARES = {
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400,
}
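A hedged aside on the settings above: as far as I know, CONNECTION_TIMEOUT is not a setting Scrapy reads (DOWNLOAD_TIMEOUT below is the one that applies), and when most of the wall time is spent waiting on dead proxies, a shorter timeout plus retries tends to help more than raising concurrency. A sketch, with illustrative numbers rather than tuned values:
# settings.py (sketch; numbers are assumptions, not tuned for proxyrotator.com)
DOWNLOAD_TIMEOUT = 15   # fail fast on unresponsive proxies instead of waiting 30 s
RETRY_ENABLED = True
RETRY_TIMES = 2
RETRY_HTTP_CODES = [500, 502, 503, 504, 522, 524, 408, 429]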
static_spider.py
class ArticleStaticSpider(scrapy.Spider):
logfile("server.log", maxBytes=1e6, backupCount=3)
name = "article_static"
custom_settings = {
'ITEM_PIPELINES': {'news_extractor.pipelines.StaticExtractorPipeline': 300},
"FEEDS": {"articles.json": {"format": "json"}},
}
def __init__(self, urls=None):
self.urls = urls
def start_requests(self):
for url in self.urls:
proxy = get_proxy() # get request for proxy rotator
ip = proxy['ip']
port = proxy['port']
meta_proxy = f"http://{ip}:{port}"
headers = {
"User-Agent": proxy['randomUserAgent']
}
meta = {
"proxy": meta_proxy }
yield scrapy.Request(url, callback=self.parse, headers=headers, meta=meta, errback=self.errback_httpbin)
def parse(self, response):
src = StaticSource(response.url) # module for cleaning html tags
text_format = src.text
news = News(response.url, text_format) # module for extracting data
data = news.generate_data()
print(data)
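A side note, unrelated to the timing: with FEEDS and an item pipeline configured, parse has to yield the data for it to reach them; printing alone won't. A minimal sketch, assuming generate_data() returns a dict or an Item:
def parse(self, response):
    src = StaticSource(response.url)
    news = News(response.url, src.text)
    yield news.generate_data()  # yield, not print, so the feed exporter and pipeline receive it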
scraper.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import time
urls = [
"http://www.nytimes.com/2021/02/28/technology/seniors-vaccines-technology.html",
"http://www.nytimes.com/2021/02/28/us/schools-reopening-philadelphia-parents.html"
]
def main():
process = CrawlerProcess(get_project_settings())
process.crawl('article_static', urls=urls)
process.start()
if __name__ == "__main__":
t1 = time.perf_counter()
main()
t2 = time.perf_counter()
print(f'Finished in {round(t2-t1, 2)} second(s).....')
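One thing that stands out in static_spider.py: get_proxy() is a blocking HTTP call to the rotator API, made once per URL inside start_requests, so every request pays that round trip before it is even scheduled, and a slow or rate-limited rotator alone can account for much of the 120 to 500 seconds. A minimal sketch of fetching a small pool up front and cycling through it instead; get_proxy() is the same helper used above, and the pool size and reuse policy are assumptions, not recommendations:
import itertools

class ProxyPool:
    """Fetch a handful of proxies once, then hand them out round-robin."""
    def __init__(self, size=10):
        # All blocking calls to the rotator API happen here, once at startup,
        # instead of once per scraped URL.
        self._cycle = itertools.cycle([get_proxy() for _ in range(size)])

    def next_proxy(self):
        p = next(self._cycle)
        return f"http://{p['ip']}:{p['port']}", p['randomUserAgent']

# and inside ArticleStaticSpider:
def start_requests(self):
    pool = ProxyPool(size=10)
    for url in self.urls:
        proxy_url, user_agent = pool.next_proxy()
        yield scrapy.Request(url, callback=self.parse, headers={"User-Agent": user_agent},
                             meta={"proxy": proxy_url}, errback=self.errback_httpbin)
If the timeouts persist even with a warm pool, timing the rotator call separately from the page download should show where the remaining seconds actually go.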
Related
I'm learning Scrapy-playwright and it's fighting me. I'm attempting to gather store locations from a site using a CrawlSpider with a rule whose process_request routes the request through Playwright. In my callback I can print a value found on the page, but I cannot return or yield anything. I've attempted to cache the data into an item, and to return/yield a dict, all of which produces the error:
ERROR: Spider must return request, item, or None, got 'Deferred'
I'm stumped.
import re
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from banners.items import StoreItem
from scrapy_playwright.page import PageCoroutine
from scrapy.http.response import Response
def set_playwright_true(request, response):
request.meta["playwright"] = True
request.meta["playwright_include_page"] = True
request.meta["playwright_page_coroutines"] = ('wait_for_selector', 'span.store-name-city')
return request
class StoreSpider(CrawlSpider):
name = "retailer"
allowed_domains = ['retailer.com']
start_urls = ['https://www.retailer.com/store/0000-city-ak']
custom_settings = {
'ROBOTSTXT_OBEY': True ,
#'DOWNLOAD_DELAY': .5 ,
#'CONCURRENT_REQUESTS_PER_DOMAIN': 3 ,
'DOWNLOAD_HANDLERS': {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" ,
} ,
'TWISTED_REACTOR': "twisted.internet.asyncioreactor.AsyncioSelectorReactor" ,
}
rules = (
Rule(LinkExtractor(allow=('directory/ak/anchorage'))),
Rule(LinkExtractor(allow=(r'store/[0-9]+'), deny=(r'store/[0-9]+.+/.+')), callback='parse_item', follow=False, process_request=set_playwright_true),
)
async def parse_item(self, response):
items = []
item = StoreItem()
self.logger.info('***** Start processing ' + response.url + '. *****')
Name = response.css('meta[itemprop=alternateName]').attrib['content'] + ' - ' + response.css('span.store-name-city::text').get()
print(Name)
item['Name'] = Name
item['StoreID'] = response.css('meta[itemprop=storeID]').attrib['content']
item['Address1'] = response.css('span.store-address-line-1::text').get()
item['City'] = response.css('span.store-address-city::text').get()
item['State'] = response.css('span.store-address-state::text').get()
item['Zip'] = response.css('span.store-address-postal::text').get()
item['Phone'] = response.css('div.store-phone::text').get()
item['Latitude'] = response.css('meta[itemprop=latitude]').attrib['content']
item['Longitude'] = response.css('meta[itemprop=longitude]').attrib['content']
items.append(item)
return(items)
Changing parse_item from an async def to a plain def resolved the issue. It appears the CrawlSpider rule machinery (at least in the Scrapy version used here) doesn't await coroutine callbacks, so the async version comes back wrapped in a Deferred, which is exactly what the error complains about.
async def parse_item(self, response):
changed to
def parse_item(self, response):
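If the wait_for_selector behaviour is still needed after switching back to a plain def, the page-coroutines meta is normally a list of PageCoroutine objects rather than a tuple of strings. A sketch, reusing the PageCoroutine import from the question (newer scrapy-playwright releases call it PageMethod):
def set_playwright_true(request, response):
    request.meta["playwright"] = True
    # playwright_include_page is omitted here: with a plain def callback there is
    # no way to await or close the page, so requesting it would only leak pages.
    request.meta["playwright_page_coroutines"] = [
        PageCoroutine("wait_for_selector", "span.store-name-city"),
    ]
    return request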
My spider seems to only grab about 1 job listing from each webpage. When I remove parse_jobs and load_item() in parse, I can extract all the job listings for each page. So the issue likely lies in how parse_jobs loads the items, but I cannot figure out what it is.
Here's what I have tried:
from scrapy.item import Field
import scrapy
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from itemloaders.processors import TakeFirst, MapCompose, Join
import pandas as pd
from collections import defaultdict
from scrapy_splash import SplashRequest
headers = {
'authority': 'api2.branch.io',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'sec-ch-ua-mobile': '?0',
'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'sec-ch-ua-platform': '"macOS"',
'content-type': 'application/x-www-form-urlencoded',
'accept': '*/*',
'origin': 'https://www.reed.co.uk',
'sec-fetch-site': 'cross-site',
'sec-fetch-mode': 'cors',
'sec-fetch-dest': 'empty',
'referer': 'https://www.reed.co.uk/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
class ReedItem(scrapy.Item):
category = Field(output_processor = TakeFirst())
salary = Field(output_processor = TakeFirst())
title = Field(output_processor = TakeFirst())
organisation = Field(output_processor = TakeFirst())
region = Field(input_processor = MapCompose(str.strip),
output_processor = TakeFirst())
items = Field(output_processor = TakeFirst())
post = Field(input_processor = MapCompose(),
output_processor = Join(" "))
page_no = Field(output_processor = TakeFirst())
class ReedSpider(scrapy.Spider):
name = 'reed'
degrees={'upper': ['Finance','Accounting','Aeronautical-Engineering','Manufacturing-Engineering'],
'degrees_entry': ['degree-finance-entry','degree-accounting-entry','degree-aeronautical-engineering-entry','degree-manufacturing-engineering-entry'],
'degrees_graduate': ['degree-finance-graduate','degree-accounting-graduate','degree-aeronautical-engineering-graduate','degree-manufacturing-engineering-graduate'],
'degrees': ['degree-finance','degree-accounting','degree-aeronautical-engineering','degree-manufacturing-engineering'],
'graduate_entry': ['graduate-finance-entry','graduate-accounting-entry','graduate-aeronautical-engineering-entry','graduate-manufacturing-engineering-entry'],
'graduate': ['graduate-finance','graduate-accounting','graduate-aeronautical-engineering','graduate-manufacturing-engineering'],
'sector': ['Accountancy_Finance','Accountancy_Finance','Engineering_Manufacturing','Engineering_Manufacturing'],
'degree_type': ['Accountancy_finance','Accountancy_finance','Aeronautical_Engineering','Manufacturing_Engineering']}
degree = pd.DataFrame(degrees)
start_urls = defaultdict(list)
custom_settings = {
'USER_AGENT':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'DOWNLOAD_DELAY':0.5,
#'LOG_LEVEL':'INFO',
}
def start_requests(self):
for degrees, degrees_entry,degrees_graduate, graduate_entry, graduate,sector in zip(self.degree.degrees,self.degree.degrees_entry,self.degree.degrees_graduate,self.degree.graduate_entry,self.degree.graduate, self.degree.sector):
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_entry}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate_entry}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{degrees_graduate}-jobs')
self.start_urls[sector].append(f'https://www.reed.co.uk/jobs/{graduate}-jobs')
for items, urls in self.start_urls.items():
for url in urls:
yield scrapy.Request(
url = url,
headers=headers,
callback = self.parse,
cb_kwargs = {
'items':items,
'page_no':0
}
)
def parse(self, response, items, page_no):
container = response.xpath("//div[@class='row search-results']")
for lists in container:
page_no += 1
loader = ItemLoader(ReedItem(), selector = lists)
loader.add_value('items', items)
loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
loader.add_value('page_no', page_no)
loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
loader.add_xpath('title', './/h3[@class="title"]/a/@title')
loader.add_xpath('salary', './/li[@class="salary"]/text()')
loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
links = response.xpath('.//h3[@class="title"]/a/@href').get()
yield response.follow(
response.urljoin(links),
callback = self.parse_jobs,
cb_kwargs = {
'loader':loader
})
next_page = response.xpath('//a[@id="nextPage"]/@href').get()
if next_page:
yield response.follow(
url = next_page,
callback = self.parse,
headers=headers,
cb_kwargs = {
'items':items,
'page_no':page_no
})
def parse_jobs(self, response, loader):
loader.add_value('post', response.xpath('(//span[@itemprop="description"]/p/text()) | (//span[@itemprop="description"]/p//text()) | (//span[@itemprop="description"]/ul//li/text())').getall())
yield loader.load_item()
process = CrawlerProcess(
settings = {
'FEED_URI':'reed_jobs_post.jl',
'FEED_FORMAT':'jsonlines'
}
)
process.crawl(ReedSpider)
process.start()
I commented out the lines I changed. The original container selector matched the whole search-results div just once, so the loop body ran only one time per page, and links was taken from the full response with .get(), which always returns just the first job URL. Selecting each //article and extracting the link relative to lists fixes both.
def parse(self, response, items, page_no):
# container = response.xpath("//div[@class='row search-results']")
container = response.xpath("//div[@class='row search-results']//article")
page_no += 1
for lists in container:
# page_no += 1
loader = ItemLoader(ReedItem(), selector=lists)
loader.add_value('items', items)
loader.add_xpath('region', ".//div[@class='metadata']//ul//li[@class='location']//text()")
loader.add_value('page_no', page_no)
loader.add_xpath('category', "//div[@class='col-sm-11 col-xs-12 page-title']//h1/text()")
loader.add_xpath('title', './/h3[@class="title"]/a/@title')
loader.add_xpath('salary', './/li[@class="salary"]/text()')
loader.add_xpath('organisation', './/a[@class="gtmJobListingPostedBy"]/text()')
# links = response.xpath('.//h3[@class="title"]/a/@href').get()
links = lists.xpath('.//h3[@class="title"]/a/@href').get()
yield response.follow(
response.urljoin(links),
callback=self.parse_jobs,
cb_kwargs={
'loader': loader
}
)
next_page = response.xpath('//a[@id="nextPage"]/@href').get()
if next_page:
yield response.follow(
url=next_page,
callback=self.parse,
headers=headers,
cb_kwargs={
'items': items,
'page_no': page_no
})
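A small optional simplification to the fix above, shown as a sketch: response.follow already accepts relative URLs, so the response.urljoin(links) wrapper can be dropped.
yield response.follow(
    links,  # response.follow resolves relative hrefs itself
    callback=self.parse_jobs,
    cb_kwargs={'loader': loader},
)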
For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if I immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime
landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb
#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
name='LandingPage004Spider'
#classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
#crawler.signals.connect(spider.spider_opened, signals.spider_opened)
crawler.signals.connect(spider.spider_closed, signals.spider_closed)
return spider
def spider_closed(self, spider):
#stats = spider.crawler.stats.get_stats()
stats = spider.crawler.stats.get_value('item_scraped_count'),
Session = sessionmaker(bind=engine)
session = Session()
logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
)
session.add(logitem)
session.commit()
session.close()
#mapping = landingpages_today
handle_httpstatus_list = [200, 302, 404, 400, 500]
start_urls = []
def start_requests(self):
for url, client_id in self.mapping:
yield Request(url, callback=self.parse, meta={'client_id': client_id})
def parse(self, response):
##DEBUG - return all scraped data
#wholepage = response.body.lower()
url = response.url
if 'redirect_urls' in response.request.meta:
redirecturl = response.request.meta['redirect_urls'][0]
if 'utm.pag.ca' in redirecturl:
url_shortener = response.request.meta['redirect_urls'][0]
else:
url_shortener = 'None'
else:
url_shortener = 'None'
client_id = response.meta['client_id']
url_title = response.css("title::text").extract_first()
# pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
pagesize = len(response.body)
HTTP_code = response.status
####ERROR CHECK: Small page size
if 'instapage' in response.body.lower():
if pagesize <= 20000:
err_small = 1
else:
err_small = 0
else:
if pagesize <= 35000:
err_small = 1
else:
err_small = 0
####ERROR CHECK: Page contains the phrase 'not found'
if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
#their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
if 'dealerinspire' in response.body.lower():
err_has_not_found = 0
else:
err_has_not_found = 1
else:
err_has_not_found = 0
####ERROR CHECK: Page contains the phrase 'can't be found'
if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
err_has_cantbefound = 1
else:
err_has_cantbefound = 0
####ERROR CHECK: Page contains the phrase 'unable to locate'
if 'unable to locate' in response.body.lower():
err_has_unabletolocate = 1
else:
err_has_unabletolocate = 0
####ERROR CHECK: Page contains phrase 'no longer available'
if 'no longer available' in response.body.lower():
err_has_nolongeravailable = 1
else:
err_has_nolongeravailable = 0
####ERROR CHECK: Page contains phrase 'no service specials'
if 'no service specials' in response.body.lower():
err_has_noservicespecials = 1
else:
err_has_noservicespecials = 0
####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
if 'sorry, no ' in response.body.lower():
err_has_sorryno = 1
else:
err_has_sorryno = 0
yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}
#E-mail settings
def sendmail(recipients,subject,body):
fromaddr = "#######"
toaddr = recipients
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['Subject'] = subject
body = body
msg.attach(MIMEText(body, 'html'))
server = smtplib.SMTP('########')
server.starttls()
server.login(fromaddr, "##########")
text = msg.as_string()
server.sendmail(fromaddr, recipients, text)
server.quit()
The expected result is a perfect scrape, with no errors.
The actual results are unpredictable AttributeErrors, claiming that the attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Scrapy only attaches .css()/.xpath() to responses it recognises as text/HTML (TextResponse/HtmlResponse). When a page comes back empty, or with a content type it can't identify as text, the callback gets a plain Response with no .css(), hence the intermittent AttributeError. You can catch these cases in your code and inspect the offending response:
def parse(self, response):
try:
....
your code
.....
except:
with open("Error.htm", "w") as f:
f.write(response.body)
UPDATE You can try to check for empty response:
def parse(self, response):
if not response.body:
yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
# your original code
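Another way to phrase the same guard, as a sketch rather than a tested fix: check the response class, and re-queue the URL with dont_filter so the dupefilter doesn't discard the retry.
from scrapy.http import HtmlResponse

def parse(self, response):
    if not isinstance(response, HtmlResponse):
        # Empty body or unrecognised content type: retry once instead of crashing.
        yield scrapy.Request(response.url, callback=self.parse, dont_filter=True,
                             meta={'client_id': response.meta['client_id']})
        return
    # the original parse logic continues unchanged from here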
I am using Scrapy along with scrapyrt to create APIs. My application has around 20 spiders. We are using an NFS server for load balancing in production. Unfortunately the application is using 40% or more of the server's memory.
"stats": {
"downloader/request_bytes": 12033,
"downloader/request_count": 5,
"downloader/request_method_count/GET": 4,
"downloader/request_method_count/POST": 1,
"downloader/response_bytes": 20165,
"downloader/response_count": 5,
"downloader/response_status_count/200": 3,
"downloader/response_status_count/302": 1,
"downloader/response_status_count/404": 1,
"finish_reason": "finished",
"finish_time": "2019-05-23 06:05:04",
"item_scraped_count": 1,
"log_count/DEBUG": 35,
"log_count/INFO": 20,
"memusage/max": 3399057408,
"memusage/startup": 3399057408,
"request_depth_max": 2,
"response_received_count": 4,
"scheduler/dequeued": 4,
"scheduler/dequeued/memory": 4,
"scheduler/enqueued": 4,
"scheduler/enqueued/memory": 4,
"start_time": "2019-05-23 06:05:01"
}
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
14500 root 20 0 4999116 3.190g 7184 S 0.3 40.9 103:34.01 scrapyrt
I followed the Scrapy memory leak documentation and removed the meta attribute from the request, but memory is still increasing.
class GarudaRetrieveBookingSpider(Spider):
"""get the pax, flight and fare details"""
name = "garuda_retrieve_booking"
meta = dict()
formdata = dict()
start_url = ''
booking_code = ''
output_dict = {'schedule_detail': [], 'pax_details': [], 'reservation_name': '', 'fare_details': {}}
pax_count = 0
adult_count = 0
child_count = 0
infant_count = 0
ticket_list_adt_child = []
ticket_list_inf = []
# this variable is created to save rt command response data to pass in next call if there are no tickets
rt_response = ''
def start_requests(self):
"""
:return: Request object
"""
post_data = self.data
garuda_session_id = post_data['parameter']['jSessionId']
post_data["command"] = "IG"
file_path = os.path.join(GARUDA_SESSION_FILES_PATH, TODAY_DATE, garuda_session_id + "_session.txt")
session_data = get_cookies(self, file_path)
self.start_url = GARUDA_KEEP_ALIVE_URL.format(session_id=session_data["jSessionId"], site=SITE, lang=LANGUAGE)
self.meta = {"session_data": session_data, "post_data": post_data}
return [Request(self.start_url, self.parse, errback=self.errback_httpbin)]
def parse(self, response):
"""
:param response:
:return: FormRequest
description: submit IG command
"""
self.log("\n\nparse response: {}\n\n".format(response.text))
if response.status != 200:
error_message = 'parse method failed.'
return {"status": False, "error_message": error_message}
session_data = self.meta["session_data"]
command = self.meta["post_data"]["command"]
# override the command with current command
session_data["tasks"][0]["command"]["command"] = command
self.formdata = {
"data": json.dumps(session_data)
}
yield scrapy.FormRequest(CRYTO_COMMAND_URL, formdata=self.formdata,
callback=self.ig_command_response, errback=self.errback_httpbin)
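One thing worth ruling out, offered as a sketch rather than a confirmed fix: meta, formdata, output_dict and the ticket lists are class-level attributes, so they are shared by every spider instance scrapyrt creates in the same long-running process, and anything mutated in place (for example appending to output_dict['pax_details']) accumulates on the class and is never freed between API calls. Moving the state into __init__ keeps it per-crawl:
class GarudaRetrieveBookingSpider(Spider):
    name = "garuda_retrieve_booking"

    def __init__(self, *args, **kwargs):
        super(GarudaRetrieveBookingSpider, self).__init__(*args, **kwargs)
        # Per-instance state: created fresh for each crawl scrapyrt starts and
        # garbage-collected with the spider, instead of living on the class.
        self.meta = {}
        self.formdata = {}
        self.output_dict = {'schedule_detail': [], 'pax_details': [],
                            'reservation_name': '', 'fare_details': {}}
        self.ticket_list_adt_child = []
        self.ticket_list_inf = []
        self.rt_response = ''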
I am trying to crawl some data as a side project, but I am having trouble gathering it. I have been trying for two days without much luck.
First problem:
When I crawl the posts from the main page, I get a wrong token.
Second problem:
I have read the Scrapy docs on Request and tried to use them to get the phone number, but in vain; I also tried this answer on Stack Overflow.
Third problem:
How would I go about implementing the next page (see the commented-out code inside gumtree.py)?
Fourth problem:
I am now able to get the phone numbers, but I am getting repeated requests to the same URL with different values [see results].
I would really appreciate it if anyone could give me a direction.
My main goal is to crawl posts that have phone numbers.
I have searched Stack Overflow but couldn't find the proper post.
Many thanks
settings.py
BOT_NAME = 'crawler'
SPIDER_MODULES = ['crawler.spiders']
NEWSPIDER_MODULE = 'crawler.spiders'
USER_AGENT = "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36"
TELNETCONSOLE_ENABLED = False
gumtree.py [UPDATED]
# -*- coding: utf-8 -*-
import re
import json
import scrapy
from scrapy import Request, Item, Field, Selector
def complete_link(string):
return string
class MyItem(Item):
token = Field()
post_id = Field()
post_url = Field()
phone_num = Field()
phone_url = Field()
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = MyItem()
for href in response.css('a.listing-link::attr(href)').extract():
domain = 'https://www.gumtree.com' + href
request = Request(domain, callback=self.parse_post, meta={'domain':domain,'item':item})
yield request
# next_page = response.css('li.pagination-next a::attr("href")').extract_first()
# if next_page is not None:
# next_page = response.urljoin(next_page)
# yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_url'] = response.meta['domain']
post_id = re.match('.*?([0-9]+)$', item['post_url'])
if post_id:
item['post_id'] = post_id.group(1)
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
request = Request('https://www.gumtree.com/ajax/account/seller/reveal/number/' + item['post_id'], headers={'X-GUMTREE-TOKEN':item['token']}, callback=self.parse_phone, meta={'item':item})
yield request
def parse_phone(self, response):
item = response.meta['item']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = phone['data']
return item
results: [scrapy crawl gumtree -o ..\result.json]
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "004407488470949"},
{"post_url": "https://www.gumtree.com/p/ford/ford-galaxy-2.0-tdci-auto-titanium-7-seater-full-service-history-alloys/1214586540", "post_id": "1214586540", "token": "eyJhbGciOiJIUzI1NiJ9.eyJuYmYiOjE0ODYyMjgwMTUsImlzcyI6Imh0dHBzOlwvXC93d3cuZ3VtdHJlZS5jb20iLCJleHAiOjE0ODYyNDk2MTQsImlhdCI6MTQ4NjIyODAxNCwiYWR2ZXJ0X2lkIjoxMjE0NTg2NTQwfQ.Lv0aCIKHo_2DbTcIw7RvE535PFAD5OX16_SFMDz--Cs", "phone_num": "01527853397"},
Have you checked that meta['item'] is actually being passed to parse_token()?
I'd do the following:
meta = { 'item': item }
request = Request(response.urljoin(href), meta=meta, callback=self.parse_token)
yield request
I have found the solution.
# -*- coding: utf-8 -*-
import re, json, scrapy
from crawler.items import CrawlerItem
from scrapy import Request, Item, Field, Selector
gumtree = 'https://www.gumtree.com'
getphone = 'https://www.gumtree.com/ajax/account/seller/reveal/number/'
class GumtreeSpider(scrapy.Spider):
name = "gumtree"
allowed_domains = ["gumtree.com"]
start_urls = [
'https://www.gumtree.com/search?search_category=cars',
]
def parse(self, response):
item = CrawlerItem()
pid = []
arr_url = []
for href in response.css('a.listing-link::attr(href)').extract():
if len(href) > 0:
post_id = u''.join(href).encode('utf-8').strip()
post_id = re.match('.*?([0-9]+)$', post_id)
if post_id:
pid.append(post_id.group(1))
domain = gumtree + href
arr_url.append(domain)
i = 0
while i < len(arr_url):
url = u''.join(arr_url[i]).encode('utf-8').strip()
request = Request(url, callback=self.parse_post, meta={'url':url,'item':item,'pid':pid[i]}, headers={'Referer':gumtree})
i += 1
yield request
next_page = response.css('li.pagination-next a::attr("href")').extract_first()
if next_page is not None:
next_page = response.urljoin(next_page)
yield Request(next_page, callback=self.parse)
def parse_post(self, response):
item = response.meta['item']
item['post_id'] = response.meta['pid']
item['post_url'] = response.meta['url']
token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
arr_token = re.findall(r'"([^"]*)"', str(token))
if len(arr_token) == 15:
item['token'] = arr_token[-2]
ref = item['post_url']
req = Request(getphone + item['post_id'], callback=self.parse_phone, headers={'X-GUMTREE-TOKEN':item['token'], 'Referer':ref}, meta={'url':response.meta['url'],'item':item})
return req
def parse_phone(self, response):
item = response.meta['item']
item['post_url'] = response.meta['url']
phone = json.loads(response.body_as_unicode())
item['phone_num'] = u''.join(phone['data']).encode('utf-8').strip()
return item
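One hedged tweak to the posted solution, which may also explain the repeated results mentioned in the fourth problem: a single CrawlerItem is created once in parse and shared through meta by every concurrent request, so parallel callbacks can overwrite each other's fields. Creating the item inside parse_post keeps each post's data separate; a sketch, with everything else (imports, getphone, parse, parse_phone) left exactly as above:
def parse_post(self, response):
    item = CrawlerItem()                      # fresh item per post, not shared via meta
    item['post_id'] = response.meta['pid']
    item['post_url'] = response.meta['url']
    token = response.xpath('//script[contains(., "revealSellerTelephoneNumberToken")]').extract()
    arr_token = re.findall(r'"([^"]*)"', str(token))
    if len(arr_token) == 15:
        item['token'] = arr_token[-2]
        return Request(getphone + item['post_id'], callback=self.parse_phone,
                       headers={'X-GUMTREE-TOKEN': item['token'],
                                'Referer': item['post_url']},
                       meta={'url': response.meta['url'], 'item': item})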