Optimising the speed of a scrapy spider - scrapy

I have a Scrapy project where I am scraping cryptocurrency prices. I ran my spider, but it takes 4 minutes to crawl the first page of the website. The website has 132 pages, which implies that my spider will take approximately 8 hours to scrape all 132 pages. Is there a way I can optimise my spider to run faster? I tried enabling the CONCURRENT_REQUESTS setting, but it does not help. My code for the spider is shown below.
import scrapy
from ..items import CryptocurrencyItem
from scrapy.contracts import Contract
from scrapy.exceptions import ContractFail
class UrlSpider(scrapy.Spider):
    """Follow every coin linked from the CoinGecko front page.

    ``parse`` schedules one request per coin link found in the listing
    table; ``parseInnerPage`` emits one ``CryptocurrencyItem`` per coin page.
    """

    name = "getUrls"
    start_urls = ['https://www.coingecko.com']
    custom_settings = {'ITEM_PIPELINES': {'cryptocurrency.pipelines.CryptocurrencyPipeline': 300}}

    # CSS path to the coin links in the front-page table.
    COIN_LINK_CSS = (
        "body > div.container > div.gecko-table-container > div.coingecko-table "
        "> div.position-relative > div > table > tbody > tr:nth-child(n+1) "
        "> td.py-0.coin-name.cg-sticky-col.cg-sticky-third-col.px-0 "
        "> div > div.tw-flex-auto > a::attr('href')"
    )
    # Raw strings: the ``\:`` escapes are part of the CSS selector and must
    # reach the selector engine untouched (and without invalid-escape warnings).
    COIN_NAME_CSS = (
        r"body > div.container > div.tw-grid.tw-grid-cols-1.lg\:tw-grid-cols-3.tw-mb-4 > "
        r"div.tw-col-span-3.md\:tw-col-span-2 > div > div.tw-col-span-2.md\:tw-col-span-2 > "
        r"div.tw-flex.tw-text-gray-900.dark\:tw-text-white.tw-mt-2.tw-items-center > div::text"
    )

    def parse(self, response):
        """ Contract to check presence of fields in scraped items
        @scrapes name rootURL historicalData market
        """
        for href in response.css(self.COIN_LINK_CSS).getall():
            yield scrapy.Request(url=response.urljoin(href), callback=self.parseInnerPage)
        #nextPage = response.css("body > div.container > div.gecko-table-container > div.coingecko-table > div.row.no-gutters.tw-flex.flex-column.flex-lg-row.tw-justify-center.mt-2 > nav > ul > li.page-item.next > a::attr('href')").get()
        #if nextPage is not None:
        #    nextPage = response.urljoin(nextPage)
        #    yield scrapy.Request(url=nextPage, callback=self.parse)

    @staticmethod
    def _absolute_or_none(response, href):
        """Return *href* resolved against the page URL, or None if it is missing."""
        # urljoin(None) would silently return the page URL itself.
        return response.urljoin(href) if href else None

    def parseInnerPage(self, response):
        """Build one item (name, root URL, historical-data URL, markets URL) per coin page."""
        items = CryptocurrencyItem()

        raw_name = response.css(self.COIN_NAME_CSS).get()
        if raw_name is None:
            # Layout changed or page not fully served: record the gap instead of crashing.
            self.logger.warning("No coin name found on %s", response.url)
            name = None
        else:
            lines = raw_name.split("\n")
            # The name normally sits on the second line of the text node.
            name = lines[1] if len(lines) > 1 else raw_name.strip()

        historicalData_raw = response.css("#navigationTab > li:nth-child(4) > a::attr('href')").get()
        market_raw = response.css("#navigationTabMarketsChoice::attr('href')").get()

        items['name'] = name
        items['rootURL'] = response.url
        items['historicalData'] = self._absolute_or_none(response, historicalData_raw)
        items['market'] = self._absolute_or_none(response, market_raw)
        yield items

    def close(self, reason):
        """Log the total run time when both timing stats are available."""
        start_time = self.crawler.stats.get_value('start_time')
        finish_time = self.crawler.stats.get_value('finish_time')
        # finish_time may not be recorded yet when this handler runs.
        if start_time is None or finish_time is None:
            self.logger.info("Total run time unavailable (reason: %s)", reason)
            return
        self.logger.info("Total run time: %s", finish_time - start_time)

Related

Scrapy Item Pipeline does not process items to sqlite

I use Scrapy to get data from a website (www.imensa.de) and I need to persist those data/items into a SQLite3 database. The spider works fine, but I think there's something wrong with the pipeline.
Within the pipeline, the new .db file is created properly with the specified tables and columns, but it is not getting populated.
Spider.py
# Import library
from __future__ import absolute_import
import scrapy
from scrapy.loader import Item
from scrapy.loader import ItemLoader
from ..items import Mensen_Table
from ..items import Mensa_Meal
import datetime
from datetime import date
from datetime import timedelta
# A ``global`` statement at module level is a no-op, so the shared names are
# simply defined here instead.
# Root of the site; relative hrefs are appended to it to build absolute links.
base_url = 'https://www.imensa.de/'
meal_plan_day_list = []
# Create Spider class
class mensen_crawler(scrapy.Spider):
    """Crawl imensa.de: state -> city -> canteen -> daily meal plan.

    Yields one ``Mensen_Table`` item per canteen page and one ``Mensa_Meal``
    item per meal found on each of the canteen's day pages.
    """

    # Name of spider
    name = 'mensen'
    # Website to scrape
    allowed_domains = ['imensa.de']
    start_urls = ['https://www.imensa.de/']
    # Root used to turn the site's relative hrefs into absolute links
    base_url = 'https://www.imensa.de/'
    # Only the first 16 links of the start page are German states
    german_state_count = 16

    # STATE_NAME / STATE_LINK
    def parse(self, response):
        """Schedule one request per German state listed on the start page."""
        names = response.css('.group a::text')
        links = response.css('.group a::attr(href)')
        for index, (name, link) in enumerate(zip(names, links)):
            if index >= self.german_state_count:
                break
            state_name = name.get()
            state_link = self.base_url + link.get()
            yield scrapy.Request(
                state_link,
                callback=self.parse_layer2,
                cb_kwargs={'state_name': state_name, 'state_link': state_link},
            )

    # CITY_NAME / CITY_LINK
    def parse_layer2(self, response, state_name, state_link):
        """Schedule one request per city listed on a state page."""
        names = response.css('.group a::text')
        links = response.css('.group a::attr(href)')
        for city, link in zip(names, links):
            city_name = city.get()
            self.logger.debug('current_city: %s (state: %s)', city_name, state_name)
            # Only the first path segment identifies the city.
            city_link_part = link.get().split('/')[0]
            self.logger.debug('city_link_part: %s', city_link_part)
            city_link_real = self.base_url + city_link_part + '/index.html'
            yield scrapy.Request(
                city_link_real,
                callback=self.parse_layer3,
                cb_kwargs={
                    'state_name': state_name,
                    'state_link': state_link,
                    'city_name': city_name,
                    'city_link': city_link_real,
                },
            )

    # MENSA_NAME/MENSA_LINK
    def parse_layer3(self, response, state_name, state_link, city_name, city_link):
        """Schedule one request per canteen listed on a city page."""
        city_slug = city_link.replace(self.base_url, '').split('/')[0]
        for group in response.css('.group'):
            uni_name = group.css('h2::text').get()
            self.logger.debug('UNI_NAME: %s', uni_name)
            for mensa in group.css('.element'):
                mensa_name = mensa.css('a::text').get()
                href = mensa.css('a::attr(href)').get()
                if href is None:
                    # No link to follow; concatenating None would raise TypeError.
                    self.logger.warning('No link for mensa %r in %s', mensa_name, city_name)
                    continue
                mensa_link = self.base_url + city_slug + '/' + href
                self.logger.debug('mensa_link: %s', mensa_link)
                yield scrapy.Request(
                    mensa_link,
                    callback=self.parse_layer4,
                    cb_kwargs={
                        'state_name': state_name,
                        'state_link': state_link,
                        'city_name': city_name,
                        'city_link': city_link,
                        'uni_name': uni_name,
                        'mensa_name': mensa_name,
                        'mensa_link': mensa_link,
                    },
                )

    # CREATE MENSA ITEM
    def parse_layer4(self, response, state_name, state_link, city_name, city_link, uni_name, mensa_name, mensa_link):
        """Emit the canteen item, then schedule one request per meal-plan day."""
        loader = ItemLoader(item=Mensen_Table(), response=response)

        # .get() returns None instead of raising, so the fallback has to be
        # passed as ``default`` (a try/except around it never triggers).
        rating_avg = response.css('.aw-ratings-average::text').get(default=0)
        rating_count = response.css('.aw-ratings-count::text').get(default=0)
        mensa_location = ', '.join(response.css('a.panel-body::text').getall())
        self.logger.debug('mensa_location: %s', mensa_location)
        self.logger.debug('rating_avg: %s, rating_count: %s', rating_avg, rating_count)

        loader.add_value('state_name', state_name)
        loader.add_value('state_link', state_link)
        loader.add_value('city_name', city_name)
        loader.add_value('city_link', city_link)
        loader.add_value('uni_name', uni_name)
        loader.add_value('mensa_name', mensa_name)
        loader.add_value('mensen_link', mensa_link)
        loader.add_value('mensa_address', mensa_location)
        loader.add_value('mensen_rating_avg', rating_avg)
        loader.add_value('mensen_rating_count', rating_count)
        yield loader.load_item()

        # Only the first list group holds the day navigation.
        day_groups = response.css('.col-md-4.no-padding-xs .list-group')
        if not day_groups:
            return
        day_nav = day_groups[0]
        date_list = day_nav.css('.pull-right::text').getall()
        day_list = day_nav.css('a::text').getall()

        # PROCESS DATE LIST: 'heute'/'morgen' (today/tomorrow) become real dates.
        today = datetime.date.today()
        relative_dates = {
            'heute': today.strftime('%d.%m.%Y'),
            'morgen': (today + datetime.timedelta(days=1)).strftime('%d.%m.%Y'),
        }
        meal_plan_date_list = [relative_dates.get(label, label) for label in date_list]
        self.logger.debug('meal_plan_date_list: %s', meal_plan_date_list)

        # PROCESS LINK LIST: each day page is named after the lower-cased day label.
        link_prefix = mensa_link.replace('index.html', '')
        meal_plan_link_list = [link_prefix + day.lower() + '.html' for day in day_list]
        self.logger.debug('meal_plan_link_list: %s', meal_plan_link_list)

        for plan_date, day, link in zip(meal_plan_date_list, day_list, meal_plan_link_list):
            yield scrapy.Request(
                link,
                callback=self.parse_layer5,
                cb_kwargs={'mensa_name': mensa_name, 'mensa_link': link, 'day': day, 'date': plan_date},
            )

    # PARSE MEAL PLAN
    def parse_layer5(self, response, mensa_name, mensa_link, day, date):
        """Emit one ``Mensa_Meal`` item per meal on a day page."""
        for category in response.css('.aw-meal-category'):
            for meal in category.css('.aw-meal.row.no-margin-xs'):
                loader = ItemLoader(item=Mensa_Meal(), response=response, selector=meal)
                meal_name = meal.css('p.aw-meal-description::text').get()

                raw_price = meal.css('.col-sm-2.no-padding-xs.aw-meal-price::text').get()
                meal_price = 0 if raw_price is None else raw_price.replace('€', '').strip()

                raw_attributes = meal.css('.small.aw-meal-attributes span::text').get()
                meal_attributes = '' if raw_attributes is None else raw_attributes.replace(u'\xa0', u'')

                loader.add_value('mensa_name', mensa_name)
                loader.add_value('date_of_meal_plan', date)
                loader.add_value('meal_name', meal_name)
                loader.add_value('meal_attributes', meal_attributes)
                loader.add_value('meal_price', meal_price)
                yield loader.load_item()
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from itemadapter import ItemAdapter
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from scrapy.item import Item
class Mensen_Table(scrapy.Item):
    """One canteen (Mensa) with its location, hierarchy and rating data.

    Items are filled through an ``ItemLoader``, which collects every value
    into a list. ``TakeFirst`` collapses each field to a single scalar so the
    values can be bound directly as SQL parameters. A field whose only
    values are ``None`` or ``''`` is left unset, so consumers should read
    fields with ``.get()``.
    """

    state_name = scrapy.Field(output_processor=TakeFirst())
    state_link = scrapy.Field(output_processor=TakeFirst())
    city_name = scrapy.Field(output_processor=TakeFirst())
    city_link = scrapy.Field(output_processor=TakeFirst())
    uni_name = scrapy.Field(output_processor=TakeFirst())
    mensa_name = scrapy.Field(output_processor=TakeFirst())
    mensen_link = scrapy.Field(output_processor=TakeFirst())
    mensa_address = scrapy.Field(output_processor=TakeFirst())
    mensen_rating_avg = scrapy.Field(output_processor=TakeFirst())
    mensen_rating_count = scrapy.Field(output_processor=TakeFirst())
    five_star_ratings = scrapy.Field(output_processor=TakeFirst())
    four_star_ratings = scrapy.Field(output_processor=TakeFirst())
    three_star_ratings = scrapy.Field(output_processor=TakeFirst())
    two_star_ratings = scrapy.Field(output_processor=TakeFirst())
    one_star_ratings = scrapy.Field(output_processor=TakeFirst())
class Mensa_Meal(scrapy.Item):
    """One meal offered by a canteen on a given day.

    ``TakeFirst`` turns the list an ``ItemLoader`` collects for each field
    into a single scalar, which is what an SQL parameter binding needs. A
    field whose only values are ``None`` or ``''`` is left unset, so
    consumers should read fields with ``.get()``.
    """

    mensa_name = scrapy.Field(output_processor=TakeFirst())
    date_of_meal_plan = scrapy.Field(output_processor=TakeFirst())
    meal_name = scrapy.Field(output_processor=TakeFirst())
    meal_attributes = scrapy.Field(output_processor=TakeFirst())
    meal_price = scrapy.Field(output_processor=TakeFirst())
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import item
from scrapy.exceptions import DropItem
from itemadapter import ItemAdapter
import sqlite3
# useful for handling different item types with a single interface
from items import Mensen_Table, Mensa_Meal
class IwCrawlerPipeline:
    """Persist ``Mensen_Table`` and ``Mensa_Meal`` items into ``imensa.db``."""

    # Column order shared by the INSERT statements and the row builders.
    MENSEN_COLUMNS = (
        'state_name', 'city_name', 'uni_name', 'mensa_name',
        'mensa_address', 'mensen_rating_avg', 'mensen_rating_count',
    )
    MEAL_COLUMNS = (
        'mensa_name', 'date_of_meal_plan', 'meal_name',
        'meal_attributes', 'meal_price',
    )

    def __init__(self):
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()

    def create_table_mensen(self):
        """Create the canteen table if it does not exist yet."""
        self.cur.execute("""CREATE TABLE IF NOT EXISTS mensen_table (
            state_name TEXT,
            city_name TEXT,
            uni_name TEXT,
            mensa_name TEXT,
            mensa_address LONG,
            mensen_rating_avg FLOAT,
            mensen_rating_count TEXT)
            """)
        print('TABLE MENSEN CREATED')

    def create_table_meal(self):
        """Create the meal table if it does not exist yet."""
        # The statement must run before returning; an early ``return print(...)``
        # left this table uncreated, so every meal INSERT failed.
        self.cur.execute("""CREATE TABLE IF NOT EXISTS meal_table (
            mensa_name TEXT,
            date_of_meal_plan DATE,
            meal_name LONG,
            meal_attributes LONG,
            meal_price FLOAT)
            """)
        print('TABLE MEAL CREATED')

    @staticmethod
    def _scalar(value):
        """Reduce an ItemLoader-style list to its first element (None if empty)."""
        if isinstance(value, (list, tuple)):
            return value[0] if value else None
        return value

    def _row(self, item, columns):
        """Build the parameter tuple for *columns*; missing fields become NULL."""
        return tuple(self._scalar(item.get(column)) for column in columns)

    def process_item(self, item, spider):
        """Insert *item* into its table and always hand it on to the next pipeline."""
        if isinstance(item, Mensen_Table):
            print('MENSEN TABLE ITEM PROCESSED')
            self.cur.execute(
                """INSERT INTO mensen_table (state_name, city_name, uni_name, mensa_name, mensa_address, mensen_rating_avg, mensen_rating_count) VALUES (?, ?, ?, ?, ?, ?, ?)""",
                self._row(item, self.MENSEN_COLUMNS),
            )
            self.con.commit()
        elif isinstance(item, Mensa_Meal):
            print('MEAL TABLE ITEM PROCESSED')
            self.cur.execute(
                """INSERT INTO meal_table (mensa_name, date_of_meal_plan, meal_name, meal_attributes, meal_price) VALUES (?, ?, ?, ?, ?)""",
                self._row(item, self.MEAL_COLUMNS),
            )
            self.con.commit()
        return item

    def close_spider(self, spider):
        """Flush pending writes and release the database connection."""
        self.con.commit()
        self.con.close()
What am I doing wrong? The items get displayed properly, but they don't reach the database file.
Any help would be highly appreciated!
The only problem I see is that you never close the connection to the database, although I am not sure that this will solve your issue. It might though and it's good practice to do anyway.
In your pipeline add close_spider method that closes the connection to the database. This method is triggered automatically by the scrapy engine just before it deletes the spider and closes the twisted reactor.
class IwCrawlerPipeline:
    # Excerpt: only the parts relevant to closing the connection are shown.
    def __init__(self):
        self.con = sqlite3.connect('imensa.db')
        self.cur = self.con.cursor()
        self.create_table_mensen()
        self.create_table_meal()
    ...
    def close_spider(self, spider):
        # Close the SQLite connection that __init__ opened.
        self.con.close()

Scraper not grabbing all text in list/div

My scraper seems to skip some information. For example, I want to extract all the countries and the highway codes belonging to each country. However, when I iterate over the page I only get one country and one highway code for each list of numbered highways.
How do I get all the highway routes and their respective countries?
Here's the scraper that I am working with:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose
from itemloaders import ItemLoader
class roadItem(scrapy.Item):
    # Every field uses TakeFirst as its output processor, so a loaded item
    # carries a single value per field even when several were added.
    highway_names = Field(output_processor = TakeFirst())
    country = Field(output_processor = TakeFirst())
    route_name = Field(output_processor = TakeFirst())
class roadSpider(scrapy.Spider):
    """Walk Wikipedia's "roads sharing the same title" category and load one item per list page."""

    name = "road"
    start_urls = ["https://en.wikipedia.org/w/index.php?title=Category:Lists_of_roads_sharing_the_same_title&pageuntil=092%0AList+of+highways+numbered+92#mw-pages"]

    # XPath expressions used on each road-list page. They are absolute
    # (start with ``//``), so they are evaluated against the whole document.
    COUNTRY_XPATH = "(((//h2)[position() >1]//span)[position() mod 4=1])[position() < last()]//text()"
    COUNTRY_FALLBACK_XPATH = "(//h2//span)[1]//text()"
    ROUTE_XPATH = "(//div[@class='mw-parser-output']//ul)[position() > 1 and position() < last()]//li//@title"
    ROUTE_FALLBACK_XPATH = "//ul//li//@title"

    def start_requests(self):
        """Request each start URL with ``parse`` as the callback."""
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Follow every list page in the category, then the category's next page."""
        list_data = response.xpath('(//div[@class="mw-category-group"][last()])[2]//ul')
        for items in list_data:
            for link in items.xpath(".//a/@href").getall():
                # response.follow resolves relative URLs itself.
                yield response.follow(link, callback=self.parse_roads)

        next_page = response.xpath("//div[@id='mw-pages']/a[1]//@href").get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_roads(self, response):
        """Load the page heading, a country and the route titles into one item."""
        # NOTE(review): every field of roadItem uses TakeFirst, so only the
        # first country and first route survive load_item(); getting all of
        # them needs one item per route or a different output processor.
        loader = ItemLoader(roadItem())
        loader.add_value("highway_names", response.xpath("//h1[@id='firstHeading']//text()").get())

        for content in response.xpath("//div[@id='mw-content-text']"):
            # Evaluate each expression once and reuse the result.
            country = content.xpath(self.COUNTRY_XPATH).get()
            if not country:
                country = content.xpath(self.COUNTRY_FALLBACK_XPATH).get()
            loader.add_value("country", country)

            routes = content.xpath(self.ROUTE_XPATH).getall()
            if not routes:
                routes = content.xpath(self.ROUTE_FALLBACK_XPATH).getall()
            for route in routes:
                loader.add_value('route_name', route)

        yield loader.load_item()

Spider is working perfectly but doesn't scrape some results

The spider works fine overall: there are approximately 208 products, but for some of them it returns no details. I've requested those product links separately in the Scrapy shell and they work fine there, so why does the spider miss about 25% of the product details?
I've tried rotating user-agents, applied different xpaths, but in vain.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from ..items import AmazonItem
import time
from scrapy.linkextractors import LinkExtractor
import urllib.parse
class QuotesSpider(scrapy.Spider):
    """Scrape name, price, Prime flag and sales rank for Moleskine products on amazon.co.uk."""

    name = 'pet'
    start_urls = ['https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&qid=1567115653&rnid=1632651031&ref=sr_nr_p_89_1',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=2',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=3',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=4',
                  'https://www.amazon.co.uk/s?k=moleskine&rh=p_89%3AMoleskine&dc&page=5'
                  ]

    def parse(self, response):
        """Schedule one detail request per product link on a search-result page."""
        links = response.xpath("//h2/a[contains(@href,'/dp')]/@href").extract()
        for link in links:
            yield scrapy.Request(url='https://www.amazon.co.uk' + link, callback=self.parse_details)

    @staticmethod
    def _compact(text):
        """Strip the newlines and spaces Amazon pads price text with."""
        return text.replace('\n', '').replace(' ', '')

    def parse_details(self, response):
        """Build one ``AmazonItem`` from a product page, with placeholders for missing data."""
        # All state is local: module-level globals would be shared between
        # responses and could carry one product's values over to the next.
        item = AmazonItem()

        name = response.xpath(".//*[(@id ='productTitle')]/text()").extract_first()
        if name is None:
            # Typically a captcha or alternate layout was served instead.
            self.logger.info('skip')
        else:
            name = name.replace('\n', '').strip()

        price = response.xpath("//span[@id='price_inside_buybox']/text()").get()
        if price is None:
            # Fall back to the generic price element when there is no buy box.
            price = response.xpath("//span[@class='a-color-price']/text()").get()
        if price is None:
            price = 'No Price Avaiable'
            self.logger.info('skip')
        else:
            price = self._compact(price)

        has_prime = response.xpath("//span[@id='price-shipping-message']/b").get() is not None
        prime = 'Prime' if has_prime else 'Not Prime'

        sales_rank = response.xpath("//tr[@id='SalesRank']/td[@class='value']/text()").get()
        if sales_rank is None:
            sales_rank = 'No Sales Rank Available'
        else:
            sales_rank = sales_rank.replace('(', '').replace('\n', '')

        item['Name'] = name
        item['Price'] = price
        item['SalesRank'] = sales_rank
        item['Prime'] = prime
        item['Url'] = response.url
        yield item
Box 2 contains correct information, but box 1 doesn't have data, even though the data is there if we go to the link.
That's the product name from box 1's URL; it works fine in the Scrapy shell, but not in the spider.
Is there something I am missing?

Why is scrapy suddenly giving me an *unpredictable* AttributeError, stating no attribute 'css'

For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
# Credentials are redacted; the URL form is mysql://user:password@host/database.
engine = create_engine('mysql://######:######@localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# Midnight of the current day, taken from a single clock read so the
# year/month/day parts cannot straddle a date change.
todays_datetime = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
print(todays_datetime)
try:
    landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
    landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
    landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
finally:
    # Release the connection even if one of the queries fails.
    session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
# Reduce to unique (url, client_id) pairs.
landingpages_today = list(set((u.ad_url_full, u.client_id) for u in landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
    """Fetch today's landing pages and flag signs that a page is broken.

    Yields one dict per page with its size, status code and a set of 0/1
    error flags, and writes a summary row to the scrape log when the spider
    closes.
    """

    name = 'LandingPage004Spider'

    # (url, client_id) pairs to check; read by start_requests.
    mapping = landingpages_today
    handle_httpstatus_list = [200, 302, 404, 400, 500]
    start_urls = []

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook ``spider_closed`` to the matching signal."""
        spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
        #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        crawler.signals.connect(spider.spider_closed, signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        """Store the run's counters as one ``LandingPagesScrapeLog`` row."""
        get_stat = spider.crawler.stats.get_value
        session = sessionmaker(bind=engine)()
        try:
            logitem = LandingPagesScrapeLog(
                scrape_count=get_stat('item_scraped_count'),
                is200=get_stat('downloader/response_status_count/200'),
                is400=get_stat('downloader/response_status_count/400'),
                is403=get_stat('downloader/response_status_count/403'),
                is404=get_stat('downloader/response_status_count/404'),
                is500=get_stat('downloader/response_status_count/500'),
                scrapy_errors=get_stat('log_count/ERROR'),
                scrapy_criticals=get_stat('log_count/CRITICAL'),
            )
            session.add(logitem)
            session.commit()
        finally:
            session.close()

    def start_requests(self):
        """Request every mapped URL, carrying its client id in the request meta."""
        for url, client_id in self.mapping:
            yield Request(url, callback=self.parse, meta={'client_id': client_id})

    def parse(self, response):
        """Compute the error flags for one landing page and yield them as a dict."""
        url = response.url

        # Remember the shortener URL when the request was redirected from one.
        redirect_urls = response.request.meta.get('redirect_urls')
        if redirect_urls and 'utm.pag.ca' in redirect_urls[0]:
            url_shortener = redirect_urls[0]
        else:
            url_shortener = 'None'

        client_id = response.meta['client_id']
        pagesize = len(response.body)
        HTTP_code = response.status
        # Byte literals keep the raw-body checks valid on Python 2 and 3.
        body = response.body.lower()

        # Only text responses offer .css()/.xpath(). An empty body or an
        # unrecognised content type yields a plain Response, which is what
        # raises "'Response' object has no attribute 'css'".
        if hasattr(response, 'css'):
            url_title = response.css("title::text").extract_first()
            # extract_first() is None when nothing matches, hence the ``or ''``.
            no_script_markup = (response.xpath('//*[not(descendant-or-self::script)]').extract_first() or '').lower()
            non_script_markup = (response.xpath('//*[not(self::script)]').extract_first() or '').lower()
        else:
            self.logger.warning('Non-text response for %s (status %s, %d bytes)', url, HTTP_code, pagesize)
            url_title = None
            no_script_markup = ''
            non_script_markup = ''

        ####ERROR CHECK: Small page size (instapage pages are smaller by nature)
        size_limit = 20000 if b'instapage' in body else 35000
        err_small = int(pagesize <= size_limit)

        ####ERROR CHECK: Page contains the phrase 'not found'
        #their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
        err_has_not_found = int('not found' in no_script_markup and b'dealerinspire' not in body)

        ####ERROR CHECK: Page contains the phrase 'can't be found'
        err_has_cantbefound = int("can't be found" in non_script_markup)

        ####ERROR CHECK: Page contains the phrase 'unable to locate'
        err_has_unabletolocate = int(b'unable to locate' in body)

        ####ERROR CHECK: Page contains phrase 'no longer available'
        err_has_nolongeravailable = int(b'no longer available' in body)

        ####ERROR CHECK: Page contains phrase 'no service specials'
        err_has_noservicespecials = int(b'no service specials' in body)

        ####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
        err_has_sorryno = int(b'sorry, no ' in body)

        yield {
            'client_id': client_id,
            'url': url,
            'url_shortener': url_shortener,
            'url_title': url_title,
            "pagesize": pagesize,
            "HTTP_code": HTTP_code,
            "err_small": err_small,
            'err_has_not_found': err_has_not_found,
            'err_has_cantbefound': err_has_cantbefound,
            'err_has_unabletolocate': err_has_unabletolocate,
            'err_has_nolongeravailable': err_has_nolongeravailable,
            'err_has_noservicespecials': err_has_noservicespecials,
            'err_has_sorryno': err_has_sorryno,
        }
#E-mail settings
def sendmail(recipients,subject,body):
    """Send *body* as an HTML e-mail with *subject* to *recipients*.

    *recipients* may be a single address or a list/tuple of addresses.
    """
    fromaddr = "#######"
    msg = MIMEMultipart()
    msg['From'] = fromaddr
    # Header value must be a string; smtplib itself accepts the list form.
    if isinstance(recipients, (list, tuple)):
        msg['To'] = ', '.join(recipients)
    else:
        msg['To'] = recipients
    msg['Subject'] = subject
    msg.attach(MIMEText(body, 'html'))
    server = smtplib.SMTP('########')
    try:
        server.starttls()
        server.login(fromaddr, "##########")
        server.sendmail(fromaddr, recipients, msg.as_string())
    finally:
        # Always end the SMTP session, even when login or sending fails.
        server.quit()
`
Expected results is a perfect scrape, with no errors.
Actual results are unpredictable AttributeErrors, claiming that attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse HTML because of markup errors; that's why you can't call response.css(). You can catch these events in your code and analyze the broken HTML:
def parse(self, response):
    """Run the normal parsing code and dump the page body if it fails."""
    try:
        # .... your code .....
        pass
    except Exception:
        # response.body is bytes, so the dump file is opened in binary mode.
        with open("Error.htm", "wb") as f:
            f.write(response.body)
UPDATE You can try to check for empty response:
def parse(self, response):
    """Re-request a page that came back empty, otherwise parse it as usual."""
    if not response.body:
        retries = response.meta.get('empty_retries', 0)
        # Cap the retries so a permanently empty page cannot loop forever.
        if retries < 3:
            yield scrapy.Request(
                url=response.url,
                callback=self.parse,
                meta={'client_id': response.meta["client_id"], 'empty_retries': retries + 1},
                # The URL was already seen, so the duplicate filter would drop the retry.
                dont_filter=True,
            )
        # Nothing to parse in an empty response.
        return
    # your original code

requests + bs4 no results from pages

Here the code that can get info from https://www.gabar.org/membersearchresults.cfm
but cannot from https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1
from bs4 import BeautifulSoup
import requests
import traceback
# Shared accumulators filled while paging through the search results.
links_to_visit, navigation_links = [], []  # navigation_links: for testing next button
# Site root that the relative hrefs are appended to.
base_url = 'https://www.gabar.org'
def make_soup(link):
    """Download *link* and return its HTML parsed with ``html.parser``."""
    response = requests.get(link)
    return BeautifulSoup(response.content, 'html.parser')
def all_results(url):
    """Collect the member-detail and "Next" links from one result page.

    Detail links are appended to ``links_to_visit`` and "Next" links to
    ``navigation_links``. Both lists are only mutated, so no ``global``
    declaration is needed.
    """
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # The site returns a page without results when the search session is missing.
        print('no result container found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        # Anchors without an href would make the ``in`` test raise TypeError.
        href = link.get('href') or ''
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif '/MemberSearchDetail.cfm?ID=' in href:
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)
    #print(links_to_visit[-1].get('href'))
def start():
    """Follow the "Next" link from page to page until none is left.

    ``page`` counts results in steps of 25 and bounds the crawl at 60716.
    """
    page = 1
    while page < 60716:
        # Stop when no "Next" link has been collected at all.
        if not navigation_links or navigation_links[-1].text != 'Next':
            break
        known_links = len(navigation_links)
        next_link = navigation_links[-1]
        #print(next_link.get('href'))
        page += 25
        next_url = base_url + next_link.get('href')
        print(next_url)
        all_results(next_url)
        print('page is:', page)
        if len(navigation_links) == known_links:
            # The page added no new "Next" link: last page reached. Without
            # this check the same URL would be fetched again and again.
            break
# Script entry point: load the first result page, then page through the rest.
if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()
What I need to understand or do if I want to get full result?
What you need to understand is that there is more than a URL to an HTTP-request. In this case, a search result is only available to the session that executed the search and can therefore only be paged through if you are the "owner" of that session. Most websites identify a session using session-cookies that you need to send along with your HTTP-request.
This can be a huge hassle, but luckily Python's requests library takes care of all of that for you with requests.session. Instead of using requests.get(url), you initialize the session with session = requests.session() and then use that session in subsequent requests via session.get(url). This will automagically preserve cookies for you and in many ways behave like an actual browser would.
You can read more about how requests.session works here.
And last but not least, your fixed code =)
from bs4 import BeautifulSoup
import requests
import traceback
# Detail-page links collected by all_results.
links_to_visit = []
navigation_links = [] # for testing next button
# One shared session, so cookies set by the first request are sent with
# every later one.
session = requests.session()
# Site root that the relative hrefs are appended to.
base_url = 'https://www.gabar.org'
def make_soup(link):
    """Fetch *link* through the shared session and return the parsed HTML."""
    # Using the session (not requests.get) preserves cookies across requests.
    response = session.get(link)
    return BeautifulSoup(response.content, 'html.parser')
def all_results(url):
    """Collect the member-detail and "Next" links from one result page.

    Detail links are appended to ``links_to_visit`` and "Next" links to
    ``navigation_links``. Both lists are only mutated, so no ``global``
    declaration is needed.
    """
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    if div is None:
        # Without a valid search session the site serves a page with no results.
        print('no result container found at', url)
        return
    links = div.find_all('a')
    print(links)
    for link in links:
        # Anchors without an href would make the ``in`` test raise TypeError.
        href = link.get('href') or ''
        if link.text == 'Next':  # prev, next, new search
            navigation_links.append(link)
            print('got it')
        elif '/MemberSearchDetail.cfm?ID=' in href:
            links_to_visit.append(link)
    print(len(links_to_visit))
    print(links_to_visit)
    #print(links_to_visit[-1].get('href'))
def start():
    """Follow the "Next" link from page to page until none is left.

    ``page`` counts results in steps of 25 and bounds the crawl at 60716.
    """
    page = 1
    while page < 60716:
        # Stop when no "Next" link has been collected at all.
        if not navigation_links or navigation_links[-1].text != 'Next':
            break
        known_links = len(navigation_links)
        next_link = navigation_links[-1]
        #print(next_link.get('href'))
        page += 25
        next_url = base_url + next_link.get('href')
        print(next_url)
        all_results(next_url)
        print('page is:', page)
        if len(navigation_links) == known_links:
            # The page added no new "Next" link: last page reached. Without
            # this check the same URL would be fetched again and again.
            break
# Script entry point: load the first result page, then page through the rest.
if __name__ == '__main__':
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()