Navigation is done by POSTing a filled-in form.
The first scraped result is OK.
def start_requests(self):
for search in self.searchs:
for u in self.start_urls:
self.cursor.execute("UPDATE motcleazloop SET nb=nb+1")
self.conn.commit()
frmdata = {"q": search}
return [FormRequest (url=u, method="POST", formdata=frmdata, callback=self.parse_o)]
def parse_o(self,response):
#count the number of forms in the response
print 'number of forms:'
print len(response.css('form'))
sel = Selector(response)
#Call item function
item = SniffyduckItem()
#Extract items to a array
item['urls'] = sel.xpath('//*[@id=\'links\']/*/*/*/*/@href').extract()
# Process item
yield item
# Call next job of the robot
yield self.loop_a(response)
def loop_a(self,response):
# print response.body
nbforms = len(response.css('form'))
if nbforms == 3:
print "4"
yield FormRequest.from_response (response, formnumber=2, callback=self.parse_o)
elif nbforms == 4:
print "3"
yield FormRequest.from_response (response, formnumber=3, callback=self.parse_o)
else:
# if only 2 forms
exit(0)
Error:
2018-02-24 10:50:06 [scrapy.core.scraper] ERROR: Spider must return
Request, BaseItem, dict or None, got 'list' in
Return FormRequest instead of [FormRequest]; you should delete the [].
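One way to apply that fix, sketched below for illustration (assuming the same self.searchs, self.start_urls, cursor and conn attributes as in the question): yield each FormRequest rather than returning a list, which also lets the loops issue a request for every search/URL combination.
def start_requests(self):
    for search in self.searchs:
        for u in self.start_urls:
            self.cursor.execute("UPDATE motcleazloop SET nb=nb+1")
            self.conn.commit()
            frmdata = {"q": search}
            # yield a single FormRequest, not a list wrapped in []
            yield FormRequest(url=u, method="POST", formdata=frmdata, callback=self.parse_o)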
I got several java.lang.NullPointerException errors when running test.py from the standard package of Qpython 3L scripts.
The test.py file and screenshots of the errors are attached.
When I started to post this, I got the error: "It looks like your post is mostly code; please add some more details."
I cannot add any other comments.
Any idea?
Thank you.
import sys
import time
import types
import androidhelper
try:
import gdata.docs.service
except ImportError:
gdata = None
droid = androidhelper.Android()
def event_loop():
for i in range(10):
time.sleep(1)
droid.eventClearBuffer()
time.sleep(1)
e = droid.eventPoll(1)
if e.result is not None:
return True
return False
def test_imports():
try:
import termios
import bs4 as BeautifulSoup
import pyxmpp2 as xmpp
from xml.dom import minidom
except ImportError:
return False
return True
def test_clipboard():
previous = droid.getClipboard().result
msg = 'Hello, world!'
droid.setClipboard(msg)
echo = droid.getClipboard().result
droid.setClipboard(previous)
return echo == msg
def test_gdata():
if gdata is None:
return False
# Create a client class which will make HTTP requests with Google Docs server.
client = gdata.docs.service.DocsService()
# Authenticate using your Google Docs email address and password.
username = droid.dialogGetInput('Username').result
password = droid.dialogGetPassword('Password', 'For ' + username).result
try:
client.ClientLogin(username, password)
except:
return False
# Query the server for an Atom feed containing a list of your documents.
documents_feed = client.GetDocumentListFeed()
# Loop through the feed and extract each document entry.
return bool(list(documents_feed.entry))
def test_gps():
droid.startLocating()
try:
return event_loop()
finally:
droid.stopLocating()
def test_battery():
droid.batteryStartMonitoring()
time.sleep(1)
try:
return bool(droid.batteryGetStatus())
finally:
droid.batteryStopMonitoring()
def test_sensors():
# Accelerometer, once per second.
droid.startSensingTimed(2, 1000)
try:
return event_loop()
finally:
droid.stopSensing()
def test_speak():
result = droid.ttsSpeak('Hello, world!')
return result.error is None
def test_phone_state():
droid.startTrackingPhoneState()
try:
return event_loop()
finally:
droid.stopTrackingPhoneState()
def test_ringer_silent():
result1 = droid.toggleRingerSilentMode()
result2 = droid.toggleRingerSilentMode()
return result1.error is None and result2.error is None
def test_ringer_volume():
get_result = droid.getRingerVolume()
if get_result.error is not None:
return False
droid.setRingerVolume(0)
set_result = droid.setRingerVolume(get_result.result)
if set_result.error is not None:
return False
return True
def test_get_last_known_location():
result = droid.getLastKnownLocation()
return result.error is None
def test_geocode():
result = droid.geocode(0.0, 0.0, 1)
return result.error is None
def test_make_toast():
result = droid.makeToast('Hello, world!')
return result.error is None
def test_vibrate():
result = droid.vibrate()
return result.error is None
def test_notify():
result = droid.notify('Test Title', 'Hello, world!')
return result.error is None
def test_get_running_packages():
result = droid.getRunningPackages()
return result.error is None
def test_alert_dialog():
title = 'User Interface'
message = 'Welcome to the SL4A integration test.'
droid.dialogCreateAlert(title, message)
droid.dialogSetPositiveButtonText('Continue')
droid.dialogShow()
response = droid.dialogGetResponse().result
return response['which'] == 'positive'
def test_alert_dialog_with_buttons():
title = 'Alert'
message = ('This alert box has 3 buttons and '
'will wait for you to press one.')
droid.dialogCreateAlert(title, message)
droid.dialogSetPositiveButtonText('Yes')
droid.dialogSetNegativeButtonText('No')
droid.dialogSetNeutralButtonText('Cancel')
droid.dialogShow()
response = droid.dialogGetResponse().result
return response['which'] in ('positive', 'negative', 'neutral')
def test_spinner_progress():
title = 'Spinner'
message = 'This is simple spinner progress.'
droid.dialogCreateSpinnerProgress(title, message)
droid.dialogShow()
time.sleep(2)
droid.dialogDismiss()
return True
def test_horizontal_progress():
title = 'Horizontal'
message = 'This is simple horizontal progress.'
droid.dialogCreateHorizontalProgress(title, message, 50)
droid.dialogShow()
for x in range(0, 50):
time.sleep(0.1)
droid.dialogSetCurrentProgress(x)
droid.dialogDismiss()
return True
def test_alert_dialog_with_list():
title = 'Alert'
droid.dialogCreateAlert(title)
droid.dialogSetItems(['foo', 'bar', 'baz'])
droid.dialogShow()
response = droid.dialogGetResponse().result
return True
def test_alert_dialog_with_single_choice_list():
title = 'Alert'
droid.dialogCreateAlert(title)
droid.dialogSetSingleChoiceItems(['foo', 'bar', 'baz'])
droid.dialogSetPositiveButtonText('Yay!')
droid.dialogShow()
response = droid.dialogGetResponse().result
return True
def test_alert_dialog_with_multi_choice_list():
title = 'Alert'
droid.dialogCreateAlert(title)
droid.dialogSetMultiChoiceItems(['foo', 'bar', 'baz'], [])
droid.dialogSetPositiveButtonText('Yay!')
droid.dialogShow()
response = droid.dialogGetResponse().result
return True
def test_wifi():
result1 = droid.toggleWifiState()
result2 = droid.toggleWifiState()
droid.toggleWifiState(True) # Make sure wifi ends up ON, as it interferes with other tests
return result1.error is None and result2.error is None
if __name__ == '__main__':
for name, value in list(globals().items()):
if name.startswith('test_') and isinstance(value, types.FunctionType):
print('Running %s...' % name, end=' ')
sys.stdout.flush()
if value():
print(' PASS')
else:
print(' FAIL')
Screenshot with errors
I'm looking to just count the number of things scraped. I'm new to Python and scraping, just following the example, and want to know how to count the number of times Albert Einstein shows up and print that to a JSON file. I just can't get it to print to a file using print, yield, or return.
import scrapy
class QuotesSpider(scrapy.Spider):
name = "author"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
def parse(self, response):
i=0
for quote in response.css('div.quote'):
author = quote.css("small.author::text").get()
if author == "Albert Einstein":
i+=1
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
I found out how to get to the item_scraped_count that shows up in the log output at the end of the spider.
import scrapy
from scrapy import signals
class CountSpider(scrapy.Spider):
name = 'count'
start_urls = ['https://example.com']
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(CountSpider, cls).from_crawler(crawler, *args, **kwargs)
crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
return spider
def spider_closed(self, spider):
stats = spider.crawler.stats.get_stats()
numcount = str(stats['item_scraped_count'])
Here I can create a csv file with the stats
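For illustration, a minimal sketch of that idea: dump the whole stats dict to a CSV file from spider_closed (the filename spider_stats.csv is just a placeholder):
import csv

def spider_closed(self, spider):
    # write every crawler stat, including item_scraped_count, as a key/value row
    stats = spider.crawler.stats.get_stats()
    with open('spider_stats.csv', 'w') as f:
        writer = csv.writer(f)
        for key, value in stats.items():
            writer.writerow([key, value])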
In Scrapy, requests are made asynchronously, and each request calls back to the parse function independently. Your i variable is not an instance variable, so its scope is limited to each function call.
Even if that weren't the case, the recursive callbacks would reset your counter to 0 each time.
I would suggest you take a look at Scrapy items; at the end of the crawl Scrapy reports a counter with the number of scraped items. That may be overkill, though, if you don't want to store any more information than the number of occurrences of "Albert Einstein".
If that's all you want, you can use a dirtier solution: make your counter an instance variable and have the parse method increment it, like this:
import scrapy
class QuotesSpider(scrapy.Spider):
name = "author"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
counter = 0
def parse(self, response):
for quote in response.css('div.quote'):
author = quote.css("small.author::text").get()
if author == "Albert Einstein":
self.counter += 1
next_page = response.css('li.next a::attr(href)').get()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
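If the final count should also end up in a JSON file, as the question asks, one option is a closed() callback on the spider, which Scrapy calls automatically when the crawl finishes; a minimal sketch (the filename einstein_count.json is just a placeholder):
import json

def closed(self, reason):
    # runs once when the spider finishes; dump the counter to a JSON file
    with open('einstein_count.json', 'w') as f:
        json.dump({'albert_einstein_count': self.counter}, f)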
I am trying to scrape an HTML file that has a JSON object with all the required test case data. Processing of the JSON happens in the "find" and "parseTestCaseDetails" methods, where I iteratively get the test case details, which I finally parse in "findInterestedFields". My requirement is to yield the details of the test cases to a JSON file from the last method called in the hierarchy, i.e. findInterestedFields. Is it possible to achieve this?
Thanks in advance!
import scrapy
import datetime
import json
import re
import collections
import time
import os
import js2xml
from scrapy.selector import HtmlXPathSelector
class AltiplanoAVInprogressTCSpiderTest(scrapy.Spider):
name = "AltiplanoAVInprogressTCSpiderTest"
buildNumber = []
FormatedBuildTime = []
keys = []
testcaseName=''
testcaseSuit=''
testcaseDoc=''
def start_requests(self):
print os.environ["jenkinsdomain"]
urls = [
os.environ["jenkinsdomain"] + "/job/InprogressFlakyAndBlockedTestCaseDetails/lastSuccessfulBuild/"
]
for url in urls:
print url
yield scrapy.Request(url=url, callback=self.parse, errback=self.parseerror)
def parseerror(self, failure):
print failure
def parse(self, response):
hxs = HtmlXPathSelector(response)
buildNumberString = hxs.select('normalize-space(//*[@id="main-panel"]/h1/text())').extract_first()
self.buildNumber = buildNumberString.split("#")[-1].split("(")[0].strip()
buildTimeUnformatd = buildNumberString.split("(")[-1].split(")")[0].strip().replace("PM", "").replace("AM", "")
buildTimeUnformatd = buildTimeUnformatd.strip()
t = time.strptime(buildTimeUnformatd, "%b %d, %Y %I:%M:%S")
self.FormatedBuildTime = time.strftime('%d %b %y %H:%M IST', t)
static_testDetailsUrl = os.environ[
"jenkinsdomain"] + "XX/Inprogress-ANV.html"
yield scrapy.Request(url=static_testDetailsUrl, callback=self.parseTestDetails, errback=self.parseerror)
def find(self, key, dictionary):
for k, v in dictionary.iteritems():
if k == key:
yield v
self.parseTestCaseDetails(v)
elif isinstance(v, dict):
for result in self.find(key, v):
yield result
elif isinstance(v, list):
for d in v:
for result in self.find(key, d):
yield result
def parseTestCaseDetails(self, testcases):
# get the list of test cases and again parse them one by one to get the name and other fields
for testcase in testcases:
self.findInterestedFields(testcase)
def findInterestedFields(self, dictionary):
jsonLoad = json.dumps(dictionary, indent=4)
loaded_json = json.loads(jsonLoad)
readabledatetime = datetime.datetime.now().strftime("%d %b %y %H:%M IST")
for k, v in loaded_json.iteritems():
if k == "name":
testcaseName = v
if k == "fullName":
testcaseSuit = v
if k == "doc":
testcaseDoc = v
yield {"name":"avInprogresstestcases",'avInprogresstestcases':{
'testcaseName':testcaseName
}}
def parseTestDetails(self, response):
data = response.xpath('//script[4]/text()').extract_first().strip()
jstree = js2xml.parse(data)
testDetailsInJson = js2xml.jsonlike.getall(jstree)
jsonLoad = json.dumps(testDetailsInJson[0], indent=4)
loaded_json = json.loads(jsonLoad)
list(self.find('tests', loaded_json))
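A minimal sketch of one way to achieve this, assuming the method names and JSON structure above are kept: turn every helper into a generator and have each caller re-yield its results, so the dicts produced in findInterestedFields flow back through find and parseTestDetails to Scrapy (running the spider with scrapy crawl AltiplanoAVInprogressTCSpiderTest -o output.json would then write them to a JSON file).
def parseTestDetails(self, response):
    data = response.xpath('//script[4]/text()').extract_first().strip()
    jstree = js2xml.parse(data)
    testDetailsInJson = js2xml.jsonlike.getall(jstree)
    loaded_json = json.loads(json.dumps(testDetailsInJson[0]))
    # re-yield every item produced further down the call chain
    for item in self.find('tests', loaded_json):
        yield item

def find(self, key, dictionary):
    for k, v in dictionary.iteritems():
        if k == key:
            for item in self.parseTestCaseDetails(v):
                yield item
        elif isinstance(v, dict):
            for item in self.find(key, v):
                yield item
        elif isinstance(v, list):
            for d in v:
                for item in self.find(key, d):
                    yield item

def parseTestCaseDetails(self, testcases):
    # each test case yields one item from findInterestedFields
    for testcase in testcases:
        for item in self.findInterestedFields(testcase):
            yield item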
For my job, I built a scrapy spider to quickly check in on ~200-500 website landing pages for clues that the pages are not functioning, outside of just 400-style errors. (e.g. check for the presence of "out of stock" on page.) This check happens across approx. 30 different websites under my purview, all of them using the same page structure.
This has worked fine, every day, for 4 months.
Then, suddenly, and without change to the code, I started getting unpredictable errors, about 4 weeks ago:
url_title = response.css("title::text").extract_first()
AttributeError: 'Response' object has no attribute 'css'
If I run this spider, this error will occur with, say... 3 out of 400 pages.
Then, if I immediately run the spider again, those same 3 pages are scraped just fine without error, and 4 totally different pages will return the same error.
Furthermore, if I run the EXACT same spider as below, but replace mapping with just these 7 erroneous landing pages, they are scraped perfectly fine.
Is there something in my code that's not quite right??
I'm going to attach the whole code - sorry in advance!! - I just fear that something I might deem as superfluous may in fact be the cause. So this is the whole thing, but with sensitive data replaced with ####.
I've checked all of the affected pages, and of course the css is valid, and the title is always present.
I've done sudo apt-get update & sudo apt-get dist-upgrade on the server running scrapy, in hopes that this would help. No luck.
import scrapy
from scrapy import signals
from sqlalchemy.orm import sessionmaker
from datetime import date, datetime, timedelta
from scrapy.http.request import Request
from w3lib.url import safe_download_url
from sqlalchemy import and_, or_, not_
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEText import MIMEText
from sqlalchemy.engine import create_engine
engine = create_engine('mysql://######:#######localhost/LandingPages', pool_recycle=3600, echo=False)
#conn = engine.connect()
from LandingPageVerifier.models import LandingPagesFacebook, LandingPagesGoogle, LandingPagesSimplifi, LandingPagesScrapeLog, LandingPagesScrapeResults
Session = sessionmaker(bind=engine)
session = Session()
# today = datetime.now().strftime("%Y-%m-%d")
# thisyear = datetime.now().strftime("%Y")
# thismonth = datetime.now().strftime("%m")
# thisday = datetime.now().strftime("%d")
# start = date(year=2019,month=04,day=09)
todays_datetime = datetime(datetime.today().year, datetime.today().month, datetime.today().day)
print todays_datetime
landingpages_today_fb = session.query(LandingPagesFacebook).filter(LandingPagesFacebook.created_on >= todays_datetime).all()
landingpages_today_google = session.query(LandingPagesGoogle).filter(LandingPagesGoogle.created_on >= todays_datetime).all()
landingpages_today_simplifi = session.query(LandingPagesSimplifi).filter(LandingPagesSimplifi.created_on >= todays_datetime).all()
session.close()
#Mix 'em together!
landingpages_today = landingpages_today_fb + landingpages_today_google + landingpages_today_simplifi
#landingpages_today = landingpages_today_fb
#Do some iterating and formatting work
landingpages_today = [(u.ad_url_full, u.client_id) for u in landingpages_today]
#print landingpages_today
landingpages_today = list(set(landingpages_today))
#print 'Unique pages: '
#print landingpages_today
# unique_landingpages = [(u[0]) for u in landingpages_today]
# unique_landingpage_client = [(u[1]) for u in landingpages_today]
# print 'Pages----->', len(unique_landingpages)
class LandingPage004Spider(scrapy.Spider):
name='LandingPage004Spider'
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(LandingPage004Spider, cls).from_crawler(crawler, *args, **kwargs)
#crawler.signals.connect(spider.spider_opened, signals.spider_opened)
crawler.signals.connect(spider.spider_closed, signals.spider_closed)
return spider
def spider_closed(self, spider):
#stats = spider.crawler.stats.get_stats()
stats = spider.crawler.stats.get_value('item_scraped_count'),
Session = sessionmaker(bind=engine)
session = Session()
logitem = LandingPagesScrapeLog(scrape_count = spider.crawler.stats.get_value('item_scraped_count'),
is200 = spider.crawler.stats.get_value('downloader/response_status_count/200'),
is400 = spider.crawler.stats.get_value('downloader/response_status_count/400'),
is403 = spider.crawler.stats.get_value('downloader/response_status_count/403'),
is404 = spider.crawler.stats.get_value('downloader/response_status_count/404'),
is500 = spider.crawler.stats.get_value('downloader/response_status_count/500'),
scrapy_errors = spider.crawler.stats.get_value('log_count/ERROR'),
scrapy_criticals = spider.crawler.stats.get_value('log_count/CRITICAL'),
)
session.add(logitem)
session.commit()
session.close()
#mapping = landingpages_today
handle_httpstatus_list = [200, 302, 404, 400, 500]
start_urls = []
def start_requests(self):
for url, client_id in self.mapping:
yield Request(url, callback=self.parse, meta={'client_id': client_id})
def parse(self, response):
##DEBUG - return all scraped data
#wholepage = response.body.lower()
url = response.url
if 'redirect_urls' in response.request.meta:
redirecturl = response.request.meta['redirect_urls'][0]
if 'utm.pag.ca' in redirecturl:
url_shortener = response.request.meta['redirect_urls'][0]
else:
url_shortener = 'None'
else:
url_shortener = 'None'
client_id = response.meta['client_id']
url_title = response.css("title::text").extract_first()
# pagesize = len(response.xpath('//*[not(descendant-or-self::script)]'))
pagesize = len(response.body)
HTTP_code = response.status
####ERROR CHECK: Small page size
if 'instapage' in response.body.lower():
if pagesize <= 20000:
err_small = 1
else:
err_small = 0
else:
if pagesize <= 35000:
err_small = 1
else:
err_small = 0
####ERROR CHECK: Page contains the phrase 'not found'
if 'not found' in response.xpath('//*[not(descendant-or-self::script)]').extract_first().lower():
#their sites are full of HTML errors, making scrapy unable to notice what is and is not inside a script element
if 'dealerinspire' in response.body.lower():
err_has_not_found = 0
else:
err_has_not_found = 1
else:
err_has_not_found = 0
####ERROR CHECK: Page cotains the phrase 'can't be found'
if "can't be found" in response.xpath('//*[not(self::script)]').extract_first().lower():
err_has_cantbefound = 1
else:
err_has_cantbefound = 0
####ERROR CHECK: Page contains the phrase 'unable to locate'
if 'unable to locate' in response.body.lower():
err_has_unabletolocate = 1
else:
err_has_unabletolocate = 0
####ERROR CHECK: Page contains phrase 'no longer available'
if 'no longer available' in response.body.lower():
err_has_nolongeravailable = 1
else:
err_has_nolongeravailable = 0
####ERROR CHECK: Page contains phrase 'no service specials'
if 'no service specials' in response.body.lower():
err_has_noservicespecials = 1
else:
err_has_noservicespecials = 0
####ERROR CHECK: Page contains phrase 'Sorry, no' to match zero inventory for a search, which normally says "Sorry, no items matching your request were found."
if 'sorry, no ' in response.body.lower():
err_has_sorryno = 1
else:
err_has_sorryno = 0
yield {'client_id': client_id, 'url': url, 'url_shortener': url_shortener, 'url_title': url_title, "pagesize": pagesize, "HTTP_code": HTTP_code, "err_small": err_small, 'err_has_not_found': err_has_not_found, 'err_has_cantbefound': err_has_cantbefound, 'err_has_unabletolocate': err_has_unabletolocate, 'err_has_nolongeravailable': err_has_nolongeravailable, 'err_has_noservicespecials': err_has_noservicespecials, 'err_has_sorryno': err_has_sorryno}
#E-mail settings
def sendmail(recipients,subject,body):
fromaddr = "#######"
toaddr = recipients
msg = MIMEMultipart()
msg['From'] = fromaddr
msg['Subject'] = subject
body = body
msg.attach(MIMEText(body, 'html'))
server = smtplib.SMTP('########')
server.starttls()
server.login(fromaddr, "##########")
text = msg.as_string()
server.sendmail(fromaddr, recipients, text)
server.quit()
The expected result is a perfect scrape, with no errors.
The actual results are unpredictable AttributeErrors, claiming that attribute 'css' can't be found on some pages. But if I scrape those pages individually, using the same script, they scrape just fine.
Sometimes Scrapy can't parse the HTML because of markup errors; that's why you can't call response.css(). You can catch these events in your code and analyze the broken HTML:
def parse(self, response):
try:
....
your code
.....
except:
with open("Error.htm", "w") as f:
f.write(response.body)
UPDATE: You can try to check for an empty response:
def parse(self, response):
if not response.body:
yield scrapy.Request(url=response.url, callback=self.parse, meta={'client_id': response.meta["client_id"]})
# your original code
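One caveat with retrying the same URL this way: Scrapy's duplicate filter normally drops requests for URLs it has already seen, so the retry may need dont_filter=True, roughly like this:
def parse(self, response):
    if not response.body:
        # dont_filter=True stops the duplicate filter from silently dropping the retry
        yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True,
                             meta={'client_id': response.meta['client_id']})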
I want to write a chat demo with Tornado and Redis. I use Redis subscribe, but what I wrote does not work. When I run the code, iTerm outputs:
listening 8000
GroupChat here
getMsg here
None
None
And when I PUBLISH testc helloword in redis-cli, iTerm outputs:
[I 150401 18:30:57 web:1825] 304 GET /groupchat?key=testc (127.0.0.1) 2.40ms
Message(kind=u'message', channel=u'testc', body=u'helloword', pattern=u'testc')
I just want to get the Message in GroupChat.get, but I get None. Can anyone help me?
The GroupChat code is here:
class GroupChat(tornado.web.RequestHandler):
def initialize(self):
print 'GroupChat here'
self.c = tornadoredis.Client(host=CONFIG['REDIS_HOST'], port=CONFIG['REDIS_PORT'], password=CONFIG['REDIS_AUTH'])
self.channelMsgModel = channelMsgModel(self.c)
@tornado.gen.coroutine
def get(self):
try:
key = self.get_argument('key')
info = yield self.channelMsgModel.getMsg(key)
print info
self.finish(info)
except Exception, e:
print e
pass
The channelMsgModel code is here:
import tornado.gen
class channelMsgModel :
timeout = 10
def __init__(self, redisobj):
self.redisobj = redisobj
@tornado.gen.coroutine
def getMsg(self, key):
print 'getMsg here'
yield tornado.gen.Task(self.redisobj.subscribe, key)
info = self.redisobj.listen(self.on_message)
print info
raise tornado.gen.Return(info)
def on_message(self, msg):
if (msg.kind == 'message'):
print msg
return msg
elif (msg.kind == 'unsubscribe'):
self.redisobj.disconnect()
# raise tornado.gen.Return(False)
Use a toro.Queue (which will be included in Tornado itself in the upcoming version 4.2):
import toro
import tornado.gen
from tornado import gen

class channelMsgModel:
def __init__(self, redisobj):
self.redisobj = redisobj
self.queue = toro.Queue()
@gen.coroutine
def getMsg(self, key):
yield gen.Task(self.redisobj.subscribe, key)
self.redisobj.listen(self.on_message)
info = yield self.queue.get()
raise tornado.gen.Return(info)
def on_message(self, msg):
if (msg.kind == 'message'):
self.queue.put_nowait(msg)
elif (msg.kind == 'unsubscribe'):
self.redisobj.disconnect()
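For Tornado 4.2 or later, a roughly equivalent sketch that uses the built-in tornado.queues.Queue instead of toro; the class mirrors the one above and assumes the same tornadoredis client:
from tornado import gen
from tornado.queues import Queue

class channelMsgModel:
    def __init__(self, redisobj):
        self.redisobj = redisobj
        self.queue = Queue()

    @gen.coroutine
    def getMsg(self, key):
        yield gen.Task(self.redisobj.subscribe, key)
        self.redisobj.listen(self.on_message)
        # block (cooperatively) until on_message enqueues a published message
        info = yield self.queue.get()
        raise gen.Return(info)

    def on_message(self, msg):
        if msg.kind == 'message':
            self.queue.put_nowait(msg)
        elif msg.kind == 'unsubscribe':
            self.redisobj.disconnect()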