I am testing my web app with The Grinder. I generated the script using TCPProxy. It worked fine, but when I add a logger call to it, the following error is produced:
Error running worker process
net.grinder.scriptengine.jython.JythonScriptExecutionException:
SyntaxError ('invalid syntax', ('c:\\........ \\script_name.py', 79, 9,
"\tlog('Arvind Purohit')"))
(no code object) at line 0
This is my script:
# The Grinder 3.9.1
# HTTP script recorded by TCPProxy at 9 Jul, 2012 3:08:10 PM
from net.grinder.script import Test
from net.grinder.script.Grinder import grinder
from net.grinder.plugin.http import HTTPPluginControl, HTTPRequest
from HTTPClient import NVPair
log = grinder.logger.info
connectionDefaults = HTTPPluginControl.getConnectionDefaults()
httpUtilities = HTTPPluginControl.getHTTPUtilities()
# To use a proxy server, uncomment the next line and set the host and port.
# connectionDefaults.setProxyServer("localhost", 8001)
# These definitions at the top level of the file are evaluated once,
# when the worker process is started.
connectionDefaults.defaultHeaders = \
  [ NVPair('Accept-Encoding', 'gzip, deflate'),
    NVPair('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0)'), ]

headers0= \
  [ NVPair('Accept', '*/*'),
    NVPair('Referer', 'http://192.168.1.53:8081/JSP-LOGIN/login.jsp'),
    NVPair('Accept-Language', 'en-IN'), ]

headers1= \
  [ NVPair('Accept', 'image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*'),
    NVPair('Referer', 'http://192.168.1.53:8081/JSP-LOGIN/login.jsp'),
    NVPair('Accept-Language', 'en-IN'), ]

headers2= \
  [ NVPair('Accept', 'image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*'),
    NVPair('Referer', 'http://192.168.1.53:8081/JSP-LOGIN/welcome.jsp'),
    NVPair('Accept-Language', 'en-IN'), ]
url0 = 'http://192.168.1.53:8081'
# Create an HTTPRequest for each request, then replace the
# reference to the HTTPRequest with an instrumented version.
# You can access the unadorned instance using request101.__target__.
# ========= START -================
request101 = HTTPRequest(url=url0)
request101 = Test(101, 'GET login.jsp').wrap(request101)
request102 = HTTPRequest(url=url0, headers=headers0)
request102 = Test(102, 'GET valid.js').wrap(request102)
request103 = HTTPRequest(url=url0)
request103 = Test(103, 'GET favicon.ico').wrap(request103)
# ====== login=============
request201 = HTTPRequest(url=url0, headers=headers1)
request201 = Test(201, 'POST loginmid.jsp').wrap(request201)
request202 = HTTPRequest(url=url0, headers=headers1)
request202 = Test(202, 'GET welcome.jsp').wrap(request202)
# ==========LOGOUT============
request301 = HTTPRequest(url=url0, headers=headers2)
request301 = Test(301, 'GET logout.jsp').wrap(request301)
request302 = HTTPRequest(url=url0, headers=headers2)
request302 = Test(302, 'GET login.jsp').wrap(request302)
request303 = HTTPRequest(url=url0, headers=headers0)
request303 = Test(303, 'GET valid.js').wrap(request303)
class TestRunner:
    """A TestRunner instance is created for each worker thread."""

    # A method for each recorded page.
    def page1(self):
        """GET login.jsp (requests 101-103)."""
        result = request101.GET('/JSP-LOGIN/login.jsp', None,
            ( NVPair('Accept', 'image/jpeg, application/x-ms-application, image/gif, application/xaml+xml, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*'),
              NVPair('Accept-Language', 'en-IN'), ))

        log('ARVIND PUROHIT')

        grinder.sleep(13)
        request102.GET('/JSP-LOGIN/valid.js')

        grinder.sleep(62)
        request103.GET('/favicon.ico', None,
            ( NVPair('Accept', '*/*'), ))

        return result

    def page2(self):
        """POST loginmid.jsp (requests 201-202)."""
        # Expecting 302 'Moved Temporarily'
        result = request201.POST('/JSP-LOGIN/loginmid.jsp',
            ( NVPair('userName', 'A'),
              NVPair('password', 'A'),
              NVPair('Submit', 'Login'), ),
            ( NVPair('Content-Type', 'application/x-www-form-urlencoded'), ))

        grinder.sleep(15)
        request202.GET('/JSP-LOGIN/welcome.jsp')
        self.token_flag = \
            httpUtilities.valueFromBodyURI('flag') # 'edit'

        return result

    def page3(self):
        """GET logout.jsp (requests 301-303)."""
        # Expecting 302 'Moved Temporarily'
        result = request301.GET('/JSP-LOGIN/logout.jsp')
        request302.GET('/JSP-LOGIN/login.jsp')
        request303.GET('/JSP-LOGIN/valid.js', None,
            ( NVPair('If-Modified-Since', 'Tue, 03 Jul 2012 10:18:40 GMT'),
              NVPair('If-None-Match', 'W/\"4436-1341310720000\"'), ))

        return result

    def __call__(self):
        """Called for every run performed by the worker thread."""
        self.page1()    # GET login.jsp (requests 101-103)
        grinder.sleep(12893)
        self.page2()    # POST loginmid.jsp (requests 201-202)
        grinder.sleep(16403)
        self.page3()    # GET logout.jsp (requests 301-303)


def instrumentMethod(test, method_name, c=TestRunner):
    """Instrument a method with the given Test."""
    unadorned = getattr(c, method_name)
    import new
    method = new.instancemethod(test.wrap(unadorned), None, c)
    setattr(c, method_name, method)

# Replace each method with an instrumented version.
# You can call the unadorned method using self.page1.__target__().
instrumentMethod(Test(100, 'Page 1'), 'page1')
instrumentMethod(Test(200, 'Page 2'), 'page2')
instrumentMethod(Test(300, 'Page 3'), 'page3')
I'm just learning Python (less than a week), but I think it may be an issue with the \t.
Python is reading this:
\tlog('Arvind Purohit')
But instead of a tab, Python is expecting spaces that match the rest of the block's indentation, like this:
    log('Arvind Purohit')
That's likely why it works when you copy/paste it: the tab gets converted to spaces. Make sure you're using an editor that shows whitespace characters and indentation guides so you can spot mismatches like this and avoid the "(no code object) at line 0" errors.
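As a quick check, here is a minimal sketch that lists every line containing a tab character so you can replace them with spaces (the filename script_name.py is an assumption; point it at your actual script):
# Minimal sketch: report the lines of the recorded script that contain tab characters.
# The path is an assumption; adjust it to your script's location.
with open('script_name.py') as f:
    for lineno, line in enumerate(f, start=1):
        if '\t' in line:
            print(lineno, repr(line))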
I am a newbie to Python at best. I have been attempting to write a function that downloads a specific number of images from a desired Google image search into a specific folder in Google Drive, but I have hit a snag that I am unable to fix. Can someone please point out where I am going wrong, or point me in the right direction? I believe the issue is im = requests.get(link) (line 36). So far, I have the following:
# mount the drive
from google.colab import drive
drive.mount('/content/gdrive')
#module import
import requests
from bs4 import BeautifulSoup
#define parameters of search
query = input("Images of:")
print("Number of images:")
NumberOfImages = int(input())
FolderLocation = input("Input Folder Location:")
image_type="ActiOn"
query= query.split()
query='+'.join(query)
url="https://www.google.co.in/search?q="+query+"&source=lnms&tbm=isch"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
# soup
request = requests.get(url,headers=headers)
soup = BeautifulSoup(request.text,'html.parser')
images = soup.find_all('img')
# loop to save
tik = 0
for image in images:
    if tik <= NumberOfImages:
        link = image['src']
        name = query+"_"+str(tik)
        print(link, name)
        with open(FolderLocation+"/"+name+".jpg",'wb') as f:
            im = requests.get(link)
            f.write(im.content)
            print("Writing "+name+ " to file")
        tik +=1
    else:
        break
Is this an issue with requesting the 'src' links from Google, or is there something else I am missing?
Any help would be much appreciated. Thanks.
In order to scrape the full-resolution image URLs with requests and beautifulsoup, you need to scrape the data from the page source (CTRL+U) via regex.
Find all <script> tags:
soup.select('script')
Match images data via regex from the <script> tags:
matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))
Match desired images (full res size) via regex:
matched_images_data_fix = json.dumps(matched_images_data)
matched_images_data_json = json.loads(matched_images_data_fix)
matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
matched_images_data_json)
Extract and decode them using bytes() and decode(), and choose how many elements you want to extract with list slicing [:20] (grabs the first 20 images):
for fixed_full_res_image in matched_google_full_resolution_images[:20]:
original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
Full code example (it also downloads the images):
import requests, lxml, re, json, urllib.request
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

params = {
    "q": "pexels cat",
    "tbm": "isch",
    "hl": "en",
    "ijn": "0",
}

html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

def get_images_data():

    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    # this step could be refactored to be more compact
    all_script_tags = soup.select('script')

    # https://regex101.com/r/48UZhY/4
    matched_images_data = ''.join(re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # https://kodlogs.com/34776/json-decoder-jsondecodeerror-expecting-property-name-enclosed-in-double-quotes
    # if you try to json.loads() without json.dumps() it will throw an error:
    # "Expecting property name enclosed in double quotes"
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # https://regex101.com/r/pdZOnW/3
    matched_google_image_data = re.findall(r'\[\"GRID_STATE0\",null,\[\[1,\[0,\".*?\",(.*),\"All\",', matched_images_data_json)

    # https://regex101.com/r/NnRg27/1
    matched_google_images_thumbnails = ', '.join(
        re.findall(r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]',
                   str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')

        # after the first decoding, Unicode characters are still present. After the second pass, they are decoded.
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # removing previously matched thumbnails for easier full resolution image matches.
    removed_matched_google_images_thumbnails = re.sub(
        r'\[\"(https\:\/\/encrypted-tbn0\.gstatic\.com\/images\?.*?)\",\d+,\d+\]', '', str(matched_google_image_data))

    # https://regex101.com/r/fXjfb1/4
    # https://stackoverflow.com/a/19821774/15164646
    matched_google_full_resolution_images = re.findall(r"(?:'|,),\[\"(https:|http.*?)\",\d+,\d+\]",
                                                       removed_matched_google_images_thumbnails)

    print('\nFull Resolution Images:')  # in order
    for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):
        # https://stackoverflow.com/a/4004439/15164646 comment by Frédéric Hamidi
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

        # ------------------------------------------------
        # Download original images
        # print(f'Downloading {index} image...')
        # opener=urllib.request.build_opener()
        # opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
        # urllib.request.install_opener(opener)
        # urllib.request.urlretrieve(original_size_img, f'Bs4_Images/original_size_img_{index}.jpg')

get_images_data()
-------------
'''
Google Images Metadata:
9,000+ Best Cat Photos · 100% Free Download · Pexels Stock Photos
pexels.com
https://www.pexels.com/search/cat/
other results ...
Google Image Thumbnails:
https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR2cZsuRkkLWXOIsl9BZzbeaCcI0qav7nenDvvqi-YSm4nVJZYyljRsJZv6N5vS8hMNU_w&usqp=CAU
other results ...
Full Resolution Images:
https://images.pexels.com/photos/1170986/pexels-photo-1170986.jpeg?cs=srgb&dl=pexels-evg-culture-1170986.jpg&fm=jpg
other results ...
'''
Alternatively, you can achieve the same thing by using Google Images API from SerpApi. It's a paid API with a free plan.
The difference is that you don't have to deal with regex, bypass blocks from Google, or maintain the parsing code over time if something breaks. Instead, you only need to iterate over structured JSON and get the data you want.
Example code to integrate:
import os, json  # json for pretty output
from serpapi import GoogleSearch

def get_google_images():
    params = {
        "api_key": os.getenv("API_KEY"),
        "engine": "google",
        "q": "pexels cat",
        "tbm": "isch"
    }

    search = GoogleSearch(params)
    results = search.get_dict()

    print(json.dumps(results['images_results'], indent=2, ensure_ascii=False))

get_google_images()
---------------
'''
[
... # other images
{
"position": 100, # img number
"thumbnail": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRR1FCGhFsr_qZoxPvQBDjVn17e_8bA5PB8mg&usqp=CAU",
"source": "pexels.com",
"title": "Close-up of Cat · Free Stock Photo",
"link": "https://www.pexels.com/photo/close-up-of-cat-320014/",
"original": "https://images.pexels.com/photos/2612982/pexels-photo-2612982.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500",
"is_product": false
}
]
'''
P.S - I wrote a more in-depth blog post about how to scrape Google Images, and how to reduce the chance of being blocked while web scraping search engines.
Disclaimer, I work for SerpApi.
I am trying to establish a websocket connection to Blockchain over the Blockchain Exchange API, but I get a response like HTTP/1.1 400 Bad Request.
The code example is taken from https://github.com/websocket-client/websocket-client.
CODE I
import websocket
from websocket import create_connection
try:
    import thread
except ImportError:
    import _thread as thread
import time

def on_message(ws, message):
    print(message)

def on_error(ws, error):
    print(error)

def on_close(ws):
    print("### closed ###")

def on_open(ws):
    msg = '{"token": "", "action": "subscribe", "channel": "auth"}'
    ws.send(msg)

if __name__ == "__main__":
    # options = {'origin': 'https://exchange.blockchain.com'}
    url = "wss://ws.prod.blockchain.info/mercury-gateway/v1/ws"
    websocket.enableTrace(True)
    ws = websocket.WebSocketApp(url,
                                on_message=on_message,
                                on_error=on_error,
                                on_close=on_close)
    ws.on_open = on_open
    ws.run_forever()
--- request header ---
GET /mercury-gateway/v1/ws HTTP/1.1
Upgrade: websocket
Connection: Upgrade
Host: ws.prod.blockchain.info
Origin: http://ws.prod.blockchain.info
Sec-WebSocket-Key: ldRVrsrxBUnvokDHvzNICw==
Sec-WebSocket-Version: 13
--- response header ---
HTTP/1.1 400 Bad Request
Server: nginx
Date: Mon, 08 Feb 2021 23:20:44 GMT
Content-Type: text/plain; charset=UTF-8
Content-Length: 70
X-Cache-Status: 9b46f4091784da7cbae64c3e66446707
X-Blockchain-Language: en
X-Blockchain-Language-ID: 0:0:0 (en:en:en)
X-Request-ID: 5ef71341f43f1d9ec5d37c25164676dc
X-Original-Host: ws.prod.blockchain.info
X-Blockchain-Server: BlockchainFE/1.0
X-Blockchain-CP-F: j1vn 0.012 - 5ef71341f43f1d9ec5d37c25164676dc
Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
X-Content-Type-Options: nosniff
X-XSS-Protection: 1; mode=block
Via: 1.1 google
Alt-Svc: clear
Connection: close
I tried with and without the options dictionary, but without any success.
CODE II below does work, but I need the event methods on_message, on_error, on_close and on_open.
So either there is a way to get those events working for CODE II, or there is a way to establish the websocket connection for CODE I.
Help is much appreciated. Thank you!
CODE II
from websocket import create_connection
options = {}
options['origin'] = 'https://exchange.blockchain.com'
url = "wss://ws.prod.blockchain.info/mercury-gateway/v1/ws"
ws = create_connection(url, **options)
print(ws.sock_opt)
msg = '{"token": "", "action": "subscribe", "channel": "auth"}'
ws.send(msg)
result = ws.recv()
print(result)
msg = '{"action": "subscribe", "channel": "balances"}'
ws.send(msg)
result = ws.recv()
print(result)
Actually, the answer is setting the Origin header to the right value, like this:
import websocket
try:
    import thread
except ImportError:
    import _thread as thread
import time

def on_message(ws, message):
    print("MSG:", message)

def on_error(ws, error):
    print("### error ###")
    print(error)

def on_close(ws):
    print("### closed ###")

def on_open(ws):
    msg = '{"action": "subscribe", "channel": "prices", "symbol": "BTC-EUR", "granularity": "60" }'
    ws.send(msg)

if __name__ == "__main__":
    url = "wss://ws.prod.blockchain.info/mercury-gateway/v1/ws"
    websocket.enableTrace(True)
    ws = websocket.WebSocketApp(url,
                                on_message=on_message,
                                on_error=on_error,
                                on_close=on_close)
    ws.on_open = on_open
    options = {'origin': 'https://exchange.blockchain.com'}
    ws.run_forever(origin=options['origin'])
I am working on a script to scrape some information off Amazon's Prime Now grocery website. However, I am stumbling on the first step, in which I am attempting to start a session and log in to the page.
I am fairly sure the issue is in how I build the 'data' object. There are 10 inputs in the HTML, but the data object I have constructed only has 9, with the missing one being the submit button. I am not entirely sure whether that is relevant, as this is my first time working with BeautifulSoup.
Any help would be greatly appreciated! All of my code is below, with the last if/else statement confirming that the login has not worked when I run the code.
import requests
from bs4 import BeautifulSoup

# define URL where login form is located
site = 'https://primenow.amazon.com/ap/signin?clientContext=133-1292951-7489930&openid.return_to=https%3A%2F%2Fprimenow.amazon.com%2Fap-post-redirect%3FsiteState%3DclientContext%253D131-7694496-4754740%252CsourceUrl%253Dhttps%25253A%25252F%25252Fprimenow.amazon.com%25252Fhome%252Csignature%253DIFISh0byLJrJApqlChzLdkc2FCEj3D&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=amzn_houdini_desktop_us&openid.mode=checkid_setup&marketPlaceId=A1IXFGJ6ITL7J4&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&pageId=amzn_pn_us&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=3600'

# initiate session
session = requests.Session()

# define session headers
session.headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.61 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Referer': site
}

# get login page
resp = session.get(site)
html = resp.text

# get BeautifulSoup object of the html of the login page
soup = BeautifulSoup(html, 'lxml')

# scrape login page to get all the needed inputs required for login
data = {}
form = soup.find('form')
for field in form.find_all('input'):
    try:
        data[field['name']] = field['value']
    except:
        pass

# add username and password to the data for post request
data['email'] = 'my email'
data['password'] = 'my password'

# submit post request with username / password and other needed info
post_resp = session.post(site, data=data)
post_soup = BeautifulSoup(post_resp.content, 'lxml')

if post_soup.find_all('title')[0].text == 'Your Account':
    print('Login Successfull')
else:
    print('Login Failed')
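Regarding the missing submit input mentioned above, here is a minimal, hedged sketch of how the collection loop could also pick up a named submit button; whether Amazon's sign-in form actually requires it is not confirmed here:
# Hypothetical variant of the input-collection loop: keep every input that has a
# name, defaulting the value to an empty string, so a named submit button (if
# present) also ends up in the POST data.
for field in form.find_all('input'):
    name = field.get('name')
    if name:
        data[name] = field.get('value', '')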
Here is my spider, which I run from a script to parse the content of my local DokuWiki:
import re
import hashlib

import keyring
import scrapy
from scrapy.crawler import CrawlerProcess

DEBUG = True
if DEBUG:
    f_debug = open('debug.log', 'w')

md5s = []

class DokuWikiMd5Spider(scrapy.Spider):
    name = 'dokuwikispider'
    start_urls = ['https://dokuwiki.mjcc.lasil.ru/doku.php']
    visited = []
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }

    @staticmethod
    def get_page_name(url):
        url = url.replace("https://dokuwiki.mjcc.lasil.ru/doku.php?", '')
        if 'id=start&do=search' in url:
            # because the credentials are in the URL, here we cut out only the page name
            # https://dokuwiki.mjcc.lasil.ru/doku.php?id=start&do=search&id=%D0%BF%D0%BE%D1%81%D1%82%D0%B0%D0%B2%D1%89%D0%B8%D0%BA%D0%B8_%D0%B8_%D0%BA%D0%BE%D0%BD%D1%82%D0%B0%D0%BA%D1%82%D1%8B&q=&p=PASSWORD&u=admin
            m = re.findall('id=([^&]+)', url)
            return m[1]
        else:
            m = re.search('id=([^&]+)', url)
            return m.group(1)

    def parse(self, response):
        password = keyring.get_password('dokuwiki', 'admin')
        return scrapy.FormRequest.from_response(
            response,
            formdata={'u': 'admin', 'p': password},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login succeeded before going on
        if b"authentication failed" in response.body:
            self.logger.error("Login failed")
            return

        # continue scraping with authenticated session...
        if DEBUG:
            f_debug.write("parsing: {}\n".format(response.url))
        text = response.text

        # cut everything except page content, not to depend on wiki settings when comparing
        m = re.findall('.*(<!-- wikipage start -->.*<!-- wikipage stop -->).*', text, re.DOTALL)
        text = m[0][0]
        # with open(r'F:\TEMP\test.html','w') as f:
        #     f.write(text)

        md5 = hashlib.md5()
        md5.update(text.encode('utf-8'))
        md5s.append({'url': self.get_page_name(response.url), 'md5': md5.hexdigest()})
        yield {'url': self.get_page_name(response.url), 'md5': md5.hexdigest()}

        for next_page in response.xpath('//a/@href'):
            next_url = next_page.extract()
            if DEBUG:
                f_debug.write("\t?next page: {}\n".format(next_url))
            if 'doku.php?id=' in next_url:
                # to process every page name only one time
                next_page_name = self.get_page_name(next_url)
                if next_page_name not in self.visited:
                    if DEBUG:
                        f_debug.write("\t\t!\n")
                    self.visited.append(next_page_name)
                    yield response.follow(
                        "https://dokuwiki.mjcc.lasil.ru/{}&u=admin&p={}".format(
                            next_url, keyring.get_password('dokuwiki', 'admin')),
                        self.after_login)

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})

process.crawl(DokuWikiMd5Spider)
process.start()  # the script will block here until the crawling is finished
So in the debug messages I see that the spider crawled the page 'wiki_backup':
2019-01-28 19:49:22 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://dokuwiki.mjcc.lasil.ru//doku.php?id=wiki_backup&u=admin&p=PASSWORD> (referer: https://dokuwiki.mjcc.lasil.ru//doku.php?id=%D1%81%D0%BE%D0%B7%D0%B4%D0%B0%D0%BD%D0%B8%D0%B5_%D0%B8_%D0%BF%D1%80%D0%BE%D0%B2%D0%B5%D1%80%D0%BA%D0%B0_%D0%B1%D1%8D%D0%BA%D0%B0%D0%BF%D0%BE%D0%B2&u=admin&p=PASSWORD)
And I can see its content in the crawled response, but that page wasn't parsed even once, as you can see in debug.log:
root#F91_Moin20:/home/ishayahu # cat debug.log | grep wiki_backup
?next page: /doku.php?id=wiki_backup
The problem was in the way the spider checks whether authentication failed. As in the tutorial, it searches for the words "authentication failed", but because I had the same words in the page content, the spider decided there was an authentication error and stopped processing the page.
There should be another way to check whether authentication really failed.
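For illustration, here is a minimal sketch of one possible alternative check, assuming the wiki only renders a logout action link (do=logout) for logged-in users; the exact markup depends on the DokuWiki template, so treat the selector as an assumption rather than a verified fix:
def after_login(self, response):
    # Hypothetical check: instead of searching for the words "authentication failed"
    # (which can legitimately appear in page content), look for a marker that only
    # exists when we are logged in, e.g. a DokuWiki logout action link.
    logged_in = bool(response.xpath('//a[contains(@href, "do=logout")]'))
    if not logged_in:
        self.logger.error("Login failed")
        return
    # ... continue scraping as before ...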
The cgi scripts that I have tried are unable to retrieve json data from my inets httpd server.
In order to retrieve json data in a cgi script, you need to be able to read the body of the request, which will contain something like:
{"a": 1, "b": 2}
With a perl cgi script, I can read the body of a request like this:
my $cgi = CGI->new;
my $req_body = $cgi->param('POSTDATA');
I assume that is an indirect way of reading what the server pipes to the script's stdin because in a python cgi script I have to write:
req_body = sys.stdin.read()
When I request a cgi script from an apache server, my perl and python cgi scripts can successfully get the json data from apache. But when I request the same cgi scripts from my inets httpd server, my perl cgi script reads nothing for the request body, and my python cgi script hangs until the server times out. My cgi scripts are able to retrieve data formatted as "a=1&b=2" from an inets httpd server--in that case the cgi facilities in both perl and python automatically parse the data for me, so instead of trying to read the body of the request, I just access the structures that cgi created.
Here is my httpd sever configuration (server.conf):
[
 {modules, [
    mod_alias,
    mod_actions,
    mod_esi,
    mod_cgi,
    mod_get,
    mod_log
 ]},
 {bind_address, "localhost"},
 {port, 0},
 {server_name, "httpd_test"},
 {server_root, "/Users/7stud/erlang_programs/inets_proj"},
 {document_root, "./htdocs"},
 {script_alias, {"/cgi-bin/", "/Users/7stud/erlang_programs/inets_proj/cgi-bin/"} },
 {erl_script_alias, {"/erl", [mymod]} },
 {erl_script_nocache, true},
 {error_log, "./errors.log"},
 {transfer_log, "./requests.log"}
].
I start my httpd server with this program (s.erl):
-module(s).
-compile(export_all).

%Need to look up port with httpd:info(Server)

ensure_inets_start() ->
    case inets:start() of
        ok -> ok;
        {error,{already_started,inets}} -> ok
    end.

start() ->
    ok = ensure_inets_start(),
    {ok, Server} = inets:start(httpd,
        [{proplist_file, "./server.conf"}]
    ),
    Server.

stop(Server) ->
    ok = inets:stop(httpd, Server).
My cgi script (1.py):
#!/usr/bin/env python3

import json
import cgi
import cgitb
cgitb.enable()  # errors to browser
import sys

sys.stdout.write("Content-Type: text/html")
sys.stdout.write("\r\n\r\n")

#print("<div>hello</div>")

req_body = sys.stdin.read()
my_dict = json.loads(req_body)

if my_dict:
    a = my_dict.get("a", "Not found")
    b = my_dict.get("b", "Not found")
    total = a + b

    print("<div>Got json: {}</div>".format(my_dict) )
    print("<div>a={}, b={}, total={}</div>".format(a, b, total))
else:
    print("<div>Couldn't read json data.</div>")
My cgi script (1.pl):
#!/usr/bin/env perl
use strict;
use warnings;
use 5.020;
use autodie;
use Data::Dumper;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use JSON;

my $q = CGI->new;

print $q->header,
      $q->start_html("Test Page"),
      $q->h1("Results:"),
      $q->div("json=$json"),
      $q->end_html;
Server startup in terminal window:
~/erlang_programs/inets_proj$ erl
Erlang/OTP 20 [erts-9.2] [source] [64-bit] [smp:4:4] [ds:4:4:10] [async-threads:10] [hipe] [kernel-poll:false]
Eshell V9.2 (abort with ^G)
1> c(s).
s.erl:2: Warning: export_all flag enabled - all functions will be exported
{ok,s}
2> Server = s:start().
<0.86.0>
3> httpd:info(Server).
[{mime_types,[{"htm","text/html"},{"html","text/html"}]},
{server_name,"httpd_test"},
{erl_script_nocache,true},
{script_alias,{"/cgi-bin/",
"/Users/7stud/erlang_programs/inets_proj/cgi-bin/"}},
{bind_address,{127,0,0,1}},
{modules,[mod_alias,mod_actions,mod_esi,mod_cgi,mod_get,
mod_log]},
{server_root,"/Users/7stud/erlang_programs/inets_proj"},
{erl_script_alias,{"/erl",[mymod]}},
{port,51301},
{transfer_log,<0.93.0>},
{error_log,<0.92.0>},
{document_root,"./htdocs"}]
4>
curl request:
$ curl -v \
> -H 'Content-Type: application/json' \
> --data '{"a": 1, "b": 2}' \
> http://localhost:51301/cgi-bin/1.py
* Trying ::1...
* TCP_NODELAY set
* Connection failed
* connect to ::1 port 51301 failed: Connection refused
* Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 51301 (#0)
> POST /cgi-bin/1.py HTTP/1.1
> Host: localhost:51301
> User-Agent: curl/7.58.0
> Accept: */*
> Content-Type: application/json
> Content-Length: 16
>
* upload completely sent off: 16 out of 16 bytes
===== hangs for about 5 seconds ====
< HTTP/1.1 504 Gateway Time-out
< Date: Thu, 08 Mar 2018 11:02:27 GMT
< Content-Type: text/html
< Server: inets/6.4.5
* no chunk, no close, no size. Assume close to signal end
<
* Closing connection 0
$
My directory structure:
~/erlang_programs$ tree inets_proj/
inets_proj/
├── apache_cl.erl
├── cgi-bin
│ ├── 1.pl
│ └── 1.py
├── cl.beam
├── cl.erl
├── errors.log
├── htdocs
│ └── file1.txt
├── mylog.log
├── mymod.beam
├── mymod.erl
├── old_server.conf
├── old_server3.conf
├── old_server4.conf
├── requests.log
├── s.beam
├── s.erl
├── server.conf
└── urlencoded_post_cl.erl
I dug up the RFC for the cgi spec, which says:
RFC 3875 CGI Version 1.1 October 2004
4.2. Request Message-Body
Request data is accessed by the script in a system-defined method;
unless defined otherwise, this will be by reading the 'standard
input' file descriptor or file handle.
Request-Data = [ request-body ] [ extension-data ]
request-body = <CONTENT_LENGTH>OCTET
extension-data = *OCTET
A request-body is supplied with the request if the CONTENT_LENGTH is
not NULL. The server MUST make at least that many bytes available
for the script to read. The server MAY signal an end-of-file
condition after CONTENT_LENGTH bytes have been read or it MAY supply
extension data. Therefore, the script MUST NOT attempt to read more
than CONTENT_LENGTH bytes, even if more data is available. However,
it is not obliged to read any of the data.
I don't understand what extension data is, but the key line is:
the [cgi] script MUST NOT attempt to read more than CONTENT_LENGTH
bytes, even if more data is available.
If I alter my python script to read in the content length rather than trying to read in the whole stdin file--which doesn't stop reading until it gets an eof signal--then my python cgi script successfully retrieves the json data from my inets httpd server.
#!/usr/bin/env python3

import json
import sys
import os

content_len = int(os.environ["CONTENT_LENGTH"])
req_body = sys.stdin.read(content_len)
my_dict = json.loads(req_body)

sys.stdout.write("Content-Type: text/html")
sys.stdout.write("\r\n\r\n")

if my_dict:
    a = my_dict.get("a", "Not found")
    b = my_dict.get("b", "Not found")
    total = a + b

    print("<div>Content-Length={}</div".format(content_len))
    print("<div>Got json: {}</div>".format(my_dict) )
    print("<div>a={}, b={}, total={}</div>".format(a, b, total))
else:
    print("<div>Couldn't read json data.</div>")

'''
form = cgi.FieldStorage()

if "a" not in form:
    print("<H1>Error:</H1>")
    print("<div>'a' not in form</div>")
else:
    print("<p>a:{}</p>".format( form["a"].value) )

if "b" not in form:
    print("<H1>Error:</H1>")
    print("<div>'b' not in form</div>")
else:
    print("<p>b:{}</p>".format(form["b"].value) )
'''
Server info:
4> httpd:info(Server).
[{mime_types,[{"htm","text/html"},{"html","text/html"}]},
{server_name,"httpd_test"},
{erl_script_nocache,true},
{script_alias,{"/cgi-bin/",
"/Users/7stud/erlang_programs/inets_proj/cgi-bin/"}},
{bind_address,{127,0,0,1}},
{modules,[mod_alias,mod_actions,mod_esi,mod_cgi,mod_get,
mod_log]},
{server_root,"/Users/7stud/erlang_programs/inets_proj"},
{erl_script_alias,{"/erl",[mymod]}},
{port,65451},
{transfer_log,<0.93.0>},
{error_log,<0.92.0>},
{document_root,"./htdocs"}]
5>
curl request (note that curl automatically calculates the content length and puts it in a Content-Length header):
~$ curl -v \
> -H 'Content-Type: application/json' \
> --data '{"a": 1, "b": 2}' \
> http://localhost:65451/cgi-bin/1.py
* Trying ::1...
* TCP_NODELAY set
* Connection failed
* connect to ::1 port 65451 failed: Connection refused
* Trying 127.0.0.1...
* TCP_NODELAY set
* Connected to localhost (127.0.0.1) port 65451 (#0)
> POST /cgi-bin/1.py HTTP/1.1
> Host: localhost:65451
> User-Agent: curl/7.58.0
> Accept: */*
> Content-Type: application/json
> Content-Length: 16
>
* upload completely sent off: 16 out of 16 bytes
< HTTP/1.1 200 OK
< Date: Fri, 09 Mar 2018 04:36:42 GMT
< Server: inets/6.4.5
< Transfer-Encoding: chunked
< Content-Type: text/html
<
<div>Content-Length=16</div
<div>Got json: {'a': 1, 'b': 2}</div>
<div>a=1, b=2, total=3</div>
* Connection #0 to host localhost left intact
~$
Here's the perl script that I got to work with inets httpd (1.pl):
#!/usr/bin/env perl
use strict;
use warnings;
use 5.020;
use autodie;
use Data::Dumper;
use JSON;

if (my $content_len = $ENV{CONTENT_LENGTH}) {
    read(STDIN, my $json, $content_len);
    my $href = decode_json($json);
    my $a = $href->{a};
    my $b = $href->{b};

    print 'Content-type: text/html';
    print "\r\n\r\n";

    print "<div>a=$a</div>";
    print "<div>b=$b</div>";

    #my $q = CGI->new;    #Doesn't work with inets httpd server
    #my $q = CGI->new(''); #Doesn't try to read from stdin, so it does work.

    # print $q->header,
    #       $q->start_html("Test Page"),
    #       $q->div("json=$json"),
    #       $q->div("a=$a"),
    #       $q->div("b=$b"),
    #       $q->div("total=$total"),
    #       $q->end_html;
}
else {
    my $error = "Could not read json: No Content-Length header in request.";

    print 'Content-type: text/html';
    print "\r\n\r\n";

    print "<div>$error</div>";

    # my $q = CGI->new;
    # print $q->header,
    #       $q->start_html("Test Page"),
    #       $q->h1($error),
    #       $q->end_html;
}
I couldn't get perl's CGI module to work in conjunction with reading from STDIN. Edit: A kind soul at perlmonks helped me solve that one:
my $q = CGI->new('');
The blank string tells CGI->new not to read from stdin and parse the data.