How to get the response body in WebKitGTK?

I tried handling the WebView's resource-load-started and the WebResource's finished signals to get the response body, but I ran into trouble.
Here is my Python code:
import gi
gi.require_version('WebKit2', '4.0')
gi.require_version("Gtk", "3.0")
from gi.repository import WebKit2, Gtk

window = Gtk.Window()
window.connect('delete-event', Gtk.main_quit)

view = WebKit2.WebView()
view.load_uri('https://yandex.com/')

def resource_load_finished(resource):
    raw_data = resource.get_data(cancellable=None, callback=None, user_data=None)
    print("url: ", resource.get_response().get_uri(), "data_length: ", resource.get_response().get_content_length(), "\r\nraw_data: ", raw_data)

def resource_load_started(view, resource, request):
    resource.connect("finished", resource_load_finished)

view.connect("resource_load_started", resource_load_started)
window.add(view)
window.set_default_size(1024, 1024)
window.show_all()
Gtk.main()
And the output is as follows (https prefix removed):
/home/again/Documents/webkit2/venv/bin/python /home/again/Documents/webkit2/get_resource.py
url: //lazka.github.io/pgi-docs/#WebKit2-4.0 data_length: 1842
raw_data: None
url: //lazka.github.io/pgi-docs/sidebar.html data_length: 6344
raw_data: None
url: //lazka.github.io/pgi-docs/WebKit2-4.0/ data_length: 2216
raw_data: None
url: //lazka.github.io/pgi-docs/js.cookie-2.1.0.min.js data_length: 874
raw_data: None
url: //lazka.github.io/pgi-docs/_static/css/pgi.css data_length: 1047
raw_data: None
url: //lazka.github.io/pgi-docs/_static/graphviz.css data_length: 215
raw_data: None
url: //lazka.github.io/pgi-docs/pgi-config.js data_length: 388
raw_data: None
url: //lazka.github.io/pgi-docs/pgi-searchtools.js data_length: 2973
raw_data: None
url: //lazka.github.io/pgi-docs/_static/css/theme.css data_length: 19061
raw_data: None
url: //lazka.github.io/pgi-docs/_static/fonts/lato-bold.ttf data_length: 39816
raw_data: None
url: //lazka.github.io/pgi-docs/_static/fonts/fontawesome-webfont.ttf data_length: 83998
raw_data: None
url: //lazka.github.io/pgi-docs/_static/fonts/lato-regular.ttf data_length: 39653
raw_data: None
Process finished with exit code 0
My question is: am I using the correct signal handler? If so, why does resource.get_data() always return None? What is happening there? I can only get the response URL and its length.

You must call get_data_finish() in the callback to get the final result. Something like this:
def res_finished(resource, result, user_data):
    data = resource.get_data_finish(result)
    print(resource.get_uri(), data)

def received_data(resource, data_length):
    resource.get_data(cancellable=None, callback=res_finished, user_data=None)

def resource_load_started(view, resource, request):
    resource.connect("received_data", received_data)

view.connect("resource_load_started", resource_load_started)

Related

I created a simple class that works without @jitclass. When I try to improve it with @jitclass it stops working. What is happening here?

Following example 12.4 from https://python-programming.quantecon.org/numba.html#id4, I constructed a simple class to model an AR(1) process.
Although the code works fine without @jitclass, it stops working once I uncomment the decorator (remove the "#").
import numpy as np
import numba
import matplotlib.pyplot as plt
from numba import float64
from numba import int32
from numba.experimental import jitclass

# ar_1_data = [('ρ', float64), ('z_0', float64), ('μ', float64), ('σ', float64)]
# @jitclass(ar_1_data)
class ar_1:
    def __init__(self, ρ=0.5, z_0=1, μ=0, σ=1):
        self.ρ = ρ
        self.z = z_0
        self.lnz = np.log(z_0)
        self.μ = μ
        self.σ = σ

    def update(self):
        self.z = self.z**(self.ρ) * np.e**(np.random.normal(self.μ, self.σ))

    def sequence(self, n):
        path = []
        path_log = []
        for i in range(n):
            path.append(self.z)
            path_log.append(np.log(self.z))
            self.update()
        self.sequence = path
        self.sequence_log = path_log

a = ar_1()
a.sequence(100)
Here is the error I'm getting after removing the "#":
---------------------------------------------------------------------------
TypingError Traceback (most recent call last)
Input In [83], in <cell line: 1>()
----> 1 a = ar_1()
2 a.sequence(100)
File ~\anaconda3\lib\site-packages\numba\experimental\jitclass\base.py:124, in JitClassType.__call__(cls, *args, **kwargs)
122 bind = cls._ctor_sig.bind(None, *args, **kwargs)
123 bind.apply_defaults()
--> 124 return cls._ctor(*bind.args[1:], **bind.kwargs)
File ~\anaconda3\lib\site-packages\numba\core\dispatcher.py:468, in _DispatcherBase._compile_for_args(self, *args, **kws)
464 msg = (f"{str(e).rstrip()} \n\nThis error may have been caused "
465 f"by the following argument(s):\n{args_str}\n")
466 e.patch_message(msg)
--> 468 error_rewrite(e, 'typing')
469 except errors.UnsupportedError as e:
470 # Something unsupported is present in the user code, add help info
471 error_rewrite(e, 'unsupported_error')
File ~\anaconda3\lib\site-packages\numba\core\dispatcher.py:409, in _DispatcherBase._compile_for_args.<locals>.error_rewrite(e, issue_type)
407 raise e
408 else:
--> 409 raise e.with_traceback(None)
TypingError: Failed in nopython mode pipeline (step: nopython frontend)
Failed in nopython mode pipeline (step: nopython frontend)
Cannot resolve setattr: (instance.jitclass.ar_1#2658a3e4d30<ρ:float64,z_0:float64,μ:float64,σ:float64>).z = int64
File "..\..\..\..\..\AppData\Local\Temp\ipykernel_17336\4275445632.py", line 9:
<source missing, REPL/exec in use?>
During: typing of set attribute 'z' at C:\Users\Hogar\AppData\Local\Temp\ipykernel_17336\4275445632.py (9)
File "..\..\..\..\..\AppData\Local\Temp\ipykernel_17336\4275445632.py", line 9:
<source missing, REPL/exec in use?>
During: resolving callee type: jitclass.ar_1#2658a3e4d30<ρ:float64,z_0:float64,μ:float64,σ:float64>
During: typing of call at <string> (3)
During: resolving callee type: jitclass.ar_1#2658a3e4d30<ρ:float64,z_0:float64,μ:float64,σ:float64>
During: typing of call at <string> (3)
File "<string>", line 3:
<source missing, REPL/exec in use?>
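The traceback points at the problem: the spec given to @jitclass declares a field z_0, while __init__ actually stores self.z and self.lnz, so Numba cannot type the setattr (the int64 in the message comes from the integer default z_0=1). A minimal sketch of a spec that matches the attributes actually assigned, with float literals for the defaults (the field names here are inferred from the class body above, not taken from an official answer):

import numpy as np
from numba import float64
from numba.experimental import jitclass

# Every attribute assigned on self must appear in the spec, under the same name.
ar_1_data = [('ρ', float64), ('z', float64), ('lnz', float64), ('μ', float64), ('σ', float64)]

@jitclass(ar_1_data)
class ar_1:
    def __init__(self, ρ=0.5, z_0=1.0, μ=0.0, σ=1.0):
        self.ρ = ρ
        self.z = z_0            # stored as z, so the spec must declare z (not z_0)
        self.lnz = np.log(z_0)  # likewise lnz must be declared
        self.μ = μ
        self.σ = σ

    def update(self):
        self.z = self.z**self.ρ * np.e**np.random.normal(self.μ, self.σ)

The sequence method is left out of this sketch: storing path and path_log back on self would additionally require list-typed fields in the spec (for example numba.types.ListType(float64)), and reusing the method name sequence as an attribute name is best avoided under jitclass.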

Remove duplicates based on a unique ID

I want to remove duplicate data based on unique IDs. Each listing from the site has a unique ID, so I want to filter the data to remove any duplicates.
After looking at the Scrapy documentation (Filter duplicates), I have tried to implement a similar pipeline to remove the duplicates; however, I'm unsure how to get it to work.
Here's what I have tried:
import scrapy
from scrapy.loader import ItemLoader
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup
import json
from itertools import zip_longest
from collections import defaultdict
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem

headers = {
    'authority': 'www.theparking.eu',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'accept': '*/*',
    'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'x-requested-with': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'origin': 'https://www.theparking.eu',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-mode': 'cors',
    'sec-fetch-dest': 'empty',
    # 'referer': 'https://www.theparking.eu/used-cars/used-cars/',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}

class DuplicatesPipeline:
    def __init__(self):
        # self.ids_seen = set()
        self.titles_seen = set()

    def process_item(self, unique_id, spider):
        if unique_id in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % unique_id)
        else:
            self.titles_seen.add(unique_id)
            return unique_id

class Countryitem(scrapy.Item):
    make = Field(output_processor=TakeFirst())
    unique_id = Field(output_processor=TakeFirst())
    page_number = Field(output_processor=TakeFirst())

class CountrySpider(scrapy.Spider):
    name = "country"
    test_dict = {'country_id': [4, 5, 109, 7, 6, 8, 87],
                 'country': ['australia', 'austria', 'bahamas', 'belarus', 'belgium', 'bosnia and herzegovina', 'brasil'],
                 'make': [20, 13, 131, 113, 32, 62, 104],
                 'model': [1108, 4655, 687, 492, 499, 702, 6143],
                 'engine': [5, 11, 10, 7, 14, 21, 170]}

    # for links, pages, id, country in zip(url_data.links, url_data.pages, url_data.id, url_data.country):
    def start_requests(self):
        for id_ in zip(self.test_dict['country_id']):
            for id_marque in self.test_dict['make']:
                for models in self.test_dict['model']:
                    for engine in self.test_dict['engine']:
                        for page in range(1, 10000):
                            yield scrapy.FormRequest(
                                url=f'https://www.theparking.eu/used-cars/#!/used-cars/%3Fid_pays%3D{id_}%26id_marque%3D{id_marque}%26id_modele%3D{models}%26id_motorisation%3D{engine}',
                                method="POST",
                                callback=self.parse,
                                formdata={
                                    'ajax': '{"tab_id":"t0","cur_page":%s,"cur_trie":"distance","query":"","critere":{"id_pays":[%s],"id_marque":[%s], "id_modele":[%s], "id_motorisation":[%s]},"sliders":{"prix":{"id":"#range_prix","face":"prix","max_counter":983615,"min":"1","max":"400000"},"km":{"id":"#range_km","face":"km","max_counter":1071165,"min":"1","max":"500000"},"millesime":{"id":"#range_millesime","face":"millesime","max_counter":1163610,"min":"1900","max":"2022"}},"req_num":1,"nb_results":"11795660","current_location_distance":-1,"logged_in":false}' % (page, id_, id_marque, models, engine),
                                    'tabs': '["t0"]'
                                },
                                headers=headers,
                                cb_kwargs={
                                    'page_number': page
                                }
                            )

    def parse(self, response, page_number):
        container = json.loads(response.text)
        test = container['#lists']
        soup = BeautifulSoup(test, 'lxml')
        for i in soup:
            carMake = i.select("a.external.tag_f_titre > span.title-block.brand:nth-child(1)")
            carUnique = i.select('li[tref]')
            for make, unique in zip_longest(carMake, carUnique):
                loader = ItemLoader(Countryitem())
                # loader.add_value('page_number', page_number)
                loader.add_value("unique_id", unique['tref'])
                loader.add_value("page_number", page_number)
                if make != None:
                    loader.add_value('make', make.text)
                else:
                    loader.add_value('make', "None")
                yield loader.load_item()

process = CrawlerProcess(
    settings={
        'FEED_URI': 'park.jl',
        'FEED_FORMAT': 'jsonlines'
    }
)
process.crawl(CountrySpider)
process.start()
The pipeline's process_item() receives the whole item (plus the spider), so check the item's unique_id field rather than treating the argument itself as the ID:
class DuplicatesPipeline:
    def __init__(self):
        self.titles_seen = set()

    def process_item(self, item, spider):
        if item['unique_id'] in self.titles_seen:
            raise DropItem("Duplicate item title found: %s" % item['unique_id'])
        else:
            self.titles_seen.add(item['unique_id'])
            return item
Also add the pipeline to the spider's custom_settings:
custom_settings = {
    'ITEM_PIPELINES': {
        'myproject.path_to_your_file.DuplicatesPipeline': 300
    }
}
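Since the spider here is launched from a single script with CrawlerProcess rather than from a Scrapy project, the dotted path has to point at wherever the pipeline class actually lives. A minimal sketch, assuming both classes sit in the same script that is run directly (so the pipeline is importable under __main__):

class CountrySpider(scrapy.Spider):
    name = "country"
    # '__main__.DuplicatesPipeline' assumes the pipeline class is defined in the
    # script being executed directly; in a real project use the module path instead.
    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.DuplicatesPipeline': 300,
        },
    }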

Scrapy - idle signal spider running into an error

I'm trying to create a spider that runs all the time and, as soon as it enters its idle state, fetches the next URL to parse from the database.
Unfortunately, I got stuck at the very beginning:
# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from scrapy import Spider
import logging

class SignalspiderSpider(Spider):
    name = 'signalspider'
    allowed_domains = ['domain.de']
    yet = False

    def start_requests(self):
        logging.log(logging.INFO, "______ Loading requests")
        yield scrapy.Request('https://www.domain.de/product1.html')

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        logging.log(logging.INFO, "______ From Crawler")
        spider = super(SignalspiderSpider, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.idle, signal=scrapy.signals.spider_idle)
        return spider

    def parse(self, response):
        self.logger.info("______ Finished extracting structured data from HTML")
        pass

    def idle(self):
        logging.log(logging.INFO, "_______ Idle state")
        if not self.yet:
            self.crawler.engine.crawl(self.create_request(), self)
            self.yet = True

    def create_request(self):
        logging.log(logging.INFO, "_____________ Create requests")
        yield scrapy.Request('https://www.domain.de/product2.html?dvar_82_color=blau&cgid=')
and the error that I get:
2019-03-27 21:41:38 [root] INFO: _______ Idle state
2019-03-27 21:41:38 [root] INFO: _____________ Create requests
2019-03-27 21:41:38 [scrapy.utils.signal] ERROR: Error caught on signal handler: <bound method RefererMiddleware.request_scheduled of <scrapy.spidermiddlewares.referer.RefererMiddleware object at 0x7f93bcc13978>>
Traceback (most recent call last):
File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/utils/signal.py", line 30, in send_catch_log
*arguments, **named)
File "/home/spidy/Documents/spo/lib/python3.5/site-packages/pydispatch/robustapply.py", line 55, in robustApply
return receiver(*arguments, **named)
File "/home/spidy/Documents/spo/lib/python3.5/site-packages/scrapy/spidermiddlewares/referer.py", line 343, in request_scheduled
redirected_urls = request.meta.get('redirect_urls', [])
AttributeError: 'NoneType' object has no attribute 'meta'
What am I doing wrong?
Try with:
def idle(self, spider):
    logging.log(logging.INFO, "_______ Idle state")
    if not self.yet:
        self.yet = True
        self.crawler.engine.crawl(Request(url='https://www.domain.de/product2.html?dvar_82_color=blau&cgid=', callback=spider.parse), spider)
I'm not sure it is correct to create a request in the spider_idle handler by passing another method which makes the request, as you do.
See more at Scrapy spider_idle signal - need to add requests with parse item callback
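As a side note, since the goal is a spider that keeps running and pulls new URLs from a database whenever it goes idle: once the idle handler has nothing left to schedule, the spider will close. A common pattern (a sketch under that assumption, not part of the answer above; get_next_url_from_db is a hypothetical helper) is to raise DontCloseSpider from the spider_idle handler:

from scrapy import Request
from scrapy.exceptions import DontCloseSpider

def idle(self, spider):
    # Hypothetical helper that returns the next URL from the database, or None.
    next_url = self.get_next_url_from_db()
    if next_url:
        self.crawler.engine.crawl(Request(next_url, callback=self.parse), spider)
    # Keep the spider alive while waiting for more work.
    raise DontCloseSpider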

Issue accessing S3 from Tensorflow

With the following config:
os.environ['AWS_ACCESS_KEY_ID'] = 'xxxxxx'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxxxxxxx'
os.environ['AWS_REGION'] = 'us-west-2'
os.environ['S3_ENDPOINT'] = 's3-us-west-2.amazonaws.com'
os.environ['S3_USE_HTTPS'] = '1'
os.environ['S3_VERIFY_SSL'] = '1'
print(file_io.stat('s3://abcd/def.txt'))
I get the error
/usr/local/lib/python3.6/dist-packages/tensorflow/python/lib/io/file_io.py in stat(filename)
556 with errors.raise_exception_on_not_ok_status() as status:
557 pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status)
--> 558 return file_statistics
559
560
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
526 None, None,
527 compat.as_text(c_api.TF_Message(self.status.status)),
--> 528 c_api.TF_GetCode(self.status.status))
529 # Delete the underlying status object from memory otherwise it stays alive
530 # as there is a reference to status from this from the traceback due to
NotFoundError: Object s3://abcd/def.txt does not exist
Note this file does exist.
I also get the following error on a write and close.
UnknownError: PermanentRedirect: Unable to parse ExceptionName: PermanentRedirect Message: The bucket you are attempting to access must be addressed using the specified endpoint. Please send all future requests to this endpoint.
What more is needed to fix this?
This is what my config looks like:
import os
os.environ['AWS_REGION'] = 'us-west-2'
os.environ['S3_ENDPOINT'] = 'https://s3-us-west-2.amazonaws.com'
os.environ['S3_VERIFY_SSL'] = '0'
I think you have to change from
os.environ['S3_ENDPOINT'] = 's3-us-west-2.amazonaws.com'
to
os.environ['S3_ENDPOINT'] = 'https://s3-us-west-2.amazonaws.com'
Here is a link for your reference.
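Putting the pieces of advice together, a minimal sketch of the corrected setup (the bucket and key are the question's placeholders, and the file_io import path is taken from the traceback above):

import os

from tensorflow.python.lib.io import file_io

os.environ['AWS_ACCESS_KEY_ID'] = 'xxxxxx'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'xxxxxxxx'
# The endpoint must match the bucket's region, otherwise S3 answers with PermanentRedirect.
os.environ['AWS_REGION'] = 'us-west-2'
os.environ['S3_ENDPOINT'] = 'https://s3-us-west-2.amazonaws.com'
os.environ['S3_USE_HTTPS'] = '1'
os.environ['S3_VERIFY_SSL'] = '1'

print(file_io.stat('s3://abcd/def.txt'))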
You can use an endpoint like
mybucket.s3-us-west-2.amazonaws.com
and then use s3://pathtofile to access the file.

Wrote an errback for my Scrapy spider, but tracebacks also keep happening, why?

I am using Scrapy 1.1 and I call Scrapy from within a script. My spider launching method looks like this:
def run_spider(self):
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
Here is an extract of my spider, with an errback written as in the documentation; it only prints when it catches a failure.
class SiteSpider(scrapy.Spider):
    name = 'SiteCrawler'
    custom_settings = {
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    }

    def __init__(self, websites_file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.websites_file = websites_file
        print('***********')
        print(self.websites_file)

    def start_requests(self):
        .....
        if is_valid_url(website_url):
            yield scrapy.Request(url=website_url, callback=self.parse, errback=self.handle_errors, meta={'url': account_id})

    def parse(self, response):
        .....
        yield item

    def handle_errors(self, failure):
        if failure.check(HttpError):
            # these exceptions come from HttpError spider middleware
            # you can get the non-200 response
            response = failure.value.response
            print('HttpError on ' + response.url)
        elif failure.check(DNSLookupError):
            # this is the original request
            request = failure.request
            print('DNSLookupError on ' + request.url)
        elif failure.check(TimeoutError, TCPTimedOutError):
            request = failure.request
            print('TimeoutError on ' + request.url)
My problem is that I get errors I expect, like:
TimeoutError on http://www.example.com
But I also get tracebacks for the same websites:
2016-08-05 13:40:55 [scrapy] ERROR: Error downloading <GET http://www.example.com/robots.txt>: TCP connection timed out: 60: Operation timed out.
Traceback (most recent call last):
File ".../anaconda/lib/python3.5/site-packages/twisted/internet/defer.py", line 1126, in _inlineCallbacks
result = result.throwExceptionIntoGenerator(g)
File ".../anaconda/lib/python3.5/site-packages/twisted/python/failure.py", line 389, in throwExceptionIntoGenerator
return g.throw(self.type, self.value, self.tb)
File ".../anaconda/lib/python3.5/site-packages/scrapy/core/downloader/middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
twisted.internet.error.TCPTimedOutError: TCP connection timed out: 60: Operation timed out.
The exception-handling messages I print and the tracebacks can often be traced to the same websites. After searching a lot on Stack Overflow, in the docs and the like, I still don't know why I see the tracebacks.
This also occurs with DNSLookupErrors for example.
Excuse me, my Scrapy knowledge is still rudimentary. Is this normal behavior?
Also, I added this to settings.py, which sits under my crawler. Other entries (for example the item pipelines) do work.
LOG_LEVEL = 'WARNING'
But I still see debug messages, not only warnings and everything above that (if configure_logging() is added to the spider launch). I am running this from the terminal on macOS.
I would be very happy to get any help with this.
Try this in a script:
if __name__ == '__main__':
    runner = CrawlerProcess(get_project_settings())
    spider = SiteSpider()
    configure_logging()
    d = runner.crawl(spider, websites_file=self.raw_data_file)
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
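If the aim is also to silence the debug output when launching from a script, one option (a sketch, assuming the project settings are otherwise picked up correctly) is to override LOG_LEVEL on the settings object handed to CrawlerProcess and let it configure logging itself, instead of calling configure_logging() separately:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    settings = get_project_settings()
    # Force the log level for this run; CrawlerProcess applies it when it sets up logging.
    settings.set('LOG_LEVEL', 'WARNING')
    process = CrawlerProcess(settings)
    process.crawl(SiteSpider, websites_file='websites.txt')  # placeholder path
    process.start()  # starts the reactor and blocks until the crawl finishes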