Error saving crawled page using file_urls and ITEM_PIPELINES: Missing scheme in request url: h - scrapy

I'm trying to have scrapy download a copy of each page it crawls but when I run my spider the log contains entries like
2016-06-20 15:39:12 [scrapy] ERROR: Error processing {'file_urls': 'http://example.com/page',
'title': u'PageTitle'}
Traceback (most recent call last):
File "c:\anaconda3\envs\scrapy\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\media.py", line 44, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\pipelines\files.py", line 365, in get_media_requests
return [Request(x) for x in item.get(self.files_urls_field, [])]
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 25, in __init__
self._set_url(url)
File "c:\anaconda3\envs\scrapy\lib\site-packages\scrapy\http\request\__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
Other questions in SO about this error seem to relate to problems with the start_urls but my start url is fine as the spider crawls across the site, it just doesn't save the page to my specified files_store.
I populate the file_urls using item['file_urls'] = response.url
Do I need to specify the url a different way?

Related

How to store crawled data from Scrapy to FTP as csv?

My scrapy settings.py
from datetime import datetime
file_name = datetime.today().strftime('%Y-%m-%d_%H%M_')
save_name = file_name + 'Mobile_Nshopping'
FEED_URI = 'ftp://myusername:mypassword#ftp.mymail.com/uploads/%(save_name)s.csv'
when I'm running my spider scrapy crawl my_project_name getting error...
Can I have to create a pipeline?
\scrapy\extensions\feedexport.py:247: ScrapyDeprecationWarning: The `FEED_URI` and `FEED_FORMAT` settings have been deprecated in favor of the `FEEDS` setting. Please see the `FEEDS` setting docs for more details
exporter = cls(crawler)
Traceback (most recent call last):
File "c:\users\viren\appdata\local\programs\python\python38\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "c:\users\viren\appdata\local\programs\python\python38\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "C:\Users\viren\AppData\Local\Programs\Python\Python38\Scripts\scrapy.exe\__main__.py", line 7, in <module>
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\cmdline.py", line 145, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\cmdline.py", line 100, in _run_print_help
func(*a, **kw)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\cmdline.py", line 153, in _run_command
cmd.run(args, opts)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\commands\crawl.py", line 22, in run
crawl_defer = self.crawler_process.crawl(spname, **opts.spargs)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\crawler.py", line 191, in crawl
crawler = self.create_crawler(crawler_or_spidercls)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\crawler.py", line 224, in create_crawler
return self._create_crawler(crawler_or_spidercls)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\crawler.py", line 229, in _create_crawler
return Crawler(spidercls, self.settings)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\crawler.py", line 72, in __init__
self.extensions = ExtensionManager.from_crawler(self)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\middleware.py", line 53, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\middleware.py", line 35, in from_settings
mw = create_instance(mwcls, settings, crawler)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\utils\misc.py", line 167, in create_instance
instance = objcls.from_crawler(crawler, *args, **kwargs)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 247, in from_crawler
exporter = cls(crawler)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 282, in __init__
if not self._storage_supported(uri, feed_options):
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 427, in _storage_supported
self._get_storage(uri, feed_options)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 458, in _get_storage
instance = build_instance(feedcls.from_crawler, crawler)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 455, in build_instance
return build_storage(builder, uri, feed_options=feed_options, preargs=preargs)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 46, in build_storage
return builder(*preargs, uri, *args, **kwargs)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 201, in from_crawler
return build_storage(
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 46, in build_storage
return builder(*preargs, uri, *args, **kwargs)
File "c:\users\viren\appdata\local\programs\python\python38\lib\site-packages\scrapy\extensions\feedexport.py", line 192, in __init__
self.port = int(u.port or '21')
File "c:\users\viren\appdata\local\programs\python\python38\lib\urllib\parse.py", line 174, in port
raise ValueError(message) from None
ValueError: Port could not be cast to integer value as 'Edh=)9sd'
I don't know how to store CSV into FTP.
error is coming because my password is int?
Is there anything I forget to write?
Can I have to create a pipeline?
Yes, you probably should create a pipeline. As shown in the Scrapy Architecture Diagram, the basic concept is this: requests are sent, responses come back and processed by the spider, and finally, the pipeline does something with the items returned by the spider. In your case, you could create a pipeline that saves the data in a CSV file and uploads it to an ftp server. See Scrapy's Item Pipeline documentation for more information.
I don't know how to store CSV into FTP. error is coming because my password is int? Is there anything I forget to write?
I believe this is due to the deprecation error below (and shown at the top of the errors you provided):
ScrapyDeprecationWarning: The FEED_URI and FEED_FORMAT settings have been deprecated in favor of the FEEDS setting. Please see the FEEDS setting docs for more details.
Try replacing FEED_URI with FEEDS; see the Scrapy documentation on FEEDS.
You need to specify the port as well.
You can specify this in settings.
See also class definition from scrapy docs
class FTPFilesStore:
FTP_USERNAME = None
FTP_PASSWORD = None
USE_ACTIVE_MODE = None
def __init__(self, uri):
if not uri.startswith("ftp://"):
raise ValueError(f"Incorrect URI scheme in {uri}, expected 'ftp'")
u = urlparse(uri)
self.port = u.port
self.host = u.hostname
self.port = int(u.port or 21)
self.username = u.username or self.FTP_USERNAME
self.password = u.password or self.FTP_PASSWORD
self.basedir = u.path.rstrip('/')

Selenium"Can't connect to HTTPS URL because the SSL module is not available

I have an anaconda environment with selenium installed. When I try to run I get this error:
Traceback (most recent call last):
File "c:\Users\Nick\Desktop\Code\product-scraper\sephora-scraper\scraper.py", line 31, in <module>
ChromeDriverManager().install(), options=options)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\webdriver_manager\chrome.py", line 34, in install
driver_path = self._get_driver_path(self.driver)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\webdriver_manager\manager.py", line 21, in _get_driver_path
driver_version = driver.get_version()
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\webdriver_manager\driver.py", line 40, in get_version
return self.get_latest_release_version()
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\webdriver_manager\driver.py", line 63, in get_latest_release_version
resp = requests.get(f"{self._latest_release_url}_{self.browser_version}")
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\requests\api.py", line 76, in get
return request('get', url, params=params, **kwargs)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\requests\sessions.py", line 542, in request
resp = self.send(prep, **send_kwargs)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\requests\sessions.py", line 655, in send
r = adapter.send(request, **kwargs)
File "C:\Users\Nick\anaconda3\envs\web-scraper\lib\site-packages\requests\adapters.py", line 514, in send
raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='chromedriver.storage.googleapis.com', port=443): Max retries exceeded with url: /LATEST_RELEASE_88.0.4324 (Caused by SSLError("Can't connect to HTTPS URL because the SSL module is not available."))
I'm new to anaconda so I don't know what else to provide. Please leave a comment if I need to anything and I will add it right away. Thanks.
Try to add this path to your environment variable:
..\Anaconda3
..\Anaconda3\scripts
..\Anaconda3\Library\bin
You might need to restart windows after set up environment path

s3fs timeout issue on an AWS Lambda function within a VPN

s3fs seems to fail from time to time when reading from an S3 bucket using an AWS Lambda function within a VPN. I am using s3fs==0.4.0 and pandas==1.0.1.
import s3fs
import pandas as pd
def lambda_handler(event, context):
bucket = event['Records'][0]['s3']['bucket']['name']
s3_file = event['Records'][0]['s3']['object']['key']
s3fs.S3FileSystem.connect_timeout = 1800
s3fs.S3FileSystem.read_timeout = 1800
with s3fs.S3FileSystem(anon=False).open(f"s3://{bucket}/{s3_file}", 'rb') as f:
self.data = pd.read_json(f, **kwargs)
The stacktrace is the following:
Traceback (most recent call last):
File "/var/task/urllib3/connection.py", line 157, in _new_conn
(self._dns_host, self.port), self.timeout, **extra_kw
File "/var/task/urllib3/util/connection.py", line 84, in create_connection
raise err
File "/var/task/urllib3/util/connection.py", line 74, in create_connection
sock.connect(sa)
TimeoutError: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/task/botocore/httpsession.py", line 263, in send
chunked=self._chunked(request.headers),
File "/var/task/urllib3/connectionpool.py", line 720, in urlopen
method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
File "/var/task/urllib3/util/retry.py", line 376, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/var/task/urllib3/packages/six.py", line 735, in reraise
raise value
File "/var/task/urllib3/connectionpool.py", line 672, in urlopen
chunked=chunked,
File "/var/task/urllib3/connectionpool.py", line 376, in _make_request
self._validate_conn(conn)
File "/var/task/urllib3/connectionpool.py", line 994, in _validate_conn
conn.connect()
File "/var/task/urllib3/connection.py", line 300, in connect
conn = self._new_conn()
File "/var/task/urllib3/connection.py", line 169, in _new_conn
self, "Failed to establish a new connection: %s" % e
urllib3.exceptions.NewConnectionError: <botocore.awsrequest.AWSHTTPSConnection object at 0x7f4d578e3ed0>: Failed to establish a new connection: [Errno 110] Connection timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/var/task/botocore/endpoint.py", line 200, in _do_get_response
http_response = self._send(request)
File "/var/task/botocore/endpoint.py", line 244, in _send
return self.http_session.send(request)
File "/var/task/botocore/httpsession.py", line 283, in send
raise EndpointConnectionError(endpoint_url=request.url, error=e)
botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "https://my_bucket.s3.eu-west-1.amazonaws.com/?list-type=2&prefix=my_folder%2Fsomething%2F&delimiter=%2F&encoding-type=url"
Has someone faced this same issue? Why would it fail only sometimes? Is there a s3fs configuration that could help for this specific issue?
Actually there was no problem at all with s3fs. Seems like we were using a Lambda function with two Subnets within the VPC and one was working normally but the other one wasn't allowed to access S3 resources, therefore when a Lambda was spawned using the second network it wouldn't be able to connect at all.
Fixing this issue was as easy as removing the second subnet.
You could also use boto3 which is supported by AWS, in order to get json from S3.
import json
import boto3
def lambda_handler(event, context):
bucket = event['Records'][0]['s3']['bucket']['name']
key = event['Records'][0]['s3']['object']['key']
s3 = boto3.resource('s3')
file_object = s3_resource.Object(bucket, key)
json_content = json.loads(file_object.get()['Body'].read())

Error downloading PDF files

I have the following (simplified) code:
import os
import scrapy
class TestSpider(scrapy.Spider):
name = 'test_spider'
start_urls = ['http://www.pdf995.com/samples/pdf.pdf', ]
def parse(self, response):
save_path = 'test'
file_name = 'test.pdf'
self.save_page(response, save_path, file_name)
def save_page(self, response, save_dir, file_name):
os.makedirs(save_dir, exist_ok=True)
with open(os.path.join(save_dir, file_name), 'wb') as afile:
afile.write(response.body)
When i run it, I get this error:
[scrapy.core.scraper] ERROR: Error downloading <GET http://www.pdf995.com/samples/pdf.pdf>
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 43, in process_request
defer.returnValue((yield download_func(request=request,spider=spider)))
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1278, in returnValue
raise _DefGen_Return(val)
twisted.internet.defer._DefGen_Return: <200 http://www.pdf995.com/samples/pdf.pdf>
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python36\lib\site-packages\twisted\internet\defer.py", line 1301, in _inlineCallbacks
result = g.send(result)
File "C:\Python36\lib\site-packages\scrapy\core\downloader\middleware.py", line 53, in process_response
spider=spider)
File "C:\Python36\lib\site-packages\scrapy_beautifulsoup\middleware.py", line 16, in process_response
return response.replace(body=str(BeautifulSoup(response.body, self.parser)))
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 79, in replace
return cls(*args, **kwargs)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 20, in __init__
self._set_body(body)
File "C:\Python36\lib\site-packages\scrapy\http\response\__init__.py", line 55, in _set_body
"Response body must be bytes. "
TypeError: Response body must be bytes. If you want to pass unicode body use TextResponse or HtmlResponse.
Do I need to introduce a middleware or something to handle this? This looks like it should be valid, at least by other examples.
Note: at the moment I'm not using a pipeline because there in my real spider I have a lot of checks on whether the related item has been scraped, validating if this pdf belongs to the item, and checking a custom name of a pdf to see if it was downloaded. And as mentioned, many samples did what I'm doing here so I thought it would be easier and work.
The issue because of your own scrapy_beautifulsoup\middleware.py which is trying to replace the return response.replace(body=str(BeautifulSoup(response.body, self.parser))).
You need to correct that and that should fix the issue

Socket error when connect to Google Big Query in my local PC

It was working fine for me to run following steps in past two days to connect to Big query in pycharm in my PC
step 1:gcloud auth application-default login
step 2 :then connect to BG in the Pycharm in my local PC.
however, when I used the same method to try today:
Below error occurs:
Since I am behind "Great Wall" in China, so can only use VPN to login in google cloud, so I am not sure if it is caused by the VPN or does it have something to do with the setting of my google'account ? however, I tried with "service-account-key", it happen to be the same issue log:
Traceback (most recent call last):
File "C:/Users/emma/PycharmProjects/GCP/INIT.py", line 134, in <module>
explicit()
File "C:/Users/emma/PycharmProjects/GCP/INIT.py", line 54, in explicit
for dataset in bigquery_client.list_datasets():
File "C:\Python27\lib\site-packages\google\cloud\iterator.py", line 218, in _items_iter
for page in self._page_iter(increment=False):
File "C:\Python27\lib\site-packages\google\cloud\iterator.py", line 247, in _page_iter
page = self._next_page()
File "C:\Python27\lib\site-packages\google\cloud\iterator.py", line 347, in _next_page
response = self._get_next_page_response()
File "C:\Python27\lib\site-packages\google\cloud\iterator.py", line 396, in _get_next_page_response
query_params=params)
File "C:\Python27\lib\site-packages\google\cloud\_http.py", line 299, in api_request
headers=headers, target_object=_target_object)
File "C:\Python27\lib\site-packages\google\cloud\_http.py", line 193, in _make_request
return self._do_request(method, url, headers, data, target_object)
File "C:\Python27\lib\site-packages\google\cloud\_http.py", line 223, in _do_request
body=data)
File "C:\Python27\lib\site-packages\google_auth_httplib2.py", line 187, in request
self._request, method, uri, request_headers)
File "C:\Python27\lib\site-packages\google\auth\credentials.py", line 121, in before_request
self.refresh(request)
File "C:\Python27\lib\site-packages\google\oauth2\service_account.py", line 310, in refresh
request, self._token_uri, assertion)
File "C:\Python27\lib\site-packages\google\oauth2\_client.py", line 143, in jwt_grant
response_data = _token_endpoint_request(request, token_uri, body)
File "C:\Python27\lib\site-packages\google\oauth2\_client.py", line 104, in _token_endpoint_request
method='POST', url=token_uri, headers=headers, body=body)
File "C:\Python27\lib\site-packages\google_auth_httplib2.py", line 116, in __call__
url, method=method, body=body, headers=headers, **kwargs)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1609, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1351, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1272, in _conn_request
conn.connect()
File "C:\Python27\lib\site-packages\httplib2\__init__.py", line 1075, in connect
raise socket.error, msg
socket.error: [Errno 10060]
I update the python-cloud library, then everything works fine now.
Hope it keep it all the time :(