Unable to load data into bigquery (BadStatusLine) - google-bigquery

I am trying to load a local file into an existing table in BigQuery. I have tried three times, on different days. The file has 1.1m rows, and I can't spot any specific error being encountered. Here are the details it spit out:
== Platform ==
CPython:2.7.4:Linux-2.6.18-308.11.1.el5.centos.plus-x86_64-with-redhat-5.8-Final
== bq version ==
v2.0.12
== Command line ==
['/opt/./python2.7.4/bin/bq', 'load', '395733598146:apache_l1.sjc_web_201304', 'x.2013-04-23']
== UTC timestamp ==
2013-05-01 18:48:17
== Error trace ==
File "build/bdist.linux-x86_64/egg/bq.py", line 652, in RunSafely
return_value = self.RunWithArgs(*args, **kwds)
File "build/bdist.linux-x86_64/egg/bq.py", line 880, in RunWithArgs
job = client.Load(table_reference, source, schema=schema, **opts)
File "build/bdist.linux-x86_64/egg/bigquery_client.py", line 1634, in Load
upload_file=upload_file, **kwds)
File "build/bdist.linux-x86_64/egg/bigquery_client.py", line 1366, in ExecuteJob
job_id=job_id)
File "build/bdist.linux-x86_64/egg/bigquery_client.py", line 1352, in RunJobSynchronously
upload_file=upload_file, job_id=job_id)
File "build/bdist.linux-x86_64/egg/bigquery_client.py", line 1346, in StartJob
projectId=project_id).execute()
File "build/bdist.linux-x86_64/egg/bigquery_client.py", line 274, in execute
return super(BigqueryHttp, self).execute(**kwds)
File "build/bdist.linux-x86_64/egg/oauth2client/util.py", line 120, in positional_wrapper
return wrapped(*args, **kwargs)
File "build/bdist.linux-x86_64/egg/apiclient/http.py", line 656, in execute
_, body = self.next_chunk(http=http)
File "build/bdist.linux-x86_64/egg/oauth2client/util.py", line 120, in positional_wrapper
return wrapped(*args, **kwargs)
File "build/bdist.linux-x86_64/egg/apiclient/http.py", line 784, in next_chunk
headers=headers)
File "build/bdist.linux-x86_64/egg/oauth2client/util.py", line 120, in positional_wrapper
return wrapped(*args, **kwargs)
File "build/bdist.linux-x86_64/egg/oauth2client/client.py", line 428, in new_request
redirections, connection_type)
File "/opt/python2.7.4/lib/python2.7/site-packages/httplib2-0.8-py2.7.egg/httplib2/__init__.py", line 1570, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "/opt/python2.7.4/lib/python2.7/site-packages/httplib2-0.8-py2.7.egg/httplib2/__init__.py", line 1317, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/opt/python2.7.4/lib/python2.7/site-packages/httplib2-0.8-py2.7.egg/httplib2/__init__.py", line 1286, in _conn_request
response = conn.getresponse()
File "/opt/python2.7.4/lib/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/opt/python2.7.4/lib/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/opt/python2.7.4/lib/python2.7/httplib.py", line 373, in _read_status
raise BadStatusLine(line)

BigQuery doesn't handle large local files uploaded directly very well. Try uploading the file to a Google Cloud Storage bucket (gs://) first, and then importing it into BigQuery from there.
Install gsutil to do the upload from the command line, or use the Google Developers Console in your web browser.
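For example, a minimal sketch of that two-step flow using the file from the command line above (the bucket name is hypothetical):
gsutil cp x.2013-04-23 gs://my-staging-bucket/
bq load 395733598146:apache_l1.sjc_web_201304 gs://my-staging-bucket/x.2013-04-23
This splits the job into a resumable upload (gsutil retries failed chunks) and a server-side import, so there is no single long-lived HTTP connection to drop.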

You can load a local file into an existing BigQuery table:
All Rows:
bq load --source_format=CSV mydataset.mytable myfile.csv col1:INTEGER,col2:STRING
Skip First Row:
bq load --skip_leading_rows=1 --source_format=CSV mydataset.mytable myfile.csv col1:INTEGER,col2:STRING
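If the destination table already exists (as in the original question), you should be able to omit the schema argument entirely, and bq will use the table's current schema:
bq load --source_format=CSV mydataset.mytable myfile.csv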

Related

Error fetching chromium with the fetch command

When I try to fetch chromium with the command, it gives me a whole bunch of errors and at the end says "FileNotFoundError: [WinError 2] The system cannot find the file specified".
After the error occurred (output below), I tried adding DEPOT_TOOLS_WIN_TOOLCHAIN to the system environment variables, but after a restart nothing changed. Does anyone have a solution to this?
Error Output:
[0:00:00] Started.
Traceback (most recent call last):
File "C:\src\depot_tools\metrics.py", line 301, in print_notice_and_exit
yield
File "C:\src\depot_tools\gclient.py", line 3495, in <module>
sys.exit(main(sys.argv[1:]))
File "C:\src\depot_tools\gclient.py", line 3481, in main
return dispatcher.execute(OptionParser(), argv)
File "C:\src\depot_tools\subcommand.py", line 252, in execute
return command(parser, args[1:])
File "C:\src\depot_tools\gclient.py", line 3032, in CMDsync
ret = client.RunOnDeps('update', args)
File "C:\src\depot_tools\gclient.py", line 2026, in RunOnDeps
work_queue.flush(revision_overrides,
File "C:\src\depot_tools\gclient_utils.py", line 1016, in flush
reraise(e[0], e[1], e[2])
File "C:\src\depot_tools\gclient_utils.py", line 70, in reraise
raise value
File "C:\src\depot_tools\gclient_utils.py", line 1093, in run
self.item.run(*self.args, **self.kwargs)
File "C:\src\depot_tools\gclient.py", line 1005, in run
self._got_revision = self._used_scm.RunCommand(command, options, args,
File "C:\src\depot_tools\gclient_scm.py", line 128, in RunCommand
return getattr(self, command)(options, args, file_list)
File "C:\src\depot_tools\gclient_scm.py", line 610, in update
mirror = self._GetMirror(url, options, revision, revision_ref)
File "C:\src\depot_tools\gclient_scm.py", line 1060, in _GetMirror
if not self.cache_dir:
File "C:\src\depot_tools\gclient_scm.py", line 225, in cache_dir
return git_cache.Mirror.GetCachePath()
File "C:\src\depot_tools\git_cache.py", line 193, in GetCachePath
cachepath = subprocess.check_output(
File "C:\src\depot_tools\.cipd_bin\3.8\bin\Lib\subprocess.py", line 415, in check_output
return run(*popenargs, stdout=PIPE, timeout=timeout, check=True,
File "C:\src\depot_tools\.cipd_bin\3.8\bin\Lib\subprocess.py", line 493, in run
with Popen(*popenargs, **kwargs) as process:
File "C:\src\depot_tools\.cipd_bin\3.8\bin\Lib\subprocess.py", line 858, in __init__
self._execute_child(args, executable, preexec_fn, close_fds,
File "C:\src\depot_tools\.cipd_bin\3.8\bin\Lib\subprocess.py", line 1311, in _execute_child
hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
FileNotFoundError: [WinError 2] The system cannot find the file specified
Subprocess failed with return code 1.
You should delete the DEPOT_TOOLS_UPDATE setting from your environment.
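For example, to clear it for the current Command Prompt session before re-running fetch (use System Properties > Environment Variables to remove it permanently):
set DEPOT_TOOLS_UPDATE=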

Exception "Resource temporarily unavailable" while firing 50 requests checking whether snapshots exist

I created snapshots of 50 disks. Because gcloud does not allow snapshots with the same name, before firing each snapshot create I checked whether the snapshot already exists in GCP. I fired all 50 requests simultaneously, and 5-6 of them failed with the exception below.
snapshots().get(project=self.project, snapshot=name).execute()
Exception:
File "/tmp/cloudpoint/libs/gcp/lib/oauth2client/_helpers.py", line 133, in positional_wrapper
return wrapped(*args, **kwargs)
File "/tmp/cloudpoint/libs/gcp/lib/googleapiclient/http.py", line 837, in execute
method=str(self.method), body=self.body, headers=self.headers)
File "/tmp/cloudpoint/libs/gcp/lib/googleapiclient/http.py", line 163, in _retry_request
resp, content = http.request(uri, method, *args, **kwargs)
File "/tmp/cloudpoint/libs/gcp/lib/oauth2client/transport.py", line 175, in new_request
redirections, connection_type)
File "/tmp/cloudpoint/libs/gcp/lib/oauth2client/transport.py", line 282, in request
connection_type=connection_type)
File "/usr/lib/python2.7/dist-packages/httplib2/__init__.py", line 1607, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "/usr/lib/python2.7/dist-packages/httplib2/__init__.py", line 1349, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/usr/lib/python2.7/dist-packages/httplib2/__init__.py", line 1305, in _conn_request
response = conn.getresponse()
File "/usr/lib/python2.7/httplib.py", line 1136, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "/usr/lib/python2.7/socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
File "/usr/lib/python2.7/ssl.py", line 756, in recv
return self.read(buflen)
File "/usr/lib/python2.7/ssl.py", line 643, in read
v = self._sslobj.read(len)
error: [Errno 11] Resource temporarily unavailable
The error message "Resource temporarily unavailable" means that the Compute Engine API could not fulfill the request. Because you fired 50 simultaneous existence checks, the API could not service all 50 at once, and 5-6 of them timed out.
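If you can't avoid the fan-out, retrying the check with exponential backoff works around the transient failures. A minimal sketch, assuming the usual googleapiclient `compute` service object (the helper name and retry budget are my own, not from the question):
import time

from googleapiclient.errors import HttpError

def snapshot_exists(compute, project, name, retries=5):
    """Hypothetical helper: existence check with exponential backoff."""
    for attempt in range(retries):
        try:
            compute.snapshots().get(project=project, snapshot=name).execute()
            return True
        except HttpError as e:
            if e.resp.status == 404:
                return False  # snapshot genuinely does not exist
            raise  # other API errors are not retried here
        except IOError:
            # Transient socket-level errors such as [Errno 11] (EAGAIN);
            # sleep 1s, 2s, 4s, ... before trying again.
            time.sleep(2 ** attempt)
    raise RuntimeError("existence check failed after %d attempts" % retries)
Throttling the 50 checks through a small worker pool instead of firing them all at once avoids the burst entirely.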

AttributeError: 'S3File' object has no attribute 'getvalue', while running to_csv

I'm running the to_csv command as follows, writing to an output file in an S3 bucket with ServerSideEncryption enabled:
to_csv("s3://mys3bucket/result.csv",
storage_option={'s3_additional_kwargs':
{'ServerSideEncryption': 'AES256'}})
I'm getting the following attribute error:
File "/usr/lib/python2.7/site-packages/dask/dataframe/core.py", line 1091, in to_csv
return to_csv(self, filename, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 577, in to_csv
delayed(values).compute(get=get, scheduler=scheduler)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 156, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/usr/lib/python2.7/site-packages/dask/base.py", line 400, in compute
results = schedule(dsk, keys, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 2159, in get
direct=direct)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1562, in gather
asynchronous=asynchronous)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 652, in sync
return sync(self.loop, func, *args, **kwargs)
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 275, in sync
six.reraise(*error[0])
File "/usr/lib/python2.7/site-packages/distributed/utils.py", line 260, in f
result[0] = yield make_coro()
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1099, in run
value = future.result()
File "/usr/lib64/python2.7/site-packages/tornado/concurrent.py", line 260, in result
raise_exc_info(self._exc_info)
File "/usr/lib64/python2.7/site-packages/tornado/gen.py", line 1107, in run
yielded = self.gen.throw(*exc_info)
File "/usr/lib/python2.7/site-packages/distributed/client.py", line 1439, in _gather
traceback)
File "/usr/lib/python2.7/site-packages/dask/dataframe/io/csv.py", line 439, in _to_csv_chunk
df.to_csv(f, **kwargs)
File "/usr/lib64/python2.7/site-packages/pandas/core/frame.py", line 1745, in to_csv
formatter.save()
File "/usr/lib64/python2.7/site-packages/pandas/io/formats/csvs.py", line 161, in save
buf = f.getvalue()
File "/usr/lib/python2.7/site-packages/dask/bytes/utils.py", line 136, in __getattr__
return getattr(self.file, key)
AttributeError: 'S3File' object has no attribute 'getvalue'
I searched for this error, but couldn't find a relevant solution.
Do you have any idea?

Scrapyd S3 feed export "Connection Reset by Peer"

I'm running Scrapyd with FEED_URI set to export to S3, but I received the following error at the very end of my scrape. Note that it successfully uploaded a few hundred KB of data to the bucket as the scrape began, then threw this error at the end:
2014-11-24 10:11:23+0000 [word] ERROR: Error storing csv feed (2285242 items) in: s3://kitchen.bucket/FoodItem.csv
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 783, in __bootstrap
self.__bootstrap_inner()
File "/usr/lib/python2.7/threading.py", line 810, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 763, in run
self.__target(*self.__args, **self.__kwargs)
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/python/threadpool.py", line 191, in _worker
result = context.call(ctx, function, *args, **kwargs)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 118, in callWithContext
return self.currentContext().callWithContext(ctx, func, *args, **kw)
File "/usr/lib/python2.7/dist-packages/twisted/python/context.py", line 81, in callWithContext
return func(*args,**kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/feedexport.py", line 101, in _store_in_thread
key.set_contents_from_file(file)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 1291, in set_contents_from_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 748, in send_file
chunked_transfer=chunked_transfer, size=size)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 949, in _send_file_internal
query_args=query_args
File "/usr/local/lib/python2.7/dist-packages/boto/s3/connection.py", line 664, in make_request
retry_handler=retry_handler
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 1068, in make_request
retry_handler=retry_handler)
File "/usr/local/lib/python2.7/dist-packages/boto/connection.py", line 939, in _mexe
request.body, request.headers)
File "/usr/local/lib/python2.7/dist-packages/boto/s3/key.py", line 842, in sender
http_conn.send(chunk)
File "/usr/lib/python2.7/httplib.py", line 805, in send
self.sock.sendall(data)
File "/usr/lib/python2.7/ssl.py", line 329, in sendall
v = self.send(data[count:])
File "/usr/lib/python2.7/ssl.py", line 298, in send
v = self._sslobj.write(data)
socket.error: [Errno 104] Connection reset by peer
Looks similar to boto issue 2207. I'm using gbirke's MultiFeedExporter, and received a similar error on both of my items.

SSL3_GET_RECORD:wrong version number when loading to bigquery table from google cloud storage

When loading into a BigQuery table from files in Google Cloud Storage, I keep getting this SSL3_GET_RECORD:wrong version number exception.
Yet when I eventually check the job history on the Google BigQuery web page, the load job shows that it succeeded.
Could you please help? Thank you.
Here is the error message I am getting:
========================================
== Platform ==
CPython:2.7.6:Linux-2.6.18-194.32.1.el5-x86_64-with-redhat-5.5-Final
== bq version ==
2.0.18
== Command line ==
['/opt/google-cloud-sdk/platform/bq/bq.py', '--credential_file', '/offworld/hornet/.config/gcloud/legacy_credentials/clok#vindicotech.com/singlestore.json', '--project', 'formal-cascade-571', 'load', '--source_format=NEWLINE_DELIMITED_JSON', 'dw_sandbox.impressions_20140603', 'gs://dw_sandbox/impressions/20140603/20140604175042285_20140604195938608_20140603_0_*', '/offworld/specificmedia/logsTobq/schemas/impressionsSchema.txt']
== UTC timestamp ==
2014-06-05 01:19:06
== Error trace ==
File "/opt/google-cloud-sdk/platform/bq/bq.py", line 779, in RunSafely
return_value = self.RunWithArgs(*args, **kwds)
File "/opt/google-cloud-sdk/platform/bq/bq.py", line 1020, in RunWithArgs
job = client.Load(table_reference, source, schema=schema, **opts)
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 2011, in Load
upload_file=upload_file, **kwds)
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 1611, in ExecuteJob
job_id=job_id)
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 1599, in RunJobSynchronously
result = self.WaitJob(job_reference)
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 1713, in WaitJob
done, job = self.PollJob(job_reference, status=status, wait=wait)
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 1752, in PollJob
job = self.apiclient.jobs().get(**dict(job_reference)).execute()
File "/opt/google-cloud-sdk/platform/bq/bigquery_client.py", line 307, in execute
return super(BigqueryHttp, self).execute(**kwds)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/oauth2client/util.py", line 132, in positional_wrapper
return wrapped(*args, **kwargs)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/apiclient/http.py", line 716, in execute
body=self.body, headers=self.headers)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/oauth2client/util.py", line 132, in positional_wrapper
return wrapped(*args, **kwargs)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/oauth2client/client.py", line 490, in new_request
redirections, connection_type)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/httplib2/__init__.py", line 1586, in request
(response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/httplib2/__init__.py", line 1333, in _request
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
File "/opt/google-cloud-sdk/bin/bootstrapping/../../lib/httplib2/__init__.py", line 1289, in _conn_request
response = conn.getresponse()
File "/usr/lib64/python2.7/httplib.py", line 1045, in getresponse
response.begin()
File "/usr/lib64/python2.7/httplib.py", line 409, in begin
version, status, reason = self._read_status()
File "/usr/lib64/python2.7/httplib.py", line 365, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "/usr/lib64/python2.7/socket.py", line 476, in readline
data = self._sock.recv(self._rbufsize)
File "/usr/lib64/python2.7/ssl.py", line 241, in recv
return self.read(buflen)
File "/usr/lib64/python2.7/ssl.py", line 160, in read
return self._sslobj.read(len)
========================================
Unexpected exception in load operation: [Errno 1] _ssl.c:1426:
error:1408F10B:SSL routines:SSL3_GET_RECORD:wrong version number
Is there any chance that you're using the same HTTP object in multiple threads? I.e., the thread you create the job in is not necessarily the same thread you poll for completion in? If so, this came up internally within Google today, and this was the fix:
class _HTTPFactoryWrapper(object):
  """Wraps a request factory so that each request returns a new http object.

  API client's Http object is not threadsafe since calls to the same domain will
  reuse the same HTTPConnection. If one API call is outstanding then a second
  will try to send a request over the same domain. This causes chaos that
  seems to surface itself as SSLErrors during processing.
  """

  def __init__(self, factory):
    self.factory = factory

  def request(self, *args, **kwargs):
    return self.factory.Create().request(*args, **kwargs)
Then change the creation of the BigQuery stub from:
return discovery.build(api_name, api_version, http=http_factory.Create())
to:
http_wrapper = _HTTPFactoryWrapper(http_factory)
return discovery.build(api_name, api_version, http=http_wrapper)
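The wrapper works because discovery.build() only needs an object exposing an httplib2-compatible request() method; by deferring factory.Create() to call time, every request (and therefore every thread) gets its own connection. If you don't already have a factory object, a stand-in can be as simple as this sketch (the class name is hypothetical):
import httplib2

class _FreshHttpFactory(object):
  def Create(self):
    # A brand-new Http object per call, so no HTTPConnection
    # is ever shared between threads.
    return httplib2.Http()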