Writing to BigQuery from within a ParDo function - google-bigquery

I would like to call a beam.io.Write(beam.io.BigQuerySink(..)) operation from within a ParDo function to generate a separate BigQuery table for each key in the PCollection (I'm using the Python SDK). Here are two similar threads, which unfortunately didn't help:
1) https://stackoverflow.com/questions/31156774/about-key-grouping-with-groupbykey
2) Dynamic table name when writing to BQ from dataflow pipelines
When I execute the following code, the rows for the first key get inserted into BigQuery and then the pipeline fails with the error below. I would really appreciate any suggestions on what I'm doing wrong or how to fix it.
Pipeline code:
rows = p | 'read_bq_table' >> beam.io.Read(beam.io.BigQuerySource(query=query))

class par_upload(beam.DoFn):
    def process(self, context):
        key, value = context.element
        ### This block causes issues ###
        value | 'write_to_bq' >> beam.io.Write(
            beam.io.BigQuerySink(
                'PROJECT-NAME:analytics.first_table',  # will be replaced by a dynamic name based on key
                schema=schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
            )
        )
        ### End block ######
        return [value]

### Following part works fine ###
filtered = (rows | 'filter_rows' >> beam.Filter(lambda row: row['topic'] == 'analytics')
                 | 'apply_projection' >> beam.Map(apply_projection, projection_fields)
                 | 'group_by_key' >> beam.GroupByKey()
                 | 'par_upload_to_bigquery' >> beam.ParDo(par_upload())
                 | 'flat_map' >> beam.FlatMap(lambda l: l)  # this step is just for testing
           )

### This part works fine if I comment out the 'write_to_bq' block above
filtered | 'write_to_bq' >> beam.io.Write(
    beam.io.BigQuerySink(
        'PROJECT-NAME:analytics.another_table',
        schema=schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
)
Error message:
INFO:oauth2client.client:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Attempting refresh to obtain initial access_token
INFO:root:Writing 1 rows to PROJECT-NAME:analytics.first_table table.
INFO:root:Final: Debug counters: {'element_counts': Counter({'CreatePInput0': 1, 'write_to_bq/native_write': 1})}
ERROR:root:Error while visiting par_upload_to_bigquery
Traceback (most recent call last):
File "split_events.py", line 137, in <module>
run()
File "split_events.py", line 132, in run
p.run()
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/pipeline.py", line 159, in run
return self.runner.run(self)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/direct_runner.py", line 102, in run
super(DirectPipelineRunner, self).run(pipeline)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/runner.py", line 98, in run
pipeline.visit(RunVisitor(self))
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/pipeline.py", line 182, in visit
self._root_transform().visit(visitor, self, visited)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/pipeline.py", line 419, in visit
part.visit(visitor, pipeline, visited)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/pipeline.py", line 422, in visit
visitor.visit_transform(self)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/runner.py", line 93, in visit_transform
self.runner.run_transform(transform_node)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/runner.py", line 168, in run_transform
return m(transform_node)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/direct_runner.py", line 98, in func_wrapper
func(self, pvalue, *args, **kwargs)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/direct_runner.py", line 180, in run_ParDo
runner.process(v)
File "apache_beam/runners/common.py", line 133, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:4483)
File "apache_beam/runners/common.py", line 139, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:4311)
File "apache_beam/runners/common.py", line 150, in apache_beam.runners.common.DoFnRunner.reraise_augmented (apache_beam/runners/common.c:4677)
File "apache_beam/runners/common.py", line 137, in apache_beam.runners.common.DoFnRunner.process (apache_beam/runners/common.c:4245)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/typehints/typecheck.py", line 149, in process
return self.run(self.dofn.process, context, args, kwargs)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/typehints/typecheck.py", line 134, in run
result = method(context, *args, **kwargs)
File "split_events.py", line 73, in process
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/transforms/ptransform.py", line 724, in __ror__
return self.transform.__ror__(pvalueish, self.label)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/transforms/ptransform.py", line 445, in __ror__
return _MaterializePValues(cache).visit(result)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/transforms/ptransform.py", line 105, in visit
return self._pvalue_cache.get_unwindowed_pvalue(node)
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/runner.py", line 262, in get_unwindowed_pvalue
return [v.value for v in self.get_pvalue(pvalue)]
File "/Users/dimitri/anaconda/lib/python2.7/site-packages/apache_beam/runners/runner.py", line 244, in get_pvalue
value_with_refcount = self._cache[self.key(pvalue)]
KeyError: "(4384177040, None) [while running 'par_upload_to_bigquery']"
Edit (after the first answer):
I didn't realise my value needs to be a PCollection.
I've changed my code to this now (which is probably very inefficient):
key_pipe = p | 'pipe_' + key >> beam.Create(value)
key_pipe | 'write_' + key >> beam.io.Write(beam.io.BigQuerySink(..))
This now works fine locally, but not with BlockingDataflowPipelineRunner :-(
The pipeline fails with the following error:
JOB_MESSAGE_ERROR: (979394c29490e588): Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/dataflow_worker/batchworker.py", line 474, in do_work
work_executor.execute()
File "dataflow_worker/executor.py", line 901, in dataflow_worker.executor.MapTaskExecutor.execute (dataflow_worker/executor.c:24331)
op.start()
File "dataflow_worker/executor.py", line 465, in dataflow_worker.executor.DoOperation.start (dataflow_worker/executor.c:14193)
def start(self):
File "dataflow_worker/executor.py", line 469, in dataflow_worker.executor.DoOperation.start (dataflow_worker/executor.c:13499)
fn, args, kwargs, tags_and_types, window_fn = (
ValueError: too many values to unpack (expected 5)

In the similar threads, the only suggestion for doing BigQuery write operations in a ParDo was to use the BigQuery API directly or to use a client library.
The code you wrote applies a Beam write transform (beam.io.Write with beam.io.BigQuerySink()) inside a DoFn. Write transforms expect to be applied to a PCollection, such as filtered in the working code example, which is not the case for the non-functioning code applied to value.
I think the easiest option would be to take a look at the gcloud-python BigQuery insert_data() function and call it from inside your ParDo.
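A minimal sketch of that idea, written against the current google-cloud-bigquery client (insert_rows_json) rather than the older insert_data() call; the class name, per-key table-naming scheme, and error handling are placeholders, and streaming inserts assume the target tables already exist:
import apache_beam as beam
from google.cloud import bigquery

class WriteRowsPerKey(beam.DoFn):
    # Hypothetical DoFn: writes each key's rows with the BigQuery client
    # instead of applying a Beam sink inside process().

    def start_bundle(self):
        # Create the client on the worker; it is not safe to pickle.
        self.client = bigquery.Client()

    def process(self, element):
        key, rows = element
        # Placeholder naming scheme: one table per key.
        table_id = 'PROJECT-NAME.analytics.table_{}'.format(key)
        errors = self.client.insert_rows_json(table_id, list(rows))
        if errors:
            raise RuntimeError('BigQuery insert failed: {}'.format(errors))
        yield element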

Related

TensorFlow Federated (TFF) TypeError in tff.templates.IterativeProcess.next() when clients_per_round exceeds 99

I implemented a custom federated learning GAN training loop with TFF similar to this code by Google Research.
The client data for a particular training round is found using the following code snippet:
def client_dataset_fn():
    # Sample clients and data
    sampled_clients = np.random.choice(train_data.client_ids, size=cfg.clients_per_round, replace=False)
    datasets = [(next(client_gen_inputs_iterator),
                 train_data.create_tf_dataset_for_client(client_id).take(cfg.n_critic))
                for client_id in sampled_clients]
    return datasets

client_noise_inputs, client_real_data = zip(*client_dataset_fn())
This works perfectly up until cfg.clients_per_round is set to 99. When it is set to 100 or a larger value (with the total number of clients being larger of course), I receive the following error:
Traceback (most recent call last):
File "main.py", line 109, in main
metrics = run_single_trial(train_data, test_data, cfg)
File "/mnt/workspace/tff/GAN/federated/fedgan_main.py", line 73, in run_single_trial
metrics = train_loop(iterative_process, server_dataset_fn, client_dataset_fn, model, eval_hook_fn, cfg)
File "/mnt/workspace/tff/GAN/federated/fedgan_main.py", line 124, in train_loop
client_real_data)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/computation/function_utils.py", line 525, in __call__
return context.invoke(self, arg)
File "/usr/local/lib/python3.6/dist-packages/retrying.py", line 49, in wrapped_f
return Retrying(*dargs, **dkw).call(f, *args, **kw)
File "/usr/local/lib/python3.6/dist-packages/retrying.py", line 206, in call
return attempt.get(self._wrap_exception)
File "/usr/local/lib/python3.6/dist-packages/retrying.py", line 247, in get
six.reraise(self.value[0], self.value[1], self.value[2])
File "/usr/local/lib/python3.6/dist-packages/six.py", line 703, in reraise
raise value
File "/usr/local/lib/python3.6/dist-packages/retrying.py", line 200, in call
attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/execution_context.py", line 226, in invoke
_ingest(executor, unwrapped_arg, arg.type_signature)))
File "/usr/lib/python3.6/asyncio/base_events.py", line 484, in run_until_complete
return future.result()
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/tracing.py", line 396, in _wrapped
return await coro
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/execution_context.py", line 111, in _ingest
ingested = await asyncio.gather(*ingested)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/execution_context.py", line 116, in _ingest
return await executor.create_value(val, type_spec)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/tracing.py", line 201, in async_trace
result = await fn(*fn_args, **fn_kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/reference_resolving_executor.py", line 294, in create_value
value, type_spec))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/tracing.py", line 201, in async_trace
result = await fn(*fn_args, **fn_kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/thread_delegating_executor.py", line 111, in create_value
self._target_executor.create_value(value, type_spec))
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/thread_delegating_executor.py", line 105, in _delegate
result_value = await _delegate_with_trace_ctx(coro, self._event_loop)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/tracing.py", line 396, in _wrapped
return await coro
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/tracing.py", line 201, in async_trace
result = await fn(*fn_args, **fn_kwargs)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/federating_executor.py", line 394, in create_value
return await self._strategy.compute_federated_value(value, type_spec)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/core/impl/executors/federated_composing_strategy.py", line 279, in compute_federated_value
py_typecheck.check_type(value, list)
File "/usr/local/lib/python3.6/dist-packages/tensorflow_federated/python/common_libs/py_typecheck.py", line 41, in check_type
type_string(type_spec), type_string(type(target))))
TypeError: Expected list, found tuple.
During debugging, I looked at the target variable in the final line of the traceback and found it to be the abovementioned client_real_data and client_noise_inputs. Their types are in fact tuples, not lists; however, this does not change with different values of cfg.clients_per_round. The only usage of cfg.clients_per_round is shown above in the random choice.
I really cannot explain why this is happening; maybe somebody out there has experienced something similar and can help me out.
My used package versions are as follows:
Python 3.6.9 or 3.8.10 (checked both)
tensorflow 2.5.1
tensorflow-federated 0.19.0
retrying 1.3.3
six 1.15.0
As a workaround I now manually change the data type of client_noise_inputs and client_real_data using list(tuple_var), but I am still curious as to why a list is required at all.
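For reference, the workaround amounts to coercing the zipped tuples to lists before handing them to the iterative process (variable names follow the snippet above):
# Coerce the tuples produced by zip(*...) into the lists that the
# federated_composing_strategy expects once 100+ clients are sampled.
client_noise_inputs, client_real_data = zip(*client_dataset_fn())
client_noise_inputs = list(client_noise_inputs)
client_real_data = list(client_real_data)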
(Copying and pasting from original on GitHub)
This seems to me to be an implementation distinction between the federated_composing_strategy and the federated_resolving_strategy. IIRC, by default we don't inject a composing executor into your stack until you hit 100 clients--which would be the source of this exciting mystery.
In particular, the composing strategy is programmed against the assumption that the incoming clients-placed value is represented as a list, whereas the resolving strategy codes against a much more flexible set of containers.
It's not wild to coerce your clients-placed value to a list--we could also extend the permitted representation of clients-placed values in the composing executor to match that in the resolving one, possibly pulling the appropriate logic out to a shared place like here. I think it's a contribution we'd be very happy to accept if you're up for it!

Delete bulk rows in Big Query table based on list of unique ids

So I tried to delete some rows in a BigQuery table with a simple query like this:
client = bigquery.Client()
query = "DELETE FROM Sample_Dataset.Sample_Table WHERE Sample_Table_Id IN {}".format(primary_key_list)
query_job = client.query(query, location="us-west1")
Where primary_key_list is a Python list containing the Sample_Table unique ids, like: [123, 124, 125, ...]
Things work fine with the small amounts of data I retrieve, but when primary_key_list grows, it gives me an error:
The query is too large. The maximum standard SQL query length is
1024.00K characters, including comments and white space characters.
I realize the query is long enough to hit the maximum query length, and after searching on Stack Overflow I found a suggestion to use parameterized queries, so I changed my code to this:
client = bigquery.Client()
query = "DELETE FROM Sample_Dataset.Sample_Table WHERE Sample_Table_Id IN UNNEST(@List_Sample_Table_Id)"
job_config = bigquery.QueryJobConfig(
    query_parameters=[
        bigquery.ArrayQueryParameter("List_Sample_Table_Id", "INT64", primary_key_list),
    ]
)
query_job = client.query(query, job_config=job_config)
It stops complaining about the maximum standard SQL query length, but raises another exception instead. Any idea how to delete bulk rows from BigQuery?
I don't know if this information is useful or not, but I'm running this Python code on Google Cloud Dataproc, and this is just a function I recently added; before I added it everything was working. Here's the log I got from running the delete with parameterized queries.
Traceback (most recent call last):
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 320, in _send_until_done
return self.connection.send(data)
File "/opt/conda/default/lib/python3.7/site-packages/OpenSSL/SSL.py", line 1737, in send
self._raise_ssl_error(self._ssl, result)
File "/opt/conda/default/lib/python3.7/site-packages/OpenSSL/SSL.py", line 1639, in _raise_ssl_error
raise SysCallError(errno, errorcode.get(errno))
OpenSSL.SSL.SysCallError: (32, 'EPIPE')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/connectionpool.py", line 354, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1290, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1239, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1065, in _send_output
self.send(chunk)
File "/opt/conda/default/lib/python3.7/http/client.py", line 987, in send
self.sock.sendall(data)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 331, in sendall
sent = self._send_until_done(data[total_sent:total_sent + SSL_WRITE_BLOCKSIZE])
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 326, in _send_until_done
raise SocketError(str(e))
OSError: (32, 'EPIPE')
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/opt/conda/default/lib/python3.7/site-packages/requests/adapters.py", line 449, in send
timeout=timeout
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/connectionpool.py", line 638, in urlopen
_stacktrace=sys.exc_info()[2])
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/util/retry.py", line 368, in increment
raise six.reraise(type(error), error, _stacktrace)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/packages/six.py", line 685, in reraise
raise value.with_traceback(tb)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
chunked=chunked)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/connectionpool.py", line 354, in _make_request
conn.request(method, url, **httplib_request_kw)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1244, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1290, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1239, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/opt/conda/default/lib/python3.7/http/client.py", line 1065, in _send_output
self.send(chunk)
File "/opt/conda/default/lib/python3.7/http/client.py", line 987, in send
self.sock.sendall(data)
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 331, in sendall
sent = self._send_until_done(data[total_sent:total_sent + SSL_WRITE_BLOCKSIZE])
File "/opt/conda/default/lib/python3.7/site-packages/urllib3/contrib/pyopenssl.py", line 326, in _send_until_done
raise SocketError(str(e))
urllib3.exceptions.ProtocolError: ('Connection aborted.', OSError("(32, 'EPIPE')"))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tmp/a05a15822be04a9abdc1ba05e317bb2f/ItemHistory-get.py", line 92, in <module>
delete_duplicate_pk()
File "/tmp/a05a15822be04a9abdc1ba05e317bb2f/ItemHistory-get.py", line 84, in delete_duplicate_pk
query_job = client.query(query, job_config=job_config, location="asia-southeast2")
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 2893, in query
query_job._begin(retry=retry, timeout=timeout)
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/bigquery/job/query.py", line 1069, in _begin
super(QueryJob, self)._begin(client=client, retry=retry, timeout=timeout)
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/bigquery/job/base.py", line 438, in _begin
timeout=timeout,
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/bigquery/client.py", line 643, in _call_api
return call()
File "/opt/conda/default/lib/python3.7/site-packages/google/api_core/retry.py", line 286, in retry_wrapped_func
on_error=on_error,
File "/opt/conda/default/lib/python3.7/site-packages/google/api_core/retry.py", line 184, in retry_target
return target()
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/_http.py", line 434, in api_request
timeout=timeout,
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/_http.py", line 292, in _make_request
method, url, headers, data, target_object, timeout=timeout
File "/opt/conda/default/lib/python3.7/site-packages/google/cloud/_http.py", line 330, in _do_request
url=url, method=method, headers=headers, data=data, timeout=timeout
File "/opt/conda/default/lib/python3.7/site-packages/google/auth/transport/requests.py", line 470, in request
**kwargs
File "/opt/conda/default/lib/python3.7/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/opt/conda/default/lib/python3.7/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/opt/conda/default/lib/python3.7/site-packages/requests/adapters.py", line 498, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', OSError("(32, 'EPIPE')"))
As @blackbishop mentions, you can try upgrading requests to the latest version (in my case it solved the problem), but since I was trying to update bulk rows (say 500,000+ rows in BigQuery, where every row has a unique id), it turned out to time out on the small machine type of the Dataproc cluster I used (if someone has the resources to try with a better Dataproc cluster and succeeds, please feel free to edit this answer).
So I went with the MERGE statement as documented here:
https://cloud.google.com/bigquery/docs/reference/standard-sql/dml-syntax#merge_statement
The script to update existing rows would look like this (assuming I have already retrieved the new data and loaded it into Staging_Sample_Dataset.Staging_Sample_Table):
def merge_data():
    client = bigquery.Client()
    query = """MERGE INTO Sample_Dataset.Sample_Table st
               USING Staging_Sample_Dataset.Staging_Sample_Table ss
               ON st.Sample_Table_Id = ss.Sample_Table_Id
               WHEN MATCHED THEN UPDATE SET st.Your_Column = ss.Your_Column  -- and the list goes on...
               WHEN NOT MATCHED THEN INSERT ROW
            """
    query_job = client.query(query, location="asia-southeast2")
    results = query_job.result()
Or I can bulk-delete the rows and call another function to load the data after this one has executed:
def bulk_delete():
    client = bigquery.Client()
    query = """MERGE INTO Sample_Dataset.Sample_Table st
               USING Staging_Sample_Dataset.Staging_Sample_Table sst
               ON st.Sample_Table_Id = sst.Sample_Table_Id
               WHEN MATCHED THEN DELETE
            """
    query_job = client.query(query, location="asia-southeast2")
    results = query_job.result()
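A minimal sketch of the follow-up load step mentioned above, assuming the staged rows should simply be copied back into the main table (table names reuse the placeholders from the snippets above, and the two schemas are assumed to match):
def reload_from_staging():
    # Hypothetical follow-up to bulk_delete(): copy the staged rows back into
    # the main table once the old versions have been removed.
    client = bigquery.Client()
    query = """INSERT INTO Sample_Dataset.Sample_Table
               SELECT * FROM Staging_Sample_Dataset.Staging_Sample_Table
            """
    query_job = client.query(query, location="asia-southeast2")
    return query_job.result()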
The concept is to loop over the array of ids you have and run a delete for each one of them. This is just the structure of the idea, so it may need more than a little tweaking:
client = bigquery.Client()
for pk in primary_key_list:
    query = "DELETE FROM Sample_Dataset.Sample_Table WHERE Sample_Table_Id = {}".format(pk)
    query_job = client.query(query, location="us-west1")
    query_job.result()

Running into uvloop issues with Database queries from Rasa-X?

I'm trying to make a simple query to my Amazon Neptune database from Rasa-X.
Here is the code from my actions.py:
class ActionQueryDietary(Action):
    def name(self) -> Text:
        return "action_query_dietary"

    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker, domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        available = False
        restaurant = "XXXX"
        dietaryQuestion = tracker.get_slot('dietaryQuestion')
        g, remoteConn = kb.openConnection()
        dietary = kb.getDietary(g, restaurant, dietaryQuestion)
        if dietary == "Yes":
            available = True
        if available == True:
            dispatcher.utter_message(text="According to our knowledge base, {} is on the menu".format(dietaryQuestion))
        else:
            dispatcher.utter_message(text="Sorry, according to our knowledge base, we don't have this option on the menu")
        kb.closeConnection(remoteConn)
        return []
and here is the code from knowledgebase.py:
def getDietary(g, restaurant, dietary):
    properties = queryRestaurantProperties(g, restaurant)
    result = properties[dietary]
    print(result)
    return result
but any query to the knowledgebase results in this error:
2020-10-22T18:01:22.347345241Z File "/opt/venv/lib/python3.7/site-packages/sanic/app.py", line 973, in handle_request
2020-10-22T18:01:22.347351643Z response = await response
2020-10-22T18:01:22.347357446Z File "/app/rasa_sdk/endpoint.py", line 102, in webhook
2020-10-22T18:01:22.347363522Z result = await executor.run(action_call)
2020-10-22T18:01:22.347369398Z File "/app/rasa_sdk/executor.py", line 392, in run
2020-10-22T18:01:22.347375473Z events = action(dispatcher, tracker, domain)
2020-10-22T18:01:22.347381348Z File "/app/actions/actions.py", line 33, in run
2020-10-22T18:01:22.347387063Z dietary = kb.getDietary(g, restaurant, dietaryQuestion)
2020-10-22T18:01:22.347392835Z File "/app/actions/knowledgebase.py", line 117, in getDietary
2020-10-22T18:01:22.347399112Z properties = queryRestaurantProperties(g, restaurant)
2020-10-22T18:01:22.347405111Z File "/app/actions/knowledgebase.py", line 86, in queryRestaurantProperties
2020-10-22T18:01:22.347411869Z result = g.V().has('name', restaurant).valueMap().next()
2020-10-22T18:01:22.347418048Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/process/traversal.py", line 89, in next
2020-10-22T18:01:22.347424293Z return self.__next__()
2020-10-22T18:01:22.347430069Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/process/traversal.py", line 48, in __next__
2020-10-22T18:01:22.347436333Z self.traversal_strategies.apply_strategies(self)
2020-10-22T18:01:22.347441940Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/process/traversal.py", line 573, in apply_strategies
2020-10-22T18:01:22.347447667Z traversal_strategy.apply(traversal)
2020-10-22T18:01:22.347453031Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/remote_connection.py", line 149, in apply
2020-10-22T18:01:22.347459352Z remote_traversal = self.remote_connection.submit(traversal.bytecode)
2020-10-22T18:01:22.347465069Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/driver_remote_connection.py", line 55, in submit
2020-10-22T18:01:22.347486749Z result_set = self._client.submit(bytecode)
2020-10-22T18:01:22.347493788Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/client.py", line 127, in submit
2020-10-22T18:01:22.347499424Z return self.submitAsync(message, bindings=bindings).result()
2020-10-22T18:01:22.347505093Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/client.py", line 146, in submitAsync
2020-10-22T18:01:22.347511092Z return conn.write(message)
2020-10-22T18:01:22.347516610Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/connection.py", line 55, in write
2020-10-22T18:01:22.347522673Z self.connect()
2020-10-22T18:01:22.347529987Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/connection.py", line 45, in connect
2020-10-22T18:01:22.347536097Z self._transport.connect(self._url, self._headers)
2020-10-22T18:01:22.347542222Z File "/opt/venv/lib/python3.7/site-packages/gremlin_python/driver/tornado/transport.py", line 36, in connect
2020-10-22T18:01:22.347547822Z lambda: websocket.websocket_connect(url))
2020-10-22T18:01:22.347553477Z File "/opt/venv/lib/python3.7/site-packages/tornado/ioloop.py", line 571, in run_sync
2020-10-22T18:01:22.347559295Z self.start()
2020-10-22T18:01:22.347564864Z File "/opt/venv/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 132, in start
2020-10-22T18:01:22.347570978Z self.asyncio_loop.run_forever()
2020-10-22T18:01:22.347576693Z File "uvloop/loop.pyx", line 1351, in uvloop.loop.Loop.run_forever
2020-10-22T18:01:22.347582342Z File "uvloop/loop.pyx", line 484, in uvloop.loop.Loop._run
2020-10-22T18:01:22.347588222Z RuntimeError: Cannot run the event loop while another loop is running
I tried using nest_asyncio.apply, but that resulted in this error:
ValueError: Can't patch loop of type <class 'uvloop.Loop'>
which according to the docs is just a rule.
So I don't really know how to proceed?
Adding my comment above as an answer. In some cases it is necessary to downlevel the version of Tornado being used. There are some issues you can run into if the event loop is already running when something else tries to create it. For now, downleveling to Tornado 4.5.1 with Gremlin Python should resolve any issues.
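As a quick sanity check (a sketch, not part of the original answer), you can confirm which Tornado version the action-server environment actually imports after pinning it, e.g. with pip install tornado==4.5.1:
# Verify the Tornado version visible to the gremlinpython/Rasa action code
# after downgrading.
import tornado

print(tornado.version)  # expect '4.5.1' after the downgrade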

Apache Beam Dataflow: 'NoneType' object has no attribute 'parts'

I'm trying to write a pipeline to read a stream from Pub/Sub and write it to BigQuery using Google Cloud Dataflow with Apache Beam.
I have this code:
import apache_beam as beam
from apache_beam.transforms.window import FixedWindows

topic = 'projects/???/topics/???'
table = '???.???'
gcs_path = "gs://???"

with beam.Pipeline(runner="DataflowRunner", argv=[
        "--project", "???",
        "--staging_location", ("%s/staging_location" % gcs_path),
        "--temp_location", ("%s/temp" % gcs_path),
        "--output", ("%s/output" % gcs_path)
    ]) as p:
    (p
     | 'winderow' >> beam.WindowInto(FixedWindows(60))
     | 'hello' >> beam.io.gcp.pubsub.ReadStringsFromPubSub(topic)
     | 'hello2' >> beam.io.Write(beam.io.gcp.bigquery.BigQuerySink(table))
    )
    p.run().wait_until_finish()
But I'm getting this error when running it:
No handlers could be found for logger "oauth2client.contrib.multistore_file"
ERROR:root:Error while visiting winderow
Traceback (most recent call last):
File ".\main.py", line 20, in <module>
p.run().wait_until_finish()
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\pipeline.py", line 339, in run
return self.runner.run(self)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 296, in run
super(DataflowRunner, self).run(pipeline)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\runner.py", line 138, in run
pipeline.visit(RunVisitor(self))
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\pipeline.py", line 367, in visit
self._root_transform().visit(visitor, self, visited)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\pipeline.py", line 710, in visit
part.visit(visitor, pipeline, visited)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\pipeline.py", line 713, in visit
visitor.visit_transform(self)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\runner.py", line 133, in visit_transform
self.runner.run_transform(transform_node)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\runner.py", line 176, in run_transform
return m(transform_node)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\dataflow\dataflow_runner.py", line 526, in run_ParDo
input_step = self._cache.get_pvalue(transform_node.inputs[0])
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\runner.py", line 252, in get_pvalue
self._ensure_pvalue_has_real_producer(pvalue)
File "C:\ProgramData\Anaconda2\lib\site-packages\apache_beam\runners\runner.py", line 226, in _ensure_pvalue_has_real_producer
while real_producer.parts:
AttributeError: 'NoneType' object has no attribute 'parts'
Is this a problem with the code or configuration?
How can I get it working?
I don't have experience with windowed pipelines yet, but from what I understand of the concept, windows are supposed to be applied to your input data, not set up as the first step of the pipeline.
This being the case, your code probably should be:
with beam.Pipeline(runner="DataflowRunner", argv=[
        "--project", "???",
        "--staging_location", ("%s/staging_location" % gcs_path),
        "--temp_location", ("%s/temp" % gcs_path),
        "--output", ("%s/output" % gcs_path)
    ]) as p:
    (p
     | 'hello' >> beam.io.gcp.pubsub.ReadStringsFromPubSub(topic)
     | 'winderow' >> beam.WindowInto(FixedWindows(60))
     | 'hello2' >> beam.io.Write(beam.io.gcp.bigquery.BigQuerySink(table))
    )
    p.run().wait_until_finish()
The official repo has some samples of windowed operations as well.

Blank sessionid cookie causes error in request.user

An end user somehow ended up with their sessionid cookie blank (as in "sessionid=;"). This causes the following error call stack (below the function call to request.user) when using Django in conjunction with GAE:
File "/src/django/utils/functional.py", line 204, in inner
self._setup()
File "/src/django/utils/functional.py", line 270, in _setup
self._wrapped = self._setupfunc()
File "/src/django/contrib/auth/middleware.py", line 18, in <lambda>
request.user = SimpleLazyObject(lambda: get_user(request))
File "/src/django/contrib/auth/middleware.py", line 10, in get_user
request._cached_user = auth.get_user(request)
File "/src/django/contrib/auth/__init__.py", line 136, in get_user
user_id = request.session[SESSION_KEY]
File "/src/django/contrib/sessions/backends/base.py", line 44, in __getitem__
return self._session[key]
File "/src/django/contrib/sessions/backends/base.py", line 167, in _get_session
self._session_cache = self.load()
File "/src/django/contrib/sessions/backends/cached_db.py", line 39, in load
expire_date__gt=timezone.now()
File "/src/django/db/models/manager.py", line 143, in get
return self.get_query_set().get(*args, **kwargs)
File "/src/django/db/models/query.py", line 398, in get
num = len(clone)
File "/src/django/db/models/query.py", line 106, in __len__
self._result_cache = list(self.iterator())
File "/src/django/db/models/query.py", line 317, in iterator
for row in compiler.results_iter():
File "/src/djangotoolbox/db/basecompiler.py", line 375, in results_iter
results = self.build_query(fields).fetch(
File "/src/djangotoolbox/db/basecompiler.py", line 481, in build_query
query.add_filters(self.query.where)
File "/src/djangotoolbox/db/basecompiler.py", line 174, in add_filters
self.add_filters(child)
File "/src/djangotoolbox/db/basecompiler.py", line 176, in add_filters
field, lookup_type, value = self._decode_child(child)
File "/src/djangotoolbox/db/basecompiler.py", line 216, in _decode_child
lookup_type, value, field, annotation)
File "/src/djangotoolbox/db/basecompiler.py", line 254, in _normalize_lookup_value
return self.ops.value_for_db(value, field, lookup_type)
File "/src/djangoappengine/db/base.py", line 128, in value_for_db
return super_value_for_db(value, field, lookup)
File "/src/djangotoolbox/db/base.py", line 245, in value_for_db
field_kind, db_type, lookup)
File "/src/djangoappengine/db/base.py", line 160, in _value_for_db
raise DatabaseError("Only strings and positive integers "
DatabaseError: Only strings and positive integers may be used as keys on GAE.
This error does not occur if sessionid is set to some invalid non-empty value (such as "session=garbage;"). I think it is related to the following contrast in behavior in a Python shell:
>>> Session.objects.filter(session_key='abc').exists()
0
>>> Session.objects.filter(session_key='').exists()
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "/src/django/db/models/query.py", line 610, in exists
return self.query.has_results(using=self.db)
File "/src/django/db/models/sql/query.py", line 445, in has_results
return compiler.has_results()
File "/src/dbindexer/compiler.py", line 32, in has_results
return super(SQLCompiler, self).has_results()
File "/src/djangotoolbox/db/basecompiler.py", line 384, in has_results
return self.get_count(check_exists=True)
File "/src/djangotoolbox/db/basecompiler.py", line 468, in get_count
return self.build_query().count(high_mark)
File "/src/djangotoolbox/db/basecompiler.py", line 481, in build_query
query.add_filters(self.query.where)
File "/src/djangotoolbox/db/basecompiler.py", line 174, in add_filters
self.add_filters(child)
File "/src/djangotoolbox/db/basecompiler.py", line 176, in add_filters
field, lookup_type, value = self._decode_child(child)
File "/src/djangotoolbox/db/basecompiler.py", line 216, in _decode_child
lookup_type, value, field, annotation)
File "/src/djangotoolbox/db/basecompiler.py", line 254, in _normalize_lookup_value
return self.ops.value_for_db(value, field, lookup_type)
File "/src/djangoappengine/db/base.py", line 128, in value_for_db
return super_value_for_db(value, field, lookup)
File "/src/djangotoolbox/db/base.py", line 245, in value_for_db
field_kind, db_type, lookup)
File "/src/djangoappengine/db/base.py", line 160, in _value_for_db
raise DatabaseError("Only strings and positive integers "
DatabaseError: Only strings and positive integers may be used as keys on GAE.
Is this a djangoappengine or djangotoolbox bug, or a Django bug? What's the proper way to prevent this error and treat the user as unauthenticated?
Ok, I think I may have to add a middleware class to handle this special case, and place it directly after SessionMiddleware:
class EmptySessionMiddleware(object):
    def process_request(self, request):
        session = request.session
        if session.session_key is not None and len(session.session_key) == 0:
            logging.info('[EmptySessionMiddleware] setting empty session key to None')
            session._session_key = None
It's a weird special case, but basically the problem is that the Django session middleware only checks for a None session key before looking it up in the db (not an empty string), and an empty-string primary key query in djangoappengine raises an exception. I'm not sure there's another way to handle this case.
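For completeness, here is a sketch of how the middleware would be ordered in settings.py (MIDDLEWARE_CLASSES style, matching the Django era of this code); the myapp.middleware path is a placeholder:
MIDDLEWARE_CLASSES = (
    'django.contrib.sessions.middleware.SessionMiddleware',
    # Hypothetical placement: directly after SessionMiddleware so the empty
    # session key is cleared before AuthenticationMiddleware touches request.user.
    'myapp.middleware.EmptySessionMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    # ...
)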