Python Motor Mongo cursor length or peek next - PyMongo

Is there a way of determining the length of a Motor cursor, or of peeking ahead to see whether there is a next document (perhaps a has_next instead of fetch_next)? I don't mean cursor.size(), which does not take the provided limit() into account.
Basically I want to add the comma required between JSON documents:
while (yield cursor.fetch_next):
    document = cursor.next_object()
    print document
    if cursor.has_next():  # Sweeet
        print ","

You can use the "alive" property. Try running this:
from tornado import gen, ioloop
import motor

client = motor.MotorClient()

@gen.coroutine
def f():
    collection = client.test.collection
    yield collection.drop()
    yield collection.insert([{'_id': i} for i in range(100)])
    cursor = collection.find()
    while (yield cursor.fetch_next):
        print cursor.next_object(), cursor.alive

ioloop.IOLoop.current().run_sync(f)
It prints "True" until the final document, when alive is "False".
A MotorCursor fetches data from the server in batches. (The MongoDB documentation on batches explains how cursors and batches work for all MongoDB drivers, including Motor.) When "alive" is True it means either that there is more data available on the server, or data is buffered in the MotorCursor, or both.
There is a race condition, however: say you fetch all but the final document, and before you fetch that last document another client deletes it; then you'll fail to find it even though "alive" was "True". Better to rearrange your loop:
import sys

@gen.coroutine
def f():
    collection = client.test.collection
    yield collection.drop()
    yield collection.insert([{'_id': i} for i in range(100)])
    cursor = collection.find()
    if (yield cursor.fetch_next):
        sys.stdout.write(str(cursor.next_object()))
        while (yield cursor.fetch_next):
            sys.stdout.write(", ")
            sys.stdout.write(str(cursor.next_object()))
    print
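A side note not taken from the answer above: newer Motor releases also expose an asyncio API with async for, which likewise has no has_next. A minimal sketch, assuming Motor's AsyncIOMotorClient and a MongoDB server on localhost, that writes the separating comma by remembering whether a document has been written yet:
import asyncio
import sys

from motor.motor_asyncio import AsyncIOMotorClient

async def dump_documents():
    collection = AsyncIOMotorClient().test.collection
    first = True
    async for document in collection.find():
        if not first:
            sys.stdout.write(", ")  # comma before every document except the first
        sys.stdout.write(str(document))
        first = False
    print()

asyncio.run(dump_documents())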

Related

Python BigQuery Storage Write retry strategy when writing to default stream

I'm testing python-bigquery-storage to insert multiple items into a table using the _default stream.
I used the example shown in the official docs as a basis, and modified it to use the default stream.
Here is a minimal example that's similar to what I'm trying to do:
customer_record.proto
syntax = "proto2";
message CustomerRecord {
optional string customer_name = 1;
optional int64 row_num = 2;
}
append_rows_default.py
from itertools import islice

from google.cloud import bigquery_storage_v1
from google.cloud.bigquery_storage_v1 import types
from google.cloud.bigquery_storage_v1 import writer
from google.protobuf import descriptor_pb2

import customer_record_pb2

import logging
logging.basicConfig(level=logging.DEBUG)

CHUNK_SIZE = 2  # Maximum number of rows to use in each AppendRowsRequest.

def chunks(l, n):
    """Yield successive `n`-sized chunks from `l`."""
    _it = iter(l)
    while True:
        chunk = [*islice(_it, 0, n)]
        if chunk:
            yield chunk
        else:
            break

def create_stream_manager(project_id, dataset_id, table_id, write_client):
    # Use the default stream
    # The stream name is:
    # projects/{project}/datasets/{dataset}/tables/{table}/_default
    parent = write_client.table_path(project_id, dataset_id, table_id)
    stream_name = f'{parent}/_default'

    # Create a template with fields needed for the first request.
    request_template = types.AppendRowsRequest()

    # The initial request must contain the stream name.
    request_template.write_stream = stream_name

    # So that BigQuery knows how to parse the serialized_rows, generate a
    # protocol buffer representation of our message descriptor.
    proto_schema = types.ProtoSchema()
    proto_descriptor = descriptor_pb2.DescriptorProto()
    customer_record_pb2.CustomerRecord.DESCRIPTOR.CopyToProto(proto_descriptor)
    proto_schema.proto_descriptor = proto_descriptor
    proto_data = types.AppendRowsRequest.ProtoData()
    proto_data.writer_schema = proto_schema
    request_template.proto_rows = proto_data

    # Create an AppendRowsStream using the request template created above.
    append_rows_stream = writer.AppendRowsStream(write_client, request_template)

    return append_rows_stream

def send_rows_to_bq(project_id, dataset_id, table_id, write_client, rows):
    append_rows_stream = create_stream_manager(project_id, dataset_id, table_id, write_client)
    response_futures = []
    row_count = 0

    # Send the rows in chunks, to limit memory usage.
    for chunk in chunks(rows, CHUNK_SIZE):
        proto_rows = types.ProtoRows()
        for row in chunk:
            row_count += 1
            proto_rows.serialized_rows.append(row.SerializeToString())

        # Create an append row request containing the rows
        request = types.AppendRowsRequest()
        proto_data = types.AppendRowsRequest.ProtoData()
        proto_data.rows = proto_rows
        request.proto_rows = proto_data

        future = append_rows_stream.send(request)
        response_futures.append(future)

    # Wait for all the append row requests to finish.
    for f in response_futures:
        f.result()

    # Shutdown background threads and close the streaming connection.
    append_rows_stream.close()

    return row_count

def create_row(row_num: int, name: str):
    row = customer_record_pb2.CustomerRecord()
    row.row_num = row_num
    row.customer_name = name
    return row

def main():
    write_client = bigquery_storage_v1.BigQueryWriteClient()
    rows = [create_row(i, f"Test{i}") for i in range(0, 20)]
    send_rows_to_bq("PROJECT_NAME", "DATASET_NAME", "TABLE_NAME", write_client, rows)

if __name__ == '__main__':
    main()
Note:
In the above, CHUNK_SIZE is 2 just for this minimal example, but, in a real situation, I used a chunk size of 5000.
In real usage, I have several separate streams of data that need to be processed in parallel, so I make several calls to send_rows_to_bq, one for each stream of data, using a thread pool (one thread per stream of data). (I'm assuming here that AppendRowsStream is not meant to be shared by multiple threads, but I might be wrong).
It mostly works, but I often get a mix of intermittent errors in the call to append_rows_stream's send method:
google.cloud.bigquery_storage_v1.exceptions.StreamClosedError: This manager has been closed and can not be used.
google.api_core.exceptions.Unknown: None There was a problem opening the stream. Try turning on DEBUG level logs to see the error.
I think I just need to retry on these errors, but I'm not sure how to best implement a retry strategy here. My impression is that I need to use the following strategy to retry errors when calling send:
If the error is a StreamClosedError, the append_rows_stream stream manager can't be used anymore, and so I need to call close on it and then call my create_stream_manager again to create a new one, then try to call send on the new stream manager.
Otherwise, on any google.api_core.exceptions.ServerError error, retry the call to send on the same stream manager.
Am I approaching this correctly?
Thank you.
The best solution to this problem is to update to a newer release of the library.
This problem happened in older versions because once the write API connection reached 10 MB, it would hang.
If updating the library does not help, you can try these options:
Limit each connection to < 10 MB.
Disconnect and reconnect to the API.
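If you do need to retry in code, here is a minimal sketch of the reconnect-and-retry idea described in the question, reusing the create_stream_manager helper from above; the retry count and the exact exceptions caught are assumptions, not something prescribed by the library:
from google.api_core import exceptions as core_exceptions
from google.cloud.bigquery_storage_v1.exceptions import StreamClosedError

def send_with_retry(stream_manager, request, make_stream_manager, max_attempts=3):
    """Send one AppendRowsRequest, rebuilding the stream manager if it was closed."""
    for attempt in range(max_attempts):
        try:
            return stream_manager, stream_manager.send(request)
        except StreamClosedError:
            # The manager cannot be reused: close it, build a fresh one, then retry.
            stream_manager.close()
            stream_manager = make_stream_manager()
        except core_exceptions.ServerError:
            # Transient server-side error: retry the send on the same stream manager.
            if attempt == max_attempts - 1:
                raise
    raise RuntimeError("send failed after %d attempts" % max_attempts)

# Usage sketch, with the names used in the question:
# manager = create_stream_manager(project_id, dataset_id, table_id, write_client)
# manager, future = send_with_retry(
#     manager, request,
#     lambda: create_stream_manager(project_id, dataset_id, table_id, write_client))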

GtkTreeView stops updating unless I change the focus of the window

I have a GtkTreeView object that uses a GtkListStore model that is constantly being updated as follows:
1. Get new transaction
2. Feed data into numpy array
3. Convert numbers to formatted strings, store in pandas dataframe
4. Add updated token info to GtkListStore via GtkListStore.set(titer, liststore_cols, liststore_data), where liststore_data is the updated info and liststore_cols is the name of the columns (both are lists).
Here's the function that updates the ListStore:
# update ListStore
titer = ls_full.get_iter(row)

liststore_data = []
[liststore_data.append(df.at[row, col])
    for col in my_vars['ls_full'][3:]]

# check for NaN value, add a (space) placeholder if necessary
for i in range(3, len(liststore_data)):
    if liststore_data[i] != liststore_data[i]:
        liststore_data[i] = " "

liststore_cols = []
[liststore_cols.append(my_vars['ls_full'].index(col) + 1)
    for col in my_vars['ls_full'][3:]]

ls_full.set(titer, liststore_cols, liststore_data)
Class that gets the messages from the websocket:
class MyWebsocketClient(cbpro.WebsocketClient):
    # class exceptions to WebsocketClient
    def on_open(self):
        # sets up ticker Symbol, subscriptions for socket feed
        self.url = "wss://ws-feed.pro.coinbase.com/"
        self.channels = ['ticker']
        self.products = list(cbp_symbols.keys())

    def on_message(self, msg):
        # gets latest message from socket, sends off to be processed
        if "best_ask" and "time" in msg:
            # checks to see if token price has changed before updating
            update_needed = parse_data(msg)
            if update_needed:
                update_ListStore(msg)
        else:
            print(f'Bad message: {msg}')
When the program first starts, the updates are consistent. Each time a new transaction comes in, the screen reflects it, updating the proper token. However, after a random amount of time - I've seen it anywhere from 5 minutes to over an hour - the screen stops updating unless I change the focus of the window (either activating or deactivating it). This does not last long, though (only long enough to update the screen once). No other errors are reported, and memory usage is not spiking (it stays constant at 140 MB).
How can I troubleshoot this? I'm not even sure where to begin. The data back-ends seem to be OK (data is never corrupted nor lags behind).
Since you've said in the comments that it is running in a separate thread, I'd suggest wrapping your "update liststore" function with GLib.idle_add.
from gi.repository import GLib
GLib.idle_add(update_liststore)
I've had similar issues in the past and this fixed things. Sometimes updating the ListStore is fine, sometimes it will randomly spew errors.
Basically, only one thread should update the GUI at a time, so by wrapping the call in GLib.idle_add() you make sure your background thread does not interfere with the main thread updating the GUI.
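For completeness, a minimal sketch of what the handler from the question could look like with that change; update_ListStore, parse_data and msg are the names from the question, and passing msg along to idle_add is an assumption about how the update function is called:
from gi.repository import GLib

def on_message(self, msg):
    # This runs on the websocket thread.
    if parse_data(msg):
        # Hand the GUI work to the GTK main loop instead of touching the
        # ListStore from this thread. idle_add keeps rescheduling the
        # callback while it returns True, so update_ListStore should
        # return None (or False) once it has updated the row.
        GLib.idle_add(update_ListStore, msg)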

What is the best way to communicate among multiple processes in Ubuntu

I have three different machine learning models in Python. To improve performance, I run them in parallel in different terminals. They communicate and share data with one another through files: each model produces batches of files for the others to consume. All the processes run in parallel but depend on data prepared by another process. Once process A has prepared a batch of data, it creates a file to signal the other process that the data is ready; process B then starts processing it while simultaneously watching for the next batch. How can this large amount of data be shared with the next process without creating files? Is there a better way to communicate among these processes in Python without creating/deleting temporary files?
Thanks
You could consider running up a small Redis instance... a very fast, in-memory data structure server.
It allows you to share strings, lists, queues, hashes, atomic integers, sets, ordered sets between processes very simply.
As it is networked, you can share all these data structures not only within a single machine, but across multiple machines.
As it has bindings for C/C++, Python, bash, Ruby, Perl and so on, it also means you can use the shell, for example, to quickly inject commands/data into your app to change its behaviour, or get debugging insight by looking at how variables are set.
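As an illustration (not part of the original answer), a minimal producer/consumer sketch using the redis-py client, assuming a Redis server on localhost and batches that can be serialised as JSON:
import json
import redis

r = redis.Redis(host='localhost', port=6379)

# Process A: push a finished batch onto a shared list used as a queue.
batch = {'batch_id': 1, 'rows': [[0.1, 0.2], [0.3, 0.4]]}
r.rpush('batches', json.dumps(batch))

# Process B: block until a batch is available, then process it.
_, payload = r.blpop('batches')
batch = json.loads(payload)
print('got batch', batch['batch_id'])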
Here's an example of how to do multiprocessing in Python3. Instead of storing results in a file the results are stored in a dictionary (see output)
from multiprocessing import Pool, cpu_count

def multi_processor(function_name):
    file_list = []

    # Test, put 6 strings in the list so your_function should run six times
    # with 6 processors in parallel, (assuming your CPU has enough cores)
    file_list.append("test1")
    file_list.append("test2")
    file_list.append("test3")
    file_list.append("test4")
    file_list.append("test5")
    file_list.append("test6")

    # Use max number of system processors - 1
    pool = Pool(processes=cpu_count()-1)
    pool.daemon = True

    results = {}

    # for every item in the file_list, start a new process
    for aud_file in file_list:
        results[aud_file] = pool.apply_async(your_function, args=("arg1", "arg2"))

    # Wait for all processes to finish before proceeding
    pool.close()
    pool.join()

    # Results and any errors are returned
    return {your_function: result.get() for your_function, result in results.items()}

def your_function(arg1, arg2):
    try:
        print("put your stuff in this function")
        your_results = ""
        return your_results
    except Exception as e:
        return str(e)

if __name__ == "__main__":
    some_results = multi_processor("your_function")
    print(some_results)
The output is
put your stuff in this function
put your stuff in this function
put your stuff in this function
put your stuff in this function
put your stuff in this function
put your stuff in this function
{'test1': '', 'test2': '', 'test3': '', 'test4': '', 'test5': '', 'test6': ''}
Try using an SQLite database to share data.
I made this for this exact purpose:
https://pypi.org/project/keyvalue-sqlite/
You can use it like this:
from keyvalue_sqlite import KeyValueSqlite
DB_PATH = '/path/to/db.sqlite'
db = KeyValueSqlite(DB_PATH, 'table-name')
# Now use standard dictionary operators
db.set_default('0', '1')
actual_value = db.get('0')
assert '1' == actual_value
db.set_default('0', '2')
assert '1' == db.get('0')

Lego-EV3: How to fix EOFError when catching user-input via multiprocessing?

Currently, I am working with an EV3 Lego robot that is controlled by several neurons. Now I want to modify the code (running on Python 3) in such a way that one can change certain parameter values at run time via the shell (Ubuntu) in order to manipulate the robot's dynamics at any time (and multiple times). Here is a sketch of what I have achieved so far, based on a short example code:
from multiprocessing import Process
from multiprocessing import SimpleQueue
import ev3dev.ev3 as ev3

class Neuron:
    (definitions of class variables and update functions)

def check_input(queue):
    while (True):
        try:
            new_para = str(input("Type 'parameter=value': "))
            float(new_para[2:])  # checking for float in input
            var = new_para[0:2]
            if (var == "k="):    # change parameter k
                queue.put(new_para)
            elif (var == "g="):  # change parameter g
                queue.put(new_para)
            else:
                print("Error. Type 'k=...' or 'g=...'")
                queue.put(0)     # put anything in queue
        except (ValueError, EOFError):
            print("New value is not a number. Try again!")

(some neuron-specific initializations)

queue = SimpleQueue()
check = Process(target=check_input, args=(queue,))
check.start()

while (True):
    if (not queue.empty()):
        cmd = queue.get()
        var = cmd[0]
        val = float(cmd[2:])
        if (var == "k"):
            Neuron.K = val
        elif (var == "g"):
            Neuron.g = val
    (updating procedure for neurons, writing data to file)
Since I am new to multiprocessing there are certainly some mistakes concerning locking, efficiency and so on, but the robot moves and input prompts appear in the shell. However, the current problem is that it is actually impossible to enter any input:
> python3 controller_multiprocess.py
> Type 'parameter=value': New value is not a number. Try again!
> Type 'parameter=value': New value is not a number. Try again!
> Type 'parameter=value': New value is not a number. Try again!
> ... (and so on)
I know that this behaviour is caused by catching EOFError: that is the error that gets raised when the exception handler is removed (and the process crashes). Hence, the program just rushes through the try block and assumes over and over again that no input (i.e. an empty string) was made. Why does this happen? When not run as a separate process, the program patiently waits for input as expected. And how can one fix or bypass this issue so that changing parameters becomes possible as intended?
Thanks in advance!

How to prevent django from loading objects in memory when using `delete()`?

I'm having memory issues because it looks like Django is loading the objects into memory when using delete(). Is there any way to prevent Django from doing that?
From the Django docs:
Django needs to fetch objects into memory to send signals and handle cascades. However, if there are no cascades and no signals, then Django may take a fast-path and delete objects without fetching into memory. For large deletes this can result in significantly reduced memory usage. The amount of executed queries can be reduced, too.
https://docs.djangoproject.com/en/1.8/ref/models/querysets/#delete
I don't use signals. I do have foreign keys on the model I'm trying to delete, but I don't see why Django would need to load the objects into memory. It looks like it does, because my memory is rising as the query runs.
You can use a function like this to iterate over a huge number of objects without using too much memory:
import gc

def queryset_iterator(qs, batchsize=500, gc_collect=True):
    iterator = qs.values_list('pk', flat=True).order_by('pk').distinct().iterator()
    eof = False
    while not eof:
        primary_key_buffer = []
        try:
            while len(primary_key_buffer) < batchsize:
                primary_key_buffer.append(next(iterator))
        except StopIteration:
            eof = True
        for obj in qs.filter(pk__in=primary_key_buffer).order_by('pk').iterator():
            yield obj
        if gc_collect:
            gc.collect()
Then you can use the function to iterate over the objects to delete:
for obj in queryset_iterator(HugeQueryset.objects.all()):
    obj.delete()
For more information you can check this blog post.
You can import the Django database connection and use it with raw SQL to delete. I had the exact same problem as you and this helped me a lot. Here's a snippet (I'm using MySQL, by the way, but you can run any SQL statement):
from django.db import connection

sql_query = "DELETE FROM usage WHERE date < '%s' ORDER BY date" % date

cursor = connection.cursor()
try:
    cursor.execute(sql_query)
finally:
    cursor.close()
This should execute only the delete operation on that table without affecting any of your model relationships.
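A side note on that snippet, not part of the original answer: cursor.execute can take the value as a query parameter instead of interpolating it into the SQL string, which lets the database driver handle quoting; a minimal sketch assuming the same usage table and date variable:
from django.db import connection

with connection.cursor() as cursor:
    # The driver substitutes and quotes the date value for us.
    cursor.execute("DELETE FROM usage WHERE date < %s", [date])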