MemoryError when querying database from Process - process

I am trying to create a program with 3 processes that read from the same database. The code was working before I started introducing processes.
I am getting MemoryError when performing a select() from PeeWee, I suspect there is something wrong with sharing of resources. Minimal example:
models.py
from playhouse.pool import PooledSqliteExtDatabase
file_scanner_database = PooledSqliteExtDatabase(
None,
max_connections=32,
)
class FileModel(Model):
class Meta:
database = file_scanner_database
main.py
from file_scanner import FileScanner
from models import file_scanner_database
from models import FileModel
from multiprocessing import Process
def create_scanner_agent(data):
scanner = FileScanner(data)
scanner.start_scanner()
shared_info = {'db_location': '/absolute/path/to/database'}
file_scanner_database.init(shared_info['db_location'])
file_scanner_database.connect()
file_scanner_database.create_tables([FileModel], safe=True)
new_process = Process(
target=create_scanner_agent,
args=(shared_info,)
)
new_process.daemon = True
new_process.start()
try:
new_process.join()
except KeyboardInterrupt:
pass
new_process.terminate()
file_scanner.py
from models import file_scanner_database
from models import FileModel
class FileScanner:
def __init__(self, data):
for k, v in data.items():
setattr(self, k, v)
file_scanner_database.init(self.db_location)
file_scanner_database.connect()
def start_scanner(self):
while True:
# THIS IS WHERE THE PROGRAM CRASHES
for row in FileModel.select():
...

It looks like you're trying to access memory across a fork? Or some such craziness? I think the answer is that you're doing it wrong homie. Try opening your DB connection after the fork.

Related

How to write python unittest cases to mock redis connection (redis.StrictRedis) in Django

How can I mock the following function for connecting to Redis?
import redis
class RedisCache:
redis_instance = None
#classmethod
def set_connect(cls):
redis_instance = redis.StrictRedis(host='0.0.0.0', port=6379, password='xyz', charset='utf-8', decode_responses=True, socket_timeout=30)
return redis_instance
#classmethod
def get_conn(cls):
cls.redis_instance = cls.set_connect()
return cls.redis_instance
I looked for some solutions, but they were basically using fakeredis module. I wanted to have a simpler way to mock these functions.
Note-
data returned by the function: Redis<ConnectionPool<Connection<host=127.0.0.1,port=6379,db=0>>>
You can use patch() function to mock out redis.StrictRedis class. See where-to-patch
E.g.
redis_cache.py:
import redis
class RedisCache:
redis_instance = None
#classmethod
def set_connect(cls):
redis_instance = redis.StrictRedis(host='0.0.0.0', port=6379, password='xyz',
charset='utf-8', decode_responses=True, socket_timeout=30)
return redis_instance
#classmethod
def get_conn(cls):
cls.redis_instance = cls.set_connect()
return cls.redis_instance
test_redis_cache.py:
from unittest import TestCase
import unittest
from unittest.mock import patch, Mock
from redis_cache import RedisCache
class TestRedisCache(TestCase):
def test_set_connect(self):
with patch('redis.StrictRedis') as mock_StrictRedis:
mock_redis_instance = mock_StrictRedis.return_value
actual = RedisCache.set_connect()
self.assertEqual(actual, mock_redis_instance)
mock_StrictRedis.assert_called_once_with(host='0.0.0.0', port=6379, password='xyz',
charset='utf-8', decode_responses=True, socket_timeout=30)
#patch('redis.StrictRedis')
def test_get_conn(self, mock_StrictRedis):
mock_redis_instance = mock_StrictRedis.return_value
RedisCache.get_conn()
self.assertEqual(RedisCache.redis_instance, mock_redis_instance)
if __name__ == '__main__':
unittest.main()
test result:
..
----------------------------------------------------------------------
Ran 2 tests in 0.004s
OK
Name Stmts Miss Cover Missing
------------------------------------------------------------------------------
src/stackoverflow/70016401/redis_cache.py 11 0 100%
src/stackoverflow/70016401/test_redis_cache.py 18 0 100%
------------------------------------------------------------------------------
TOTAL 29 0 100%

Python multiprocessing how to update a complex object in a manager list without using .join() method

I started programming in Python about 2 months ago and I've been struggling with this problem in the last 2 weeks.
I know there are many similar threads to this one but I can't really find a solution which suits my case.
I need to have the main process which is the one which interacts with Telegram and another process, buffer, which understands the complex object received from the main and updates it.
I'd like to do this in a simpler and smoother way.
At the moment objects are not being updated due to the use of multi-processing without the join() method.
I tried then to use multi-threading instead but it gives me compatibility problems with Pyrogram a framework which i am using to interact with Telegram.
I wrote again the "complexity" of my project in order to reproduce the same error I am getting and in order to get and give the best help possible from and for everyone.
a.py
class A():
def __init__(self, length = -1, height = -1):
self.length = length
self.height = height
b.py
from a import A
class B(A):
def __init__(self, length = -1, height = -1, width = -1):
super().__init__(length = -1, height = -1)
self.length = length
self.height = height
self.width = width
def setHeight(self, value):
self.height = value
c.py
class C():
def __init__(self, a, x = 0, y = 0):
self.a = a
self.x = x
self.y = y
def func1(self):
if self.x < 7:
self.x = 7
d.py
from c import C
class D(C):
def __init__(self, a, x = 0, y = 0, z = 0):
super().__init__(a, x = 0, y = 0)
self.a = a
self.x = x
self.y = y
self.z = z
def func2(self):
self.func1()
main.py
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer
if __name__ == "__main__":
manager = Manager()
lizt = manager.list()
buffer = Process(target = buffer, args = (lizt, )) #passing the list as a parameter
buffer.start()
#can't invoke buffer.join() here because I need the below code to keep running while the buffer process takes a few minutes to end an instance passed in the list
#hence I can't wait the join() function to update the objects inside the buffer but i need objects updated in order to pop them out from the list
import datetime as dt
t = dt.datetime.now()
#library of kind of multithreading (pool of 4 processes), uses asyncio lib
#this while was put to reproduce the same error I am getting
while True:
if t + dt.timedelta(seconds = 10) < dt.datetime.now():
lizt.append(D(B(5, 5, 5)))
t = dt.datetime.now()
"""
#This is the code which looks like the one in my project
#main.py
from pyrogram import Client #library of kind of multithreading (pool of 4 processes), uses asyncio lib
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer
if __name__ == "__main__":
api_id = 1234567
api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
app = Client("my_account", api_id, api_hash)
manager = Manager()
lizt = manager.list()
buffer = Process(target = buffer, args = (lizt, )) #passing the list as a parameter
buffer.start()
#can't invoke buffer.join() here because I need the below code to run at the same time as the buffer process
#hence I can't wait the join() function to update the objects inside the buffer
#app.on_message()
def my_handler(client, message):
lizt.append(complex_object_conatining_message)
"""
buffer.py
def buffer(buffer):
print("buffer was defined")
while True:
if len(buffer) > 0:
print(buffer[0].x) #prints 0
buffer[0].func2() #this changes the class attribute locally in the class instance but not in here
print(buffer[0].x) #prints 0, but I'd like it to be 7
print(buffer[0].a.height) #prints 5
buffer[0].a.setHeight(10) #and this has the same behaviour
print(buffer[0].a.height) #prints 5 but I'd like it to be 10
buffer.pop(0)
This is the whole code about the problem I am having.
Literally every suggestion is welcome, hopefully constructive, thank you in advance!
At last I had to change the way to solve this problem, which was using asyncio like the framework was doing as well.
This solution offers everything I was looking for:
-complex objects update
-avoiding the problems of multiprocessing (in particular with join())
It is also:
-lightweight: before I had 2 python processes 1) about 40K 2) about 75K
This actual process is about 30K (and it's also faster and cleaner)
Here's the solution, I hope it will be useful for someone else like it was for me:
The part of the classes is skipped because this solution updates complex objects absolutely fine
main.py
from pyrogram import Client
import asyncio
import time
def cancel_tasks():
#get all task in current loop
tasks = asyncio.Task.all_tasks()
for t in tasks:
t.cancel()
try:
buffer = []
firstWorker(buffer) #this one is the old buffer.py file and function
#the missing loop and loop method are explained in the next piece of code
except KeyboardInterrupt:
print("")
finally:
print("Closing Loop")
cancel_tasks()
firstWorker.py
import asyncio
def firstWorker(buffer):
print("First Worker Executed")
api_id = 1234567
api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
app = Client("my_account", api_id, api_hash)
#app.on_message()
async def my_handler(client, message):
print("Message Arrived")
buffer.append(complex_object_conatining_message)
await asyncio.sleep(1)
app.run(secondWorker(buffer)) #here is the trick: I changed the
#method run() of the Client class
#inside the Pyrogram framework
#since it was a loop itself.
#In this way I added another task
#to the existing loop in orther to
#let run both of them together.
my secondWorker.py
import asyncio
async def secondWorker(buffer):
while True:
if len(buffer) > 0:
print(buffer.pop(0))
await asyncio.sleep(1)
The resources to understand the asyncio used in this code can be found here:
Asyncio simple tutorial
Python Asyncio Official Documentation
This tutorial about how to fix classical Asyncio errors

Apache Beam job (Python) using Tensorflow Transform is killed by Cloud Dataflow

I'm trying to run an Apache Beam job based on Tensorflow Transform on Dataflow but its killed. Someone has experienced that behaviour? This is a simple example with DirectRunner, that runs ok on my local but fails on Dataflow (I change the runner properly):
import os
import csv
import datetime
import numpy as np
import tensorflow as tf
import tensorflow_transform as tft
from apache_beam.io import textio
from apache_beam.io import tfrecordio
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam import tft_beam_io
from tensorflow_transform.tf_metadata import dataset_metadata
from tensorflow_transform.tf_metadata import dataset_schema
import apache_beam as beam
NUMERIC_FEATURE_KEYS = ['feature_'+str(i) for i in range(2000)]
def _create_raw_metadata():
column_schemas = {}
for key in NUMERIC_FEATURE_KEYS:
column_schemas[key] = dataset_schema.ColumnSchema(tf.float32, [], dataset_schema.FixedColumnRepresentation())
raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(column_schemas))
return raw_data_metadata
def preprocessing_fn(inputs):
outputs={}
for key in NUMERIC_FEATURE_KEYS:
outputs[key] = tft.scale_to_0_1(inputs[key])
return outputs
def main():
output_dir = '/tmp/tmp-folder-{}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
RUNNER = 'DirectRunner'
with beam.Pipeline(RUNNER) as p:
with beam_impl.Context(temp_dir=output_dir):
raw_data_metadata = _create_raw_metadata()
_ = (raw_data_metadata | 'WriteInputMetadata' >> tft_beam_io.WriteMetadata(os.path.join(output_dir, 'rawdata_metadata'), pipeline=p))
m = numpy_dataset = np.random.rand(100,2000)*100
raw_data = (p
| 'CreateTestDataset' >> beam.Create([dict(zip(NUMERIC_FEATURE_KEYS, m[i,:])) for i in range(m.shape[0])]))
raw_dataset = (raw_data, raw_data_metadata)
transform_fn = (raw_dataset | 'Analyze' >> beam_impl.AnalyzeDataset(preprocessing_fn))
_ = (transform_fn | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(output_dir))
(transformed_data, transformed_metadata) = ((raw_dataset, transform_fn) | 'Transform' >> beam_impl.TransformDataset())
transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)
_ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(os.path.join(output_dir, 'train'), file_name_suffix='.gz', coder=transformed_data_coder)
if __name__ == '__main__':
main()
Also, my production code (not shown) fail with the message: The job graph is too large. Please try again with a smaller job graph, or split your job into two or more smaller jobs.
Any hint?
The restriction on the pipeline description size is documented here:
https://cloud.google.com/dataflow/quotas#limits
There is a way around that, instead of creating stages for each tensor that goes into tft.scale_to_0_1 we could fuse them by first stacking them together, and then passing them into tft.scale_to_0_1 with 'elementwise=True'.
The result will be the same, because the min and max are computed per 'column' instead of across the whole tensor.
This would look something like this:
stacked = tf.stack([inputs[key] for key in NUMERIC_FEATURE_KEYS], axis=1)
scaled_stacked = tft.scale_to_0_1(stacked, elementwise=True)
for key, tensor in zip(NUMERIC_FEATURE_KEYS, tf.unstack(scaled_stacked, axis=1)):
outputs[key] = tensor

reactivex: how to make a behaviorsubject emit from observable

I'm going to be using rxandroid in an android app. I'm trying to model the behavior right now in rxpy because it was the easiest for me to set up and play with. In the example below, source3 is emitting the correct data; which is a concatenation of an initialization that takes some time and a permanent subscription which I have just faked out. I want the BehaviorSubject because I need the last value immediately for field initialization.
I cannot figure out how to chain the BehaviorSubject on top of source3 so that it emits source 3 while remembering the last value. I have searched the internet for two days and not found a clear direction on this use case. Here is my code, and the question is why I don't get any emissions from the observer.
from rx import Observable, Observer
from rx.subjects import BehaviorSubject
import time, random
def fake_initialization(observer):
time.sleep(5) # It takes some time
observer.on_next("Alpha")
observer.on_completed()
def fake_subscription(observer):
iter = 0 # Subscription emits forever
while True:
observer.on_next("message %02d"%(iter))
time.sleep(random.randrange(2,5))
iter += 1
class PrintObserver(Observer):
def on_next(self, value):
print("Received {0}".format(value))
#bsubject.on_next(value)
def on_completed(self):
print("Done!")
def on_error(self, error):
print("Error Occurred: {0}".format(error))
source1 = Observable.create(fake_initialization)
source2 = Observable.create(fake_subscription)
source3 = source1 + source2
bsubject = BehaviorSubject(False)
source4 = source3.multicast(bsubject)
source4.connect()
source4.subscribe(PrintObserver())
This was actually a fairly easy answer for someone. I'm posting this in case anyone else ends up in this situation. Admittedly, I didn't read the rxpy page closely enough. You need to add concurrency on your own, presumably because there are so many concurrent solutions in Python. Here is the final working code:
import random
import time
import multiprocessing
from rx import Observable,Observer
from rx.concurrency import ThreadPoolScheduler
from rx.subjects import Subject
class PrintObserver1(Observer):
def on_next(self, value):
print("Received 1 {0}".format(value))
#bsubject.on_next(value)
def on_completed(self):
print("Done 1!")
def on_error(self, error):
print("Error Occurred: 1 {0}".format(error))
class PrintObserver2(Observer):
def on_next(self, value):
print("Received 2 {0}".format(value))
#bsubject.on_next(value)
def on_completed(self):
print("Done 2!")
def on_error(self, error):
print("Error Occurred: 2 {0}".format(error))
def fake_initialization(observer):
time.sleep(5) # It takes some time
observer.on_next("Alpha")
observer.on_completed()
def fake_subscription(observer):
iter = 0 # Subscription emits forever
while True:
observer.on_next("message %02d"%(iter))
time.sleep(random.randrange(2,5))
iter += 1
optimal_thread_count = multiprocessing.cpu_count()
pool_scheduler = ThreadPoolScheduler(optimal_thread_count)
source1 = Observable.create(fake_initialization).subscribe_on(pool_scheduler)
source2 = Observable.create(fake_subscription).subscribe_on(pool_scheduler)
catted_source = source1 + source2
native_source = Observable.interval(1000)
print native_source,catted_source
#source = source3
subject = Subject()
# native_source = works
# catted_source = not works
subSource = catted_source.subscribe(subject)
#####
subSubject1 = subject.subscribe(PrintObserver1())
subSubject2 = subject.subscribe(PrintObserver2())
time.sleep(30)
subject.on_completed()
subSubject1.dispose()
subSubject2.dispose()
Also note that you have to install the 'futures' package for concurrency to work on Python 2.7.
If you get this error:
from concurrent.futures import ThreadPoolExecutor
ImportError: No module named concurrent.futures
Read this (link is for slightly different error but solution works):
ImportError: No module named concurrent.futures.process

return a list from class object

I am using multiprocessing module to generate 35 dataframes. I guess this will save my time. But the problem is that the class does not return anything. I expect the list of dataframes to be returned from self.dflist
Here is how to create dfnames list.
urls=[]
fnames=[]
dfnames=[]
for x in xrange(100,3600,100):
y = str(x)
i = y.zfill(4)
filename='DCHB_Town_Release_'+i+'.xlsx'
url = "http://www.censusindia.gov.in/2011census/dchb/"+filename
urls.append(url)
fnames.append(filename)
dfnames.append((filename, 'DCHB_Town_Release_'+i))
This is the class that uses the dfnames generated by above code.
import pandas as pd
import multiprocessing
class mydf1():
def __init__(self, dflist, jobs, dfnames):
self.dflist=list()
self.jobs=list()
self.dfnames=dfnames
def dframe_create(self, filename, dfname):
print 'abc', filename, dfname
dfname=pd.read_excel(filename)
self.dflist.append(dfname)
print self.dflist
return self.dflist
def mp(self):
for f,d in self.dfnames:
p = multiprocessing.Process(target=self.dframe_create, args=(f,d))
self.jobs.append(p)
p.start()
#return self.dflist
for j in self.jobs:
j.join()
print '%s.exitcode = %s' % (j.name, j.exitcode)
This class when called like this...
dflist=[]
jobs=[]
x=mydf1(dflist, jobs, dfnames)
y=x.mp()
Prints the self.dflist correctly. But does not return anything.
I can collect all datafarmes sequentially. But in order to save time, I need to use multiple processes simultaneously to generate and add dataframes to a list.
In your case I prefer to write as less code as possible and use Pool:
import pandas as pd
import logging
import multiprocessing
def dframe_create(filename):
try:
return pd.read_excel(filename)
except Exception as e:
logging.error("Something went wrong: %s", e, exc_info=1)
return None
p = multiprocessing.Pool()
excel_files = p.map(dframe_create, dfnames)
for f in excel_files:
if f is not None:
print 'Ready to work'
else:
print ':('
Prints the self.dflist correctly. But does not return anything.
That's because you don't have a return statement in the mp method, e.g.
def mp(self):
...
return self.dflist
It's not entirely clear what you're issue is, however, you have to take some care here in that you can't just pass objects/lists across processes. That's why you have special objects (which lock while they make modifications to a list), that way you don't get tripped up when two processes try to make a change at the same time (and you only get one update).
That is, you have to use multiprocessing's list.
class mydf1():
def __init__(self, dflist, jobs, dfnames):
self.dflist = multiprocessing.list() # perhaps should be multiprocessing.list(dflist or ())
self.jobs = list()
self.dfnames = dfnames
However you have a bigger problem: the whole point of multiprocessing is that they may run/finish out of order, so keeping two lists like this is doomed to fail. You should use a multiprocessing.dict that way the DataFrame is saved unambiguously with the filename.
class mydf1():
def __init__(self, dflist, jobs, dfnames):
self.dfdict = multiprocessing.dict()
...
def dframe_create(self, filename, dfname):
print 'abc', filename, dfname
df = pd.read_excel(filename)
self.dfdict[dfname] = df