reindexObject fails during FileField to BlobField migration in Plone 4.0.7 - migration

I'm trying to migrate from Plone 3.3.5 to Plone 4.0.7 and I'm stuck on the step that converts all the FileFields to BlobFields.
The Plone upgrade script successfully converts all native FileFields, but I have several custom AT-based classes that have to be converted manually. I've tried two ways of doing the conversion, both of which lead to the same error:
1. Using schemaextender as outlined in the Plone migration guide and a source code example
2. Renaming all FileFields to blob fields and then running this script:
import transaction
from AccessControl.SecurityManagement import newSecurityManager
from AccessControl import getSecurityManager
from Products.CMFCore.utils import getToolByName
from zope.app.component.hooks import setSite
from Products.contentmigration.migrator import BaseInlineMigrator
from Products.contentmigration.walker import CustomQueryWalker
from plone.app.blob.field import BlobField

admin = app.acl_users.getUserById("admin")
newSecurityManager(None, admin)
portal = app.plone
setSite(portal)

def find_all_types_fields(portal_catalog, type_instance_to_search):
    """Map each class name found in the catalog to the names of its fields
    that are instances of type_instance_to_search."""
    output = {}
    searched = []
    for k in portal_catalog():
        kobj = k.getObject()
        if kobj.__class__.__name__ in searched:
            continue
        searched.append(kobj.__class__.__name__)
        for field in kobj.schema.fields():
            if isinstance(field, type_instance_to_search):
                if kobj.__class__.__name__ in output:
                    output[kobj.__class__.__name__].append(field.__name__)
                else:
                    output[kobj.__class__.__name__] = [field.__name__]
    return output

def produce_migrator(field_map):
    source_class = field_map.keys()[0]
    fields = {}
    for x in field_map.values()[0]:
        fields[x] = None

    class FileBlobMigrator(BaseInlineMigrator):
        '''Migrating ExtensionBlobField (which is still a FileField) to BlobField'''
        src_portal_type = source_class
        src_meta_type = source_class
        fields_map = fields

        def migrate_data(self):
            '''Unfinished'''
            for k in self.fields_map.keys():
                # if hasattr(self.obj, k):
                if k in self.obj.schema.keys():
                    print("***converting attribute: %s" % k)
                    field = self.obj.getField(k).get(self.obj)
                    mutator = self.obj.getField(k).getMutator(self.obj)
                    mutator(field)

        def last_migrate_reindex(self):
            '''Unfinished'''
            self.obj.reindexObject()

    return FileBlobMigrator

def consume_migrator(portal_catalog, migrator):
    walker = CustomQueryWalker(portal_catalog, migrator, full_transaction=True)
    transaction.savepoint(optimistic=True)
    walker_status = walker.go()
    return walker.getOutput()

def migrate_blobs(catalog, migrate_type):
    all_fields = find_all_types_fields(catalog, migrate_type)
    import pdb; pdb.set_trace()
    for k in [{k: all_fields[k]} for k in all_fields]:
        migrator = produce_migrator(k)
        print(consume_migrator(catalog, migrator))

catalog = getToolByName(portal, 'portal_catalog')
migrate_blobs(catalog, BlobField)
The problem occurs on the self.obj.reindexObject() line, where I receive the following traceback:
2011-08-09 17:21:12 ERROR Zope.UnIndex KeywordIndex: unindex_object could not remove documentId -1945041983 from index object_provides. This should not happen.
Traceback (most recent call last):
File "/home/alex/projects/plone4/eggs/Zope2-2.12.18-py2.6-linux-x86_64.egg/Products/PluginIndexes/common/UnIndex.py", line 166, in removeForwardIndexEntry indexRow.remove(documentId)
KeyError: -1945041983
> /home/alex/projects/plone4/eggs/Zope2-2.12.18-py2.6-linux-x86_64.egg/Products/PluginIndexes/common/UnIndex.py(192)removeForwardIndexEntry()
191 str(documentId), str(self.id)),
--> 192 exc_info=sys.exc_info())
193 else:
If I remove the line that triggers reindexing, the conversion completes successfully, but if I then try to manually reindex the catalog, every object that has been converted can no longer be found, and I'm somewhat at a loss as to what to do now.
The site has LinguaPlone installed; maybe that has something to do with this?

One option would be to run the migration without the reindexObject() call and then do a "Clear and Rebuild" from the catalog's Advanced tab in the ZMI after migrating.
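If you prefer to script that last step instead of clicking through the ZMI, the same rebuild can be triggered from the migration script (a minimal sketch, assuming the same debug session as above where portal is already set; clearFindAndRebuild() empties the catalog and re-catalogs every contentish object it finds):

import transaction
from Products.CMFCore.utils import getToolByName

# After migrate_blobs() has run without reindexObject():
catalog = getToolByName(portal, 'portal_catalog')
catalog.clearFindAndRebuild()  # equivalent of "Clear and Rebuild" in the ZMI
transaction.commit()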

Related

Python Panel dashboard causing BufferError and RuntimeErrors

I have struggled for some time to create a data streaming interface using Panel.
Essentially I have approximately 20 named python objects that I monitor and read the spectral output from.
I want a dashboard displaying this as 20 plots that continuously overwrite themselves, since the spectral output must be displayed over the same x-range (channels).
The dashboard runs fine for some time and then I either get:
a) RuntimeError: _pending_writes should be non-None when we have a document lock, and we should have the lock when the document changes
or
b) BufferError: Existing exports of data: object cannot be re-sized
{PYTHON_ENV_PATH}/lib/python3.6/site-packages/bokeh/document/document.py:500: RuntimeWarning: coroutine 'WSHandler.send_message' was never awaited
gc.collect()
I've drafted up an MRE as follows:
import numpy as np
import pandas as pd
import hvplot.streamz
import panel as pn
from streamz.dataframe import PeriodicDataFrame

pn.extension()

# Object from which data is collected:
class data_gen:
    def __init__(self, name, size=1024, sets=4):
        self.name = name
        self.size = size
        self.sets = sets

    def get_data(self):
        return np.random.randn(self.sets, self.size)

# Have a dictionary of items with names:
data_dict = {
    "a": data_gen("a"),
    "b": data_gen("b"),
    "c": data_gen("c"),
    "d": data_gen("d"),
    "e": data_gen("e"),
    "f": data_gen("f"),
}

# Generate dataframe
def name_dataFrame(**kwargs):
    dct = {}
    for name, dg in data_dict.items():
        d = dg.get_data()
        sets, size = d.shape
        t_dict = {}
        for i in range(sets):
            t_dict[i] = {
                c: d[i, c] for c in range(size)
            }
        t_df = pd.DataFrame(t_dict).transpose()
        dct[name] = t_df
    df = pd.concat(dct).transpose()
    return df

# Have it be streamed
df = PeriodicDataFrame(name_dataFrame, interval='10s')

# Compose panel layout
pn_realtime = pn.Column("# Data Dashboard")
for name in data_dict:
    pn_realtime.append(
        pn.Row(f"""## Name: {name}"""))
    pn_realtime.append(pn.Row(
        df[name].hvplot.line(backlog=1024, width=600, height=500,
                             xlabel="n", ylabel="f(n)", grid=True)
    ))

pn_realtime.servable()
My setup is:
# Name Version Build Channel
panel 0.12.1 pyhd3eb1b0_0
hvplot 0.7.3 pyhd3eb1b0_1
pandas 1.1.5 py36ha9443f7_0
streamz 0.6.3 pyhd3eb1b0_0
Python 3.6.13 :: Anaconda, Inc.
Ubuntu 20.04.3 LTS (Focal Fossa)
I'm pretty new to dashboard design (and pandas, for that matter), so I wouldn't be surprised if there were a vastly simpler way to do what I am attempting.
My suspicion is that the appending of Panel objects is causing memory buffers to overfill and garbage collection cannot handle it. If so, what can I do?
Running this MRE on my beefier Windows machine with python 3.9.7 did not seem to crash, but perhaps that is simply because I've not run it for long enough?
I've also set ylims on the hvplot and that seemed to stop crashes from occurring (again maybe I did not run it for long enough), but due to the nature of my application, I cannot have static ylims.
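For reference, the ylim experiment mentioned above amounts to passing fixed bounds to the plot call in the MRE, roughly like this (a sketch; the bounds are arbitrary placeholders):

df[name].hvplot.line(backlog=1024, width=600, height=500,
                     xlabel="n", ylabel="f(n)", grid=True,
                     ylim=(-5, 5))  # fixed y-range; seemed to avoid the crashes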
I appreciate your time and input.
Cheers.

Python multiprocessing how to update a complex object in a manager list without using .join() method

I started programming in Python about 2 months ago and I've been struggling with this problem for the last 2 weeks.
I know there are many similar threads to this one, but I can't really find a solution that suits my case.
I need the main process, which is the one that interacts with Telegram, and another process, buffer, which receives the complex object from the main process and updates it.
I'd like to do this in a simpler and smoother way.
At the moment the objects are not being updated, due to the use of multiprocessing without the join() method.
I then tried to use multithreading instead, but it gives me compatibility problems with Pyrogram, the framework I am using to interact with Telegram.
I rewrote the "complexity" of my project in order to reproduce the same error I am getting, and to make it as easy as possible for everyone to help.
a.py
class A():
    def __init__(self, length=-1, height=-1):
        self.length = length
        self.height = height
b.py
from a import A

class B(A):
    def __init__(self, length=-1, height=-1, width=-1):
        super().__init__(length=-1, height=-1)
        self.length = length
        self.height = height
        self.width = width

    def setHeight(self, value):
        self.height = value
c.py
class C():
    def __init__(self, a, x=0, y=0):
        self.a = a
        self.x = x
        self.y = y

    def func1(self):
        if self.x < 7:
            self.x = 7
d.py
from c import C

class D(C):
    def __init__(self, a, x=0, y=0, z=0):
        super().__init__(a, x=0, y=0)
        self.a = a
        self.x = x
        self.y = y
        self.z = z

    def func2(self):
        self.func1()
main.py
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer

if __name__ == "__main__":
    manager = Manager()
    lizt = manager.list()
    buffer = Process(target=buffer, args=(lizt, ))  # passing the list as a parameter
    buffer.start()
    # can't invoke buffer.join() here because I need the code below to keep running while
    # the buffer process takes a few minutes to handle an instance passed in the list;
    # hence I can't wait for join() to update the objects inside the buffer, but I need
    # the objects updated in order to pop them out of the list

    import datetime as dt
    t = dt.datetime.now()
    # library of a kind of multithreading (pool of 4 processes), uses the asyncio lib
    # this while loop was put here to reproduce the same error I am getting
    while True:
        if t + dt.timedelta(seconds=10) < dt.datetime.now():
            lizt.append(D(B(5, 5, 5)))
            t = dt.datetime.now()

"""
# This is the code which looks like the one in my project
# main.py
from pyrogram import Client  # library of a kind of multithreading (pool of 4 processes), uses the asyncio lib
from b import B
from d import D
from multiprocessing import Process, Manager
from buffer import buffer

if __name__ == "__main__":
    api_id = 1234567
    api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    app = Client("my_account", api_id, api_hash)
    manager = Manager()
    lizt = manager.list()
    buffer = Process(target=buffer, args=(lizt, ))  # passing the list as a parameter
    buffer.start()
    # can't invoke buffer.join() here because I need the code below to run at the same time
    # as the buffer process; hence I can't wait for join() to update the objects inside the buffer

    @app.on_message()
    def my_handler(client, message):
        lizt.append(complex_object_conatining_message)
"""
buffer.py
def buffer(buffer):
    print("buffer was defined")
    while True:
        if len(buffer) > 0:
            print(buffer[0].x)          # prints 0
            buffer[0].func2()           # this changes the attribute locally in the class instance, but not here
            print(buffer[0].x)          # prints 0, but I'd like it to be 7
            print(buffer[0].a.height)   # prints 5
            buffer[0].a.setHeight(10)   # and this has the same behaviour
            print(buffer[0].a.height)   # prints 5, but I'd like it to be 10
            buffer.pop(0)
This is the whole code relevant to the problem I am having.
Literally every suggestion is welcome, hopefully constructive; thank you in advance!
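For context on why the in-place calls in buffer.py don't stick: a Manager list proxy only propagates assignments to its slots, not mutations of the objects stored in them, so a common workaround (shown here as a sketch, not the solution eventually adopted below) is to copy the item out, mutate the copy, and assign it back:

def buffer(buffer):
    print("buffer was defined")
    while True:
        if len(buffer) > 0:
            obj = buffer[0]            # local copy (unpickled in this process)
            obj.func2()                # mutate the copy
            obj.a.setHeight(10)
            buffer[0] = obj            # reassign so the proxy propagates the change
            print(buffer[0].x)         # now prints 7
            print(buffer[0].a.height)  # now prints 10
            buffer.pop(0)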
In the end I had to change my approach to this problem and use asyncio, just as the framework itself does.
This solution offers everything I was looking for:
- complex objects are updated
- the problems of multiprocessing (in particular with join()) are avoided
It is also lightweight: before, I had 2 Python processes of about 40K and 75K; this single process is about 30K (and it's also faster and cleaner).
Here's the solution; I hope it will be useful for someone else, like it was for me.
The classes are skipped here because this solution updates complex objects absolutely fine.
main.py
from pyrogram import Client
from firstWorker import firstWorker
import asyncio
import time

def cancel_tasks():
    # get all tasks in the current loop
    tasks = asyncio.Task.all_tasks()
    for t in tasks:
        t.cancel()

try:
    buffer = []
    firstWorker(buffer)  # this one is the old buffer.py file and function
    # the missing loop and loop method are explained in the next piece of code
except KeyboardInterrupt:
    print("")
finally:
    print("Closing Loop")
    cancel_tasks()
firstWorker.py
import asyncio
from pyrogram import Client

from secondWorker import secondWorker

def firstWorker(buffer):
    print("First Worker Executed")
    api_id = 1234567
    api_hash = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
    app = Client("my_account", api_id, api_hash)

    @app.on_message()
    async def my_handler(client, message):
        print("Message Arrived")
        buffer.append(complex_object_conatining_message)
        await asyncio.sleep(1)

    # Here is the trick: I changed the run() method of the Client class inside the
    # Pyrogram framework, since it is itself a loop. This way I added another task
    # to the existing loop so that both of them run together.
    app.run(secondWorker(buffer))
secondWorker.py
import asyncio

async def secondWorker(buffer):
    while True:
        if len(buffer) > 0:
            print(buffer.pop(0))
        await asyncio.sleep(1)
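The general pattern described in the comment above, running a long-lived consumer coroutine alongside another coroutine on the same event loop, looks roughly like this in plain asyncio (a sketch with stand-in coroutines, independent of Pyrogram):

import asyncio

async def producer(buffer):
    # stand-in for the Pyrogram handler: something appends to the shared list
    while True:
        buffer.append("message")
        await asyncio.sleep(1)

async def consumer(buffer):
    # same shape as secondWorker above
    while True:
        if buffer:
            print(buffer.pop(0))
        await asyncio.sleep(1)

buffer = []
loop = asyncio.get_event_loop()
# schedule both coroutines on the one loop so they interleave cooperatively
loop.run_until_complete(asyncio.gather(producer(buffer), consumer(buffer)))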
The resources to understand the asyncio used in this code can be found here:
Asyncio simple tutorial
Python Asyncio Official Documentation
This tutorial about how to fix classical Asyncio errors

MemoryError when querying database from Process

I am trying to create a program with 3 processes that read from the same database. The code was working before I started introducing processes.
I am getting a MemoryError when performing a select() with Peewee; I suspect there is something wrong with the sharing of resources. Minimal example:
models.py
from peewee import Model
from playhouse.pool import PooledSqliteExtDatabase

file_scanner_database = PooledSqliteExtDatabase(
    None,
    max_connections=32,
)

class FileModel(Model):
    class Meta:
        database = file_scanner_database
main.py
from file_scanner import FileScanner
from models import file_scanner_database
from models import FileModel
from multiprocessing import Process

def create_scanner_agent(data):
    scanner = FileScanner(data)
    scanner.start_scanner()

shared_info = {'db_location': '/absolute/path/to/database'}

file_scanner_database.init(shared_info['db_location'])
file_scanner_database.connect()
file_scanner_database.create_tables([FileModel], safe=True)

new_process = Process(
    target=create_scanner_agent,
    args=(shared_info,)
)
new_process.daemon = True
new_process.start()

try:
    new_process.join()
except KeyboardInterrupt:
    pass

new_process.terminate()
file_scanner.py
from models import file_scanner_database
from models import FileModel

class FileScanner:
    def __init__(self, data):
        for k, v in data.items():
            setattr(self, k, v)
        file_scanner_database.init(self.db_location)
        file_scanner_database.connect()

    def start_scanner(self):
        while True:
            # THIS IS WHERE THE PROGRAM CRASHES
            for row in FileModel.select():
                ...
It looks like you're trying to access memory across a fork, or some such craziness. I think the answer is that you're doing it wrong: try opening your DB connection after the fork.
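In other words, keep the parent's connection out of the fork and let each child open its own. A minimal sketch of a rearranged main.py, assuming the same modules as above: all database setup moves into the child, so the SQLite connection is only ever opened after the fork.

from multiprocessing import Process

from file_scanner import FileScanner
from models import file_scanner_database, FileModel

def create_scanner_agent(data):
    # runs in the child process: FileScanner.__init__ calls init()/connect() here,
    # so the connection is created after the fork
    scanner = FileScanner(data)
    file_scanner_database.create_tables([FileModel], safe=True)
    scanner.start_scanner()

if __name__ == '__main__':
    shared_info = {'db_location': '/absolute/path/to/database'}

    # note: no init()/connect()/create_tables() in the parent any more
    new_process = Process(target=create_scanner_agent, args=(shared_info,))
    new_process.daemon = True
    new_process.start()

    try:
        new_process.join()
    except KeyboardInterrupt:
        pass
    new_process.terminate()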

minimal example of how to export a jupyter notebook to pdf using nbconvert and PDFExporter()

I am trying to export a pdf copy of a jupyter notebook using nbconvert from within a notebook cell. I have read the documentation, but I just cannot find some basic code to actually execute the nbconvert command and export to pdf.
I was able to get this far, but I was hoping that someone could just fill in the final gaps.
from nbconvert import PDFExporter
notebook_pdf = PDFExporter()
notebook_pdf.template_file = '../print_script/pdf_nocode.tplx'
Not sure how to get from here to actually getting the PDF created.
Any help would be appreciated.
I'm no expert, but I managed to get this working. The key is that you need to preprocess the notebook, which allows you to use the PDFExporter.from_notebook_node() function. This gives you your pdf_data as bytes, which can then be written to a file:
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert import PDFExporter

notebook_filename = "notebook.ipynb"

with open(notebook_filename) as f:
    nb = nbformat.read(f, as_version=4)

ep = ExecutePreprocessor(timeout=600, kernel_name='python3')
ep.preprocess(nb, {'metadata': {'path': 'notebooks/'}})

pdf_exporter = PDFExporter()
pdf_data, resources = pdf_exporter.from_notebook_node(nb)

with open("notebook.pdf", "wb") as f:
    f.write(pdf_data)
It's worth noting that ExecutePreprocessor.preprocess() takes a resources dict (here used only to set the execution path), and from_notebook_node() returns a resources dict that we don't use in this example.
The following is a REST API that converts an .ipynb file into .html:
POST: http://URL/export/<id>
GET: http://URL/export/<id> will return id.html
import os
from flask import Flask, render_template, make_response
from flask_cors import CORS
from flask_restful import reqparse, abort, Api, Resource
from nbconvert.exporters import HTMLExporter

exporter = HTMLExporter()

app = Flask(__name__)
cors = CORS(app, resources={r"/export/*": {"origins": "*"}})
api = Api(app)

parser = reqparse.RequestParser()
parser.add_argument('path')

notebook_file_srv = '/path of your .ipynb file'

def notebook_doesnt_exist(nb):
    abort(404, message="Notebook {} doesn't exist".format(nb))

class Notebook(Resource):
    def get(self, id):
        headers = {'Content-Type': 'text/html'}
        return make_response(render_template(id + '.html'), 200, headers)

    def post(self, id):
        args = parser.parse_args()
        notebook_file = args['path']
        notebook_file = notebook_file_srv + id + '.ipynb'
        if not os.path.exists(notebook_file):
            return 'notebook \'.ipynb\' file not found', 404
        else:
            nb_name, _ = os.path.splitext(os.path.basename(notebook_file))
            # dirname = os.path.dirname(notebook_file)
            output_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'templates')
            output_path = os.path.join(output_path, '{}.html'.format(nb_name))
            output, resources = exporter.from_filename(notebook_file)
            f = open(output_path, 'wb')
            f.write(output.encode('utf8'))
            f.close()
            return 'done', 201

api.add_resource(Notebook, '/export/<id>')

if __name__ == '__main__':
    app.run(debug=True)
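A quick way to exercise the API (a sketch; the host, port, and notebook id are placeholders, and the .ipynb file is assumed to exist under notebook_file_srv on the server):

import requests

base = "http://localhost:5000/export/my_notebook"

r = requests.post(base)        # converts my_notebook.ipynb to templates/my_notebook.html
print(r.status_code, r.text)   # expect: 201 done

r = requests.get(base)         # serves the rendered my_notebook.html
print(r.status_code)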

Is there a way to get tensorflow tf.Print output to appear in Jupyter Notebook output

I'm using the tf.Print op in a Jupyter notebook. It works as required, but will only print the output to the console, without printing in the notebook. Is there any way to get around this?
An example would be the following (in a notebook):
import tensorflow as tf
a = tf.constant(1.0)
a = tf.Print(a, [a], 'hi')
sess = tf.Session()
a.eval(session=sess)
That code will print 'hi[1]' in the console, but nothing in the notebook.
Update Feb 3, 2017
I've wrapped this into the memory_util package. Example usage:
# install memory util
import urllib.request
response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py")
open("memory_util.py", "wb").write(response.read())
import memory_util
sess = tf.Session()
a = tf.random_uniform((1000,))
b = tf.random_uniform((1000,))
c = a + b
with memory_util.capture_stderr() as stderr:
    sess.run(c.op)
print(stderr.getvalue())
**Old stuff**
You could reuse FD redirector from IPython core. (idea from Mark Sandler)
import os
import sys

STDOUT = 1
STDERR = 2

class FDRedirector(object):
    """ Class to redirect output (stdout or stderr) at the OS level using
    file descriptors.
    """

    def __init__(self, fd=STDOUT):
        """ fd is the file descriptor of the output you want to capture.
        It can be STDOUT or STDERR.
        """
        self.fd = fd
        self.started = False
        self.piper = None
        self.pipew = None

    def start(self):
        """ Setup the redirection.
        """
        if not self.started:
            self.oldhandle = os.dup(self.fd)
            self.piper, self.pipew = os.pipe()
            os.dup2(self.pipew, self.fd)
            os.close(self.pipew)
            self.started = True

    def flush(self):
        """ Flush the captured output, similar to the flush method of any
        stream.
        """
        if self.fd == STDOUT:
            sys.stdout.flush()
        elif self.fd == STDERR:
            sys.stderr.flush()

    def stop(self):
        """ Unset the redirection and return the captured output.
        """
        if self.started:
            self.flush()
            os.dup2(self.oldhandle, self.fd)
            os.close(self.oldhandle)
            f = os.fdopen(self.piper, 'r')
            output = f.read()
            f.close()
            self.started = False
            return output
        else:
            return ''

    def getvalue(self):
        """ Return the output captured since the last getvalue, or the
        start of the redirection.
        """
        output = self.stop()
        self.start()
        return output

import tensorflow as tf

x = tf.constant([1, 2, 3])
a = tf.Print(x, [x])

redirect = FDRedirector(STDERR)

sess = tf.InteractiveSession()
redirect.start()
a.eval()
print("Result")
print(redirect.stop())
I ran into the same problem and got around it by using a function like this in my notebooks:
import numpy as np
import tensorflow as tf

def tf_print(tensor, transform=None):
    # Insert a custom python operation into the graph that does nothing but print a tensor's value
    def print_tensor(x):
        # x is typically a numpy array here so you could do anything you want with it,
        # but adding a transformation of some kind usually makes the output more digestible
        print(x if transform is None else transform(x))
        return x

    log_op = tf.py_func(print_tensor, [tensor], [tensor.dtype])[0]
    with tf.control_dependencies([log_op]):
        res = tf.identity(tensor)

    # Return the given tensor
    return res

# Now define a tensor and use the tf_print function much like the tf.identity function
tensor = tf_print(tf.random_normal([100, 100]), transform=lambda x: [np.min(x), np.max(x)])

# This will print the transformed version of the tensor's actual value
# (which was summarized to just the min and max for brevity)
sess = tf.InteractiveSession()
sess.run([tensor])
sess.close()
FYI, using a logger instead of calling print in my custom function worked wonders for me, as stdout is often buffered by Jupyter and not shown before "Loss is Nan"-type errors, which was the whole point of using that function in the first place in my case.
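For illustration, swapping the print for a logger inside the helper above could look like this (a sketch; the logger name and level are arbitrary choices):

import logging

logger = logging.getLogger("tf_debug")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())  # StreamHandler flushes after each record

def print_tensor(x):
    # drop-in replacement for the print-based inner function used by tf_print
    logger.info(x)
    return x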
You can check the terminal where you launched the jupyter notebook to see the message.
import tensorflow as tf
tf.InteractiveSession()
a = tf.constant(1)
b = tf.constant(2)
opt = a + b
opt = tf.Print(opt, [opt], message="1 + 2 = ")
opt.eval()
In the terminal, I can see:
2018-01-02 23:38:07.691808: I tensorflow/core/kernels/logging_ops.cc:79] 1 + 2 = [3]
A simple way; I've tried it in regular Python, but not in Jupyter yet:
import os
import sys

os.dup2(sys.stdout.fileno(), 1)
os.dup2(sys.stdout.fileno(), 2)
Explanation is here: In python, how to capture the stdout from a c++ shared library to a variable
The issue that I faced was that one can't run a session inside a TensorFlow graph function, as in training or evaluation.
That's why the options of using sess.run(opt) or opt.eval() were not a solution for me.
The best thing was to use tf.Print() and redirect the logging to an external file.
I did this using a temporary file, which I transferred to a regular file like this:
STDERR = 2
import os
import sys
import tempfile

class captured:
    def __init__(self, fd=STDERR):
        self.fd = fd
        self.prevfd = None

    def __enter__(self):
        t = tempfile.NamedTemporaryFile()
        self.prevfd = os.dup(self.fd)
        os.dup2(t.fileno(), self.fd)
        return t

    def __exit__(self, exc_type, exc_value, traceback):
        os.dup2(self.prevfd, self.fd)

with captured(fd=STDERR) as tmp:
    ...
    classifier.evaluate(input_fn=input_fn, steps=100)

with open('log.txt', 'w') as f:
    print(open(tmp.name).read(), file=f)
And then in my evaluation I do:
a = tf.constant(1)
a = tf.Print(a, [a], message="a: ")