Simplest and most Pythonic way to serialize/deserialize object <-> dict <-> JSON in Python 3.5+ - serialization

I am new to Python, and I know there are many answers, but most of them use __dict__, which (as I understood it) is no longer available in Python 3.
Let's say I have this object:
class A(object):
    def __init__(self, f1, f2):
        self.f1 = f1
        self.f2 = f2
Now f2 is itself another object:
class F2(object):
    def __init__(self, f21, f22):
        self.f21 = f21
        self.f22 = f22
So an A object is a compound object. What is the simplest way to:
serialize A to a dict and then to JSON,
and then deserialize it back from JSON -> A,
all in Python 3.5+, and preferably without additional imports, as our internal company nexus is limited.

import pickle
import base64
import json

class A(object):
    def __init__(self, f1, f2):
        self.f1 = f1
        self.f2 = f2

a = pickle.dumps(A, 3)                    # pickle the class itself, protocol 3
j = base64.b64encode(a).decode('utf-8')   # make the pickle bytes JSON-safe
x = json.dumps([j])
print(a)
print(j)
print(x)
# output:
# b'\x80\x03c__main__\nA\nq\x00.'
# gANjX19tYWluX18KQQpxAC4=
# ["gANjX19tYWluX18KQQpxAC4="]
Deserialization is just the inverse: json.loads, str.encode, base64.b64decode, pickle.loads.
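A minimal sketch of that inverse path, continuing from the variables above (note that the snippet pickled the class A itself rather than an instance, so unpickling yields that same class object):

j2 = json.loads(x)[0]                        # back to the base64 string
raw = base64.b64decode(j2.encode('utf-8'))   # back to the pickle bytes
obj = pickle.loads(raw)
print(obj is A)                              # True: the pickle stored a reference to __main__.A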

Related

PyQGIS: How to use QgsRasterFileWriter.writeRaster to create a raster from a numpy array

I am trying to use the method writeRaster from qgis.core.QgsRasterFileWriter to create a single-band raster of floats and NaNs, but according to the documentation I need to provide these inputs:
writeRaster(
    self,                                    # OK
    pipe: QgsRasterPipe,                     # Q1
    nCols: int,                              # OK
    nRows: int,                              # OK
    outputExtent: QgsRectangle,              # Q2
    crs: QgsCoordinateReferenceSystem,       # OK
    feedback: QgsRasterBlockFeedback = None  # OK
) → QgsRasterFileWriter.WriterError
I have 2 questions here:
Q1: What is a QgsRasterPipe, how do I use it, and what is its purpose?
The documentation says: Constructor for QgsRasterPipe. Base class for processing modules.
The few examples of writeRaster I found online just initialize this object. So what do I need to provide in the pipe argument?
Q2: The argument outputExtent of type QgsRectangle seems to be the bounding area of my raster: QgsRectangle(x_min, y_min, x_max, y_max). But here is my question: Where do I declare the values of pixels?
Here is the script (not working) I have for the moment:
import os
import numpy
from qgis.core import (
    QgsMapLayer,
    QgsRasterFileWriter,
    QgsCoordinateReferenceSystem,
    QgsRasterPipe,
    QgsRectangle,
)

def write_to_geotiff(data: list, filename: str, epsg: str, layer: str = None) -> None:
    x_data = data[0]
    y_data = data[1]
    z_data = data[2]
    nx, ny = len(x_data), len(y_data)
    QgsRasterFileWriter.writeRaster(
        QgsRasterPipe(),
        nCols=nx,
        nRows=ny,
        outputExtent=QgsRectangle(
            min(x_data),
            min(y_data),
            max(x_data),
            max(y_data)
        ),
        crs=QgsCoordinateReferenceSystem(f"epsg:{epsg}"),
    )
if __name__ == "__main__":
    filename = r"C:\Users\vince\Downloads\test.gpkg"
    x_data = numpy.asarray([0, 1, 2])
    y_data = numpy.asarray([0, 1])
    z_data = numpy.asarray([
        [0.1, numpy.nan],
        [0.5, 139.5],
        [150.98, numpy.nan],
    ])
    epsg = "4326"
    write_to_geotiff(
        [x_data, y_data, z_data],
        filename,
        epsg
    )
I saw this answer for Q1: the data is in the pipe variable. But I don't know how to create a QgsRasterBlock from my numpy array...
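For Q1, the usual pattern in the PyQGIS examples I have seen is that the pipe is fed an existing raster data provider (for instance cloned from an already-loaded layer), and writeRaster then pulls the pixel data through it. A hedged sketch of that pattern, with purely illustrative file paths:

from qgis.core import QgsRasterLayer, QgsRasterPipe, QgsRasterFileWriter

layer = QgsRasterLayer("/path/to/existing.tif", "source")   # hypothetical input raster
provider = layer.dataProvider()

pipe = QgsRasterPipe()
pipe.set(provider.clone())   # the pipe wraps the provider that actually supplies the pixels

writer = QgsRasterFileWriter("/path/to/copy.tif")            # hypothetical output path
writer.writeRaster(
    pipe,
    provider.xSize(),
    provider.ySize(),
    provider.extent(),
    provider.crs(),
)

That does not directly cover the numpy case, though, which is what the answer below addresses.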
I got it working using the method QgsRasterFileWriter.createOneBandRaster, which creates a provider.
You can get the block of the provider (of type QgsRasterBlock) and use its setValue method to assign the values.
# in addition to the imports above, this needs: from qgis.core import Qgis
writer = QgsRasterFileWriter(filename)
provider = QgsRasterFileWriter.createOneBandRaster(
    writer,
    dataType=Qgis.Float32,
    width=nx,
    height=ny,
    extent=QgsRectangle(
        min(x_data),
        min(y_data),
        max(x_data),
        max(y_data)
    ),
    crs=QgsCoordinateReferenceSystem(f"epsg:{epsg}"),
)
provider.setNoDataValue(1, -1)
provider.setEditable(True)

block = provider.block(
    bandNo=1,
    boundingBox=provider.extent(),
    width=provider.xSize(),
    height=provider.ySize()
)
for ix in range(nx):
    for iy in range(ny):
        value = z_data[ix][iy]
        if numpy.isnan(value):   # 'value == numpy.nan' is never True, so test with numpy.isnan
            continue
        block.setValue(iy, ix, value)

provider.writeBlock(
    block=block,
    band=1,
    xOffset=0,
    yOffset=0
)
provider.setEditable(False)
This will create a TIFF file:

Calling Numba-generated PyCFunctionWithKeywords from Python

I serialized a jitted Numba function to a byte array and now want to deserialize and call it. This works fine for primitive data types with llvm_cfunc_wrapper_name:
import numba, ctypes
import llvmlite.binding as llvm

@numba.njit("f8(f8)")
def foo(x):
    return x + 0.5

# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cfunc_name = foo.overloads[sig].fndesc.llvm_cfunc_wrapper_name
function_bytes = lib._get_compiled_object()

# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))

func_ptr = engine.get_function_address(cfunc_name)
func = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)(func_ptr)
print(func(0.25))
But I want to call functions with NumPy arguments. There is a llvm_cpython_wrapper_name for that which uses PyCFunctionWithKeywords, but unfortunately my best guess segfaults:
import numba, ctypes
import llvmlite.binding as llvm
import numpy as np

@numba.njit("f8[:](f8[:])")
def foo(x):
    return x + 0.5

# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cpython_name = foo.overloads[sig].fndesc.llvm_cpython_wrapper_name
function_bytes = lib._get_compiled_object()

# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))

func_ptr = engine.get_function_address(cpython_name)

def func(*args, **kwargs):
    py_obj_ptr = ctypes.POINTER(ctypes.py_object)
    return ctypes.CFUNCTYPE(py_obj_ptr, py_obj_ptr, py_obj_ptr, py_obj_ptr)(func_ptr)(
        ctypes.cast(id(None), py_obj_ptr),
        ctypes.cast(id(args), py_obj_ptr),
        ctypes.cast(id(kwargs), py_obj_ptr))

# segfaults here
print(func(np.ones(3)))
Here are some links to Numba source code (unfortunately very hard to follow), which might be helpful to figure this out.
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/callwrapper.py#L105
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/pythonapi.py#L1456

Is there a way to get tensorflow tf.Print output to appear in Jupyter Notebook output

I'm using the tf.Print op in a Jupyter notebook. It works as required, but will only print the output to the console, without printing in the notebook. Is there any way to get around this?
An example would be the following (in a notebook):
import tensorflow as tf
a = tf.constant(1.0)
a = tf.Print(a, [a], 'hi')
sess = tf.Session()
a.eval(session=sess)
That code will print 'hi[1]' in the console, but nothing in the notebook.
Update Feb 3, 2017
I've wrapped this up in the memory_util package. Example usage:
# install memory util
import urllib.request
response = urllib.request.urlopen("https://raw.githubusercontent.com/yaroslavvb/memory_util/master/memory_util.py")
open("memory_util.py", "wb").write(response.read())
import memory_util

sess = tf.Session()
a = tf.random_uniform((1000,))
b = tf.random_uniform((1000,))
c = a + b

with memory_util.capture_stderr() as stderr:
    sess.run(c.op)

print(stderr.getvalue())
Old stuff
You could reuse the FD redirector from IPython core (idea from Mark Sandler):
import os
import sys

STDOUT = 1
STDERR = 2

class FDRedirector(object):
    """ Class to redirect output (stdout or stderr) at the OS level using
        file descriptors.
    """
    def __init__(self, fd=STDOUT):
        """ fd is the file descriptor of the output you want to capture.
            It can be STDOUT or STDERR.
        """
        self.fd = fd
        self.started = False
        self.piper = None
        self.pipew = None

    def start(self):
        """ Setup the redirection.
        """
        if not self.started:
            self.oldhandle = os.dup(self.fd)
            self.piper, self.pipew = os.pipe()
            os.dup2(self.pipew, self.fd)
            os.close(self.pipew)
            self.started = True

    def flush(self):
        """ Flush the captured output, similar to the flush method of any
            stream.
        """
        if self.fd == STDOUT:
            sys.stdout.flush()
        elif self.fd == STDERR:
            sys.stderr.flush()

    def stop(self):
        """ Unset the redirection and return the captured output.
        """
        if self.started:
            self.flush()
            os.dup2(self.oldhandle, self.fd)
            os.close(self.oldhandle)
            f = os.fdopen(self.piper, 'r')
            output = f.read()
            f.close()
            self.started = False
            return output
        else:
            return ''

    def getvalue(self):
        """ Return the output captured since the last getvalue, or the
            start of the redirection.
        """
        output = self.stop()
        self.start()
        return output

import tensorflow as tf
x = tf.constant([1, 2, 3])
a = tf.Print(x, [x])

redirect = FDRedirector(STDERR)
sess = tf.InteractiveSession()
redirect.start()
a.eval()
print("Result")
print(redirect.stop())
I ran into the same problem and got around it by using a function like this in my notebooks:
import numpy as np
import tensorflow as tf

def tf_print(tensor, transform=None):
    # Insert a custom python operation into the graph that does nothing but print a tensor's value
    def print_tensor(x):
        # x is typically a numpy array here, so you could do anything you want with it,
        # but adding a transformation of some kind usually makes the output more digestible
        print(x if transform is None else transform(x))
        return x
    log_op = tf.py_func(print_tensor, [tensor], [tensor.dtype])[0]
    with tf.control_dependencies([log_op]):
        res = tf.identity(tensor)
    # Return the given tensor
    return res

# Now define a tensor and use the tf_print function much like the tf.identity function
tensor = tf_print(tf.random_normal([100, 100]), transform=lambda x: [np.min(x), np.max(x)])

# This will print the transformed version of the tensor's actual value
# (which was summarized to just the min and max for brevity)
sess = tf.InteractiveSession()
sess.run([tensor])
sess.close()
FYI, using a logger instead of calling print in my custom function worked wonders for me, since stdout is often buffered by Jupyter and not shown before "Loss is NaN"-style errors, which was the whole point of using that function in the first place in my case.
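A hedged sketch of that logger variant; only the inner print_tensor function from the answer above changes, and the basicConfig call is an assumption standing in for whatever logging setup you already have:

import logging
logging.basicConfig(level=logging.INFO)   # assumed setup: the root logger writes to stderr, which the notebook displays

def print_tensor(x):
    # same inner function as in tf_print above, but logging instead of print
    logging.info("tensor value: %s", x)
    return x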
You can check the terminal where you launched the jupyter notebook to see the message.
import tensorflow as tf
tf.InteractiveSession()
a = tf.constant(1)
b = tf.constant(2)
opt = a + b
opt = tf.Print(opt, [opt], message="1 + 2 = ")
opt.eval()
In the terminal, I can see:
2018-01-02 23:38:07.691808: I tensorflow/core/kernels/logging_ops.cc:79] 1 + 2 = [3]
A simple way; I tried it in regular Python, but not Jupyter yet:
import os, sys
os.dup2(sys.stdout.fileno(), 1)
os.dup2(sys.stdout.fileno(), 2)
Explanation is here: In python, how to capture the stdout from a c++ shared library to a variable
The issue I faced was that you can't run a session inside a TensorFlow graph, for example during training or evaluation.
That's why the options of using sess.run(opt) or opt.eval() were not a solution for me.
The best option was to use tf.Print() and redirect the logging to an external file.
I did this using a temporary file, which I transferred to a regular log file like this:
STDERR = 2
import os
import sys
import tempfile

class captured:
    def __init__(self, fd=STDERR):
        self.fd = fd
        self.prevfd = None

    def __enter__(self):
        t = tempfile.NamedTemporaryFile()
        self.prevfd = os.dup(self.fd)
        os.dup2(t.fileno(), self.fd)
        return t

    def __exit__(self, exc_type, exc_value, traceback):
        os.dup2(self.prevfd, self.fd)

with captured(fd=STDERR) as tmp:
    ...
    classifier.evaluate(input_fn=input_fn, steps=100)

with open('log.txt', 'w') as f:
    print(open(tmp.name).read(), file=f)
And then in my evaluation I do:
a = tf.constant(1)
a = tf.Print(a, [a], message="a: ")

Return a list from a class object

I am using the multiprocessing module to generate 35 dataframes. I expect this to save time. But the problem is that the class does not return anything. I expect the list of dataframes to be returned via self.dflist.
Here is how the dfnames list is created:
urls = []
fnames = []
dfnames = []
for x in xrange(100, 3600, 100):
    y = str(x)
    i = y.zfill(4)
    filename = 'DCHB_Town_Release_' + i + '.xlsx'
    url = "http://www.censusindia.gov.in/2011census/dchb/" + filename
    urls.append(url)
    fnames.append(filename)
    dfnames.append((filename, 'DCHB_Town_Release_' + i))
This is the class that uses the dfnames list generated by the above code:
import pandas as pd
import multiprocessing

class mydf1():
    def __init__(self, dflist, jobs, dfnames):
        self.dflist = list()
        self.jobs = list()
        self.dfnames = dfnames

    def dframe_create(self, filename, dfname):
        print 'abc', filename, dfname
        dfname = pd.read_excel(filename)
        self.dflist.append(dfname)
        print self.dflist
        return self.dflist

    def mp(self):
        for f, d in self.dfnames:
            p = multiprocessing.Process(target=self.dframe_create, args=(f, d))
            self.jobs.append(p)
            p.start()
        #return self.dflist
        for j in self.jobs:
            j.join()
            print '%s.exitcode = %s' % (j.name, j.exitcode)
When the class is called like this:
dflist=[]
jobs=[]
x=mydf1(dflist, jobs, dfnames)
y=x.mp()
it prints self.dflist correctly, but does not return anything.
I can collect all dataframes sequentially, but in order to save time I need to use multiple processes simultaneously to generate the dataframes and add them to a list.
In your case I'd prefer to write as little code as possible and use a Pool:
import pandas as pd
import logging
import multiprocessing

def dframe_create(filename):
    try:
        return pd.read_excel(filename)
    except Exception as e:
        logging.error("Something went wrong: %s", e, exc_info=1)
        return None

p = multiprocessing.Pool()
excel_files = p.map(dframe_create, fnames)   # map over the plain filenames list from the question

for f in excel_files:
    if f is not None:
        print 'Ready to work'
    else:
        print ':('
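If you also want to keep each DataFrame paired with its name, a small hedged variation on the same Pool idea (dframe_create_named is an illustrative helper, not part of the original code; the pool is created after the helper is defined so the workers can see it):

def dframe_create_named(args):
    filename, dfname = args                  # dfnames holds (filename, name) tuples in the question
    return dfname, dframe_create(filename)

pool = multiprocessing.Pool()
named_frames = dict(pool.map(dframe_create_named, dfnames))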
You say it prints self.dflist correctly but does not return anything. That's because you don't have a return statement in the mp method, e.g.:
def mp(self):
    ...
    return self.dflist
It's not entirely clear what your issue is; however, you have to take some care here, in that you can't just pass ordinary objects/lists across processes. That's why there are special shared objects (which lock while they modify the list), so you don't get tripped up when two processes try to make a change at the same time (and you only get one update).
That is, you have to use a list from a multiprocessing Manager:
class mydf1():
    def __init__(self, dflist, jobs, dfnames):
        self.dflist = multiprocessing.Manager().list()  # perhaps should be Manager().list(dflist or ())
        self.jobs = list()
        self.dfnames = dfnames
However, you have a bigger problem: the whole point of multiprocessing is that the processes may run/finish out of order, so keeping two parallel lists like this is doomed to fail. You should use a Manager dict, so each DataFrame is saved unambiguously against its name:
class mydf1():
    def __init__(self, dflist, jobs, dfnames):
        self.dfdict = multiprocessing.Manager().dict()
        ...

    def dframe_create(self, filename, dfname):
        print 'abc', filename, dfname
        df = pd.read_excel(filename)
        self.dfdict[dfname] = df
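For completeness, a hedged sketch of how an mp method inside that same class could then collect the results, reusing self.jobs and self.dfnames from the question and the Manager-backed dfdict above:

    def mp(self):
        for f, d in self.dfnames:
            p = multiprocessing.Process(target=self.dframe_create, args=(f, d))
            self.jobs.append(p)
            p.start()
        for j in self.jobs:
            j.join()
        # copy the shared proxy into a plain dict once all workers have finished
        return dict(self.dfdict)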

Itertools for containers

Consider the following interactive example:
>>> l=imap(str,xrange(1,4))
>>> list(l)
['1', '2', '3']
>>> list(l)
[]
Does anyone know if there is already an implementation out there of a version of imap (and the other itertools functions) such that the second time list(l) is executed you get the same result as the first? I don't want the regular map, because building the entire output in memory can be a waste of memory if you use larger ranges.
I want something that basically does something like
class cmap:
    def __init__(self, function, *iterators):
        self._function = function
        self._iterators = iterators

    def __iter__(self):
        return itertools.imap(self._function, *self._iterators)

    def __len__(self):
        return min(map(len, self._iterators))
But it would be a waste of time to do this manually for all itertools if someone already did this.
PS. Do you think containers are more zen than iterators, given that for an iterator something like
for i in iterator:
    do something
implicitly empties the iterator, while with a container you explicitly need to remove elements?
You do not have to build such an object for each type of container. Basically, you have the following:
mkimap = lambda: imap(str,xrange(1,4))
list(mkimap())
list(mkimap())
Now you only need a nice wrapping object to avoid the "ugly" function calls. This could work this way:
class MultiIter(object):
    def __init__(self, f, *a, **k):
        if a or k:
            self.create = lambda: f(*a, **k)
        else:  # optimize
            self.create = f

    def __iter__(self):
        return self.create()

l = MultiIter(lambda: imap(str, xrange(1, 4)))
# or
l = MultiIter(imap, str, xrange(1, 4))
# or even
@MultiIter
def l():
    return imap(str, xrange(1, 4))

# and then
print list(l)
print list(l)
(untested, hope it works, but you should get the idea)
For your 2nd question: Iterators and containers both have their uses. You should take whatever best fits your needs.
You may be looking for itertools.tee()
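For illustration, a hedged sketch of how tee would apply to the example from the question (Python 2 syntax to match; tee buffers the values it has handed out, so memory use grows with how far the copies diverge):

from itertools import imap, tee

it1, it2 = tee(imap(str, xrange(1, 4)))
print list(it1)   # ['1', '2', '3']
print list(it2)   # ['1', '2', '3'] - replayed from tee's internal buffer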
Iterators are my favorite topic ;)
from itertools import imap

class imap2(object):
    def __init__(self, f, *args):
        self.g = imap(f, *args)
        self.lst = []
        self.done = False

    def __iter__(self):
        while True:
            try:  # try to get something from g
                x = next(self.g)
            except StopIteration:
                if self.done:
                    # give the old values
                    for x in self.lst:
                        yield x
                else:
                    # g was consumed for the first time
                    self.done = True
                return
            else:
                self.lst.append(x)
                yield x
l=imap2(str,xrange(1,4))
print list(l)
print list(l)