cannot pickle 'PyCapsule' object error when using a pybind11 function with dask and pandas

See the following example, extracted from the pybind11 python_example project (https://github.com/pybind/python_example).
The setup.py is:
import sys

# Available at setup time due to pyproject.toml
from pybind11 import get_cmake_dir
from pybind11.setup_helpers import Pybind11Extension, build_ext
from setuptools import setup

__version__ = "0.0.1"

# The main interface is through Pybind11Extension.
# * You can add cxx_std=11/14/17, and then build_ext can be removed.
# * You can set include_pybind11=false to add the include directory yourself,
#   say from a submodule.
#
# Note:
#   Sort input source files if you glob sources to ensure bit-for-bit
#   reproducible builds (https://github.com/pybind/python_example/pull/53)

ext_modules = [
    Pybind11Extension("python_example",
        ["src/main.cpp"],
        # Example: passing in the version to the compiled code
        define_macros = [('VERSION_INFO', __version__)],
        ),
]

setup(
    name="python_example",
    version=__version__,
    author="Sylvain Corlay",
    author_email="sylvain.corlay@gmail.com",
    url="https://github.com/pybind/python_example",
    description="A test project using pybind11",
    long_description="",
    ext_modules=ext_modules,
    extras_require={"test": "pytest"},
    # Currently, build_ext only provides an optional "highest supported C++
    # level" feature, but in the future it may provide more features.
    cmdclass={"build_ext": build_ext},
    zip_safe=False,
    python_requires=">=3.7",
)
The C++ part is (src/main.cpp):
#include <pybind11/pybind11.h>

#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)

int add(int i, int j) {
    return i + j;
}

namespace py = pybind11;

PYBIND11_MODULE(python_example, m) {
    m.doc() = R"pbdoc(
        Pybind11 example plugin
        -----------------------

        .. currentmodule:: python_example

        .. autosummary::
           :toctree: _generate

           add
           subtract
    )pbdoc";

    m.def("add", &add, R"pbdoc(
        Add two numbers

        Some other explanation about the add function.
    )pbdoc");

    m.def("subtract", [](int i, int j) { return i - j; }, R"pbdoc(
        Subtract two numbers

        Some other explanation about the subtract function.
    )pbdoc");

#ifdef VERSION_INFO
    m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#else
    m.attr("__version__") = "dev";
#endif
}
And the Python code that I want to run is this (example.py):
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from python_example import add


def python_add(i: int, j: int) -> int:
    return i + j


def add_column_values_python(row: pd.Series) -> pd.Series:
    row['sum'] = python_add(row['i'], row['j'])


def add_column_values(row: pd.Series) -> pd.Series:
    row['sum'] = add(int(row['i']), int(row['j']))


def main():
    dataframe = pd.read_csv('./example.csv', index_col=[])
    dataframe['sum'] = np.nan

    with ProgressBar():
        d_dataframe = dd.from_pandas(dataframe, npartitions=16)
        dataframe = d_dataframe.map_partitions(
            lambda df: df.apply(add_column_values_python, axis=1)).compute(scheduler='processes')

    with ProgressBar():
        d_dataframe = dd.from_pandas(dataframe, npartitions=16)
        dataframe = d_dataframe.map_partitions(
            lambda df: df.apply(add_column_values, axis=1), meta=pd.Series(dtype='float64')).compute(scheduler='processes')


if __name__ == '__main__':
    main()
And the example.csv file looks like this:
i,j
1,2
3,4
5,6
7,8
9,10
But when I run this code I get the following error when using the C++ add version:
[########################################] | 100% Completed | 1.24 ss
[ ] | 0% Completed | 104.05 ms
Traceback (most recent call last):
  File "/Users/user/local/src/python_example/example.py", line 38, in <module>
    main()
  File "/Users/user/local/src/python_example/example.py", line 33, in main
    dataframe = d_dataframe.map_partitions(
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/dask/base.py", line 314, in compute
    (result,) = compute(self, traverse=False, **kwargs)
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/dask/base.py", line 599, in compute
    results = schedule(dsk, keys, **kwargs)
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/dask/multiprocessing.py", line 233, in get
    result = get_async(
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/dask/local.py", line 499, in get_async
    fire_tasks(chunksize)
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/dask/local.py", line 481, in fire_tasks
    dumps((dsk[key], data)),
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/Users/user/local/src/python_example/.venv/lib/python3.9/site-packages/cloudpickle/cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
TypeError: cannot pickle 'PyCapsule' object
Is there a way to solve that, maybe by defining something in the C++ module definition?
Note that this example is only to illustrate the problem.

General
If you want to pass an object from one process to another in Python (with or without dask), you need a way to serialise it. The default method for this is pickle. Objects from C libraries are fundamentally dynamic, pointer-based things, and pickle doesn't know what to do with them. You can implement the pickle protocol for your C object by providing __getstate__/__setstate__ or __reduce__ dunder methods (sketched below).
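For illustration, a minimal pure-Python sketch of that idea (the PicklableAdd wrapper is hypothetical, not part of the question's module): instead of pickling the PyCapsule-backed function itself, you tell pickle how to rebuild the wrapper in the other process.
class PicklableAdd:
    """Hypothetical wrapper: pickle rebuilds it instead of the PyCapsule."""

    def __init__(self):
        from python_example import add   # PyCapsule-backed, not picklable itself
        self._add = add

    def __call__(self, i, j):
        return self._add(i, j)

    def __reduce__(self):
        # "To rebuild me, call PicklableAdd() again in the other process."
        return (PicklableAdd, ())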
Alternatively, dask has a layer of serialisation where you can register specific ser/de functions for specific classes, but that only works with the distributed scheduler, not multiprocessing (the former is better in every way; there is no good reason you should be using multiprocessing).
Specific
A couple of simpler options:
use the threading scheduler, so that no serialisation is needed at all (C code ought to release the GIL and get full parallelism); a sketch of this follows after these options
I think it's only the add function that is the problem; it's probably enough to move your import into the add_column_values function, so that each worker gets its own copy instead of receiving it via the closure.
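A sketch of the threaded option, reusing the question's own example (untested; the CSV path and column names are taken from the question):
import dask.dataframe as dd
import pandas as pd
from python_example import add

def add_column_values(row: pd.Series) -> pd.Series:
    row['sum'] = add(int(row['i']), int(row['j']))
    return row

dataframe = pd.read_csv('./example.csv')
d_dataframe = dd.from_pandas(dataframe, npartitions=16)
# scheduler='threads' keeps everything in one process, so nothing is pickled
# and the PyCapsule never has to be serialised.
result = d_dataframe.map_partitions(
    lambda df: df.apply(add_column_values, axis=1)).compute(scheduler='threads')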

Thanks to mdurant I finally got it working; here is the updated code.
The main.cpp now looks like this:
#include <pybind11/pybind11.h>

#define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x)

int add(int i, int j) {
    return i + j;
}

class Add {
public:
    Add() {};
    int add(int i, int j) {
        return i + j;
    }
};

namespace py = pybind11;

PYBIND11_MODULE(python_example, m) {
    m.doc() = R"pbdoc(
        Pybind11 example plugin
        -----------------------

        .. currentmodule:: python_example

        .. autosummary::
           :toctree: _generate

           add
           subtract
    )pbdoc";

    m.def("add", &add, R"pbdoc(
        Add two numbers

        Some other explanation about the add function.
    )pbdoc");

    m.def("subtract", [](int i, int j) { return i - j; }, R"pbdoc(
        Subtract two numbers

        Some other explanation about the subtract function.
    )pbdoc");

#ifdef VERSION_INFO
    m.attr("__version__") = MACRO_STRINGIFY(VERSION_INFO);
#else
    m.attr("__version__") = "dev";
#endif

    py::class_<Add>(m, "Add")
        .def(py::init<>())
        .def("__call__", &Add::add)
        .def("__getstate__", [](const Add &p) {
            /* Return a tuple that fully encodes the state of the object */
            return py::make_tuple();
        })
        .def("__setstate__", [](Add &p, py::tuple t) {
            if (t.size() != 0)
                throw std::runtime_error("Invalid state!");
            /* Invoke the in-place constructor. Note that this is needed even
               when the object just has a trivial default constructor */
            new (&p) Add();
            /* Assign any additional state */
        });
}
And the example.py file looks like this:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from python_example import Add


def python_add(i: int, j: int) -> int:
    return i + j


def add_column_values_python(row: pd.Series) -> pd.Series:
    row['sum'] = python_add(row['i'], row['j'])
    return row


def add_column_values(row: pd.Series) -> pd.Series:
    row['sum'] = Add()(int(row['i']), int(row['j']))
    return row


def add_column_values_import(row: pd.Series) -> pd.Series:
    from python_example import add
    row['sum'] = add(int(row['i']), int(row['j']))
    return row


def main():
    dataframe = pd.read_csv('./example.csv', index_col=[])
    dataframe['sum'] = np.nan

    with ProgressBar():
        d_dataframe = dd.from_pandas(dataframe, npartitions=16)
        dataframe = d_dataframe.map_partitions(
            lambda df: df.apply(add_column_values_python, axis=1)).compute(scheduler='processes')

    with ProgressBar():
        d_dataframe = dd.from_pandas(dataframe, npartitions=16)
        dataframe = d_dataframe.map_partitions(
            lambda df: df.apply(add_column_values, axis=1)).compute(scheduler='processes')

    with ProgressBar():
        d_dataframe = dd.from_pandas(dataframe, npartitions=16)
        dataframe = d_dataframe.map_partitions(
            lambda df: df.apply(add_column_values_import, axis=1)).compute(scheduler='processes')


if __name__ == '__main__':
    main()
The idea is to wrap the function in a class that defines the __getstate__ and __setstate__ Python magic methods, or alternatively to move the import inside the function that is applied.
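A quick way to check that the wrapper is now serialisable (a sketch, assuming the rebuilt module is importable):
import pickle
from python_example import Add

# __getstate__/__setstate__ let the object survive a pickle round trip,
# which is what cloudpickle needs for the multiprocessing scheduler.
adder = pickle.loads(pickle.dumps(Add()))
print(adder(1, 2))  # expected: 3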
For more information:
https://pybind11-jagerman.readthedocs.io/en/stable/advanced.html

Related

Calling Numba-generated PyCFunctionWithKeywords from Python

I serialized a jitted Numba function to a byte array and now want to deserialize and call it. This works fine for primitive data types with llvm_cfunc_wrapper_name:
import numba, ctypes
import llvmlite.binding as llvm

@numba.njit("f8(f8)")
def foo(x):
    return x + 0.5

# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cfunc_name = foo.overloads[sig].fndesc.llvm_cfunc_wrapper_name
function_bytes = lib._get_compiled_object()

# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))

func_ptr = engine.get_function_address(cfunc_name)
func = ctypes.CFUNCTYPE(ctypes.c_double, ctypes.c_double)(func_ptr)
print(func(0.25))
But I want to call functions with NumPy arguments. There is a llvm_cpython_wrapper_name for that which uses PyCFunctionWithKeywords, but unfortunately my best guess segfaults:
import numba, ctypes
import llvmlite.binding as llvm
import numpy as np

@numba.njit("f8[:](f8[:])")
def foo(x):
    return x + 0.5

# serialize function to byte array
sig = foo.signatures[0]
lib = foo.overloads[sig].library
cpython_name = foo.overloads[sig].fndesc.llvm_cpython_wrapper_name
function_bytes = lib._get_compiled_object()

# deserialize function_bytes to func
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()
target = llvm.Target.from_default_triple()
target_machine = target.create_target_machine()
backing_mod = llvm.parse_assembly("")
engine = llvm.create_mcjit_compiler(backing_mod, target_machine)
engine.add_object_file(llvm.ObjectFileRef.from_data(function_bytes))

func_ptr = engine.get_function_address(cpython_name)

def func(*args, **kwargs):
    py_obj_ptr = ctypes.POINTER(ctypes.py_object)
    return ctypes.CFUNCTYPE(py_obj_ptr, py_obj_ptr, py_obj_ptr, py_obj_ptr)(func_ptr)(
        ctypes.cast(id(None), py_obj_ptr),
        ctypes.cast(id(args), py_obj_ptr),
        ctypes.cast(id(kwargs), py_obj_ptr))

# segfaults here
print(func(np.ones(3)))
Here are some links to Numba source code (unfortunately very hard to follow), which might be helpful to figure this out.
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/callwrapper.py#L105
https://github.com/numba/numba/blob/61ec1fd0f69aeadece218dccf4c39ebc5c7dfbc4/numba/core/pythonapi.py#L1456

Pandas: which “function names” can be used? (how are they looked up?)

When using pandas you can in certain cases pass names of functions as strings instead of actual references to those functions. For example: df.transform('round').
In the pandas docs they call these strings "function names".
I discovered that the lookup mechanism here doesn't look at the current namespace:
import pandas as pd

sales = pd.DataFrame(data={
    "price": [23.12, 22.34, 12.56, 27.78, 11.9],
})
display(sales)

def new_price(price):
    return price * 1.1

display(sales.transform('round'))      # Works
display(sales.transform(new_price))    # Works
display(sales.transform('new_price'))  # Does not work
My question: is there a list of these function names that you can use in cases like this?
This is the relevant code from the pandas source:
class Apply(metaclass=abc.ABCMeta):
    ...

    def _try_aggregate_string_function(self, obj, arg: str, *args, **kwargs):
        """
        if arg is a string, then try to operate on it:
        - try to find a function (or attribute) on ourselves
        - try to find a numpy function
        - raise
        """
        assert isinstance(arg, str)

        f = getattr(obj, arg, None)
        if f is not None:
            if callable(f):
                return f(*args, **kwargs)

            # people may try to aggregate on a non-callable attribute
            # but don't let them think they can pass args to it
            assert len(args) == 0
            assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
            return f

        f = getattr(np, arg, None)
        if f is not None and hasattr(obj, "__array__"):
            # in particular exclude Window
            return f(obj, *args, **kwargs)

        raise AttributeError(
            f"'{arg}' is not a valid function for '{type(obj).__name__}' object"
        )
It basically searches for a method or attribute of the object with that name, and otherwise falls back to a NumPy function of the same name.
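So for the frame above you would expect something like this (exact behaviour can differ slightly between pandas versions):
import numpy as np
import pandas as pd

sales = pd.DataFrame({"price": [23.12, 22.34, 12.56, 27.78, 11.9]})

print(sales.transform('abs'))    # found as the DataFrame.abs method
print(sales.transform('sqrt'))   # no DataFrame.sqrt, falls back to np.sqrt
# sales.transform('new_price')   # neither exists -> raises, as in the question
In other words, the usable "function names" are roughly the callable methods of the Series/DataFrame plus the NumPy functions of the same name; there is no single documented list beyond that.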

Can I use OR-tools for TSP with a partial distance matrix (for a huge set of nodes)?

I'm trying to solve a TSP with OR-tools for a problem of roughly 80,000 nodes. The problem is that I need a huge distance matrix that takes too much memory, so it's infeasible and I don't get a solution.
So:
Is there an option to work with a partial distance matrix in OR-tools?
If not, is there a way to improve my code?
Is there another external solver that can work for this task in Python?
import math
import random
import time
from collections import namedtuple
from math import sqrt

import numpy as np
import numba
from scipy.spatial import distance_matrix
from sklearn.metrics.pairwise import euclidean_distances

Point = namedtuple("Point", ['x', 'y'])


def length(point1, point2):
    # Euclidean distance between two points; this helper is referenced below
    # but was not included in the original snippet.
    return sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2)


def solve_it(input_data):
    # Modify this code to run your optimization algorithm
    global POINTS

    # parse the input
    lines = input_data.split('\n')
    nodeCount = int(lines[0])

    points = []
    for i in range(1, nodeCount + 1):
        line = lines[i]
        parts = line.split()
        points.append(Point(float(parts[0]), float(parts[1])))

    # 2. routing with or tools
    def dist_matrix(nodeCount, points):
        data = []
        for k in range(len(points)):
            data.append([int(points[k].x), int(points[k].y)])
        D = euclidean_distances(data, data)
        return D

    def create_data_model(D):
        """Stores the data for the problem."""
        data = {}
        data['distance_matrix'] = D  # yapf: disable
        data['num_vehicles'] = 1
        data['depot'] = 0
        return data

    def print_solution(manager, routing, solution):
        index = routing.Start(0)
        plan_output = []  # Route for vehicle 0
        route_distance = 0
        while not routing.IsEnd(index):
            plan_output.append(manager.IndexToNode(index))
            index = solution.Value(routing.NextVar(index))
        return plan_output

    def or_main(nodeCount, points):
        from ortools.constraint_solver import routing_enums_pb2
        from ortools.constraint_solver import pywrapcp
        """Entry point of the program."""
        # Instantiate the data problem.
        global sol
        D = dist_matrix(nodeCount, points)
        data = create_data_model(D)

        # Create the routing index manager.
        manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),
                                               data['num_vehicles'], data['depot'])

        # Create Routing Model.
        routing = pywrapcp.RoutingModel(manager)

        def distance_callback(from_index, to_index):
            """Returns the distance between the two nodes."""
            # Convert from routing variable Index to distance matrix NodeIndex.
            from_node = manager.IndexToNode(from_index)
            to_node = manager.IndexToNode(to_index)
            return data['distance_matrix'][from_node][to_node]

        transit_callback_index = routing.RegisterTransitCallback(distance_callback)

        # Define cost of each arc.
        routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

        # Setting first solution heuristic.
        search_parameters = pywrapcp.DefaultRoutingSearchParameters()
        search_parameters.local_search_metaheuristic = (
            routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
        k = 100
        if nodeCount <= 100:
            k = 30
        elif 100 <= nodeCount <= 1000:
            k = 300
        elif nodeCount > 1000:
            k = 17000
        search_parameters.time_limit.seconds = k
        search_parameters.log_search = True

        # Solve the problem.
        solution = routing.SolveWithParameters(search_parameters)

        # print solution on console.
        if solution:
            sol = print_solution(manager, routing, solution)
        return sol

    ######################################################################
    solution = or_main(nodeCount, points)

    # calculate the length of the tour
    obj = length(points[solution[-1]], points[solution[0]])
    for index in range(0, nodeCount - 1):
        obj += length(points[solution[index]], points[solution[index + 1]])

    # prepare the solution in the specified output format
    output_data = '%.2f' % obj + ' ' + str(0) + '\n'
    output_data += ' '.join(map(str, solution))

    return output_data


if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        file_location = sys.argv[1].strip()
        with open(file_location, 'r') as input_data_file:
            input_data = input_data_file.read()
        #print(solve_it(input_data))
    else:
        print('This test requires an input file. Please select one from the data directory. (i.e. python solver.py ./data/tsp_51_1)')
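On the first question: OR-tools does not strictly need a precomputed matrix; the transit callback can compute each distance on demand from the coordinates, so memory stays proportional to the number of points rather than its square. An untested sketch reusing the question's Point/points structures (a per-arc Python callback will be slow for 80,000 nodes, but it removes the memory blow-up):
from math import sqrt
from ortools.constraint_solver import pywrapcp

def or_main_on_the_fly(nodeCount, points):
    manager = pywrapcp.RoutingIndexManager(nodeCount, 1, 0)
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index, to_index):
        # Compute the distance on demand instead of reading a stored matrix.
        a = points[manager.IndexToNode(from_index)]
        b = points[manager.IndexToNode(to_index)]
        return int(round(sqrt((a.x - b.x) ** 2 + (a.y - b.y) ** 2)))

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    solution = routing.SolveWithParameters(search_parameters)
    if solution is None:
        return None

    route, index = [], routing.Start(0)
    while not routing.IsEnd(index):
        route.append(manager.IndexToNode(index))
        index = solution.Value(routing.NextVar(index))
    return route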

concatenate results after multiprocessing

I have a function which creates a data frame by doing multiprocessing on a df.
Suppose I have 10 rows in my df; the function processor will process all 10 rows separately. What I want is to concatenate all the output of the function processor into one data frame.
import sys
import multiprocessing
from multiprocessing import Pool

import numpy as np
import pandas as pd


def processor(dff):
    """
    reading data from a data frame and doing all sorts of data manipulation
    for multiprocessing
    """
    return dff  # placeholder for the real processing


def main(infile, mdebug):
    global debug
    debug = mdebug

    try:
        lines = sum(1 for line in open(infile))
    except Exception as err:
        print("Error {} opening file: {}".format(err, infile))
        sys.exit(2000)

    if debug >= 2:
        print(infile)

    try:
        dff = pd.read_csv(infile)
    except Exception as err:
        print("Error {}, opening file: {}".format(err, infile))
        sys.exit(2000)

    df_split = np.array_split(dff, (lines + 1))

    cores = multiprocessing.cpu_count()
    cores = 64
    # pool = Pool(cores)
    pool = Pool(lines - 1)

    for n, frame in enumerate(pool.imap(processor, df_split), start=1):
        if frame is not None:
            frame.to_csv('{}'.format(n))

    pool.close()
    pool.join()


if __name__ == "__main__":
    args = parse_args()
    """
    print "Debug is: {}".format(args.debug)
    """
    if args.debug >= 1:
        print("Running in debug mode:", args.debug)
    main(infile=args.infile, mdebug=args.debug)
You can use either the DataFrame constructor or concat to solve your problem; the appropriate one depends on details of your code that you haven't included.
Here's a more complete example:
import numpy as np
import pandas as pd
from multiprocessing import Pool

# create dummy dataset
dff = pd.DataFrame(np.random.rand(101, 5), columns=list('abcde'))

# process data
with Pool() as pool:
    result = pool.map(processor, np.array_split(dff, 7))

# put it all back together in one dataframe
result = pd.concat(result)
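The DataFrame constructor route mentioned above fits when the workers return plain records rather than DataFrames; a hypothetical processor_records variant, just to illustrate:
def processor_records(chunk):
    # hypothetical variant of `processor` that returns a list of dicts
    return chunk.to_dict('records')

with Pool() as pool:
    parts = pool.map(processor_records, np.array_split(dff, 7))

# flatten the per-chunk record lists and rebuild with the constructor
result = pd.DataFrame([record for part in parts for record in part])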

Itertools for containers

Consider the following interactive example:
>>> l=imap(str,xrange(1,4))
>>> list(l)
['1', '2', '3']
>>> list(l)
[]
Does anyone know if there is already an implementation somewhere out there of a version of imap (and the other itertools functions) such that the second time list(l) is executed you get the same result as the first? And I don't want the regular map, because building the entire output in memory can be wasteful if you use larger ranges.
I want something that basically does something like
class cmap:
    def __init__(self, function, *iterators):
        self._function = function
        self._iterators = iterators

    def __iter__(self):
        return itertools.imap(self._function, *self._iterators)

    def __len__(self):
        return min(map(len, self._iterators))
But it would be a waste of time to do this manually for all itertools if someone already did this.
PS:
Do you think containers are more zen than iterators, since for an iterator something like
for i in iterator:
    do_something()
implicitly empties the iterator, while with a container you explicitly need to remove elements?
You do not have to build such an object for each type of container. Basically, you have the following:
mkimap = lambda: imap(str,xrange(1,4))
list(mkimap())
list(mkimap())
Now you only need a nice wrapping object to prevent the "ugly" function calls. This could work this way:
class MultiIter(object):
    def __init__(self, f, *a, **k):
        if a or k:
            self.create = lambda: f(*a, **k)
        else:  # optimize
            self.create = f

    def __iter__(self):
        return self.create()


l = MultiIter(lambda: imap(str, xrange(1, 4)))
# or
l = MultiIter(imap, str, xrange(1, 4))
# or even
@MultiIter
def l():
    return imap(str, xrange(1, 4))

# and then
print list(l)
print list(l)
(untested, hope it works, but you should get the idea)
For your 2nd question: Iterators and containers both have their uses. You should take whatever best fits your needs.
You may be looking for itertools.tee()
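For example (Python 3 spelling, where map is already lazy like imap):
from itertools import tee

source = map(str, range(1, 4))   # lazy, like imap(str, xrange(1, 4))
first, second = tee(source)

print(list(first))   # ['1', '2', '3']
print(list(second))  # ['1', '2', '3']
Keep in mind that tee buffers every item one copy has consumed but the other hasn't, so fully exhausting one iterator before touching the other still holds the whole output in memory.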
Iterators are my favorite topic ;)
from itertools import imap

class imap2(object):
    def __init__(self, f, *args):
        self.g = imap(f, *args)
        self.lst = []
        self.done = False

    def __iter__(self):
        while True:
            try:  # try to get something from g
                x = next(self.g)
            except StopIteration:
                if self.done:
                    # give the old values
                    for x in self.lst:
                        yield x
                    return
                else:
                    # g was consumed for the first time
                    self.done = True
                    return
            else:
                self.lst.append(x)
                yield x


l = imap2(str, xrange(1, 4))
print list(l)
print list(l)