Numba / Numpy - Understanding Error Message - numpy

I'm experimenting with Numba to try and speed up a union-find algorithm I'm working on. Here's some example code. When I experiment with some sample data I cannot understand the type complaint that Numba appears to be raising.
from numba import jit
import numpy as np
indices = np.arange(8806806, dtype=np.int64)
sizes = np.ones(8806806, dtype=np.int64)
connected_components = 8806806
@jit(npython=True)
def root(p: int) -> int:
while p != indices[p]:
indices[p] = indices[indices[p]]
p = indices[p]
return p
@jit(npython=True)
def connected( p: int, q: int) -> bool:
return root(p) == root(q)
@jit(npython=True)
def union( p: int, q: int) -> None:
root1 = root(p)
root2 = root(q)
if root1 == root2:
return
if (sizes[root1] < sizes[root2]):
indices[root1] = root2
sizes[root2] += sizes[root1]
else:
indices[root2] = root1
sizes[root1] += sizes[root2]
connected_components -= 1
@jit(nopython=True)
def process_values(arr):
for row in arr:
typed_arr = row.astype('int64')
for first, second in zip(arr, arr[1:]):
union(first, second)
process_values(
np.array(
[np.array([8018361, 4645960]),
np.array([1137555, 7763897]),
np.array([7532943, 2248813]),
np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
I cannot understand this error:
TypingError Traceback (most recent call last)
<ipython-input-45-62735e65f581> in <module>
44 np.array([1137555, 7763897]),
45 np.array([7532943, 2248813]),
---> 46 np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in _compile_for_args(self, *args, **kws)
399 e.patch_message(msg)
400
--> 401 error_rewrite(e, 'typing')
402 except errors.UnsupportedError as e:
403 # Something unsupported is present in the user code, add help info
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in error_rewrite(e, issue_type)
342 raise e
343 else:
--> 344 reraise(type(e), e, None)
345
346 argtypes = []
/opt/conda/lib/python3.7/site-packages/numba/core/utils.py in reraise(tp, value, tb)
78 value = tp()
79 if value.__traceback__ is not tb:
---> 80 raise value.with_traceback(tb)
81 raise value
82
TypingError: Failed in nopython mode pipeline (step: nopython frontend)
non-precise type array(pyobject, 1d, C)
[1] During: typing of argument at <ipython-input-45-62735e65f581> (36)
File "<ipython-input-45-62735e65f581>", line 36:
def process_values(arr):
for row in arr:
^
Does this have anything to do with process_values taking an array of irregularly shaped arrays? Any pointers? Thanks!

The problem is that Numba does not accept arrays of dtype 'object'. You seem to be placing arrays inside arrays; you will have to use lists inside lists instead. Look at the typed.List class in Numba: https://numba.pydata.org/numba-doc/dev/reference/pysupported.html#typed-list
Alternatively, you can use awkward arrays: https://github.com/scikit-hep/awkward-1.0

Related

to_numpy() doesnt work on pandas dataframe

I'm working with TensorFlow. Previously I used TensorFlow.js, but, as all of you know, it has limited functionality. So, to create the model, I started to use numpy + pandas + tensorflow in VS Code with an .ipynb notebook.
I have a dataframe "seqs":
[code, C, M, S, string_to_classified]
The string can be classified into three (non-exclusive) categories: C, M and S.
So the label should be [C, M, S].
This code works and gives me a nice pandas dataframe:
trainingData = pd.DataFrame()
trainingData['string_to_classified'] = seqs['string_to_classified'].apply(nucleoBits)
trainingData['label']= seqs[['C', 'M', 'S']].values.tolist()
However, when I try this:
trainingDataSet = tf.data.Dataset.from_tensor_slices((trainingData['string_to_classified'].values, trainingData['label'].values))
I get:
<ipython-input-89-897ad7666fa6> in <module>
----> 1 trainingDataSet = tf.data.Dataset.from_tensor_slices((trainingData['string_to_classified'].values, trainingData['label'].values))
2
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in from_tensor_slices(tensors, name)
812 Dataset: A `Dataset`.
813 """
--> 814 return TensorSliceDataset(tensors, name=name)
815
816 class _GeneratorState(object):
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in __init__(self, element, is_files, name)
4706 def __init__(self, element, is_files=False, name=None):
4707 """See `Dataset.from_tensor_slices()` for details."""
-> 4708 element = structure.normalize_element(element)
4709 batched_spec = structure.type_spec_from_value(element)
4710 self._tensors = structure.to_batched_tensor_list(batched_spec, element)
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element, element_signature)
124 dtype = getattr(spec, "dtype", None)
125 normalized_components.append(
--> 126 ops.convert_to_tensor(t, name="component_%d" % i, dtype=dtype))
127 return nest.pack_sequence_as(pack_as, normalized_components)
...
--> 102 return ops.EagerTensor(value, ctx.device_name, dtype)
103
104
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
P.S.: tfjs was simpler:
const labels = tf.tensor3d(seqs.C, seqs.M, seqs.S)
and it was done

Length mismatch error in ColumnTransformer sklearn v

Length Mismatch error when setting transform_output to "pandas" on the custom transformer (deleting NaN values)
I'm implementing a custom transformer to delete the rows containing NaNs. The code is:
from sklearn.base import BaseEstimator,TransformerMixin
class NaRemover(BaseEstimator,TransformerMixin):
def __init__(self):
self._columns = []
def fit(self, X):
self._columns = X.columns.values
return self
def transform(self, X):
X = X.dropna()
return X
It works correctly as standalone.
Then I put it in the ColumnTransformer:
features = X_train.columns.values
ct_nan = ColumnTransformer([('delete_na',NaRemover(),features)])
ct_nan.fit(X_train)
and get the error:
ValueError: Length mismatch: Expected axis has 109 elements, new values have 140 elements
Problem is caused by the function that wraps the output into the pandas dataframe
129 # dense_config == "pandas"
--> 130 return _wrap_in_pandas_container(
131 data_to_wrap=data_to_wrap,
132 index=getattr(original_input, "index", None),
As far as I could gather, it checks the integrity of the dataframe index, which I obviously destroy when applying transform (although I don't understand why it should check this at the fit stage).
214 def set_axis(self, axis: int, new_labels: Index) -> None:
215 # Caller is responsible for ensuring we have an Index object.
--> 216 self._validate_set_axis(axis, new_labels)
217 self.axes[axis] = new_labels
218
/usr/local/lib/python3.8/dist-packages/pandas/core/internals/base.py in _validate_set_axis(self, axis, new_labels)
55
56 elif new_len != old_len:
---> 57 raise ValueError(
58 f"Length mismatch: Expected axis has {old_len} elements, new "
59 f"values have {new_len} elements"
Is this how the functionality is supposed to work? Are transformers that change the shape of the dataframe not allowed? And if not, how can I overcome the problem?

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter what kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
data = {
'a': [1.0, 2.0, 3.0],
'b': [0.1, 0.3, 0.2],
}
with tf.Session() as sess:
ds = tf.data.Dataset.from_tensor_slices(data)
ds = ds.batch(1)
it = ds.make_one_shot_iterator()
features = it.get_next()
concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
try:
while True:
print(sess.run(concat))
except tf.errors.OutOfRangeError:
pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 #property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the Keras backend's (K) concatenate function
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
The TensorFlow implementation, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states that it requires a list of tensors. Why? I don't know. In this function the tensors (a variable called "values") are actually checked for being a list or a tuple, but somewhere along the way you still get an error.

TypeError using sns.distplot() on dataframe with one row

I'm plotting subsets of a dataframe, and one subset happens to have only one row. This is the only reason I can think of for why it's causing problems. This is what it looks like:
problem_dataframe = prob_df[prob_df['Date']==7]
problem_dataframe.head()
I try to do:
sns.distplot(problem_dataframe['floatTime'])
But I get the error:
TypeError: len() of unsized object
Would someone please tell me what's causing this and how to work around it?
The TypeError is resolved by setting bins=1.
But that uncovers a different error, ValueError: x must be 1D or 2D, which gets triggered by an internal function in Matplotlib's hist(), called _normalize_input():
import pandas as pd
import seaborn as sns
df = pd.DataFrame(['Tue','Feb',7,'15:37:58',2017,15.6196]).T
df.columns = ['Day','Month','Date','Time','Year','floatTime']
sns.distplot(df.floatTime, bins=1)
Output:
ValueError Traceback (most recent call last)
<ipython-input-25-858df405d200> in <module>()
6 df.columns = ['Day','Month','Date','Time','Year','floatTime']
7 df.floatTime.values.astype(float)
----> 8 sns.distplot(df.floatTime, bins=1)
/home/andrew/anaconda3/lib/python3.6/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
213 hist_color = hist_kws.pop("color", color)
214 ax.hist(a, bins, orientation=orientation,
--> 215 color=hist_color, **hist_kws)
216 if hist_color != color:
217 hist_kws["color"] = hist_color
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
1890 warnings.warn(msg % (label_namer, func.__name__),
1891 RuntimeWarning, stacklevel=2)
-> 1892 return func(ax, *args, **kwargs)
1893 pre_doc = inner.__doc__
1894 if pre_doc is None:
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
6141 x = np.array([[]])
6142 else:
-> 6143 x = _normalize_input(x, 'x')
6144 nx = len(x) # number of datasets
6145
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in _normalize_input(inp, ename)
6080 else:
6081 raise ValueError(
-> 6082 "{ename} must be 1D or 2D".format(ename=ename))
6083 if inp.shape[1] < inp.shape[0]:
6084 warnings.warn(
ValueError: x must be 1D or 2D
_normalize_input() was removed from Matplotlib (it looks like sometime last year), so I guess Seaborn is referring to an older version under the hood.
You can see _normalize_input() in this old commit:
def _normalize_input(inp, ename='input'):
"""Normalize 1 or 2d input into list of np.ndarray or
a single 2D np.ndarray.
Parameters
----------
inp : iterable
ename : str, optional
Name to use in ValueError if `inp` can not be normalized
"""
if (isinstance(x, np.ndarray) or
not iterable(cbook.safe_first_element(inp))):
# TODO: support masked arrays;
inp = np.asarray(inp)
if inp.ndim == 2:
# 2-D input with columns as datasets; switch to rows
inp = inp.T
elif inp.ndim == 1:
# new view, single row
inp = inp.reshape(1, inp.shape[0])
else:
raise ValueError(
"{ename} must be 1D or 2D".format(ename=ename))
...
I can't figure out why inp.ndim!=1, though. Performing the same np.asarray().ndim on the input returns 1 as expected:
np.asarray(df.floatTime).ndim # 1
So you're facing a few obstacles if you want to make a single-valued input work with sns.distplot().
Suggested Workaround
Check for a single-element df.floatTime, and if that's the case, just use plt.hist() instead (which is what distplot goes to anyway, along with KDE):
plt.hist(df.floatTime)

How to fit two numpy matrices with Pyspark's SVM?

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious whether I can use PySpark's random forests to fit the previously mentioned matrices. According to the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my questions are: do I need to transform the numpy arrays into an RDD, or into which format should I convert the features and labels matrices in order to fit them with the RF implementation of MLlib?
Update
Then, following @CafeFeed's answer, I tried the following:
In [24]:
#CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])
In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np
​
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
​
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))
​
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
​
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 #classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
The docs are clear. You need an RDD of LabeledPoint:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees