I have a model with a custom activation. As a result,
model2 = keras.models.clone_model(model)
gives an error. I'm able to load saved models using custom_objects keyword, but I see no such option on clone_model. Is there a way around it besides remaking the model and transferring weights?
EDIT:
Here's example code (toy problem):
import tensorflow.keras as keras
import tensorflow.keras.backend as K
def myTanh(x):
return K.tanh(x)
inp = keras.Input(shape=(10,10,1))
flat = keras.layers.Flatten()(inp)
out = keras.layers.Dense(20, activation=myTanh)(flat)
model = keras.Model(inp,out)
model.compile(optimizer=keras.optimizers.Adam(lr=0.001),loss='categorical_crossentropy')
model2 = keras.models.clone_model(model)
And the error dump:
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in clone_model(model, input_tensors)
269 return _clone_sequential_model(model, input_tensors=input_tensors)
270 else:
--> 271 return _clone_functional_model(model, input_tensors=input_tensors)
272
273
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/models.py in _clone_functional_model(model, input_tensors)
129 if layer not in layer_map:
130 # Clone layer.
--> 131 new_layer = layer.__class__.from_config(layer.get_config())
132 layer_map[layer] = new_layer
133 layer = new_layer
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/engine/base_layer.py in from_config(cls, config)
400 A layer instance.
401 """
--> 402 return cls(**config)
403
404 def compute_output_shape(self, input_shape):
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/layers/core.py in __init__(self, units, activation, use_bias, kernel_initializer, bias_initializer, kernel_regularizer, bias_regularizer, activity_regularizer, kernel_constraint, bias_constraint, **kwargs)
920 activity_regularizer=regularizers.get(activity_regularizer), **kwargs)
921 self.units = int(units)
--> 922 self.activation = activations.get(activation)
923 self.use_bias = use_bias
924 self.kernel_initializer = initializers.get(kernel_initializer)
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in get(identifier)
209 if isinstance(identifier, six.string_types):
210 identifier = str(identifier)
--> 211 return deserialize(identifier)
212 elif callable(identifier):
213 return identifier
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/activations.py in deserialize(name, custom_objects)
200 module_objects=globals(),
201 custom_objects=custom_objects,
--> 202 printable_module_name='activation function')
203
204
~/.conda/envs/tf-gpu/lib/python3.6/site-packages/tensorflow/python/keras/utils/generic_utils.py in deserialize_keras_object(identifier, module_objects, custom_objects, printable_module_name)
210 if fn is None:
211 raise ValueError('Unknown ' + printable_module_name + ':' +
--> 212 function_name)
213 return fn
214 else:
ValueError: Unknown activation function:myTanh
I solved the issue by calling
keras.utils.get_custom_objects().update(custom_objects)
Right after the definition of the additional objects that keras must be aware of to properly clone the model.
def lrelu(x, alpha=0.2):
return tf.nn.relu(x) * (1 - alpha) + x * alpha
custom_object = {
'lrelu': lrelu,
}
keras.utils.get_custom_objects().update(custom_objects)
This is an open bug in Keras.
The suggested way around is to use a Lambda layer in stead of an Activation layer.
x = keras.layers.Lambda(my_custom_activation_function)(x)
Related
Dimension error appears when trying to call minimize function
import numpy as np
import math
import scipy
from scipy import optimize
from scipy.optimize import minimize, line_search
x1=[1,2,1] ; y1=0
x2=[1,1,2] ; y2=0
x3=[2,3,3] ; y3=1
x4=[2,2,1] ; y4=1
x5=[1,2,3] ; y5=0
x6=[1,3,1] ; y6=1
x7=[1,1,1] ; y7=0
x8=[1,2,2] ; y8=0
x9=[1,2,1] ; y9=0
x10=[1,1,1] ; y10=0
x11=[2,2,2] ; y11=1
x12=[1,2,2] ; y12=0
X= np.array([x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12])
y=np.array([y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12])
n=len(y)
def h(x):
return 1/(1+np.exp(-x))
def r(beta):
f=0
for i in range(n):
f=f+ (1-y[i])* np.dot(X[i],beta) + np.log( 1+np.exp(-np.dot(X[i],beta) ))
return np.array([f/n])
#gradient of r
def gradr(beta):
f=0
for i in range(n):
mu= h(np.dot(X[i],beta))
f=f+ (mu-y[i])*X[i]
return (f/n).reshape(3,1)
def exactsearch(beta_0,d):
phi_aux = lambda alfa : r(beta_0+ alfa*d)
alfa_0=np.array([1])
bds=[(0,None)]
res = minimize(phi_aux, alfa_0, bounds=bds)
alfa=np.array([res.x])
return alfa
def GradientMethod(beta,f):
N=0
e=10**(-5)
p=-gradr(beta)
alfa=f(beta,p)
while True:
if r(beta)==r(beta+alfa*p):break
if N==10000:break
if alfa<=e:break
else:
N=N+1
beta=beta+alfa*p
p=-gradr(beta)
alfa=f(beta,p)
return [beta,r(beta),N]
GradientMethod(np.array([1,1,1]),exactsearch)
X is a 3 by 12 matrix, r is a function that takes a size 3 vector and operates it with the vectors of X
When changing np.exp to math.exp the error changes to TypeError: only size-1 arrays can be converted to Python scalars. Also, previously I encountered the error ValueError: shapes (3,) and (1,3) not aligned: 3 (dim 0) != 1 (dim 0) but it went away when reshaping gradr.
I must add that I don't understand that much the function exactsearch, since it was given to me.
Full error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-bb9e6dc26271> in <module>
62 return [beta,r(beta),N]
63
---> 64 GradientMethod(np.array([1,1,1]),exactsearch)
<ipython-input-20-bb9e6dc26271> in GradientMethod(beta, f)
50 e=10**(-5)
51 p=-gradr(beta)
---> 52 alfa=f(beta,p)
53 while True:
54 if r(beta)==r(beta+alfa*p):break
<ipython-input-20-bb9e6dc26271> in exactsearch(beta_0, d)
42 alfa_0=np.array([1])
43 bds=[(0,None)]
---> 44 res = minimize(phi_aux, alfa_0, bounds=bds)
45 alfa=np.array([res.x])
46 return alfa
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
615 **options)
616 elif meth == 'l-bfgs-b':
--> 617 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
618 callback=callback, **options)
619 elif meth == 'tnc':
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
304 iprint = disp
305
--> 306 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
307 bounds=new_bounds,
308 finite_diff_rel_step=finite_diff_rel_step)
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
259 # ScalarFunction caches. Reuse of fun(x) during grad
260 # calculation reduces overall function evaluations.
--> 261 sf = ScalarFunction(fun, x0, args, grad, hess,
262 finite_diff_rel_step, bounds, epsilon=epsilon)
263
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
93
94 self._update_grad_impl = update_grad
---> 95 self._update_grad()
96
97 # Hessian Evaluation
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in _update_grad(self)
169 def _update_grad(self):
170 if not self.g_updated:
--> 171 self._update_grad_impl()
172 self.g_updated = True
173
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in update_grad()
89 self._update_fun()
90 self.ngev += 1
---> 91 self.g = approx_derivative(fun_wrapped, self.x, f0=self.f,
92 **finite_diff_options)
93
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_numdiff.py in approx_derivative(fun, x0, method, rel_step, abs_step, f0, bounds, sparsity, as_linear_operator, args, kwargs)
386 f0 = np.atleast_1d(f0)
387 if f0.ndim > 1:
--> 388 raise ValueError("`f0` passed has more than 1 dimension.")
389
390 if np.any((x0 < lb) | (x0 > ub)):
ValueError: `f0` passed has more than 1 dimension.
So apparently the problem was with the dimensions of the arrays, somewhere for some reason an array went from being like [1,2,3] to [[1,2,3]], changing its shape from (3,) to (1,3), I tried to fix it by reshaping gradr to (3,1) but this just made everything worse, instead the solution was to eliminate the reshape of gradr and reshape every beta to (3,) before its operated.
def r(beta):
f=0
beta=beta.reshape(3,)
for i in range(n):
f=f+ (1-y[i])* np.dot(X[i],beta) + np.log( 1+np.exp(-np.dot(X[i],beta) ))
return np.array([f/n])
#gradient of r
def gradr(beta):
f=0
beta=beta.reshape(3,)
for i in range(n):
mu= h(np.dot(X[i],beta))
f=f+ (mu-y[i])*X[i]
return (f/n)
I have created a custom Keras metric, similar to the demo implementation below:
import tensorflow as tf
class MyMetric(tf.keras.metrics.Mean):
def __init__(self, name='my_metric', dtype=None):
super(MyMetric, self).__init__(name=name, dtype=dtype)
def update_state(self, y_true, y_pred, sample_weight=None):
return super(MyMetric, self).update_state(
y_pred, sample_weight=sample_weight)
I have turned the implementation into a Python module with the init/main files and added the path to the system's PYTHONPATH.
I can use the metric when I train the Keras model.
Unfortunately, I haven't found a way to make the custom metric available to TensorFlow Model Analysis (TFMA).
In my interactive context notebook, I can load the metric when I create the eval_config.
import tensorflow as tf
import tensorflow_model_analysis as tfma
from mymetric.metric import MyMetric
metrics = [MyMetric()]
metrics_specs = tfma.metrics.specs_from_metrics(metrics)
eval_config = tfma.EvalConfig(
model_specs=[tfma.ModelSpec(label_key='label_xf')],
metrics_specs=metrics_specs,
slicing_specs=[tfma.SlicingSpec()]
)
evaluator = Evaluator(
examples=example_gen.outputs['examples'],
model=trainer.outputs['model'],
baseline_model=model_resolver.outputs['model'],
eval_config=eval_config)
When I try to execute the evaluator, the metric is listed as in the metric specifications
metrics_specs {
metrics {
class_name: "MyMetric"
config: "{\"dtype\": \"float32\", \"name\": \"my_metric\"}"
threshold {
}
}
}
but the execution fails with the error
ValueError: Unknown metric function: MyMetric
Since the metric calculation is executed via Apache Beam's executor.Do function, I assume that Beam can't find the module (even though it is on the PYTHONPATH). If that is the case, how can I make the module available to Apache Beam beyond the PYTHONPATH configuration?
Traceback:
/usr/local/lib/python3.6/dist-packages/tensorflow_model_analysis/metrics/metric_specs.py in _deserialize_tf_metric(metric_config, custom_objects)
741 cls_name, cfg = _tf_class_and_config(metric_config)
742 with tf.keras.utils.custom_object_scope(custom_objects):
--> 743 return tf.keras.metrics.deserialize({'class_name': cls_name, 'config': cfg})
744
745
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/metrics.py in deserialize(config, custom_objects)
3441 module_objects=globals(),
3442 custom_objects=custom_objects,
-> 3443 printable_module_name='metric function')
3444
3445
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/utils/generic_utils.py in deserialize_keras_object(identifier, module_objects, custom_objects, printable_module_name)
345 config = identifier
346 (cls, cls_config) = class_and_config_for_serialized_keras_object(
--> 347 config, module_objects, custom_objects, printable_module_name)
348
349 if hasattr(cls, 'from_config'):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/utils/generic_utils.py in class_and_config_for_serialized_keras_object(config, module_objects, custom_objects, printable_module_name)
294 cls = get_registered_object(class_name, custom_objects, module_objects)
295 if cls is None:
--> 296 raise ValueError('Unknown ' + printable_module_name + ': ' + class_name)
297
298 cls_config = config['config']
ValueError: Unknown metric function: MyMetric
You need to specify the module so that TFX knows where to find your MyMetric class. One way of doing this is to specify it as part of the metric specs:
from tensorflow_model_analysis import config
metric_config = [config.MetricConfig(class_name='MyMetric', module='mymodule.mymetric')]
metrics_specs = [config.MetricsSpec(metrics=metric_config)]
You will also need to create a module called mymodule and put your MyMetric class in in mymetric.py for this to work. Also make sure that the module is accessible from where you are executing the code (which should be the case if you have added it to your PYTHONPATH).
I am very new to Python and ML. I have been doing few courses from Kaggle and working on pipelines.Everything seemed to work fine without the pipelines but got XGBoostError when I piped it all. I have an issue with my code but I cannot figure it out. Here below is the code and the error after:
X_full = pd.read_csv(train_path).copy()
X_test = pd.read_csv(test_path).copy()
def cleaning(var):
q1, q3 = np.percentile(var['Fare'], [25, 75])
iqr = q3 - q1
lower_bound_val = q1 - (1.5 * iqr)
upper_bound_val = q3 + (1.5 * iqr)
var = var[(var['Fare'] >= lower_bound_val) & (var['Fare'] < upper_bound_val)].copy()
var['family_size'] = var.SibSp + var.Parch
drop_cols = ['PassengerId', 'Name', 'Parch', 'SibSp', 'Ticket', 'Cabin', 'Embarked']
var = var.drop(drop_cols, axis=1)
return var
get_cleaning = FunctionTransformer(cleaning, validate=False)
age_transformer = SimpleImputer(missing_values=np.nan, strategy='median')
age_col = ['Age']
sex_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
sex_col = ['Sex']
# Define the model
xgboost_m = XGBRegressor(random_state=0)
prepro_col = ColumnTransformer(
transformers=[
('age', age_transformer, age_col),
('sex', sex_transformer, sex_col)
])
pl = Pipeline(steps=[('get_cleaning', get_cleaning),
('prepro_col', prepro_col),
('XGBoost', xgboost_m)
])
# Drop assign target to y and drop from X_full
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)
# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)
pl.fit(X_train, y_train)
And here the error:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-887-676d922c8ba5> in <module>
----> 1 pl.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, callbacks)
546 obj=obj, feval=feval,
547 verbose_eval=verbose, xgb_model=xgb_model,
--> 548 callbacks=callbacks)
549
550 if evals_result:
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
210 evals=evals,
211 obj=obj, feval=feval,
--> 212 xgb_model=xgb_model, callbacks=callbacks)
213
214
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
73 # Skip the first update if it is a recovery step.
74 if version % 2 == 0:
---> 75 bst.update(dtrain, i, obj)
76 bst.save_rabit_checkpoint()
77 version += 1
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
1159 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1160 ctypes.c_int(iteration),
-> 1161 dtrain.handle))
1162 else:
1163 pred = self.predict(dtrain, output_margin=True, training=True)
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
186 """
187 if ret != 0:
--> 188 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
189
190
XGBoostError: [22:28:42] ../src/data/data.cc:530: Check failed: labels_.Size() == num_row_ (712 vs. 622) : Size of labels must equal to number of rows.
Stack trace:
[bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xa5dc4) [0x7f27232f2dc4]
[bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x106c92) [0x7f2723353c92]
[bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1a84b7) [0x7f27233f54b7]
[bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1aae4e) [0x7f27233f7e4e]
[bt] (4) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x55) [0x7f27232e4f35]
[bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2783ff0630]
[bt] (6) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2783feffed]
[bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f278323c60e]
[bt] (8) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x13044) [0x7f278323d044]
The error indicates that, labels_.Size() == num_row_ (712 vs. 622) , your have 622 rows and 712 label, that isn't equal. Check your dataset and try again. In your dataset y = X_full.Survived is label/ Target Output.
This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so - why does it matter what sequence do I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
data = {
'a': [1.0, 2.0, 3.0],
'b': [0.1, 0.3, 0.2],
}
with tf.Session() as sess:
ds = tf.data.Dataset.from_tensor_slices(data)
ds = ds.batch(1)
it = ds.make_one_shot_iterator()
features = it.get_next()
concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
try:
while True:
print(sess.run(concat))
except tf.errors.OutOfRangeError:
pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 #property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
comment on the concanate class states it requires a list.
this class calls K.backend's concatenate function
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states it requires a list.
in tensorflow https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states it requires a list of tensors. Why? I don't know. in this function the tensors (variable called "values") actually gets checked if its a list or tuple. but somewhere along the way you still get an error.
I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious about if I can use Pyspark's random forests to fit the previous mentioned matrices. From the documentation, we have that RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my questions are: do I need to transform the numpy arrays to an rdd or in which format should I need to convert the features and labels matrices in order to fit them with the RF implementation of MLlib?.
Update
Then from #CafeFeed answer I tried the following:
In [24]:
#CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])
In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 #classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
Docs are clear. You need RDD:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees