ValueError: `f0` passed has more than 1 dimension - numpy

A dimension error appears when calling scipy's minimize function.
import numpy as np
import math
import scipy
from scipy import optimize
from scipy.optimize import minimize, line_search
x1=[1,2,1] ; y1=0
x2=[1,1,2] ; y2=0
x3=[2,3,3] ; y3=1
x4=[2,2,1] ; y4=1
x5=[1,2,3] ; y5=0
x6=[1,3,1] ; y6=1
x7=[1,1,1] ; y7=0
x8=[1,2,2] ; y8=0
x9=[1,2,1] ; y9=0
x10=[1,1,1] ; y10=0
x11=[2,2,2] ; y11=1
x12=[1,2,2] ; y12=0
X= np.array([x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12])
y=np.array([y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12])
n=len(y)
def h(x):
    return 1/(1+np.exp(-x))
def r(beta):
    f=0
    for i in range(n):
        f=f+ (1-y[i])* np.dot(X[i],beta) + np.log( 1+np.exp(-np.dot(X[i],beta) ))
    return np.array([f/n])
#gradient of r
def gradr(beta):
    f=0
    for i in range(n):
        mu= h(np.dot(X[i],beta))
        f=f+ (mu-y[i])*X[i]
    return (f/n).reshape(3,1)
def exactsearch(beta_0,d):
    phi_aux = lambda alfa : r(beta_0+ alfa*d)
    alfa_0=np.array([1])
    bds=[(0,None)]
    res = minimize(phi_aux, alfa_0, bounds=bds)
    alfa=np.array([res.x])
    return alfa
def GradientMethod(beta,f):
    N=0
    e=10**(-5)
    p=-gradr(beta)
    alfa=f(beta,p)
    while True:
        if r(beta)==r(beta+alfa*p): break
        if N==10000: break
        if alfa<=e: break
        else:
            N=N+1
            beta=beta+alfa*p
            p=-gradr(beta)
            alfa=f(beta,p)
    return [beta,r(beta),N]
GradientMethod(np.array([1,1,1]),exactsearch)
X is a 12 by 3 matrix, and r is a function that takes a size-3 vector and combines it with the rows of X.
When I change np.exp to math.exp, the error changes to TypeError: only size-1 arrays can be converted to Python scalars. Also, I previously got the error ValueError: shapes (3,) and (1,3) not aligned: 3 (dim 0) != 1 (dim 0), but it went away after reshaping the output of gradr.
I should add that I don't fully understand the exactsearch function, since it was given to me.
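For context, that TypeError is expected: math.exp only accepts Python scalars, while np.exp applies elementwise to arrays. A minimal illustration, separate from the code above:
import math
import numpy as np

a = np.array([1.0, 2.0, 3.0])
print(np.exp(a))    # elementwise, works: [ 2.71828183  7.3890561  20.08553692]
math.exp(a)         # raises TypeError: only size-1 arrays can be converted to Python scalars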
Full error
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-20-bb9e6dc26271> in <module>
62 return [beta,r(beta),N]
63
---> 64 GradientMethod(np.array([1,1,1]),exactsearch)
<ipython-input-20-bb9e6dc26271> in GradientMethod(beta, f)
50 e=10**(-5)
51 p=-gradr(beta)
---> 52 alfa=f(beta,p)
53 while True:
54 if r(beta)==r(beta+alfa*p):break
<ipython-input-20-bb9e6dc26271> in exactsearch(beta_0, d)
42 alfa_0=np.array([1])
43 bds=[(0,None)]
---> 44 res = minimize(phi_aux, alfa_0, bounds=bds)
45 alfa=np.array([res.x])
46 return alfa
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
615 **options)
616 elif meth == 'l-bfgs-b':
--> 617 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
618 callback=callback, **options)
619 elif meth == 'tnc':
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
304 iprint = disp
305
--> 306 sf = _prepare_scalar_function(fun, x0, jac=jac, args=args, epsilon=eps,
307 bounds=new_bounds,
308 finite_diff_rel_step=finite_diff_rel_step)
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/optimize.py in _prepare_scalar_function(fun, x0, jac, args, bounds, epsilon, finite_diff_rel_step, hess)
259 # ScalarFunction caches. Reuse of fun(x) during grad
260 # calculation reduces overall function evaluations.
--> 261 sf = ScalarFunction(fun, x0, args, grad, hess,
262 finite_diff_rel_step, bounds, epsilon=epsilon)
263
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in __init__(self, fun, x0, args, grad, hess, finite_diff_rel_step, finite_diff_bounds, epsilon)
93
94 self._update_grad_impl = update_grad
---> 95 self._update_grad()
96
97 # Hessian Evaluation
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in _update_grad(self)
169 def _update_grad(self):
170 if not self.g_updated:
--> 171 self._update_grad_impl()
172 self.g_updated = True
173
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_differentiable_functions.py in update_grad()
89 self._update_fun()
90 self.ngev += 1
---> 91 self.g = approx_derivative(fun_wrapped, self.x, f0=self.f,
92 **finite_diff_options)
93
~/anaconda3/lib/python3.8/site-packages/scipy/optimize/_numdiff.py in approx_derivative(fun, x0, method, rel_step, abs_step, f0, bounds, sparsity, as_linear_operator, args, kwargs)
386 f0 = np.atleast_1d(f0)
387 if f0.ndim > 1:
--> 388 raise ValueError("`f0` passed has more than 1 dimension.")
389
390 if np.any((x0 < lb) | (x0 > ub)):
ValueError: `f0` passed has more than 1 dimension.

So the problem was with the dimensions of the arrays: somewhere along the way an array went from looking like [1,2,3] to [[1,2,3]], changing its shape from (3,) to (1,3). I tried to fix it by reshaping the output of gradr to (3,1), but that only made everything worse. Instead, the solution was to eliminate the reshape in gradr and to reshape every beta to (3,) before it is used.
def r(beta):
    f=0
    beta=beta.reshape(3,)
    for i in range(n):
        f=f+ (1-y[i])* np.dot(X[i],beta) + np.log( 1+np.exp(-np.dot(X[i],beta) ))
    return np.array([f/n])
#gradient of r
def gradr(beta):
    f=0
    beta=beta.reshape(3,)
    for i in range(n):
        mu= h(np.dot(X[i],beta))
        f=f+ (mu-y[i])*X[i]
    return (f/n)
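The extra dimension comes from NumPy broadcasting: adding a (3,) vector to the old (3,1) gradient silently produces a (3,3) array, and r then returns something with more than one dimension, which minimize's finite-difference code rejects. A minimal sketch of that effect, assuming the shapes from the original code:
import numpy as np

beta = np.array([1, 1, 1])           # shape (3,)
d = np.ones((3, 1))                  # shape (3, 1), like the old gradr output
print((beta + d).shape)              # (3, 3) -- broadcasting, not the intended (3,)
print((beta + d.reshape(3,)).shape)  # (3,)  -- what the fix restores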

Related

ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2

What is the solution to the following error in TensorFlow?
ValueError: The two structures don't have the same sequence length.
Input structure has length 1, while shallow structure has length 2.
I tried TensorFlow versions 2.9.1 and 2.4.0.
A toy example to reproduce the error is given below.
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x: tf.cast([x], tf.float32))
def func1(x):
    y1 = 2.0 * x
    y2 = -3.0 * x
    return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=(None,))
for x, y in d2.as_numpy_iterator():
    pass
The full error is:
ValueError Traceback (most recent call last)
~/Documents/pythonProject/tfProjects/asr/transformer/dataset.py in <module>
256 return tuple([y1, y2])
257 d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
---> 258 d3 = d2.padded_batch(3, padded_shapes=(None,))
259 for x, y in d2.as_numpy_iterator():
260 pass
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in padded_batch(self, batch_size, padded_shapes, padding_values, drop_remainder, name)
1887 padding_values,
1888 drop_remainder,
-> 1889 name=name)
1890
1891 def map(self,
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py in __init__(self, input_dataset, batch_size, padded_shapes, padding_values, drop_remainder, name)
5171
5172 input_shapes = get_legacy_output_shapes(input_dataset)
-> 5173 flat_padded_shapes = nest.flatten_up_to(input_shapes, padded_shapes)
5174
5175 flat_padded_shapes_as_tensors = []
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in flatten_up_to(shallow_tree, input_tree)
377 `input_tree`.
378 """
--> 379 assert_shallow_structure(shallow_tree, input_tree)
380 return list(_yield_flat_up_to(shallow_tree, input_tree))
381
~/miniconda3/envs/jtf2/lib/python3.7/site-packages/tensorflow/python/data/util/nest.py in assert_shallow_structure(shallow_tree, input_tree, check_types)
290 if len(input_tree) != len(shallow_tree):
291 raise ValueError(
--> 292 "The two structures don't have the same sequence length. Input "
293 f"structure has length {len(input_tree)}, while shallow structure "
294 f"has length {len(shallow_tree)}.")
ValueError: The two structures don't have the same sequence length. Input structure has length 1, while shallow structure has length 2.
The following modification to the padded_shapes argument resolves the error. Since d2 yields a tuple of two tensors, padded_shapes must be a matching structure with one shape per component:
import tensorflow as tf
d1 = tf.data.Dataset.range(10)
d1 = d1.map(lambda x: tf.cast([x], tf.float32))
def func1(x):
    y1 = 2.0 * x
    y2 = -3.0 * x
    return tuple([y1, y2])
d2 = d1.map(lambda x: tf.py_function(func1, [x], [tf.float32, tf.float32]))
d3 = d2.padded_batch(3, padded_shapes=([None],[None]))
for x, y in d2.as_numpy_iterator():
    pass
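A quick check (a sketch, assuming TF 2.x eager execution) that the padded batch now yields two components, each padded along its length dimension:
for y1_batch, y2_batch in d3.take(1):
    print(y1_batch.shape, y2_batch.shape)  # e.g. (3, 1) (3, 1)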

XGBoostError: problem with pipeline and scikit-learn

I am very new to Python and ML. I have been doing a few courses on Kaggle and working on pipelines. Everything seemed to work fine without the pipeline, but I got an XGBoostError when I piped it all together. There is an issue with my code that I cannot figure out. Below is the code, followed by the error:
X_full = pd.read_csv(train_path).copy()
X_test = pd.read_csv(test_path).copy()

def cleaning(var):
    q1, q3 = np.percentile(var['Fare'], [25, 75])
    iqr = q3 - q1
    lower_bound_val = q1 - (1.5 * iqr)
    upper_bound_val = q3 + (1.5 * iqr)
    var = var[(var['Fare'] >= lower_bound_val) & (var['Fare'] < upper_bound_val)].copy()
    var['family_size'] = var.SibSp + var.Parch
    drop_cols = ['PassengerId', 'Name', 'Parch', 'SibSp', 'Ticket', 'Cabin', 'Embarked']
    var = var.drop(drop_cols, axis=1)
    return var

get_cleaning = FunctionTransformer(cleaning, validate=False)
age_transformer = SimpleImputer(missing_values=np.nan, strategy='median')
age_col = ['Age']
sex_transformer = OneHotEncoder(handle_unknown='ignore', sparse=False)
sex_col = ['Sex']

# Define the model
xgboost_m = XGBRegressor(random_state=0)

prepro_col = ColumnTransformer(
    transformers=[
        ('age', age_transformer, age_col),
        ('sex', sex_transformer, sex_col)
    ])

pl = Pipeline(steps=[('get_cleaning', get_cleaning),
                     ('prepro_col', prepro_col),
                     ('XGBoost', xgboost_m)
                     ])

# Assign target to y and drop it from X_full
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# Split data
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

pl.fit(X_train, y_train)
And here is the error:
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-887-676d922c8ba5> in <module>
----> 1 pl.fit(X_train, y_train)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/xgboost/sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, callbacks)
546 obj=obj, feval=feval,
547 verbose_eval=verbose, xgb_model=xgb_model,
--> 548 callbacks=callbacks)
549
550 if evals_result:
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
210 evals=evals,
211 obj=obj, feval=feval,
--> 212 xgb_model=xgb_model, callbacks=callbacks)
213
214
/opt/conda/lib/python3.7/site-packages/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
73 # Skip the first update if it is a recovery step.
74 if version % 2 == 0:
---> 75 bst.update(dtrain, i, obj)
76 bst.save_rabit_checkpoint()
77 version += 1
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
1159 _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
1160 ctypes.c_int(iteration),
-> 1161 dtrain.handle))
1162 else:
1163 pred = self.predict(dtrain, output_margin=True, training=True)
/opt/conda/lib/python3.7/site-packages/xgboost/core.py in _check_call(ret)
186 """
187 if ret != 0:
--> 188 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
189
190
XGBoostError: [22:28:42] ../src/data/data.cc:530: Check failed: labels_.Size() == num_row_ (712 vs. 622) : Size of labels must equal to number of rows.
Stack trace:
[bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0xa5dc4) [0x7f27232f2dc4]
[bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x106c92) [0x7f2723353c92]
[bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1a84b7) [0x7f27233f54b7]
[bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1aae4e) [0x7f27233f7e4e]
[bt] (4) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x55) [0x7f27232e4f35]
[bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7f2783ff0630]
[bt] (6) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7f2783feffed]
[bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7f278323c60e]
[bt] (8) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x13044) [0x7f278323d044]
The error labels_.Size() == num_row_ (712 vs. 622) indicates that you have 622 rows but 712 labels, which do not match. This happens because the cleaning step inside the pipeline drops rows from X (the Fare outlier filter), while y still contains the labels for all 712 original training rows. Check your dataset and try again. In your dataset, y = X_full.Survived is the label/target output.
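One way to keep X and y aligned (a sketch, not the original answer; it assumes the variables and column names defined in the question) is to do the row-dropping cleanup once, before separating X and y, and leave only column-wise steps in the pipeline:
# Row-dropping happens once, outside the pipeline, so labels stay aligned.
X_full = cleaning(pd.read_csv(train_path))
y = X_full.pop('Survived')            # labels now match the cleaned rows

X_train, X_valid, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0)

pl = Pipeline(steps=[('prepro_col', prepro_col),   # no get_cleaning step here
                     ('XGBoost', xgboost_m)])
pl.fit(X_train, y_train)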

TypeError: "Set type is unordered" in OSMnx isochrones example

Running the OSMnx isochrones example, I get a TypeError: "Set type is unordered" on the last cell.
Any idea what's going wrong?
OSMnx 0.15.1 on Python 3.8.5, Pandas 1.1.1, GeoPandas 0.8.1.
It works as expected with Pandas 1.0.5, but fails with Pandas 1.1 or 1.1.1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
26 return isochrone_polys
27
---> 28 isochrone_polys = make_iso_polys(G, edge_buff=25, node_buff=0, infill=True)
29 fig, ax = ox.plot_graph(G, show=False, close=False, edge_color='#999999', edge_alpha=0.2, node_size=0)
30 for polygon, fc in zip(isochrone_polys, iso_colors):
in make_iso_polys(G, edge_buff, node_buff, infill)
5
6 node_points = [Point((data['x'], data['y'])) for node, data in subgraph.nodes(data=True)]
----> 7 nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
8 nodes_gdf = nodes_gdf.set_index('id')
9
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/geopandas/geodataframe.py in __init__(self, *args, **kwargs)
87 crs = kwargs.pop("crs", None)
88 geometry = kwargs.pop("geometry", None)
---> 89 super(GeoDataFrame, self).__init__(*args, **kwargs)
90
91 # need to set this before calling self['geometry'], because
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
81
82 # don't force copy because getting jammed in an ndarray anyway
---> 83 arrays = _homogenize(arrays, index, dtype)
84
85 columns = ensure_index(columns)
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
349 val = dict(val)
350 val = lib.fast_multiget(val, oindex._values, default=np.nan)
--> 351 val = sanitize_array(
352 val, index, dtype=dtype, copy=False, raise_cast_failure=False
353 )
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
450 subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
451 elif isinstance(data, abc.Set):
--> 452 raise TypeError("Set type is unordered")
453 elif lib.is_scalar(data) and index is not None and dtype is not None:
454 data = maybe_cast_to_datetime(data, dtype)
TypeError: Set type is unordered
This is an issue in the example: it initializes a data frame with subgraph.nodes()
nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
subgraph.nodes() is a NodeView, which behaves both like a dictionary and a set. These are unordered types, but Pandas needs an ordered collection such as a numpy array or list. Pandas 1.1 introduced a type check to catch this in issue 32582.
A workaround is to explicitly convert the NodeView to a list:
nodes_gdf = gpd.GeoDataFrame({'id': list(subgraph.nodes())}, geometry=node_points)
I submitted a bug and a PR, which has already been accepted, so this is no longer an issue.
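The same check can be reproduced without OSMnx (a minimal sketch, assuming pandas >= 1.1):
import pandas as pd

s = {1, 2, 3}
pd.DataFrame({'id': list(s)})   # works once the set is converted to a list
pd.DataFrame({'id': s})         # raises TypeError: Set type is unordered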

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter which kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
import tensorflow as tf

data = {
    'a': [1.0, 2.0, 3.0],
    'b': [0.1, 0.3, 0.2],
}

with tf.Session() as sess:
    ds = tf.data.Dataset.from_tensor_slices(data)
    ds = ds.batch(1)
    it = ds.make_one_shot_iterator()
    features = it.get_next()
    concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
    try:
        while True:
            print(sess.run(concat))
    except tf.errors.OutOfRangeError:
        pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 #property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the Keras backend's concatenate function:
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
In TensorFlow, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states that it requires a list of tensors. Why? I don't know. In that function, the tensors (a variable called "values") are actually checked for being a list or tuple, but somewhere along the way you still get an error.
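In practice the workaround is simply to pass a list, as in the question's first snippet; a sketch reusing the features dict from the question's session code:
inputs = (features['a'], features['b'])
concat = tf.keras.layers.Concatenate()(list(inputs))   # convert the tuple to a list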

How to fit two numpy matrices with Pyspark's SVM?

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious whether I can use PySpark's random forests to fit the previously mentioned matrices. From the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my question is: do I need to transform the numpy arrays into an RDD, and into which format should I convert the features and labels matrices in order to fit them with MLlib's RF implementation?
Update
Then, following @CafeFeed's answer, I tried the following:
In [24]:
#CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 #classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
The docs are clear. You need an RDD of LabeledPoint:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees
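Applied to the question's arrays, the same pattern would look roughly like this (a sketch, assuming features has shape (878049, 6), labels has shape (878049,), and sc is an active SparkContext):
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> trainingData, testData = data.randomSplit([0.7, 0.3])
>>> model = RandomForest.trainClassifier(trainingData, numClasses=2,
...                                      categoricalFeaturesInfo={}, numTrees=3)
>>> predictions = model.predict(testData.map(lambda x: x.features))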