Data wrangling using CPU workers and training XGBoost using GPU workers with dask-gpu

I am trying to read 200 parquet files from HDFS and then train a model using 4 GPUs. I also have 48 vcores available on the machine. If I start the cluster with just the GPU workers, the reading part will be very slow, since it only uses the 4 CPU workers assigned to the GPU workers, and you can't really run more workers than the number of GPUs you have unless you run them in separate shells, which gets nasty because you are then on your own for memory management. I would like to read the files using CPU workers, wrangle the data with the CPU workers, and then train an XGBoost model using GPU workers. I read the documentation here about how to start workers with different resources and assign them to different tasks. I have also seen this question, but I am still a bit confused.
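To make the setup concrete, here is roughly what I mean - a sketch only, where the scheduler address, the worker commands, and the resource labels (CPU=1, GPU=1) are placeholders rather than my exact configuration:

from dask.distributed import Client

# CPU-only workers, each started in its own shell, e.g.:
#   dask-worker tcp://scheduler:8786 --resources "CPU=1"
# GPU workers (one per GPU), e.g. via dask-cuda:
#   dask-cuda-worker tcp://scheduler:8786 --resources "GPU=1"

client = Client("tcp://scheduler:8786")

# a task can then be pinned to workers that advertise a given resource:
future = client.submit(sum, [1, 2, 3], resources={'CPU': 1})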
Here is the code I am trying to run to read the .parquet files:
import dask.dataframe as dd

df = dd.read_parquet(
    "hdfs://address/to/the/*.parquet",
    storage_options={
        "user": user,
        "kerb_ticket": kerb_ticket,
    },
    engine='pyarrow',
).persist()
This will automatically use all the CPU and GPU workers, which is fine. After this I need to create my training data and labels. Let's say I have X_train, y_train, and params. Here I convert them to dask_cudf:
import dask_cudf

X_train = dask_cudf.from_dask_dataframe(X_train)
y_train = dask_cudf.from_dask_dataframe(y_train)
Here is the part where I need to use just the GPU workers:
Xy = dxgb.DaskDMatrix(client, X_train, y_train)
In order to follow the documentation, I should convert it to this:
Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
But then I'll get this error:
distributed.protocol.pickle - INFO - Failed to serialize (<Client: 'tcp://169.68.236.35:8786' processes=52 threads=52, memory=1.97 TiB>, <dask_cudf.DataFrame | 19200 tasks | 200 npartitions>, <dask_cudf.Series | 600 tasks | 200 npartitions>). Exception: cannot pickle 'socket' object
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
48 buffers.clear()
---> 49 result = pickle.dumps(x, **dump_kwargs)
50 if len(result) < 1000:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
TypeError: cannot pickle 'socket' object
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
<ipython-input-12-0d6a943365a9> in <module>
1 # Xy = dxgb.DaskDMatrix(client, X_train, y_train)
2 # Xy = dxgb.DaskDeviceQuantileDMatrix(client, X_train, y_train)
----> 3 Xy = client.submit(dxgb.DaskDMatrix, client, X_train, y_train, resources={'GPU': 1})
4 # Xy_valid = dxgb.DaskDMatrix(client, X_valid, y_valid)
/envs/dask/lib/python3.8/site-packages/distributed/client.py in submit(self, func, key, workers, resources, retries, priority, fifo_timeout, allow_other_workers, actor, actors, pure, *args, **kwargs)
1629 dsk = {skey: (func,) + tuple(args)}
1630
-> 1631 futures = self._graph_to_futures(
1632 dsk,
1633 [skey],
/envs/dask/lib/python3.8/site-packages/distributed/client.py in _graph_to_futures(self, dsk, keys, workers, allow_other_workers, priority, user_priority, resources, retries, fifo_timeout, actors)
2646 # Pack the high level graph before sending it to the scheduler
2647 keyset = set(keys)
-> 2648 dsk = dsk.__dask_distributed_pack__(self, keyset, annotations)
2649
2650 # Create futures before sending graph (helps avoid contention)
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, client, client_keys, annotations)
1045 "__module__": layer.__module__,
1046 "__name__": type(layer).__name__,
-> 1047 "state": layer.__dask_distributed_pack__(
1048 self.get_all_external_keys(),
1049 self.key_dependencies,
/envs/dask/lib/python3.8/site-packages/dask/highlevelgraph.py in __dask_distributed_pack__(self, all_hlg_keys, known_key_dependencies, client, client_keys)
424 for k, v in dsk.items()
425 }
--> 426 dsk = toolz.valmap(dumps_task, dsk)
427 return {"dsk": dsk, "dependencies": dependencies}
428
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/cytoolz/dicttoolz.pyx in cytoolz.dicttoolz.valmap()
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in dumps_task(task)
3784 return d
3785 elif not any(map(_maybe_complex, task[1:])):
-> 3786 return {"function": dumps_function(task[0]), "args": warn_dumps(task[1:])}
3787 return to_serialize(task)
3788
/envs/dask/lib/python3.8/site-packages/distributed/worker.py in warn_dumps(obj, dumps, limit)
3793 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
3794 """Dump an object to bytes, warn if those bytes are large"""
-> 3795 b = dumps(obj, protocol=4)
3796 if not _warn_dumps_warned[0] and len(b) > limit:
3797 _warn_dumps_warned[0] = True
/envs/dask/lib/python3.8/site-packages/distributed/protocol/pickle.py in dumps(x, buffer_callback, protocol)
58 try:
59 buffers.clear()
---> 60 result = cloudpickle.dumps(x, **dump_kwargs)
61 except Exception as e:
62 logger.info("Failed to serialize %s. Exception: %s", x, e)
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dumps(obj, protocol, buffer_callback)
71 file, protocol=protocol, buffer_callback=buffer_callback
72 )
---> 73 cp.dump(obj)
74 return file.getvalue()
75
/envs/dask/lib/python3.8/site-packages/cloudpickle/cloudpickle_fast.py in dump(self, obj)
561 def dump(self, obj):
562 try:
--> 563 return Pickler.dump(self, obj)
564 except RuntimeError as e:
565 if "recursion" in e.args[0]:
/envs/dask/lib/python3.8/socket.py in __getstate__(self)
270
271 def __getstate__(self):
--> 272 raise TypeError(f"cannot pickle {self.__class__.__name__!r} object")
273
274 def dup(self):
TypeError: cannot pickle 'socket' object
Does anyone know how to fix this issue?

The problem is that the dask Client object is not serializable, so you can't pass it as an argument to client.submit.
You can work around this by obtaining the client inside the task using dask.distributed.get_client:
from dask.distributed import get_client

def create_dmatrix(X_train, y_train):
    client = get_client()
    return dxgb.DaskDMatrix(client, X_train, y_train)

Xy = client.submit(create_dmatrix, X_train, y_train, resources={'GPU': 1})
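The same trick extends to the training step itself. A minimal sketch, assuming dxgb is xgboost's dask module, that the GPU workers were started with a GPU=1 resource label, and that params and num_boost_round are whatever you would normally pass:

from dask.distributed import get_client

def train_model(X_train, y_train, params):
    client = get_client()  # worker-side client; never pickled
    Xy = dxgb.DaskDMatrix(client, X_train, y_train)
    # xgboost.dask.train returns a dict with 'booster' and 'history'
    return dxgb.train(client, params, Xy, num_boost_round=100)

result = client.submit(train_model, X_train, y_train, params, resources={'GPU': 1}).result()
booster = result['booster']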

Related

Error on building the retrieval index in Tensorflow Recommenders

I am using BruteForce from Tensorflow Recommenders
index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model, k = 400)
the candidates dataset looks like this:
<ZipDataset element_spec=({'article_id': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'prod_name': TensorSpec(shape=(None,), dtype=tf.string, name=None), 'product_type_name': TensorSpec(shape=(None,), dtype=tf.string, name=None)}, TensorSpec(shape=(None, 64), dtype=tf.float32, name=None))>
but when I try to build the retrieval index
index.index_from_dataset(candidates)
I get the following error:
AttributeError Traceback (most recent call last)
Input In [28], in <cell line: 6>()
4 candidates = tf.data.Dataset.zip((articles.batch(128), articles.batch(128).map(model.article_model)))
5 print(candidates)
----> 6 index.index_from_dataset(candidates)
File ~/miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py:197, in TopK.index_from_dataset(self, candidates)
174 def index_from_dataset(
175 self,
176 candidates: tf.data.Dataset
177 ) -> "TopK":
178 """Builds the retrieval index.
179
180 When called multiple times the existing index will be dropped and a new one
(...)
194 ValueError if the dataset does not have the correct structure.
195 """
--> 197 _check_candidates_with_identifiers(candidates)
199 spec = candidates.element_spec
201 if isinstance(spec, tuple):
File ~/miniconda3/envs/tf/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py:127, in _check_candidates_with_identifiers(candidates)
119 raise ValueError(
120 "The dataset must yield candidate embeddings or "
121 "tuples of (candidate identifiers, candidate embeddings). "
122 f"Got {spec} instead."
123 )
125 identifiers_spec, candidates_spec = spec
--> 127 if candidates_spec.shape[0] != identifiers_spec.shape[0]:
128 raise ValueError(
129 "Candidates and identifiers have to have the same batch dimension. "
130 f"Got {candidates_spec.shape[0]} and {identifiers_spec.shape[0]}."
131 )
AttributeError: 'dict' object has no attribute 'shape'
I assume it has a problem with my dataset, which is created from a dictionary.
How should I pass the candidates dataset so I don't get the error?
I figured it out.
I was building the candidates dataset like so:
candidates = tf.data.Dataset.zip((articles.batch(128).map(model.article_model)))
index.index_from_dataset(candidates)
But I needed to also pass the candidate identifiers, not just the candidate embeddings:
candidates = tf.data.Dataset.zip((articles.batch(128).map(lambda x: x["article_id"]), articles.batch(128).map(model.article_model)))
index.index_from_dataset(candidates)
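Once the index is built this way, calling it returns (scores, identifiers); a small usage sketch, where customer_batch is a hypothetical batch of inputs in whatever format model.customer_model expects:

# returns the top-k (k=400 from the constructor) scores and the zipped identifiers
scores, article_ids = index(customer_batch)
print(article_ids[0, :10])  # the "article_id" strings for the first query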

How do you do a grid search with cuml without a datatype error?

I tried doing a grid search with cuml (RAPIDS 21.10) and I get a cupy conversion error. This doesn't happen if I build the model with the same dataset without a grid search. It also works when the data is not sitting in video memory, but then it is obviously slower than just using the CPU.
The data is float32 for X and int32 for y:

X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)

y_cudf_train = cudf.Series(y_train.values)

RF_classifier_cu = RandomForestClassifier_cu(random_state=123)
grid_search_RF_cu = GridSearchCV_cu(estimator=RF_classifier_cu, param_grid=grid_RF, cv=3, verbose=1)
grid_search_RF_cu.fit(X_cudf_train, y_cudf_train)
print(grid_search_RF_cu.best_params_)
The error:
/home/asdanjer/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cuml/internals/api_decorators.py:794: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams==1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
return func(**kwargs)
---------------------------------------------------------------------------
TypeError
Traceback (most recent call last)
<timed exec> in <module>
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
800 fit_params = _check_fit_params(X, fit_params)
801
--> 802 cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
803 n_splits = cv_orig.get_n_splits(X, y, groups)
804
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_split.py in check_cv(cv, y, classifier)
2301 classifier
2302 and (y is not None)
-> 2303 and (type_of_target(y) in ("binary", "multiclass"))
2304 ):
2305 return StratifiedKFold(cv)
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
277 raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
278
--> 279 if is_multilabel(y):
280 return "multilabel-indicator"
281
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in is_multilabel(y)
149 warnings.simplefilter("error", np.VisibleDeprecationWarning)
150 try:
--> 151 y = np.asarray(y)
152 except np.VisibleDeprecationWarning:
153 # dtype=object should be provided explicitly for ragged arrays,
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cudf/core/frame.py in __array__(self, dtype)
1636
1637 def __array__(self, dtype=None):
-> 1638 raise TypeError(
1639 "Implicit conversion to a host NumPy array via __array__ is not "
1640 "allowed, To explicitly construct a GPU array, consider using "
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU array, consider using cupy.asarray(...)
To explicitly construct a host array, consider using .to_array()
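The traceback narrows this down: sklearn's check_cv calls type_of_target(y), which does np.asarray(y), and cudf blocks that implicit device-to-host copy. A hedged, untested workaround in the direction the error message itself suggests is to give the grid search the labels as a host array while keeping the features on the GPU:

# sketch: labels on the host for sklearn's CV splitter, features stay on the GPU
# (.to_array() per the error's hint; newer cudf spells it .to_numpy())
grid_search_RF_cu.fit(X_cudf_train, y_cudf_train.to_array())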

Using Sagemaker predictor in a Spark UDF function

I am trying to run inference on a Tensorflow model deployed on SageMaker from a Python Spark job.
I am running a (Databricks) notebook which has the following cell:
def call_predict():
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction
If I just call call_predict() it works fine:
call_predict()
and I get the output:
Process time: 65.261396
Out[61]: {'model_spec': {'name': u'generic_model',
'signature_name': u'serving_default',
'version': {'value': 1578909324L}},
'outputs': {u'ages': {'dtype': 1,
'float_val': [5.680944442749023],
'tensor_shape': {'dim': [{'size': 1L}]}}}}
but when I try to call it from a Spark context (in a UDF) I get a serialization error.
The code I'm trying to run is:
dataRange = range(1, 10001)
rangeRDD = sc.parallelize(dataRange, 8)
new_data = rangeRDD.map(lambda x : call_predict())
new_data.count()
and the error I get is:
---------------------------------------------------------------------------
PicklingError Traceback (most recent call last)
<command-2282434> in <module>()
2 rangeRDD = sc.parallelize(dataRange, 8)
3 new_data = rangeRDD.map(lambda x : call_predict())
----> 4 new_data.count()
5
/databricks/spark/python/pyspark/rdd.pyc in count(self)
1094 3
1095 """
-> 1096 return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
1097
1098 def stats(self):
/databricks/spark/python/pyspark/rdd.pyc in sum(self)
1085 6.0
1086 """
-> 1087 return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
1088
1089 def count(self):
/databricks/spark/python/pyspark/rdd.pyc in fold(self, zeroValue, op)
956 # zeroValue provided to each partition is unique from the one provided
957 # to the final reduce call
--> 958 vals = self.mapPartitions(func).collect()
959 return reduce(op, vals, zeroValue)
960
/databricks/spark/python/pyspark/rdd.pyc in collect(self)
829 # Default path used in OSS Spark / for non-credential passthrough clusters:
830 with SCCallSiteSync(self.context) as css:
--> 831 sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
832 return list(_load_from_socket(sock_info, self._jrdd_deserializer))
833
/databricks/spark/python/pyspark/rdd.pyc in _jrdd(self)
2573
2574 wrapped_func = _wrap_function(self.ctx, self.func, self._prev_jrdd_deserializer,
-> 2575 self._jrdd_deserializer, profiler)
2576 python_rdd = self.ctx._jvm.PythonRDD(self._prev_jrdd.rdd(), wrapped_func,
2577 self.preservesPartitioning, self.is_barrier)
/databricks/spark/python/pyspark/rdd.pyc in _wrap_function(sc, func, deserializer, serializer, profiler)
2475 assert serializer, "serializer should not be empty"
2476 command = (func, profiler, deserializer, serializer)
-> 2477 pickled_command, broadcast_vars, env, includes = _prepare_for_python_RDD(sc, command)
2478 return sc._jvm.PythonFunction(bytearray(pickled_command), env, includes, sc.pythonExec,
2479 sc.pythonVer, broadcast_vars, sc._javaAccumulator)
/databricks/spark/python/pyspark/rdd.pyc in _prepare_for_python_RDD(sc, command)
2461 # the serialized command will be compressed by broadcast
2462 ser = CloudPickleSerializer()
-> 2463 pickled_command = ser.dumps(command)
2464 if len(pickled_command) > sc._jvm.PythonUtils.getBroadcastThreshold(sc._jsc): # Default 1M
2465 # The broadcast will have same life cycle as created PythonRDD
/databricks/spark/python/pyspark/serializers.pyc in dumps(self, obj)
709 msg = "Could not serialize object: %s: %s" % (e.__class__.__name__, emsg)
710 cloudpickle.print_exec(sys.stderr)
--> 711 raise pickle.PicklingError(msg)
712
713
PicklingError: Could not serialize object: TypeError: can't pickle _ssl._SSLSocket objects
Not sure what this serialization error is - does it complain about failing to serialize the Predictor?
My notebook has a cell which was called prior to the above cells with the following imports:
import sagemaker
import boto3
from sagemaker.tensorflow.model import TensorFlowPredictor
import tensorflow as tf
import numpy as np
import time
The Predictor was created with the following code:
sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY,
                                aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY,
                                        aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session, sagemaker_client=sagemaker_client, sagemaker_runtime_client=sagemaker_runtime_client)
predictor = TensorFlowPredictor('endpoint-poc', sagemaker_session)
The UDF will be executed by multiple Spark tasks in parallel. Those tasks run in completely isolated Python processes, and they are scheduled on physically different machines. Hence any data those functions reference must be available on the same node. This is the case for everything created within the UDF.
Whenever you reference an object from outside the UDF, that data structure needs to be serialized (pickled) and shipped to each executor. Some object state, like an open socket connection, cannot be pickled.
You need to make sure that connections are opened lazily on each executor, and only on the first function call on that executor. The connection pooling topic is covered in the docs, though only in the Spark Streaming guide (it also applies to normal batch jobs).
Normally one would use the singleton pattern for this, but in Python people use the Borg pattern:
class Env:
    _shared_state = {
        "sagemaker_client": None,
        "sagemaker_runtime_client": None,
        "boto_session": None,
        "sagemaker_session": None,
        "predictor": None,
    }

    def __init__(self):
        # Borg pattern: every instance shares the same state dict
        self.__dict__ = self._shared_state
        if not self.predictor:
            self.sagemaker_client = boto3.client('sagemaker', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.sagemaker_runtime_client = boto3.client('sagemaker-runtime', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY, region_name='us-east-1')
            self.boto_session = boto3.Session(region_name='us-east-1')
            self.sagemaker_session = sagemaker.Session(self.boto_session, sagemaker_client=self.sagemaker_client, sagemaker_runtime_client=self.sagemaker_runtime_client)
            self.predictor = TensorFlowPredictor('endpoint-poc', self.sagemaker_session)

# ....
def call_predict():
    env = Env()  # lazily initializes the connections, once per process
    batch_size = 1
    data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
    tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[batch_size, len(data[0])], dtype=tf.float32)
    prediction = env.predictor.predict(tensor_proto)
    print("Process time: {}".format((time.clock() - start)))
    return prediction

new_data = rangeRDD.map(lambda x: call_predict())
The Env class is defined on the master node. Its _shared_state starts with empty entries. When the Env object is instantiated for the first time, it shares its state with all further instances of Env on any subsequent call to the UDF. In each separate, parallel-running process this happens exactly once. This way the sessions are shared and never need to be pickled.
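The connection pooling discussion in the Spark docs mentioned above suggests the same idea at partition granularity; a hedged variant of the code above using mapPartitions, so Env is touched once per partition instead of once per record:

def predict_partition(rows):
    env = Env()  # shared Borg state; initialized once per executor process
    for _ in rows:
        data = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2]]
        tensor_proto = tf.make_tensor_proto(values=np.asarray(data), shape=[1, len(data[0])], dtype=tf.float32)
        yield env.predictor.predict(tensor_proto)

new_data = rangeRDD.mapPartitions(predict_partition)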

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter what kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
import tensorflow as tf

data = {
    'a': [1.0, 2.0, 3.0],
    'b': [0.1, 0.3, 0.2],
}

with tf.Session() as sess:
    ds = tf.data.Dataset.from_tensor_slices(data)
    ds = ds.batch(1)
    it = ds.make_one_shot_iterator()
    features = it.get_next()
    # passing a tuple here raises the TypeError below
    concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
    try:
        while True:
            print(sess.run(concat))
    except tf.errors.OutOfRangeError:
        pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 #property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the backend's concatenate function:
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
In TensorFlow, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034 also states that it requires a list of tensors. Why? I don't know. In that function the tensors (a variable called "values") actually get checked for being a list or a tuple, but somewhere along the way you still get an error.
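In practice the fix for the snippet in the question is simply to hand the layer a list, exactly as in the working call at the top of the question:

concat = tf.keras.layers.Concatenate()([features['a'], features['b']])
# or, if you already hold a tuple of tensors:
concat = tf.keras.layers.Concatenate()(list((features['a'], features['b'])))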

How to fit two numpy matrices with Pyspark's SVM?

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious whether I can use PySpark's random forests to fit the previously mentioned matrices. From the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my question is: do I need to transform the numpy arrays into an RDD, and into which format do I need to convert the features and labels matrices in order to fit them with the RF implementation of MLlib?
Update
Then, following @CafeFeed's answer, I tried the following:
In [24]:
# CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 #classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
The docs are clear. You need an RDD of LabeledPoint:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees
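Applied to the matrices in the question, a rough, untested sketch (features and labels stand for the (878049, 6) and (878049,) arrays): build the RDD of LabeledPoint the same way, and pass numClasses as a plain Python int. The Py4JJavaError in the update most likely comes from handing np.unique(y), a NumPy array, to numClasses:

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
import numpy as np

data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(float(x[0]), x[1]))
numClasses = int(len(np.unique(labels)))  # a plain int, not a NumPy array
model = DecisionTree.trainClassifier(data, numClasses=numClasses, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)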