How to fit two numpy matrices with Pyspark's SVM? - numpy

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was wondering whether I can use PySpark's random forests to fit the previously mentioned matrices. According to the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my question is: do I need to transform the numpy arrays into an RDD, or into which format should I convert the features and labels matrices in order to fit them with the RF implementation of MLlib?
Update
Then, based on @CafeFeed's answer, I tried the following:
In [24]:
#CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])
In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np
​
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
​
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))
​
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
​
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 @classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)

The docs are clear: you need an RDD of LabeledPoint objects:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
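TreeEnsembleModel classifier with 2 trees
Regarding the exception in the update: numClasses must be a plain Python int (for example len(np.unique(y))), not the array returned by np.unique(y); the numpy array cannot be deserialized on the JVM side, which is what raises the PickleException.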

Related

How do you do a grid search with cuml without a datatype error?

I tried doing a grid search with cuML (RAPIDS 21.10), but I get a CuPy conversion error. This doesn't happen if I build the model with the same dataset without a grid search. It also works when the data is not held in GPU (video) memory, but then it is obviously slower than on the CPU.
The data is float32 for X and int32 for y:
X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
​
y_cudf_train = cudf.Series(y_train.values)
RF_classifier_cu = RandomForestClassifier_cu(random_state = 123)
grid_search_RF_cu = GridSearchCV_cu(estimator=RF_classifier_cu, param_grid=grid_RF, cv=3, verbose=1)
grid_search_RF_cu.fit(X_cudf_train,y_cudf_train)
print(grid_search_RF_cu.best_params_)
The error:
/home/asdanjer/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cuml/internals/api_decorators.py:794: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams==1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
return func(**kwargs)
---------------------------------------------------------------------------
TypeError
Traceback (most recent call last)
<timed exec> in <module>
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
800 fit_params = _check_fit_params(X, fit_params)
801
--> 802 cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
803 n_splits = cv_orig.get_n_splits(X, y, groups)
804
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_split.py in check_cv(cv, y, classifier)
2301 classifier
2302 and (y is not None)
-> 2303 and (type_of_target(y) in ("binary", "multiclass"))
2304 ):
2305 return StratifiedKFold(cv)
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
277 raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
278
--> 279 if is_multilabel(y):
280 return "multilabel-indicator"
281
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in is_multilabel(y)
149 warnings.simplefilter("error", np.VisibleDeprecationWarning)
150 try:
--> 151 y = np.asarray(y)
152 except np.VisibleDeprecationWarning:
153 # dtype=object should be provided explicitly for ragged arrays,
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cudf/core/frame.py in __array__(self, dtype)
1636
1637 def __array__(self, dtype=None):
-> 1638 raise TypeError(
1639 "Implicit conversion to a host NumPy array via __array__ is not "
1640 "allowed, To explicitly construct a GPU array, consider using "
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU array, consider using cupy.asarray(...)
To explicitly construct a host array, consider using .to_array()

Numba / Numpy - Understanding Error Message

I'm experimenting with Numba to try and speed up a union-find algorithm I'm working on. Here's some example code. When I experiment with some sample data I cannot understand the type complaint that Numba appears to be raising.
from numba import jit
import numpy as np
indices = np.arange(8806806, dtype=np.int64)
sizes = np.ones(8806806, dtype=np.int64)
connected_components = 8806806
@jit(npython=True)
def root(p: int) -> int:
while p != indices[p]:
indices[p] = indices[indices[p]]
p = indices[p]
return p
@jit(npython=True)
def connected( p: int, q: int) -> bool:
return root(p) == root(q)
@jit(npython=True)
def union( p: int, q: int) -> None:
root1 = root(p)
root2 = root(q)
if root1 == root2:
return
if (sizes[root1] < sizes[root2]):
indices[root1] = root2
sizes[root2] += sizes[root1]
else:
indices[root2] = root1
sizes[root1] += sizes[root2]
connected_components -= 1
@jit(nopython=True)
def process_values(arr):
for row in arr:
typed_arr = row.astype('int64')
for first, second in zip(arr, arr[1:]):
union(first, second)
process_values(
np.array(
[np.array([8018361, 4645960]),
np.array([1137555, 7763897]),
np.array([7532943, 2248813]),
np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
I cannot understand this error:
TypingError Traceback (most recent call last)
<ipython-input-45-62735e65f581> in <module>
44 np.array([1137555, 7763897]),
45 np.array([7532943, 2248813]),
---> 46 np.array([5352737, 71466, 3590473, 5352738, 2712260])], dtype='object'))
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in _compile_for_args(self, *args, **kws)
399 e.patch_message(msg)
400
--> 401 error_rewrite(e, 'typing')
402 except errors.UnsupportedError as e:
403 # Something unsupported is present in the user code, add help info
/opt/conda/lib/python3.7/site-packages/numba/core/dispatcher.py in error_rewrite(e, issue_type)
342 raise e
343 else:
--> 344 reraise(type(e), e, None)
345
346 argtypes = []
/opt/conda/lib/python3.7/site-packages/numba/core/utils.py in reraise(tp, value, tb)
78 value = tp()
79 if value.__traceback__ is not tb:
---> 80 raise value.with_traceback(tb)
81 raise value
82
TypingError: Failed in nopython mode pipeline (step: nopython frontend)
non-precise type array(pyobject, 1d, C)
[1] During: typing of argument at <ipython-input-45-62735e65f581> (36)
File "<ipython-input-45-62735e65f581>", line 36:
def process_values(arr):
for row in arr:
^
Does this have anything to do with process_values taking an array of irregularly shaped arrays? Any pointers? Thanks!
The problem is that Numba does not accept arrays of dtype 'object'. You seem to be placing arrays inside arrays; you will have to use lists inside lists instead. Look at the typed.List class in Numba: https://numba.pydata.org/numba-doc/dev/reference/pysupported.html#typed-list
Alternatively, you can use awkward arrays: https://github.com/scikit-hep/awkward-1.0

TypeError: "Set type is unordered" in OSMnx isochrones example

When running the OSMnx isochrones example, I get a TypeError: "Set type is unordered" in the last cell.
Any idea what's going wrong?
OSMnx 0.15.1 on Python 3.8.5, Pandas 1.1.1, GeoPandas 0.8.1.
It works as expected with Pandas 1.0.5, but fails with Pandas 1.1 or 1.1.1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
26 return isochrone_polys
27
---> 28 isochrone_polys = make_iso_polys(G, edge_buff=25, node_buff=0, infill=True)
29 fig, ax = ox.plot_graph(G, show=False, close=False, edge_color='#999999', edge_alpha=0.2, node_size=0)
30 for polygon, fc in zip(isochrone_polys, iso_colors):
in make_iso_polys(G, edge_buff, node_buff, infill)
5
6 node_points = [Point((data['x'], data['y'])) for node, data in subgraph.nodes(data=True)]
----> 7 nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
8 nodes_gdf = nodes_gdf.set_index('id')
9
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/geopandas/geodataframe.py in __init__(self, *args, **kwargs)
87 crs = kwargs.pop("crs", None)
88 geometry = kwargs.pop("geometry", None)
---> 89 super(GeoDataFrame, self).__init__(*args, **kwargs)
90
91 # need to set this before calling self['geometry'], because
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
81
82 # don't force copy because getting jammed in an ndarray anyway
---> 83 arrays = _homogenize(arrays, index, dtype)
84
85 columns = ensure_index(columns)
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
349 val = dict(val)
350 val = lib.fast_multiget(val, oindex._values, default=np.nan)
--> 351 val = sanitize_array(
352 val, index, dtype=dtype, copy=False, raise_cast_failure=False
353 )
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
450 subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
451 elif isinstance(data, abc.Set):
--> 452 raise TypeError("Set type is unordered")
453 elif lib.is_scalar(data) and index is not None and dtype is not None:
454 data = maybe_cast_to_datetime(data, dtype)
TypeError: Set type is unordered
This is an issue in the example. It initializes a data frame with subgraph.nodes():
nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
subgraph.nodes() is a NodeView, which behaves both like a dictionary and a set. These are unordered types, but Pandas needs an ordered collection such as a numpy array or list. Pandas 1.1 introduced a type check to catch this in issue 32582.
A workaround is to explicitly convert the NodeView to a list:
nodes_gdf = gpd.GeoDataFrame({'id': list(subgraph.nodes())}, geometry=node_points)
I submitted a bug and a PR, which has already been accepted, so this is no longer an issue.

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter which kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
data = {
'a': [1.0, 2.0, 3.0],
'b': [0.1, 0.3, 0.2],
}
with tf.Session() as sess:
ds = tf.data.Dataset.from_tensor_slices(data)
ds = ds.batch(1)
it = ds.make_one_shot_iterator()
features = it.get_next()
concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
try:
while True:
print(sess.run(concat))
except tf.errors.OutOfRangeError:
pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 @property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the Keras backend's concatenate function,
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
In TensorFlow, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states that it requires a list of tensors. Why? I don't know. In this function the tensors (a variable called "values") actually get checked for being a list or a tuple, but somewhere along the way you still get an error.

pyspark: creating a k-means clustering model using spark-ml with spark data frame

I am using the following code to create a clustering model:
import pandas as pd
pandas_df = pd.read_pickle('df_features.pickle')
spark_df = sqlContext.createDataFrame(pandas_df)
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
Then I got errors:
AnalysisException Traceback (most recent call last)
<ipython-input-26-00e1e2ba1983> in <module>()
3
4 kmeans = KMeans(k=2, seed=1.0)
----> 5 modela = kmeans.fit(spark_df)
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/base.pyc in fit(self, dataset, params)
62 return self.copy(params)._fit(dataset)
63 else:
---> 64 return self._fit(dataset)
65 else:
66 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
211
212 def _fit(self, dataset):
--> 213 java_model = self._fit_java(dataset)
214 return self._create_model(java_model)
215
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
208 """
209 self._transfer_params_to_java()
--> 210 return self._java_obj.fit(dataset._jdf)
211
212 def _fit(self, dataset):
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: u"cannot resolve '`features`' given input columns: [field_1, field_2, field_3, field_4, field_5, field_6, field_7];"
Did I create the data frame wrong? Does anyone know what I missed? Thanks!
You need to use VectorAssembler
http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler
from pyspark.ml.feature import VectorAssembler
vecAssembler = VectorAssembler(inputCols=spark_df.columns, outputCol="features")
vector_df = vecAssembler.transform(spark_df)
kmeans = KMeans().setK(n_clusters).setSeed(1)
model = kmeans.fit(vector_df )
K-means, as used below, requires an RDD of DenseVectors, so you need to create an RDD of DenseVectors in which each vector corresponds to one row of your dataframe. Supposing that your dataframe has three columns that you are feeding into the k-means model, I would refactor it along these lines:
spark_rdd = spark_df.rdd.sortByKey()
modelInput = spark_rdd.map(lambda x: Vectors.dense(x[0],x[1],x[2])).sortByKey()
modelObject = Kmeans.train(modelInput,2)
Then if you want to get the results back from an RDD into a dataframe, I would do something like:
labels = modelInput.map(lambda x: model.predict(x))
results = labels.zip(spark_rdd)
resultFrame = results.map(lambda x: Row(Label = x[0], Column1 = x[0][1], Column2 = x[1][1],Column3 = x[1][2]).toDF()
data = [(Vectors.dense( [x[0], x[1]]),) for x in pandas_df.iloc[0:,2:4].values]
spark_df = spark.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
For more details, refer to the official manual.