pyspark: creating a k-means clustering model using spark-ml with spark data frame - pandas

I am using the following code to create a clustering model:
import pandas as pd
pandas_df = pd.read_pickle('df_features.pickle')
spark_df = sqlContext.createDataFrame(pandas_df)
from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
Then I got this error:
AnalysisException Traceback (most recent call last)
<ipython-input-26-00e1e2ba1983> in <module>()
3
4 kmeans = KMeans(k=2, seed=1.0)
----> 5 modela = kmeans.fit(spark_df)
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/base.pyc in fit(self, dataset, params)
62 return self.copy(params)._fit(dataset)
63 else:
---> 64 return self._fit(dataset)
65 else:
66 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit(self, dataset)
211
212 def _fit(self, dataset):
--> 213 java_model = self._fit_java(dataset)
214 return self._create_model(java_model)
215
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/ml/wrapper.pyc in _fit_java(self, dataset)
208 """
209 self._transfer_params_to_java()
--> 210 return self._java_obj.fit(dataset._jdf)
211
212 def _fit(self, dataset):
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/lib/py4j-0.10.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
931 answer = self.gateway_client.send_command(command)
932 return_value = get_return_value(
--> 933 answer, self.gateway_client, self.target_id, self.name)
934
935 for temp_arg in temp_args:
/home/edamame/spark/spark-2.0.0-bin-hadoop2.6/python/pyspark/sql/utils.pyc in deco(*a, **kw)
67 e.java_exception.getStackTrace()))
68 if s.startswith('org.apache.spark.sql.AnalysisException: '):
---> 69 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
70 if s.startswith('org.apache.spark.sql.catalyst.analysis'):
71 raise AnalysisException(s.split(': ', 1)[1], stackTrace)
AnalysisException: u"cannot resolve '`features`' given input columns: [field_1, field_2, field_3, field_4, field_5, field_6, field_7];"
Did I create the data frame wrong? Does anyone know what I missed? Thanks!

You need to use VectorAssembler
http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.VectorAssembler
from pyspark.ml.feature import VectorAssembler
# Combine all feature columns into a single "features" vector column
vecAssembler = VectorAssembler(inputCols=spark_df.columns, outputCol="features")
vector_df = vecAssembler.transform(spark_df)
kmeans = KMeans().setK(2).setSeed(1)  # k=2, as in the question
model = kmeans.fit(vector_df)
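If you also need the cluster assignments, the fitted model can transform the assembled DataFrame; a minimal sketch, continuing with vector_df and model from above:
# Adds a "prediction" column with the assigned cluster index
predictions = model.transform(vector_df)
predictions.select("features", "prediction").show(5)
# Cluster centers as a list of numpy arrays
print(model.clusterCenters())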

The MLlib KMeans (pyspark.mllib.clustering.KMeans) requires an RDD of DenseVectors, where each vector corresponds to one row of your DataFrame. So, supposing your DataFrame has three columns that you are feeding into the k-means model, I would refactor it along the lines of:
from pyspark.mllib.clustering import KMeans
from pyspark.mllib.linalg import Vectors

# One DenseVector per row of the original DataFrame
spark_rdd = spark_df.rdd
modelInput = spark_rdd.map(lambda x: Vectors.dense(x[0], x[1], x[2]))
modelObject = KMeans.train(modelInput, 2)
Then if you want to get the results back from an RDD into a dataframe, I would do something like:
from pyspark.sql import Row

labels = modelInput.map(lambda x: modelObject.predict(x))
results = labels.zip(spark_rdd)
resultFrame = results.map(lambda x: Row(Label=x[0], Column1=x[1][0], Column2=x[1][1], Column3=x[1][2])).toDF()

from pyspark.ml.linalg import Vectors
from pyspark.ml.clustering import KMeans

# Build a DataFrame with a single "features" vector column directly from the pandas data
data = [(Vectors.dense([x[0], x[1]]),) for x in pandas_df.iloc[0:, 2:4].values]
spark_df = spark.createDataFrame(data, ["features"])
kmeans = KMeans(k=2, seed=1.0)
modela = kmeans.fit(spark_df)
For more details, refer to the official manual.

Related

to_numpy() doesn't work on pandas dataframe

I'm working with tensorflow. Previously I used tensorflowjs, but as you all know it has limited functionality. So, to create the model I started to use numpy + pandas + tensorflow on VS Code + ipynb.
I have a dataframe "seqs":
[code, C, M, S, string_to_classified]
The string can be classified into three categories (non-exclusive): C, M and S.
So the label should be [C, M, S].
This code works and gives me a nice pd dataframe:
trainingData = pd.DataFrame()
trainingData['string_to_classified'] = seqs['string_to_classified'].apply(nucleoBits)
trainingData['label'] = seqs[['C', 'M', 'S']].values.tolist()
However, when I try this:
trainingDataSet = tf.data.Dataset.from_tensor_slices((trainingData['string_to_classified'].values, trainingData['label'].values))
I got:
<ipython-input-89-897ad7666fa6> in <module>
----> 1 trainingDataSet = tf.data.Dataset.from_tensor_slices((trainingData['string_to_classified'].values, trainingData['label'].values))
2
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in from_tensor_slices(tensors, name)
812 Dataset: A `Dataset`.
813 """
--> 814 return TensorSliceDataset(tensors, name=name)
815
816 class _GeneratorState(object):
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\ops\dataset_ops.py in __init__(self, element, is_files, name)
4706 def __init__(self, element, is_files=False, name=None):
4707 """See `Dataset.from_tensor_slices()` for details."""
-> 4708 element = structure.normalize_element(element)
4709 batched_spec = structure.type_spec_from_value(element)
4710 self._tensors = structure.to_batched_tensor_list(batched_spec, element)
c:\Users\Dua\anaconda3\lib\site-packages\tensorflow\python\data\util\structure.py in normalize_element(element, element_signature)
124 dtype = getattr(spec, "dtype", None)
125 normalized_components.append(
--> 126 ops.convert_to_tensor(t, name="component_%d" % i, dtype=dtype))
127 return nest.pack_sequence_as(pack_as, normalized_components)
...
--> 102 return ops.EagerTensor(value, ctx.device_name, dtype)
103
104
ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
PS: tfjs was simpler:
const labels = tf.tensor3d(seqs.C, seqs.M, seqs.S)
and it was done
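No answer is shown here, but this error usually means the label column holds Python lists, so .values yields an object-dtype array that from_tensor_slices cannot convert. A minimal sketch of one possible fix, using a toy stand-in for trainingData (np.stack assumes every row has the same length):
import numpy as np
import pandas as pd
import tensorflow as tf

# Toy stand-in for trainingData: fixed-length encoded strings and 3-element label lists
trainingData = pd.DataFrame({
    'string_to_classified': [np.array([0, 1, 1, 0]), np.array([1, 0, 0, 1])],
    'label': [[1, 0, 0], [0, 1, 1]],
})

# Stack the per-row lists/arrays into plain 2-D numeric arrays before building the dataset
features = np.stack(trainingData['string_to_classified'].values).astype(np.float32)
labels = np.stack(trainingData['label'].values).astype(np.float32)

trainingDataSet = tf.data.Dataset.from_tensor_slices((features, labels))
print(trainingDataSet.element_spec)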

TypeError: "Set type is unordered" in OSMnx isochrones example

Running the OSMnx isochrones example, I get a TypeError: "Set type is unordered" on the last cell.
Any idea what's going wrong?
OSMnx 0.15.1 on Python 3.8.5, Pandas 1.1.1, GeoPandas 0.8.1.
It works as expected with Pandas 1.0.5, but fails with Pandas 1.1 or 1.1.1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
26 return isochrone_polys
27
---> 28 isochrone_polys = make_iso_polys(G, edge_buff=25, node_buff=0, infill=True)
29 fig, ax = ox.plot_graph(G, show=False, close=False, edge_color='#999999', edge_alpha=0.2, node_size=0)
30 for polygon, fc in zip(isochrone_polys, iso_colors):
in make_iso_polys(G, edge_buff, node_buff, infill)
5
6 node_points = [Point((data['x'], data['y'])) for node, data in subgraph.nodes(data=True)]
----> 7 nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
8 nodes_gdf = nodes_gdf.set_index('id')
9
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/geopandas/geodataframe.py in __init__(self, *args, **kwargs)
87 crs = kwargs.pop("crs", None)
88 geometry = kwargs.pop("geometry", None)
---> 89 super(GeoDataFrame, self).__init__(*args, **kwargs)
90
91 # need to set this before calling self['geometry'], because
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
81
82 # don't force copy because getting jammed in an ndarray anyway
---> 83 arrays = _homogenize(arrays, index, dtype)
84
85 columns = ensure_index(columns)
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
349 val = dict(val)
350 val = lib.fast_multiget(val, oindex._values, default=np.nan)
--> 351 val = sanitize_array(
352 val, index, dtype=dtype, copy=False, raise_cast_failure=False
353 )
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
450 subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
451 elif isinstance(data, abc.Set):
--> 452 raise TypeError("Set type is unordered")
453 elif lib.is_scalar(data) and index is not None and dtype is not None:
454 data = maybe_cast_to_datetime(data, dtype)
TypeError: Set type is unordered
This is an issue in the example: it initializes a data frame with subgraph.nodes()
nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
subgraph.nodes() is a NodeView, which behaves both like a dictionary and a set. These are unordered types, but Pandas needs an ordered collection such as a numpy array or list. Pandas 1.1 introduced a type check to catch this in issue 32582.
A workaround is to explicitly convert the NodeView to a list:
nodes_gdf = gpd.GeoDataFrame({'id': list(subgraph.nodes())}, geometry=node_points)
I submitted a bug and a PR, which has already been accepted, so this is no longer an issue.

How to fix "Data must be 1-dimensional" exception in python

I am trying to create a dataset for checking my logistic regression algorithm, but I am unable to create a pandas DataFrame from a dictionary.
I am getting a 'Data must be 1-dimensional' exception.
x1 = np.random.random(size=(10,1))*2
x2 = np.random.random(size=(10,1))*2
x3 = np.random.random(size=(10,1))*2 + 2
x4 = np.random.random(size=(10,1))*2 + 2
y0 = np.zeros(shape=(10,1))
y1 = np.ones(shape=(10,1))
plt.scatter(x1,x2, color='g', marker='o')
plt.scatter(x3,x4, color='r', marker='o')
dict_data = {'X1': np.concatenate((x1, x3)),
             'X2': np.concatenate((x2, x4)),
             'Y': np.concatenate((y0, y1))}
data = pd.DataFrame(dict_data, index=np.arange(20))
I am getting this as output, with the error 'Data must be 1-dimensional'.
--------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-49-fe81f079ebc6> in <module>
13 dict_data = { 'X1':np.concatenate((x1,x3)), 'X2':np.concatenate((x2,x4)),'Y':np.concatenate((y0,y1))}
14 #print(dict_data.shape)
---> 15 data = pd.DataFrame(dict_data, index=np.arange(20).reshape(20))
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
328 dtype=dtype, copy=copy)
329 elif isinstance(data, dict):
--> 330 mgr = self._init_dict(data, index, columns, dtype=dtype)
331 elif isinstance(data, ma.MaskedArray):
332 import numpy.ma.mrecords as mrecords
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _init_dict(self, data, index, columns, dtype)
459 arrays = [data[k] for k in keys]
460
--> 461 return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
462
463 def _init_ndarray(self, values, index, columns, dtype=None, copy=False):
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
6166
6167 # don't force copy because getting jammed in an ndarray anyway
-> 6168 arrays = _homogenize(arrays, index, dtype)
6169
6170 # from BlockManager perspective
~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _homogenize(data, index, dtype)
6475 v = lib.fast_multiget(v, oindex.values, default=np.nan)
6476 v = _sanitize_array(v, index, dtype=dtype, copy=False,
-> 6477 raise_cast_failure=False)
6478
6479 homogenized.append(v)
~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in _sanitize_array(data, index, dtype, copy, raise_cast_failure)
3273 elif subarr.ndim > 1:
3274 if isinstance(data, np.ndarray):
-> 3275 raise Exception('Data must be 1-dimensional')
3276 else:
3277 subarr = _asarray_tuplesafe(data, dtype=dtype)
Exception: Data must be 1-dimensional
np.random.random(size=(10,1)) produces a 2-dimensional array of shape (10, 1); however, pandas constructs DataFrames as a collection of 1-dimensional arrays.
So use np.random.random(size=(10,)) to make 1-D arrays, which can then be used to build the DataFrame.
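A minimal sketch of the fix, either generating 1-D arrays directly or flattening the existing (10, 1) arrays with ravel():
import numpy as np
import pandas as pd

x1 = np.random.random(size=10) * 2        # shape (10,), already 1-D
x2 = np.random.random(size=10) * 2
x3 = np.random.random(size=10) * 2 + 2
x4 = np.random.random(size=10) * 2 + 2
y0 = np.zeros(shape=10)
y1 = np.ones(shape=10)

# Alternatively: np.concatenate((x1, x3)).ravel() flattens a (20, 1) array down to (20,)
data = pd.DataFrame({'X1': np.concatenate((x1, x3)),
                     'X2': np.concatenate((x2, x4)),
                     'Y': np.concatenate((y0, y1))},
                    index=np.arange(20))
print(data.shape)   # (20, 3)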

tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter what kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import pandas as pd
import numpy as np
import tensorflow as tf

data = {
    'a': [1.0, 2.0, 3.0],
    'b': [0.1, 0.3, 0.2],
}

with tf.Session() as sess:
    ds = tf.data.Dataset.from_tensor_slices(data)
    ds = ds.batch(1)
    it = ds.make_one_shot_iterator()
    features = it.get_next()
    concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
    try:
        while True:
            print(sess.run(concat))
    except tf.errors.OutOfRangeError:
        pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 #property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls K.backend's concatenate function:
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
In TensorFlow, https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states that it requires a list of tensors. Why? I don't know. In that function the tensors (a variable called "values") actually get checked for being a list or a tuple, but somewhere along the way you still get an error.
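In practice the workaround is simply to pass a list rather than a tuple; a self-contained sketch (using tf.keras Input tensors instead of the Dataset iterator from the question):
import tensorflow as tf

a = tf.keras.Input(shape=(1,))
b = tf.keras.Input(shape=(1,))

# A list of tensors builds fine; the tuple form trips the TensorShape conversion in build()
concat = tf.keras.layers.Concatenate()([a, b])
print(concat.shape)   # (None, 2)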

How to fit two numpy matrices with Pyspark's SVM?

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious whether I can use PySpark's random forests to fit the previously mentioned matrices. From the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my question is: do I need to transform the numpy arrays to an RDD, or into which format should I convert the features and labels matrices in order to fit them with the RF implementation of MLlib?
Update
Then, following @CafeFeed's answer, I tried the following:
In [24]:
# CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [26]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 #classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
The docs are clear. You need an RDD:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees
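Applied to the matrices from the question, a minimal sketch (assuming the numpy arrays are named features and labels and that sc is the active SparkContext; note that numClasses must be a plain Python int, so passing the numpy array np.unique(y), as in the update above, is the likely source of the SerDe/pickle error):
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

# Build an RDD of LabeledPoint from the (878049, 6) features and (878049,) labels
data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))

trainingData, testData = data.randomSplit([0.7, 0.3])

model = DecisionTree.trainClassifier(trainingData,
                                     numClasses=len(np.unique(labels)),
                                     categoricalFeaturesInfo={})
predictions = model.predict(testData.map(lambda lp: lp.features))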