tf.keras.layers.Concatenate() works with a list but fails on a tuple of tensors - tensorflow

This will work:
tf.keras.layers.Concatenate()([features['a'], features['b']])
While this:
tf.keras.layers.Concatenate()((features['a'], features['b']))
Results in:
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'
Is that expected? If so, why does it matter what kind of sequence I pass?
Thanks,
Zach
EDIT (adding a code example):
import tensorflow as tf
import pandas as pd
import numpy as np

data = {
    'a': [1.0, 2.0, 3.0],
    'b': [0.1, 0.3, 0.2],
}

with tf.Session() as sess:
    ds = tf.data.Dataset.from_tensor_slices(data)
    ds = ds.batch(1)
    it = ds.make_one_shot_iterator()
    features = it.get_next()
    concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
    try:
        while True:
            print(sess.run(concat))
    except tf.errors.OutOfRangeError:
        pass
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-135-0e1a45017941> in <module>()
6 features = it.get_next()
7
----> 8 concat = tf.keras.layers.Concatenate()((features['a'], features['b']))
9
10
google3/third_party/tensorflow/python/keras/engine/base_layer.py in __call__(self, inputs, *args, **kwargs)
751 # the user has manually overwritten the build method do we need to
752 # build it.
--> 753 self.build(input_shapes)
754 # We must set self.built since user defined build functions are not
755 # constrained to set self.built.
google3/third_party/tensorflow/python/keras/utils/tf_utils.py in wrapper(instance, input_shape)
148 tuple(tensor_shape.TensorShape(x).as_list()) for x in input_shape]
149 else:
--> 150 input_shape = tuple(tensor_shape.TensorShape(input_shape).as_list())
151 output_shape = fn(instance, input_shape)
152 if output_shape is not None:
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, dims)
688 else:
689 # Got a list of dimensions
--> 690 self._dims = [as_dimension(d) for d in dims_iter]
691
692 @property
google3/third_party/tensorflow/python/framework/tensor_shape.py in as_dimension(value)
630 return value
631 else:
--> 632 return Dimension(value)
633
634
google3/third_party/tensorflow/python/framework/tensor_shape.py in __init__(self, value)
183 raise TypeError("Cannot convert %s to Dimension" % value)
184 else:
--> 185 self._value = int(value)
186 if (not isinstance(value, compat.bytes_or_text_types) and
187 self._value != value):
TypeError: int() argument must be a string or a number, not 'TensorShapeV1'

https://github.com/keras-team/keras/blob/master/keras/layers/merge.py#L329
The comment on the Concatenate class states that it requires a list.
This class calls the backend's concatenate function,
https://github.com/keras-team/keras/blob/master/keras/backend/tensorflow_backend.py#L2041
which also states that it requires a list.
The TensorFlow op at https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/python/ops/array_ops.py#L1034
also states that it requires a list of tensors. Why? I don't know. In that function the tensors (a variable called "values") are actually checked for being a list or tuple, but somewhere along the way you still get an error.
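The practical fix is simply to pass a list, as in the question's first snippet. A minimal sketch reusing the names from the question (converting an existing tuple with list() works just as well):
# Pass the tensors as a list rather than a tuple.
concat = tf.keras.layers.Concatenate()([features['a'], features['b']])
# Or, if the tensors already come as a tuple:
inputs = (features['a'], features['b'])
concat = tf.keras.layers.Concatenate()(list(inputs))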

Related

How do you do a grid search with cuml without a datatype error?

I tried doing a grid search with cuML (RAPIDS 21.10) and get a cupy conversion error. This doesn't happen if I build the model with the same dataset without a grid search. It also works when the data does not live in video memory, but then it is obviously slower than on the CPU.
The data is float32 for X and int32 for y:
X_cudf_train = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)

y_cudf_train = cudf.Series(y_train.values)

RF_classifier_cu = RandomForestClassifier_cu(random_state=123)
grid_search_RF_cu = GridSearchCV_cu(estimator=RF_classifier_cu, param_grid=grid_RF, cv=3, verbose=1)
grid_search_RF_cu.fit(X_cudf_train, y_cudf_train)
print(grid_search_RF_cu.best_params_)
The error:
/home/asdanjer/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cuml/internals/api_decorators.py:794: UserWarning: For reproducible results in Random Forest Classifier or for almost reproducible results in Random Forest Regressor, n_streams==1 is recommended. If n_streams is > 1, results may vary due to stream/thread timing differences, even when random_state is set
return func(**kwargs)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<timed exec> in <module>
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
800 fit_params = _check_fit_params(X, fit_params)
801
--> 802 cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
803 n_splits = cv_orig.get_n_splits(X, y, groups)
804
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/model_selection/_split.py in check_cv(cv, y, classifier)
2301 classifier
2302 and (y is not None)
-> 2303 and (type_of_target(y) in ("binary", "multiclass"))
2304 ):
2305 return StratifiedKFold(cv)
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in type_of_target(y)
277 raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")
278
--> 279 if is_multilabel(y):
280 return "multilabel-indicator"
281
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/sklearn/utils/multiclass.py in is_multilabel(y)
149 warnings.simplefilter("error", np.VisibleDeprecationWarning)
150 try:
--> 151 y = np.asarray(y)
152 except np.VisibleDeprecationWarning:
153 # dtype=object should be provided explicitly for ragged arrays,
~/miniconda3/envs/rapids-21.10/lib/python3.8/site-packages/cudf/core/frame.py in __array__(self, dtype)
1636
1637 def __array__(self, dtype=None):
-> 1638 raise TypeError(
1639 "Implicit conversion to a host NumPy array via __array__ is not "
1640 "allowed, To explicitly construct a GPU array, consider using "
TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, To explicitly construct a GPU array, consider using cupy.asarray(...)
To explicitly construct a host array, consider using .to_array()
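No answer is given here, but the error message itself points at one thing to try: make the host conversion explicit instead of letting sklearn call __array__ implicitly. A hedged sketch, reusing the names from the question and assuming .to_array() behaves as the message describes (copying the cudf Series to a host NumPy array):
# Give sklearn's check_cv a host (CPU) array for y instead of a cudf Series,
# so the stratification check never triggers the forbidden implicit conversion.
y_host = y_cudf_train.to_array()
grid_search_RF_cu = GridSearchCV_cu(estimator=RF_classifier_cu, param_grid=grid_RF, cv=3, verbose=1)
grid_search_RF_cu.fit(X_cudf_train, y_host)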

TypeError: "Set type is unordered" in OSMnx isochrones example

Running the OSMnx isochrones example, I get a TypeError: "Set type is unordered" on the last cell.
Any idea what's going wrong?
OSMnx 0.15.1 on Python 3.8.5, Pandas 1.1.1, GeoPandas 0.8.1.
It works as expected with Pandas 1.0.5, but fails with Pandas 1.1 or 1.1.1
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
in
26 return isochrone_polys
27
---> 28 isochrone_polys = make_iso_polys(G, edge_buff=25, node_buff=0, infill=True)
29 fig, ax = ox.plot_graph(G, show=False, close=False, edge_color='#999999', edge_alpha=0.2, node_size=0)
30 for polygon, fc in zip(isochrone_polys, iso_colors):
in make_iso_polys(G, edge_buff, node_buff, infill)
5
6 node_points = [Point((data['x'], data['y'])) for node, data in subgraph.nodes(data=True)]
----> 7 nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
8 nodes_gdf = nodes_gdf.set_index('id')
9
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/geopandas/geodataframe.py in __init__(self, *args, **kwargs)
87 crs = kwargs.pop("crs", None)
88 geometry = kwargs.pop("geometry", None)
---> 89 super(GeoDataFrame, self).__init__(*args, **kwargs)
90
91 # need to set this before calling self['geometry'], because
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
466
467 elif isinstance(data, dict):
--> 468 mgr = init_dict(data, index, columns, dtype=dtype)
469 elif isinstance(data, ma.MaskedArray):
470 import numpy.ma.mrecords as mrecords
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
281 arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
282 ]
--> 283 return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
284
285
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype, verify_integrity)
81
82 # don't force copy because getting jammed in an ndarray anyway
---> 83 arrays = _homogenize(arrays, index, dtype)
84
85 columns = ensure_index(columns)
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/internals/construction.py in _homogenize(data, index, dtype)
349 val = dict(val)
350 val = lib.fast_multiget(val, oindex._values, default=np.nan)
--> 351 val = sanitize_array(
352 val, index, dtype=dtype, copy=False, raise_cast_failure=False
353 )
~/miniconda3/envs/osmnx-examples/lib/python3.8/site-packages/pandas/core/construction.py in sanitize_array(data, index, dtype, copy, raise_cast_failure)
450 subarr = _try_cast(arr, dtype, copy, raise_cast_failure)
451 elif isinstance(data, abc.Set):
--> 452 raise TypeError("Set type is unordered")
453 elif lib.is_scalar(data) and index is not None and dtype is not None:
454 data = maybe_cast_to_datetime(data, dtype)
TypeError: Set type is unordered
This is an issue in the example: it initializes a data frame with subgraph.nodes()
nodes_gdf = gpd.GeoDataFrame({'id': subgraph.nodes()}, geometry=node_points)
subgraph.nodes() is a NodeView, which behaves both like a dictionary and a set. These are unordered types, but Pandas needs an ordered collection such as a numpy array or list. Pandas 1.1 introduced a type check to catch this in issue 32582.
A workaround is to explicitly convert the NodeView to a list:
nodes_gdf = gpd.GeoDataFrame({'id': list(subgraph.nodes())}, geometry=node_points)
I submitted a bug and a PR, which has already been accepted, so this is no longer an issue.

TypeError using sns.distplot() on dataframe with one row

I'm plotting subsets of a dataframe, and one subset happens to have only one row. This is the only reason I can think of for why it's causing problems. This is what it looks like:
problem_dataframe = prob_df[prob_df['Date']==7]
problem_dataframe.head()
I try to do:
sns.distplot(problem_dataframe['floatTime'])
But I get the error:
TypeError: len() of unsized object
Would someone please tell me what's causing this and how to work around it?
The TypeError is resolved by setting bins=1.
But that uncovers a different error, ValueError: x must be 1D or 2D, which gets triggered by an internal function in Matplotlib's hist(), called _normalize_input():
import pandas as pd
import seaborn as sns
df = pd.DataFrame(['Tue','Feb',7,'15:37:58',2017,15.6196]).T
df.columns = ['Day','Month','Date','Time','Year','floatTime']
sns.distplot(df.floatTime, bins=1)
Output:
ValueError Traceback (most recent call last)
<ipython-input-25-858df405d200> in <module>()
6 df.columns = ['Day','Month','Date','Time','Year','floatTime']
7 df.floatTime.values.astype(float)
----> 8 sns.distplot(df.floatTime, bins=1)
/home/andrew/anaconda3/lib/python3.6/site-packages/seaborn/distributions.py in distplot(a, bins, hist, kde, rug, fit, hist_kws, kde_kws, rug_kws, fit_kws, color, vertical, norm_hist, axlabel, label, ax)
213 hist_color = hist_kws.pop("color", color)
214 ax.hist(a, bins, orientation=orientation,
--> 215 color=hist_color, **hist_kws)
216 if hist_color != color:
217 hist_kws["color"] = hist_color
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
1890 warnings.warn(msg % (label_namer, func.__name__),
1891 RuntimeWarning, stacklevel=2)
-> 1892 return func(ax, *args, **kwargs)
1893 pre_doc = inner.__doc__
1894 if pre_doc is None:
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
6141 x = np.array([[]])
6142 else:
-> 6143 x = _normalize_input(x, 'x')
6144 nx = len(x) # number of datasets
6145
/home/andrew/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in _normalize_input(inp, ename)
6080 else:
6081 raise ValueError(
-> 6082 "{ename} must be 1D or 2D".format(ename=ename))
6083 if inp.shape[1] < inp.shape[0]:
6084 warnings.warn(
ValueError: x must be 1D or 2D
_normalize_input() was removed from Matplotlib (it looks like sometime last year), so I guess Seaborn is referring to an older version under the hood.
You can see _normalize_input() in this old commit:
def _normalize_input(inp, ename='input'):
    """Normalize 1 or 2d input into list of np.ndarray or
    a single 2D np.ndarray.

    Parameters
    ----------
    inp : iterable
    ename : str, optional
        Name to use in ValueError if `inp` can not be normalized
    """
    if (isinstance(x, np.ndarray) or
            not iterable(cbook.safe_first_element(inp))):
        # TODO: support masked arrays;
        inp = np.asarray(inp)
        if inp.ndim == 2:
            # 2-D input with columns as datasets; switch to rows
            inp = inp.T
        elif inp.ndim == 1:
            # new view, single row
            inp = inp.reshape(1, inp.shape[0])
        else:
            raise ValueError(
                "{ename} must be 1D or 2D".format(ename=ename))
    ...
I can't figure out why inp.ndim!=1, though. Performing the same np.asarray().ndim on the input returns 1 as expected:
np.asarray(df.floatTime).ndim # 1
So you're facing a few obstacles if you want to make a single-valued input work with sns.distplot().
Suggested Workaround
Check for a single-element df.floatTime, and if that's the case, just use plt.hist() instead (which is what distplot goes to anyway, along with KDE):
plt.hist(df.floatTime)
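A minimal sketch of that check, reusing problem_dataframe from the question (assumes the usual pyplot and seaborn imports):
import matplotlib.pyplot as plt
import seaborn as sns

# Fall back to a plain histogram when the subset has only one row,
# since distplot's automatic binning fails on a single value.
if len(problem_dataframe) > 1:
    sns.distplot(problem_dataframe['floatTime'])
else:
    plt.hist(problem_dataframe['floatTime'])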

How to fit two numpy matrices with Pyspark's SVM?

I have two numpy matrices like this:
Features:
(878049, 6)
<type 'numpy.ndarray'>
Labels:
(878049,)
<type 'numpy.ndarray'>
I was curious whether I can use PySpark's random forests to fit the previously mentioned matrices. From the documentation, the RF algorithm can be used as follows:
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
impurity='gini', maxDepth=4, maxBins=32)
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
Thus, my question is: do I need to transform the numpy arrays to an RDD, and into which format do I need to convert the features and labels matrices in order to fit them with the RF implementation of MLlib?
Update
Then, following the answer from @CafeFeed, I tried the following:
In [24]:

#CV
(trainingData, testData) = data.randomSplit([0.7, 0.3])

In [26]:

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
import numpy as np

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y))

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda (v, p): v != p).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())
However, I got this exception:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-27-ded4b074521b> in <module>()
6 # Empty categoricalFeaturesInfo indicates all features are continuous.
7
----> 8 model = DecisionTree.trainClassifier(trainingData, numClasses=np.unique(y), categoricalFeaturesInfo={},impurity='gini', maxDepth=5, maxBins=32)
9
10 # Evaluate model on test instances and compute test error
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in trainClassifier(cls, data, numClasses, categoricalFeaturesInfo, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
183 """
184 return cls._train(data, "classification", numClasses, categoricalFeaturesInfo,
--> 185 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
186
187 @classmethod
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/tree.pyc in _train(cls, data, type, numClasses, features, impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
124 assert isinstance(first, LabeledPoint), "the data should be RDD of LabeledPoint"
125 model = callMLlibFunc("trainDecisionTreeModel", data, type, numClasses, features,
--> 126 impurity, maxDepth, maxBins, minInstancesPerNode, minInfoGain)
127 return DecisionTreeModel(model)
128
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callMLlibFunc(name, *args)
128 sc = SparkContext._active_spark_context
129 api = getattr(sc._jvm.PythonMLLibAPI(), name)
--> 130 return callJavaFunc(sc, api, *args)
131
132
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in callJavaFunc(sc, func, *args)
120 def callJavaFunc(sc, func, *args):
121 """ Call Java Function """
--> 122 args = [_py2java(sc, a) for a in args]
123 return _java2py(sc, func(*args))
124
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/mllib/common.pyc in _py2java(sc, obj)
86 else:
87 data = bytearray(PickleSerializer().dumps(obj))
---> 88 obj = sc._jvm.SerDe.loads(data)
89 return obj
90
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/pyspark/sql/utils.pyc in deco(*a, **kw)
34 def deco(*a, **kw):
35 try:
---> 36 return f(*a, **kw)
37 except py4j.protocol.Py4JJavaError as e:
38 s = e.java_exception.toString()
/usr/local/Cellar/apache-spark/1.5.1/libexec/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.mllib.api.python.SerDe.loads.
: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.core.multiarray._reconstruct)
at net.razorvine.pickle.objects.ClassDictConstructor.construct(ClassDictConstructor.java:23)
at net.razorvine.pickle.Unpickler.load_reduce(Unpickler.java:701)
at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:171)
at net.razorvine.pickle.Unpickler.load(Unpickler.java:85)
at net.razorvine.pickle.Unpickler.loads(Unpickler.java:98)
at org.apache.spark.mllib.api.python.SerDe$.loads(PythonMLLibAPI.scala:1462)
at org.apache.spark.mllib.api.python.SerDe.loads(PythonMLLibAPI.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:207)
at java.lang.Thread.run(Thread.java:745)
The docs are clear. You need an RDD of LabeledPoint:
>>> from pyspark.mllib.regression import LabeledPoint
>>> from pyspark.mllib.tree import RandomForest
>>> import numpy as np
>>>
>>> np.random.seed(1)
>>> features = np.random.random((100, 10))
>>> labels = np.random.choice([0, 1], 100)
>>> data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
>>> RandomForest.trainClassifier(data, numClasses=2, categoricalFeaturesInfo={}, numTrees=2)
TreeEnsembleModel classifier with 2 trees
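Applying the same pattern to the matrices from the question is then a small step. A sketch, assuming features and labels are the (878049, 6) and (878049,) ndarrays and sc is an active SparkContext:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
import numpy as np

# Build an RDD of LabeledPoint from the two ndarrays, then split and train.
data = sc.parallelize(zip(labels, features)).map(lambda x: LabeledPoint(x[0], x[1]))
trainingData, testData = data.randomSplit([0.7, 0.3])
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=len(np.unique(labels)),
                                     categoricalFeaturesInfo={}, numTrees=3)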

"TypeError: only length-1 arrays can be converted to Python scalars"

This looks like a bug in pandas.Series.all to me (df is a Pandas DataFrame object, and pd is shorthand for pandas):
In [18]: df.foo.apply(lambda x: x.startswith(u'bar')).head()
Out[18]:
0 True
1 False
2 True
3 True
4 False
Name: foo
In [19]: (df.baz == u'frobozz').head()
Out[19]:
0 False
1 False
2 True
3 True
4 False
Name: baz
In [20]: (type(Out[18]), type(Out[19]))
Out[20]: (pandas.core.series.Series, pandas.core.series.Series)
In [21]: pd.Series.all(Out[18], Out[19])
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-310-d132f431d45f> in <module>()
----> 1 pd.Series.all(Out[18], Out[19])
/home/jones/.virtualenvs/proj/local/lib/python2.7/site-packages/pandas/core/series.pyc in f(self, *args, **kwargs)
276 @Appender(func.__doc__)
277 def f(self, *args, **kwargs):
--> 278 result = func(self, *args, **kwargs)
279 if isinstance(result, np.ndarray) and result.ndim == 0:
280 # return NumPy type
/home/jones/.virtualenvs/proj/local/lib/python2.7/site-packages/numpy/core/_methods.pyc in _all(a, axis, dtype, out, keepdims)
28 def _all(a, axis=None, dtype=None, out=None, keepdims=False):
29 return um.logical_and.reduce(a, axis=axis, dtype=dtype, out=out,
---> 30 keepdims=keepdims)
31
32 def _count_reduce_items(arr, axis):
TypeError: only length-1 arrays can be converted to Python scalars
What's going on?
This doesn't seem like a bug to me, but I don't know what you think pd.Series.all(Out[18], Out[19]) does.
>>> pd.Series.all?
Type: instancemethod
String Form:<unbound method Series.all>
File: /Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas-0.10.1-py2.7-macosx-10.6-intel.egg/pandas/core/series.py
Definition: pd.Series.all(self, *args, **kwargs)
Docstring:
a.all(axis=None, out=None)
Returns True if all elements evaluate to True.
You're using the version from the class, so the first argument is being interpreted as the instance, and the second as the axis. pandas is trying to convert the second Series you're passing to an integer to make sense of it as an axis, which can't work if the array has length > 1.
From the doc, pd.Series.all appears to take only one Series object. Try this:
pd.Series.all(Out[18].append(Out[19]))
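If the goal was simply to check that every element of both Series is True, a hedged sketch of the instance-level calls (reusing Out[18] and Out[19] from the question):
Out[18].all() and Out[19].all()    # each Series is all True
(Out[18] & Out[19]).all()          # equivalent: elementwise AND, then reduce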