Scoring returning a numpy.core.memmap instead of a numpy.Number in grid search - numpy

We can reproduce the following problem (so far only within the context of our application) on Ubuntu 15.04 and OS X with scikit-learn 0.17, when using GridSearchCV with a LogisticRegression on larger data sets.
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/pipeline.py in fit(self=Pipeline(steps=[('cpencoder', <cpml.whitebox.Lin...s', refit=True, scoring=u'roc_auc', verbose=1))]), X= Unnamed: 0 member_id loan_a... 42.993346
[152536 rows x 45 columns], y=array([0, 1, 0, ..., 1, 1, 0]), **fit_params={})
160 y : iterable, default=None
161 Training targets. Must fulfill label requirements for all steps of
162 the pipeline.
163 """
164 Xt, fit_params = self._pre_transform(X, y, **fit_params)
--> 165 self.steps[-1][-1].fit(Xt, y, **fit_params)
self.steps.fit = undefined
Xt = array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]])
y = array([0, 1, 0, ..., 1, 1, 0])
fit_params = {}
166 return self
167
168 def fit_transform(self, X, y=None, **fit_params):
169 """Fit all the transforms one after the other and transform the
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring=u'roc_auc', verbose=1), X=array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=array([0, 1, 0, ..., 1, 1, 0]))
799 y : array-like, shape = [n_samples] or [n_samples, n_output], optional
800 Target relative to X for classification or regression;
801 None for unsupervised learning.
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
self._fit = <bound method GridSearchCV._fit of GridSearchCV(...obs', refit=True, scoring=u'roc_auc', verbose=1)>
X = array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]])
y = array([0, 1, 0, ..., 1, 1, 0])
self.param_grid = {'C': [1], 'class_weight': ['auto'], 'fit_intercept': [False], 'intercept_scaling': [1], 'penalty': ['l2']}
805
806
807 class RandomizedSearchCV(BaseSearchCV):
808 """Randomized search on hyper parameters.
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
...jobs', refit=True, scoring=u'roc_auc', verbose=1), X=array([[ 0.00000000e+00, 1.29659900e+06, 5....000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=array([0, 1, 0, ..., 1, 1, 0]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
548 )(
549 delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
550 train, test, self.verbose, parameters,
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
parameters = undefined
parameter_iterable = <sklearn.grid_search.ParameterGrid object>
554 for train, test in cv)
555
556 # Out is a list of triplet: score, estimator, n_test_samples
557 n_fits = len(out)
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=2), iterable=<generator object <genexpr>>)
807 if pre_dispatch == "all" or n_jobs == 1:
808 # The iterable was consumed all at once by the above for loop.
809 # No need to wait for async callbacks to trigger to
810 # consumption.
811 self._iterating = False
--> 812 self.retrieve()
self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=2)>
813 # Make sure that we get a last message telling us we are done
814 elapsed_time = time.time() - self._start_time
815 self._print('Done %3i out of %3i | elapsed: %s finished',
816 (len(self._output), len(self._output),
---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError Mon Jan 18 11:58:09 2016
PID: 71840 Python 2.7.10: /Users/samuelhopkins/.virtualenvs/cpml/bin/python
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
67 def __init__(self, iterator_slice):
68 self.items = list(iterator_slice)
69 self._size = len(self.items)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
75 return self._size
76
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=LogisticRegression(C=1, class_weight='auto', dua... tol=0.0001, verbose=0, warm_start=False), X=memmap([[ 0.00000000e+00, 1.29659900e+06, 5...000000e+00, 0.00000000e+00, 4.29933458e+01]]), y=memmap([0, 1, 0, ..., 1, 1, 0]), scorer=make_scorer(roc_auc_score, needs_threshold=True), train=array([ 49100, 49101, 49102, ..., 152533, 152534, 152535]), test=array([ 0, 1, 2, ..., 57517, 57522, 57532]), verbose=1, parameters={'C': 1, 'class_weight': 'auto', 'fit_intercept': False, 'intercept_scaling': 1, 'penalty': 'l2'}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
1545 " numeric value. (Hint: if using 'raise', please"
1546 " make sure that it has been spelled correctly.)"
1547 )
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
1553
1554 scoring_time = time.time() - start_time
...........................................................................
/Users/samuelhopkins/.virtualenvs/cpml/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator=LogisticRegression(C=1, class_weight='auto', dua... tol=0.0001, verbose=0, warm_start=False), X_test=memmap([[ 0.00000000e+00, 1.29659900e+06, 5...000000e+01, 0.00000000e+00, 4.29933458e+01]]), y_test=memmap([0, 1, 0, ..., 1, 1, 1]), scorer=make_scorer(roc_auc_score, needs_threshold=True))
1604 score = scorer(estimator, X_test)
1605 else:
1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
-> 1609 % (str(score), type(score)))
1610 return score
1611
1612
1613 def _permutation_test_score(estimator, X, y, cv, scorer):
ValueError: scoring must return a number, got 0.998981811748 (<class 'numpy.core.memmap.memmap'>) instead.
We have made several attempts to reproduce it outside of the context of the application, but have not had any luck. We made the following change to cross_validation.py and it fixed our particular problem:
...
    if isinstance(score, np.core.memmap):
        score = np.float(score)
    if not isinstance(score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s) instead."
...
Some more information:
we are on python 2.7
we are using a Pipeline to ensure all inputs are numeric
My questions are the following:
How might we go about reproducing this problem so as to cause the scorer to return a memmap?
Is anyone else having this particular problem?
Is the change we made in cross_validation.py actually a decent solution?
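Regarding the last question, an alternative to patching cross_validation.py might be to wrap the scoring function so it always returns a built-in float before the isinstance check ever sees it. A rough sketch (the wrapper name is only illustrative):

from sklearn.metrics import make_scorer, roc_auc_score

def roc_auc_as_float(y_true, y_score):
    # coerce whatever array-like / memmap-backed scalar comes back into a plain Python float
    return float(roc_auc_score(y_true, y_score))

# passed to GridSearchCV via scoring=auc_scorer instead of scoring=u'roc_auc'
auc_scorer = make_scorer(roc_auc_as_float, needs_threshold=True)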

Yes, I had a similar case.
I fell in love with memmaps due to O/S limits on memory allocation, and I consider them a smart tool for large-scale machine learning, using them in .fit() and other sklearn methods (GridSearchCV() not yet being the case, due to its adverse effect of pre-allocating memory for large hyperparameter grids with n_jobs = -1).
How might we ... reproduce ...? As far as I remember, my case was similar, and the change from an "ordinary" numpy.ndarray to a numpy.memmap() started these artifacts. So, if you want to create one artificially, wrap your data into a memmapped array representation and have that returned, even if it contains only a single cell of data, instead of a plain number. You will receive a view into a memmapped sub-range of the generic array representation of that cell.
Is the change ... a decent solution? Well, I got rid of the memmapped wrapper by explicitly returning a cell value, referencing the result's [0] component. An enforced conversion via float() also seems fine.
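A tiny sketch of the single-cell artifact I mean: a memmapped sub-range of an array is still a memmap, not a number, and only an explicit conversion (or taking the [0] component and casting) satisfies the numbers.Number check (the file path below is arbitrary):

import numbers
import numpy as np

# a small memmap-backed array on disk
mm = np.memmap('/tmp/score_demo.dat', dtype='float64', mode='w+', shape=(4,))
mm[:] = [0.1, 0.2, 0.3, 0.4]

sub = mm[:1]                                         # a one-cell view, but still a memmap
print(type(sub), isinstance(sub, numbers.Number))    # memmap, False -> the ValueError above

score = float(sub[0])                                # explicit conversion, as in the patch above
print(isinstance(score, numbers.Number))             # True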

Related

Error resulting from ImageDataGenerator during data augmentation

Can someone please help me fix this error? The code works fine before the for loop: the image array prints as expected. Is there something wrong with the for loop? The output should be augmented copies of the input image saved to a directory. The input image is a jpg image.
The code I wrote:
import keras
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
from skimage import io  # assumed: io.imread below comes from scikit-image

data_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='contrast',
    cval=125
)
x = io.imread('mona.jpg')
x = x.reshape((1, ) + x.shape)  # Array with shape (1, 256, 256, 3)
i = 0
for batch in data_gen.flow(x, batch_size=16, save_to_dir='/Users/ghad/Desktop',
                           save_prefix='aug',
                           save_format='jpg'):
    i += 1
    if i > 20:
        break
The generated error:
RuntimeError Traceback (most recent call last)
Input In [14], in <cell line: 31>()
28 x = x.reshape((1, ) + x.shape) #Array with shape (1, 256, 256, 3)
30 i = 0
---> 31 for batch in data_gen.flow(x, batch_size=16,
32 save_to_dir='/Users/ghadahalhabib/Desktop',
33 save_prefix='aug',
34 save_format='jpg'):
35 i += 1
36 if i > 20:
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:148, in Iterator.__next__(self, *args, **kwargs)
147 def __next__(self, *args, **kwargs):
--> 148 return self.next(*args, **kwargs)
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:160, in Iterator.next(self)
157 index_array = next(self.index_generator)
158 # The transformation of images is not under thread lock
159 # so it can be done in parallel
--> 160 return self._get_batches_of_transformed_samples(index_array)
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:709, in NumpyArrayIterator._get_batches_of_transformed_samples(self, index_array)
707 x = self.x[j]
708 params = self.image_data_generator.get_random_transform(x.shape)
--> 709 x = self.image_data_generator.apply_transform(
710 x.astype(self.dtype), params)
711 x = self.image_data_generator.standardize(x)
712 batch_x[i] = x
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:1800, in ImageDataGenerator.apply_transform(self, x, transform_parameters)
1797 img_col_axis = self.col_axis - 1
1798 img_channel_axis = self.channel_axis - 1
-> 1800 x = apply_affine_transform(
1801 x,
1802 transform_parameters.get('theta', 0),
1803 transform_parameters.get('tx', 0),
1804 transform_parameters.get('ty', 0),
1805 transform_parameters.get('shear', 0),
1806 transform_parameters.get('zx', 1),
1807 transform_parameters.get('zy', 1),
1808 row_axis=img_row_axis,
1809 col_axis=img_col_axis,
1810 channel_axis=img_channel_axis,
1811 fill_mode=self.fill_mode,
1812 cval=self.cval,
1813 order=self.interpolation_order)
1815 if transform_parameters.get('channel_shift_intensity') is not None:
1816 x = apply_channel_shift(x,
1817 transform_parameters['channel_shift_intensity'],
1818 img_channel_axis)
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:2324, in apply_affine_transform(x, theta, tx, ty, shear, zx, zy, row_axis, col_axis, channel_axis, fill_mode, cval, order)
2321 final_affine_matrix = transform_matrix[:2, :2]
2322 final_offset = transform_matrix[:2, 2]
-> 2324 channel_images = [ndimage.interpolation.affine_transform( # pylint: disable=g-complex-comprehension
2325 x_channel,
2326 final_affine_matrix,
2327 final_offset,
2328 order=order,
2329 mode=fill_mode,
2330 cval=cval) for x_channel in x]
2331 x = np.stack(channel_images, axis=0)
2332 x = np.rollaxis(x, 0, channel_axis + 1)
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/keras/preprocessing/image.py:2324, in <listcomp>(.0)
2321 final_affine_matrix = transform_matrix[:2, :2]
2322 final_offset = transform_matrix[:2, 2]
-> 2324 channel_images = [ndimage.interpolation.affine_transform( # pylint: disable=g-complex-comprehension
2325 x_channel,
2326 final_affine_matrix,
2327 final_offset,
2328 order=order,
2329 mode=fill_mode,
2330 cval=cval) for x_channel in x]
2331 x = np.stack(channel_images, axis=0)
2332 x = np.rollaxis(x, 0, channel_axis + 1)
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/scipy/ndimage/interpolation.py:574, in affine_transform(input, matrix, offset, output_shape, output, order, mode, cval, prefilter)
572 npad = 0
573 filtered = input
--> 574 mode = _ni_support._extend_mode_to_code(mode)
575 matrix = numpy.asarray(matrix, dtype=numpy.float64)
576 if matrix.ndim not in [1, 2] or matrix.shape[0] < 1:
File ~/opt/anaconda3/envs/tensorflow/lib/python3.9/site-packages/scipy/ndimage/_ni_support.py:54, in _extend_mode_to_code(mode)
52 return 6
53 else:
---> 54 raise RuntimeError('boundary mode not supported')
RuntimeError: boundary mode not supported
for the code
for batch in data_gen.flow(x, batch_size=16, save_to_dir='/Users/ghad/Desktop', save_prefix='aug', save_format='jpg'):
you are inputting only a single image but asking it to produce 16 augmented images per batch. That won't work. Normally the length of x is larger than the batch size. Set the batch size to 1; that way you will produce one augmented image each time you feed a new image into the generator.
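A rough sketch of the adjusted loop with batch_size=1. As a side note, the traceback above ends in SciPy's boundary-mode check, and 'contrast' is not one of the fill_mode values ImageDataGenerator accepts ('constant', 'nearest', 'reflect', 'wrap'), so this sketch uses 'constant' together with the cval from the question:

import tensorflow as tf
from skimage import io

x = io.imread('mona.jpg')
x = x.reshape((1,) + x.shape)          # (1, height, width, 3)

data_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='constant',              # assumption: constant fill was intended, since a cval is given
    cval=125
)

i = 0
for batch in data_gen.flow(x, batch_size=1,          # one image in -> one augmented image out
                           save_to_dir='/Users/ghad/Desktop',
                           save_prefix='aug',
                           save_format='jpg'):
    i += 1
    if i > 20:                         # stop after 20 augmented images have been saved
        break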

Numpy: using np.pad() for an RGB image causing "operands could not be broadcast together with shapes (4,4,3) (4,4,5)" error

I have a function color_image_padding that takes an RGB image and adds one layer of zero padding around the borders. The image has dimensions (Width, Height, 3), with 3 representing the 3 color channels.
My code is:
import numpy as np
def color_image_padding(image: np.ndarray) -> np.ndarray:
    return np.pad(image, pad_width=1)
I'm seeing this error:
"operands could not be broadcast together with shapes (4,4,3) (4,4,5)"
It's probably the color channels that are causing this error. Doesn't np.pad split the image into 3 matrices and add the zero padding accordingly?
Thanks in advance for your assistance!
EDIT
See comments below... It turns out that the generalized function image_padding() was throwing an error message because some greyscale images (i.e. 2D Numpy matrices) were passed in. Here's a minimal example:
bar = np.ones((1, 3))
bar.ndim
2
def image_padding(image: np.ndarray, amt: int) -> np.ndarray:
    return np.pad(image, pad_width=((amt, amt), (amt, amt), (0, 0)))
image_padding(bar, 2)
Full Traceback:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_8116/4065018867.py in <module>
----> 1 img(bar, 3)
~\AppData\Local\Temp/ipykernel_8116/1455868751.py in img(image, amt)
1 def img(image, amt):
----> 2 return np.pad(image, pad_width=((amt, amt), (amt, amt), (0, 0)))
<__array_function__ internals> in pad(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\arraypad.py in pad(array, pad_width, mode, **kwargs)
741
742 # Broadcast to shape (array.ndim, 2)
--> 743 pad_width = _as_pairs(pad_width, array.ndim, as_index=True)
744
745 if callable(mode):
~\anaconda3\lib\site-packages\numpy\lib\arraypad.py in _as_pairs(x, ndim, as_index)
516 # Converting the array with `tolist` seems to improve performance
517 # when iterating and indexing the result (see usage in `pad`)
--> 518 return np.broadcast_to(x, (ndim, 2)).tolist()
519
520
<__array_function__ internals> in broadcast_to(*args, **kwargs)
~\anaconda3\lib\site-packages\numpy\lib\stride_tricks.py in broadcast_to(array, shape, subok)
409 [1, 2, 3]])
410 """
--> 411 return _broadcast_to(array, shape, subok=subok, readonly=True)
412
413
~\anaconda3\lib\site-packages\numpy\lib\stride_tricks.py in _broadcast_to(array, shape, subok, readonly)
346 'negative')
347 extras = []
--> 348 it = np.nditer(
349 (array,), flags=['multi_index', 'refs_ok', 'zerosize_ok'] + extras,
350 op_flags=['readonly'], itershape=shape, order='C')
ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (3,2) and requested shape (2,2)
Testing whether the image is greyscale or color resolves the issue:
def image_padding(image: np.ndarray, amt: int) -> np.ndarray:
    if image.ndim == 2:
        return np.pad(image, pad_width=(amt, amt))
    elif image.ndim == 3:
        return np.pad(image, pad_width=((amt, amt), (amt, amt), (0, 0)))
So this reproduces your error - using the three term pad_width on a 2d array:
ok with 3d:
In [194]: x = np.ones((5,5,3),int)
In [196]: amt_padding=1;np.pad(x, pad_width=((amt_padding, amt_padding), (amt_padding, amt_padding), (0, 0))).shape
Out[196]: (7, 7, 3)
but if the array is 2d:
In [197]: amt_padding=1;np.pad(x[:,:,0], pad_width=((amt_padding, amt_padding), (amt_padding, amt_padding), (0, 0)))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [197], in <cell line: 1>()
----> 1 amt_padding=1;np.pad(x[:,:,0], pad_width=((amt_padding, amt_padding), (amt_padding, amt_padding), (0, 0)))
File <__array_function__ internals>:5, in pad(*args, **kwargs)
File ~\anaconda3\lib\site-packages\numpy\lib\arraypad.py:743, in pad(array, pad_width, mode, **kwargs)
740 raise TypeError('`pad_width` must be of integral type.')
742 # Broadcast to shape (array.ndim, 2)
--> 743 pad_width = _as_pairs(pad_width, array.ndim, as_index=True)
745 if callable(mode):
746 # Old behavior: Use user-supplied function with np.apply_along_axis
747 function = mode
File ~\anaconda3\lib\site-packages\numpy\lib\arraypad.py:518, in _as_pairs(x, ndim, as_index)
514 raise ValueError("index can't contain negative values")
516 # Converting the array with `tolist` seems to improve performance
517 # when iterating and indexing the result (see usage in `pad`)
--> 518 return np.broadcast_to(x, (ndim, 2)).tolist()
File <__array_function__ internals>:5, in broadcast_to(*args, **kwargs)
File ~\anaconda3\lib\site-packages\numpy\lib\stride_tricks.py:411, in broadcast_to(array, shape, subok)
366 #array_function_dispatch(_broadcast_to_dispatcher, module='numpy')
367 def broadcast_to(array, shape, subok=False):
368 """Broadcast an array to a new shape.
369
370 Parameters
(...)
409 [1, 2, 3]])
410 """
--> 411 return _broadcast_to(array, shape, subok=subok, readonly=True)
File ~\anaconda3\lib\site-packages\numpy\lib\stride_tricks.py:348, in _broadcast_to(array, shape, subok, readonly)
345 raise ValueError('all elements of broadcast shape must be non-'
346 'negative')
347 extras = []
--> 348 it = np.nditer(
349 (array,), flags=['multi_index', 'refs_ok', 'zerosize_ok'] + extras,
350 op_flags=['readonly'], itershape=shape, order='C')
351 with it:
352 # never really has writebackifcopy semantics
353 broadcast = it.itviews[0]
ValueError: operands could not be broadcast together with remapped shapes [original->remapped]: (3,2) and requested shape (2,2)
It's passing the task to np.nditer (via broadcast_to), which is raising the error. That would account for why I've never seen it before. I've explored nditer some, but it's not something I regularly use or recommend to others.
The _as_pairs expands widths like
In [206]: np.lib.arraypad._as_pairs(1,3, as_index=True)
Out[206]: ((1, 1), (1, 1), (1, 1))
In [207]: np.lib.arraypad._as_pairs(((1,),(2,),(3,)),3, as_index=True)
Out[207]: [[1, 1], [2, 2], [3, 3]]

ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42)

I already looked at the other similar questions, but they did not help me. I'm attempting to use GridSearchCV. I'm using three pipelines to predict NFL play data. It works pretty well until the grid search part.
Here is my code.
pipe_nfl1_1 = Pipeline([
    ('ssc', StandardScaler()),
    ('lr', LogisticRegression(random_state=42))
])
pipe_nfl1_2 = Pipeline([
    ('mms', MinMaxScaler()),
    ('rfc', RandomForestClassifier(random_state=42))
])
pipe_nfl1_3 = Pipeline([
    ('mms', MinMaxScaler()),
    ('svc', svm.SVC(random_state=42))
])
pipelines1 = [pipe_nfl1_1, pipe_nfl1_2, pipe_nfl1_3]
pipe_dict1 = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'SVC'}
for pipe in pipelines1:
    pipe.fit(X_train1, y_train1)
print('Pipeline test accuracy for predicting 1st downs:')
for idx, val in enumerate(pipelines1):
    print(' %s: %.4f' % (pipe_dict1[idx], val.score(X_test1, y_test1)))
best_acc1 = 0.0
best_clf1 = 0
best_pipe1 = ''
for idx, val in enumerate(pipelines1):
    if val.score(X_test1, y_test1) > best_acc1:
        best_acc1 = val.score(X_test1, y_test1)
        best_pipe1 = val
        best_clf1 = idx
best_acc1 *= 100
print('Classifier with best accuracy for predicting 1st downs is %s with %.2f' % (pipe_dict1[best_clf1], best_acc1) + '%')
param_grid1 = {
    'lr__n_estimators': [2, 4, 6]
}
grid_search1 = GridSearchCV(pipe_nfl1_1, param_grid1, cv=2)
# fine-tune the hyperparameters
grid_search1.fit(X_train1, y_train1)
# get the best model
final_model1 = grid_search1.best_estimator_
grid_search1.best_score_
But I'm getting an error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-33-6b0007d9b8f1> in <module>
2
3 # fine-tune the hyperparameters
----> 4 grid_search1.fit(X_train1, y_train1)
5
6 # get the best model
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
518 cloned_parameters[k] = clone(v, safe=False)
519
--> 520 estimator = estimator.set_params(**cloned_parameters)
521
522 start_time = time.time()
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\pipeline.py in set_params(self, **kwargs)
139 self
140 """
--> 141 self._set_params('steps', **kwargs)
142 return self
143
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\utils\metaestimators.py in _set_params(self, attr, **params)
51 self._replace_estimator(attr, name, params.pop(name))
52 # 3. Step parameters and other initialisation arguments
---> 53 super().set_params(**params)
54 return self
55
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
259
260 for key, sub_params in nested_params.items():
--> 261 valid_params[key].set_params(**sub_params)
262
263 return self
~\AppData\Local\Programs\Python\Python38\lib\site-packages\sklearn\base.py in set_params(self, **params)
247 key, delim, sub_key = key.partition('__')
248 if key not in valid_params:
--> 249 raise ValueError('Invalid parameter %s for estimator %s. '
250 'Check the list of available parameters '
251 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter n_estimators for estimator LogisticRegression(random_state=42). Check the list of available parameters with `estimator.get_params().keys()`.
I've done LogisticRegression.get_params().keys() to get the keys, but it returns get_params() missing 1 required positional argument: 'self'.
The problem is not the lr__ prefix itself (that double-underscore syntax is how pipeline step parameters are addressed), it is the parameter name after it: the keys in your param_grid1 dict have to be parameters actually accepted by the model you're tuning. n_estimators is a parameter of RandomForestClassifier, but it is not a parameter of LogisticRegression; C is the one you would tune for LogisticRegression.
I think what you want to do is a grid search over the parameter space of the model that performs best, right? In that case, param_grid1 should be updated to match that model, because the parameters accepted by the models you're testing vary from model to model.
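For illustration, grids that match each pipeline from the question might look something like this (the values are only placeholders, and the pipelines are the ones defined above); note also that get_params() has to be called on an instance, which is why the class-level call complained about a missing self:

# assumes the pipe_nfl1_1 / pipe_nfl1_2 pipelines defined in the question
param_grid_lr = {'lr__C': [0.1, 1.0, 10.0]}             # LogisticRegression step
param_grid_rfc = {'rfc__n_estimators': [50, 100, 200]}  # RandomForestClassifier step

grid_search_lr = GridSearchCV(pipe_nfl1_1, param_grid_lr, cv=2)
grid_search_rfc = GridSearchCV(pipe_nfl1_2, param_grid_rfc, cv=2)

# list the valid parameter names on an *instance*, not on the class
print(LogisticRegression(random_state=42).get_params().keys())
print(pipe_nfl1_1.get_params().keys())   # shows the 'lr__...' names usable in param_grid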

xarray: mean of data stored via OPeNDAP

I'm using xarray's very cool pydap back-end (http://xarray.pydata.org/en/stable/io.html#opendap) to read data stored via OPeNDAP at IRI:
import xarray as xr
remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
print(remote_data)
#<xarray.DataArray 'zg' (P: 2, S: 6569, M: 3, L: 45, Y: 181, X: 360)>
#[115569730800 values with dtype=float32]
#Coordinates:
# * L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
# * Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
# * S (S) datetime64[ns] 1999-01-07 1999-01-08 1999-01-09 1999-01-10 ...
# * M (M) float32 1.0 2.0 3.0
# * X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
# * P (P) int32 500 200
#Attributes:
# level_type: pressure level
# standard_name: geopotential_height
# long_name: Geopotential Height
# units: m
For reference, it's sub-seasonal forecast data where L is the lead time (45-day forecasts), S is the initialization date and M is the ensemble member.
I would like to do an ensemble mean and i'm only interested in the 500 hPa level. However, it crashes out and gives a RuntimeError: NetCDF: Access failure:
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
RuntimeError Traceback (most recent call last)
<ipython-input-46-eca488e9def5> in <module>()
1 remote_data = xr.open_dataarray('http://iridl.ldeo.columbia.edu/SOURCES/.Models' '/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods')
2 da = remote_data.sel(P=500)
----> 3 da_ensmean = da.mean(dim='M')
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/common.py in wrapped_func(self, dim, axis, skipna, keep_attrs, **kwargs)
20 keep_attrs=False, **kwargs):
21 return self.reduce(func, dim, axis, keep_attrs=keep_attrs,
---> 22 skipna=skipna, allow_lazy=True, **kwargs)
23 else:
24 def wrapped_func(self, dim=None, axis=None, keep_attrs=False,
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/dataarray.py in reduce(self, func, dim, axis, keep_attrs, **kwargs)
1359 summarized data and the indicated dimension(s) removed.
1360 """
-> 1361 var = self.variable.reduce(func, dim, axis, keep_attrs, **kwargs)
1362 return self._replace_maybe_drop_dims(var)
1363
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in reduce(self, func, dim, axis, keep_attrs, allow_lazy, **kwargs)
1264 if dim is not None:
1265 axis = self.get_axis_num(dim)
-> 1266 data = func(self.data if allow_lazy else self.values,
1267 axis=axis, **kwargs)
1268
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in data(self)
293 return self._data
294 else:
--> 295 return self.values
296
297 #data.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in values(self)
385 def values(self):
386 """The variable's data as a numpy.ndarray"""
--> 387 return _as_array_or_item(self._data)
388
389 #values.setter
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/variable.py in _as_array_or_item(data)
209 TODO: remove this (replace with np.asarray) once these issues are fixed
210 """
--> 211 data = np.asarray(data)
212 if data.ndim == 0:
213 if data.dtype.kind == 'M':
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
622
623 def __array__(self, dtype=None):
--> 624 self._ensure_cached()
625 return np.asarray(self.array, dtype=dtype)
626
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in _ensure_cached(self)
619 def _ensure_cached(self):
620 if not isinstance(self.array, NumpyIndexingAdapter):
--> 621 self.array = NumpyIndexingAdapter(np.asarray(self.array))
622
623 def __array__(self, dtype=None):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
600
601 def __array__(self, dtype=None):
--> 602 return np.asarray(self.array, dtype=dtype)
603
604 def __getitem__(self, key):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in __getitem__(self, key)
64
65 def __getitem__(self, key):
---> 66 return self.func(self.array[key])
67
68 def __repr__(self):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/coding/variables.py in _apply_mask(data, encoded_fill_values, decoded_fill_value, dtype)
133 for fv in encoded_fill_values:
134 condition |= data == fv
--> 135 data = np.asarray(data, dtype=dtype)
136 return np.where(condition, decoded_fill_value, data)
137
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
490
491 """
--> 492 return array(a, dtype, copy=False, order=order)
493
494
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/core/indexing.py in __array__(self, dtype)
506 def __array__(self, dtype=None):
507 array = as_indexable(self.array)
--> 508 return np.asarray(array[self.key], dtype=None)
509
510 def transpose(self, order):
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/netCDF4_.py in __getitem__(self, key)
63 with self.datastore.ensure_open(autoclose=True):
64 try:
---> 65 array = getitem(self.get_array(), key.tuple)
66 except IndexError:
67 # Catch IndexError in netCDF4 and return a more informative
~/anaconda/envs/SubXNAO/lib/python3.6/site-packages/xarray/backends/common.py in robust_getitem(array, key, catch, max_retries, initial_delay)
114 for n in range(max_retries + 1):
115 try:
--> 116 return array[key]
117 except catch:
118 if n == max_retries:
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable.__getitem__()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4.Variable._get()
netCDF4/_netCDF4.pyx in netCDF4._netCDF4._ensure_nc_success()
RuntimeError: NetCDF: Access failure
Breaking down the calculation removes the RuntimeError; I guess it was just too hefty a calculation with all the start times. It shouldn't be too difficult to put this in a loop over S (see the sketch after the output below):
da = remote_data.isel(P=0,S=0)
da_ensmean = da.mean(dim='M')
print(da_ensmean)
<xarray.DataArray 'zg' (L: 45, Y: 181, X: 360)>
array([[[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
[5231.1445, 5231.1445, ..., 5231.1445, 5231.1445],
...,
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383],
[5056.2383, 5056.2383, ..., 5056.2383, 5056.2383]],
[[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
[5211.346 , 5211.346 , ..., 5211.346 , 5211.346 ],
...,
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ],
[5082.062 , 5082.062 , ..., 5082.062 , 5082.062 ]],
...,
[[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
[5108.8247, 5108.8247, ..., 5108.8247, 5108.8247],
...,
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173],
[5154.2173, 5154.2173, ..., 5154.2173, 5154.2173]],
[[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
[5106.4893, 5106.4893, ..., 5106.4893, 5106.4893],
...,
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063],
[5226.0063, 5226.0063, ..., 5226.0063, 5226.0063]]], dtype=float32)
Coordinates:
* L (L) timedelta64[ns] 0 days 12:00:00 1 days 12:00:00 ...
* Y (Y) float32 -90.0 -89.0 -88.0 -87.0 -86.0 -85.0 -84.0 -83.0 ...
S datetime64[ns] 1999-01-07
* X (X) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 11.0 ...
P int32 500
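Something along these lines for the loop over S (untested against the live server; it still pulls every start date, just in smaller requests, and assumes the remote_data object opened above):

import xarray as xr

ensmeans = []
for s in range(remote_data.sizes['S']):
    da = remote_data.isel(P=0, S=s)            # one start date at the 500 hPa level
    ensmeans.append(da.mean(dim='M').load())   # ensemble mean for that start date
da_ensmean_all = xr.concat(ensmeans, dim='S')  # stitch the start dates back together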
This is a good use-case for chunking with dask, e.g.,
import xarray as xr
url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.RSMAS/.CCSM4/.hindcast/.zg/dods'
remote_data = xr.open_dataarray(url, chunks={'S': 1, 'L': 1})
da = remote_data.sel(P=500)
da_ensmean = da.mean(dim='M')
This version will access the data server in parallel, using many smaller chunks. It will still be slow to download 231 GB of data, but your request will have much better odds of success.

DBSCAN plot - The color values passed in plt.plot() is throwing ValueError

I am using DBSCAN to perform clustering on a dataset, and plotting the clusters throws a ValueError. I think it is because the color argument passed to markerfacecolor in plt.plot() is not a single value. Please let me know if I am wrong here. My features are latitude, longitude, speed_mph, speedlimit_mph, vehicle_id, driver_id.
Here is my clustering code
dbsc = DBSCAN(eps = .5, min_samples = 5).fit(df_cont)
labels = dbsc.labels_
print(labels)
num_clusters = len(set(labels))
clusters = pd.Series([df_cont[labels == n] for n in range(num_clusters)])
print('Number of clusters: {}'.format(num_clusters))
# No of clusters : 5687
core_samples = np.zeros_like(labels, dtype = bool)
core_samples[dbsc.core_sample_indices_] = True
unique_labels = np.unique(labels)
colors = plt.cm.Spectral(np.linspace(0,1, len(unique_labels)))
for (label, color) in zip(unique_labels, colors):
    class_member_mask = (labels == label)
    xy = df_cont[class_member_mask & core_samples]
    print("color:", color)
    # color: [ 0.61960784  0.00392157  0.25882353  1. ]
    plt.plot(xy.values[:, 0], xy.values[:, 1], marker='o', markerfacecolor=color, markersize=10)
    xy2 = df_cont[class_member_mask & ~core_samples]
    plt.plot(xy2.values[:, 0], xy2.values[:, 1], 'o', markerfacecolor=color, markersize=5)
plt.title("DBSCAN Driver - Speed MPH")
plt.xlabel("driver")
plt.ylabel("Speed")
plt.show()
Here is the error message thrown
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-105-0192647e6baf> in <module>()
3 xy = df_cont[class_member_mask & core_samples]
4 print("color:",color)
----> 5 plt.plot(xy.values[:,0],xy.values[:,1], marker='o', markerfacecolor = color, markersize = 10)
6
7 xy2 = df_cont[class_member_mask & ~core_samples]
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in plot(*args, **kwargs)
3315 mplDeprecation)
3316 try:
-> 3317 ret = ax.plot(*args, **kwargs)
3318 finally:
3319 ax._hold = washold
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
1896 warnings.warn(msg % (label_namer, func.__name__),
1897 RuntimeWarning, stacklevel=2)
-> 1898 return func(ax, *args, **kwargs)
1899 pre_doc = inner.__doc__
1900 if pre_doc is None:
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_axes.py in plot(self, *args, **kwargs)
1404 kwargs = cbook.normalize_kwargs(kwargs, _alias_map)
1405
-> 1406 for line in self._get_lines(*args, **kwargs):
1407 self.add_line(line)
1408 lines.append(line)
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _grab_next_args(self, *args, **kwargs)
405 return
406 if len(remaining) <= 3:
--> 407 for seg in self._plot_args(remaining, kwargs):
408 yield seg
409 return
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _plot_args(self, tup, kwargs)
393 ncx, ncy = x.shape[1], y.shape[1]
394 for j in xrange(max(ncx, ncy)):
--> 395 seg = func(x[:, j % ncx], y[:, j % ncy], kw, kwargs)
396 ret.append(seg)
397 return ret
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/axes/_base.py in _makeline(self, x, y, kw, kwargs)
300 default_dict = self._getdefaults(None, kw)
301 self._setdefaults(default_dict, kw)
--> 302 seg = mlines.Line2D(x, y, **kw)
303 return seg
304
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/lines.py in __init__(self, xdata, ydata, linewidth, linestyle, color, marker, markersize, markeredgewidth, markeredgecolor, markerfacecolor, markerfacecoloralt, fillstyle, antialiased, dash_capstyle, solid_capstyle, dash_joinstyle, solid_joinstyle, pickradius, drawstyle, markevery, **kwargs)
418 self._markerfacecoloralt = None
419
--> 420 self.set_markerfacecolor(markerfacecolor)
421 self.set_markerfacecoloralt(markerfacecoloralt)
422 self.set_markeredgecolor(markeredgecolor)
/home/radiance/anaconda3/lib/python3.6/site-packages/matplotlib/lines.py in set_markerfacecolor(self, fc)
1204 if fc is None:
1205 fc = 'auto'
-> 1206 if self._markerfacecolor != fc:
1207 self.stale = True
1208 self._markerfacecolor = fc
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Also, I tried to do the clustering taking my lat/long together with the other features, but DBSCAN threw an error that only two features are allowed. Should I ask this as a separate question?
dbsc = DBSCAN(eps = .5, min_samples = 5, algorithm='ball_tree', metric='haversine').fit(np.radians(df_cont))
The contents of df_cont are-
{'Day': [1, 1, 1, 1, 1],
 'Month': [6, 6, 6, 6, 6],
 'Year': [2015, 2015, 2015, 2015, 2015],
 'driver_id': [5693, 5693, 916461, 1145487, 1145487],
 'latitude': [34.640141, 34.64373, 34.551254, 35.613663, 35.614525],
 'longitude': [-77.938721, -77.9394, -78.78463, -78.470596, -78.47466999999999],
 'speed_mph': [64, 64, 1, 62, 61],
 'speedlimit_mph': [70, 70, 55, 70, 70],
 'vehicle_id': [1208979, 1208979, 1262441, 1280223, 1280223]}
I got the error fixed by using a scatter plot: plt.scatter(xy.values[:,0], xy.values[:,1], s=10, c=color, marker='o')
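For reference, the plotting loop rewritten with plt.scatter, which handles an RGBA row as a single colour (a sketch built on the variables defined above; the colour is wrapped in a list to avoid matplotlib's single-RGBA ambiguity warning):

import matplotlib.pyplot as plt

for label, color in zip(unique_labels, colors):
    class_member_mask = (labels == label)

    xy = df_cont[class_member_mask & core_samples]
    plt.scatter(xy.values[:, 0], xy.values[:, 1], s=10, c=[color], marker='o')

    xy2 = df_cont[class_member_mask & ~core_samples]
    plt.scatter(xy2.values[:, 0], xy2.values[:, 1], s=5, c=[color], marker='o')

plt.title("DBSCAN Driver - Speed MPH")
plt.xlabel("driver")
plt.ylabel("Speed")
plt.show()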