Trying to bin some geolocated data using scipy.stats.binned_statistic_2d, but it seems that negative values in the data are not allowed. Is there a way to do this accurately and fast?
import numpy as np
from scipy import stats
ilats = np.linspace(90,-90, 4000)
ilons = np.linspace(-180, 179.955, 8000)
values = np.array([17,-14, -7,-8,-11,-8,-7,-8,-10,-5,-3,
-12,-5, -6,21,30, 2, 4,-8, 6, 4, 7,
3,-6,-13, 21, 4, 5,11,-6, 8,-5,-6,
9,8,-8, -2,-16,-5,-5,-9,-4,-6,33,
-8,-5,-14,-8,-11,21,24,-7,-13,12,-6,
5,7,8,-3,-3,-4, 4, 9,-3, 9,-11,
-8,6,4, 8,-6,-6,-4,-3, 4, 5,11,
-3,-6,-4,-8,-4,12,-9,-8,15,-10,-5,
-4,12,5,-4, 4, 7,-13, 5,-4,-4,-5,
-8,-10,-9,-7.])
lats = np.array([ 6.7427, 42.7027, 42.6963, 10.3688, 37.5713, 37.5798,
-12.1563, 42.7127, 41.7457, 37.8122, 37.66 , 41.7456,
41.7457, 38.4462, 8.5418, -12.7309, -10.9395, -10.9464,
38.0641, -10.9507, -12.7316, -10.9313, -12.7235, 37.6469,
38.1234, 20.3964, -12.0847, -12.0844, 10.3794, 38.1302,
10.3627, 38.1582, 38.1463, 22.6466, 20.4246, 38.1401,
-36.6505, 38.2352, 37.8795, 40.2281, 37.8125, 42.323 ,
37.8775, 9.3717, 38.732 , 38.7202, 38.2688, 38.9148,
38.9414, -4.8618, -4.8525, 39.0108, 38.8187, -6.5067,
38.009 , -6.5174, -6.5101, -6.51 , 37.7243, 37.7512,
37.7215, -6.4902, -6.5113, 37.5409, 1.9481, 37.6398,
-6.5073, 37.8037, -11.133 , 9.0896, 38.177 , 9.089 ,
37.8708, 38.3848, -3.553 , 9.4345, -3.5343, -3.5769,
37.6847, 37.6045, 37.8857, 38.32 , 8.1673, 37.8822,
37.9113, 8.6278, 37.5652, 37.8236, 37.8593, 8.6219,
-3.5614, 37.924 , 37.7845, 37.8436, 37.8666, 37.6804,
37.639 , 40.7691, 40.7744, 37.8029, 42.9793, 8.207 ,
39.302 ])
lons = np.array([ 60.8964, -96.1017, -96.1049, 71.595 , -97.0008, -97.0126,
57.4887, -96.109 , -95.1058, -97.1088, -96.6413, -95.1054,
-95.1062, -95.2395, 58.3938, -73.7145, -70.626 , -70.5864,
-95.5678, -70.5914, -73.7525, -70.6048, -73.753 , -96.7662,
-95.504 , 100.3965, -70.7921, -70.7905, 71.5499, -95.4816,
71.5457, -95.326 , -95.3355, 96.8339, 100.2684, -95.8697,
39.1031, -95.4456, -96.3814, -94.5726, -96.3782, -95.4554,
-96.3797, -66.7449, -95.1513, -95.1465, -95.0972, -95.2498,
-95.2054, 84.2004, 84.21 , -94.5695, -94.9174, 114.0945,
-95.942 , 114.0592, 114.0956, 114.0873, -96.4689, -96.4599,
-96.463 , 114.0741, 114.0975, -96.582 , 117.2901, -96.572 ,
114.0561, -96.5539, -74.9417, 71.3391, -95.4253, 71.2452,
-96.5511, -95.065 , 107.5832, 71.3906, 107.6005, 107.4975,
-96.9722, -96.9307, -96.2627, -95.1745, 72.5249, -96.2632,
-96.3324, 57.9562, -96.9309, -96.5123, -96.589 , 57.9627,
107.6405, -96.2711, -96.5737, -96.2344, -96.2099, -96.5062,
-96.5248, -94.8421, -94.8522, -96.5873, -97.1523, 72.4707,
-95.0489])
ret = stats.binned_statistic_2d(lons, lats, values, 'count', bins=[ilons, ilats])
I am trying to examine the ungridded data in values by binning them onto a coarse grid (ilats, ilons), plotting the counts first and the mean later on. But the code above produces:
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [52], in <cell line: 1>()
----> 1 ret = stats.binned_statistic_2d(lons, lats, ka, 'count', bins=[ilons, ilats])
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:352, in binned_statistic_2d(x, y, values, statistic, bins, range, expand_binnumbers)
349 xedges = yedges = np.asarray(bins, float)
350 bins = [xedges, yedges]
--> 352 medians, edges, binnumbers = binned_statistic_dd(
353 [x, y], values, statistic, bins, range,
354 expand_binnumbers=expand_binnumbers)
356 return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:571, in binned_statistic_dd(sample, values, statistic, bins, range, expand_binnumbers, binned_statistic_result)
569 if binned_statistic_result is None:
570 nbin, edges, dedges = _bin_edges(sample, bins, range)
--> 571 binnumbers = _bin_numbers(sample, nbin, edges, dedges)
572 else:
573 edges = binned_statistic_result.bin_edges
File ~/anaconda3/envs/synrad/lib/python3.8/site-packages/scipy-1.9.3-py3.8-linux-x86_64.egg/scipy/stats/_binned_statistic.py:752, in _bin_numbers(sample, nbin, edges, dedges)
750 if dedges_min == 0:
751 raise ValueError('The smallest edge difference is numerically 0.')
--> 752     decimal = int(-np.log10(dedges_min)) + 6
753 # Find which points are on the rightmost edge.
754 on_edge = np.where((sample[:, i] >= edges[i][-1]) &
755 (np.around(sample[:, i], decimal) ==
756 np.around(edges[i][-1], decimal)))[0]
ValueError: cannot convert float NaN to integer
It looks like there is a log operation and I don't see a way around it.
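For what it's worth, the negative data values are most likely not the culprit: binned_statistic_2d expects monotonically increasing bin edges, and ilats here runs from 90 down to -90, so the edge differences are negative and np.log10(dedges_min) returns NaN, which is what the int() conversion then fails on. A minimal sketch of a workaround, assuming you still want the latitude axis ordered north to south at the end:
import numpy as np
from scipy import stats

# Bin edges must increase, so build the latitude edges ascending
ilats_asc = np.linspace(-90, 90, 4000)
ilons = np.linspace(-180, 179.955, 8000)

ret = stats.binned_statistic_2d(lons, lats, values, 'count',
                                bins=[ilons, ilats_asc])

# ret.statistic has shape (len(ilons) - 1, len(ilats_asc) - 1); flip the
# latitude axis afterwards if rows should run from +90 down to -90
counts = np.flip(ret.statistic, axis=1)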
Alright, I just started a new job and have been tasked with writing a simple notebook in Jupyter. I really want to impress my supervisor. I have been working on this code for hours and can't get it to work; hopefully somebody here can help me.
Here is the code I have been working on:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
df = pd.read_csv(r'C:\Users\jk2588\Documents\EDA\EDA Practice\top1000_dataset.csv', converters={'GENDER': lambda x: int(x == 'Male')}, usecols = ['MEMBER_ID', 'GENDER', 'Age', 'Dement'])
df_gp_1 = df[['MEMBER_ID', 'Dement']].groupby('MEMBER_ID').agg(np.mean).reset_index()
df_gp_2 = df[['MEMBER_ID', 'GENDER', 'Age']].groupby('MEMBER_ID').agg(max).reset_index()
df_gp = pd.merge(df_gp_1, df_gp_2, on = ['MEMBER_ID'])
df.head()
Output: MEMBER_ID Age Dement GENDER
0 000000002 01 36 NaN 0
1 000000002 01 36 NaN 0
2 000000002 01 36 NaN 0
3 000000002 01 36 NaN 0
4 000000002 01 36 NaN 0
df['Dement'] = df['Dement'].fillna(0)
df['Dement'] = df['Dement'].astype('int64')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 4 columns):
MEMBER_ID 999 non-null object
Age 999 non-null int64
Dement 999 non-null int64
GENDER 999 non-null int64
dtypes: int64(3), object(1)
memory usage: 31.3+ KB
freq = ((df_gp.Age.value_counts(normalize = True).reset_index().sort_values(by = 'index').Age)*100).tolist()
number_gp = 7
def ax_settings(ax, var_name, x_min, x_max):
    ax.set_xlim(x_min, x_max)
    ax.set_yticks([])
    ax.spines['left'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['bottom'].set_edgecolor('#444444')
    ax.spines['bottom'].set_linewidth(2)
    ax.text(0.02, 0.05, var_name, fontsize=17, fontweight="bold", transform=ax.transAxes)
    return None
fig = plt.figure(figsize=(12,7))
gs = gridspec.GridSpec(nrows=number_gp,
                       ncols=2,
                       figure=fig,
                       width_ratios=[3, 1],
                       height_ratios=[1]*number_gp,
                       wspace=0.2, hspace=0.05
                       )
ax = [None]*(number_gp + 1)
features = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
for i in range(number_gp):
    ax[i] = fig.add_subplot(gs[i, 0])
    ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
    sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
    if i < (number_gp - 1): ax[i].set_xticks([])
ax[0].legend(['Male', 'Female'], facecolor='w')
ax[number_gp] = fig.add_subplot(gs[:, 1])
ax[number_gp].spines['right'].set_visible(False)
ax[number_gp].spines['top'].set_visible(False)
ax[number_gp].barh(features, freq, color='#004c99', height=0.4)
ax[number_gp].set_xlim(0,100)
ax[number_gp].invert_yaxis()
ax[number_gp].text(1.09, -0.04, '(%)', fontsize=10, transform = ax[number_gp].transAxes)
ax[number_gp].tick_params(axis='y', labelsize = 14)
plt.show()
I am then met with:
C:\Users\jk2588\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py:1167: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
result = method(y)
--------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-38-8665030edb1c> in <module>()
24 ax[i] = fig.add_subplot(gs[i, 0])
25 ax_settings(ax[i], 'Age: ' + str(features[i]), -1000, 20000)
---> 26 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'M') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="blue", bw=300, legend=False)
27 sns.kdeplot(data=df_gp[(df_gp.GENDER == 'F') & (df_gp.Age == features[i])].Dement, ax=ax[i], shade=True, color="red", bw=300, legend=False)
28 if i < (number_gp - 1): ax[i].set_xticks([])
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in wrapper(self, other, axis)
1281
1282 with np.errstate(all='ignore'):
-> 1283 res = na_op(values, other)
1284 if is_scalar(res):
1285 raise TypeError('Could not compare {typ} type with Series'
~\AppData\Local\Continuum\anaconda3\lib\site-packages\pandas\core\ops.py in na_op(x, y)
1167 result = method(y)
1168 if result is NotImplemented:
-> 1169 raise TypeError("invalid type comparison")
1170 else:
1171 result = op(x, y)
TypeError: invalid type comparison
Please help, I have been faced with an absurd amount of errors this week.
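From the traceback, the failing comparison is df_gp.Age == features[i]: Age is int64 (see df.info() above) while features[i] is a string like '0-17', and likewise GENDER was converted to 0/1 by the read_csv converter, so it never equals 'M' or 'F'. A minimal sketch of one way to make the comparisons type-consistent (the bin boundaries and new column names below are assumptions, not part of your data):
import pandas as pd

# Bin the numeric Age into the string labels used in `features`
age_bins = [0, 17, 25, 35, 45, 50, 55, 200]  # assumed group boundaries
age_labels = ['0-17', '18-25', '26-35', '36-45', '46-50', '51-55', '55+']
df_gp['AgeGroup'] = pd.cut(df_gp['Age'], bins=age_bins, labels=age_labels)

# Map the 0/1 GENDER codes back to the letters used in the plotting loop
df_gp['GENDER_LABEL'] = df_gp['GENDER'].map({1: 'M', 0: 'F'})

# Inside the loop, the selections would then compare like types, e.g.:
# df_gp[(df_gp.GENDER_LABEL == 'M') & (df_gp.AgeGroup == features[i])].Dement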
I have features which are numeric and a binary response. I am trying to build ensemble decision trees such as random forest and gradient-boosted trees. However, I get an error. I have reproduced the error with iris data.
The error is below and the whole error message is at the bottom.
TypeError: Could not convert 12.631578947368421 to int
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
import numpy as np
import pandas as pd
from sklearn import datasets
iris = datasets.load_iris()
y = list(iris.target)
df = pd.read_csv("https://raw.githubusercontent.com/venky14/Machine-Learning-with-Iris-Dataset/master/Iris.csv")
df = df.drop(['Species'], axis = 1)
df['label'] = y
spark_df = spark.createDataFrame(df).drop('Id')
cols = spark_df.drop('label').columns
assembler = VectorAssembler(inputCols = cols, outputCol = 'features')
output_dat = assembler.transform(spark_df).select('label', 'features')
rf = RandomForestClassifier(labelCol = "label", featuresCol = "features")
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
    .addGrid(rf.numTrees, np.linspace(10, 60, 20)).build()
crossval_rf = CrossValidator(estimator=rf,
                             estimatorParamMaps=paramGrid_rf,
                             evaluator=BinaryClassificationEvaluator(),
                             numFolds=5)
cvModel_rf = crossval_rf.fit(output_dat)
TypeError Traceback (most recent call last)
<ipython-input-24-44f8f759ed8e> in <module>
2 paramGrid_rf = ParamGridBuilder() \
3 .addGrid(rf.maxDepth, np.linspace(5, 30, 6)) \
----> 4 .addGrid(rf.numTrees, np.linspace(10, 60, 20)) \
5 .build()
6
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in build(self)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
--> 122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
123
124
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in to_key_value_pairs(keys, values)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/tuning.py in <listcomp>(.0)
118
119 def to_key_value_pairs(keys, values):
--> 120 return [(key, key.typeConverter(value)) for key, value in zip(keys, values)]
121
122 return [dict(to_key_value_pairs(keys, prod)) for prod in itertools.product(*grid_values)]
~/spark-2.4.0-bin-hadoop2.7/python/pyspark/ml/param/__init__.py in toInt(value)
197 return int(value)
198 else:
--> 199 raise TypeError("Could not convert %s to int" % value)
200
201 @staticmethod
TypeError: Could not convert 12.631578947368421 to int
Both maxDepth and numTrees need to be integers; NumPy's linspace produces floats:
import numpy as np
np.linspace(10, 60, 20)
Result:
array([ 10. , 12.63157895, 15.26315789, 17.89473684,
20.52631579, 23.15789474, 25.78947368, 28.42105263,
31.05263158, 33.68421053, 36.31578947, 38.94736842,
41.57894737, 44.21052632, 46.84210526, 49.47368421,
52.10526316, 54.73684211, 57.36842105, 60. ])
So your code trips over the first non-integer value (here 12.63157895) and raises the error.
Use arange instead (note that its third argument is the step, not the number of samples):
np.arange(10, 60, 20)
# array([10, 30, 50])
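If you would rather keep a specific number of evenly spaced grid points, a hedged alternative is to cast the linspace values to plain Python ints, which the param type converter accepts (the variable names below are only for illustration; duplicates, if any, could be dropped with sorted(set(...))):
import numpy as np

# Keep 20 grid points for numTrees and 6 for maxDepth, but as plain ints
num_trees_grid = [int(x) for x in np.linspace(10, 60, 20)]
max_depth_grid = [int(x) for x in np.linspace(5, 30, 6)]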
After upgrading from Pandas 0.18.1 to 0.19.2, I am getting the following error when I try to add new levels and labels to my dataframe. Any idea what the problem is?
print index
MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])
print levels
[['1', '2', 'Total'], ['nextLevel']]
print labels
[[0, 1, 2], [0, 0, 0]]
index = index.set_levels(levels)
print index
MultiIndex(levels=[[u'Supported', u'Unsupported', u'Total'], [u'nextLevel']],
           labels=[[0, 1], [0, 0]],
           names=[u'segment..ASRinfo..supportedUtt', u'label'])
index = index.set_labels(labels)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-f6fb11fbbb3a> in <module>()
288
289 # Initialize dfplot
--> 290 slice_data()
291
292 if len(resultList)==1:
<ipython-input-11-f6fb11fbbb3a> in slice_data(*args)
71 index = index.set_levels(levels)
72 print index
---> 73 index = index.set_labels(labels)
74 data_slice = data_slice.reindex(index)
75
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in set_labels(self, labels, level, inplace, verify_integrity)
350 idx = self._shallow_copy()
351 idx._reset_identity()
--> 352 idx._set_labels(labels, level=level, verify_integrity=verify_integrity)
353 if not inplace:
354 return idx
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _set_labels(self, labels, level, copy, validate, verify_integrity)
285
286 if verify_integrity:
--> 287 self._verify_integrity(labels=new_labels)
288
289 self._labels = new_labels
/Users/user1/anaconda/lib/python2.7/site-packages/pandas/indexes/multi.pyc in _verify_integrity(self, labels, levels)
145 if len(label) != label_length:
146 raise ValueError("Unequal label lengths: %s" %
--> 147 ([len(lab) for lab in labels]))
148 if len(label) and label.max() >= len(level):
149 raise ValueError("On level %d, label max (%d) >= length of"
ValueError: Unequal label lengths: [3, 3]
I'm wondering if it's a bug in the new pandas code. Perhaps self.labels[0] should be labels[0]?
def _verify_integrity(self, labels=None, levels=None):
"""
Parameters
----------
labels : optional list
Labels to check for validity. Defaults to current labels.
levels : optional list
Levels to check for validity. Defaults to current levels.
Raises
------
ValueError
* if length of levels and labels don't match or any label would
exceed level bounds
"""
# NOTE: Currently does not check, among other things, that cached
# nlevels matches nor that sortorder matches actually sortorder.
labels = labels or self.labels
levels = levels or self.levels
if len(levels) != len(labels):
raise ValueError("Length of levels and labels must match. NOTE:"
" this index is in an inconsistent state.")
label_length = len(self.labels[0])
for i, (level, label) in enumerate(zip(levels, labels)):
if len(label) != label_length:
raise ValueError("Unequal label lengths: %s" %
([len(lab) for lab in labels]))
if len(label) and label.max() >= len(level):
raise ValueError("On level %d, label max (%d) >= length of"
" level (%d). NOTE: this index is in an"
" inconsistent state" % (i, label.max(),
len(level)))
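For reference, the one-line change being suggested above would look like this (a sketch against the 0.19.2 source shown, not an official patch):
# in pandas/indexes/multi.py, MultiIndex._verify_integrity()
label_length = len(labels[0])  # instead of len(self.labels[0])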
I tested my fix and it worked! I submitted a bug to Pandas:
https://github.com/pandas-dev/pandas/issues/15157
I'm not sure if it's a bug - I suppose Pandas could replace all the extra indexes with missing values doing it your way, but I think you should use reindex:
df.reindex(index2)
index = pd.MultiIndex(levels=[[u'1', u'2'], [u'nextLevel']],
                      labels=[[0, 1], [0, 0]],
                      names=[u'segment..ASRinfo..supportedUtt', u'label'])
index2 = pd.MultiIndex(levels=[['1', '2', 'Total'], ['nextLevel']],
                       labels=[[0, 1, 2], [0, 0, 0]],
                       names=[u'segment..ASRinfo..supportedUtt', u'label'])
I am new to Pandas, and I found the documentation on MultiIndexing difficult to adapt to solving my own problem. Basically, I want to add some extra rows. This is the solution I came up with. There is probably a much better way to do it. Feel free to share if you'd like.
groupbyColumns = ['label0', 'label1']
data_slice = dataframe.groupby(by=groupbyColumns).sum()
index = data_slice.index
levels = list()
for levelIter in range(len(data_slice.index.levels)):
    levels.append([x for x in data_slice.index.levels[levelIter]])
levels[0].append('Total')
if len(resultList) == 2:
    levels[-1].append('Difference')
    addIndexCountForDifferenceRow = 1
else:
    addIndexCountForDifferenceRow = 0
# Create new indexing sequence since we are adding Total (and Difference if doing comparison) rows
labels = list()
for labelIter in range(len(data_slice.index.labels)):
    labels.append(list())
if len(data_slice.index.labels) == 2:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1 + addIndexCountForDifferenceRow):
            labels[0].append(iter0)
            labels[1].append(iter1)
if len(data_slice.index.labels) == 3:
    labels0 = [x for x in data_slice.index.labels[0]]
    labels1 = [x for x in data_slice.index.labels[1]]
    labels2 = [x for x in data_slice.index.labels[2]]
    for iter0 in range(max(labels0) + 2):
        for iter1 in range(max(labels1) + 1):
            for iter2 in range(max(labels2) + 1 + addIndexCountForDifferenceRow):
                labels[0].append(iter0)
                labels[1].append(iter1)
                labels[2].append(iter2)
index = index.set_levels(levels)
index = index.set_labels(labels)
data_slice = data_slice.reindex(index)
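As a possibly simpler alternative (a hedged sketch, not tested against your data): once the augmented levels list is built, the expanded Cartesian index that the nested loops above construct by hand can also be obtained with pd.MultiIndex.from_product, which avoids the manual label bookkeeping:
import pandas as pd

# Build the full product of the augmented levels and reindex onto it
new_index = pd.MultiIndex.from_product(levels, names=data_slice.index.names)
data_slice = data_slice.reindex(new_index)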
I have the following code, which seems to be a performance bottleneck:
for x, y, intensity in myarr:
    target_map[x, y] = target_map[x, y] + intensity
There are multiple entries for the same coordinate, each with its own intensity.
Datatypes:
> print myarr.shape, myarr.dtype
(219929, 3) uint32
> print target_map.shape, target_map.dtype
(150, 200) uint32
Is there any way to optimize this loop, other than writing it in C?
This seems to be a related question, however I couldn't get the accepted answer working for me: How to convert python list of points to numpy image array?
I get the following error message:
Traceback (most recent call last):
File "<pyshell#38>", line 1, in <module>
image[coordinates] = 1
IndexError: too many indices for array
If you convert your 2D coordinates into target_map into flat indices using np.ravel_multi_index, you can use np.unique and np.bincount to speed things up quite a bit:
def vec_intensity(my_arr, target_map):
    # Collapse each (x, y) pair into a single flat index into target_map
    flat_coords = np.ravel_multi_index((my_arr[:, 0], my_arr[:, 1]),
                                       dims=target_map.shape)
    # Sum the intensities that share the same flat coordinate
    unique_, idx = np.unique(flat_coords, return_inverse=True)
    sum_ = np.bincount(idx, weights=my_arr[:, 2])
    target_map.ravel()[unique_] += sum_
    return target_map

def intensity(my_arr, target_map):
    for x, y, intensity in my_arr:
        target_map[x, y] += intensity
    return target_map
#sample data set
rows, cols = 150, 200
items = 219929
myarr = np.empty((items, 3), dtype=np.uint32)
myarr[:, 0] = np.random.randint(rows, size=(items,))
myarr[:, 1] = np.random.randint(cols, size=(items,))
myarr[:, 2] = np.random.randint(100, size=(items,))
And now:
In [6]: %timeit target_map_1 = np.zeros((rows, cols), dtype=np.uint32); target_map_1 = vec_intensity(myarr, target_map_1)
10 loops, best of 3: 53.1 ms per loop
In [7]: %timeit target_map_2 = np.zeros((rows, cols), dtype=np.uint32); target_map_2 = intensity(myarr, target_map_2)
1 loops, best of 3: 934 ms per loop
In [8]: np.all(target_map_1 == target_map_2)
Out[8]: True
That's almost a 20x speed increase.
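As an aside, np.add.at performs unbuffered in-place addition that also handles repeated indices correctly, so the same accumulation can be written in one line. A hedged sketch (it is often slower than the bincount route above, so worth timing on your own data):
import numpy as np

def add_at_intensity(my_arr, target_map):
    # np.add.at accumulates correctly even when the same (x, y) pair
    # appears multiple times, unlike plain fancy-index assignment
    np.add.at(target_map, (my_arr[:, 0], my_arr[:, 1]), my_arr[:, 2])
    return target_map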