dataset = tf.data.Dataset.range(1, 6)
def aug(y):
x = np.random.uniform(0,1)
if x > 0.5:
y = 100
return y
dataset = dataset.map(aug)
print(list(dataset))
Run this code, then all the elements in the dataset are as they were, or all equal to 100. How do I make it so each element is individually transformed?
My more specific question below is basically asking this
I create my segmentation training set by:
dataset = tf.data.Dataset.from_tensor_slices((image_paths, mask_paths))
I then apply my augmentation function to the dataset:
def augment(image_path, mask_path)):
//use tf.io.read_file and tf.io.decode_jpeg to convert paths to tensors
x = np.random.choice([0,1])
if x == 1:
image = tf.image.flip_up_down(image)
mask = tf.image.flip_up_down(mask)
return image, mask
training_dataset = dataset.map(augment)
BATCH_SIZE=2
training_dataset = training_dataset.shuffle(100, reshuffle_each_iteration=True)
training_dataset = training_dataset.batch(BATCH_SIZE)
training_dataset = training_dataset.repeat()
training_dataset = training_dataset.prefetch(-1)
However when I visualise my training dataset, all the images have same flip applied- the are all either flipped upside down or not flipped. Where as I'm expecting them to have different flips- some upside down and some not.
Why is this happening?
You need to use tensorflow operations (not numpy or normal python) because tf.data.Dataset.map() executes the mapped function as a graph. When converting a function to a graph, numpy and base python are converted to constants. The augmentation function is only running np.random.uniform(0,1) once and storing it as a constant.
Note that irrespective of the context in which map_func is defined (eager vs. graph), tf.data traces the function and executes it as a graph.
The source for the above is here.
One solution is to use tensorflow operations. I have included an example below. Note that the y value in the if has to be cast to the same dtype as the input.
dataset = tf.data.Dataset.range(1, 6)
def aug(y):
x = tf.random.uniform([], 0, 1)
if x > 0.5:
y = tf.cast(100, y.dtype)
return y
dataset = dataset.map(aug)
print(list(dataset))
You can use a uniform random function or other probability distribution
tf.random.uniform(
shape, minval=0, maxval=None, dtype=tf.dtypes.float32, seed=None, name=None
)
even you can use prebuild method in TensorFlow or Keras for fliping
tf.keras.layers.experimental.preprocessing.RandomFlip(
mode=HORIZONTAL_AND_VERTICAL, seed=None, name=None, **kwargs
)
I would like to have a 2x2 grid each being 8 times three bars next to each other.
Its because I have four evaluation parameter, each evaluation the quality of 8 languages for three models.
I was able to plot one of them, but it doesn't work for the other three being in my plot-grid.
At the moment, it looks like that:
eval_list = ["F1-Werte (micro)", "Precision", "Recall", "Accuracy"]
lang_list = ["Alle Sprachen", "Chinesisch", "Deutsch", "Englisch", "Finnisch", "Französisch", "Italienisch", "Spanisch"]
x = np.arange(start=1, stop=9, step=1)
fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=False)
df_list = list()
count = 0
for eval_elem in eval_list:
flair = list()
bert = list()
roberta = list()
for lang in lang_list:
flair.append(dfBest.query("Eval==#eval_elem").query("Lang==#lang").query("Model=='multiflair'").iloc[0]['Average'])
bert.append(dfBest.query("Eval==#eval_elem").query("Lang==#lang").query("Model=='bert'").iloc[0]['Average'])
roberta.append(dfBest.query("Eval==#eval_elem").query("Lang==#lang").query("Model=='roberta'").iloc[0]['Average'])
#ax = plt.subplot(2,2,count+1)
a = 1 if count > 1 else 0
b = 1 if count%2!=0 else 0
axes[a][b].bar(x-0.2, flair, width=0.2, color='b', align='center')
axes[a][b].bar(x, bert, width=0.2, color='g', align='center')
axes[a][b].bar(x+0.2, roberta, width=0.2, color='r', align='center')
plt.xticks(x, lang_list, rotation=90)
plt.title(eval_list[count])
plt.legend(bbox_to_anchor=(1.1, 1), labels=["multiflair", "bert", "roberta"])
plt.show()
count += 1
The three lists flair, bert and roberta represent the data and they all correctly put out a list of eight floats in each iteration, but otherwise I don't know, whats wrong with the layout.
Fig 7.1, An Introduction To Statistical Learning
I am currently studying a book named Introduction to Statistical Learning with applications in R, and also converting the solutions to python language.
I am not able to get how to get the confidence intervals and plot them as shown in the above image(dashed lines).
I have plotted the line. Here's my code for that -
(I am using polynomial regression with predictiors - 'age' and response - 'wage',degree is 4)
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
# X.shape
model = sm.OLS(y,X).fit()
print(model.summary())
# So, what we want here is not only the final line, but also the standart error related to the line
# TO find that we need to calcualte the predictions for some values of age
test_ages = np.linspace(data['age'].min(),data['age'].max(),100)
X_test = poly.transform(test_ages.reshape(-1,1))
pred = model.predict(X_test)
plt.figure(figsize = (12,8))
plt.scatter(data['age'],data['wage'],facecolors='none', edgecolors='darkgray')
plt.plot(test_ages,pred)
Here data is WAGE data which is available in R.
This is the resulting graph i get -
I have used bootstraping to calculate the confidence intervals, for this i have used a self customed module -
import numpy as np
import pandas as pd
from tqdm import tqdm
class Bootstrap_ci:
def boot(self,X_data,y_data,R,test_data,model):
predictions = []
for i in tqdm(range(R)):
predictions.append(self.alpha(X_data,y_data,self.get_indices(X_data,200),test_data,model))
return np.percentile(predictions,2.5,axis = 0),np.percentile(predictions,97.5,axis = 0)
def alpha(self,X_data,y_data,index,test_data,model):
X = X_data.loc[index]
y = y_data.loc[index]
lr = model
lr.fit(pd.DataFrame(X),y)
return lr.predict(pd.DataFrame(test_data))
def get_indices(self,data,num_samples):
return np.random.choice(data.index, num_samples, replace=True)
The above module can be used as -
poly = PolynomialFeatures(4)
X = poly.fit_transform(data['age'].to_frame())
y = data['wage']
X_test = np.linspace(min(data['age']),max(data['age']),100)
X_test_poly = poly.transform(X_test.reshape(-1,1))
from bootstrap import Bootstrap_ci
bootstrap = Bootstrap_ci()
li,ui = bootstrap.boot(pd.DataFrame(X),y,1000,X_test_poly,LinearRegression())
This will give us the lower confidence interval, and upper confidence interval.
To plot the graph -
plt.scatter(data['age'],data['wage'],facecolors='none', edgecolors='darkgray')
plt.plot(X_test,pred,label = 'Fitted Line')
plt.plot(X_test,ui,linestyle = 'dashed',color = 'r',label = 'Confidence Intervals')
plt.plot(X_test,li,linestyle = 'dashed',color = 'r')
The resultant graph is
Following code results in the 95% confidence interval
from scipy import stats
confidence = 0.95
squared_errors = (<<predicted values>> - <<true y_test values>>) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
loc=squared_errors.mean(),
scale=stats.sem(squared_errors)))
I have a vector of 5 different values that I use as my sample value, and the label is a single integer of 0, 1, or 3. The machine learning algorithms work when I pass an array as a sample, but I get this warning. How do I pass feature vectors without getting this warning?
import numpy as np
from numpy import random
from sklearn import neighbors
from sklearn.model_selection import train_test_split
import pandas as pd
filepath = 'test.csv'
# example label values
index = [0,1,3,1,1,1,0,0]
# example sample arrays
data = []
for i in range(len(index)):
d = []
for i in range(6):
d.append(random.randint(50,200))
data.append(d)
feat1 = 'brightness'
feat2, feat3, feat4 = ['h', 's', 'v']
feat5 = 'median hue'
feat6 = 'median value'
features = [feat1, feat2, feat3, feat4, feat5, feat6]
df = pd.DataFrame(data, columns=features, index=index)
df.index.name = 'state'
with open(filepath, 'a') as f:
df.to_csv(f, header=f.tell() == 0)
states = pd.read_csv(filepath, usecols=['state'])
df_partial = pd.read_csv(filepath, usecols=features)
states = states.astype(np.float32)
states = states.values
labels = states
samples = np.array([])
for i, row in df_partial.iterrows():
r = row.values
samples = np.vstack((samples, r)) if samples.size else r
n_neighbors = 5
test_size = .3
labels, test_labels, samples, test_samples = train_test_split(labels, samples, test_size=test_size)
clf1 = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
clf1 = clf1.fit(samples, labels)
score1 = clf1.score(test_samples, test_labels)
print("Here's how the models performed \nknn: %d %%" %(score1 * 100))
Warning:
"DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). clf1 = clf1.fit(samples, labels)"
sklearn documentation for fit(self, X, Y)
Try replacing
states = states.values by states = states.values.flatten()
OR
clf1 = clf1.fit(samples, labels) by clf1 = clf1.fit(samples, labels.flatten()).
states = states.values holds the correct labels that were stored in your panda dataframe, however they are getting stored on different rows. Using .flatten() put all those labels on the same row. (https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.ndarray.flatten.html)
In Sklearn's KNeighborsClassifier documentation
(https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html), they show in their example that the labels must be stored on the same row: y = [0, 0, 1, 1].
When you retrieve data from dataframe states, it is stored in multiple rows (column vector) whereas it expected values in single row.
You can also try using ravel() function which is used to create a contiguous flattened array.
numpy.ravel(array, order = ‘C’) : returns contiguous flattened array (1D array with all the input-array elements and with the same type as it)
Try:
states = states.values.ravel() in place of states = states.values
I have a masked array which is used by matplotlib.plt.contourf to project a temperature contour on a glabal map. I was trying to smooth the contour, but unfortunately none of the proposed solutions seems to be able to handle masked array. I tested these solutions:
-scipy.ndimage.gaussian_filter - moving averages
scipy.ndimage.zoom
none of them works(they count in the masked values also). Is there any way I can smooth my contour on maskedArray
I have added this part after trying the proposed 'inpaint' solution and the results were unchanged. here is the code (if it helps)
import Scientific.IO.NetCDF as S
import mpl_toolkits.basemap as bm
import numpy.ma as MA
import numpy as np
import matplotlib.pyplot as plt
import inpaint
def main():
fileobj = S.NetCDFFile('Bias.ANN.tas_A1_1.nc', mode='r')
# take the values
set1 = {'time', 'lat', 'lon'}
set2 = set(fileobj.variables.keys())
set3 = set2 - set1
datadim = set3.pop()
print "******************datadim: "+datadim
data = fileobj.variables[datadim].getValue()[0,:,:]
lon = fileobj.variables['lon'].getValue()
lat = fileobj.variables['lat'].getValue()
fileobj.close()
data, lon = bm.shiftgrid(180.,data, lon,start=False)
data = MA.masked_equal(data, 1.0e20)
#data2 = inpaint.replace_nans(data, 10, 0.25, 2, 'idw')
#- Make 2-D longitude and latitude arrays:
[lon2d, lat2d] =np.meshgrid(lon, lat)
#- Set up map:
mapproj = bm.Basemap(projection='cyl',
llcrnrlat=-90.0, llcrnrlon=-180.00,
urcrnrlat=90.0, urcrnrlon=180.0)
mapproj.drawcoastlines(linewidth=.5)
mapproj.drawmapboundary(fill_color='.8')
#mapproj.drawparallels(N.array([-90, -45, 0, 45, 90]), labels=[1,0,0,0])
#mapproj.drawmeridians(N.array([0, 90, 180, 270, 360]), labels=[0,0,0,1])
lonall, latall = mapproj(lon2d, lat2d)
cmap=plt.cm.Spectral
#- Make a contour plot of the temperature:
mymapf = plt.contourf(lonall, latall, data, 20, cmap=cmap)
#plt.clabel(mymapf, fontsize=12)
plt.title(cmap.name)
plt.colorbar(mymapf, orientation='horizontal')
plt.savefig('sample2.png', dpi=150, edgecolor='red', format='png', bbox_inches='tight', pad_inches=.2)
plt.close()
if __name__ == "__main__":
main()
I am comparing the output from this code (the first figure), with output of the same datafile from Panoply. Zoomin in and looking more precisely it seems like it is not the smoothness problem, but the pyplot model provides one stripe slimmer, or the contours are cut earlier (the outer boundaries shows this clearly, and inner contours are different due to this fact). It makes it to look like that the pyplot model is not as smooth as the Panoply one. how can I get (nearly) the same model? Am I distinguishing it right?
I had similar problem and google pointed me to this: blog post. Basically he's using inpaint algorithm to interpolate missing values and produce valid array for filtering.
The code is at the end of the post, and you can save it to site-packages (or else) and load it as module (i.e. inpaint.py):
import inpaint
filled = inpaint.replace_nans(NANMask, 5, 0.5, 2, 'idw')
I'm happy with the result, and I guess it will suite missing temperature values just fine. There is also next version here: github but code will need some cleaning for general usage as it's part of a project.
For reference, easy use and preservation sake I'll post the code (of initial version) here:
# -*- coding: utf-8 -*-
"""A module for various utilities and helper functions"""
import numpy as np
#cimport numpy as np
#cimport cython
DTYPEf = np.float64
#ctypedef np.float64_t DTYPEf_t
DTYPEi = np.int32
#ctypedef np.int32_t DTYPEi_t
##cython.boundscheck(False) # turn of bounds-checking for entire function
##cython.wraparound(False) # turn of bounds-checking for entire function
def replace_nans(array, max_iter, tol,kernel_size=1,method='localmean'):
"""Replace NaN elements in an array using an iterative image inpainting algorithm.
The algorithm is the following:
1) For each element in the input array, replace it by a weighted average
of the neighbouring elements which are not NaN themselves. The weights depends
of the method type. If ``method=localmean`` weight are equal to 1/( (2*kernel_size+1)**2 -1 )
2) Several iterations are needed if there are adjacent NaN elements.
If this is the case, information is "spread" from the edges of the missing
regions iteratively, until the variation is below a certain threshold.
Parameters
----------
array : 2d np.ndarray
an array containing NaN elements that have to be replaced
max_iter : int
the number of iterations
kernel_size : int
the size of the kernel, default is 1
method : str
the method used to replace invalid values. Valid options are
`localmean`, 'idw'.
Returns
-------
filled : 2d np.ndarray
a copy of the input array, where NaN elements have been replaced.
"""
# cdef int i, j, I, J, it, n, k, l
# cdef int n_invalids
filled = np.empty( [array.shape[0], array.shape[1]], dtype=DTYPEf)
kernel = np.empty( (2*kernel_size+1, 2*kernel_size+1), dtype=DTYPEf )
# cdef np.ndarray[np.int_t, ndim=1] inans
# cdef np.ndarray[np.int_t, ndim=1] jnans
# indices where array is NaN
inans, jnans = np.nonzero( np.isnan(array) )
# number of NaN elements
n_nans = len(inans)
# arrays which contain replaced values to check for convergence
replaced_new = np.zeros( n_nans, dtype=DTYPEf)
replaced_old = np.zeros( n_nans, dtype=DTYPEf)
# depending on kernel type, fill kernel array
if method == 'localmean':
print 'kernel_size', kernel_size
for i in range(2*kernel_size+1):
for j in range(2*kernel_size+1):
kernel[i,j] = 1
print kernel, 'kernel'
elif method == 'idw':
kernel = np.array([[0, 0.5, 0.5, 0.5,0],
[0.5,0.75,0.75,0.75,0.5],
[0.5,0.75,1,0.75,0.5],
[0.5,0.75,0.75,0.5,1],
[0, 0.5, 0.5 ,0.5 ,0]])
print kernel, 'kernel'
else:
raise ValueError( 'method not valid. Should be one of `localmean`.')
# fill new array with input elements
for i in range(array.shape[0]):
for j in range(array.shape[1]):
filled[i,j] = array[i,j]
# make several passes
# until we reach convergence
for it in range(max_iter):
print 'iteration', it
# for each NaN element
for k in range(n_nans):
i = inans[k]
j = jnans[k]
# initialize to zero
filled[i,j] = 0.0
n = 0
# loop over the kernel
for I in range(2*kernel_size+1):
for J in range(2*kernel_size+1):
# if we are not out of the boundaries
if i+I-kernel_size < array.shape[0] and i+I-kernel_size >= 0:
if j+J-kernel_size < array.shape[1] and j+J-kernel_size >= 0:
# if the neighbour element is not NaN itself.
if filled[i+I-kernel_size, j+J-kernel_size] == filled[i+I-kernel_size, j+J-kernel_size] :
# do not sum itself
if I-kernel_size != 0 and J-kernel_size != 0:
# convolve kernel with original array
filled[i,j] = filled[i,j] + filled[i+I-kernel_size, j+J-kernel_size]*kernel[I, J]
n = n + 1*kernel[I,J]
# divide value by effective number of added elements
if n != 0:
filled[i,j] = filled[i,j] / n
replaced_new[k] = filled[i,j]
else:
filled[i,j] = np.nan
# check if mean square difference between values of replaced
#elements is below a certain tolerance
print 'tolerance', np.mean( (replaced_new-replaced_old)**2 )
if np.mean( (replaced_new-replaced_old)**2 ) < tol:
break
else:
for l in range(n_nans):
replaced_old[l] = replaced_new[l]
return filled
def sincinterp(image, x, y, kernel_size=3 ):
"""Re-sample an image at intermediate positions between pixels.
This function uses a cardinal interpolation formula which limits
the loss of information in the resampling process. It uses a limited
number of neighbouring pixels.
The new image :math:`im^+` at fractional locations :math:`x` and :math:`y` is computed as:
.. math::
im^+(x,y) = \sum_{i=-\mathtt{kernel\_size}}^{i=\mathtt{kernel\_size}} \sum_{j=-\mathtt{kernel\_size}}^{j=\mathtt{kernel\_size}} \mathtt{image}(i,j) sin[\pi(i-\mathtt{x})] sin[\pi(j-\mathtt{y})] / \pi(i-\mathtt{x}) / \pi(j-\mathtt{y})
Parameters
----------
image : np.ndarray, dtype np.int32
the image array.
x : two dimensions np.ndarray of floats
an array containing fractional pixel row
positions at which to interpolate the image
y : two dimensions np.ndarray of floats
an array containing fractional pixel column
positions at which to interpolate the image
kernel_size : int
interpolation is performed over a ``(2*kernel_size+1)*(2*kernel_size+1)``
submatrix in the neighbourhood of each interpolation point.
Returns
-------
im : np.ndarray, dtype np.float64
the interpolated value of ``image`` at the points specified
by ``x`` and ``y``
"""
# indices
# cdef int i, j, I, J
# the output array
r = np.zeros( [x.shape[0], x.shape[1]], dtype=DTYPEf)
# fast pi
pi = 3.1419
# for each point of the output array
for I in range(x.shape[0]):
for J in range(x.shape[1]):
#loop over all neighbouring grid points
for i in range( int(x[I,J])-kernel_size, int(x[I,J])+kernel_size+1 ):
for j in range( int(y[I,J])-kernel_size, int(y[I,J])+kernel_size+1 ):
# check that we are in the boundaries
if i >= 0 and i <= image.shape[0] and j >= 0 and j <= image.shape[1]:
if (i-x[I,J]) == 0.0 and (j-y[I,J]) == 0.0:
r[I,J] = r[I,J] + image[i,j]
elif (i-x[I,J]) == 0.0:
r[I,J] = r[I,J] + image[i,j] * np.sin( pi*(j-y[I,J]) )/( pi*(j-y[I,J]) )
elif (j-y[I,J]) == 0.0:
r[I,J] = r[I,J] + image[i,j] * np.sin( pi*(i-x[I,J]) )/( pi*(i-x[I,J]) )
else:
r[I,J] = r[I,J] + image[i,j] * np.sin( pi*(i-x[I,J]) )*np.sin( pi*(j-y[I,J]) )/( pi*pi*(i-x[I,J])*(j-y[I,J]))
return r
#cdef extern from "math.h":
# double sin(double)
A simple smoothing function that works with masked data will solve this. One can then avoid the approaches that involve making up data (ie, interpolating, inpainting, etc); and making up data should always be avoided.
The main issue that arises when smoothing masked data is that for each point, smoothing uses the neighboring values to calculate a new value at a center point, but when those neighbors are masked, the new value for the center point will also become masked due to the rules of masked arrays. Therefore, one needs to do the calculation with unmasked data, and explicitly account for the mask. That's easy to do, and is not in the function smooth below.
from numpy import *
import pylab as plt
# make a grid and a striped mask as test data
N = 100
x = linspace(0, 5, N, endpoint=True)
grid = 2. + 1.*(sin(2*pi*x)[:,newaxis]*sin(2*pi*x)>0.)
m = resize((sin(pi*x)>0), (N,N))
plt.imshow(grid.copy(), cmap='jet', interpolation='nearest')
plt.colorbar()
plt.title('original data')
def smooth(u, mask):
m = ~mask
r = u*m # set all 'masked' points to 0. so they aren't used in the smoothing
a = 4*r[1:-1,1:-1] + r[2:,1:-1] + r[:-2,1:-1] + r[1:-1,2:] + r[1:-1,:-2]
b = 4*m[1:-1,1:-1] + m[2:,1:-1] + m[:-2,1:-1] + m[1:-1,2:] + m[1:-1,:-2] # a divisor that accounts for masked points
b[b==0] = 1. # for avoiding divide by 0 error (region is masked so value doesn't matter)
u[1:-1,1:-1] = a/b
# run the data through the smoothing filter a few times
for i in range(10):
smooth(grid, m)
mg = ma.array(grid, mask=m) # put together the mask and the data
plt.figure()
plt.imshow(mg, cmap='jet', interpolation='nearest')
plt.colorbar()
plt.title('smoothed with mask')
plt.show()
The main point is that at the boundary of the mask, the masked values are not used in the smoothing. (This is also where the grid squares switch values, so it would be clear in the figure if the masked neighboring values were being used.)
We also just had this problem and the astropy package has us covered:
import numpy as np
import matplotlib.pyplot as plt
# Some Axes
x = np.arange(100)
y = np.arange(100)
#Some Interesting Shape
z = np.array(np.outer(np.sin((x+y)/10),np.sin(y/3)),dtype=float)
# some mask
mask = np.outer(np.sin((x+y)/20),np.sin(y/5))**2>.9
# masked data represent noise, so lets put in some trash into the masked points
z[mask] = (np.random.random(size = (100,100))*10)[mask]
# masked data
z_masked = np.ma.masked_array(z, mask)
# "Conventional" filter
filter_kernelsize = 2
import scipy.ndimage
z_filtered_bad = scipy.ndimage.gaussian_filter(z_masked,filter_kernelsize)
# Lets filter it
import astropy.convolution.convolve
from astropy.convolution import Gaussian2DKernel
k = Gaussian2DKernel(1.5)
z_filtered = astropy.convolution.convolve(z_masked, k, boundary='extend')
### Plots:
fig, axes = plt.subplots(2,2)
plt.sca(axes[0,0])
plt.title('Raw Data')
plt.imshow(z)
plt.colorbar()
plt.sca(axes[0,1])
plt.title('Raw Data Masked')
plt.imshow(z_masked)
plt.colorbar()
plt.sca(axes[1,0])
plt.title('ndimage filter (ignores mask)')
plt.imshow(z_filtered_bad)
plt.colorbar()
plt.sca(axes[1,1])
plt.title('astropy filter (uses mask)')
plt.imshow(z_filtered)
plt.colorbar()
plt.tight_layout()
Output plot of the code