Please let me know what is the problem ;( [index 13 is out of bounds for axis 0 with size 13] - indexing

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
ILSED = pd.read_csv('/kaggle/input/ilsed12/ILSED.csv')
ILSED.keys()
print(ILSED.iloc)
X = np.array(ILSED.iloc[:, :-1])
Y = np.array(ILSED.iloc[12])
kf = KFold(n_splits = 100, shuffle = True, random_state = 50)
for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index]
Y_train, Y_test = Y[train_index], Y[test_index]
Error is that "index 13 is out of bounds for axis 0 with size 13".
Please let me know what should I do? bb

Related

Different results between training and loading autokeras-model

I trained a regression-model with autokeras resulting in a model with a MAE of 0.2 with that code, where x and y were input and output-dataframes:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
search = StructuredDataRegressor(max_trials=1000, loss='mean_squared_error', max_model_size=100000000000, overwrite = True)
search.fit(x=X_train, y=y_train, verbose=2, validation_data=(X_test, y_test))
model = search.export_model()
model.summary()
model.save('model_best')
Refeeding my data to the model delivers a MAE of about 30 with pretty nonsense predictions. My test-output values are in the range of 3 to 10, predicted output-values are in the range of -10 to 5.
model = load_model("model_best2", custom_objects=ak.CUSTOM_OBJECTS)
mae, _ = model.evaluate(x, y, verbose=2)
print('MAE: %.3f' % mae)
Those results are reproducible with any provided model from autokeras. Do you have any clue why training and evaluation results are totally different?
I created a minimal example which is delivering similar bad results so you can try on your own:
from numpy import asarray
from pandas import read_csv
from sklearn.model_selection import train_test_split
from autokeras import StructuredDataRegressor
import matplotlib.pyplot as plt
# load dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/auto-insurance.csv'
dataframe = read_csv(url, header=None)
data = dataframe.values
data = data.astype('float32')
X, y = data[:, :-1], data[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
search = StructuredDataRegressor(max_trials=15, loss='mean_absolute_error')
search.fit(x=X_train, y=y_train, verbose=2)
mae, _ = search.evaluate(X_test, y_test, verbose=2)
print('MAE: %.3f' % mae)
predictions = search.predict(X)
miny = float(y.min())
maxy = float(y.max())
minp = float(min(predictions))
maxp = float(max(predictions))
plt.figure(figsize=(15,15))
plt.scatter(y, predictions, c='crimson',s=5)
p1 = max(maxp, maxy)
p2 = min(minp, miny)
plt.plot([p1, 0], [p1, 0], 'b-')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
plt.axis('equal')
plt.show()

How to match dimensions in CNN

I'm trying to build a CNN, where the goal is from 3 features to predict the label, but is giving an error of dimension.
Could someone help me?
updated after comments from #M.Innat
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D
from tensorflow.keras.models import Sequential, load_model
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from sklearn import metrics
import tensorflow as tf
import random
# Create data
n = 8500
l = [2, 3, 4, 5,6]
k = int(np.ceil(n/len(l)))
labels = [item for item in l for i in range(k)]
random.shuffle(labels,random.random)
labels =np.array(labels)
label_unique = np.unique(labels)
x = np.linspace(613000, 615000, num=n) + np.random.uniform(-5, 5, size=n)
y = np.linspace(7763800, 7765800, num=n) + np.random.uniform(-5, 5, size=n)
z = np.linspace(1230, 1260, num=n) + np.random.uniform(-5, 5, size=n)
X = np.column_stack((x,y,z))
Y = labels
# Split the dataset into training and testing.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=1234)
seq_len=len(X_train)
n_features=len(X_train[0])
droprate=0.1
exit_un=len(label_unique)
seq_len=len(X_train)
n_features=len(X_train[0])
droprate=0.1
exit_un=len(label_unique)
print('n_features: {} \n seq_len: {} \n exit_un: {}'.format(n_features,seq_len,exit_un))
X_train = X_train[..., None][None, ...] # add channel axis+batch aix
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
drop_prob = 0.5
my_model = Sequential()
my_model.add(Conv2D(input_shape=(seq_len,n_features,1),filters=32,kernel_size=(3,3),padding='same',activation="relu")) # 1 channel of grayscale.
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Conv2D(filters=64,kernel_size=(5,5), padding='same',activation="relu"))
my_model.add(MaxPooling2D(pool_size=(2,1)))
my_model.add(Flatten())
my_model.add(Dense(units = 1024, activation="relu"))
my_model.add(Dropout(rate=drop_prob))
my_model.add(Dense(units = exit_un, activation="softmax"))
n_epochs = 100
batch_size = 10
learn_rate = 0.005
# Define the optimizer and then compile.
my_optimizer=Adam(lr=learn_rate)
my_model.compile(loss = "categorical_crossentropy", optimizer = my_optimizer, metrics=['categorical_crossentropy','accuracy'])
my_summary = my_model.fit(X_train, Y_train, epochs=n_epochs, batch_size = batch_size, verbose = 1)
The error I have is:
ValueError: Data cardinality is ambiguous:
x sizes: 1
y sizes: 5950
Make sure all arrays contain the same number of samples.
You're passing the input sample without the channel axis and also the batch axis. Also, according to your loss function, you should transform your integer label to one-hot encoded.
exit_un=len(label_unique)
drop_prob = 0.5
X_train = X_train[..., None][None, ...] # add channel axis+batch aix
X_train = np.repeat(X_train, repeats=100, axis=0) # batch-ing
Y_train = np.repeat(Y_train, repeats=100, axis=0) # batch-ing
Y_train = pd.get_dummies(Y_train) # transform to one-hot encoded
print(X_train.shape, Y_train.shape)
my_model = Sequential()
...
update
Based on the discussion, it seems like you need the conv1d operation in the modeling time and need to reshape your sample as mentioned in the comment. Here is the colab, it should work now.

Having TypeError: 'numpy.ndarray' object is not callable

I'm getting the following error on the mentioned lines inside the code:
TypeError: 'numpy.ndarray' object is not callable
Please someone resolve this issue. Removal of parenthesis didn't help, another error pops up stating:
IndexError: arrays used as indices must be of integer (or boolean)
type
import numpy as np
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
n_pts = 500
np.random.seed(0)
Xa = np.array([np.random.normal(13, 2, n_pts), <*Type error encountered here*>
np.random.normal(12, 2, n_pts)]).T
Xb = np.array([np.random.normal(8, 2, n_pts),
np.random.normal(6, 2, n_pts)]).T
X = np.vstack((Xa, Xb))
y = np.matrix(np.append(np.zeros(n_pts), np.ones(n_pts))).T
plt.scatter(X[:n_pts,0], X[:n_pts,1])
plt.scatter(X[n_pts:,0], X[n_pts:,1])
model = Sequential()
model.add(Dense(units = 1, input_shape = (2,), activation = 'sigmoid'))
adam = Adam(lr = 0.1)
model.compile(adam, loss = 'binary_crossentropy', metrics= ['accuracy'])
h = model.fit(x = X, y = y, verbose = 1, batch_size = 50, epochs = 500, shuffle = 'true')
plt.plot(h.history['accuracy'])
plt.title('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy'])
plt.plot(h.history['loss'])
plt.title('loss')
plt.xlabel('epoch')
plt.legend(['loss'])
def plot_decision_boundary(X,y, model):
x_span = np.linspace(min(X[:,0]) - 1, max(X[:, 0]) + 1)
y_span = np.linspace(min(X[:,1]) - 1, max(X[:, 0]) + 1)
xx, yy = np.meshgrid(x_span, y_span)
xx_, yy_ = xx.ravel(), yy.ravel()
grid = np.c_[xx_, yy_]
pred_func = model.predict(grid)
z = pred_func.reshape(xx.shape)
plt.contourf(xx,yy, z)
plot_decision_boundary(X, y , model)
plt.scatter(X[:n_pts, 0], X[:n_pts,1])
plt.scatter(X[n_pts:,0], X[n_pts:,1])
x = 7.5
y = 5
point = np.array([[x,y]])
prediction = model.predict(point)
plt.plot([x],[y], marker="o", markersize=10, color="red")
print("Prediction", prediction)

plotting Iris Classification

The code below classifies three groups of Iris through the Decision Tree classifier.
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeClassifier
iris = datasets.load_iris()
dataset = pd.DataFrame(iris['data'], columns=iris['feature_names'])
dataset['target'] = iris['target']
X=dataset[[dataset.columns[1], dataset.columns[2]]]
y=dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)
And For plotting this classification we can use these lines of code:
import numpy as np
from matplotlib.colors import ListedColormap
X_set, y_set = X_test.values, y_test.values
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, model.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha = 0.75, cmap = ListedColormap(('red', 'green','blue')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
c = ListedColormap(('red', 'green','blue'))(i), label = j)
plt.title('Classifier (Test set)')
plt.xlabel('sepal width (cm)')
plt.ylabel('petal length (cm)')
plt.legend()
plt.show()
the result would be like below:
Visualising the Test set results
But when I wanted to use more than two features for training,
X=dataset[[dataset.columns[1], dataset.columns[2], dataset.columns[3]]]
y=dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
I couldn't visualize the results like the picture above! Could someone please explain to me how I can visualize the results?
Thank you
Since you've 3 data and its corresponding label, you can only show it in a 3D plot.
I've tried to do that in the following code:
%matplotlib notebook
from sklearn.linear_model import Ridge
X_set, y_set = X_test.values, y_test.values
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
np.arange(start = X_set[:, 1].min() - 1, stop =X_set[:, 1].max() + 1, step = 0.01))
model = Ridge()
model.fit(np.array([X_set[:, 0],X_set[:, 1]]).T,X_set[:,2])
X3=model.predict(np.array([X1.flatten(),X2.flatten()]).T)
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
Dict={0:'red',1:'blue',2:'purple'}
ax.plot_surface(X1, X2, X3.reshape(X1.shape), cmap="YlGn", linewidth=0, antialiased=False, alpha=0.5)
for Id in range(X_set.shape[0]):
ax.scatter3D(*X_set[Id,:],color=Dict[y_set[Id]],linewidths=10)
ax.set_xlabel("Data_1")
ax.set_ylabel('Data_2')
ax.set_zlabel("Data_3")
plt.show()
Also since ax.plot_surface wants given shapes as X1.shape=X2.shape=X3.shape, I have predicted X3 values with a linear model(If you use a tree model it gives a different shape).
One can ask why we haven't used a meshgrid for the 3 data features and create a 3d plot with it. The reason for that is matplotlib plot_surface or 3dcountrp. just accepts 2d params and meshgrid with 3 features returns 3d data for each.
Hope that questions your answer.

Getting Errors with StandardScaler Python

I am trying to scale my training and test data for a Logistic Regression but an error popped-up.
I implemented the answer in this stack: How to standard scale a 3D matrix?
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
df = pd.read_csv('GonzagaTakers.csv')
x_df=df.loc[:, df.columns !='Remarks_P']
features = x_df.keys()
target = 'Remarks_P'
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)
scalers ={}
for i in range(X_train.shape[1]):
scalers[i] = StandardScaler()
X_train[:, :, i]=scalers[i].fit_transform(X_train[:, :, i])
for i in range(y_test.shape[1]):
y_test[:,:,i]=scalers[i].fit_transform(y_test[:,:,i])
_model = LogisticRegression(class_weight='balanced')
_model.fit(X_train, y_train)
accuracy = _model.score(X_test, y_test) * 100
Error occurs in this line
X_train[:, :, i]=scalers[i].fit_transform(X_train[:, :, i])
TypeError: '(slice(None, None, None), slice(None, None, None), 0)' is
an invalid key