MLP outputting average of all training output for any input - numpy

I have tried to implement a multi layer perceptron with sigmoid activations. Below is the code:
import numpy as np
def sigmoid(x):
return 1.0/(1.0 + np.exp(-x))
def sigmoid_derivative(x):
return sigmoid(x) * (1.0 - sigmoid(x))
class MLP:
def __init__(self, layers, x_train, y_train):
self.layers = layers
self.inputs = x_train
self.outputs = y_train
def forward(self, input):
output = input
for layer in self.layers:
layer.activations = output
output = layer.feedforward(output)
return output
def backward(self, output, predicted):
error = np.multiply(2 * np.subtract(output, predicted), sigmoid_derivative(predicted))
for layer in self.layers[::-1]:
#recursively backpropagate the error
error = layer.backpropagate(error)
def train(self):
for i in range(1,500):
predicted = self.forward(self.inputs)
self.backward(self.outputs,predicted)
def test(self, input):
return self.forward(input)
class Layer:
def __init__(self, prevNodes, selfNodes):
self.weights = np.random.rand(prevNodes,selfNodes)
self.biases = np.zeros(selfNodes)
self.activations = np.array([])
def feedforward(self, input):
return sigmoid(np.dot(input, self.weights) + self.biases)
def backpropagate(self, error):
delPropagate = np.dot(error, self.weights.transpose())
dw = np.dot(self.activations.transpose(), error)
db = error.mean(axis=0) * self.activations.shape[0]
self.weights = self.weights + 0.1 * dw
self.biases = self.biases + 0.1 * db
return np.multiply(delPropagate ,sigmoid_derivative(self.activations))
layer1 = Layer(3,4)
layer2 = Layer(4,1)
x_train = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]])
y_train = np.array([[0],[1],[1],[0]])
x_test = np.array([[0,0,1]])
mlp = MLP([layer1,layer2], x_train, y_train)
mlp.train()
mlp.test(x_test)
However the problem is the network saturates to output the average of all training outputs for any input. For eg, in the above case the y_train avarage is about 0.5 and no matter what 'test_x' value I feed to the network the output is always around the 0.5 mark.
Where could be the problem in code. Am I missing something in the algorithms. Help is appreciated

The issue seems to be with lesser number of iterations, increasing iterations from 500 to 50000 works or changing the learning rate to 0.5 also works with lesser number of iterations. The matrix manipulations and all mathematics seem to be consistent

Related

Why my chemical vae cannot learn any thing with toy dataset?

I m trying to implement a mini version of chemical vae referred in this paper: 10.1021/acscentsci.7b00572. The model can be successfully trained, and the loss is changing. However, the predicted properties of all samples are same, near to the mean value. And the autoencoder cannot reconstruct the input data. It means the model cannot learn anything by training. I have carefully check my codes, but failed to find any wrong. Can any one help? Thank you.
Here is my code:
import numpy as np
import tensorflow as tf
# example smiles and properties
smiles = ['CCCCO', 'C1CCCCC1', 'C[C##H](C(=O)O)N', 'C[C#H](C(=O)O)N', 'CC(=O)O'] * 200
y = [1,2,3,4,5] * 200
# smiles to one-hot
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
dicts = set(''.join(smiles))
num_words = len(dicts) + 1
max_lens = 15
tokenizer = Tokenizer(num_words=num_words, char_level=True)
tokenizer.fit_on_texts(smiles)
sequences = tokenizer.texts_to_sequences(smiles)
sequences = pad_sequences(sequences, maxlen = max_lens, padding='post', truncating='post')
x = to_categorical(sequences, num_classes=num_words)
# model
from tensorflow.keras import layers, Model
class VAEWithRegressor(Model):
"""Combines a variational autoencoder with a property regressor."""
def __init__(self, latent_dim):
super(VAEWithRegressor, self).__init__()
# Define the encoder layers
self.encoder = tf.keras.Sequential(
[
layers.InputLayer(input_shape=x[0].shape),
layers.GRU(units=64, return_sequences=True),
layers.BatchNormalization(),
layers.GRU(units=32),
layers.BatchNormalization(),
layers.Dense(units=16),
layers.BatchNormalization(),
layers.Dense(latent_dim * 2),
]
)
# Define the decoder layers
self.decoder = tf.keras.Sequential(
[
layers.InputLayer(input_shape=(latent_dim,)),
layers.Dense(units=16),
layers.BatchNormalization(),
layers.Dense(units=32),
layers.BatchNormalization(),
layers.RepeatVector(max_lens),
layers.GRU(units = max_lens, return_sequences=True),
layers.BatchNormalization(),
layers.TimeDistributed(layers.Dense(units=num_words)),
layers.Activation('softmax')
]
)
# Define the regressor layers
self.regressor = tf.keras.Sequential(
[
layers.InputLayer(input_shape=(latent_dim,)),
layers.Dense(units=32),
layers.Dense(units=16),
layers.Dense(units=1),
]
)
def encode(self, x):
# Compute the mean and log variance of the latent variable
h = self.encoder(x)
mean, log_var = tf.split(h, num_or_size_splits=2, axis=1)
return mean, log_var
def reparameterize(self, mean, log_var):
# Sample from the latent variable distribution
eps = tf.random.normal(tf.shape(mean))
std_dev = tf.exp(0.5 * log_var)
z = mean + std_dev * eps
return z
def decode(self, z):
# Reconstruct the input from the latent variable
return self.decoder(z)
def predict_properties(self, z):
# Predict the properties of the input
return self.regressor(z)
def call(self, x):
# Define the forward pass of the model
mean, log_var = self.encode(x)
z = self.reparameterize(mean, log_var)
x_pred = self.decode(z)
properties = self.predict_properties(z)
return x_pred, mean, log_var, properties
def vae_loss(self, x, x_pred, mean, log_var):
recon_loss = tf.reduce_sum(tf.keras.losses.binary_crossentropy(x, x_pred), axis = 1)
kl_loss = -0.5 * tf.reduce_sum(1 + log_var - tf.square(mean) - tf.exp(log_var), axis = 1)
return tf.reduce_mean(recon_loss + kl_loss)
def property_loss(self, y_true, y_pred):
# Compute the mean squared error between the true and predicted properties
return tf.reduce_mean(tf.keras.losses.mean_squared_error(y_true, y_pred))
def train_step(self, x, y_true):
with tf.GradientTape() as tape:
x_pred, mean, log_var, y_pred = self.call(x)
vae_loss_value = self.vae_loss(x, x_pred, mean, log_var)
property_loss_value = self.property_loss(y_true, y_pred)
total_loss = vae_loss_value + property_loss_value
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
gradients = tape.gradient(total_loss, self.trainable_variables)
optimizer.apply_gradients(zip(gradients, self.trainable_variables))
return vae_loss_value, property_loss_value
latent_dim = 8
num_epochs = 50
batch_size = 256
vae = VAEWithRegressor(latent_dim)
x_train = x
y_train = y
for epoch in range(num_epochs):
epoch_vae_loss = 0
epoch_property_loss = 0
for i in range(0, len(x_train), batch_size):
x_batch = x_train[i:i+batch_size]
y_batch = y_train[i:i+batch_size]
vae_loss_value, property_loss_value = vae.train_step(x_batch, y_batch)
epoch_vae_loss += vae_loss_value
epoch_property_loss += property_loss_value
epoch_vae_loss /= (len(x_train) / batch_size)
epoch_property_loss /= (len(x_train) / batch_size)
print('Epoch {}, VAE loss: {}, Property loss: {}'.format(epoch+1, epoch_vae_loss, epoch_property_loss))
z_sample = vae.encoder.predict(x)[:,:latent_dim]
x_pred = np.array(vae.decoder.predict(z_sample))
y_pred = np.array(vae.predict_properties(z_sample))

BERT for multi label text classification always has a similar val accuracy despite fine tuning

I have a dataset of German news articles that I need to classify in my job. Since it is imbalanced, I am focussing on only 12 of 30 labels currently. Therefore I tried to balance the dataset by oversampling enhanced with data augmentation. Each sample can belong to multiple categories, thus it is a multi label problem.
The train dataset contains about 127.000 samples.
I am using a German BERT model with Tensorflow but despite fine tuning and even adding new layers, my val accuracy is always about 65%. Sometimes 67 to 68 but never higher. I wondered if my code is maybe broken or if it is due to the dataset.
Here is what I have right now:
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
transformer_model = TFAutoModel.from_pretrained("dbmdz/bert-base-german-cased", output_hidden_states=False)
def multi_label_accuracy(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
"""For multi-label classification, one has to define a custom
acccuracy function because neither tf.keras.metrics.Accuracy nor
tf.keras.metrics.CategoricalAccuracy evaluate the number of
exact matches.
:Example:
>>> from tensorflow.keras import metrics
>>> y_true = tf.convert_to_tensor([[1., 1.]])
>>> y_pred = tf.convert_to_tensor([[1., 0.]])
>>> metrics.Accuracy()(y_true, y_pred).numpy()
0.5
>>> metrics.CategoricalAccuracy()(y_true, y_pred).numpy()
1.0
>>> multi_label_accuracy(y_true, y_pred).numpy()
0.0
"""
y_pred = tf.math.sigmoid(y_pred)
y_pred = tf.math.round(y_pred)
exact_matches = tf.math.reduce_all(y_pred == y_true, axis=1)
exact_matches = tf.cast(exact_matches, tf.float32)
return tf.math.reduce_mean(exact_matches)
def f1_score(y_true, y_logit):
'''
Calculate F1 score
y_true: true value
y_logit: predicted value
'''
y_logit = tf.math.sigmoid(y_logit)
true_positives = K.sum(K.round(K.clip(y_true * y_logit, 0, 1)))
possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
recall = true_positives / (possible_positives + K.epsilon())
predicted_positives = K.sum(K.round(K.clip(y_logit, 0, 1)))
precision = true_positives / (predicted_positives + K.epsilon())
return (2 * precision * recall) / (precision + recall + K.epsilon())
for l in transformer_model.layers:
l.trainable = True
bert = transformer_model.layers[0]
input_ids = tf.keras.layers.Input(shape=(60,), name='input_ids', dtype=np.int32)
attention_masks = tf.keras.layers.Input(shape=(60,), name='attention_masks', dtype=np.int32)
bert_model = bert(input_ids, attention_mask=attention_masks)[0][:, 0, :]
dropout = tf.keras.layers.Dropout(0.2, name="pooled_output")
pooled_output = dropout(bert_model)
dense = tf.keras.layers.Dense(units=256, activation="sigmoid")(pooled_output)
dropout2 = tf.keras.layers.Dropout(0.2)(dense)
dense2 = tf.keras.layers.Dense(units=64, activation="relu")(dropout2)
output = tf.keras.layers.Dense(units=12, name="output")(dense2)
model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
print("Compile model...", flush=True)
optimizer = Adam(learning_rate=1e-5, decay=1e-6)
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer=optimizer, metrics=[f1_score, multi_label_accuracy]
)
history = model.fit([dataset['train']['bert'], dataset['train']['bert2']], dataset['train']['outputs'], epochs=4, batch_size=64, validation_data=([dataset['val']['bert'], dataset['val']['bert2']], dataset['val']['outputs']))
I would expect the val accuracy to change a lot more by changing the architecture of the model.

Keras/Deepchem: epochs in data generator for prediction in graph convolutions affects prediction size

I am using graph convolutions in Deepchem/Keras for predicting molecular properties. Following the Deepchem tutorials I created a data generator. While there is no error in my code below, I fail to understand why the size of pred changes with epoch and batch_size.
First we create some dummy data.
!pip install --pre deepchem
!pip install --pre rdkit
import deepchem as dc
import numpy as np
import tensorflow as tf
from deepchem.feat.mol_graphs import ConvMol
mol = ['C-C-O']*240
ftr = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
X=ftr.featurize(mol)
y = np.arange(0,240,1)
w = np.arange(0,240,1)
ids = np.arange(0,240,1)
ds = dc.data.NumpyDataset(X=X, y=y, ids=ids)
Edit: We use the following function as generator:
def data_generator(dataset, epochs=1, batch_size = 100, pad_batches = True):
print(dataset)
for ind, (X_b, y_b, w_b, ids_b) in enumerate(dataset.iterbatches(batch_size, epochs,
deterministic=False, pad_batches=pad_batches)):
multiConvMol = ConvMol.agglomerate_mols(X_b)
inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice, np.array(multiConvMol.membership)]
for i in range(1, len(multiConvMol.get_deg_adjacency_lists())):
inputs.append(multiConvMol.get_deg_adjacency_lists()[i])
labels = [y_b]
weights = [w_b]
yield (inputs, labels, weights)
(end edit)
Then we define the model and fit it to the dataset generated above:
batch_size = 100
n_tasks = 1
class TestModel(tf.keras.Model):
def __init__(self, model = 1):
super(TestModel, self).__init__()
self.model = model
#____________Test Model 1___________
if self.model == 1:
self.gc1 = GraphConv(128, activation_fn=tf.nn.tanh)
self.readout = GraphGather(batch_size=batch_size,
activation_fn=tf.nn.tanh)
self.dense2 = layers.Dense(1)
def call(self, inputs):
#____________Test Model 1___________
if self.model == 1:
gc1_output = self.gc1(inputs)
readout_output = self.readout([gc1_output]+ inputs[1:])
dense2_output = self.dense2(readout_output)
return dense2_output
#Fit_generator
print("_________\nFitting:")
testmodel = dc.models.KerasModel(TestModel(1), loss=dc.models.losses.L2Loss())
testmodel.fit_generator(data_generator(ds, epochs=1, batch_size = 100))
Finally we try to predict the dataset labels setting epochs = 2:
#Predict
print("_________\nPredicting:")
pred = testmodel.predict_on_generator(data_generator(ds, epochs = 2, batch_size = 100, pad_batches = True))
print(ds.y.shape, pred.shape)
Giving:
_________
Predicting:
<NumpyDataset X.shape: (240,), y.shape: (240,), w.shape: (240,), ids: [0 1 2 ... 237 238 239], task_names: [0]>
(240,) (600, 1)
However if I change epochs to 1, the size of pred changes (300, 1) i.e. half of what we had before. Similarly, changing the batch_size affects the prediction size too.
Can anyone explain what I'm doing wrong?

Converting Tensorflow code to Pytorch - performance metrics very different

I have converted a tensorflow code for timeseries analysis to pytorch and performance difference is very high, in fact pytorch layers cannot account for seasonality at all. It feels like I must be missing something important.
Please help find where the pytorch code is lacking that the learning is not up to the par. I noticed that loss values has high jumps when it encounters the season change and is not learning that. With the same layers, nodes and every other thing, I imagined the performance to be close.
# tensorflow code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
dataset = tf.data.Dataset.from_tensor_slices(series)
dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_size + 1))
dataset = dataset.shuffle(shuffle_buffer).map(lambda window: (window[:-1], window[-1]))
dataset = dataset.batch(batch_size).prefetch(1)
return dataset
dataset = windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)
model = tf.keras.models.Sequential([
tf.keras.layers.Dense(100, input_shape=[window_size], activation="relu"),
tf.keras.layers.Dense(10, activation="relu"),
tf.keras.layers.Dense(1)
])
model.compile(loss="mse", optimizer=tf.keras.optimizers.SGD(lr=1e-6, momentum=0.9))
model.fit(dataset,epochs=100,verbose=0)
forecast = []
for time in range(len(series) - window_size):
forecast.append(model.predict(series[time:time + window_size][np.newaxis]))
forecast = forecast[split_time-window_size:]
results = np.array(forecast)[:, 0, 0]
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
tf.keras.metrics.mean_absolute_error(x_valid, results).numpy()
# pytorch code
window_size = 20
batch_size = 32
shuffle_buffer_size = 1000
class tsdataset(Dataset):
def __init__(self, series, window_size):
self.series = series
self.window_size = window_size
self.dataset, self.labels = self.preprocess()
def preprocess(self):
series = self.series
final, labels = [], []
for i in range(len(series)-self.window_size):
final.append(np.array(series[i:i+window_size]))
labels.append(np.array(series[i+window_size]))
return torch.from_numpy(np.array(final)), torch.from_numpy(np.array(labels))
def __getitem__(self,index):
# print(self.dataset[index], self.labels[index], index)
return self.dataset[index], self.labels[index]
def __len__(self):
return len(self.dataset)
train_dataset = tsdataset(x_train, window_size)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
class tspredictor(nn.Module):
def __init__(self, window_size, out1, out2, out3):
super(tspredictor, self).__init__()
self.l1 = nn.Linear(window_size, out1)
self.l2 = nn.Linear(out1, out2)
self.l3 = nn.Linear(out2, out3)
def forward(self,seq):
l1 = F.relu(self.l1(seq))
l2 = F.relu(self.l2(l1))
l3 = self.l3(l2)
return l3
model = tspredictor(20, 100,10,1)
loss_function = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=1e-6, momentum=0.9)
for epoch in range(100):
for t,l in train_dataloader:
model.zero_grad()
tag_scores = model(t)
loss = loss_function(tag_scores, l)
loss.backward()
optimizer.step()
# print("Epoch is {}, loss is {}".format(epoch, loss.data))
forecast = []
for time in range(len(series) - window_size):
prediction = model(torch.from_numpy(series[time:time + window_size][np.newaxis]))
forecast.append(prediction)
forecast = forecast[split_time-window_size:]
results = np.array(forecast)
plt.figure(figsize=(10, 6))
plot_series(time_valid, x_valid)
plot_series(time_valid, results)
To generate data, you can use:
def plot_series(time, series, format="-", start=0, end=None):
plt.plot(time[start:end], series[start:end], format)
plt.xlabel("Time")
plt.ylabel("Value")
plt.grid(False)
def trend(time, slope=0):
return slope * time
def seasonal_pattern(season_time):
"""Just an arbitrary pattern, you can change it if you wish"""
return np.where(season_time < 0.1,
np.cos(season_time * 6 * np.pi),
2 / np.exp(9 * season_time))
def seasonality(time, period, amplitude=1, phase=0):
"""Repeats the same pattern at each period"""
season_time = ((time + phase) % period) / period
return amplitude * seasonal_pattern(season_time)
def noise(time, noise_level=1, seed=None):
rnd = np.random.RandomState(seed)
return rnd.randn(len(time)) * noise_level
time = np.arange(10 * 365 + 1, dtype="float32")
baseline = 10
series = trend(time, 0.1)
baseline = 10
amplitude = 40
slope = 0.005
noise_level = 3
# Create the series
series = baseline + trend(time, slope) + seasonality(time, period=365, amplitude=amplitude)
# Update with noise
series += noise(time, noise_level, seed=51)
split_time = 3000
time_train = time[:split_time]
x_train = series[:split_time]
time_valid = time[split_time:]
x_valid = series[split_time:]
There was a broadcasting issue in the loss function. Changing to the loss to one below fixes it:
loss = loss_function(tag_scores, l.view(-1,1))

Cost-sensitive loss function in Tensorflow

I'm doing research for cost-sensitive neural network based on Tensorflow. But because of the static graph structure of Tensorflow. Some NN structure couldn't be realized by myself.
My loss function(cost) ,cost matrix and the computational progress is described as follow and my target is to compute the total cost and then optimize the NN :
Approximately computational progress:
the y_ is the last full-connect output of a CNN which has shape (1024,5)
the y is a Tensor which has shape(1024) and indicates the ground truth of x[i]
the y_soft[i] [j] indicates the probability of x[i] to be class j
How can I realize this in Tensorflow?
cost_matrix:
[[0,1,100],
[1,0,1],
[1,20,0]]
label:
[1,2]
y*:
[[0,1,0],
[0,0,1]]
y(prediction):
[[0.2,0.3,0.5],
[0.1,0.2,0.7]]
label,cost_matrix-->cost_embedding:
[[1,0,1],
[1,20,0]]
It obvious 0.3 in [0.2,0.3,0.5] refers to right lable probility of [0,1,0], so it should not contibute to loss.
0.7 in [0.1,0.2,0.7] is the same. In other words, the pos with value 1 in y* not contibute to loss.
So I have (1-y*):
[[1,0,1],
[1,1,0]]
Then the entropy is target*log(predict) + (1-target) * log(1-predict),and value 0 in y*,should use (1-target)*log(1-predict), so I use (1-predict) said (1-y)
1-y:
[[0.8,*0.7*,0.5],
[0.9,0.8,*0.3*]]
(italic num is useless)
the custom loss is
[[1,0,1], [1,20,0]] * log([[0.8,0.7,0.5],[0.9,0.8,0.3]]) *
[[1,0,1],[1,1,0]]
and you can see the (1-y*) can be drop here
so the loss is -tf.reduce_mean(cost_embedding*log(1-y))
,to make it applicable , should be:
-tf.reduce_mean(cost_embedding*log(tf.clip((1-y),1e-10)))
the demo is below
import tensorflow as tf
import numpy as np
hidden_units = 50
num_class = 3
class Model():
def __init__(self,name_scope,is_custom):
self.name_scope = name_scope
self.is_custom = is_custom
self.input_x = tf.placeholder(tf.float32,[None,hidden_units])
self.input_y = tf.placeholder(tf.int32,[None])
self.instantiate_weights()
self.logits = self.inference()
self.predictions = tf.argmax(self.logits,axis=1)
self.losses,self.train_op = self.opitmizer()
def instantiate_weights(self):
with tf.variable_scope(self.name_scope + 'FC'):
self.W = tf.get_variable('W',[hidden_units,num_class])
self.b = tf.get_variable('b',[num_class])
self.cost_matrix = tf.constant(
np.array([[0,1,100],[1,0,100],[20,5,0]]),
dtype = tf.float32
)
def inference(self):
return tf.matmul(self.input_x,self.W) + self.b
def opitmizer(self):
if not self.is_custom:
loss = tf.nn.sparse_softmax_cross_entropy_with_logits\
(labels=self.input_y,logits=self.logits)
else:
batch_cost_matrix = tf.nn.embedding_lookup(
self.cost_matrix,self.input_y
)
loss = - tf.log(1 - tf.nn.softmax(self.logits))\
* batch_cost_matrix
train_op = tf.train.AdamOptimizer().minimize(loss)
return loss,train_op
import random
batch_size = 128
norm_model = Model('norm',False)
custom_model = Model('cost',True)
split_point = int(0.9 * dataset_size)
train_set = datasets[:split_point]
test_set = datasets[split_point:]
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(100):
batch_index = random.sample(range(split_point),batch_size)
train_batch = train_set[batch_index]
train_labels = lables[batch_index]
_,eval_predict,eval_loss = sess.run([norm_model.train_op,
norm_model.predictions,norm_model.losses],
feed_dict={
norm_model.input_x:train_batch,
norm_model.input_y:train_labels
})
_,eval_predict1,eval_loss1 = sess.run([custom_model.train_op,
custom_model.predictions,custom_model.losses],
feed_dict={
custom_model.input_x:train_batch,
custom_model.input_y:train_labels
})
# print 'norm',eval_predict,'\ncustom',eval_predict1
print np.sum(((eval_predict == train_labels)==True).astype(np.int)),\
np.sum(((eval_predict1 == train_labels)==True).astype(np.int))
if i%10 == 0:
print 'norm_test',sess.run(norm_model.predictions,
feed_dict={
norm_model.input_x:test_set,
norm_model.input_y:lables[split_point:]
})
print 'custom_test',sess.run(custom_model.predictions,
feed_dict={
custom_model.input_x:test_set,
custom_model.input_y:lables[split_point:]
})