Using rejection_resample() with the Dataset API - TensorFlow

I am having a hard time getting balanced batches using rejection_resample() along with the Dataset API. I am using images and labels (ints) as input, as you can see in the code, but rejection_resample() does not seem to work as expected.
Note: I am using Tensorflow v1.3
Here I define the dataset, the dataset's distribution and the distribution I want.
target_dist = [0.1, 0.0, 0.0, 0.0, 0.9]
initial_dist = [0.1061, 0.3213, 0.4238, 0.1203, 0.0282]

training_filenames = training_records
training_dataset = tf.contrib.data.TFRecordDataset(training_filenames)
training_dataset = training_dataset.map(tf_record_parser)  # Parse the record into tensors.
training_dataset = training_dataset.repeat()  # number of epochs
training_dataset = training_dataset.shuffle(buffer_size=1000)

training_dataset = tf.contrib.data.rejection_resample(training_dataset,
                                                      class_func=lambda _, c: c,
                                                      target_dist=target_dist,
                                                      initial_dist=initial_dist)

# Return to the same Dataset shape as was the original input
training_dataset = training_dataset.map(lambda _, data: (data))

training_dataset = training_dataset.batch(64)

handle = tf.placeholder(tf.string, shape=[])
iterator = tf.contrib.data.Iterator.from_string_handle(
    handle, training_dataset.output_types, training_dataset.output_shapes)
batch_images, batch_labels = iterator.get_next()
training_iterator = training_dataset.make_initializable_iterator()
When I run this, I should only get samples from classes 0 and 4, but I get results from all of the classes, as though the resampling did not work.
with tf.Session() as sess:
    training_handle = sess.run(training_iterator.string_handle())
    sess.run(training_iterator.initializer)
    batch_faces_np, batch_label_np = sess.run([batch_images, batch_labels],
                                              feed_dict={handle: training_handle})
    ctr = Counter(batch_label_np)

Counter({2: 31, 3: 22, 4: 6, 1: 5})
I tested with an example based on this post (Dataset API, Iterators and tf.contrib.data.rejection_resample) and on the original test code from the tensorflow repo, and it works:
initial_known = True
classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
target_dist = [0.5, 0.0, 0.0, 0.0, 0.4]
initial_dist = [0.2] * 5 if initial_known else None

iterator = dataset_ops.Iterator.from_dataset(
    dataset_ops.rejection_resample(
        (dataset_ops.Dataset.from_tensor_slices(classes)
         .shuffle(200, seed=21)
         .map(lambda c: (c, string_ops.as_string(c)))),
        target_dist=target_dist,
        initial_dist=initial_dist,
        class_func=lambda c, _: c,
        seed=27))

init_op = iterator.initializer
get_next = iterator.get_next()
variable_init_op = variables.global_variables_initializer()

with tf.Session() as sess:
    sess.run(variable_init_op)
    sess.run(init_op)
    returned = []
    while True:
        returned.append(sess.run(get_next))

Counter({(0, (0, b'0')): 3873, (4, (4, b'4')): 3286})
Can you guys help me with that? Thanks.

Try passing a seed value to shuffle().
It worked for me once I set a seed.
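For reference, here is a minimal sketch of that change applied to the pipeline above (the seed value itself is arbitrary):

# Same pipeline as in the question, but with an explicit seed for shuffle()
# (42 is just an example value):
training_dataset = training_dataset.shuffle(buffer_size=1000, seed=42)
training_dataset = tf.contrib.data.rejection_resample(training_dataset,
                                                      class_func=lambda _, c: c,
                                                      target_dist=target_dist,
                                                      initial_dist=initial_dist)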

Related

InvalidArgumentError (see above for traceback): indices[47,6] = 24 is not in [0, 23)

I am trying to run the following main.py file and I keep getting the error "InvalidArgumentError (see above for traceback): indices[138,4] = 23 is not in [0, 23)". I have checked my vocab file; it has exactly 23 words in it.
The code works fine when a single line of new data is inserted, but when the data is longer or continuous, this error pops up. Please help me rectify this issue.
Below is a small snippet of my code. The line word_embeddings = tf.nn.embedding_lookup(variable, word_ids) is where the error occurs.
def model_fn(features, labels, mode, params):
    # For serving, features are a bit different
    if isinstance(features, dict):
        features = ((features['words'], features['nwords']),
                    (features['chars'], features['nchars']))

    # Read vocabs and inputs
    (words, nwords), (chars, nchars) = features
    dropout = params['dropout']
    training = (mode == tf.estimator.ModeKeys.TRAIN)
    vocab_words = tf.contrib.lookup.index_table_from_file(
        params['words'], num_oov_buckets=params['num_oov_buckets'])
    vocab_chars = tf.contrib.lookup.index_table_from_file(
        params['chars'], num_oov_buckets=params['num_oov_buckets'])
    with Path(params['tags']).open() as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != 'O']
        num_tags = len(indices) + 1
    with Path(params['chars']).open() as f:
        num_chars = sum(1 for _ in f) + params['num_oov_buckets']

    # Char Embeddings
    char_ids = vocab_chars.lookup(chars)
    variable = tf.get_variable(
        'chars_embeddings', [num_chars, params['dim_chars']], tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(char_embeddings, rate=dropout,
                                        training=training)

    # Char LSTM
    dim_words = tf.shape(char_embeddings)[1]
    dim_chars = tf.shape(char_embeddings)[2]
    flat = tf.reshape(char_embeddings, [-1, dim_chars, params['dim_chars']])
    t = tf.transpose(flat, perm=[1, 0, 2])
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['char_lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    _, (_, output_fw) = lstm_cell_fw(t, dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    _, (_, output_bw) = lstm_cell_bw(t, dtype=tf.float32,
                                     sequence_length=tf.reshape(nchars, [-1]))
    output = tf.concat([output_fw, output_bw], axis=-1)
    char_embeddings = tf.reshape(output, [-1, dim_words, 50])

    # Word Embeddings
    word_ids = vocab_words.lookup(words)
    glove = np.load(params['glove'])['embeddings']  # np.array
    variable = np.vstack([glove, [[0.] * params['dim']]])
    variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)

    # Concatenate Word and Char Embeddings
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)

    # LSTM
    t = tf.transpose(embeddings, perm=[1, 0, 2])  # Need time-major
    lstm_cell_fw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.LSTMBlockFusedCell(params['lstm_size'])
    lstm_cell_bw = tf.contrib.rnn.TimeReversedFusedRNN(lstm_cell_bw)
    output_fw, _ = lstm_cell_fw(t, dtype=tf.float32, sequence_length=nwords)
    output_bw, _ = lstm_cell_bw(t, dtype=tf.float32, sequence_length=nwords)
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.layers.dropout(output, rate=dropout, training=training)

    # CRF
    logits = tf.layers.dense(output, num_tags)
    crf_params = tf.get_variable("crf", [num_tags, num_tags], dtype=tf.float32)
    pred_ids, _ = tf.contrib.crf.crf_decode(logits, crf_params, nwords)

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Predictions
        reverse_vocab_tags = tf.contrib.lookup.index_to_string_table_from_file(
            params['tags'])
        pred_strings = reverse_vocab_tags.lookup(tf.to_int64(pred_ids))
        predictions = {
            'pred_ids': pred_ids,
            'tags': pred_strings
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
        # Loss
        vocab_tags = tf.contrib.lookup.index_table_from_file(params['tags'])
        tags = vocab_tags.lookup(labels)
        log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(
            logits, tags, nwords, crf_params)
        loss = tf.reduce_mean(-log_likelihood)

        # Metrics
        weights = tf.sequence_mask(nwords)
        metrics = {
            'acc': tf.metrics.accuracy(tags, pred_ids, weights),
            'precision': precision(tags, pred_ids, num_tags, indices, weights),
            'recall': recall(tags, pred_ids, num_tags, indices, weights),
            'f1': f1(tags, pred_ids, num_tags, indices, weights),
        }
        for metric_name, op in metrics.items():
            tf.summary.scalar(metric_name, op[1])

        if mode == tf.estimator.ModeKeys.EVAL:
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)
        elif mode == tf.estimator.ModeKeys.TRAIN:
            train_op = tf.train.AdamOptimizer().minimize(
                loss, global_step=tf.train.get_or_create_global_step())
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, train_op=train_op)

if __name__ == '__main__':
    # Params
    params = {
        'dim': 300,
        'dim_chars': 100,
        'dropout': 0.5,
        'num_oov_buckets': 1,
        'epochs': 25,
        'batch_size': 20,
        'buffer': 30000000,
        'char_lstm_size': 25,
        'lstm_size': 100,
        'words': str(Path(DATADIR, 'vocab.words.txt')),
        'chars': str(Path(DATADIR, 'vocab.chars.txt')),
        'tags': str(Path(DATADIR, 'vocab.tags.txt')),
        'glove': str(Path(DATADIR, 'glove.npz'))
    }
    with Path('results1/params.json').open('w') as f:
        json.dump(params, f, indent=4, sort_keys=True)
Here is the snippet where the error occurs, quoted again:

# Word Embeddings
word_ids = vocab_words.lookup(words)
glove = np.load(params['glove'])['embeddings']  # np.array
variable = np.vstack([glove, [[0.] * params['dim']]])
variable = tf.Variable(variable, dtype=tf.float32, trainable=False)
word_embeddings = tf.nn.embedding_lookup(variable, word_ids)
Hope this is not too late for you.
I have been googling this issue for a while and, hopefully, got to the root of it; it turns out to be quite simple. Similar unsolved issues were reported here and here.
Chances are: you have seen an example of this embeddings code somewhere and tried to follow it (this was the case for me). However, most example code and TensorFlow itself assume that the ids of the inputs are sequential. That is, if you have 1000 items, your ids are [0, 1, 2, ..., 998, 999].
However, this is usually not the case with real data, where ids look like "xYzVryCmplxNm5m3r" (in that case, the lookup raises an error because there are characters in the id and TensorFlow only accepts integers), or, in the more subtle case that is probably yours, the ids are integers but not sequential. For example, they can go like: ids = [68632548, 15323, ...].
In this case, TensorFlow accepts the input data (because it consists of integers, as expected) but raises this error, because the numbers are not sequential and are actually much larger than the number of unique ids (that number + 1 is usually taken as the vocab size limit).
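A minimal sketch that reproduces the failure mode (the sizes here are hypothetical, chosen to match the error message in the question):

import tensorflow as tf

# Embedding table with 23 rows, so the only valid ids are 0..22.
embeddings = tf.get_variable('emb', [23, 4])
ids = tf.constant([5, 24])  # 24 is out of range
lookup = tf.nn.embedding_lookup(embeddings, ids)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Raises InvalidArgumentError: indices[1] = 24 is not in [0, 23)
    sess.run(lookup)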
The solution that worked for me was to map all the id values in the original dataframe to sequential ids, preserving their uniqueness, and then feed the same data in again (and it actually worked!).
The code could be something like:

unique_ids = np.unique(old_ids)
sequential_ids = list(range(len(unique_ids)))
id_mapping_dict = dict(zip(unique_ids, sequential_ids))

def map_ids_to_sequential(original_id):
    return id_mapping_dict[original_id]

df['ids'] = df['ids'].apply(map_ids_to_sequential)
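As a side note, pandas can do the same remapping in one line (an equivalent sketch, not part of the original answer):

import pandas as pd

# factorize() assigns each unique id a sequential integer code,
# preserving uniqueness; `uniques` keeps the reverse mapping.
df['ids'], uniques = pd.factorize(df['ids'])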

Tensorflow value error when changing content of data - Cannot feed value of shape (1, 1) for Tensor 'Placeholder_1:0'

This post is related to the following question. The code below is taken from the accepted answer.
The program itself works fine as is, but if I only change the values of the data from
df = pd.DataFrame({'Temperature': [183, 10.7, 24.3, 10.7],
                   'Weight': [8, 11.2, 14, 11.2],
                   'Size': [3.97, 7.88, 11, 7.88],
                   'Property': [0, 1, 2, 0]})
to
df = pd.DataFrame({'Temperature': [0, 0, 0, 0],
                   'Weight': [1, 2, 3, 4],
                   'Size': [1, 2, 3, 4],
                   'Property': [1, 1, 1, 1]})
I receive the following error while executing the code
ValueError: Cannot feed value of shape (1, 1) for Tensor
'Placeholder_1:0', which has shape '(?, 3)'
Nothing really changed structurally, so I am really confused by this error. The odd thing is that changing the values of the data may or may not trigger this issue. I've tried various TF versions including the latest and the same issue always occurs.
Does anybody know what I am missing? The full code example follows.
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

df = pd.DataFrame({'Temperature': [183, 10.7, 24.3, 10.7],
                   'Weight': [8, 11.2, 14, 11.2],
                   'Size': [3.97, 7.88, 11, 7.88],
                   'Property': [0, 1, 2, 0]})

df.Property = df.Property.shift(-1)
print(df.head())

# parameters
time_steps = 1
inputs = 3
outputs = 3

df = df.iloc[:-1, :]
df = df.values

train_X = df[:, 1:]
train_y = df[:, 0]

scaler = MinMaxScaler(feature_range=(0, 1))
train_X = scaler.fit_transform(train_X)
train_X = train_X[:, None, :]

onehot_encoder = OneHotEncoder()
encode_categorical = train_y.reshape(len(train_y), 1)
train_y = onehot_encoder.fit_transform(encode_categorical).toarray()

learning_rate = 0.001
epochs = 500
batch_size = int(train_X.shape[0] / 2)
length = train_X.shape[0]
display = 100
neurons = 100

tf.reset_default_graph()

X = tf.placeholder(tf.float32, [None, time_steps, inputs])
y = tf.placeholder(tf.float32, [None, outputs])

cell = tf.contrib.rnn.BasicLSTMCell(num_units=neurons, activation=tf.nn.relu)
cell_outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

stacked_outputs = tf.reshape(cell_outputs, [-1, neurons])
out = tf.layers.dense(inputs=stacked_outputs, units=outputs)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=y, logits=out))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
training_op = optimizer.minimize(loss)

accuracy = tf.metrics.accuracy(labels=tf.argmax(y, 1),
                               predictions=tf.argmax(out, 1),
                               name="accuracy")
precision = tf.metrics.precision(labels=tf.argmax(y, 1),
                                 predictions=tf.argmax(out, 1),
                                 name="precision")
recall = tf.metrics.recall(labels=tf.argmax(y, 1),
                           predictions=tf.argmax(out, 1),
                           name="recall")
f1 = 2 * accuracy[1] * recall[1] / (precision[1] + recall[1])

with tf.Session() as sess:
    tf.global_variables_initializer().run()
    tf.local_variables_initializer().run()

    for steps in range(epochs):
        mini_batch = zip(range(0, length, batch_size),
                         range(batch_size, length + 1, batch_size))

        for (start, end) in mini_batch:
            sess.run(training_op, feed_dict={X: train_X[start:end, :, :],
                                             y: train_y[start:end, :]})

        if (steps + 1) % display == 0:
            loss_fn = loss.eval(feed_dict={X: train_X, y: train_y})
            print('Step: {} \tTraining loss: {}'.format((steps + 1), loss_fn))

    acc, prec, recall, f1 = sess.run([accuracy, precision, recall, f1],
                                     feed_dict={X: train_X, y: train_y})

    print('\nEvaluation on training set')
    print('Accuracy:', acc[1])
    print('Precision:', prec[1])
    print('Recall:', recall[1])
    print('F1 score:', f1)
As @Lescurel rightly pointed out, in a classification setting the variable outputs should reflect the number of classes in the target variable.
Whereas in a regression setting, it'll reflect the number of columns of the target variables (assuming we are predicting more than one variable).
So given the sample input data:
df = pd.DataFrame({'Temperature': [1, 2, 3, 4, 5],
                   'Weight': [2, 4, 6, 8, 10],
                   'Size': [9, 24, 9, 9, 9],
                   'Property': [0, 0, 0, 0, 1]})
The number of target classes is 2, hence outputs = 2.
Note: Your posted code in https://paste.ubuntu.com/p/tmXgQfm8GB/ works well for me.
I just observed that your target variable Property is the last column of the DataFrame:

   Temperature  Weight  Size  Property
0            1       2     9       0.0
1            2       4    24       0.0
2            3       6     9       0.0
3            4       8     9       1.0
4            5      10     9       NaN
Modify your code as follows, instead of having:
# X_y_split
train_X = df[:, 1:]
train_y = df[:, 0]
change it to:
# X_y_split
train_X = df[:, :-1]
train_y = df[:, -1]
What you have here is a classification network: it takes inputs, or features (Temperature, Weight and Size), and classifies them into one of your classes: 0, 1 or 2 (the Property field).
When you modified the original dataset, you modified the number of classes: from 3 classes (0, 1, 2), you went down to 1 (just the class 1).
For the code to work, you just need to modify the parameters section of your code so it fits your dataset.
# parameters
time_steps = 1
inputs = 3
outputs = 1
Note: in this case, I find the term outputs a bit vague. I would have used something like nb_classes.
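Building on that, here is a small sketch (my own addition, not from either answer) that derives the class count from the raw labels instead of hard-coding it; it assumes it runs before the one-hot encoding step:

import numpy as np

# Number of distinct classes in the raw target column;
# after OneHotEncoder this equals train_y.shape[1].
outputs = len(np.unique(train_y))  # e.g. 3 for classes {0, 1, 2}
y = tf.placeholder(tf.float32, [None, outputs])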

Basic Tensorflow: Define Tensor variable using existing variables

I have some very simple tensorflow code to rotate a vector:
import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=(2, 1))
angle = tf.placeholder(tf.float32)

s_a = tf.sin(angle)
c_a = tf.cos(angle)
R = tf.Variable([[c_a, s_a], [-s_a, c_a]], tf.float32, expected_shape=(2, 2))
#R = tf.Variable([[1.0, 0.0], [0.0, 1.0]], tf.float32)

rotated_v = tf.matmul(R, x)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    res = sess.run([init, rotated_v], feed_dict={x: np.array([[1.0], [1.0]]), angle: 1.0})
    print(res)
The code works fine when I hand-code the identity matrix. However, in its current form I get this error:
ValueError: initial_value must have a shape specified: Tensor("Variable/initial_value:0", dtype=float32)
I've tried specifying the shape in multiple ways, but I can't make this work.
What am I doing wrong?
I have figured out a way to achieve this (might not be the best way, but it works).
import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=(2, 1))
angle = tf.placeholder(tf.float32)

s_a = tf.sin(angle)
c_a = tf.cos(angle)
R = tf.Variable([[1.0, 0.0], [0.0, 1.0]], tf.float32)
assignR = tf.assign(R, [[c_a, s_a], [-s_a, c_a]])

rotated_v = tf.matmul(R, x)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)

    newR = sess.run(assignR, feed_dict={angle: 1.0})
    print(newR)
    print()

    res = sess.run([rotated_v], feed_dict={x: np.array([[1.0], [1.0]])})
    print(res)
Your original approach won't work, because s_a and c_a are op outputs whose values are uniquely determined by angle. You can't assign or update these nodes, so training them wouldn't make any sense.
This line, on the other hand...
R = tf.Variable([[1.0, 0.0], [0.0, 1.0]], tf.float32)
... is a definition of an independent variable with initial value equal to identity matrix. This is perfectly valid. Since this variable is independent, you can assign a new value to it, which consists of s_a and c_a. Note that you can't initialize it with s_a and c_a, because the initializer is run before the values are fed into a session (so angle is unknown).
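If the rotation matrix does not need to be a trainable variable at all, a simpler route (my own sketch, not part of the answers above) is to build R as a plain tensor, which is allowed to depend on angle directly:

import tensorflow as tf
import numpy as np

x = tf.placeholder(tf.float32, shape=(2, 1))
angle = tf.placeholder(tf.float32)

s_a = tf.sin(angle)
c_a = tf.cos(angle)

# A plain tensor, recomputed from `angle` on every run;
# no variable, no initializer, no assign needed.
R = tf.stack([tf.stack([c_a, s_a]), tf.stack([-s_a, c_a])])
rotated_v = tf.matmul(R, x)

with tf.Session() as sess:
    res = sess.run(rotated_v, feed_dict={x: np.array([[1.0], [1.0]]), angle: 1.0})
    print(res)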

Does tf.data.Dataset support generating a dictionary structure?

The following is a piece of code from https://www.tensorflow.org/programmers_guide/datasets. In this example, the map function is a user-defined function that reads the data, and in it we need to set the output types to [tf.uint8, label.dtype].
import cv2

# Use a custom OpenCV function to read the image, instead of the standard
# TensorFlow `tf.read_file()` operation.
def _read_py_function(filename, label):
    image_decoded = cv2.imread(filename.decode(), cv2.IMREAD_GRAYSCALE)
    return image_decoded, label

# Use standard TensorFlow operations to resize the image to a fixed shape.
def _resize_function(image_decoded, label):
    image_decoded.set_shape([None, None, None])
    image_resized = tf.image.resize_images(image_decoded, [28, 28])
    return image_resized, label

filenames = ["/var/data/image1.jpg", "/var/data/image2.jpg", ...]
labels = [0, 37, 29, 1, ...]

dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
dataset = dataset.map(
    lambda filename, label: tuple(tf.py_func(
        _read_py_function, [filename, label], [tf.uint8, label.dtype])))
dataset = dataset.map(_resize_function)
My question is: if we want _read_py_function() to output a Python dictionary, how do we set the output types? Is there a built-in data type such as tf.dict? For example:
def _read_py_function(filename):
    image_filename = filename[0]
    label_filename = filename[1]
    image_id = filename[2]
    image_age = filename[3]
    image_decoded = cv2.imread(image_filename, cv2.IMREAD_GRAYSCALE)
    label_decoded = cv2.imread(label_filename, cv2.IMREAD_GRAYSCALE)
    return {'image': image_decoded, 'label': label_decoded, 'id': image_id, 'age': image_age}
Then, how do we design the dataset.map() function?
Returning dicts inside the function called by tf.data.Dataset.map should work as expected.
Here is an example:
dataset = tf.data.Dataset.range(10)
dataset = dataset.map(lambda x: {'a': x, 'b': 2 * x})
dataset = dataset.map(lambda y: y['a'] + y['b'])
res = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    for i in range(10):
        assert sess.run(res) == 3 * i
To add to the above answer, this also works:

dataset = tf.data.Dataset.range(10)
dataset = dataset.map(lambda x: {'a': x, 'b': 2 * x})
res = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    for i in range(10):
        curr_res = sess.run(res)
        assert curr_res['a'] == i
        assert curr_res['b'] == 2 * i
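One caveat for the question's own example: tf.py_func itself can only return a flat list of tensors, not a dict. A common pattern (a sketch under that assumption, with field names taken from the question) is to return a tuple from the Python function and rebuild the dict in the outer map, where dicts are supported:

def _read_py_function(image_filename, label_filename, image_id, image_age):
    image_decoded = cv2.imread(image_filename.decode(), cv2.IMREAD_GRAYSCALE)
    label_decoded = cv2.imread(label_filename.decode(), cv2.IMREAD_GRAYSCALE)
    return image_decoded, label_decoded, image_id, image_age

def _parse_function(image_filename, label_filename, image_id, image_age):
    image, label, image_id, image_age = tf.py_func(
        _read_py_function,
        [image_filename, label_filename, image_id, image_age],
        [tf.uint8, tf.uint8, image_id.dtype, image_age.dtype])
    # Rebuild the dictionary with plain TensorFlow ops.
    return {'image': image, 'label': label, 'id': image_id, 'age': image_age}

dataset = dataset.map(_parse_function)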

How to minimize the Absolute Difference loss in tensorflow?

I have been trying to reproduce FlowNet 1.0 in TensorFlow for days. The reconstruction is not hard, but the absolute-difference loss displayed on TensorBoard seems to have fallen into a loop.
The main code that you may want to see is shown below.
# images1_shape = images2_shape = [batch, 461, 589, 6]
def inference(images1, images2, flownet_ground_truth):
    with tf.device('/gpu:0'):
        images1 = images1 * 0.00392156862745  # scale pixel values by 1/255
        images2 = images2 * 0.00392156862745
        inputs = tf.concat([images1, images2], axis=3)
        conv1 = tf.contrib.layers.conv2d(inputs, 64, 5, [2, 2])
        conv2 = tf.contrib.layers.conv2d(conv1, 128, 5, stride=[2, 2])
        # blablabla....
        flowloss = tf.losses.absolute_difference(regroud_truth, predict_flow)
        final_flow = 20 * final_flow
        return final_flow, flowloss
lr = tf.train.exponential_decay(0.0001,
                                global_step,
                                10000,
                                0.99,
                                staircase=True)
opt = tf.train.AdamOptimizer(learning_rate=lr)
train_op = opt.minimize(loss, global_step=global_step)

sess = tf.Session(config=tf.ConfigProto(
    allow_soft_placement=True,
    log_device_placement=False))
sumwriter = tf.summary.FileWriter('/tmp/flow', graph=sess.graph)

threads = tf.train.start_queue_runners(sess)
sess.run(tf.global_variables_initializer())

for step in range(100000):
    gdt = flowIO.next_GroudTruthFlow_batch(gd_truth_name_list, 4)
    _, total_loss, summary = sess.run([train_op, loss, merged], feed_dict={gd_truth: gdt})

    print('---------', 'step %d' % step)
    print(' loss is %f ' % total_loss)

    if step % 200 == 0:
        sumwriter.add_summary(summary, step)
I also tried other learning rates, like 0.1, 0.001, etc., and even other optimizers. However, the loop is still there, just with a different shape.
Since too much ugly code may hurt your mood, I am not posting all of it. If more information would help, you will get it.
Any suggestion will be appreciated. Thanks a lot!