Tensorflow - Read and Save TFRecords to Dict and Use Multiprocessing

I am trying to speed up the conversion of selected tfrecords to a series of Python dictionaries. Here's what I have. Initially the CPU utilization spikes, but then it drops to almost zero, which suggests my code is not working correctly.
My goal is to have 3 dictionaries saved and pickled. There are 14,000+ tfrecord files (approximately 2 GB). At the current rate, it would take about 84 hours to run on a single process.
Are there any problems with my use of managed dicts?
import glob
import tensorflow as tf
import cPickle
import numpy as np
from tqdm import tqdm
import collections
from multiprocessing import Process, Manager, Pool

# lookup, lookup_inverted and label_filters are defined elsewhere in the original script
def get_multihot_encoding(example_label):
    enc = np.zeros(10)
    for label in example_label:
        if label in lookup.values():
            index = lookup_inverted[label]
            enc[index] = 1
    return list(enc)

# Set up multiprocessing
manager = Manager()
audio_embeddings_dict = manager.dict()
audio_labels_dict = manager.dict()
audio_multihot_dict = manager.dict()

sess = tf.Session()

# The iterable which gets passed to the function
all_tfrecord_filenames = glob.glob('/Users/jeff/features/audioset_v1_embeddings/unbal_train/*.tfrecord')

def process_tfrecord(tfrecord):
    for idx, example in enumerate(tf.python_io.tf_record_iterator(tfrecord)):
        tf_example = tf.train.Example.FromString(example)
        vid_id = tf_example.features.feature['video_id'].bytes_list.value[0].decode(encoding='UTF-8')
        example_label = list(np.asarray(tf_example.features.feature['labels'].int64_list.value))

        # Non-empty intersection of the two sets is truthy - only create dict entries in that case
        if set(example_label) & label_filters:
            print(set(example_label) & label_filters, " is the intersection of the two")
            tf_seq_example = tf.train.SequenceExample.FromString(example)
            n_frames = len(tf_seq_example.feature_lists.feature_list['audio_embedding'].feature)
            audio_frame = []
            for i in range(n_frames):
                audio_frame.append(tf.cast(tf.decode_raw(
                    tf_seq_example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0], tf.uint8),
                    tf.float32).eval(session=sess))

            audio_embeddings_dict[vid_id] = audio_frame
            audio_labels_dict[vid_id] = example_label
            audio_multihot_dict[vid_id] = get_multihot_encoding(example_label)
            #print(get_multihot_encoding(example_label), "Is the encoded label")

            if idx % 100 == 0:
                print("Saving dictionary at loop: {}".format(idx))
                cPickle.dump(audio_embeddings_dict, open('audio_embeddings_dict_unbal_train_multi_{}.pkl'.format(idx), 'wb'))
                cPickle.dump(audio_multihot_dict, open('audio_multihot_dict_bal_untrain_multi_{}.pkl'.format(idx), 'wb'))
                cPickle.dump(audio_labels_dict, open('audio_labels_unbal_dict_multi_{}.pkl'.format(idx), 'wb'))

pool = Pool(50)
result = pool.map(process_tfrecord, all_tfrecord_filenames)
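One note on the code above (my observation, not from the original post): every audio frame is decoded by building a TensorFlow op and calling .eval(session=sess), which is expensive per frame and also shares a single Session across the forked Pool workers. A minimal sketch of decoding the same bytes with NumPy alone, assuming the embeddings are stored as raw uint8 bytes as in the AudioSet release:
def decode_audio_frames(tf_seq_example):
    # Decode each frame's raw uint8 bytes directly with NumPy instead of
    # running a TensorFlow op per frame.
    frames = tf_seq_example.feature_lists.feature_list['audio_embedding'].feature
    return [np.frombuffer(f.bytes_list.value[0], dtype=np.uint8).astype(np.float32)
            for f in frames]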

Related

What is the difference between dask.array.from_array(np.random.random) and dask.array.random.random

We've hit a situation where we need to train on a fairly large amount of data (about 22 GiB). I ran a test with two methods of generating random data and tried to train on it with Dask; however, the data generated by NumPy raises an exception (msgpack: bytes object is too large) while the dask.array data works. Does anybody know why?
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask import array as da
import numpy as np
import xgboost as xgb
import time

def main(client):
    regressor = None
    pre = None
    n = 3000
    m = 1000000

    # numpy generated data will raise an exception
    X = np.random.random((m, n))
    y = np.random.random((m, 1))
    X = da.from_array(X, chunks=(1000, n))
    y = da.from_array(y, chunks=(1000, 1))

    # data generated by dask.array works well
    # X = da.random.random(size=(m, n), chunks=(1000, n))
    # y = da.random.random(size=(m, 1), chunks=(1000, 1))

    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    del X
    del y

    params = {'tree_method': 'gpu_hist'}
    watchlist = [(dtrain, 'train')]
    start = time.time()
    bst = xgb.dask.train(client, params, dtrain, num_boost_round=100, evals=watchlist)
    print('consume:', time.time() - start)

if __name__ == '__main__':
    with LocalCUDACluster(n_workers=4, device_memory_limit='12 GiB') as cluster:
        with Client(cluster) as client:
            main(client)
After making a few tests, I found out the reason. da.random.random is a delayed function as well, so it passes only the definition of the random generation to each worker. In our situation, msgpack limits the size of the data passed to each worker to 4 GiB, so in general, data larger than 4 GiB cannot be communicated directly to Dask XGBoost. (By the way, we can switch to parquet data and read it as chunked dask.dataframe data to bypass the msgpack limitation.)
The commands I ran confirmed this guess.
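As a rough sketch of the parquet route mentioned in the parenthesis above (my illustration with an assumed file name and label column, not the poster's actual commands):
import dask.dataframe as dd
import xgboost as xgb

# Read previously written parquet data lazily; each worker loads only its own
# partitions, so no single >4 GiB object travels through msgpack.
ddf = dd.read_parquet('train_data.parquet')   # hypothetical path
X = ddf.drop(columns=['target'])              # 'target' is a hypothetical label column
y = ddf['target']
dtrain = xgb.dask.DaskDMatrix(client, X, y)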

Cannot store an array using dask

I am using the following code to create an array and store the results sequentially in HDF5 format. I was checking out the Dask documentation, and it suggested using dask.store to store arrays generated in a function like mine. However, I receive an error: dask has no attribute store
My code:
import os
import numpy as np
import time
import concurrent.futures
import multiprocessing
from itertools import product
import h5py
import dask as da

def mean_py(array):
    start_time = time.time()
    x = array.shape[1]
    y = array.shape[2]
    values = np.empty((x, y), type(array[0][0][0]))
    for i in range(x):
        for j in range(y):
            values[i][j] = np.mean(array[:, i, j])
    end_time = time.time()
    hours, rem = divmod(end_time - start_time, 3600)
    minutes, seconds = divmod(rem, 60)
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), int(seconds)))
    print(f"{'.'*80}")
    return values

def generate_random_array():
    a = np.random.randn(120560400).reshape(10980, 10980)
    return a

def generate_array(nums):
    for num in range(nums):
        a = generate_random_array()
        f = h5py.File('test_db.hdf5')
        d = f.require_dataset('/data', shape=a.shape, dtype=a.dtype)
        da.store(a, d)

start = time.time()
generate_array(8)
end = time.time()
print(f'\nTime complete: {end-start:.2f}s\n')
Should I use Dask for such a task, or do you recommend storing the results with h5py directly?
Please ignore the mean_py(array) function; it's for something I want to try out once the data has been produced.
As suggested in the comments, you're currently doing this
import dask as da
When you probably meant to do this
import dask.array as da
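A small sketch of what the corrected import allows, using the same file and dataset names as the question (an illustration, not a verified drop-in replacement):
import h5py
import numpy as np
import dask.array as da

a = np.random.randn(10980, 10980)
x = da.from_array(a, chunks=(1000, 1000))   # wrap the NumPy array as a chunked dask array

with h5py.File('test_db.hdf5', 'a') as f:
    d = f.require_dataset('/data', shape=x.shape, dtype=x.dtype)
    da.store(x, d)                          # write the array into the dataset chunk by chunk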

Caffe always returns one label

I have trained a model with the caffe tools under bin and now I am trying to do testing with a Python script. I read in an image and preprocess it myself (as I did for my training dataset) and load the pretrained weights into the net, but I almost always (99.99% of the time) receive the same result, 0, for every test image. I did consider that my model might be overfitting, but after training a few models I have come to realize the labels I get from predictions are most likely the cause. I have also increased dropout and taken random crops to overcome overfitting, and I have about 60K images for training. The dataset is also roughly balanced. I get between 77% and 87% accuracy during the evaluation step of training (depending on how I process the data, what architecture I use, etc.).
Excuse my super hacky code; I have been away from Caffe testing for some time, so I suspect the problem is how I pass the input data to the network, but I can't put my finger on it:
import h5py, os
import sys
sys.path.append("/home/X/Desktop/caffe-caffe-0.16/python")
from caffe.io import oversample
from caffe.io import resize_image
import caffe
from random import randint
import numpy as np
import cv2
import matplotlib.pyplot as plt
from collections import Counter as Cnt

meanImg = cv2.imread('/home/caffe/data/Ch/Final_meanImg.png')

model_def = '/home/X/Desktop/caffe-caffe-0.16/models/bvlc_googlenet/deploy.prototxt'
model_weights = '/media/X/DATA/SDet/Google__iter_140000.caffemodel'

# load the model
#caffe.set_mode_gpu()
#caffe.set_device(0)
net = caffe.Net(model_def,      # defines the structure of the model
                model_weights,  # contains the trained weights
                caffe.TEST)     # use test mode (e.g., don't perform dropout)

with open('/home/caffe/examples/sdet/SDet/test_random.txt', 'r') as T, open('/media/X/DATA/SDet/results/testResults.txt', 'w') as testResultsFile:
    readImgCounter = 0
    runningCorrect = 0
    runningAcc = 0.0
    #testResultsFile.write('filename'+' '+'prediction'+' '+'GT')
    lines = T.readlines()
    for i, l in enumerate(lines):
        sp = l.split(' ')
        video = sp[0].split('_')[0]
        impath = '/home/caffe/data/Ch/images/' + video + '/' + sp[0] + '.jpg'
        img = cv2.imread(impath)
        resized_img = resize_image(img, (255, 255))
        oversampledImages = oversample([resized_img], (224, 224))  # 5 crops x 2 mirror flips = 10 images
        transposed_img = np.zeros((10, 3, 224, 224), dtype='f4')
        tp = np.zeros((1, 3, 224, 224), dtype='f4')
        predictedLabels = []
        for j in range(0, oversampledImages.shape[0] - 1):  # note: shape[0]-1 loops over only 9 of the 10 crops
            transposed_img[j] = oversampledImages[j].transpose((2, 0, 1))
            tp[0] = transposed_img[j]
            net.blobs['data'].data[0] = tp
            pred = net.forward(data=tp)
            predictedLabels.append(pred['prob'].argmax())
        print(predictedLabels)
        prediction, num_most_common = Cnt(predictedLabels).most_common(1)[0]
        print(prediction)
        readImgCounter = readImgCounter + 1
        if prediction == int(sp[1]):
            runningCorrect = runningCorrect + 1
        runningAcc = runningCorrect / readImgCounter
        print('runningAcc:')
        print(runningAcc)
        print('-----------')
        print('runningCorrect:')
        print(runningCorrect)
        print('-----------')
        print('totalImgRead:')
        print(readImgCounter)
        print('-----------')
        testResultsFile.write(sp[0] + ' ' + str(prediction) + ' ' + sp[1])
        testResultsFile.write('\n')
I eventually fixed this problem. I am not 100% sure what worked, but it was most likely changing the bias to 0 while learning.
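Separately from the bias fix, and since the question suspects the input handling: a minimal sketch of feeding all ten oversampled crops in a single forward pass (my suggestion, not the accepted fix; it assumes the deploy prototxt uses an input blob named 'data' and an output blob named 'prob', and that reshaping to a batch of 10 is acceptable):
batch = np.zeros((10, 3, 224, 224), dtype='f4')
for j in range(oversampledImages.shape[0]):        # all 10 crops, no off-by-one
    batch[j] = oversampledImages[j].transpose((2, 0, 1))
net.blobs['data'].reshape(10, 3, 224, 224)
net.blobs['data'].data[...] = batch
out = net.forward()
predictedLabels = list(out['prob'].argmax(axis=1))
This also avoids assigning a (1, 3, 224, 224) array into data[0]. Note that meanImg is loaded in the question's code but never subtracted; whether that matters depends on how the training data was preprocessed.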

Increase of multiprocess time

I defined a multiprocess script to speed up an image analysis. It seems to work well, but I ran several tests in order to determine the best number of processes.
The tests consist of varying the number of processes, and since there is some dispersion, I added a loop to repeat each test one hundred times.
But during the run, the time increases significantly. What could be the origin of my problem? Do I have to flush memory? There seems to be no saturation.
A piece of my code:
from multiprocessing import Process, current_process
import multiprocessing
import glob as glob
import matplotlib.pyplot as plt
from skimage import io
import time
import sys
import numpy as np
import numpy.ma as ma
import gc
import os
from PIL import Image
from skimage import exposure
import cv2

Path_input = "E:\\test\\raw\\"
Path_output = "E:\\test\\"
Img_list = glob.glob((Path_input + 'Test_*.tif'))[:]
size_y, size_x = io.imread(Img_list[0]).shape

# Function for the multiprocess
def Ajustement(x):
    # image reading
    img = plt.imread(Img_list[x])
    # create a CLAHE object
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    cl1 = clahe.apply(img_rescale.astype(np.uint16))  # img_rescale presumably comes from a rescaling step omitted from this excerpt
    cv2.imwrite(Path_output + '\\Ajusted' + "%05d" % x + '.tif', cl1)
    return 'Ajustement OK!'

# create strings list of process
cpu_max = 10
list = ['Process_'] * cpu_max
list_process = []
counter = 1
for u in list:
    list_process.append(list[counter - 1] + np.str(counter))
    counter = counter + 1

get_timer = time.clock if sys.platform == "win32" else time.time
time_store = []
time_process = []

if __name__ == '__main__':
    range_adjusted = np.arange(0, len(Img_list), cpu_max)
    m = 0
    for m in range(0, 100, 1):  # loop to obtain a mean time for the process
        print m
        timer = get_timer()  # time measuring starts now
        for i in range_adjusted:
            o = 0
            for item in list_process[:cpu_max]:  # process creation
                # Normalization_and_ajustement is presumably the full version of Ajustement() in the complete script
                globals()[item] = Process(name='worker1', target=Normalization_and_ajustement, args=(i + o,))
                o = o + 1
            o = 0
            for item in list_process[:cpu_max]:  # process start
                globals()[item].start()
                o = o + 1
            o = 0
            for item in list_process[:cpu_max]:  # process join
                globals()[item].join()
                o = o + 1
            if i == range_adjusted.max():
                print("Normalization and Equalization finished")
                timer = get_timer() - timer  # get delta time as soon as it finishes
                time_store.append(timer)
                time_process.append(timer / cpu_max)
                np.savetxt(Path_output + 'time_tot_normalization.txt', time_store)
                np.savetxt(Path_output + 'time_process_normalization.txt', time_process)
                print("\tTotal: {:.2f} seconds".format(timer))
                print("\tAvg. per process: {:.2f} seconds".format(timer / cpu_max))
        m = m + 1
I think it was due to a memory leak. Indeed, I added a gc.collect() call after each loop and the problem was solved.
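For what it's worth, a rough alternative sketch (my suggestion, not the poster's code) that reuses a fixed pool of workers instead of creating and joining ten Process objects per batch; it assumes the Ajustement function above is the intended worker target:
from multiprocessing import Pool

if __name__ == '__main__':
    for m in range(100):                    # repeat to obtain a mean time
        timer = get_timer()
        pool = Pool(processes=cpu_max)      # a fixed set of workers per repetition
        pool.map(Ajustement, range(len(Img_list)))
        pool.close()
        pool.join()
        timer = get_timer() - timer
        time_store.append(timer)
        time_process.append(timer / cpu_max)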

TensorFlow example save mandelbrot image

Learning how to use TensorFlow; the first tutorial code on the Mandelbrot set is below:
# Import libraries for simulation
import tensorflow as tf
import numpy as np

# Imports for visualization
import PIL.Image
from io import BytesIO
from IPython.display import Image, display

def DisplayFractal(a, fmt='jpeg'):
    """Display an array of iteration counts as a
       colorful picture of a fractal."""
    a_cyclic = (6.28*a/20.0).reshape(list(a.shape)+[1])
    img = np.concatenate([10+20*np.cos(a_cyclic),
                          30+50*np.sin(a_cyclic),
                          155-80*np.cos(a_cyclic)], 2)
    img[a==a.max()] = 0
    a = img
    a = np.uint8(np.clip(a, 0, 255))
    f = BytesIO()
    PIL.Image.fromarray(a).save(f, fmt)
    display(Image(data=f.getvalue()))

sess = tf.InteractiveSession()

# Use NumPy to create a 2D array of complex numbers
Y, X = np.mgrid[-1.3:1.3:0.005, -2:1:0.005]
Z = X+1j*Y

xs = tf.constant(Z.astype(np.complex64))
zs = tf.Variable(xs)
ns = tf.Variable(tf.zeros_like(xs, tf.float32))

tf.global_variables_initializer().run()

# Compute the new values of z: z^2 + x
zs_ = zs*zs + xs

# Have we diverged with this new value?
not_diverged = tf.abs(zs_) < 4

# Operation to update the zs and the iteration count.
#
# Note: We keep computing zs after they diverge! This
# is very wasteful! There are better, if a little
# less simple, ways to do this.
#
step = tf.group(
    zs.assign(zs_),
    ns.assign_add(tf.cast(not_diverged, tf.float32))
)

for i in range(200): step.run()

DisplayFractal(ns.eval())
This returns the following in the shell:
<IPython.core.display.Image at 0x7fcdee1da810>
It doesn't display the image, and I'd prefer it to save the image instead.
How can I save the result as an image?
Scipy has an easy image save function! https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.misc.imsave.html
You should try this:
import scipy.misc
scipy.misc.imsave('mandelbrot.png',ns.eval())
I hope this works! Regardless, let me know!
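scipy.misc.imsave was removed in later SciPy releases, so if that import fails, here is a sketch that reuses the PIL import the question already has, applying DisplayFractal's colour mapping and writing the result to disk instead of displaying it:
import numpy as np
import PIL.Image

a = ns.eval()
a_cyclic = (6.28 * a / 20.0).reshape(list(a.shape) + [1])
img = np.concatenate([10 + 20 * np.cos(a_cyclic),
                      30 + 50 * np.sin(a_cyclic),
                      155 - 80 * np.cos(a_cyclic)], 2)
img[a == a.max()] = 0
img = np.uint8(np.clip(img, 0, 255))
PIL.Image.fromarray(img).save('mandelbrot.png')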