Multiprocessing with large HDF5 files - python-multiprocessing

I have 5 HDF5 files that are 22 GB each. Each HDF5 file is a series of 4801 images that are 1920 by 1200 in size. I need to load the same frame number from each HDF5 file, get rid of some rogue pixels, average the stack of 5 images, and write a new HDF5 file with one processed image at each frame number. I can't load all 5 HDF5 files at once without running out of RAM, so I am only loading in chunks of images from each HDF5 file, putting the 5 images for each frame number into a queue, processing the stack, and writing the resulting image to an HDF5 file. Right now I am using h5py to perform all reading/writing of HDF5 files.
I would like to know the most computationally efficient way of working on chunked data. Right now, I am dedicating one process to be the writer, then looping over chunks of data: for each chunk I create a number of consumers, put the data in a queue, wait for the consumers to finish, then rinse and repeat until all of the images are processed. This means that every time the loop advances, it creates new consumer processes - I imagine there is some overhead in this. A sample of the code is below.
#!/usr/bin/env python
import time
import os
from multiprocessing import Process, Queue, JoinableQueue, cpu_count
import glob
import h5py
import numpy as np

'''Function definitions'''
# The consumer function takes data off of the Queue
def consumer(inqueue, output):
    # Run indefinitely
    while True:
        # If the queue is empty, queue.get() will block until the queue has data
        all_data = inqueue.get()
        if all_data:
            # n is the index corresponding to the projection location
            n, image_data = all_data
            # Replace zingers with the median, then average the stack.
            # Find the median for each pixel of the prefiltered image
            med = np.median(image_data, axis=0)
            # Loop through the image set
            for j in range(image_data.shape[0]):
                replicate = image_data[j, ...]
                mask = replicate - med > zinger_level   # zinger_level is a module-level threshold
                replicate[mask] = med[mask]             # Substitute with median
                image_data[j, ...] = replicate          # Put data back in place
            out = np.mean(image_data, axis=0, dtype=np.float32).astype(np.uint16)
            output.put((n, out))
        else:
            break

# Function for writing out the HDF5 file
def write_hdf(output, output_filename):
    # Write processed images into the pre-created output dataset
    while True:
        args = output.get()
        if args:
            i, data = args
            with h5py.File(output_filename, 'a') as fout:
                fout['Prefiltered_images'][i, ...] = data
        else:
            break

def fprocess_hdf_stack(hdf_filenames, output_filename):
    file_list = []
    for fname in hdf_filenames:
        file_list.append(h5py.File(fname, 'r'))
    # Process chunks of data so that we don't run out of memory
    totsize = file_list[0]['exchange']['data'].shape[0]
    data_shape = file_list[0]['exchange']['data'].shape
    # Create the output HDF5 file and dataset up front so the writer process can fill it in
    fout = h5py.File(output_filename, 'w')
    fout.create_dataset('Prefiltered_images', data_shape, dtype=np.uint16)
    fout.close()
    ints = list(range(totsize))   # list so a slice of it can be used for h5py fancy indexing
    chunkSize = 100
    # Initialize how many consumers we would like working
    num_consumers = cpu_count() * 2
    # Create the Queue objects
    inqueue = JoinableQueue()
    output = Queue()
    # Start the process for writing the HDF5 file
    proc = Process(target=write_hdf, args=(output, output_filename))
    proc.start()
    print("Loading %i images into memory..." % chunkSize)
    for i in range(0, totsize, chunkSize):
        time0 = time.time()
        chunk = ints[i:i + chunkSize]
        data_list = []
        # Make a list of the HDF5 datasets we are reading in
        for files in file_list:
            # shape is (angles, rows, columns)
            data_list.append(files['exchange']['data'][chunk, ...])
        data_list = np.asarray(data_list)
        print("Elapsed time to load images %i-%i is %0.2f minutes." % (chunk[0], chunk[-1], (time.time() - time0) / 60))
        consumers = []
        # Create consumer processes
        for i in range(num_consumers):
            p = Process(target=consumer, args=(inqueue, output))
            consumers.append(p)
            p.start()
        for n in range(data_list.shape[1]):
            # Feed data into the queue
            inqueue.put((chunk[n], data_list[:, n, ...]))
        # Kill all of the consumer processes when everything is finished
        for i in range(num_consumers):
            inqueue.put(None)
        for c in consumers:
            c.join()
        print("Elapsed time to process images %i-%i is %0.2f minutes." % (chunk[0], chunk[-1], (time.time() - time0) / 60))
        time.sleep(1)
    output.put(None)
    proc.join()
    # Close the input HDF5 files.
    for hdf_file in file_list:
        hdf_file.close()
    print("Input HDF5 files closed.")
    return

if __name__ == '__main__':
    start_time = time.time()
    # zinger_level, raw_images_dir, raw_images_basename and output_dir are set elsewhere in the real script
    raw_images_filenames = glob.glob(raw_images_dir + raw_images_basename)
    tempname = os.path.basename(raw_images_filenames[0]).split('.')[0]
    tempname_split = tempname.split('_')[:-1]
    output_filename = output_dir + '_'.join(tempname_split) + '_Prefiltered.hdf5'
    fprocess_hdf_stack(raw_images_filenames, output_filename)
    print("Elapsed time is %0.2f minutes" % ((time.time() - start_time) / 60))
I don't think my bottleneck is actually in loading the images. It is in initializing the consumers and carrying out the processing on the 5 images per frame number. I've played around with moving the creation of the consumer processes out of the for loop, but I don't know how to put a memory cap on this so that I don't run out of RAM. Thanks!
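To make the question concrete, this is roughly what I have in mind: a rough, untested sketch that reuses the consumer and write_hdf functions above and assumes the 'Prefiltered_images' dataset has already been created. The consumers are started once, and a bounded queue is meant to act as the memory cap. I'm not sure whether this is the right approach, or whether a Pool would be better.

def fprocess_hdf_stack_pooled(hdf_filenames, output_filename, chunkSize=100):
    # Bounded queues: put() blocks when full, which caps the RAM used for buffering
    inqueue = JoinableQueue(maxsize=50)
    output = Queue(maxsize=50)
    writer = Process(target=write_hdf, args=(output, output_filename))
    writer.start()
    # Start the consumers once, instead of once per chunk
    consumers = [Process(target=consumer, args=(inqueue, output))
                 for _ in range(cpu_count())]
    for c in consumers:
        c.start()
    file_list = [h5py.File(fname, 'r') for fname in hdf_filenames]
    totsize = file_list[0]['exchange']['data'].shape[0]
    for i in range(0, totsize, chunkSize):
        # Load one chunk from each input file: shape (5, chunkSize, rows, cols)
        data_list = np.asarray([f['exchange']['data'][i:i + chunkSize, ...]
                                for f in file_list])
        for n in range(data_list.shape[1]):
            inqueue.put((i + n, data_list[:, n, ...]))  # blocks if the consumers lag behind
    # Shut everything down in order: consumers first, then the writer
    for _ in consumers:
        inqueue.put(None)
    for c in consumers:
        c.join()
    output.put(None)
    writer.join()
    for f in file_list:
        f.close()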

Related

Tensorflow tf.data.Dataset takes too much time to generate dataset. Better way to optimize it?

I have .stem.mp4 files, each of which is composed of multiple audio sources.
The length of each file varies a lot, from 2 minutes to 6 minutes.
When I try to make a tf.data.Dataset out of them, generating an input_batch seems to take far longer than my model needs to make a prediction on a given batch.
Let me illustrate with an example.
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

sample_data = tf.random.normal((5, 755200, 2))  # 5 sources of audio, stereo channel
# The first axis is the mixture of the audio, so this is the input
# The remaining 4 are the separate sources (e.g. bass, drums, vocals, etc.), so these are the output
input_mixture = sample_data[0, :, :]
target_mixtures = sample_data[1:, :, :]
target_mixtures = np.column_stack(target_mixtures)

length = 44100 * 11  # I want to split these into lengths of 11 seconds
strides = 44100      # 1 second stride

ds_inp = tf.data.Dataset.from_tensor_slices((input_mixture))
ds_inp = ds_inp.window(length, shift=strides, drop_remainder=True)
ds_inp = ds_inp.flat_map(lambda windows: windows.batch(length))
ds_inp = ds_inp.map(lambda windows: windows, num_parallel_calls=tf.data.AUTOTUNE)

ds_tar = tf.data.Dataset.from_tensor_slices((target_mixtures))
ds_tar = ds_tar.window(length, shift=strides, drop_remainder=True)
ds_tar = ds_tar.flat_map(lambda windows: windows.batch(length))
ds_tar = ds_tar.map(lambda windows: windows, num_parallel_calls=tf.data.AUTOTUNE)

ds_total = [ds_inp, ds_tar]
total_ds = tf.data.Dataset.zip(tuple(ds_total))
total_ds = total_ds.batch(BATCH_SIZE)  # BATCH_SIZE is defined elsewhere
total_ds = total_ds.prefetch(tf.data.AUTOTUNE)
This is how I made a tf.data.Dataset from the given file.
And when I measure how long it takes to produce one input_batch and output_batch:
%%time
for i, j in total_ds.take(1):
    pass
# Wall time: 18.3 s
My model has about 100 million variables, but it has a fairly simple structure, so it takes about 6 seconds to generate a predicted_batch from a given input_batch.
So my problem is: is there any way to make it generate the input_batch and output_batch faster?
(My assumption is that, since this windows the given arrays, there is no better way to improve it.)
Obviously all of the files are big enough not to be cached.
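One variant that might be worth trying (only a sketch, assuming input_mixture, target_mixtures, length, strides and BATCH_SIZE are defined as above) is to zip the inputs and targets first and window them in a single pass, so the windowing work is not done twice and the no-op map calls can be dropped:

# Sketch: window inputs and targets together instead of building two windowed
# datasets and zipping them afterwards.
ds = tf.data.Dataset.from_tensor_slices((input_mixture, target_mixtures))
ds = ds.window(length, shift=strides, drop_remainder=True)
# Each element is now a pair of window-datasets; batch them back into tensors
ds = ds.flat_map(lambda x, y: tf.data.Dataset.zip((x.batch(length), y.batch(length))))
ds = ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)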

Numpy memmap throttles with Pytorch Dataloader when available RAM less than file size

I'm working on a dataset that is too big to fit into RAM. The solution I'm trying currently is to use numpy memmap to load one sample/row at a time using Dataloader. The solution looks something like this:
import numpy as np
import torch
from torch.utils.data import DataLoader

class MMDataset(torch.utils.data.Dataset):
    def __init__(self, path):
        self.file_path = path
        self.dataset_len = 44000000
        self.bytes_per_value = 32 / 8
        self.num_cols = 512
        self.num_rows = 1

    def __getitem__(self, index):
        # Map a single 512-float row at the right byte offset
        x = np.memmap(self.file_path, dtype='float32', mode='r',
                      shape=(self.num_rows, self.num_cols),
                      offset=int(index * self.num_cols * self.bytes_per_value))
        return np.array(x)

    def __len__(self):
        return self.dataset_len

dataset = MMDataset('./data/emb.memmap')
data_loader = DataLoader(
    dataset,
    batch_size=4096,
    shuffle=True,
    num_workers=20
)
When the amount of RAM available is greater than the size of the memmap file, the data loading is fast. I get around 60 batches/second. However, when the RAM available is less than the size of the memmap file, I get around 3 batches/second.
I discovered this when trying various sizes for the memmap file.
Why is this the case? If Dataloader + memmap is going to throttle when available RAM < memmap file size, this defeats the point of the solution.
I've observed that disk i/o is at 500MB/s read constantly when available RAM < memmap file size. This is much higher than the theoretical amount of reading required to load a batch of 4096 samples (closer to 8MB/s).
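For what it's worth, one idea (a sketch of a workaround, not the code from this question) is to make each dataset item a contiguous block of rows rather than a single row, so that a shuffled epoch still does large sequential reads instead of millions of scattered 2 KB reads:

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class BlockDataset(Dataset):
    """Each item is a contiguous block of rows read from the memmap in one go."""
    def __init__(self, path, num_rows=44000000, num_cols=512, block_size=4096):
        self.mm = np.memmap(path, dtype='float32', mode='r', shape=(num_rows, num_cols))
        self.block_size = block_size
        self.num_blocks = num_rows // block_size

    def __len__(self):
        return self.num_blocks

    def __getitem__(self, index):
        start = index * self.block_size
        # One sequential read; np.array() copies so the batch is a plain in-memory array
        return torch.from_numpy(np.array(self.mm[start:start + self.block_size]))

# Shuffling the block order still randomises the data at a coarse level
loader = DataLoader(BlockDataset('./data/emb.memmap'), batch_size=None,
                    shuffle=True, num_workers=4)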

Processing a huge file (>30GB) in Python

I need to process a huge file of around 30GB containing hundreds of millions of rows. More precisely, I want to perform the three following steps:
Reading the file by chunks: given the size of the file, I don't have the memory to read the file in one go;
Computing stuff on the chunks before aggregating each of them to a more manageable size;
Concatenating the aggregated chunks into a final dataset containing the results of my analyses.
So far, I have coded two threads:
One thread in charge of reading the file by chunks and storing the chunks in a Queue (step 1);
One thread in charge of performing the analyses (step 2) on the chunks;
Here is the spirit of my code so far with dummy data:
import queue
from queue import Empty
import threading
import concurrent.futures
import os
import random
import pandas as pd
import time

def process_chunk(df):
    return df.groupby(["Category"])["Value"].sum().reset_index(drop=False)

def producer(queue, event):
    print("Producer: Reading the file by chunks")
    reader = pd.read_table(full_path, sep=";", chunksize=10000, names=["Row", "Category", "Value"])
    for index, chunk in enumerate(reader):
        print(f"Producer: Adding chunk #{index} to the queue")
        queue.put((index, chunk))
        time.sleep(0.2)
    print("Producer: Finished putting chunks")
    event.set()
    print("Producer: Event set")

def consumer(queue, event, result_list):
    # The consumer stops iff queue is empty AND event is set
    # <=> The consumer keeps going iff queue is not empty OR event is not set
    while not queue.empty() or not event.is_set():
        try:
            index, chunk = queue.get(timeout=1)
        except Empty:
            # The parameter name shadows the queue module, so the exception
            # class is imported at the top instead of using queue.Empty here
            continue
        print(f"Consumer: Retrieved chunk #{index}")
        print(f"Consumer: Queue size {queue.qsize()}")
        result_list.append(process_chunk(chunk))
        time.sleep(0.1)
    print("Consumer: Finished retrieving chunks")

if __name__ == "__main__":
    # Record the execution time
    start = time.perf_counter()
    # Generate a fake file in the current directory if necessary
    path = os.path.dirname(os.path.realpath(__file__))
    filename = "fake_file.txt"
    full_path = os.path.join(path, filename)
    if not os.path.exists(full_path):
        print("Main: Generate a dummy dataset")
        with open(full_path, "w", encoding="utf-8") as f:
            for i in range(100000):
                value = random.randint(1, 101)
                category = i % 2
                f.write(f"{i+1};{value};{category}\n")
    # Defining a queue that will store the chunks of the file read by the Producer
    queue = queue.Queue(maxsize=5)
    # Defining an event that will be set by the Producer when he is done
    event = threading.Event()
    # Defining a list storing the chunks processed by the Consumer
    result_list = list()
    # Launch the threads Producer and Consumer
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        executor.submit(producer, queue, event)
        executor.submit(consumer, queue, event, result_list)
    # Display that the program is finished
    print("Main: Consumer & Producer have finished!")
    print(f"Main: Number of processed chunks = {len(result_list)}")
    print(f"Main: Execution time = {time.perf_counter()-start} seconds")
I know that each iteration of step 1 takes more time than each iteration of step 2, i.e. the Consumer will always be waiting for the Producer.
How can I speed up the process of reading my file by chunks (step 1)?
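One option that might speed up step 1 (a sketch only; it uses pyarrow, which is not part of the question, and assumes full_path points at the semicolon-separated file from above) is to let pyarrow's multi-threaded CSV reader do the parsing and stream the batches:

import pyarrow.csv as pv
import pandas as pd

read_opts = pv.ReadOptions(column_names=["Row", "Category", "Value"],
                           block_size=64 << 20)   # ~64 MB parsed per batch
parse_opts = pv.ParseOptions(delimiter=";")

partials = []
reader = pv.open_csv(full_path, read_options=read_opts, parse_options=parse_opts)
for batch in reader:                               # streams the file batch by batch
    df = batch.to_pandas()
    partials.append(df.groupby("Category")["Value"].sum())
# Combine the per-batch aggregates into the final result
result = pd.concat(partials).groupby(level=0).sum().reset_index()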

Convert PCM wave data to numpy arrays and vice versa

The situation
I am using VAD (Voice Activity Detection) from WebRTC by using WebRTC-VAD, a Python adapter. The example implementation from the GitHub repo uses Python's wave module to read PCM data from files. Note that according to the comments the module only works with mono audio and a sampling rate of either 8000, 16000 or 32000 Hz.
What I want to do
Read audio data from arbitrary audio files (MP3 and WAV files) with different sampling rates, convert it into the PCM representation that WebRTC-VAD uses, apply WebRTC-VAD to detect voice activity, and finally process the result by producing Numpy arrays again from the PCM data, because they are easiest to work with when using Librosa.
My problem
The WebRTC-VAD module only works correctly when using the wave module. This module returns PCM data as bytes objects. It does not work when feeding it Numpy arrays that have been obtained e.g. by using librosa.load(...). I have not found a way to convert between the two representations.
What I have done so far
I have written the following functions to read audio data from audio files and automatically convert them:
Generic function to read/convert any audio data with Librosa (--> returns Numpy array):
import librosa

def read_audio(file_path, sample_rate=None, mono=False):
    return librosa.load(file_path, sr=sample_rate, mono=mono)
Functions to read arbitrary data as PCM data (--> returns bytes):
import wave
from os import remove

import librosa
import soundfile as sf

def read_audio_vad(file_path):
    audio, rate = librosa.load(file_path, sr=16000, mono=True)
    tmp_file = 'tmp.wav'
    sf.write(tmp_file, audio, rate, subtype='PCM_16')
    audio, rate = read_pcm16_wave(tmp_file)
    remove(tmp_file)
    return audio, rate

def read_pcm16_wave(file_path):
    with wave.open(file_path, 'rb') as wf:
        sample_rate = wf.getframerate()
        pcm_data = wf.readframes(wf.getnframes())
    return pcm_data, sample_rate
As you can see I am making a detour by reading/converting the audio data with librosa first. This is needed so I can read from MP3 files or WAV files with arbitrary encodings and automatically resample it to 16kHz mono with Librosa. I am then writing to a temporary file. Before deleting the file, I read the contents out again, but this time using the wave module. This gives me the PCM data.
I now have the following code to extract the voice activity and produce Numpy arrays:
def webrtc_voice(audio, rate):
    voiced_frames = webrtc_split(audio, rate)
    tmp_file = 'tmp.wav'
    for frames in voiced_frames:
        voice_audio = b''.join([f.bytes for f in frames])
        write_pcm16_wave(tmp_file, voice_audio, rate)
        voice_audio, rate = read_audio(tmp_file)
        remove(tmp_file)
        start_time = frames[0].timestamp
        end_time = (frames[-1].timestamp + frames[-1].duration)
        start_frame = int(round(start_time * rate / 1e3))
        end_frame = int(round(end_time * rate / 1e3))
        yield voice_audio, rate, start_frame, end_frame

def write_pcm16_wave(path, audio, sample_rate):
    with wave.open(path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)
As you can see, I am taking the detour over a temporary file again to write the PCM data first and then read the temporary file out again with Librosa to get a Numpy array. The webrtc_split function is the implementation from the example, with only a few minor changes. For completeness' sake I am posting it here:
import collections
from webrtcvad import Vad

def webrtc_split(audio, rate, aggressiveness=3, frame_duration_ms=30, padding_duration_ms=300):
    vad = Vad(aggressiveness)
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    voiced_frames = []
    for frame in generate_frames(audio, rate):
        is_speech = vad.is_speech(frame.bytes, rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield voiced_frames
                ring_buffer.clear()
                voiced_frames = []
    if voiced_frames:
        yield voiced_frames

class Frame(object):
    """
    Object holding the audio signal of a fixed time interval (30 ms) inside a long audio signal
    """
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def generate_frames(audio, sample_rate, frame_duration_ms=30):
    frame_length = int(sample_rate * frame_duration_ms / 1000) * 2
    offset = 0
    timestamp = 0.0
    duration = (float(frame_length) / sample_rate)
    while offset + frame_length < len(audio):
        yield Frame(audio[offset:offset + frame_length], timestamp, duration)
        timestamp += duration
        offset += frame_length
My question
My implementation with writing/reading temporary files with the wave module and reading/writing these files with Librosa to get Numpy Arrays seems overly complicated to me. However, despite spending a whole day on the matter I did not find a way to convert directly between the two encodings. I admit I don't fully understand all the details of PCM and WAVE files, the impact of using 16/24/32-Bit for PCM data or the endianness. I hope my explanations above are detailed enough and not too much. Is there an easier way to convert between the two representations in-memory?
It seems that WebRTC-VAD, and the Python wrapper, py-webrtcvad, expects the audio data to be 16bit PCM little-endian - as is the most common storage format in WAV files.
librosa and its underlying I/O library pysoundfile, however, always return floating-point arrays in the range [-1.0, 1.0]. To convert this to bytes containing 16-bit PCM you can use the following float_to_pcm16 function.
def float_to_pcm16(audio):
    import numpy

    ints = (audio * 32767).astype(numpy.int16)
    little_endian = ints.astype('<u2')
    buf = little_endian.tobytes()   # tostring() is deprecated in favour of tobytes()
    return buf

def read_pcm16(path):
    import soundfile

    audio, sample_rate = soundfile.read(path)
    assert sample_rate in (8000, 16000, 32000, 48000)
    pcm_data = float_to_pcm16(audio)
    return pcm_data, sample_rate
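For the opposite direction (not part of the answer above, but it follows from the same 16-bit scaling), the PCM bytes can be reinterpreted as little-endian int16 samples and rescaled to the [-1.0, 1.0] floating-point range that librosa and pysoundfile use:

def pcm16_to_float(pcm_data):
    import numpy

    # Reinterpret the raw little-endian 16-bit samples, then scale back to [-1.0, 1.0]
    ints = numpy.frombuffer(pcm_data, dtype='<i2')
    return ints.astype(numpy.float32) / 32768.0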

Example pipeline for TFRecords with chunking for long input sequences

I'm trying to optimise the input pipeline for a model I am using that uses GRUs. The data consists of a large number of files that contain time series of length 5000 with dimensionality of 50. I know that it isn't feasible to feed a single sequence of length 5000 into an RNN owing to the vanishing gradient, and you should instead try to chunk it into (5000-seq_len) overlapping chunks, where seq_len is a more manageable length, say 200 timesteps.
The most obvious method for getting this to work with TFRecords/SequenceExamples is to simply have each chunk included as a new SequenceExample within the same file. This seems massively inefficient however, as the majority of data in the resulting TFRecords file will be duplicate data.
Is there a better method of doing this? I've seen very few examples of how to use TFRecords that don't involve images, and no examples that use non-trivial sequence lengths!
For example:
import numpy as np
import tensorflow as tf

def chunk_save_tfrecords(X, file_path_prefix, seq_length):
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    with tf.python_io.TFRecordWriter(result_tf_file) as writer:
        # Chunk the data
        for i in range(int(X.shape[0] - seq_length)):
            chunk = X[i:i + seq_length]
            data_features = [
                tf.train.Feature(
                    float_list=tf.train.FloatList(value=chunk[t]))
                for t in range(seq_length)]  # FloatList per timestep
            feature_lists = tf.train.FeatureLists(
                feature_list={
                    'data': tf.train.FeatureList(feature=data_features)})
            serialized = tf.train.SequenceExample(
                feature_lists=feature_lists).SerializeToString()
            writer.write(serialized)

def save_tfrecords(X, file_path_prefix):
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    with tf.python_io.TFRecordWriter(result_tf_file) as writer:
        data_features = [
            tf.train.Feature(
                float_list=tf.train.FloatList(value=X[t]))
            for t in range(X.shape[0])]  # FloatList per timestep
        feature_lists = tf.train.FeatureLists(
            feature_list={
                'data': tf.train.FeatureList(feature=data_features)})
        serialized = tf.train.SequenceExample(
            feature_lists=feature_lists).SerializeToString()
        writer.write(serialized)

test = np.random.randn(5000, 50)
save_tfrecords(test, 'test')
chunk_save_tfrecords(test, 'test_chunk', 200)
save_tfrecords creates a 1MB file, while chunk_save_tfrecords creates a 200MB file!
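One alternative worth sketching (untested; it assumes the tf.data and tf.io reading APIs rather than the tf.python_io writer used above) is to store each full series only once with save_tfrecords and create the overlapping chunks at read time, so the file stays at ~1 MB while the model still sees seq_length-sized windows:

def parse_series(serialized):
    # Parse one SequenceExample back into a (5000, 50) float tensor
    _, sequence = tf.io.parse_single_sequence_example(
        serialized,
        sequence_features={'data': tf.io.FixedLenSequenceFeature([50], tf.float32)})
    return sequence['data']

def to_chunks(series, seq_length=200):
    # Build the overlapping windows on the fly instead of storing them
    ds = tf.data.Dataset.from_tensor_slices(series)          # one element per timestep
    ds = ds.window(seq_length, shift=1, drop_remainder=True)
    return ds.flat_map(lambda w: w.batch(seq_length))        # elements of shape (seq_length, 50)

dataset = (tf.data.TFRecordDataset(['test.tfrecords'])
           .map(parse_series)
           .flat_map(to_chunks)
           .batch(32))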