Cannot store an array using dask - numpy

I am using the following code to create an array and and store the the results sequentially in a hdf5 format. I was checking out the dask documentation, and the suggested to use dask.store to store the arrays generated in a function like mine. However I receive an error: dask has no attribute store
My code:
import os
import numpy as np
import time
import concurrent.futures
import multiprocessing
from itertools import product
import h5py
import dask as da
def mean_py(array):
start_time = time.time()
x = array.shape[1]
y = array.shape[2]
values = np.empty((x,y), type(array[0][0][0]))
for i in range(x):
for j in range(y):
values[i][j] = ((np.mean(array[:,i,j])))
end_time = time.time()
hours, rem = divmod(end_time-start_time, 3600)
minutes, seconds = divmod(rem,60)
print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), int(seconds)))
print(f"{'.'*80}")
return values
def generate_random_array():
a = np.random.randn(120560400).reshape(10980,10980)
return a
def generate_array(nums):
for num in range(nums):
a = generate_random_array()
f = h5py.File('test_db.hdf5')
d = f.require_dataset('/data', shape=a.shape, dtype=a.dtype)
da.store(a, d)
start = time.time()
generate_array(8)
end = time.time()
print(f'\nTime complete: {end-start:.2f}s\n')
Should I use dask for such a a task, or do you recommend to store the results using h5py directly?
Please Ignore the mean_py(array) function. It's for something I want to try out once the data has been produced.

As suggested in the comments, you're currently doing this
import dask as da
When you probably meant to do this
import dask.array as da

Related

ThreadPoolExecutor DataFrame

I am dealing with a simple loop.
I have a slightly larger dataframe and I would like to use the processor (currently 2%).
I tried this:
import pandas as pd
import numpy as np
import time
from concurrent.futures import ThreadPoolExecutor
scan = pd.DataFrame([[0,2,3,5],[4,2,7,7], [5,6,2,3]], columns=['st1','nd1','st2','nd2'])
def task(value):
calc_all = pd.DataFrame()
for i in range(0,3,2):
j=i+1
calc = pd.concat([pd.DataFrame(scan.iloc[:,i]), pd.DataFrame(scan.iloc[:,j])],axis=1)
calc['th'] = calc.iloc[:,0] + calc.iloc[:,1]
calc_all = pd.concat([calc_all, calc], axis=1)
time.sleep(1) #tested time
return calc_all
if __name__ == '__main__':
with ThreadPoolExecutor(2) as exe:
for result in exe.map(task, range(2)):
print(result)
It's not faster. What did I do wrong?

Python Dill or Pickle gives error when use in a new file

I need help with my code. I have built a recommendation system using cosine similarity on a colab and used pickle to serialized it. when I deserialized it inside a colab file, it works perfectly fine but when I deserialize it in a new colab file. it gives me an error
name 'data' is not defined
data is a variable that is initialized with my dataset which is outside of the class InstaPost.
import pandas as pd
import numpy as np
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import dill as pickle
data = pd.read_csv("/content/instaData.txt")
data
data = data[["Caption", "Hashtags"]]
captions = data["Caption"].tolist()
uni_tfidf = text.TfidfVectorizer(input=captions, stop_words="english")
uni_matrix = uni_tfidf.fit_transform(captions)
uni_sim = cosine_similarity(uni_matrix)
def recommend_post(x):
return ", ".join(data["Caption"].loc[x.argsort()[-7:-1]])
data["Recommended Post"] = [recommend_post(x) for x in uni_sim]
class InstaPost:
def Post(number):
count = 0
wordy = (data["Recommended Post"][number])
sentence = wordy.split(',')
for i in sentence:
count=count+1
print(count," ",i)
obj = InstaPost
obj.Post(1)
pickle_out = open("modelREC", "wb")
pickle.dump(obj, pickle_out)
pickle_out.close()
pickle_in = open("modelREC", "rb")
exe = pickle.load(pickle_in)
print(exe.Post(10))
NOTE: on a different file
print(exe.Post)
works
and give output
<function InstaPost.Post at 0x7efc0b4c3f70>
if I need to give the reference of the data than please guide me how should I do it. It will be a great help to me
Thanks in advance

Increase of multiprocess time

I defined a multiprocess script to improve an image analysis. It seems work well but i tried to do several tests in order to define the best processes number.
It consists in varying this processes number. And as there is some dispersion, i add a loop in order to repeat one hundred times my test.
But during the process, time increases significantly. What could be the origin of my problem? Have I to flush memory? but it seems to be no saturation.
A piece of my code :
from multiprocessing import Process, current_process
import multiprocessing
import glob as glob
import matplotlib.pyplot as plt
from skimage import io
import time
import sys
import numpy as np
import numpy.ma as ma
import gc
import os
from PIL import Image
from skimage import exposure
import cv2
Path_input = "E:\\test\\raw\\"
Path_output = "E:\\test\\"
Img_list = glob.glob((Path_input + 'Test_*.tif' ))[:]
size_y,size_x = io.imread(Img_list[0]).shape
#Function for the multi process
def Ajustement(x):
#image reading
img = plt.imread(Img_list[x])
#create a CLAHE object
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
cl1 = clahe.apply(img_rescale.astype(np.uint16))
cv2.imwrite(Path_output+ '\\Ajusted' + "%05d" % x + '.tif',cl1)
return 'Ajustement OK!'
#create strings list of process
cpu_max = 10
list = ['Process_']*cpu_max
list_process =[]
counter = 1
for u in list:
list_process.append(list[counter-1]+np.str(counter))
counter = counter+1
get_timer = time.clock if sys.platform == "win32" else time.time
time_store = []
time_process = []
if __name__ == '__main__':
range_adjusted = np.arange(0,len(Img_list),cpu_max)
m=0
for m in range(0,100,1): #loop for obtain a mean time for the process
print m
timer = get_timer() # time measuring starts now
for i in range_adjusted:
o = 0
for item in list_process[:cpu_max]: #process creation
globals()[item] = Process(name ='worker1', target=Normalization_and_ajustement, args=(i+o,))
o=o+1
o = 0
for item in list_process[:cpu_max]: #process start
globals()[item].start()
o=o+1
o = 0
for item in list_process[:cpu_max]: #process join
globals()[item].join()
o=o+1
if i == range_adjusted.max():
print("Normalization and Equalization finished")
timer = get_timer() - timer # get delta time as soon as it finishes
time_store.append(timer)
time_process.append(timer/cpu_max)
np.savetxt(Path_output + 'time_tot_normalization.txt',time_store)
np.savetxt(Path_output + 'time_process_normalization.txt',time_process)
print("\tTotal: {:.2f} seconds".format(timer))
print("\tAvg. per process: {:.2f} seconds".format(timer/cpu_max))
m=m+1
I think it was due to a memory leak . Indeed i added gc.collect() command after each loop and the problem was solved.

Tensorflow - Read and Save TFRecords to Dict and Use Multiprocessing

I am trying to speed up the conversion of select tfrecords to a series of python dictionaries. Here's what I have. Initially the CPU utilization spikes, but then goes to almost zero, suggesting my code is not working correctly.
My goal is to have 3 dictionaries saved and pickled. There are 14,000+ tfrecord files (2 gigs appx). At the current rate, it will take about 84 hours to run on a single process.
Are there any problems with my use of manage dicts
import glob
import tensorflow as tf
import cPickle
import numpy as np
from tqdm import tqdm
import collections
from multiprocessing import Process, Manager, Pool
def get_multihot_encoding(example_label):
enc = np.zeros(10)
for label in example_label:
if label in lookup.values():
index = lookup_inverted[label]
enc[index] = 1
return list(enc)
# Set-up MultiProcessing
manager = Manager()
audio_embeddings_dict = manager.dict()
audio_labels_dict = manager.dict()
audio_multihot_dict = manager.dict()
sess = tf.Session()
# The iterable which gets passed to the function
all_tfrecord_filenames = glob.glob('/Users/jeff/features/audioset_v1_embeddings/unbal_train/*.tfrecord')
def process_tfrecord(tfrecord):
for idx, example in enumerate(tf.python_io.tf_record_iterator(tfrecord)):
tf_example = tf.train.Example.FromString(example)
vid_id = tf_example.features.feature['video_id'].bytes_list.value[0].decode(encoding='UTF-8')
example_label = list(np.asarray(tf_example.features.feature['labels'].int64_list.value))
# Non zero intersect of 2 sets is True - only create dict entries if this is true!
if set(example_label) & label_filters:
print(set(example_label) & label_filters, " Is the intersection of the two")
tf_seq_example = tf.train.SequenceExample.FromString(example)
n_frames = len(tf_seq_example.feature_lists.feature_list['audio_embedding'].feature)
audio_frame = []
for i in range(n_frames):
audio_frame.append(tf.cast(tf.decode_raw(
tf_seq_example.feature_lists.feature_list['audio_embedding'].feature[i].bytes_list.value[0],tf.uint8)
,tf.float32).eval(session=sess))
audio_embeddings_dict[vid_id] = audio_frame
audio_labels_dict[vid_id] = example_label
audio_multihot_dict[vid_id] = get_multihot_encoding(example_label)
#print(get_multihot_encoding(example_label), "Is the encoded label")
if idx % 100 == 0:
print ("Saving dictionary at loop: {}".format(idx))
cPickle.dump(audio_embeddings_dict, open('audio_embeddings_dict_unbal_train_multi_{}.pkl'.format(idx), 'wb'))
cPickle.dump(audio_multihot_dict, open('audio_multihot_dict_bal_untrain_multi_{}.pkl'.format(idx), 'wb'))
cPickle.dump(audio_multihot_dict, open('audio_labels_unbal_dict_multi_{}.pkl'.format(idx), 'wb'))
pool = Pool(50)
result = pool.map(process_tfrecord, all_tfrecord_filenames)

How do I enable the REFS_OK flag in nditer in numpy in Python 3.3?

Does anyone know how one goes about enabling the REFS_OK flag in numpy? I cannot seem to find a clear explanation online.
My code is:
import sys
import string
import numpy as np
import pandas as pd
SNP_df = pd.read_csv('SNPs.txt',sep='\t',index_col = None ,header = None,nrows = 101)
output = open('100 SNPs.fa','a')
for i in SNP_df:
data = SNP_df[i]
data = np.array(data)
for j in np.nditer(data):
if j == 0:
output.write(("\n>%s\n")%(str(data(j))))
else:
output.write(data(j))
I keep getting the error message: Iterator operand or requested dtype holds references, but the REFS_OK was not enabled.
I cannot work out how to enable the REFS_OK flag so the program can continue...
I have isolated the problem. There is no need to use np.nditer. The main problem was with me misinterpreting how Python would read iterator variables in a for loop. The corrected code is below.
import sys
import string
import fileinput
import numpy as np
SNP_df = pd.read_csv('datafile.txt',sep='\t',index_col = None ,header = None,nrows = 5000)
output = open('outputFile.fa','a')
for i in range(1,51):
data = SNP_df[i]
data = np.array(data)
for j in range(0,1):
output.write(("\n>%s\n")%(str(data[j])))
for k in range(1,len(data)):
output.write(str(data[k]))
If you really want to enable the flag, I have an working example.
Python 2.7, numpy 1.14.2, pandas 0.22.0
import pandas as pd
import numpy as np
# get all data as panda DataFrame
data = pd.read_csv("./monthdata.csv")
print(data)
# get values as numpy array
data_ar = data.values # numpy.ndarray, every element is a row
for row in data_ar:
print(row)
sum = 0
count = 0
for month in np.nditer(row, flags=["refs_OK"], op_flags=["readwrite"]):
print month