AttributeError: 'list' object has no attribute 'reshape'. for big frames - numpy

this is my code. and i don't know what is the problem. the my goal is read more number frames and then convert them to array and finally split them to train and test data.
def read_frames(PATH, num1, num2,seq_len,count,num):
directory = os.listdir(PATH)
for i in range(num1,num2):
source_folder = PATH + '/' +'{}'.format(directory[i])
print("appending the images is started for:", directory[i])
for item in sorted(os.listdir(source_folder), key = len):
img = Image.open(os.path.join(source_folder, item))
frames.append(item)
x.append(np.asarray(img))
count+=1
if count == seq_len:
break
this is a function that read the frames from the path and convert them to array. seq_len is the number of frames.i read the frames in 3 parts. 2 parts using the function that mentioned and other part is:
seq_len = 1624
frames = []
x = []
y = []
w=[]
count = 0
num = 0
PATH = ""
directory = os.listdir(PATH)
print("#.................reading and appending the frames for PART1 are started...........#","\n")
for i in range(0, len(directory)):
source_folder = "PATH/{}".format(directory[i])
print(num)
# print(source_folder)
print("appending the images is started for:", directory[i])
for item in sorted(os.listdir(source_folder), key = len):
# print(item)
# print(count)
img = Image.open(os.path.join(source_folder, item))
frames.append(item)
x.append(np.asarray(img))
count+=1
if count == seq_len:
break
the total number of frames that reading from 3 parts are 333060.
when running this cell:
w = np.array(x)
print('w_shape: ', np.array(w, dtype='uint8').shape)
i expected that the output be (333060,224,224,3) but it is not. and i face this error.
ValueError: setting an array element with a sequence.
when the number of frames are not more this error doesn't happening and when the number of frames are more i face with that error. i using this frames for my pre-trained cnn + lstm network.
please help me to solve that.

Related

Operands Could not be Broadcast with Shapes (19,)(0,) -- KNN

I am working on how to use KNN to predict a rating for a movie. I use a video and a book to teach myself how to go about it
I tried to run the code I found in the book but it gave me error message. I googled the error message so as to understand it and fix my problem but I don't think I know how to adapt the solutions to my problem.
import numpy as np
import pandas as pd
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('C:/Users/dell/Downloads/DataScience/DataScience-Python3/ml-100k/u.data', sep='\t', engine='python', names=r_cols, usecols=range(3)) # please enter your file path here. The file is u.data
print(ratings.head())
movieProperties = ratings.groupby('movie_id').agg({'rating': [np.size, np.mean]})
print(movieProperties.head())
movieNumRatings = pd.DataFrame(movieProperties['rating']['size'])
movieNormalizedNumRatings = movieNumRatings.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
print(movieNormalizedNumRatings.head())
movieDict = {}
with open('C:/Users/dell/Downloads/DataScience/DataScience-Python3/ml-100k/u.item') as f: # The file is u.item
temp = ''
for line in f:
fields = line.rstrip('\n').split('|')
movieID = int(fields[0])
name = fields[1]
genres = fields[5:25]
genres = map(int, genres)
movieDict[movieID] = (name, genres, movieNormalizedNumRatings.loc[movieID].get('size'), movieProperties.loc[movieID].rating.get('mean'))
print(movieDict[1])
from scipy import spatial
def ComputeDistance(a, b):
genresA = np.array(list(a[1]))
genresB = np.array(list(b[1]))
genreDistance = spatial.distance.cosine(genresA, genresB)
popularityA = np.array(a[2])
popularityB = np.array(b[2])
popularityDistance = abs(popularityA - popularityB)
return genreDistance + popularityDistance
print(ComputeDistance(movieDict[2], movieDict[4]))
import operator
def getNeighbors(movieID, K):
distances = []
for movie in movieDict:
if (movie != movieID):
dist = ComputeDistance(movieDict[movieID], movieDict[movie])
distances.append((movie, dist))
distances.sort(key=operator.itemgetter(1))
neighbors = []
for x in range(K):
neighbors.append(distance[x][0])
return neighbors
K = 10
avgRating = 0
neighbors = getNeighbors(1, K)
I got this error message from PowerShell:
Traceback(most recent call last):
neighbors = getNeighbors(1, K)
dist = ComputeDistance(movieDict[movieID], movieDict[movie])
genreDistance = spatial.distance.cosine(genresA, genresB)
return correlation(u, v, w=w, centered=False)
uv = np.average(u*v, weights=w)
ValueError: operands could not be broadcast together with shape (19,)(0,)
I got this error message when I tried to debug the problem from ipython terminal:
c:\programdata\anaconda3\lib\site-packages\scipy\spatial\distance.py(695)correlation()
693 u = u - umu
694 v = v - vmu
---> 695 uv = np.average(u*v, weights=w)
696 uu = np.average(np.square(u), weights=w)
697 vv = np.average(np.square(v), weights=w)
**Note**: The code ran fine and produced results up until *print(Cprint(ComputeDistance(movieDict[2], movieDict[4]))*
My guess is the problem is with this part of the code:
import operator
def getNeighbors(movieID, K):
distances = []
for movie in movieDict:
if (movie != movieID):
dist = ComputeDistance(movieDict[movieID], movieDict[movie])
distances.append((movie, dist))
distances.sort(key=operator.itemgetter(1))
neighbors = []
for x in range(K):
neighbors.append(distance[x][0])
return neighbors
K = 10
avgRating = 0
neighbors = getNeighbors(1, K)
The code can be found in this link: https://hendra-herviawan.github.io/Movie-Recommendation-based-on-KNN-K-Nearest-Neighbors.html
The error of "operands could not be broadcast together with shape (x,)(y,)" usually raises when you are trying to perform an operation between two arrays that must have the same shape but they don't. In your case you are trying to take an weighted average between two arrays u and v. The arrays u and v don't have the length.
I saw that you parsing a movies list by splitting the lines with the "|" character and then storing these results in a dictionary. Probably this file or its division with "|" are returning different results.
The error log shows that the second array doesn't have any element, this could be generated by an empty line on the movies files.

How to print the labels of a tensor flow dataset?

I would like to know what are the available labels in a particular dataset. In the code i know the labels , but I want it to be printed from the dataset assuming if I don't know all the labels. is there a way to do that?
I couldn't find a solution for this in web.
splits = tfds.Split.ALL.subsplit(weighted=(70,30))
(training_set, validation_set),dataset_info = tfds.load('tf_flowers', with_info = True , as_supervised = True,split = splits)
num_classes = dataset_info.features['label'].num_classes
num_training_examples = 0
num_validation_examples = 0
for example in training_set:
num_training_examples += 1
for example in validation_set:
num_validation_examples += 1
print('Total Number of Classes: {}'.format(num_classes))
print('Total Number of Training Images: {}'.format(num_training_examples))
print('Total Number of Validation Images: {} \n'.format(num_validation_examples))
class_names = np.array(dataset_info.features['label'].names)

How to build the input pipeline for a Siamese Network in Tensorflow?

Currently, I am trying to implement the experiment in the paper: Siamese Neural Networks for One-shot Image Recognition using Tensorflow.
The image set is Omniglot, in which each image can be loaded as an [105,105,1] array.
Since the input of Siamese network is a pair of images with same-or-different class, I need to preprocess the dataset as follows.
I transfer the Omniglot dataset into a [n,20,105,105,1] numpy array, where n represents the number of classes, in which each class has 20 examples of images of size [105,105,1].
Then I implement a function to return one pair of images:
def get_example(dataset):
"""
get one pair of images
:param dataset: the set, eg. training set
:return: when label is 1, return a concatenated array of two imgs from same character
when label is 0, return a concatenated array of two imgs from different characters
"""
# randint(0, x) generates 1 random numbers from 0 ~ x
set_upper = len(dataset)
set_lower = 0
# sample(range(0, 20), 2) generates 2 random numbers from 0 ~ 19
char_upper = 20
char_lower = 0
label = randint(0, 1)
if label:
# randomly select one character from the set
char = randint(set_lower, set_upper-1)
rand_char = dataset[char]
# randomly select two different images from the character
a = b = 0
while a == b:
a, b = sample(range(char_lower, char_upper), 2)
img_a = rand_char[a]
img_b = rand_char[b]
else:
# randomly select two characters from the set
c1, c2 = sample(range(set_lower, set_upper), 2)
rand_char1 = dataset[c1]
rand_char2 = dataset[c2]
# randomly select two images from two characters
a, b = sample(range(char_lower, char_upper), 2)
img_a = rand_char1[a]
img_b = rand_char2[b]
img_input = np.concatenate((img_a, img_b), axis=0)
img_input = img_input[..., newaxis]
return img_input, label
So here is my question, how to group the images into batches, and how to feed them into the model in Tensorflow?
You should be able to create a dataset as described in https://www.tensorflow.org/guide/datasets#consuming_numpy_arrays and use standard tf.data.Dataset operations like shuffle and batch to achieve your goal.

How to prepare my own data for tensorflow?

I install Tensorflow on ubuntu 14.04. I completed MNIST For ML Beginners tutorial. I understood it.
Nor, I try to use my own data. I have train datas as T[1000][10]. Labels are L[2], 1 or 0.
How can I access my data mnist.train.images ?
In input_data.py, these two functions do the main job.
1. Download
def maybe_download(filename, work_directory):
"""Download the data from Yann's website, unless it's already here."""
if not os.path.exists(work_directory):
os.mkdir(work_directory)
filepath = os.path.join(work_directory, filename)
if not os.path.exists(filepath):
filepath, _ = urlretrieve(SOURCE_URL + filename, filepath)
statinfo = os.stat(filepath)
print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
return filepath
2 Image to nparray
def extract_images(filename):
"""Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
print('Extracting', filename)
with gzip.open(filename) as bytestream:
magic = _read32(bytestream)
if magic != 2051:
raise ValueError(
'Invalid magic number %d in MNIST image file: %s' %
(magic, filename))
num_images = _read32(bytestream)
rows = _read32(bytestream)
cols = _read32(bytestream)
buf = bytestream.read(rows * cols * num_images)
data = numpy.frombuffer(buf, dtype=numpy.uint8)
data = data.reshape(num_images, rows, cols, 1)
return data
Based on your dataset and location, you can call:
local_file = maybe_download(TRAIN_IMAGES, train_dir)
train_images = extract_images(local_file)
See the full source code at https://github.com/nlintz/TensorFlow-Tutorials/blob/master/input_data.py.

Odd-size numpy arrays send/receive

I would like to gather numpy array contents from all processors to one. In case all arrays are of the same size, it works. However I don't see a natural way of doing the same task for arrays of proc-dependent size. Please consider the following code:
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
nb_elts = 5
else:
nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
result = array_lst
for p in xrange(1, size):
received = numpy.empty(nb_elts, dtype=numpy.int)
comm.Recv(received, p, tag=13)
result = numpy.concatenate([result, received])
else:
comm.Send(array_lst, 0, tag=13)
My problem is at the "received" allocation. How can I know what is the size to be allocated? Do I have to first send/receive each array size?
Based on a suggestion below, I'll go with
data_array = numpy.ones(rank + 3, dtype=int)
data_array *= rank + 5
print '[{}] data: {} ({})'.format(rank, data_array, type(data_array))
# make all processors aware of data array sizes
all_sizes = {rank: data_array.size}
gathered_all_sizes = comm_py.allgather(all_sizes)
for d in gathered_all_sizes:
all_sizes.update(d)
# prepare Gatherv as described by #francis
nbsum = 0
sendcounts = []
displacements = []
for p in xrange(size):
n = all_sizes[p]
displacements.append(nbsum)
sendcounts.append(n)
nbsum += n
if rank==0:
result = numpy.empty(nbsum, dtype=numpy.int)
else:
result = None
comm_py.Gatherv(data_array,[result, tuple(sendcounts), tuple(displacements), MPI.INT64_T], root=0)
print '[{}] gathered data: {}'.format(rank, result)
In the code you pasted, both Send() and Recv() sends nb_elts elements. The problem is that nb_elts is not the same for every processes... Hence, the number of item received does not match the number of elements that were sent and the program complains:
mpi4py.MPI.Exception: MPI_ERR_TRUNCATE: message truncated
To prevent that, the root process must compute the number of items that the other processes have sent. Hence, in the loop for p in xrange(1, size), nb_elts must be computed according to p, not rank.
The following code based on yours has been corrected. I would add that the natural way to perform this gathering operation is to use Gatherv(). See http://materials.jeremybejarano.com/MPIwithPython/collectiveCom.html and the documentation of mpi4py for instance. I added the corresponding sample code. The only tricky point is that numpy.int is 64bit long. Hence, the Gatherv() uses the MPI type MPI_DOUBLE.
from mpi4py import MPI
import numpy
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
if rank >= size/2:
nb_elts = 5
else:
nb_elts = 2
# create data
lst = []
for i in xrange(nb_elts):
lst.append(rank*3+i)
array_lst = numpy.array(lst, dtype=int)
# communicate array
result = []
if rank == 0:
result = array_lst
for p in xrange(1, size):
if p >= size/2:
nb_elts = 5
else:
nb_elts = 2
received = numpy.empty(nb_elts, dtype=numpy.int)
comm.Recv(received, p, tag=13)
result = numpy.concatenate([result, received])
else:
comm.Send(array_lst, 0, tag=13)
if rank==0:
print "Send Recv, result= "+str(result)
#How to use Gatherv:
nbsum=0
sendcounts=[]
displacements=[]
for p in xrange(0,size):
displacements.append(nbsum)
if p >= size/2:
nbsum+= 5
sendcounts.append(5)
else:
nbsum+= 2
sendcounts.append(2)
if rank==0:
print "nbsum "+str(nbsum)
print "sendcounts "+str(tuple(sendcounts))
print "displacements "+str(tuple(displacements))
print "rank "+str(rank)+" array_lst "+str(array_lst)
print "numpy.int "+str(numpy.dtype(numpy.int))+" "+str(numpy.dtype(numpy.int).itemsize)+" "+str(numpy.dtype(numpy.int).name)
if rank==0:
result2=numpy.empty(nbsum, dtype=numpy.int)
else:
result2=None
comm.Gatherv(array_lst,[result2,tuple(sendcounts),tuple(displacements),MPI.DOUBLE],root=0)
if rank==0:
print "Gatherv, result2= "+str(result2)