RuntimeError: The size of tensor a (49) must match the size of tensor b (64) at non-singleton dimension 1
I have been working on attention maps for Swin Transformers. Below is my implementation:
from PIL import Image
import sys

import torch
import numpy as np
import cv2
from torchvision import transforms

def rollout(attentions, discard_ratio, head_fusion):
    result = torch.eye(attentions[0].size(-1))
    with torch.no_grad():
        for attention in attentions:
            if head_fusion == "mean":
                attention_heads_fused = attention.mean(axis=1)
            elif head_fusion == "max":
                attention_heads_fused = attention.max(axis=1)[0]
            elif head_fusion == "min":
                attention_heads_fused = attention.min(axis=1)[0]
            else:
                raise ValueError("Attention head fusion type not supported")

            # Drop the lowest attentions, but
            # don't drop the class token
            flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
            _, indices = flat.topk(int(flat.size(-1) * discard_ratio), -1, False)
            indices = indices[indices != 0]
            flat[0, indices] = 0

            # Add the identity (residual connection) and re-normalize
            I = torch.eye(attention_heads_fused.size(-1))
            a = (attention_heads_fused + 1.0 * I) / 2
            a = a / a.sum(dim=-1)

            result = torch.matmul(a, result)

    # Look at the total attention between the class token
    # and the image patches
    mask = result[0, 0, 1:]
    # In case of a 224x224 image, this brings us from 196 to 14
    width = int(mask.size(-1) ** 0.5)
    mask = mask.reshape(width, width).numpy()
    mask = mask / np.max(mask)
    return mask
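To double-check the rollout logic itself, here is a minimal sanity check I added (not from the repo) with fake ViT-style attentions. The shapes are hypothetical: batch 1, 3 heads, 197 tokens = 1 class token + 14x14 patches. This runs without errors, so the problem seems specific to the tensors coming out of the Swin model:

# Hypothetical ViT-style attentions: (batch, heads, tokens, tokens)
fake_attentions = [torch.softmax(torch.rand(1, 3, 197, 197), dim=-1)
                   for _ in range(4)]
mask = rollout(fake_attentions, discard_ratio=0.9, head_fusion="mean")
print(mask.shape)  # (14, 14)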
class VITAttentionRollout:
    def __init__(self, model, attention_layer_name='dropout', head_fusion="mean",
                 discard_ratio=0.9):
        self.model = model
        self.head_fusion = head_fusion
        self.discard_ratio = discard_ratio
        for name, module in self.model.named_modules():
            if attention_layer_name in name:
                module.register_forward_hook(self.get_attention)
        self.attentions = []

    def get_attention(self, module, input, output):
        self.attentions.append(output.cpu())

    def __call__(self, input_tensor):
        self.attentions = []
        with torch.no_grad():
            output = self.model(**input_tensor)
        return rollout(self.attentions, self.discard_ratio, self.head_fusion)
This is the main program:
import sys

import torch
from PIL import Image
from torchvision import transforms
import numpy as np
import cv2
from google.colab.patches import cv2_imshow

# from vit_rollout import VITAttentionRollout
from vit_grad_rollout import VITAttentionGradRollout

def show_mask_on_image(img, mask):
    img = np.float32(img) / 255
    heatmap = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    heatmap = np.float32(heatmap) / 255
    cam = heatmap + np.float32(img)
    cam = cam / np.max(cam)
    return np.uint8(255 * cam)

if __name__ == '__main__':
    # model and feature_extractor are defined earlier in the notebook
    # (a Swin checkpoint and its feature extractor loaded via Hugging Face transformers)
    model.eval()
    image_path = '/content/both.jpg'
    category_index = None
    head_fusion = 'max'
    discard_ratio = 0.9
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
    ])  # (not used below; preprocessing is done by feature_extractor)
    img = Image.open(image_path)
    img = img.resize((224, 224))
    input_tensor = feature_extractor(img, return_tensors="pt")

    if category_index is None:
        print("Doing Attention Rollout")
        attention_rollout = VITAttentionRollout(model, head_fusion=head_fusion,
                                                discard_ratio=discard_ratio)
        mask = attention_rollout(input_tensor)
        name = "attention_rollout_{:.3f}_{}.png".format(discard_ratio, head_fusion)
    else:
        print("Doing Gradient Attention Rollout")
        grad_rollout = VITAttentionGradRollout(model, discard_ratio=discard_ratio)
        mask = grad_rollout(input_tensor, category_index)
        name = "grad_rollout_{}_{:.3f}_{}.png".format(category_index,
                                                      discard_ratio, head_fusion)

    np_img = np.array(img)[:, :, ::-1]
    mask = cv2.resize(mask, (np_img.shape[1], np_img.shape[0]))
    mask = show_mask_on_image(np_img, mask)
    cv2_imshow(np_img)
    cv2_imshow(mask)
    cv2.imwrite("input.jpg", np_img)
    cv2.imwrite(name, mask)
    cv2.waitKey(-1)
I am referring to the GitHub project https://github.com/jacobgil/vit-explain, but I am getting the following error:
RuntimeError: The size of tensor a (49) must match the size of tensor b (64) at non-singleton dimension 1
I looked through several GitHub projects, but there is very little information on Swin Transformers.
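To narrow down where the mismatch comes from, here is a small debugging sketch (my own addition, not from the repo) that prints the name and output shape of every hooked module, assuming model and input_tensor are set up as in the main program above:

# Debugging sketch: record the output shape of every module the hook matches
hooked_shapes = []

def record_shape(name):
    def hook(module, input, output):
        # Some modules return tuples; keep only plain tensor outputs
        if torch.is_tensor(output):
            hooked_shapes.append((name, tuple(output.shape)))
    return hook

handles = [module.register_forward_hook(record_shape(name))
           for name, module in model.named_modules()
           if 'dropout' in name]

with torch.no_grad():
    model(**input_tensor)

for name, shape in hooked_shapes:
    print(name, shape)

for h in handles:
    h.remove()

My suspicion is that hooking every module whose name contains 'dropout' also captures tensors that are not square attention matrices, and that Swin's windowed attention produces shapes that rollout() does not expect.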
So, is there any way to build an attention map for Swin Transformer models?
Thanks in advance.