diff --git a/src/confs/config_coco.json b/src/confs/config_coco.json new file mode 100644 index 0000000..17cef8c --- /dev/null +++ b/src/confs/config_coco.json @@ -0,0 +1,33 @@ +{ + "model" : { + "input_size": 416, + "grid_size": 13, + "true_box_buffer": 10, + "iou_threshold": 0.5, + "nms_threshold": 0.3 + }, + "config_path" : { + "labels": "models/coco/labels_coco.txt", + "anchors": "models/coco/anchors_coco.txt", + "arch_plotname": "" + }, + "train": { + "out_model_name": "", + "image_folder": "", + "annot_folder": "", + "batch_size": 16, + "learning_rate": 1e-4, + "num_epochs": 20, + "object_scale": 5.0 , + "no_object_scale": 1.0, + "coord_scale": 1.0, + "class_scale": 1.0, + "verbose": 1 + }, + + "valid": { + "image_folder": "", + "annot_folder": "", + "pred_folder": "" + } +} diff --git a/src/confs/config_voc.json b/src/confs/config_voc.json new file mode 100644 index 0000000..509facd --- /dev/null +++ b/src/confs/config_voc.json @@ -0,0 +1,34 @@ +{ + "model" : { + "input_size": 416, + "grid_size": 13, + "true_box_buffer": 10, + "iou_threshold": 0.5, + "nms_threshold": 0.45 + }, + "config_path" : { + "labels": "models/voc/labels_voc.txt", + "anchors": "models/voc/anchors_voc.txt", + "arch_plotname": "voc_arch.png" + }, + "train": { + "out_model_name": "yolo_retrained_voc.h5", + "image_folder": "/home/kiran/Documents/DATA/VOC/train/imgs", + "annot_folder": "/home/kiran/Documents/DATA/VOC/train/anns", + "batch_size": 16, + "learning_rate": 1e-4, + "num_epochs": 50, + "object_scale": 5.0 , + "no_object_scale": 1.0, + "coord_scale": 1.0, + "class_scale": 1.0, + "verbose": 1 + }, + + "valid": { + "image_folder": "/home/kiran/Documents/DATA/VOC/valid/imgs", + "annot_folder": "/home/kiran/Documents/DATA/VOC/valid/anns", + "pred_folder": "/home/kiran/Documents/DATA/VOC/valid/img_pred", + "plot_preds": true + } +} diff --git a/src/gpu_test.py b/src/gpu_test.py new file mode 100644 index 0000000..f146cac --- /dev/null +++ b/src/gpu_test.py @@ -0,0 +1,2 @@ +from keras import backend as K +K.tensorflow_backend._get_available_gpus() \ No newline at end of file diff --git a/src/models/coco/anchors_coco.txt b/src/models/coco/anchors_coco.txt new file mode 100644 index 0000000..808be3a --- /dev/null +++ b/src/models/coco/anchors_coco.txt @@ -0,0 +1 @@ +0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 diff --git a/src/models/coco/labels_coco.txt b/src/models/coco/labels_coco.txt new file mode 100644 index 0000000..941cb4e --- /dev/null +++ b/src/models/coco/labels_coco.txt @@ -0,0 +1,80 @@ +person +bicycle +car +motorcycle +airplane +bus +train +truck +boat +traffic light +fire hydrant +stop sign +parking meter +bench +bird +cat +dog +horse +sheep +cow +elephant +bear +zebra +giraffe +backpack +umbrella +handbag +tie +suitcase +frisbee +skis +snowboard +sports ball +kite +baseball bat +baseball glove +skateboard +surfboard +tennis racket +bottle +wine glass +cup +fork +knife +spoon +bowl +banana +apple +sandwich +orange +broccoli +carrot +hot dog +pizza +donut +cake +chair +couch +potted plant +bed +dining table +toilet +tv +laptop +mouse +remote +keyboard +cell phone +microwave +oven +toaster +sink +refrigerator +book +clock +vase +scissors +teddy bear +hair drier +toothbrush diff --git a/src/models/voc/anchors_voc.txt b/src/models/voc/anchors_voc.txt new file mode 100644 index 0000000..5374c6f --- /dev/null +++ b/src/models/voc/anchors_voc.txt @@ -0,0 +1 @@ +1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 diff --git 
a/src/models/voc/labels_voc.txt b/src/models/voc/labels_voc.txt
new file mode 100644
index 0000000..1168c39
--- /dev/null
+++ b/src/models/voc/labels_voc.txt
@@ -0,0 +1,20 @@
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
\ No newline at end of file
diff --git a/src/net/__init__.py b/src/net/__init__.py
new file mode 100644
index 0000000..8a87e95
--- /dev/null
+++ b/src/net/__init__.py
@@ -0,0 +1,7 @@
+#from . import netarch
+#from . import netdecode
+#from . import neteval
+#from . import netgen
+#from . import netloss
+#from . import netparams
+#from . import utils
\ No newline at end of file
diff --git a/src/net/netarch.py b/src/net/netarch.py
new file mode 100644
index 0000000..e2346db
--- /dev/null
+++ b/src/net/netarch.py
@@ -0,0 +1,274 @@
+"""
+Set up the Keras model with the YOLO v2 architecture, for both training
+and inference.
+"""
+import tensorflow as tf
+import numpy as np
+import pickle, argparse, json, os, cv2
+
+from keras.models import Model, load_model
+from keras.layers import Reshape, Conv2D, Input, MaxPooling2D, BatchNormalization, Lambda
+from keras.layers.advanced_activations import LeakyReLU
+
+from keras.layers.merge import concatenate
+from keras.utils.vis_utils import plot_model
+
+from .netparams import YoloParams
+from .netdecode import YoloOutProcess
+
+
+class YoloInferenceModel(object):
+
+    def __init__(self, model):
+        self._yolo_out = YoloOutProcess()
+        self._inf_model = self._extend_processing(model)
+        self._model = model
+
+    def _extend_processing(self, model):
+        output = Lambda(self._yolo_out, name='lambda_2')(model.output)
+        return Model(model.input, output)
+
+    def _prepro_single_image(self, image):
+        image = cv2.resize(image,
+            (YoloParams.INPUT_SIZE, YoloParams.INPUT_SIZE))
+        # yolo normalize
+        image = image / 255.
+        # cv2 loads channels as BGR; revert to RGB for the YOLO pass
+        image = image[:,:,::-1]
+        image = np.expand_dims(image, 0)
+
+        return image
+
+    def predict(self, image):
+
+        image = self._prepro_single_image(image)
+
+        output = self._inf_model.predict(image)[0]
+
+        if output.size == 0:
+            return []  # no boxes above the detection threshold
+
+        label_idxs = output[:,5].astype(int)
+
+        labels = [YoloParams.CLASS_LABELS[l] for l in label_idxs]
+
+        return labels
+
+
+class YoloArchitecture(object):
+
+    def __init__(self):
+
+        self.in_model_name = YoloParams.IN_MODEL
+        # # self.plot_name = YoloParams.ARCH_FNAME
+
+    def get_model(self):
+
+        yolo_model = self._load_yolo_model()
+
+        # if YoloParams.YOLO_MODE == 'train':
+        #     new_yolo_model = self._setup_transfer_learning(yolo_model)
+        #     #new_name = self.tl_weights_name.split('.')[0] + '_rand.h5'
+        #     #new_yolo_model.save_weights(new_name)
+        #
+        # elif YoloParams.YOLO_MODE in ['inference','validate','video','cam']:
+        #     new_yolo_model = yolo_model
+        #
+        # else:
+        #     raise ValueError(
+        #         'Please set \'--action\' to \'train\', \'validate\' or pass an image file/dir.')
+
+        # if self.plot_name:
+        #     plot_model(new_yolo_model, to_file=self.plot_name, show_shapes=True)
+
+        #return new_yolo_model
+        return yolo_model
+
+    def _load_yolo_model(self):
+        if os.path.isfile(self.in_model_name):
+
+            model = load_model(self.in_model_name, compile=False)
+
+            return model
+        else:
+            raise ValueError('Need to load full model in order to do '
+                             'transfer learning. 
Run script again with desired TL ' + 'config and weight file to generate model.') + + + def weights_to_model(self, in_path, out_path): + yolo_model = self._yolo_v2_architecture() + + try: + yolo_model.load_weights(in_path) + + except IOError as e: + print('File for pre-trained weights not found.') + + yolo_model.save(out_path) + return yolo_model + + + + def _yolo_v2_architecture(self): + # Parse from cfg! + self.layer_counter = 0 + + def space_to_depth_x2(x): + + import tensorflow as tf + return tf.space_to_depth(x, block_size=2) + + + # def conv2D_bn_leaky(inp, filters, kernel_size=(3,3), strides=(1,1), maxpool=False): + # self.layer_counter += 1 + # x = Conv2D(filters, kernel_size=kernel_size, strides=strides, + # padding='same', use_bias=False)(inp) + # + # x = BatchNormalization()(x) + # x = LeakyReLU(alpha=0.1)(x) + # if maxpool: + # return MaxPooling2D(pool_size=(2, 2))(x) + # return x + # + # input_image = Input(shape=(YoloParams.INPUT_SIZE, YoloParams.INPUT_SIZE, 3), name='input') + # + # # Layer 1 + # x = conv2D_bn_leaky(input_image, 32, (3,3), (1,1), maxpool=True) + # + # # Layer 2 + # x = conv2D_bn_leaky(x, 64, maxpool=True) + # + # # Layer 3 + # x = conv2D_bn_leaky(x, 128) + # + # # Layer 4 + # x = conv2D_bn_leaky(x, 64, kernel_size=(1,1)) + # + # # Layer 5 + # x = conv2D_bn_leaky(x, 128, maxpool=True) + # + # # Layer 6 + # x = conv2D_bn_leaky(x, 256) + # + # # Layer 7 + # x = conv2D_bn_leaky(x, 128, kernel_size=(1,1)) + # + # # Layer 8 + # x = conv2D_bn_leaky(x, 256, maxpool=True) + # + # # Layer 9 + # x = conv2D_bn_leaky(x, 512) + # + # # Layer 10 + # x = conv2D_bn_leaky(x, 256, kernel_size=(1,1)) + # + # # Layer 11 + # x = conv2D_bn_leaky(x, 512) + # + # # Layer 12 + # x = conv2D_bn_leaky(x, 256, kernel_size=(1,1)) + # + # # Layer 13 + # x = conv2D_bn_leaky(x, 512) + # + # skip_connection = x + # x = MaxPooling2D(pool_size=(2, 2))(x) + # + # # Layer 14 + # x = conv2D_bn_leaky(x, 1024) + # + # # Layer 15 + # x = conv2D_bn_leaky(x, 512, kernel_size=(1,1)) + # # Layer 16 + # x = conv2D_bn_leaky(x, 1024) + # + # # Layer 17 + # x = conv2D_bn_leaky(x, 512, kernel_size=(1,1)) + # # Layer 18 + # x = conv2D_bn_leaky(x, 1024) + # + # # Layer 19 + # x = conv2D_bn_leaky(x, 1024) + # + # # Layer 20 + # x = conv2D_bn_leaky(x, 1024) + # + # # Layer 21 + # skip_connection = conv2D_bn_leaky(skip_connection, 64, kernel_size=(1,1)) + # skip_connection = Lambda(space_to_depth_x2)(skip_connection) + # x = concatenate([skip_connection, x]) + # + # # Layer 22 + # x = conv2D_bn_leaky(x, 1024) + # + # # Final Conv2D + # x = Conv2D(YoloParams.NUM_BOUNDING_BOXES * (4 + 1 + YoloParams.NUM_CLASSES), (1,1), + # strides=(1,1), padding='same')(x) + # + # + # output = Reshape((YoloParams.GRID_SIZE, YoloParams.GRID_SIZE, + # YoloParams.NUM_BOUNDING_BOXES, 4 + 1 + YoloParams.NUM_CLASSES))(x) + # + # yolo_model = Model(input_image, output) + # + # return yolo_model + + + + # def _setup_transfer_learning(self, yolo_model): + # + # new_yolo_model = self._yolo_v2_update(yolo_model) + # + # layer = new_yolo_model.layers[-2] # the last convolutional layer + # weights = layer.get_weights() + # + # S2 = YoloParams.GRID_SIZE*YoloParams.GRID_SIZE + # new_kernel = np.random.normal(size=weights[0].shape)/S2 + # new_bias = np.random.normal(size=weights[1].shape)/S2 + # + # layer.set_weights([new_kernel, new_bias]) + # + # return new_yolo_model + + + + # def _yolo_v2_update(self, old_yolo_model): + # + # x = Conv2D(YoloParams.NUM_BOUNDING_BOXES * (4 + 1 + YoloParams.NUM_CLASSES), (1,1), + # strides=(1,1), padding='same', 
name='conv_23')(old_yolo_model.layers[-3].output) + # + # output = Reshape((YoloParams.GRID_SIZE, YoloParams.GRID_SIZE, + # YoloParams.NUM_BOUNDING_BOXES, 4 + 1 + YoloParams.NUM_CLASSES))(x) + # + # yolo_model = Model(old_yolo_model.input, output) + # + # return yolo_model + + +# def generate_model(): +# +# yolo_arch = YoloArchitecture() +# +# d = os.path.dirname(YoloParams.WEIGHT_FILE) +# +# out_fname = os.path.join(d, 'model.h5') +# +# print('------------------------------------') +# print('Reading weights from: %s'%YoloParams.WEIGHT_FILE) +# print('Loading into YOLO V2 architecture and storing...') +# print('\n\n') +# yolo_arch.weights_to_model(YoloParams.WEIGHT_FILE, out_fname) +# print('\tModel saved: %s'%out_fname) +# print('\n\n------------------------------------') +# print('Done.') + + + + diff --git a/src/net/netdecode.py b/src/net/netdecode.py new file mode 100644 index 0000000..eeb2ffd --- /dev/null +++ b/src/net/netdecode.py @@ -0,0 +1,218 @@ +""" +Process [GRID x GRID x BOXES x (4 + 1 + CLASSES)]. Filter low confidence +boxes, apply NMS and return boxes, scores, classes. +""" + +import tensorflow as tf +from keras import backend as K +import numpy as np +from .netparams import YoloParams + + + + +def process_outs(b, s, c): + + b_p = b + # Expand dims of scores and classes so we can concat them + # with the boxes and have the output of NMS as an added layer of YOLO. + # Have to do another expand_dims this time on the first dim of the result + # since NMS doesn't know about BATCH_SIZE (operates on 2D, see + # https://www.tensorflow.org/api_docs/python/tf/image/non_max_suppression) + # but keras needs this dimension in the output. + s_p = K.expand_dims(s, axis=-1) + c_p = K.expand_dims(c, axis=-1) + + output_stack = K.concatenate([b_p, s_p, c_p], axis=1) + return K.expand_dims(output_stack, axis=0) + + +class YoloOutProcess(object): + + + def __init__(self): + # thresholds + self.max_boxes = YoloParams.TRUE_BOX_BUFFER + self.nms_threshold = YoloParams.NMS_THRESHOLD + self.detection_threshold = YoloParams.DETECTION_THRESHOLD + + self.num_classes = YoloParams.NUM_CLASSES + + def __call__(self, y_sing_pred): + + # need to convert b's from GRID_SIZE units into IMG coords. Divide by grid here. + b_xy = (K.sigmoid(y_sing_pred[..., 0:2]) + YoloParams.c_grid[0]) / YoloParams.GRID_SIZE + b_wh = (K.exp(y_sing_pred[..., 2:4])*YoloParams.anchors[0]) / YoloParams.GRID_SIZE + b_xy1 = b_xy - b_wh / 2. + b_xy2 = b_xy + b_wh / 2. 
+ boxes = K.concatenate([b_xy1, b_xy2], axis=-1) + + # filter out scores below detection threshold + scores_all = K.sigmoid(y_sing_pred[..., 4:5]) * K.softmax(y_sing_pred[...,5:]) + indicator_detection = scores_all > self.detection_threshold + scores_all = scores_all * K.cast(indicator_detection, np.float32) + + # compute detected classes and scores + classes = K.argmax(scores_all, axis=-1) + scores = K.max(scores_all, axis=-1) + + # flattened tensor length + S2B = YoloParams.GRID_SIZE*YoloParams.GRID_SIZE*YoloParams.NUM_BOUNDING_BOXES + + # flatten boxes, scores for NMS + flatten_boxes = K.reshape(boxes, shape=(S2B, 4)) + flatten_scores = K.reshape(scores, shape=(S2B, )) + flatten_classes = K.reshape(classes, shape=(S2B, )) + + inds = [] + + # apply multiclass NMS + for c in range(self.num_classes): + + # only include boxes of the current class, with > 0 confidence + class_mask = K.cast(K.equal(flatten_classes, c), np.float32) + score_mask = K.cast(flatten_scores > 0, np.float32) + mask = class_mask * score_mask + + # compute class NMS + nms_inds = tf.image.non_max_suppression( + flatten_boxes, + flatten_scores*mask, + max_output_size=self.max_boxes, + iou_threshold=self.nms_threshold, + score_threshold=0. + ) + + inds.append(nms_inds) + + # combine winning box indices of all classes + selected_indices = K.concatenate(inds, axis=-1) + + # gather corresponding boxes, scores, class indices + selected_boxes = K.gather(flatten_boxes, selected_indices) + selected_scores = K.gather(flatten_scores, selected_indices) + selected_classes = K.gather(flatten_classes, selected_indices) + + return process_outs(selected_boxes, selected_scores, K.cast(selected_classes, np.float32)) + + + + +class YoloOutProcessOther(object): + """ + [UNUSED] Ignore. + """ + + def __init__(self): + + self.max_boxes = YoloParams.TRUE_BOX_BUFFER + self.nms_threshold = YoloParams.NMS_THRESHOLD + self.detection_threshold = YoloParams.DETECTION_THRESHOLD + + self.num_classes = YoloParams.NUM_CLASSES + + + def _class_nms(self, boxes, scores, c_mask): + #c_mask = K.equal(classes, i) + c_mask = c_mask*K.cast(scores > 0, np.float32) + c_boxes = boxes * K.expand_dims(c_mask, axis=-1) + c_scores = scores * c_mask + inds = tf.image.non_max_suppression(c_boxes, c_scores, max_output_size=10, iou_threshold=0.2) + # tf.pad(inds, tf.Variable([[0,10-tf.shape(inds)[0]]]), "CONSTANT") + return self._pad_tensor(inds, 10, value=-1) + + + def _pad_tensor(self, t, length, value=0): + """Pads the input tensor with 0s along the first dimension up to the length. + Args: + t: the input tensor, assuming the rank is at least 1. + length: a tensor of shape [1] or an integer, indicating the first dimension + of the input tensor t after padding, assuming length <= t.shape[0]. + Returns: + padded_t: the padded tensor, whose first dimension is length. If the length + is an integer, the first dimension of padded_t is set to length + statically. + """ + t_rank = tf.rank(t) + t_shape = tf.shape(t) + t_d0 = t_shape[0] + pad_d0 = tf.expand_dims(length - t_d0, 0) + pad_shape = tf.cond( + tf.greater(t_rank, 1), lambda: tf.concat([pad_d0, t_shape[1:]], 0), + lambda: tf.expand_dims(length - t_d0, 0)) + padded_t = tf.concat([t, value+tf.zeros(pad_shape, dtype=t.dtype)], 0) + + t_shape = padded_t.get_shape().as_list() + t_shape[0] = length + padded_t.set_shape(t_shape) + + return padded_t + + def __call__(self, y_sing_pred): + + # need to convert b's from GRID_SIZE units into IMG coords. Divide by grid here. 
+ b_xy = (K.sigmoid(y_sing_pred[..., 0:2]) + YoloParams.c_grid[0]) / YoloParams.GRID_SIZE + b_wh = (K.exp(y_sing_pred[..., 2:4])*YoloParams.anchors[0]) / YoloParams.GRID_SIZE + b_xy1 = b_xy - b_wh / 2. + b_xy2 = b_xy + b_wh / 2. + boxes = K.concatenate([b_xy1, b_xy2], axis=-1) + + scores_all = K.expand_dims(K.sigmoid(y_sing_pred[..., 4]), axis=-1) * K.softmax(y_sing_pred[...,5:]) + indicator_detection = scores_all > self.detection_threshold + scores_all = scores_all * K.cast(indicator_detection, np.float32) + + classes = K.argmax(scores_all, axis=-1) + scores = K.max(scores_all, axis=-1) + + S2B = YoloParams.GRID_SIZE*YoloParams.GRID_SIZE*YoloParams.NUM_BOUNDING_BOXES + + flatten_boxes = K.reshape(boxes, shape=(S2B, 4)) + flatten_scores = K.reshape(scores, shape=(S2B, )) + flatten_classes = K.reshape(classes, shape=(S2B, )) + + + c_masks = K.map_fn(lambda c: K.cast(K.equal(flatten_classes, c), np.float32), np.arange(self.num_classes), dtype=np.float32) + resu_stacked = tf.map_fn( + lambda c: self._class_nms(flatten_boxes, flatten_scores, c), + c_masks, + dtype=np.int32, + infer_shape=True) + + resu_flat = K.reshape(resu_stacked, shape=(-1,)) + selected_indices = tf.boolean_mask(resu_flat, ~K.equal(resu_flat, -1)) + + selected_boxes = K.gather(flatten_boxes, selected_indices) + selected_scores = K.gather(flatten_scores, selected_indices) + selected_classes = K.gather(flatten_classes, selected_indices) + + # Exclude padding boxes left behind by tensorflow NMS + score_mask = selected_scores>0. + selected_boxes = tf.boolean_mask(selected_boxes, score_mask) + selected_scores = tf.boolean_mask(selected_scores, score_mask) + selected_classes = tf.boolean_mask(selected_classes, score_mask) + + return process_outs(selected_boxes, selected_scores, K.cast(selected_classes, np.float32)) + + + + + +if __name__ == '__main__': + + tf.InteractiveSession() + + a = tf.convert_to_tensor(np.load('ocell.npy'), np.float32) + + yolo_out = YoloOutProcess() + + resu = yolo_out(a).eval()[0] + + b = resu[:,:4] + s = resu[:,4] + c = resu[:,5] + + print('---------------------') + + print(c) + print(s) + print(b) diff --git a/src/net/netparams.py b/src/net/netparams.py new file mode 100644 index 0000000..dfc6268 --- /dev/null +++ b/src/net/netparams.py @@ -0,0 +1,157 @@ + +import pickle, argparse, json, os, sys +from keras import backend as K +import numpy as np + + +# argparser = argparse.ArgumentParser( +# description='dourflow: a keras YOLO V2 implementation.') +# +# +# argparser.add_argument( +# 'action', +# help='what to do: \'train\', \'validate\', \'cam\' ' +# 'or pass a video, image file/dir.') +# +# argparser.add_argument( +# '-m', +# '--model', +# help='path to input yolo v2 keras model', +# default='coco_model.h5') +# +# +# argparser.add_argument( +# '-c', +# '--conf', +# help='path to configuration file', +# default='confs/config_coco.json') +# +# +# argparser.add_argument( +# '-t', +# '--threshold', +# type=float, +# help='detection threshold', +# default=0.3) +# +# +# argparser.add_argument( +# '-w', +# '--weight_file', +# help='path to weight file', +# default='weights.h5') +# +# +# argparser.add_argument( +# '--gif', +# help='video output stored as gif also', +# action='store_true') +# +# +# args = argparser.parse_args() + + +# action = args.action +config_path = "confs/config_coco.json" + +with open(config_path) as config_buffer: + config = json.loads(config_buffer.read()) + + +def generate_yolo_grid(batch, g, num_bb): + c_x = K.cast(K.reshape(K.tile(K.arange(g), [g]), (1, g, g, 1, 1)), K.floatx()) + 
c_y = K.permute_dimensions(c_x, (0,2,1,3,4)) + return K.tile(K.concatenate([c_x, c_y], -1), [batch, 1, 1, num_bb, 1]) + + +def get_threshold(value): + if value > 1. or value < 0: + raise ValueError('Please enter a valid threshold (between 0. and 1.).') + return value + + + +class YoloParams(object): + + # Mode + PREDICT_IMAGE = '' + WEIGHT_FILE = '' + WEBCAM_OUT = '' + GEN_ANCHORS_PATH = '' + + # if action in ['genw', 'generate_weights']: + # assert args.weight_file, "Need to pass weight file if generating model." + # WEIGHT_FILE = args.weight_file + # elif action == 'cams': + # WEBCAM_OUT = 'cam_out.mp4' + # YOLO_MODE = 'cam' + # elif action in ['genp', 'generate_priors']: + # current_anchors_path = config['config_path']['anchors'] + # GEN_ANCHORS_PATH = os.path.join(os.path.dirname(current_anchors_path), + # 'custom_'+os.path.basename(current_anchors_path)) + # YOLO_MODE = 'genp' + # else: + # if action in ['validate', 'train', 'cam']: + # YOLO_MODE = action + # else: + # if os.path.isdir(action): + # YOLO_MODE = 'inference' + # elif os.path.isfile(action): + # if action.split('.')[1] in ['mp4','avi','wmv','mpg','mpeg']: + # YOLO_MODE = 'video' + # else: + # YOLO_MODE = 'inference' + # else: + # raise ValueError('Run \'python3 dourflow.py --help\'.') + # + # PREDICT_IMAGE = action + + #Paths + TRAIN_IMG_PATH = config['train']['image_folder'] + TRAIN_ANN_PATH = config['train']['annot_folder'] + + VALIDATION_IMG_PATH = config['valid']['image_folder'] + VALIDATION_ANN_PATH = config['valid']['annot_folder'] + VALIDATION_OUT_PATH = config['valid']['pred_folder'] + + #STORE_GIF = args.gif + + # Model + IN_MODEL = "coco_model.h5" + + OUT_MODEL_NAME = config['train']['out_model_name'] + ARCH_FNAME = config['config_path']['arch_plotname'] + + # Classes + CLASS_LABELS = [x.rstrip() for x in open(config['config_path']['labels'])] + NUM_CLASSES = len(CLASS_LABELS) + CLASS_TO_INDEX = dict(zip(CLASS_LABELS, np.arange(NUM_CLASSES))) + + # Infrastructure params + INPUT_SIZE = config['model']['input_size'] + GRID_SIZE = config['model']['grid_size'] + TRUE_BOX_BUFFER = config['model']['true_box_buffer'] + + if config['config_path']['anchors']: + ANCHORS = [float(a) for a in open(config['config_path']['anchors']).read().split(', ')] + NUM_BOUNDING_BOXES = len(ANCHORS) // 2 + OBJECT_SCALE = 5.0 + NO_OBJECT_SCALE = 1.0 + CLASS_SCALE = 1.0 + COORD_SCALE = 1.0 + + # Train params + BATCH_SIZE = config['train']['batch_size'] + L_RATE = config['train']['learning_rate'] + NUM_EPOCHS = config['train']['num_epochs'] + TRAIN_VERBOSE = config['train']['verbose'] + + # Thresholding + IOU_THRESHOLD = get_threshold(config['model']['iou_threshold']) + NMS_THRESHOLD = get_threshold(config['model']['nms_threshold']) + DETECTION_THRESHOLD = get_threshold(0.3) + + # Additional / Precomputing + c_grid = generate_yolo_grid(BATCH_SIZE, GRID_SIZE, NUM_BOUNDING_BOXES) + anchors = np.reshape(ANCHORS, [1,1,1,NUM_BOUNDING_BOXES,2]) + diff --git a/src/net/utils.py b/src/net/utils.py new file mode 100644 index 0000000..1ad340a --- /dev/null +++ b/src/net/utils.py @@ -0,0 +1,259 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import os, errno +import xml.etree.ElementTree as ET + +import tensorflow as tf +import copy +import cv2 + +from moviepy.editor import VideoFileClip + + + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: # Python >2.5 + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + + +def compute_iou(bb_1, bb_2): + + xa0, ya0, xa1, ya1 = bb_1 + 
xb0, yb0, xb1, yb1 = bb_2 + + intersec = (min([xa1, xb1]) - max([xa0, xb0]))*(min([ya1, yb1]) - max([ya0, yb0])) + + union = (xa1 - xa0)*(ya1 - ya0) + (xb1 - xb0)*(yb1 - yb0) - intersec + + return intersec / union + + +def benchmark_timings(data, path=''): + + fig = plt.figure(figsize=(10,15)) + ax = plt.gca() + df = pd.DataFrame(data) + df.plot(ax=ax, kind='area', subplots=True) + plt.savefig(path + 'timings.png', format='png') + plt.close() + + df2 = df.apply(lambda x: x/df['total'], axis=0)[['decode', 'prediction', 'prepro']] + + fig = plt.figure(figsize=(20,13)) + ax = fig.add_subplot(111) + df2.plot(ax=ax) + vals = ax.get_yticks() + ax.set_yticklabels(['{:,.1%}'.format(x) for x in vals]) + plt.savefig(path + 'timings_combined.png', format='png') + plt.close() + + + + +def space_to_depth_x2(x): + """Thin wrapper for Tensorflow space_to_depth with block_size=2.""" + # Import currently required to make Lambda work. + # See: https://github.com/fchollet/keras/issues/5088#issuecomment-273851273 + + # the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K) + + + # tf.space_to_depth: + # Input: [batch, height, width, depth] + # Output: [batch, height/block_size, width/block_size, depth*block_size*block_size] + # Example: [1,4,4,1] -> [1,2,2,4] or in this case [?,38,38,64] -> [?,19,19,256] + # This operation is useful for resizing the activations between convolutions (but keeping all data), + # e.g. instead of pooling. It is also useful for training purely convolutional models. + + # space_to_depth_x2 is just tf.space_to_depth wrapped with block_size=2 + + + # Example + """ + input shape = (4,4,1) + + [ + [[1], [2], [3], [4]], + [[5], [6], [7], [8]], + [[9], [10], [11], [12]], + [[13], [14], [15], [16]] + ] + + is divided into the following chunks (block_size, block_size, channels): + + [[[1], [2]], [[[3], [4]], + [[5], [6]]] [[7], [8]]] + + [[[9], [10],] [[[11], [12]], + [[13], [14]]] [[15], [16]]] + + flatten each chunk to a single array: + + [[1, 2, 5, 6]], [[3, 4, 7, 8]] + [[9, 10, 13, 14]], [[11, 12, 15, 16]] + + + spatially rearrange chunks according to their initial position: + + [ + [[1, 2, 5, 6]], [[3, 4, 7, 8]], + [[9 10, 13, 14]], [[11, 12, 15, 16]] + ] + + output shape = (2,2,4) + """ + import tensorflow as tf + return tf.space_to_depth(x, block_size=2) + + +def draw_boxes(image_in, info): + image = image_in.copy() + image_h, image_w, _ = image.shape + + boxes, scores, labels = info + color_mod = 255 + + for i in range(len(boxes)): + xmin = int(boxes[i][0]*image_w) + ymin = int(boxes[i][1]*image_h) + xmax = int(boxes[i][2]*image_w) + ymax = int(boxes[i][3]*image_h) + + if scores is None: + #text = "%s"%(labels[i]) + text = '' + color_mod = 0 + else: + text = "%s (%.1f%%)"%(labels[i], 100*scores[i]) + + cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (color_mod,255,0), 2) + + cv2.putText(image, + text, + (xmin, ymin - 15), + cv2.FONT_HERSHEY_COMPLEX, + 1e-3 * image_h, + (color_mod,255,0), 1) + return image + + +def parse_annotation(ann_dir, img_dir, labels=[]): + # from https://github.com/experiencor/keras-yolo2/blob/master/preprocessing.py + all_imgs = [] + seen_labels = {} + # go through annotations by sorted filename + for ann in sorted(os.listdir(ann_dir)): + img = {'object':[]} + tree = ET.parse(os.path.join(ann_dir, ann)) + + for elem in tree.iter(): + if 'filename' in elem.tag: + img['filename'] = os.path.join(img_dir, elem.text) + if 'width' in elem.tag: + img['width'] = int(elem.text) + if 'height' in elem.tag: + img['height'] = 
int(elem.text) + if 'object' in elem.tag or 'part' in elem.tag: + obj = {} + + for attr in list(elem): + if 'name' in attr.tag: + obj['name'] = attr.text + + if obj['name'] in seen_labels: + seen_labels[obj['name']] += 1 + else: + seen_labels[obj['name']] = 1 + + if len(labels) > 0 and obj['name'] not in labels: + break + else: + img['object'] += [obj] + + if 'bndbox' in attr.tag: + for dim in list(attr): + if 'xmin' in dim.tag: + obj['xmin'] = int(round(float(dim.text))) + if 'ymin' in dim.tag: + obj['ymin'] = int(round(float(dim.text))) + if 'xmax' in dim.tag: + obj['xmax'] = int(round(float(dim.text))) + if 'ymax' in dim.tag: + obj['ymax'] = int(round(float(dim.text))) + + if len(img['object']) > 0: + all_imgs += [img] + + # all_imgs: [img1, img2, img3, ..] + # + """ + img: + {'object' : [{'name': 'class1', 'xmin': , 'ymin': , 'xmax': , 'ymax': }, # object 1 + {'name': 'class1', 'xmin': , 'ymin': , 'xmax': , 'ymax': }, # object 2 + {'name': 'class2', 'xmin': , 'ymin': , 'xmax': , 'ymax': }] # object 3 + 'filename' : , + 'width':, + 'height': + } + """ + # seen_labels: {'classname': count} + return all_imgs + + +def setup_logging(logging_path='logs'): + + log_path = os.path.join(os.getcwd(),logging_path) + mkdir_p(log_path) + + check_names = lambda y: y if y.isdigit() else -1 + get_ind = lambda x: int(check_names(x.split('_')[1])) + + run_counter = max(map(get_ind, os.listdir(log_path)), default=-1) + 1 + + run_path = os.path.join(log_path, 'run_%s'%run_counter) + mkdir_p(run_path) + + print('Logging set up, to monitor training run:\n' + '\t\'tensorboard --logdir=%s\'\n'%run_path) + + return run_path + + + + +def handle_empty_indexing(arr, idx): + if idx.size > 0: + return arr[idx] + return [] + + +def generate_gif(filename): + outname = filename.split('.')[-2] + '.gif' + VideoFileClip(filename).speedx(2.5).resize(0.5).write_gif( + outname,fps=20, program='ffmpeg', fuzz=3) + print('\n') + + +if __name__ == '__main__': + + imgs, cnts = parse_annotation('/home/kiran/Downloads/VOCdevkit/VOC2012/Annotations/','/home/kiran/Downloads/VOCdevkit/VOC2012/JPEGImages/') + imgs, cnts = parse_annotation('/home/kiran/Downloads/VOCdevkit2007/VOC2007/Annotations/','/home/kiran/Downloads/VOCdevkit2007/VOC2007/JPEGImages/') + + + + + + + + + + + + + diff --git a/src/objects_identifier_of_frames.py b/src/objects_identifier_of_frames.py index 603e47b..4a93115 100644 --- a/src/objects_identifier_of_frames.py +++ b/src/objects_identifier_of_frames.py @@ -17,43 +17,10 @@ from keras.preprocessing import image from keras.applications import resnet50 import os +from yolo.model import YOLO def generate_object_list_of_frames(input_frames_path,img_width, img_height): - - images = [] # List to keep scaled frame data - frame_names = [] # List to keep names of frames - - # Load Keras' ResNet50 model that was pre-trained against the ImageNet database - model = resnet50.ResNet50() - - frames_list = os.listdir(input_frames_path) - frames_list.sort(key=lambda x: int(x[5:-4])) - - for image_name in frames_list: - frame_names.append(image_name) - img = image.load_img(input_frames_path+image_name, target_size=(img_width, img_height)) - img = image.img_to_array(img) - img = np.expand_dims(img, axis=0) - images.append(img) - - images = np.vstack(images) - images = resnet50.preprocess_input(images) - predictions = model.predict(images) - predicted_classes = resnet50.decode_predictions(predictions, top=10) - - index = 0 - frames_predictions_dictionary = {} - for i in predicted_classes: - object_list = [] - for imagenet_id, name, 
likelihood in i:
-                #print(" - {}: {:2f} likelihood".format(name, likelihood))
-                #if(likelihood>20):
-                object_list.append(name)
-
-            frames_predictions_dictionary[frame_names[index]] = object_list
-            index += 1
-
-    return frames_predictions_dictionary
+    return YOLO().predict(input_frames_path)
 
 def run():
     # image folder
diff --git a/src/yolo/model.py b/src/yolo/model.py
new file mode 100644
index 0000000..a51466d
--- /dev/null
+++ b/src/yolo/model.py
@@ -0,0 +1,37 @@
+import os
+import cv2
+from tqdm import tqdm
+
+from src.net.netarch import YoloArchitecture, YoloInferenceModel
+
+
+class YOLO(object):
+
+    def __init__(self):
+        self.debug_timings = True
+        self.yolo_arch = YoloArchitecture()
+        self.model = self.yolo_arch.get_model()
+        self.inf_model = YoloInferenceModel(self.model)
+
+    def predict(self, path):
+        frames_predictions_dictionary = {}
+        # check whether the given path is a directory of frames or a single file
+        if os.path.isdir(path):
+            fnames = [os.path.join(path, f) for f in os.listdir(path)
+                      if os.path.isfile(os.path.join(path, f))]
+        else:
+            fnames = [path]
+
+        for f in tqdm(fnames, desc='Processing Batch'):
+            image = cv2.imread(f)
+            if image is None:
+                # skip files that OpenCV cannot read as images
+                continue
+            labels = self.inf_model.predict(image.copy())
+            # keep at most the first 10 detected labels, padded with None
+            labels_limited = (list(labels)[:10] + [None] * 10)[:10]
+            # key by the frame file name, mirroring the previous ResNet50 version
+            frames_predictions_dictionary[os.path.basename(f)] = labels_limited
+
+        print('Process finished')
+        return frames_predictions_dictionary
diff --git a/src/yolo_test.py b/src/yolo_test.py
new file mode 100644
index 0000000..5a17fc0
--- /dev/null
+++ b/src/yolo_test.py
@@ -0,0 +1,6 @@
+from yolo.model import YOLO
+
+YOLO().predict(r"D:\Campus\FYP\video-summarization\src\test_data\generated_frames")
+
+
+
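
Note (not part of the patch): a minimal usage sketch of the new wrapper. It assumes the interpreter is started from the src/ directory, so that the hard-coded confs/config_coco.json and coco_model.h5 paths in net/netparams.py resolve; the frames directory below is a hypothetical example path.

    from yolo.model import YOLO

    # Maps each frame file name to a fixed-length list of 10 entries:
    # detected class labels first, padded with None.
    predictions = YOLO().predict("test_data/generated_frames/")

    for frame_name, labels in predictions.items():
        detected = [label for label in labels if label is not None]
        print(frame_name, detected)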