diff --git a/__pycache__/blocks.cpython-37.pyc b/__pycache__/blocks.cpython-37.pyc deleted file mode 100644 index 969367d..0000000 Binary files a/__pycache__/blocks.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/sight.cpython-37.pyc b/__pycache__/sight.cpython-37.pyc deleted file mode 100644 index 8d61992..0000000 Binary files a/__pycache__/sight.cpython-37.pyc and /dev/null differ diff --git a/__pycache__/zoo.cpython-37.pyc b/__pycache__/zoo.cpython-37.pyc deleted file mode 100644 index 65bed1c..0000000 Binary files a/__pycache__/zoo.cpython-37.pyc and /dev/null differ diff --git a/build/lib/sightseer/__init__.py b/build/lib/sightseer/__init__.py new file mode 100644 index 0000000..da1bc57 --- /dev/null +++ b/build/lib/sightseer/__init__.py @@ -0,0 +1,6 @@ +# __init__.py +__version__ = "1.0.1" + +from sightseer.sightseer import Sightseer +from sightseer.zoo import * +from sightseer.proc import * \ No newline at end of file diff --git a/sight/blocks.py b/build/lib/sightseer/blocks.py similarity index 100% rename from sight/blocks.py rename to build/lib/sightseer/blocks.py diff --git a/sight/proc.py b/build/lib/sightseer/proc.py similarity index 100% rename from sight/proc.py rename to build/lib/sightseer/proc.py diff --git a/sight/sight.py b/build/lib/sightseer/sightseer.py similarity index 100% rename from sight/sight.py rename to build/lib/sightseer/sightseer.py diff --git a/sight/zoo.py b/build/lib/sightseer/zoo.py similarity index 100% rename from sight/zoo.py rename to build/lib/sightseer/zoo.py diff --git a/dist/sightseer-1.0.0-py3-none-any.whl b/dist/sightseer-1.0.0-py3-none-any.whl new file mode 100644 index 0000000..d73b251 Binary files /dev/null and b/dist/sightseer-1.0.0-py3-none-any.whl differ diff --git a/dist/sightseer-1.0.0.tar.gz b/dist/sightseer-1.0.0.tar.gz new file mode 100644 index 0000000..cfb36a2 Binary files /dev/null and b/dist/sightseer-1.0.0.tar.gz differ diff --git a/dist/sightseer-1.0.1-py3-none-any.whl b/dist/sightseer-1.0.1-py3-none-any.whl new file mode 100644 index 0000000..e82fb02 Binary files /dev/null and b/dist/sightseer-1.0.1-py3-none-any.whl differ diff --git a/dist/sightseer-1.0.1.tar.gz b/dist/sightseer-1.0.1.tar.gz new file mode 100644 index 0000000..81a4fd4 Binary files /dev/null and b/dist/sightseer-1.0.1.tar.gz differ diff --git a/main.py b/main.py index df94533..0c3f994 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,5 @@ -from sight.sight import Sightseer -from sight.zoo import YOLOv3Client +from sightseer.sight import Sightseer +from sightseer.zoo import YOLOv3Client # downloading and configuring weights and hyperparams yolo = YOLOv3Client() diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9985cb1 --- /dev/null +++ b/setup.py @@ -0,0 +1,18 @@ +import pathlib +from setuptools import setup + +HERE = pathlib.Path(__file__).parent +README = (HERE / "README.md").read_text() + +setup( + name="sightseer", + version="1.0.1", + description="State-of-the-art Computer Vision and Object Detection for TensorFlow.", + long_description=README, + long_description_content_type="text/markdown", + author="", + author_email="", + license="ASF", + packages=["sightseer"], + zip_safe=False +) \ No newline at end of file diff --git a/sightseer.egg-info/PKG-INFO b/sightseer.egg-info/PKG-INFO new file mode 100644 index 0000000..70c74e8 --- /dev/null +++ b/sightseer.egg-info/PKG-INFO @@ -0,0 +1,116 @@ +Metadata-Version: 2.1 +Name: sightseer +Version: 1.0.1 +Summary: State-of-the-art Computer Vision and Object Detection for TensorFlow. +Home-page: UNKNOWN +Author: +Author-email: +License: ASF +Description:
+
+
+
+
+ +
+ + + +
+ +State-of-the-art Computer Vision and Object Detection for TensorFlow.
+ + + *Sight* provides state-of-the-art general-purpose architectures (YOLO9000, MaskRCNN, Fast/Faster RCNN, SSD...) for Computer Vision and Object Detection tasks with 30+ pretrained models written in TensorFlow 1.15. + + ## Installation + + `sight` is written in Python 3.5+ and TensorFlow 1.15. + + Ideally, `sight` should be installed in a [virtual environments](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out this [tutorial](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) on getting started. + + ### Via PyPi + + To use `sight`, you must first have TensorFlow installed. To do so, follow the instructions on the [TensorFlow installation page](https://www.tensorflow.org/install/pip?lang=python3). + + When your virtual environment is set up with TensorFlow, you can install `sight` using `pip`: + + ```bash + pip install sight + ``` + + ### From Source + + Again, to install from source, you need TensorFlow 1.15 and above running in a virtual environment. You can install the package by cloning the repo and installing the dependencies: + + ```bash + git clone https://github.com/rish-16/sight + cd sight + pip install . + ``` + + ### Model Architectures + + 1. YOLOv3 (Darknet by Joseph Redmon) + 2. Mask R-CNN (Facebook AI Research) + + ## Usage + + 1a. Loading images + + ```python + from sight import Sightseer + + ss = Sightseer() + image = ss.load_image("path/to/image") + ``` + + 1b. Loading videos + + ```python + from sight import Sightseer + + ss = Sightseer() + frames = ss.load_vidsource("path/to/video", return_data=True) + ``` + + 1c. Loading webcam footage + + ```python + from sight import Sightseer + + ss = Sightseer() + image = ss.load_webcam() + ``` + + 1d. Loading screen grab footage + + ```python + from sight import Sightseer + + ss = Sightseer() + image = ss.screen_grab() + ``` + + 2. Using models from `sight.zoo` + + Once installed, any model offered by `sight` can be accessed in less than 10 lines of code. For instance, the code to use the YOLOv3 (Darknet) model is as follows: + + ```python + from sight import Sightseer + from sight.zoo import YOLOv3Client + + yolo = YOLOv3Client() + yolo.load_model() # downloads weights + + # loading images from local system + ss = Sightseer("path/to/img") + image = ss.load_image() + + # returns array of labels, confidence, and bounding box info + preds, pred_img = yolo.predict(image, return_image=True) + ss.render_image(pred_img) + ``` +Platform: UNKNOWN +Description-Content-Type: text/markdown diff --git a/sightseer.egg-info/SOURCES.txt b/sightseer.egg-info/SOURCES.txt new file mode 100644 index 0000000..aba0333 --- /dev/null +++ b/sightseer.egg-info/SOURCES.txt @@ -0,0 +1,12 @@ +README.md +setup.py +sightseer/__init__.py +sightseer/blocks.py +sightseer/proc.py +sightseer/sightseer.py +sightseer/zoo.py +sightseer.egg-info/PKG-INFO +sightseer.egg-info/SOURCES.txt +sightseer.egg-info/dependency_links.txt +sightseer.egg-info/not-zip-safe +sightseer.egg-info/top_level.txt \ No newline at end of file diff --git a/sightseer.egg-info/dependency_links.txt b/sightseer.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/sightseer.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/sightseer.egg-info/not-zip-safe b/sightseer.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/sightseer.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/sightseer.egg-info/top_level.txt b/sightseer.egg-info/top_level.txt new file mode 100644 index 0000000..d251932 --- /dev/null +++ b/sightseer.egg-info/top_level.txt @@ -0,0 +1 @@ +sightseer diff --git a/sightseer/__init__.py b/sightseer/__init__.py new file mode 100644 index 0000000..da1bc57 --- /dev/null +++ b/sightseer/__init__.py @@ -0,0 +1,6 @@ +# __init__.py +__version__ = "1.0.1" + +from sightseer.sightseer import Sightseer +from sightseer.zoo import * +from sightseer.proc import * \ No newline at end of file diff --git a/sightseer/blocks.py b/sightseer/blocks.py new file mode 100644 index 0000000..4c91461 --- /dev/null +++ b/sightseer/blocks.py @@ -0,0 +1,123 @@ +import struct +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Conv2D, ZeroPadding2D, BatchNormalization, LeakyReLU, add + +class BoundingBox(object): + def __init__(self, xmin, ymin, xmax, ymax, objectness=None, classes=None): + self.xmin = xmin + self.ymin = ymin + self.xmax = xmax + self.ymax = ymax + + self.objectness = objectness + self.classes = classes + + self.label = -1 + self.confidence = -1 + + def get_label(self): + if self.label == -1: + self.label = np.argmax(self.classes) + return self.label + + def get_confidence(self): + if self.confidence == -1: + self.confidence = self.classes[self.get_label()] + return self.confidence + +class SightLoader(): + def __init__(self, weights_path): + """ + Weights loading framework for all Sight models + """ + with open(weights_path, 'rb') as wf: + major, = struct.unpack('i', wf.read(4)) + minor, = struct.unpack('i', wf.read(4)) + revision, = struct.unpack('i', wf.read(4)) + + if (major*10+ minor) >= 2 and major < 1000 and minor < 1000: + wf.read(8) + else: + wf.read(4) + + transpose = (major > 1000) or (minor > 1000) + + binary = wf.read() + + self.offset = 0 + self.all_weights = np.frombuffer(binary, dtype="float32") + + def read_bytes(self, chunk_size): + self.offset = self.offset + chunk_size + return self.all_weights[self.offset - chunk_size:self.offset] + + def load_weights(self, model, verbose=True): + for i in range(106): # standard darknet layer count + try: + conv_layer = model.get_layer("conv_" + str(i)) + + if verbose: + print ("Loading Convolution #{}".format(i)) + + if i not in [81, 93, 105]: + norm_layer = model.get_layer("bnorm_" + str(i)) + + size = np.prod(norm_layer.get_weights()[0].shape) + + beta = self.read_bytes(size) + gamma = self.read_bytes(size) + mean = self.read_bytes(size) + var = self.read_bytes(size) + + weights = norm_layer.set_weights([gamma, beta, mean, var]) + + if len(conv_layer.get_weights()) > 1: + bias = self.read_bytes(np.prod(conv_layer.get_weights()[1].shape)) + kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) + + kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) + kernel = kernel.transpose([2, 3, 1, 0]) + conv_layer.set_weights([kernel, bias]) + else: + kernel = self.read_bytes(np.prod(conv_layer.get_weights()[0].shape)) + kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape))) + kernel = kernel.transpose([2, 3, 1, 0]) + conv_layer.set_weights([kernel]) + + except ValueError: + if verbose: + print ("No Convolution #{}".format(i)) + else: + pass + + if verbose: + print ("Finished loading weights into model. Predicting on input data...") + + def reset_offset(self): + self.offset = 0 + +class ConvBlock(): + def get_conv_block(inp, convs, skip=True): + x = inp + count = 0 + + for conv in convs: + if count == (len(convs) - 2) and skip: + skip_conn = x + count += 1 + + if conv['stride'] > 1: x = ZeroPadding2D(((1,0),(1,0)))(x) + + x = Conv2D(conv['filter'], + conv['kernel'], + strides=conv['stride'], + padding="valid" if conv['stride']>1 else "same", + name="conv_"+str(conv['layer_idx']), + use_bias=False if conv['bnorm'] else True)(x) + + if conv['bnorm']: x = BatchNormalization(epsilon=0.001, name="bnorm_"+str(conv['layer_idx']))(x) + + if conv['leaky']: x = LeakyReLU(alpha=0.1, name="leaky_"+str(conv['layer_idx']))(x) + + return add([skip_conn, x]) if skip else x \ No newline at end of file diff --git a/sightseer/proc.py b/sightseer/proc.py new file mode 100644 index 0000000..6a71697 --- /dev/null +++ b/sightseer/proc.py @@ -0,0 +1,106 @@ +import io +import json +import glob +from PIL import Image +import xml.etree.ElementTree as ET +import tensorflow as tf +import numpy as np +import cv2 +import pandas as pd + +class DataAnnotator(object): + def __init__(self, classes): + self.classes = classes # array of class labels + + def list_to_csv(self, annotations, outfile): + columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax'] + xml_df = pd.DataFrame(annotations, columns=columns) + xml_df.to_csv(outfile, index=None) + + def class_to_int(self, class): + for i in range(len(self.classes)): + if self.classes[i] == class: + return i + 1 + else: + return None + + def xml_to_csv(self, xml_path, csv_path): + annotations = [] + for xml_file in glob.glob(xml_path + '*.xml'): + tree = ET.parse(xml_file) + root = tree.getroot() + for member in root.findall('object'): + value = (root.find('filename').text, + int(root.find('size')[0].text), int(root.find('size')[1].text), + member[0].text, + int(member[4][0].text), int(member[4][1].text), + int(member[4][2].text), int(member[4][3].text)) + annotations.append(value) + + self.list_to_csv(annotations, csv_path) + + def json_to_csv(self, jsonpath, csvpath): + with open(jsonpath) as f: + images = json.load(f) + + annotations = [] + + for entry in images: + filename = images[entry]['filename'] + for region in images[entry]['regions']: + c = region['region_attributes']['class'] + xmin = region['shape_attributes']['x'] + ymin = region['shape_attributes']['y'] + xmax = xmin + region['shape_attributes']['width'] + ymax = ymin + region['shape_attributes']['height'] + width = 0 + height = 0 + + value = (filename, width, height, c, xmin, ymin, xmax, ymax) + annotations.append(value) + + self.list_to_csv(annotations, csvpath) + + def generate_tfexample(self, group, path): + with tf.gfile.GFile(os.path.join(path, '{}'.format(group.filename)), 'rb') as fid: + encoded_jpg = fid.read() + encoded_jpg_io = io.BytesIO(encoded_jpg) + image = Image.open(encoded_jpg_io) + width, height = image.size + + filename = group.filename.encode('utf8') + image_format = b'jpg' + xmins = [] + xmaxs = [] + ymins = [] + ymaxs = [] + classes_text = [] + classes = [] + + for index, row in group.object.iterrows(): + xmins.append(row['xmin'] / width) + xmaxs.append(row['xmax'] / width) + ymins.append(row['ymin'] / height) + ymaxs.append(row['ymax'] / height) + classes_text.append(row['class'].encode('utf8')) + classes.append(self.class_to_int(row['class'])) + + tf_example = tf.train.Example(features=tf.train.Features(feature={ + 'image/height': dataset_util.int64_feature(height), + 'image/width': dataset_util.int64_feature(width), + 'image/filename': dataset_util.bytes_feature(filename), + 'image/source_id': dataset_util.bytes_feature(filename), + 'image/encoded': dataset_util.bytes_feature(encoded_jpg), + 'image/format': dataset_util.bytes_feature(image_format), + 'image/object/bbox/xmin': dataset_util.float_list_feature(xmins), + 'image/object/bbox/xmax': dataset_util.float_list_feature(xmaxs), + 'image/object/bbox/ymin': dataset_util.float_list_feature(ymins), + 'image/object/bbox/ymax': dataset_util.float_list_feature(ymaxs), + 'image/object/class/text': dataset_util.bytes_list_feature(classes_text), + 'image/object/class/label': dataset_util.int64_list_feature(classes), + })) + + return tf_example + + def csv_to_tfrecord(self, csvpath, filename, tfrpath): + csv = pd.read_csv(csvpath).values \ No newline at end of file diff --git a/sightseer/sightseer.py b/sightseer/sightseer.py new file mode 100644 index 0000000..a684a95 --- /dev/null +++ b/sightseer/sightseer.py @@ -0,0 +1,125 @@ +import cv2 +import numpy as np +import tensorflow as tf +from tensorflow.keras.preprocessing.image import load_img, img_to_array +from PIL import ImageGrab +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle + +class Sightseer(object): + def __init__(self): + self.filepath = None + + def render_grayscale(self, frame): + gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + return gray_frame + + def load_webcam(self, return_data=True, set_gray=True, kill_key="q", width=160, height=120): + + cap = cv2.VideoCapture(0) + cap.set(cv2.CAP_PROP_FRAME_WIDTH, width) + cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height) + + frames = [] + + while True: + ret, frame = cap.read() + print (frame.shape) + + if set_gray: + frame = self.render_grayscale(frame) + + frame = cv2.flip(frame, 1) # prevent lateral inversion + cv2.imshow('frame', frame) + frames.append(frame) + + if cv2.waitKey(1) & 0xFF == ord(kill_key): + break + + cap.release() + cv2.destroyAllWindows() + + if return_data: + frames = np.array(frames) + return frames + + def screen_grab(self, set_gray=True, write_data=True, return_data=True, kill_key="q", filename='output.avi', width=400, height=400): + fourcc = cv2.VideoWriter_fourcc(*'XVID') + out = cv2.VideoWriter(filename, fourcc, 20.0, (640, 480)) + + frames = [] + + while True: + img = np.array(ImageGrab.grab(bbox=(0, 0, width, height))) + frame = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + + if write_data: + out.write(imcv) + + if set_gray: + frame = self.render_grayscale(img) + + cv2.imshow('frame', frame) + frames.append(frame) + + if cv2.waitKey(1) & 0xFF == ord(kill_key): + break + + out.release() + cv2.destroyAllWindows() + + if return_data: + frames = np.array(frames) + return frames + + def load_vidsource(self, filepath, return_data=True, set_gray=True, kill_key="q"): + self.filepath = filepath + vidcap = cv2.VideoCapture(filepath) + + frame_exists, frame = vidcap.read() + frames = [] + + while frame_exists: + frame_exists, frame = vidcap.read() + print (frame.shape) + + if set_gray: + frame = self.render_grayscale(frame) + + cv2.imshow('frame', frame) + frames.append(frame) + + if cv2.waitKey(1) & 0xFF == ord(kill_key): + break + + vidcap.release() + cv2.destroyAllWindows() + + if return_data: + frames = np.array(frames) + return frames + + def load_image(self, filepath): + self.filepath = filepath + try: + img = cv2.imread(filepath) + return img + except: + raise FileExistsError ("File does not exist. You may want to check the filepath again.") + + def get_final_filepath(self, image_path): + image_path = image_path.split('/') + img_name = image_path[-1] + img_name = img_name.split('.') + img_name = img_name[0] + "_detected." + img_name[1] + image_path = "/".join(image_path[:-1]) + "/" + img_name + + return image_path + + def render_image(self, image, save_image=False): + plt.imshow(image) + plt.show() + + if save_image: + new_filepath = self.get_final_filepath(self.filepath) + plt.savefig(new_filepath) \ No newline at end of file diff --git a/sightseer/zoo.py b/sightseer/zoo.py new file mode 100644 index 0000000..7dcf84e --- /dev/null +++ b/sightseer/zoo.py @@ -0,0 +1,369 @@ +import os +import wget +import struct +import shutil +import logging + +import cv2 +import numpy as np +import tensorflow as tf +from tensorflow.keras.layers import Input, UpSampling2D, concatenate +from tensorflow.keras.models import Model + +from .blocks import ConvBlock, BoundingBox, SightLoader + +# disabling warnings and logging +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' +tf.autograph.set_verbosity(tf.compat.v1.logging.ERROR) +logging.disable(logging.WARNING) + +class YOLOv3Client(object): + def __init__(self, nms_threshold=0.45, obj_threshold=0.5, net_h=416, net_w=416, anchors=[[116, 90, 156, 198, 373, 326], [30, 61, 62, 45, 59, 119], [10, 13, 16, 30, 33, 23]]): + """ + Params: + ------- + + - nsm_threshold (float): Non Maximum Suppression threshold for selecting bounding boxes ina region + default: 0.45 + min: 0 + max: 1 + + - obj_threshold (float): + default: 0.5 + min: 0 + max: 1 + + - + """ + self.nms_threshold = nms_threshold + self.obj_threshold = obj_threshold + self.net_h, self.net_w = net_h, net_w + self.anchors = anchors + self.yolo_model = None + self.all_labels = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", + "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", + "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", + "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", + "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", + "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", + "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", + "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", + "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"] + + def download_weights(self): + """ + Downloads the weights from online and saves them locally + """ + + if os.path.exists("./bin/yolov3.weights"): + print ("Weights already exist. Proceeding to load YOLOv3Client...") + else: + print ("Downloading weights. This may may take a moment...") + weights_url = "https://pjreddie.com/media/files/yolov3.weights" + # config_url = "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg" + + wget.download(weights_url, os.getcwd() + "/yolov3.weights") + # wget.download(config_url, os.getcwd() + "/yolov3.cfg") + + os.mkdir("./bin", 0o755) # configuring admin rights + shutil.move("./yolov3.weights", "./bin/yolov3.weights") + # shutil.move("./yolov3.cfg", "./bin/yolov3.cfg") + + print ("\n\nWeights downloaded successfully!") + + def load_architecture(self): + """ + Returns tf.keras.models.Model instance + """ + inp_image = Input(shape=[None, None, 3]) + + x = ConvBlock.get_conv_block(inp_image, [{'filter': 32, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 0}, + {'filter': 64, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 1}, + {'filter': 32, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 2}, + {'filter': 64, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 3}]) + + x = ConvBlock.get_conv_block(x, [{'filter': 128, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 5}, + {'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 6}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 7}]) + + x = ConvBlock.get_conv_block(x, [{'filter': 64, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 9}, + {'filter': 128, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 10}]) + + x = ConvBlock.get_conv_block(x, [{'filter': 256, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 12}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 13}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 14}]) + + for i in range(7): + x = ConvBlock.get_conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 16+i*3}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 17+i*3}]) + + skip_36 = x + + x = ConvBlock.get_conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 37}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 38}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 39}]) + + for i in range(7): + x = ConvBlock.get_conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 41+i*3}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 42+i*3}]) + + skip_61 = x + + x = ConvBlock.get_conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 2, 'bnorm': True, 'leaky': True, 'layer_idx': 62}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 63}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 64}]) + + for i in range(3): + x = ConvBlock.get_conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 66+i*3}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 67+i*3}]) + + x = ConvBlock.get_conv_block(x, [{'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 75}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 76}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 77}, + {'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 78}, + {'filter': 512, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 79}], skip=False) + + yolo_82 = ConvBlock.get_conv_block(x, [{'filter': 1024, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 80}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 81}], skip=False) + + x = ConvBlock.get_conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 84}], skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_61]) + + x = ConvBlock.get_conv_block(x, [{'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 87}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 88}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 89}, + {'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 90}, + {'filter': 256, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 91}], skip=False) + + yolo_94 = ConvBlock.get_conv_block(x, [{'filter': 512, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 92}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 93}], skip=False) + + x = ConvBlock.get_conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 96}], skip=False) + x = UpSampling2D(2)(x) + x = concatenate([x, skip_36]) + + yolo_106 = ConvBlock.get_conv_block(x, [{'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 99}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 100}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 101}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 102}, + {'filter': 128, 'kernel': 1, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 103}, + {'filter': 256, 'kernel': 3, 'stride': 1, 'bnorm': True, 'leaky': True, 'layer_idx': 104}, + {'filter': 255, 'kernel': 1, 'stride': 1, 'bnorm': False, 'leaky': False, 'layer_idx': 105}], skip=False) + + model = Model(inp_image, [yolo_82, yolo_94, yolo_106]) + return model + + def sigmoid(self, z): + return 1 / (1 + np.exp(-z)) + + def preprocess(self, image): + """ + Resizes image to appropriate dimensions for YOLOv3 + """ + new_h, new_w, _ = image.shape + + if (float(self.net_w)/new_w) < (float(self.net_h)/new_h): + new_h = (new_h * self.net_w)//new_w + new_w = self.net_w + else: + new_w = (new_w * self.net_h)//new_h + new_h = self.net_h + + # resize the image to the new size + resized = cv2.resize(image[:, :, ::-1]/255., (int(new_w), int(new_h))) + + # embed the image into the standard letter box + new_img = np.ones((self.net_h, self.net_w, 3)) * 0.5 + new_img[int((self.net_h-new_h)//2):int((self.net_h+new_h)//2), int((self.net_w-new_w)//2):int((self.net_w+new_w)//2), :] = resized + new_img = np.expand_dims(new_img, 0) + + return new_img + + def interval_overlap(self, int_a, int_b): + x1, x2 = int_a + x3, x4 = int_b + + if x3 < x1: + if x4 < x1: + return 0 + else: + return min(x2, x4) - x1 + else: + if x2 < x3: + return 0 + else: + return min(x2, x4) - x3 + + def bbox_iou(self, box1, box2): + """ + Finds IOU between all bounding boxes before non maximum suppression process + """ + int_w = self.interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) + int_h = self.interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) + + intersect = int_w * int_h + + w1, h1 = box1.xmax - box1.xmin, box1.ymax - box1.ymin + w2, h2 = box2.xmax - box2.xmin, box2.ymax - box2.ymin + + union = w1*h1 + w2*h2 - intersect + + return float(intersect) / union + + def non_maximum_suppression(self, boxes): + if len(boxes) > 0: + nb_class = len(boxes[0].classes) + else: + return + + for c in range(nb_class): + sorted_indices = np.argsort([-box.classes[c] for box in boxes]) + + for i in range(len(sorted_indices)): + index_i = sorted_indices[i] + + if boxes[index_i].classes[c] == 0: continue + + for j in range(i+1, len(sorted_indices)): + index_j = sorted_indices[j] + + if self.bbox_iou(boxes[index_i], boxes[index_j]) >= self.nms_threshold: + boxes[index_j].classes[c] = 0 + + return boxes + + def decode_preds(self, preds, anchors): + gridh, gridw = preds.shape[:2] + nb_box = 3 + preds = preds.reshape([gridh, gridw, nb_box, -1]) + nb_class = preds.shape[-1] - 5 + + boxes = [] + + preds[..., :2] = self.sigmoid(preds[..., :2]) + preds[..., 4:] = self.sigmoid(preds[..., 4:]) + preds[..., 5:] = preds[..., 4][..., np.newaxis] * preds[..., 5:] + preds[..., 5:] *= preds[..., 5:] > self.obj_threshold + + for i in range(gridh * gridw): + row = i / gridw + col = i % gridw + + for b in range(nb_box): + objectness = preds[int(row)][int(col)][b][4] + + if (objectness.all() <= self.obj_threshold): continue + + x, y, w, h = preds[int(row)][int(col)][b][:4] + + x = (col + x) / gridw + y = (row + y) / gridh + w = anchors[2 * b + 0] * np.exp(w) / self.net_w + h = anchors[2 * b + 1] * np.exp(h) / self.net_h + + classes = preds[int(row)][col][b][5:] + + box = BoundingBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes) + + boxes.append(box) + + return boxes + + def rectify_boxes(self, boxes, image_h, image_w): + if (float(self.net_w)/image_w) < (float(self.net_h)/image_h): + new_w = self.net_w + new_h = (image_h * self.net_w)/ image_w + else: + new_h = self.net_w + new_w = (image_w * self.net_h) / image_h + + for i in range(len(boxes)): + x_offset, x_scale = (self.net_w - new_w)/2./self.net_w, float(new_w)/self.net_w + y_offset, y_scale = (self.net_h - new_h)/2./self.net_h, float(new_h)/self.net_h + + boxes[i].xmin = int((boxes[i].xmin - x_offset) / x_scale * image_w) + boxes[i].xmax = int((boxes[i].xmax - x_offset) / x_scale * image_w) + boxes[i].ymin = int((boxes[i].ymin - y_offset) / y_scale * image_h) + boxes[i].ymax = int((boxes[i].ymax - y_offset) / y_scale * image_h) + + return boxes + + def get_boxes(self, image, boxes, verbose=True, random_coloring=True): + final_boxes = [] + + for box in boxes: + final_label = "" + label = -1 + + for i in range(len(self.all_labels)): + if box.classes[i] > self.obj_threshold: + final_label += self.all_labels[i] + label = i + + if verbose: + print ("{}: {:.3f}%".format(self.all_labels[i], box.classes[i]*100)) + + final_boxes.append([final_label, + box.classes[i] * 100, + { + 'xmin': box.xmin, + 'ymin': box.ymin, + 'xmax': box.xmax, + 'ymax': box.ymax + } + ]) + + if label >= 0: + if random_coloring: + r, g, b = np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255) + else: + r, g, b = 0, 255, 0 + + cv2.rectangle(image, (box.xmin, box.ymin), (box.xmax, box.ymax), (r, g, b), 1) + cv2.putText(image, '{} {:.3f}'.format(final_label, box.get_confidence()), (box.xmax, box.ymin - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (r, g, b), 2) + + return final_boxes, image + + def load_model(self, default_path="./bin/yolov3.weights", verbose=True): + """ + Downloads weights and config, loads checkpoints into architecture + """ + self.download_weights() # downloading weights from online + loader = SightLoader(default_path) + + self.yolo_model = self.load_architecture() # loading weights into model + loader.load_weights(self.yolo_model, verbose) + + def predict(self, original_image, return_img=False, verbose=True): + """ + Returns a list of BoundingBox metadata (class label, confidence score, coordinates) + and the edited image with bounding boxes and their corresponding text labels/confidence scores + """ + image_h, image_w = original_image.shape[:2] + + if self.yolo_model == None: + raise ValueError ("YOLOv3 weights needs to be downloaded and configured into the model before use. You can use the `load_model()` method to do so.") + + proc_image = self.preprocess(original_image) + preds = self.yolo_model.predict(proc_image) + boxes = [] + + for i in range(len(preds)): + boxes += self.decode_preds(preds[i][0], self.anchors[i]) + + boxes = self.rectify_boxes(boxes, image_h, image_w) + boxes = self.non_maximum_suppression(boxes) + + box_list, box_image = self.get_boxes(original_image, boxes, verbose) + + if return_img: + box_image = box_image.squeeze() + return box_list, box_image + else: + return box_list + +class MaskRCNNClient(object): + def __init__(self): + pass \ No newline at end of file