Testing model on ONNX format
mfclabber committed Sep 21, 2024
1 parent 8ce795e commit 37eb890
Showing 8 changed files with 303 additions and 65 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -9,7 +9,7 @@
</p>

## TODO:
- [ ] Testing model on ONNX format
- [X] Testing model on ONNX format
- [ ] Writing inference mode on TensorRT
- [ ] Create full README.md
- [ ] Add tracking algorithm (possibly [StrongSort](https://github.com/dyhBUPT/StrongSORT?tab=readme-ov-file))
Binary file modified output_track.mp4
Binary file not shown.
21 changes: 21 additions & 0 deletions scripts/convert2onnx.py
@@ -0,0 +1,21 @@
import onnx

import torch
import torch.nn as nn
import torch.nn.functional as F

from model import YOLOv9


if __name__ == "__main__":

torch_model = YOLOv9(path2weights="../weights/best.pt")  # path2weights is required by YOLOv9.__init__

# torch_input = torch.randn(1, 1, 1280, 640)
# onnx_program = torch.onnx.dynamo_export(torch_model, torch_input)

# onnx_program.save("../weights/yolov9t.onnx")

torch_model.export2onnx()
# onnx_model = onnx.load("/home/mfclabber/fs_cones_detection&monodepth/weights/best.onnx")
# onnx.checker.check_model(onnx_model)
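
Since the checker and runtime calls above are commented out, a minimal post-export sanity check is sketched below; the ../weights/best.onnx path and the 640x640 input shape are assumptions inferred from the export call, not fixed by this commit.

import onnx
import onnxruntime as ort
import numpy as np

# Structural validation: raises if the exported graph is malformed.
onnx_model = onnx.load("../weights/best.onnx")
onnx.checker.check_model(onnx_model)

# Smoke test: run one dummy frame through onnxruntime.
session = ort.InferenceSession("../weights/best.onnx",
                               providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name
dummy = np.random.rand(1, 3, 640, 640).astype(np.float32)
outputs = session.run(None, {input_name: dummy})
print([o.shape for o in outputs])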
180 changes: 130 additions & 50 deletions scripts/main.py
@@ -18,6 +18,9 @@
from ultralytics import YOLO
from roboflow import Roboflow

import onnx
import onnxruntime as ort

import torch
import torchvision
from torchvision import transforms, datasets
@@ -52,68 +55,145 @@
if __name__ == "__main__":

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLOv9().to(device)
encoder, depth_decoder, loaded_dict_enc = get_mono_640x192_model()

# image = np.array(Image.open(test_image_path_list[i])).transpose(2, 0, 1)[:3]
inference_mode = "on"  # set to "onnx" to exercise the exported model below

# disp_resized_np, pred_image = prediction(image,
# model,
# encoder,
# depth_decoder,
# loaded_dict_enc)


video_path = 'videos/track.mp4'
cap = cv2.VideoCapture(video_path)
if inference_mode != "onnx":

model = YOLOv9(path2weights="weights/best.pt").to(device)
encoder, depth_decoder, loaded_dict_enc = get_mono_640x192_model()

if not cap.isOpened():
print("Error opening video stream or file")
# image = np.array(Image.open(test_image_path_list[i])).transpose(2, 0, 1)[:3]

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# disp_resized_np, pred_image = prediction(image,
# model,
# encoder,
# depth_decoder,
# loaded_dict_enc)


video_path = 'videos/track.mp4'
cap = cv2.VideoCapture(video_path)

print(f"Размер видео: {frame_width}x{frame_height}, FPS: {fps}, Количество кадров: {total_frames}")
if not cap.isOpened():
print("Error opening video stream or file")

output_video_path = 'output_track.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

start_time = time.time()
num_frames = 0
frame_count = 0
print(f"Размер видео: {frame_width}x{frame_height}, FPS: {fps}, Количество кадров: {total_frames}")

while cap.isOpened():
ret, frame = cap.read()
output_video_path = 'output_track.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

frame_count += 1
num_frames += 1
current_fps = calculate_fps(start_time, num_frames)

if frame_count % 5 == 0 or frame_count == 1:
if ret:
disp_resized_np, annotated_frame = process_frame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
model,
encoder,
depth_decoder,
loaded_dict_enc)
cv2.putText(annotated_frame, f"FPS: {current_fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))
start_time = time.time()
num_frames = 0
frame_count = 0

while cap.isOpened():
ret, frame = cap.read()

frame_count += 1
num_frames += 1
current_fps = calculate_fps(start_time, num_frames)

if cv2.waitKey(25) & 0xFF == ord('q'):
if frame_count % 5 == 0 or frame_count == 1:
if ret:
disp_resized_np, annotated_frame = process_frame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
model,
encoder,
depth_decoder,
loaded_dict_enc)
cv2.putText(annotated_frame, f"FPS: {current_fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

if cv2.waitKey(25) & 0xFF == ord('q'):
break
else:
break
else:
break
else:
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))
else:
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

out.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

# time.sleep(1 / fps / 5)

cap.release()
out.release()

cv2.destroyAllWindows()

else:

# TODO: onnx_model

model = YOLOv9(path2weights="weights/best.onnx")
encoder, depth_decoder, loaded_dict_enc = get_mono_640x192_model()

# image = np.array(Image.open(test_image_path_list[i])).transpose(2, 0, 1)[:3]

# disp_resized_np, pred_image = prediction(image,
# model,
# encoder,
# depth_decoder,
# loaded_dict_enc)

out.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

# time.sleep(1 / fps / 5)
video_path = 'videos/track.mp4'
cap = cv2.VideoCapture(video_path)

if not cap.isOpened():
print("Error opening video stream or file")

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

print(f"Размер видео: {frame_width}x{frame_height}, FPS: {fps}, Количество кадров: {total_frames}")

output_video_path = 'output_track.mp4'
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

start_time = time.time()
num_frames = 0
frame_count = 0

while cap.isOpened():
ret, frame = cap.read()

frame_count += 1
num_frames += 1
current_fps = calculate_fps(start_time, num_frames)

if frame_count % 5 == 0 or frame_count == 1:
if ret:
disp_resized_np, annotated_frame = process_frame(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR),
model,
encoder,
depth_decoder,
loaded_dict_enc)
cv2.putText(annotated_frame, f"FPS: {current_fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

if cv2.waitKey(25) & 0xFF == ord('q'):
break
else:
break
else:
cv2.imshow('Frame', cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

out.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))

# time.sleep(1 / fps / 5)

cap.release()
out.release()

cv2.destroyAllWindows()

cap.release()
out.release()

cv2.destroyAllWindows()
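
The ONNX branch above still routes inference through the Ultralytics wrapper, which accepts .onnx weights directly. If the wrapper were dropped, a bare onnxruntime loop over the same video could look roughly like the sketch below; the letterbox-free resize and the missing output decoding are simplifications, not the repository's code.

import cv2
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("weights/best.onnx",
                               providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
input_name = session.get_inputs()[0].name

cap = cv2.VideoCapture("videos/track.mp4")
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # Naive resize to the export size; real code would letterbox to preserve aspect ratio.
    blob = cv2.resize(frame, (640, 640)).astype(np.float32) / 255.0
    blob = blob.transpose(2, 0, 1)[None]  # HWC -> NCHW, add batch dim
    preds = session.run(None, {input_name: blob})[0]
    # preds still needs confidence filtering and NMS before boxes can be drawn.
cap.release()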
36 changes: 31 additions & 5 deletions scripts/model.py
@@ -1,17 +1,30 @@
import torch
import numpy as np
from pathlib import Path
from ultralytics import YOLO
from typing import List, Dict

from utils import LABEL2ID, ID2LABEL


ID2LABEL = dict([
(0, "yellow_cone"),
(2, "blue_cone"),
(3, "large_orange_cone"),
(1, "orange_cone"),
(4, "unknown_cone")
])

LABEL2ID = {v: k for k, v in ID2LABEL.items()}  # invert ID2LABEL


class YOLOv9(torch.nn.Module):
def __init__(self, num_classes: int=4) -> None:
def __init__(self, path2weights: Path, num_classes: int=4) -> None:
super().__init__()

self.model = YOLO("../weights/best.pt")
self.path2weights = path2weights
self.model = YOLO(f"{path2weights}")

self.LABEL2LABEL = dict([
("unknown_cone", "blue_cone"),
@@ -22,11 +35,17 @@ def __init__(self, num_classes: int=4) -> None:
])

def predict(self, X: torch.Tensor, confidence=40, overlap=30) -> torch.Tensor:
results = self.model.predict(X.transpose(1, 2, 0))

if str(self.path2weights)[-2:] != "pt":  # non-.pt weights (e.g. ONNX): request GPU device 0 explicitly
results = self.model.predict(source=X.transpose(1, 2, 0), device=0)
else:
results = self.model.predict(source=X.transpose(1, 2, 0))

bboxes = results[0].boxes.data[:, :4]
labels_ = results[0].boxes.cls
scores = results[0].boxes.conf
labels = np.zeros_like(labels_.cpu())

for i, label in enumerate(labels_):
label = int(label.item())
labels[i] = LABEL2ID[self.LABEL2LABEL[ID2LABEL[label]]]
Expand All @@ -37,4 +56,11 @@ def predict(self, X: torch.Tensor, confidence=40, overlap=30) -> torch.Tensor:

# To calculate the loss function
def forward(self, images: List[torch.Tensor], annotation: List[Dict[str, torch.Tensor]]) -> Dict[str, int]:
return self.model(images, annotation)
return self.model(images, annotation)

def export2onnx(self, frame_size=640):
self.model.export(format="onnx", imgsz=frame_size)

return self.model
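
Ultralytics writes the exported .onnx next to the source weights, so export2onnx on weights/best.pt should produce weights/best.onnx, which is the path main.py loads in its ONNX branch. A usage sketch (paths are assumptions):

model = YOLOv9(path2weights="weights/best.pt")
model.export2onnx(frame_size=640)  # writes weights/best.onnx via Ultralytics

Ultralytics export also accepts further options, e.g. dynamic=True for dynamic input shapes, if variable frame sizes are needed later.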
18 changes: 9 additions & 9 deletions scripts/utils.py
@@ -196,11 +196,11 @@ def get_cone_distances(detected_cones, depth_map):
res = get_cone_distances(bboxes, 1 / disp_resized_np)

return disp_resized_np, show_image_with_objects(image,
bboxes,
labels,
scores,
depths_value=res,
threshold_score=threshold_score)
bboxes,
labels,
scores,
depths_value=res,
threshold_score=threshold_score)


def process_frame(frame,
@@ -210,10 +210,10 @@ def process_frame(frame,
loaded_dict_enc):

disp_resized_np, annotated_frame = prediction(np.array(frame).transpose(2, 0, 1)[:3],
model,
encoder,
depth_decoder,
loaded_dict_enc)
model,
encoder,
depth_decoder,
loaded_dict_enc)

return disp_resized_np, np.array(annotated_frame)
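
calculate_fps, called from scripts/main.py above, is not part of this diff. A minimal implementation consistent with how it is invoked (a start timestamp and a frame counter in, an average FPS out) might be:

import time

def calculate_fps(start_time: float, num_frames: int) -> float:
    # Average FPS since start_time; guard against a zero-length interval.
    elapsed = time.time() - start_time
    return num_frames / elapsed if elapsed > 0 else 0.0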

