NirAharon · dou3516 · Sep 1, 2022 · Sep 1, 2022 · Sep 1, 2022 · Sep 1, 2022
diff --git a/README.md b/README.md
@@ -12,6 +12,24 @@
 
 <p align="center"><img src="assets/Results_Bubbles.png"/></p>
 
+## Updates of this repo
+**9/1/2022**
+- Open --downscale parameter to speed up track speed.
+- Add --legacy parameter to support un-normalization input.
+- Add --save_size parameter to adjust the size of saved video/image.
+- Add time counter for det, track, save separately.
+
+## Experiments of this repo
+### GMC downscale
+downscale in GMC defaults to 2. It cost unbearable 3.6s. Time increase 10 times when downscale increase in 2 times.
+
+| Tracker       |  input_size |  downscale  |  time  |
+|:--------------|:-------------:|:------:|:------:|
+| BoT-SORT      |  (768, 1280)  |  2  |  3.6  |
+| BoT-SORT      |  (768, 1280)  |  4  |  0.26 |
+| BoT-SORT      |  (768, 1280)  |  8  |  0.02 |
+
+
 ## Highlights 🚀
 
 - YOLOX & YOLOv7 support
@@ -216,16 +234,16 @@ Demo with BoT-SORT(-ReID) based YOLOX and multi-class.
 cd <BoT-SORT_dir>
 
 # Original example
-python3 tools/demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result
+python3 tools/demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result --legacy
 
 # Multi-class example
-python3 tools/mc_demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result
+python3 tools/mc_demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result --legacy
 ```
 
 Demo with BoT-SORT(-ReID) based YOLOv7 and multi-class.
 ```shell
 cd <BoT-SORT_dir>
-python3 tools/mc_demo_yolov7.py --weights pretrained/yolov7-d6.pt --source <path_to_video/images> --fuse-score --agnostic-nms (--with-reid)
+python3 tools/mc_demo_yolov7.py --weights pretrained/yolov7-d6.pt --source <path_to_video/images> --fuse-score --agnostic-nms (--with-reid) --legacy
 ```
 
 ## Note

diff --git a/tools/demo.py b/tools/demo.py
@@ -29,6 +29,7 @@ def make_parser():
     parser.add_argument("--path", default="", help="path to images or video")
     parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
     parser.add_argument("--save_result", action="store_true",help="whether to save the inference result of image/video")
+    parser.add_argument("--save_size", default=None, type=str, help="save size of image/video, used to adjust output size")
     parser.add_argument("-f", "--exp_file", default=None, type=str, help="pls input your expriment description file")
     parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
     parser.add_argument("--device", default="gpu", type=str, help="device to run our model, can either be cpu or gpu")
@@ -39,6 +40,7 @@ def make_parser():
     parser.add_argument("--fp16", dest="fp16", default=False, action="store_true",help="Adopting mix precision evaluating.")
     parser.add_argument("--fuse", dest="fuse", default=False, action="store_true", help="Fuse conv and bn for testing.")
     parser.add_argument("--trt", dest="trt", default=False, action="store_true", help="Using TensorRT model for testing.")
+    parser.add_argument("--legacy", dest="legacy", default=False, action="store_true", help="legacy code, such as mean/std normalization.")
 
     # tracking args
     parser.add_argument("--track_high_thresh", type=float, default=0.6, help="tracking confidence threshold")
@@ -52,6 +54,7 @@ def make_parser():
 
     # CMC
     parser.add_argument("--cmc-method", default="orb", type=str, help="cmc method: files (Vidstab GMC) | orb | ecc")
+    parser.add_argument("--downscale", default=2, type=int, help="cmc downscale, large image leads to very slow gmc, increase downscale to increase speed")
 
     # ReID
     parser.add_argument("--with-reid", dest="with_reid", default=False, action="store_true", help="test mot20.")
@@ -94,7 +97,8 @@ def __init__(
         trt_file=None,
         decoder=None,
         device=torch.device("cpu"),
-        fp16=False
+        fp16=False,
+        legacy=False
     ):
         self.model = model
         self.decoder = decoder
@@ -115,6 +119,7 @@ def __init__(
             self.model = model_trt
         self.rgb_means = (0.485, 0.456, 0.406)
         self.std = (0.229, 0.224, 0.225)
+        self.legacy = legacy
 
     def inference(self, img, timer):
         img_info = {"id": 0}
@@ -129,7 +134,7 @@ def inference(self, img, timer):
         img_info["width"] = width
         img_info["raw_img"] = img
 
-        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
+        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std, legacy=self.legacy)
         img_info["ratio"] = ratio
         img = torch.from_numpy(img).unsqueeze(0).float().to(self.device)
         if self.fp16:
@@ -156,11 +161,17 @@ def image_demo(predictor, vis_folder, current_time, args):
     timer = Timer()
     results = []
 
+    st_det = 0
+    st_track = 0
+    st_save = 0
     for frame_id, img_path in enumerate(files, 1):
 
+        t0 = time.time()
         # Detect objects
         outputs, img_info = predictor.inference(img_path, timer)
         scale = min(exp.test_size[0] / float(img_info['height'], ), exp.test_size[1] / float(img_info['width']))
+        t_det = time.time()
+        st_det += t_det - t0
 
         detections = []
         if outputs[0] is not None:
@@ -194,19 +205,29 @@ def image_demo(predictor, vis_folder, current_time, args):
             timer.toc()
             online_im = img_info['raw_img']
 
+        t_track = time.time()
+        st_track += t_track - t_det
+
         # result_image = predictor.visual(outputs[0], img_info, predictor.confthre)
         if args.save_result:
             timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
             save_folder = osp.join(vis_folder, timestamp)
             os.makedirs(save_folder, exist_ok=True)
             cv2.imwrite(osp.join(save_folder, osp.basename(img_path)), online_im)
 
+        t_save = time.time()
+        st_save += t_save - t_track
+
         if frame_id % 20 == 0:
-            logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1. / max(1e-5, timer.average_time)))
+            logger.info('Processing frame {} ({:.2f} fps, det: {:.2f} fps, track: {:.2f} fps, save: {:.2f} fps)'
+                        .format(frame_id, 1. / max(1e-5, timer.average_time), 20./st_det, 20./st_track, 20./st_save))
+            st_det = 0
+            st_track = 0
+            st_save = 0
 
-        ch = cv2.waitKey(0)
-        if ch == 27 or ch == ord("q") or ch == ord("Q"):
-            break
+        # ch = cv2.waitKey(0)
+        # if ch == 27 or ch == ord("q") or ch == ord("Q"):
+        #     break
 
     if args.save_result:
         res_file = osp.join(vis_folder, f"{timestamp}.txt")
@@ -228,21 +249,38 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
     else:
         save_path = osp.join(save_folder, "camera.mp4")
     logger.info(f"video save_path is {save_path}")
-    vid_writer = cv2.VideoWriter(
-        save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
-    )
+    if args.save_size is not None:
+        vid_writer = cv2.VideoWriter(
+            save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (args.save_size[1], args.save_size[0])
+        )
+    else:
+        vid_writer = cv2.VideoWriter(
+            save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
+        )
     tracker = BoTSORT(args, frame_rate=args.fps)
     timer = Timer()
     frame_id = 0
     results = []
+
+    st_det = 1e-5
+    st_track = 1e-5
+    st_save = 1e-5
     while True:
         if frame_id % 20 == 0:
-            logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1. / max(1e-5, timer.average_time)))
+            logger.info('Processing frame {} ({:.2f} fps, det: {:.2f} fps, track: {:.2f} fps, save: {:.2f} fps)'
+                        .format(frame_id, 1. / max(1e-5, timer.average_time), 20. / st_det, 20. / st_track,
+                                20. / st_save))
+            st_det = 0
+            st_track = 0
+            st_save = 0
         ret_val, frame = cap.read()
         if ret_val:
             # Detect objects
+            t0 = time.time()
             outputs, img_info = predictor.inference(frame, timer)
             scale = min(exp.test_size[0] / float(img_info['height'], ), exp.test_size[1] / float(img_info['width']))
+            t_det = time.time()
+            st_det += t_det - t0
 
             if outputs[0] is not None:
                 outputs = outputs[0].cpu().numpy()
@@ -273,11 +311,22 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
             else:
                 timer.toc()
                 online_im = img_info['raw_img']
+            t_track = time.time()
+            st_track += t_track - t_det
+
             if args.save_result:
+                if args.save_size is not None:
+                    online_im = cv2.resize(
+                                online_im,
+                                (args.save_size[1], args.save_size[0]),
+                                interpolation=cv2.INTER_LINEAR,
+                                )
                 vid_writer.write(online_im)
-            ch = cv2.waitKey(1)
-            if ch == 27 or ch == ord("q") or ch == ord("Q"):
-                break
+            t_save = time.time()
+            st_save += t_save - t_track
+            # ch = cv2.waitKey(1)
+            # if ch == 27 or ch == ord("q") or ch == ord("Q"):
+            #     break
         else:
             break
         frame_id += 1
@@ -348,7 +397,10 @@ def main(exp, args):
         trt_file = None
         decoder = None
 
-    predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16)
+    if args.save_size is not None:
+        args.save_size = tuple(map(int, args.save_size.split(',')))
+
+    predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16, args.legacy)
     current_time = time.localtime()
     if args.demo == "image" or args.demo == "images":
         image_demo(predictor, vis_folder, current_time, args)

diff --git a/tracker/bot_sort.py b/tracker/bot_sort.py
@@ -10,6 +10,8 @@
 
 from fast_reid.fast_reid_interfece import FastReIDInterface
 
+import time
+
 
 class STrack(BaseTrack):
     shared_kalman = KalmanFilter()
@@ -225,7 +227,7 @@ def __init__(self, args, frame_rate=30):
         if args.with_reid:
             self.encoder = FastReIDInterface(args.fast_reid_config, args.fast_reid_weights, args.device)
 
-        self.gmc = GMC(method=args.cmc_method, verbose=[args.name, args.ablation])
+        self.gmc = GMC(method=args.cmc_method, downscale=args.downscale, verbose=[args.name, args.ablation])
 
     def update(self, output_results, img):
         self.frame_id += 1
@@ -294,10 +296,13 @@ def update(self, output_results, img):
         # Predict the current location with KF
         STrack.multi_predict(strack_pool)
 
+        t0 = time.time()
         # Fix camera motion
         warp = self.gmc.apply(img, dets)
         STrack.multi_gmc(strack_pool, warp)
         STrack.multi_gmc(unconfirmed, warp)
+        t1 = time.time()
+        # print("time: %f" % (t1 - t0))
 
         # Associate with high score detection boxes
         ious_dists = matching.iou_distance(strack_pool, detections)

diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py
@@ -186,7 +186,7 @@ def _mirror(image, boxes):
     return image, boxes
 
 
-def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
+def preproc(image, input_size, mean, std, swap=(2, 0, 1), legacy=False):
     if len(image.shape) == 3:
         padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
     else:
@@ -200,12 +200,13 @@ def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
     ).astype(np.float32)
     padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
 
-    padded_img = padded_img[:, :, ::-1]
-    padded_img /= 255.0
-    if mean is not None:
-        padded_img -= mean
-    if std is not None:
-        padded_img /= std
+    if legacy:
+        padded_img = padded_img[:, :, ::-1]
+        padded_img /= 255.0
+        if mean is not None:
+            padded_img -= mean
+        if std is not None:
+            padded_img /= std
     padded_img = padded_img.transpose(swap)
     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
     return padded_img, r