Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open --downscale parameter to increase speed #35

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,24 @@

<p align="center"><img src="assets/Results_Bubbles.png"/></p>

## Updates of this repo
**9/1/2022**
- Expose the `--downscale` parameter to speed up tracking.
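- Add `--legacy` parameter to enable the legacy preprocessing (channel reversal, scaling by 1/255 and mean/std normalization); without it the input is left un-normalized.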
- Add `--save_size` parameter (given as `height,width`) to adjust the size of the saved video/image.
- Add separate timing counters for detection, tracking, and saving.

## Experiments of this repo
### GMC downscale
`downscale` in GMC defaults to 2, which costs an unbearable 3.6 s at this input size. Each time `downscale` is doubled, the time drops by roughly a factor of 10.

| Tracker | input_size | downscale | time |
|:--------------|:-------------:|:------:|:------:|
| BoT-SORT | (768, 1280) | 2 | 3.6 |
| BoT-SORT | (768, 1280) | 4 | 0.26 |
| BoT-SORT | (768, 1280) | 8 | 0.02 |


## Highlights 🚀

- YOLOX & YOLOv7 support
Expand Down Expand Up @@ -216,16 +234,16 @@ Demo with BoT-SORT(-ReID) based YOLOX and multi-class.
cd <BoT-SORT_dir>

# Original example
python3 tools/demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result
python3 tools/demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result --legacy

# Multi-class example
python3 tools/mc_demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result
python3 tools/mc_demo.py video --path <path_to_video> -f yolox/exps/example/mot/yolox_x_mix_det.py -c pretrained/bytetrack_x_mot17.pth.tar --with-reid --fuse-score --fp16 --fuse --save_result --legacy
```

Demo with BoT-SORT(-ReID) based YOLOv7 and multi-class.
```shell
cd <BoT-SORT_dir>
python3 tools/mc_demo_yolov7.py --weights pretrained/yolov7-d6.pt --source <path_to_video/images> --fuse-score --agnostic-nms (--with-reid)
python3 tools/mc_demo_yolov7.py --weights pretrained/yolov7-d6.pt --source <path_to_video/images> --fuse-score --agnostic-nms (--with-reid) --legacy
```

## Note
Expand Down
80 changes: 66 additions & 14 deletions tools/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def make_parser():
parser.add_argument("--path", default="", help="path to images or video")
parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
parser.add_argument("--save_result", action="store_true",help="whether to save the inference result of image/video")
parser.add_argument("--save_size", default=None, type=str, help="save size of image/video, used to adjust output size")
parser.add_argument("-f", "--exp_file", default=None, type=str, help="pls input your expriment description file")
parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
parser.add_argument("--device", default="gpu", type=str, help="device to run our model, can either be cpu or gpu")
Expand All @@ -39,6 +40,7 @@ def make_parser():
parser.add_argument("--fp16", dest="fp16", default=False, action="store_true",help="Adopting mix precision evaluating.")
parser.add_argument("--fuse", dest="fuse", default=False, action="store_true", help="Fuse conv and bn for testing.")
parser.add_argument("--trt", dest="trt", default=False, action="store_true", help="Using TensorRT model for testing.")
parser.add_argument("--legacy", dest="legacy", default=False, action="store_true", help="legacy code, such as mean/std normalization.")

# tracking args
parser.add_argument("--track_high_thresh", type=float, default=0.6, help="tracking confidence threshold")
Expand All @@ -52,6 +54,7 @@ def make_parser():

# CMC
parser.add_argument("--cmc-method", default="orb", type=str, help="cmc method: files (Vidstab GMC) | orb | ecc")
parser.add_argument("--downscale", default=2, type=int, help="cmc downscale, large image leads to very slow gmc, increase downscale to increase speed")

# ReID
parser.add_argument("--with-reid", dest="with_reid", default=False, action="store_true", help="test mot20.")
Expand Down Expand Up @@ -94,7 +97,8 @@ def __init__(
trt_file=None,
decoder=None,
device=torch.device("cpu"),
fp16=False
fp16=False,
legacy=False
):
self.model = model
self.decoder = decoder
Expand All @@ -115,6 +119,7 @@ def __init__(
self.model = model_trt
self.rgb_means = (0.485, 0.456, 0.406)
self.std = (0.229, 0.224, 0.225)
self.legacy = legacy

def inference(self, img, timer):
img_info = {"id": 0}
Expand All @@ -129,7 +134,7 @@ def inference(self, img, timer):
img_info["width"] = width
img_info["raw_img"] = img

img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
img, ratio = preproc(img, self.test_size, self.rgb_means, self.std, legacy=self.legacy)
img_info["ratio"] = ratio
img = torch.from_numpy(img).unsqueeze(0).float().to(self.device)
if self.fp16:
Expand All @@ -156,11 +161,17 @@ def image_demo(predictor, vis_folder, current_time, args):
timer = Timer()
results = []

st_det = 0
st_track = 0
st_save = 0
for frame_id, img_path in enumerate(files, 1):

t0 = time.time()
# Detect objects
outputs, img_info = predictor.inference(img_path, timer)
scale = min(exp.test_size[0] / float(img_info['height'], ), exp.test_size[1] / float(img_info['width']))
t_det = time.time()
st_det += t_det - t0

detections = []
if outputs[0] is not None:
Expand Down Expand Up @@ -194,19 +205,29 @@ def image_demo(predictor, vis_folder, current_time, args):
timer.toc()
online_im = img_info['raw_img']

t_track = time.time()
st_track += t_track - t_det

# result_image = predictor.visual(outputs[0], img_info, predictor.confthre)
if args.save_result:
timestamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
save_folder = osp.join(vis_folder, timestamp)
os.makedirs(save_folder, exist_ok=True)
cv2.imwrite(osp.join(save_folder, osp.basename(img_path)), online_im)

t_save = time.time()
st_save += t_save - t_track

if frame_id % 20 == 0:
logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1. / max(1e-5, timer.average_time)))
logger.info('Processing frame {} ({:.2f} fps, det: {:.2f} fps, track: {:.2f} fps, save: {:.2f} fps)'
.format(frame_id, 1. / max(1e-5, timer.average_time), 20./st_det, 20./st_track, 20./st_save))
st_det = 0
st_track = 0
st_save = 0

ch = cv2.waitKey(0)
if ch == 27 or ch == ord("q") or ch == ord("Q"):
break
# ch = cv2.waitKey(0)
# if ch == 27 or ch == ord("q") or ch == ord("Q"):
# break

if args.save_result:
res_file = osp.join(vis_folder, f"{timestamp}.txt")
Expand All @@ -228,21 +249,38 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
else:
save_path = osp.join(save_folder, "camera.mp4")
logger.info(f"video save_path is {save_path}")
vid_writer = cv2.VideoWriter(
save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
)
if args.save_size is not None:
vid_writer = cv2.VideoWriter(
save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (args.save_size[1], args.save_size[0])
)
else:
vid_writer = cv2.VideoWriter(
save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
)
tracker = BoTSORT(args, frame_rate=args.fps)
timer = Timer()
frame_id = 0
results = []

st_det = 1e-5
st_track = 1e-5
st_save = 1e-5
while True:
if frame_id % 20 == 0:
logger.info('Processing frame {} ({:.2f} fps)'.format(frame_id, 1. / max(1e-5, timer.average_time)))
logger.info('Processing frame {} ({:.2f} fps, det: {:.2f} fps, track: {:.2f} fps, save: {:.2f} fps)'
.format(frame_id, 1. / max(1e-5, timer.average_time), 20. / st_det, 20. / st_track,
20. / st_save))
st_det = 0
st_track = 0
st_save = 0
ret_val, frame = cap.read()
if ret_val:
# Detect objects
t0 = time.time()
outputs, img_info = predictor.inference(frame, timer)
scale = min(exp.test_size[0] / float(img_info['height'], ), exp.test_size[1] / float(img_info['width']))
t_det = time.time()
st_det += t_det - t0

if outputs[0] is not None:
outputs = outputs[0].cpu().numpy()
Expand Down Expand Up @@ -273,11 +311,22 @@ def imageflow_demo(predictor, vis_folder, current_time, args):
else:
timer.toc()
online_im = img_info['raw_img']
t_track = time.time()
st_track += t_track - t_det

if args.save_result:
if args.save_size is not None:
online_im = cv2.resize(
online_im,
(args.save_size[1], args.save_size[0]),
interpolation=cv2.INTER_LINEAR,
)
vid_writer.write(online_im)
ch = cv2.waitKey(1)
if ch == 27 or ch == ord("q") or ch == ord("Q"):
break
t_save = time.time()
st_save += t_save - t_track
# ch = cv2.waitKey(1)
# if ch == 27 or ch == ord("q") or ch == ord("Q"):
# break
else:
break
frame_id += 1
Expand Down Expand Up @@ -348,7 +397,10 @@ def main(exp, args):
trt_file = None
decoder = None

predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16)
if args.save_size is not None:
args.save_size = tuple(map(int, args.save_size.split(',')))

predictor = Predictor(model, exp, trt_file, decoder, args.device, args.fp16, args.legacy)
current_time = time.localtime()
if args.demo == "image" or args.demo == "images":
image_demo(predictor, vis_folder, current_time, args)
Expand Down
7 changes: 6 additions & 1 deletion tracker/bot_sort.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from fast_reid.fast_reid_interfece import FastReIDInterface

import time


class STrack(BaseTrack):
shared_kalman = KalmanFilter()
Expand Down Expand Up @@ -225,7 +227,7 @@ def __init__(self, args, frame_rate=30):
if args.with_reid:
self.encoder = FastReIDInterface(args.fast_reid_config, args.fast_reid_weights, args.device)

self.gmc = GMC(method=args.cmc_method, verbose=[args.name, args.ablation])
self.gmc = GMC(method=args.cmc_method, downscale=args.downscale, verbose=[args.name, args.ablation])

def update(self, output_results, img):
self.frame_id += 1
Expand Down Expand Up @@ -294,10 +296,13 @@ def update(self, output_results, img):
# Predict the current location with KF
STrack.multi_predict(strack_pool)

t0 = time.time()
# Fix camera motion
warp = self.gmc.apply(img, dets)
STrack.multi_gmc(strack_pool, warp)
STrack.multi_gmc(unconfirmed, warp)
t1 = time.time()
# print("time: %f" % (t1 - t0))

# Associate with high score detection boxes
ious_dists = matching.iou_distance(strack_pool, detections)
Expand Down
15 changes: 8 additions & 7 deletions yolox/data/data_augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def _mirror(image, boxes):
return image, boxes


def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
def preproc(image, input_size, mean, std, swap=(2, 0, 1), legacy=False):
if len(image.shape) == 3:
padded_img = np.ones((input_size[0], input_size[1], 3)) * 114.0
else:
Expand All @@ -200,12 +200,13 @@ def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
).astype(np.float32)
padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img

padded_img = padded_img[:, :, ::-1]
padded_img /= 255.0
if mean is not None:
padded_img -= mean
if std is not None:
padded_img /= std
if legacy:
padded_img = padded_img[:, :, ::-1]
padded_img /= 255.0
if mean is not None:
padded_img -= mean
if std is not None:
padded_img /= std
padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
Expand Down