From 8ac9fffd9da2c2bc0f9ac2423d0188d16ff7ed21 Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 2 Jan 2025 13:16:38 +0000 Subject: [PATCH] Add support for PETR Model(Vovnet based varaints) --- .gitattributes | 12 + env/core_requirements.txt | 2 + .../petr/data/nuscenes/nuscenes_infos_val.pkl | Bin 0 -> 8003 bytes ...16-36-0400__CAM_BACK__1533151603537558.jpg | 3 + ...-0400__CAM_BACK_LEFT__1533151603547405.jpg | 3 + ...0400__CAM_BACK_RIGHT__1533151603528113.jpg | 3 + ...6-36-0400__CAM_FRONT__1533151603512404.jpg | 3 + ...0400__CAM_FRONT_LEFT__1533151603504799.jpg | 3 + ...400__CAM_FRONT_RIGHT__1533151603520482.jpg | 3 + ...-0400__LIDAR_TOP__1533151603547590.pcd.bin | 3 + ...400__RADAR_BACK_LEFT__1533151603522238.pcd | 3 + ...00__RADAR_BACK_RIGHT__1533151603576423.pcd | 3 + ...36-0400__RADAR_FRONT__1533151603555991.pcd | 3 + ...00__RADAR_FRONT_LEFT__1533151603526348.pcd | 3 + ...0__RADAR_FRONT_RIGHT__1533151603512881.pcd | 3 + .../pytorch/vision/petr/mmdet/__init__.py | 0 .../vision/petr/mmdet/core/anchor/__init__.py | 33 + .../mmdet/core/anchor/anchor_generator.py | 660 ++++++++++++++++++ .../vision/petr/mmdet/core/anchor/builder.py | 21 + .../petr/mmdet/core/anchor/point_generator.py | 172 +++++ .../vision/petr/mmdet/core/anchor/utils.py | 75 ++ .../vision/petr/mmdet/core/bbox/__init__.py | 8 + .../mmdet/core/bbox/assigners/__init__.py | 7 + .../core/bbox/assigners/base_assigner.py | 10 + .../vision/petr/mmdet/core/bbox/builder.py | 15 + .../petr/mmdet/core/bbox/coder/__init__.py | 7 + .../mmdet/core/bbox/coder/base_bbox_coder.py | 13 + .../bbox/coder/distance_point_bbox_coder.py | 66 ++ .../mmdet/core/bbox/match_costs/__init__.py | 24 + .../mmdet/core/bbox/match_costs/builder.py | 13 + .../mmdet/core/bbox/match_costs/match_cost.py | 345 +++++++++ .../vision/petr/mmdet/core/utils/__init__.py | 6 + .../petr/mmdet/core/utils/dist_utils.py | 47 ++ .../vision/petr/mmdet/datasets/__init__.py | 6 + .../vision/petr/mmdet/datasets/builder.py | 214 ++++++ .../petr/mmdet/datasets/pipelines/__init__.py | 8 + .../petr/mmdet/datasets/pipelines/compose.py | 59 ++ .../mmdet/datasets/pipelines/formatting.py | 133 ++++ .../petr/mmdet/datasets/pipelines/loading.py | 289 ++++++++ .../petr/mmdet/datasets/samplers/__init__.py | 18 + .../datasets/samplers/class_aware_sampler.py | 162 +++++ .../datasets/samplers/distributed_sampler.py | 49 ++ .../mmdet/datasets/samplers/group_sampler.py | 136 ++++ .../datasets/samplers/infinite_sampler.py | 167 +++++ .../vision/petr/mmdet/models/__init__.py | 20 + .../vision/petr/mmdet/models/builder.py | 49 ++ .../models/dense_heads/anchor_free_head.py | 274 ++++++++ .../models/dense_heads/base_dense_head.py | 520 ++++++++++++++ .../models/dense_heads/dense_test_mixins.py | 149 ++++ .../petr/mmdet/models/detectors/base.py | 145 ++++ .../petr/mmdet/models/losses/__init__.py | 0 .../petr/mmdet/models/losses/focal_loss.py | 80 +++ .../petr/mmdet/models/losses/iou_loss.py | 448 ++++++++++++ .../mmdet/models/losses/smooth_l1_loss.py | 66 ++ .../vision/petr/mmdet/models/losses/utils.py | 104 +++ .../petr/mmdet/models/utils/__init__.py | 8 + .../vision/petr/mmdet/models/utils/builder.py | 18 + .../petr/mmdet/models/utils/res_layer.py | 179 +++++ .../petr/mmdet/models/utils/transformer.py | 25 + .../vision/petr/mmdet/utils/__init__.py | 7 + .../petr/mmdet/utils/util_distribution.py | 13 + .../pytorch/vision/petr/mmdet3d/__init__.py | 0 .../mmdet3d/configs/_base_/datasets/nus-3d.py | 42 ++ .../mmdet3d/configs/_base_/default_runtime.py | 17 + .../vision/petr/mmdet3d/core/__init__.py | 0 
.../vision/petr/mmdet3d/core/bbox/__init__.py | 7 + .../petr/mmdet3d/core/bbox/coders/__init__.py | 6 + .../mmdet3d/core/bbox/structures/__init__.py | 36 + .../core/bbox/structures/base_box3d.py | 338 +++++++++ .../core/bbox/structures/box_3d_mode.py | 165 +++++ .../mmdet3d/core/bbox/structures/cam_box3d.py | 256 +++++++ .../core/bbox/structures/coord_3d_mode.py | 270 +++++++ .../core/bbox/structures/depth_box3d.py | 187 +++++ .../core/bbox/structures/lidar_box3d.py | 179 +++++ .../mmdet3d/core/bbox/structures/utils.py | 229 ++++++ .../petr/mmdet3d/core/bbox/transforms.py | 29 + .../petr/mmdet3d/core/points/__init__.py | 6 + .../petr/mmdet3d/core/points/base_points.py | 335 +++++++++ .../vision/petr/mmdet3d/datasets/__init__.py | 10 + .../vision/petr/mmdet3d/datasets/builder.py | 26 + .../vision/petr/mmdet3d/datasets/custom_3d.py | 222 ++++++ .../petr/mmdet3d/datasets/nuscenes_dataset.py | 175 +++++ .../mmdet3d/datasets/pipelines/__init__.py | 8 + .../mmdet3d/datasets/pipelines/formating.py | 285 ++++++++ .../mmdet3d/datasets/pipelines/loading.py | 72 ++ .../datasets/pipelines/test_time_aug.py | 114 +++ .../vision/petr/mmdet3d/models/__init__.py | 7 + .../vision/petr/mmdet3d/models/builder.py | 65 ++ .../petr/mmdet3d/models/detectors/__init__.py | 6 + .../petr/mmdet3d/models/detectors/base.py | 54 ++ .../mmdet3d/models/detectors/mvx_two_stage.py | 418 +++++++++++ .../models/pytorch/vision/petr/test_petr.py | 173 +++++ .../pytorch/vision/petr/utils/__init__.py | 0 .../pytorch/vision/petr/utils/cp_fpn.py | 210 ++++++ .../pytorch/vision/petr/utils/grid_mask.py | 62 ++ .../pytorch/vision/petr/utils/match_cost.py | 31 + .../vision/petr/utils/model_registry.py | 39 ++ .../vision/petr/utils/nms_free_coder.py | 41 ++ .../vision/petr/utils/nuscenes_dataset.py | 89 +++ .../pytorch/vision/petr/utils/petr3d.py | 124 ++++ .../pytorch/vision/petr/utils/petr_head.py | 527 ++++++++++++++ .../vision/petr/utils/petr_transformer.py | 447 ++++++++++++ .../utils/petr_vovnet_gridmask_p4_1600x640.py | 242 +++++++ .../utils/petr_vovnet_gridmask_p4_800x320.py | 239 +++++++ .../vision/petr/utils/positional_encoding.py | 154 ++++ .../pytorch/vision/petr/utils/transform_3d.py | 213 ++++++ .../models/pytorch/vision/petr/utils/utils.py | 201 ++++++ .../pytorch/vision/petr/utils/vovnetcp.py | 394 +++++++++++ 108 files changed, 11668 insertions(+) create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151603537558.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151603547405.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151603528113.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151603504799.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151603520482.jpg create mode 100644 
forge/test/models/pytorch/vision/petr/data/nuscenes/samples/LIDAR_TOP/n008-2018-08-01-15-16-36-0400__LIDAR_TOP__1533151603547590.pcd.bin create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_LEFT__1533151603522238.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_RIGHT__1533151603576423.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT__1533151603555991.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_LEFT__1533151603526348.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_RIGHT__1533151603512881.pcd create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/anchor_generator.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py create 
mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py create mode 100644 
forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py create mode 100644 forge/test/models/pytorch/vision/petr/test_petr.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/cp_fpn.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/grid_mask.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/match_cost.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/model_registry.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr3d.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_head.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_transformer.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/positional_encoding.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/transform_3d.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/vovnetcp.py diff --git a/.gitattributes b/.gitattributes index e69de29bb..7bc04f0f1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -0,0 +1,12 @@ +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151603537558.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151603547405.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151603528113.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151603504799.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151603520482.jpg filter=lfs diff=lfs merge=lfs -text 
+forge/test/models/pytorch/vision/petr/data/nuscenes/samples/LIDAR_TOP/n008-2018-08-01-15-16-36-0400__LIDAR_TOP__1533151603547590.pcd.bin filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_LEFT__1533151603522238.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_RIGHT__1533151603576423.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT__1533151603555991.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_LEFT__1533151603526348.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_RIGHT__1533151603512881.pcd filter=lfs diff=lfs merge=lfs -text diff --git a/env/core_requirements.txt b/env/core_requirements.txt index 3144d29f4..1fc40915b 100644 --- a/env/core_requirements.txt +++ b/env/core_requirements.txt @@ -51,3 +51,5 @@ pytorch_forecasting==1.0.0 patool openpyxl==3.1.5 GitPython==3.1.44 +mmcv-full==1.7.2 +nuscenes-devkit==1.1.11 diff --git a/forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl b/forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl new file mode 100644 index 0000000000000000000000000000000000000000..033599a0a3fba05f14a812faef64de07470fc8c8 GIT binary patch literal 8003 zcmeI1cT`i!_rQa66%?=_7VL|wKq?U1JcPl0wBCw!{3j8LI1k`0$&e=V`@Ar@IobwnabLY<7na{m5?@gTE z1}AG0krpd$6#8O`FjS@tRH_R-g2e)^lp_yafJ>B$p+W&ljtm^92$*ysh0Y~2nM?}P z$w|QFIq_(?QqjGwK^RvqQYwxJL#08uy&RXx?L$HZc(BYqELt8a<%{ei#4>TH#6Ap{ zOYOTRCE&`r_L6WJAD7@Vdl@$*EEwg}Jl&jqIey+g_7XDL$(Bl{IN73{OtGafY$*=5 zGzVKUolNF%+66fr3WG+YFenaW8iVf0V3O^^_yRkg7#XOqHI#-sIWovXBqwmBGnqJ( z$DmMYI1}eNl7&>9s#F-rB5^!SJx>ZfJ~u>0Oqa2%^K_28ueX<9TTwk`PTO3p_$OAj ztNogl?Sh|KNukoobi4UsL1?V<=rCLkb%<}T&@G(flH@4@VQJn z1E-Q5ggmN{&ZJV5iheR&A`6vL@t{zST*{Tmg1K_EkZ2lgSm49jnRmu7&0zsq^kRn% zSH`eF=+B5V^0XK&I zmW?a=q3)A}Pzhc7G0#+N#BogZNlXjo=3TsGS<#~ta zvY<4)%CKlkGYiHsud2X~(=2e~WLU;;XRhE1W4Sm4p^HdUt~T^5&ORiN>MQzc3~A%M zU8$n4Bs?T6+KwM8#qC1EgXLnbRLYH3DtdCHI6qV(lS{+-a%C!Fj1W{R3?<03)FP=$ zHYq{rs>4$zDt(nUY#p{PTX&^WG+d!CAXW=C>LzqjCar3#*H!3DRC;-NO>ISq_`@d2 zmAqB19+zqkZ`NzA0cGOcD1k{5P|_@QW5SlhrjUDTKeV}6Xi8eLxlYU*1m3-=!cN~BI<`I(`guC-{GXry|`qS5UGMg_YhS4$FU z|LC{JqajJ);Qj{x9mHKM4H(J10||vD$W1t6iCij{$i#eQCt3)zksgn<#U&Y7YAU8( zN%Dn+@Ds&J?PYEKt={Rj`JBb=ziZ{&@>;@>a7C{UukiI`c@VxS>aAG+uXrSfP}z0A zB*Z`SPAZx1L?ygaWG-?RSty1waDfwD$fF2&IF*hI`8+Zi1s(?nE|n^9U4le3X2wXFi=!R6dCZIgGRYsvk~F zL=Izna(?)_lgMGt7;ih2)_{u!uQsf#C)`9dZkIMk5_oj*YWhDSJeP)2){*}N{G`^E|?ErpfeK)#gX9s}MffXF@ zbsYfuXzH{3jqz~^-e!td@!M%zC6EuKjpkq(|7E=*PO~u=t~IC zv+0KD+m~P|o$?aVH|5)qg5Vd3zATev46c-*Z(DX3=(`oyu>Sq%N}zwzWyvDH7hrF! z&)R@fJHR)(ZXZhm*20t;XYPdLpMkf&b>7g18hAVKu)%^KUcg1)E*9AhssxudU2ug~ zd%jLz+;)>li*iV+Z9aU$?-!u*zq+bY6Ab^JzN|50C(8D9(AQA9B&+(j4*In9Ni(Gt zd+feOpOKnA7iU+t`b>i2-m-tnomy8{h*Qgb#v09$!En;D7OrNE!X!HhaYq54%;eJ@ z={S$?fODxd9-U9&Gw5WBmbF8kjt}1(jbee7mLshR5($-2W46uUj*gUfx*7 zf(zYTNa7}>P4yV)yg1?V+%T5+%i(6~piFq!wLIdMK z{e>Nu&N(jwJ}1^FY=4S}O}<;}_nPhoC)N~YikfO*U*D31Z_MK1mRCpVGE|oDY;g0? 
z-`y6m{w~sy4ow@rXgMV1nCj?R)I#id^@dR^in|c$Uj9<{Wa!gX?Ga3Hce|Za(dzcH zw%(-p=o*`%uWhv^TGlw91UkjP0UV*Q%hdfT%pthQ3@v)eYV>eAl}2-K zFrAnVXxAkRDYz4z=IE$JueKmpH2+!<3zWVH-h1x;5ftWH=91&9UTM**S~vdY#Q|tT ztg?z6Pn~MOf;q>UQ%Xk%ut09^8uF~$ZXM|TaW^8n7=^jq#^uE$R(FD)kM7Cg`YafB z>b94g|Bnd0#J%+r7coNbQdHc>9zg_pCV^c*uZQQZbo=09C|tbrk0aAw1D&^ph5`F` z1JBwY=U<(&9=3d!;=b8*7nq#V_ogR8O*iO}Y=`e_C^%Qk7_-cj_50@?6WMUhnBwl%qg)hGq18>{{Ck_HB3rC81~XL zo9r>31*&{M-j6ubr=31{VdJ#(Ug}MjeWPghkWT0;OJ7^_9SVGp#`W?szK=H9ZaoJ~ z?88U&<%HB6Uq~V7J7(4e`d-BsN}fsY!GInYmh4EXhJ#-o9#}APA3R1EF;;LCz{=mv zef^-lU|QP#;?`UgRf2TR zPA@`x=E$d_4a3;cntP#zBn{f`VsWbfAx+%q!1jrnh^Iu^T+i$<5&pnwC@oy;-KVSs9POov7!dDMlYbxwH3&!;O34 zm1!j-hB~;Zjz>w)1P`KMkhUr8Os{%4wP&}niTx0BtcsXEN9sV^TO$cOyUf%=zef|u zxn!p@%O8Oa^%_~Kb9)jte`KPSw?~df|1+?tkGf?o_-T_mwSrZD^rYNvy$DJ@PWZiw zw8Cyd#+}`ZreK+#Vfqtfj{sxtSxlGQ9tcAvA1e_Lid4%o6zY!-2Zn)2lS=Q3lZDqOL!JxRgKm8Qc|++}dAP*JBiR z^sV1A{ecs(#;^?-aO{tGTH|mMQc7gV)+*y0}ObBex5 zI1)8^)IKk5oV9=f0oF(JtE-`#ygbL=d>FPSfAE{L6MeCuA9gJ`{DUq=VD{U`w(-6< z!`|A=Nk0GP9k6`xWYW{RLeP*H>wf{aQ0bMYm^AfZV>?aW4p9zj0BEt%idTYkiTQHX zwI|>*+cC*`-Ul!%cG%?NQ^u+ob6@L==SQgG5*pI4ogS~c=bHW^qm-lyClmF8^;nwr+HubsC;DGP_*r4Z-aMvqUz=&|*M__+} zy2BPQWZapoycx!-lYJ^#eTT7BKhLFGt?O-%qePPkJ7I_-u=29Z1+tL`!qA1@`N7^$6Yr zQsivr`IyyU&e_46UtT6-t~HGG+Zj_aD}Q=~+wymSaHFwLL7U&d07Un5nn7Cl0W(UP zp(=n%;tL|isK$+47d2qQMC_wpWNCFxC2WfiPeI%KJO{qS1iu4SInDjTR+_8wcWym9 zbJ{Rflwsk(gzYYv;wDpdVBAAMgrm7t8cv=98j)JTJW>3P!D%;$6A}?MC0z8qu(PO% zzDJBz=+2~&J3iux?9nGYwhdd4ZG^tICAp;Ql1O1F0^#TpXmT0K<>-;3M~ogWdL-!i zZ*~Dx&yN2e{~$hJ_dv&@2s}8HFP2Al=Hu*cZ6~8J9TF+> z<)Q8|Rt3;`*GJQv8)~8FVZ%d1X54|Xv+6BzlSWAM%W}30TL`Pyu393jcmur)XM2#g z)WO-w0Swyw^&oIWf?;C$EtnUGZz;Lj4B!5uKdINbcVMR7*71}d`(om!(g~4QE<;7! z3Ra(!Z$RS}{uN>TM!3R%MM6wb1DteXYTBN(i=gtt>3J!cjUd@;)%?~eXP~`Z;h2dl zhG9`oY`^-%3b-`J^v$sekKvb2>b`ULO`W`Y_}kb46#|b;kg-T-KePW`(0!E5`QDJb zFmsCadC#>Up!@Y5{reSJt4>`mP2a!!9Q?s2#^U*)dN3&2c}DB5%OG!q5qo>j_pr%j zNAToB5wIoZS1Y#(Z^2OUUcdCEjWBB9)g>l{SD}gUQlIRG1K`Gscx+zOXjN~W>L@+y zn_xFXl8`s<4VYFp($Ur=4YW4IZyzL{3uhEl4we+%hCT{f{KB|8;T1f` z_S)aIxxp9uwDLB8p+{TacF&@o2Ps#5ZyWZP^Y9#5sZmDK_Npy^nQKS+<_2}>> from mmdet.core import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_priors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__( + self, + strides, + ratios, + scales=None, + base_sizes=None, + scale_major=True, + octave_base_scale=None, + scales_per_octave=None, + centers=None, + center_offset=0.0, + ): + # check center and center_offset + if center_offset != 0: + assert centers is None, "center cannot be set when center_offset" f"!=0, {centers} is given." 
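+        # Worked example (mirrors the class-level doctest above): AnchorGenerator([16], [1.], [1.], [9])
+        # leaves centers=None and center_offset=0.0, so each base anchor is centered on its grid-cell
+        # origin; with base_size=9, scale=1.0 and ratio=1.0 the base anchor is [-4.5, -4.5, 4.5, 4.5],
+        # which grid_priors() later shifts by multiples of the 16-pixel stride.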
+ if not (0 <= center_offset <= 1): + raise ValueError("center_offset should be in range [0, 1], " f"{center_offset} is given.") + if centers is not None: + assert len(centers) == len(strides), ( + "The number of strides should be the same as centers, got " f"{strides} and {centers}" + ) + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), ( + "The number of strides should be the same as base sizes, got " f"{self.strides} and {self.base_sizes}" + ) + + # calculate scales of anchors + assert (octave_base_scale is not None and scales_per_octave is not None) ^ (scales is not None), ( + "scales and octave_base_scale with scales_per_octave cannot" " be set at the same time" + ) + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array([2 ** (i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError("Either scales or octave_base_scale with " "scales_per_octave should be set") + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + """list[int]: total number of base anchors in a feature grid""" + return self.num_base_priors + + @property + def num_base_priors(self): + """list[int]: The number of priors (anchors) at a point + on the feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_size, scales=self.scales, ratios=self.ratios, center=center) + ) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_size, scales, ratios, center=None): + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. 
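+
+        Example (an illustrative sketch, assuming ``self`` was built as in the
+        class-level doctest, ``AnchorGenerator([16], [1.], [1.], [9])``):
+            >>> self.gen_single_level_base_anchors(9, torch.Tensor([1.]), torch.Tensor([1.]))
+            tensor([[-4.5000, -4.5000,  4.5000,  4.5000]])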
+ """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, y_center + 0.5 * hs] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool, optional): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + # use shape instead of len to keep tracing while exporting to onnx + xx = x.repeat(y.shape[0]) + yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, featmap_sizes, dtype=torch.float32, device="cuda"): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + dtype (:obj:`torch.dtype`): Dtype of priors. + Default: torch.float32. + device (str): The device where the anchors will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_priors(featmap_sizes[i], level_idx=i, dtype=dtype, device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_priors(self, featmap_size, level_idx, dtype=torch.float32, device="cuda"): + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + base_anchors = self.base_anchors[level_idx].to(device).to(dtype) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. 
+ shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w + shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_sizes, pad_shape, device="cuda"): + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str): Device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w), self.num_base_anchors[i], device=device + ) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, featmap_size, valid_size, num_base_anchors, device="cuda"): + """Generate the valid flags of anchor in a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. 
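+
+        Example (an illustrative sketch: a 2x2 feature map with one base anchor
+        per location, where only the first row is inside the padded image):
+            >>> self.single_level_valid_flags((2, 2), (1, 2), 1, device='cpu')
+            tensor([ True,  True, False, False])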
+ """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self): + """str: a string that describes the module""" + indent_str = " " + repr_str = self.__class__.__name__ + "(\n" + repr_str += f"{indent_str}strides={self.strides},\n" + repr_str += f"{indent_str}ratios={self.ratios},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}base_sizes={self.base_sizes},\n" + repr_str += f"{indent_str}scale_major={self.scale_major},\n" + repr_str += f"{indent_str}octave_base_scale=" + repr_str += f"{self.octave_base_scale},\n" + repr_str += f"{indent_str}scales_per_octave=" + repr_str += f"{self.scales_per_octave},\n" + repr_str += f"{indent_str}num_levels={self.num_levels}\n" + repr_str += f"{indent_str}centers={self.centers},\n" + repr_str += f"{indent_str}center_offset={self.center_offset})" + return repr_str + + +@PRIOR_GENERATORS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + min_sizes (list[float]): The list of minimum anchor sizes on each + level. + max_sizes (list[float]): The list of maximum anchor sizes on each + level. + basesize_ratio_range (tuple(float)): Ratio range of anchors. Being + used when not setting min_sizes and max_sizes. + input_size (int): Size of feature map, 300 for SSD300, 512 for + SSD512. Being used when not setting min_sizes and max_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. 
+ """ + + def __init__( + self, + strides, + ratios, + min_sizes=None, + max_sizes=None, + basesize_ratio_range=(0.15, 0.9), + input_size=300, + scale_major=True, + ): + assert len(strides) == len(ratios) + assert not (min_sizes is None) ^ (max_sizes is None) + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2.0, stride[1] / 2.0) for stride in self.strides] + + if min_sizes is None and max_sizes is None: + # use hard code to generate SSD anchors + self.input_size = input_size + assert mmcv.is_tuple_of(basesize_ratio_range, float) + self.basesize_ratio_range = basesize_ratio_range + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + "basesize_ratio_range[0] should be either 0.15" + "or 0.2 when input_size is 300, got " + f"{basesize_ratio_range[0]}." + ) + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError( + "When not setting min_sizes and max_sizes," + "basesize_ratio_range[0] should be either 0.1" + "or 0.15 when input_size is 512, got" + f" {basesize_ratio_range[0]}." + ) + else: + raise ValueError( + "Only support 300 or 512 in SSDAnchorGenerator when " + "not setting min_sizes and max_sizes, " + f"got {self.input_size}." + ) + + assert len(min_sizes) == len(max_sizes) == len(strides) + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1.0, np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.0] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, scales=self.scales[i], ratios=self.ratios[i], center=self.centers[i] + ) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self): + """str: a string that describes the module""" + indent_str = " " + repr_str = self.__class__.__name__ + "(\n" + repr_str += f"{indent_str}strides={self.strides},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}scale_major={self.scale_major},\n" + repr_str += f"{indent_str}input_size={self.input_size},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}ratios={self.ratios},\n" + repr_str += f"{indent_str}num_levels={self.num_levels},\n" + repr_str += f"{indent_str}base_sizes={self.base_sizes},\n" + repr_str += f"{indent_str}basesize_ratio_range=" + repr_str += f"{self.basesize_ratio_range})" + return repr_str + + +@PRIOR_GENERATORS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. + + Examples: + >>> from mmdet.core import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, base_size, scales, ratios, center=None): + """Generate base anchors of a single level. 
+ + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), + y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), + y_center + 0.5 * (hs - 1), + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@PRIOR_GENERATORS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, strides, ratios, basesize_ratio_range, input_size=300, scale_major=True): + super(LegacySSDAnchorGenerator, self).__init__( + strides=strides, + ratios=ratios, + basesize_ratio_range=basesize_ratio_range, + input_size=input_size, + scale_major=scale_major, + ) + self.centers = [((stride - 1) / 2.0, (stride - 1) / 2.0) for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@PRIOR_GENERATORS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, strides, base_sizes): + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2.0, stride[1] / 2.0) for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append([_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
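+
+        Example (an illustrative sketch; the strides and YOLOv3-style base sizes
+        below are made-up values, not taken from this patch):
+            >>> yolo_gen = YOLOAnchorGenerator(
+            ...     strides=[32, 16],
+            ...     base_sizes=[[(116, 90), (156, 198)], [(30, 61), (62, 45)]])
+            >>> [anchors.shape for anchors in yolo_gen.gen_base_anchors()]
+            [torch.Size([2, 4]), torch.Size([2, 4])]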
+ """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append(self.gen_single_level_base_anchors(base_sizes_per_level, center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_sizes_per_level, center=None): + """Generate base anchors of a single level. + + Args: + base_sizes_per_level (list[tuple[int, int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, y_center + 0.5 * h]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py new file mode 100644 index 000000000..ba002aca8 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.utils import Registry, build_from_cfg + +PRIOR_GENERATORS = Registry("Generator for anchors and points") + +ANCHOR_GENERATORS = PRIOR_GENERATORS + + +def build_prior_generator(cfg, default_args=None): + return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) + + +def build_anchor_generator(cfg, default_args=None): + warnings.warn("``build_anchor_generator`` would be deprecated soon, please use " "``build_prior_generator`` ") + return build_prior_generator(cfg, default_args=default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py new file mode 100644 index 000000000..572665975 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py @@ -0,0 +1,172 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import PRIOR_GENERATORS + + +@PRIOR_GENERATORS.register_module() +class PointGenerator: + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def valid_flags(self, featmap_size, valid_size, device="cuda"): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + +@PRIOR_GENERATORS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. 
+ + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, strides, offset=0.5): + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self): + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, x, y, row_major=True): + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def single_level_grid_priors(self, featmap_size, level_idx, dtype=torch.float32, device="cuda", with_stride=False): + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0],), stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0],), stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_sizes, pad_shape, device="cuda"): + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str): The device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
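+
+        Example (an illustrative sketch: a single 4x4 level with stride 8 and a
+        17x32 padded image, so one row of points is marked invalid):
+            >>> gen = MlvlPointGenerator(strides=[8])
+            >>> flags = gen.valid_flags([(4, 4)], (17, 32), device='cpu')
+            >>> int(flags[0].sum()), flags[0].numel()
+            (12, 16)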
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), (valid_feat_h, valid_feat_w), device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, featmap_size, valid_size, device="cuda"): + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str, optional): The device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py new file mode 100644 index 000000000..00988b218 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets + + +def anchor_inside_flags(flat_anchors, valid_flags, img_shape, allowed_border=0): + """Check whether the anchors are inside the border. + + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. + allowed_border (int, optional): The border to allow the valid anchor. + Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. + """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + inside_flags = ( + valid_flags + & (flat_anchors[:, 0] >= -allowed_border) + & (flat_anchors[:, 1] >= -allowed_border) + & (flat_anchors[:, 2] < img_w + allowed_border) + & (flat_anchors[:, 3] < img_h + allowed_border) + ) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox, ratio, featmap_size=None): + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple): Feature map size used for clipping the boundary. 
+ + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py new file mode 100644 index 000000000..ead3090af --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .builder import build_bbox_coder +from .coder import BaseBBoxCoder diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py new file mode 100644 index 000000000..f23da515f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .base_assigner import BaseAssigner diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py new file mode 100644 index 000000000..b013d933b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py new file mode 100644 index 000000000..ae668c05f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry, build_from_cfg + +BBOX_ASSIGNERS = Registry("bbox_assigner") +BBOX_SAMPLERS = Registry("bbox_sampler") +BBOX_CODERS = Registry("bbox_coder") + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + return build_from_cfg(cfg, BBOX_CODERS, default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py new file mode 100644 index 000000000..c56326d2e --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
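+# This vendored subpackage re-exports only the base coder and the distance-point coder.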
+from .base_bbox_coder import BaseBBoxCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py new file mode 100644 index 000000000..d56fe1e50 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder.""" + + def __init__(self, **kwargs): + pass diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py new file mode 100644 index 000000000..b88be8af0 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import BBOX_CODERS +from .base_bbox_coder import BaseBBoxCoder + + +@BBOX_CODERS.register_module() +class DistancePointBBoxCoder(BaseBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + + Args: + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, clip_border=True): + super(BaseBBoxCoder, self).__init__() + self.clip_border = clip_border + + def encode(self, points, gt_bboxes, max_dis=None, eps=0.1): + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 4), The format is "xyxy" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4). + """ + assert points.size(0) == gt_bboxes.size(0) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) + + def decode(self, points, pred_bboxes, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. 
+ Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(0) == pred_bboxes.size(0) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + return distance2bbox(points, pred_bboxes, max_shape) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py new file mode 100644 index 000000000..e3eae0288 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import build_match_cost +from .match_cost import ( + BBoxL1Cost, + ClassificationCost, + CrossEntropyLossCost, + DiceCost, + FocalLossCost, + IoUCost, +) + +__all__ = [ + "build_match_cost", + "ClassificationCost", + "BBoxL1Cost", + "IoUCost", + "FocalLossCost", + "DiceCost", + "CrossEntropyLossCost", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py new file mode 100644 index 000000000..341cee374 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry, build_from_cfg + +MATCH_COST = Registry("Match Cost") + + +def build_match_cost(cfg, default_args=None): + """Builder of IoU calculator.""" + return build_from_cfg(cfg, MATCH_COST, default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py new file mode 100644 index 000000000..642f7530e --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py @@ -0,0 +1,345 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F + +from .builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBoxL1Cost: + """BBoxL1Cost. + + Args: + weight (int | float, optional): loss_weight + box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost + >>> import torch + >>> self = BBoxL1Cost() + >>> bbox_pred = torch.rand(1, 4) + >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, weight=1.0, box_format="xyxy"): + self.weight = weight + assert box_format in ["xyxy", "xywh"] + self.box_format = box_format + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + (num_query, 4). + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). 
+ + Returns: + torch.Tensor: bbox_cost value with weight + """ + if self.box_format == "xywh": + gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) + elif self.box_format == "xyxy": + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + + +@MATCH_COST.register_module() +class FocalLossCost: + """FocalLossCost. + + Args: + weight (int | float, optional): loss_weight + alpha (int | float, optional): focal_loss alpha + gamma (int | float, optional): focal_loss gamma + eps (float, optional): default 1e-12 + binary_input (bool, optional): Whether the input is binary, + default False. + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost + >>> import torch + >>> self = FocalLossCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3236, -0.3364, -0.2699], + [-0.3439, -0.3209, -0.4807], + [-0.4099, -0.3795, -0.2929], + [-0.1950, -0.1207, -0.2626]]) + """ + + def __init__(self, weight=1.0, alpha=0.25, gamma=2, eps=1e-12, binary_input=False): + self.weight = weight + self.alpha = alpha + self.gamma = gamma + self.eps = eps + self.binary_input = binary_input + + def _focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_query, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * (1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * (1 - cls_pred).pow(self.gamma) + + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits + in shape (num_query, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * (1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * (1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum("nc,mc->nm", pos_cost, gt_labels) + torch.einsum("nc,mc->nm", neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits. + gt_labels (Tensor)): Labels. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + if self.binary_input: + return self._mask_focal_loss_cost(cls_pred, gt_labels) + else: + return self._focal_loss_cost(cls_pred, gt_labels) + + +@MATCH_COST.register_module() +class ClassificationCost: + """ClsSoftmaxCost. + + Args: + weight (int | float, optional): loss_weight + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import \ + ... 
ClassificationCost + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight=1.0): + self.weight = weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_query, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + # Following the official DETR repo, contrary to the loss that + # NLL is used, we approximate it in 1 - cls_score[gt_label]. + # The 1 is a constant that doesn't change the matching, + # so it can be omitted. + cls_score = cls_pred.softmax(-1) + cls_cost = -cls_score[:, gt_labels] + return cls_cost * self.weight + + +@MATCH_COST.register_module() +class IoUCost: + """IoUCost. + + Args: + iou_mode (str, optional): iou mode such as 'iou' | 'giou' + weight (int | float, optional): loss weight + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, iou_mode="giou", weight=1.0): + self.weight = weight + self.iou_mode = iou_mode + + def __call__(self, bboxes, gt_bboxes): + """ + Args: + bboxes (Tensor): Predicted boxes with unnormalized coordinates + (x1, y1, x2, y2). Shape (num_query, 4). + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). + + Returns: + torch.Tensor: iou_cost value with weight + """ + # overlaps: [num_bboxes, num_gt] + overlaps = bbox_overlaps(bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + return iou_cost * self.weight + + +@MATCH_COST.register_module() +class DiceCost: + """Cost of mask assignments based on dice losses. + + Args: + weight (int | float, optional): loss_weight. Defaults to 1. + pred_act (bool, optional): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float, optional): default 1e-12. + naive_dice (bool, optional): If True, use the naive dice loss + in which the power of the number in the denominator is + the first power. If Flase, use the second power that + is adopted by K-Net and SOLO. + Defaults to True. + """ + + def __init__(self, weight=1.0, pred_act=False, eps=1e-3, naive_dice=True): + self.weight = weight + self.pred_act = pred_act + self.eps = eps + self.naive_dice = naive_dice + + def binary_mask_dice_loss(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_query, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_query, num_gt). 
+ """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum("nc,mc->nm", mask_preds, gt_masks) + if self.naive_dice: + denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] + else: + denominator = mask_preds.pow(2).sum(1)[:, None] + gt_masks.pow(2).sum(1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction logits in shape (num_query, *) + gt_masks (Tensor): Ground truth in shape (num_gt, *) + + Returns: + Tensor: Dice cost matrix with weight in shape (num_query, num_gt). + """ + if self.pred_act: + mask_preds = mask_preds.sigmoid() + dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) + return dice_cost * self.weight + + +@MATCH_COST.register_module() +class CrossEntropyLossCost: + """CrossEntropyLossCost. + + Args: + weight (int | float, optional): loss weight. Defaults to 1. + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to True. + Examples: + >>> from mmdet.core.bbox.match_costs import CrossEntropyLossCost + >>> import torch + >>> bce = CrossEntropyLossCost(use_sigmoid=True) + >>> cls_pred = torch.tensor([[7.6, 1.2], [-1.3, 10]]) + >>> gt_labels = torch.tensor([[1, 1], [1, 0]]) + >>> print(bce(cls_pred, gt_labels)) + """ + + def __init__(self, weight=1.0, use_sigmoid=True): + assert use_sigmoid, "use_sigmoid = False is not supported yet." + self.weight = weight + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): The prediction with shape (num_query, 1, *) or + (num_query, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits(cls_pred, torch.ones_like(cls_pred), reduction="none") + neg = F.binary_cross_entropy_with_logits(cls_pred, torch.zeros_like(cls_pred), reduction="none") + cls_cost = torch.einsum("nc,mc->nm", pos, gt_labels) + torch.einsum("nc,mc->nm", neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits. + gt_labels (Tensor): Labels. + + Returns: + Tensor: Cross entropy cost matrix with weight in + shape (num_query, num_gt). + """ + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py new file mode 100644 index 000000000..348904611 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
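For reference, a small self-contained sketch (plain PyTorch; the logits, labels, and hyper-parameter values are illustrative only) of the cost-matrix computation performed by `FocalLossCost._focal_loss_cost` above. The resulting (num_query, num_gt) matrix is the kind of input typically fed to a Hungarian-style assigner:

```python
import torch

# Toy classification logits for 4 queries over 3 classes, and 3 gt labels.
cls_pred = torch.tensor([[ 2.0, -1.0,  0.5],
                         [-0.5,  1.5,  0.0],
                         [ 0.0,  0.0,  0.0],
                         [ 1.0,  2.0, -2.0]])
gt_labels = torch.tensor([0, 1, 2])
alpha, gamma, eps = 0.25, 2.0, 1e-12

p = cls_pred.sigmoid()
neg_cost = -(1 - p + eps).log() * (1 - alpha) * p.pow(gamma)
pos_cost = -(p + eps).log() * alpha * (1 - p).pow(gamma)

# Row i, column j: cost of matching query i to ground-truth box j.
cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
print(cost.shape)  # torch.Size([4, 3])
```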
+from .dist_utils import sync_random_seed
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py
new file mode 100644
index 000000000..4b3eae461
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+
+def sync_random_seed(seed=None, device="cuda"):
+    """Make sure different ranks share the same seed.
+
+    All workers must call this function, otherwise it will deadlock.
+    This method is generally used in `DistributedSampler`,
+    because the seed should be identical across all processes
+    in the distributed group.
+
+    In distributed sampling, different ranks should sample non-overlapped
+    data in the dataset. Therefore, this function is used to make sure that
+    each rank shuffles the data indices in the same order based
+    on the same seed. Then different ranks could use different indices
+    to select non-overlapped data from the same data list.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is None:
+        seed = np.random.randint(2**31)
+    assert isinstance(seed, int)
+
+    rank, world_size = get_dist_info()
+
+    if world_size == 1:
+        return seed
+
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py
new file mode 100644
index 000000000..ee50982e4
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py
new file mode 100644
index 000000000..ec9ee2ad2
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py
@@ -0,0 +1,214 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
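The seed synchronisation in `dist_utils.py` boils down to a rank-0 broadcast. The sketch below (plain PyTorch; the function name and the single-process guard are mine, added so the snippet also runs without an initialised process group) illustrates the idea:

```python
import numpy as np
import torch
import torch.distributed as dist


def sync_seed_sketch(seed=None, device="cpu"):
    """Hypothetical helper mirroring the rank-0 seed broadcast above."""
    if seed is None:
        seed = np.random.randint(2**31)
    if not (dist.is_available() and dist.is_initialized()):
        # Nothing to synchronise outside an initialised process group.
        return seed
    rank = dist.get_rank()
    payload = torch.tensor(seed if rank == 0 else 0, dtype=torch.int32, device=device)
    dist.broadcast(payload, src=0)  # every rank now holds rank 0's value
    return int(payload.item())


print(sync_seed_sketch(42))  # 42 in a single-process run
```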
+import copy +import platform +import random +import warnings +from functools import partial + +import numpy as np +import torch +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import TORCH_VERSION, Registry, build_from_cfg, digit_version +from torch.utils.data import DataLoader + +from .samplers import ( + ClassAwareSampler, + DistributedGroupSampler, + DistributedSampler, + GroupSampler, + InfiniteBatchSampler, + InfiniteGroupBatchSampler, +) + +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry("dataset") +PIPELINES = Registry("pipeline") + + +def _concat_dataset(cfg, default_args=None): + from .dataset_wrappers import ConcatDataset + + ann_files = cfg["ann_file"] + img_prefixes = cfg.get("img_prefix", None) + seg_prefixes = cfg.get("seg_prefix", None) + proposal_files = cfg.get("proposal_file", None) + separate_eval = cfg.get("separate_eval", True) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + # pop 'separate_eval' since it is not a valid key for common datasets. + if "separate_eval" in data_cfg: + data_cfg.pop("separate_eval") + data_cfg["ann_file"] = ann_files[i] + if isinstance(img_prefixes, (list, tuple)): + data_cfg["img_prefix"] = img_prefixes[i] + if isinstance(seg_prefixes, (list, tuple)): + data_cfg["seg_prefix"] = seg_prefixes[i] + if isinstance(proposal_files, (list, tuple)): + data_cfg["proposal_file"] = proposal_files[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets, separate_eval) + + +def build_dataset(cfg, default_args=None): + from .dataset_wrappers import ( + ClassBalancedDataset, + ConcatDataset, + MultiImageMixDataset, + RepeatDataset, + ) + + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg["type"] == "ConcatDataset": + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg["datasets"]], cfg.get("separate_eval", True) + ) + elif cfg["type"] == "RepeatDataset": + dataset = RepeatDataset(build_dataset(cfg["dataset"], default_args), cfg["times"]) + elif cfg["type"] == "ClassBalancedDataset": + dataset = ClassBalancedDataset(build_dataset(cfg["dataset"], default_args), cfg["oversample_thr"]) + elif cfg["type"] == "MultiImageMixDataset": + cp_cfg = copy.deepcopy(cfg) + cp_cfg["dataset"] = build_dataset(cp_cfg["dataset"]) + cp_cfg.pop("type") + dataset = MultiImageMixDataset(**cp_cfg) + elif isinstance(cfg.get("ann_file"), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset + + +def build_dataloader( + dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + runner_type="EpochBasedRunner", + persistent_workers=False, + class_aware_sampler=None, + **kwargs +): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. 
+ workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + seed (int, Optional): Seed to be used. Default: None. + runner_type (str): Type of runner. Default: `EpochBasedRunner` + persistent_workers (bool): If True, the data loader will not shutdown + the worker processes after a dataset has been consumed once. + This allows to maintain the workers `Dataset` instances alive. + This argument is only valid when PyTorch>=1.7.0. Default: False. + class_aware_sampler (dict): Whether to use `ClassAwareSampler` + during training. Default: None. + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + + if dist: + # When model is :obj:`DistributedDataParallel`, + # `batch_size` of :obj:`dataloader` is the + # number of training samples on each GPU. + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # When model is obj:`DataParallel` + # the batch size is samples on all the GPUS + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + if runner_type == "IterBasedRunner": + # this is a batch sampler, which can yield + # a mini-batch indices each time. + # it can be used in both `DataParallel` and + # `DistributedDataParallel` + if shuffle: + batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed) + else: + batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed, shuffle=False) + batch_size = 1 + sampler = None + else: + if class_aware_sampler is not None: + # ClassAwareSampler can be used in both distributed and + # non-distributed training. 
+ num_sample_class = class_aware_sampler.get("num_sample_class", 1) + sampler = ClassAwareSampler( + dataset, samples_per_gpu, world_size, rank, seed=seed, num_sample_class=num_sample_class + ) + elif dist: + # DistributedGroupSampler will definitely shuffle the data to + # satisfy that images on each GPU are in the same group + if shuffle: + sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank, seed=seed) + else: + sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed) + else: + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_sampler = None + + init_fn = partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None + + if TORCH_VERSION != "parrots" and digit_version(TORCH_VERSION) >= digit_version("1.7.0"): + kwargs["persistent_workers"] = persistent_workers + elif persistent_workers is True: + warnings.warn("persistent_workers is invalid because your pytorch " "version is lower than 1.7.0") + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=kwargs.pop("pin_memory", False), + worker_init_fn=init_fn, + **kwargs + ) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py new file mode 100644 index 000000000..7dde517f0 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .compose import Compose +from .formatting import to_tensor +from .loading import LoadAnnotations, LoadImageFromFile diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py new file mode 100644 index 000000000..16cb9d062 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import collections + +from mmcv.utils import build_from_cfg + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose: + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError("transform must be callable or a dict") + + def __call__(self, data): + """Call function to apply transforms sequentially. 
+ + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + str_ = t.__repr__() + if "Compose(" in str_: + str_ = str_.replace("\n", "\n ") + format_string += "\n" + format_string += f" {str_}" + format_string += "\n)" + return format_string diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py new file mode 100644 index 000000000..55fcd087a --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Sequence + +import mmcv +import numpy as np +import torch + +from ..builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f"type {type(data)} cannot be converted to tensor.") + + +@PIPELINES.register_module() +class DefaultFormatBundle: + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + + Args: + img_to_float (bool): Whether to force the image to be converted to + float type. Default: True. + pad_val (dict): A dict for padding value in batch collating, + the default value is `dict(img=0, masks=0, seg=255)`. + Without this argument, the padding value of "gt_semantic_seg" + will be set to 0 by default, which should be 255. + """ + + def __init__(self, img_to_float=True, pad_val=dict(img=0, masks=0, seg=255)): + self.img_to_float = img_to_float + self.pad_val = pad_val + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if "img" in results: + img = results["img"] + if self.img_to_float is True and img.dtype == np.uint8: + # Normally, image is of uint8 type without normalization. 
+ # At this time, it needs to be forced to be converted to + # flot32, otherwise the model training and inference + # will be wrong. Only used for YOLOX currently . + img = img.astype(np.float32) + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results["img"] = DC(to_tensor(img), padding_value=self.pad_val["img"], stack=True) + for key in ["proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels"]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if "gt_masks" in results: + results["gt_masks"] = DC(results["gt_masks"], padding_value=self.pad_val["masks"], cpu_only=True) + if "gt_semantic_seg" in results: + results["gt_semantic_seg"] = DC( + to_tensor(results["gt_semantic_seg"][None, ...]), padding_value=self.pad_val["seg"], stack=True + ) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results["img"] + results.setdefault("pad_shape", img.shape) + results.setdefault("scale_factor", 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + "img_norm_cfg", + dict( + mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False + ), + ) + return results + + def __repr__(self): + return self.__class__.__name__ + f"(img_to_float={self.img_to_float})" diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py new file mode 100644 index 000000000..35e2b9789 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py @@ -0,0 +1,289 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Load an image from file. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. 
+ """ + + def __init__( + self, to_float32=False, color_type="color", channel_order="bgr", file_client_args=dict(backend="disk") + ): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results["img_prefix"] is not None: + filename = osp.join(results["img_prefix"], results["img_info"]["filename"]) + else: + filename = results["img_info"]["filename"] + + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes(img_bytes, flag=self.color_type, channel_order=self.channel_order) + if self.to_float32: + img = img.astype(np.float32) + + results["filename"] = filename + results["ori_filename"] = results["img_info"]["filename"] + results["img"] = img + results["img_shape"] = img.shape + results["ori_shape"] = img.shape + results["img_fields"] = ["img"] + return results + + def __repr__(self): + repr_str = ( + f"{self.__class__.__name__}(" + f"to_float32={self.to_float32}, " + f"color_type='{self.color_type}', " + f"channel_order='{self.channel_order}', " + f"file_client_args={self.file_client_args})" + ) + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations: + """Load multiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + denorm_bbox (bool): Whether to convert bbox from relative value to + absolute value. Only used in OpenImage Dataset. + Default: False. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__( + self, + with_bbox=True, + with_label=True, + with_mask=False, + with_seg=False, + poly2mask=True, + denorm_bbox=False, + file_client_args=dict(backend="disk"), + ): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.denorm_bbox = denorm_bbox + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + + ann_info = results["ann_info"] + results["gt_bboxes"] = ann_info["bboxes"].copy() + + if self.denorm_bbox: + bbox_num = results["gt_bboxes"].shape[0] + if bbox_num != 0: + h, w = results["img_shape"][:2] + results["gt_bboxes"][:, 0::2] *= w + results["gt_bboxes"][:, 1::2] *= h + + gt_bboxes_ignore = ann_info.get("bboxes_ignore", None) + if gt_bboxes_ignore is not None: + results["gt_bboxes_ignore"] = gt_bboxes_ignore.copy() + results["bbox_fields"].append("gt_bboxes_ignore") + results["bbox_fields"].append("gt_bboxes") + + gt_is_group_ofs = ann_info.get("gt_is_group_ofs", None) + if gt_is_group_ofs is not None: + results["gt_is_group_ofs"] = gt_is_group_ofs.copy() + + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results["gt_labels"] = results["ann_info"]["labels"].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann["counts"], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. + If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results["img_info"]["height"], results["img_info"]["width"] + gt_masks = results["ann_info"]["masks"] + if self.poly2mask: + gt_masks = BitmapMasks([self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks([self.process_polygons(polygons) for polygons in gt_masks], h, w) + results["gt_masks"] = gt_masks + results["mask_fields"].append("gt_masks") + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. 
+ """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results["seg_prefix"], results["ann_info"]["seg_map"]) + img_bytes = self.file_client.get(filename) + results["gt_semantic_seg"] = mmcv.imfrombytes(img_bytes, flag="unchanged").squeeze() + results["seg_fields"].append("gt_semantic_seg") + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(with_bbox={self.with_bbox}, " + repr_str += f"with_label={self.with_label}, " + repr_str += f"with_mask={self.with_mask}, " + repr_str += f"with_seg={self.with_seg}, " + repr_str += f"poly2mask={self.poly2mask}, " + repr_str += f"poly2mask={self.file_client_args})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py new file mode 100644 index 000000000..04ce19131 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .class_aware_sampler import ClassAwareSampler +from .distributed_sampler import DistributedSampler +from .group_sampler import DistributedGroupSampler, GroupSampler +from .infinite_sampler import InfiniteBatchSampler, InfiniteGroupBatchSampler + +__all__ = [ + "DistributedSampler", + "DistributedGroupSampler", + "GroupSampler", + "InfiniteGroupBatchSampler", + "InfiniteBatchSampler", + "ClassAwareSampler", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py new file mode 100644 index 000000000..393ef3feb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py @@ -0,0 +1,162 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from mmcv.runner import get_dist_info +from mmdet.core.utils import sync_random_seed +from torch.utils.data import Sampler + + +class ClassAwareSampler(Sampler): + r"""Sampler that restricts data loading to the label of the dataset. + + A class-aware sampling strategy to effectively tackle the + non-uniform class distribution. The length of the training data is + consistent with source data. Simple improvements based on `Relay + Backpropagation for Effective Learning of Deep Convolutional + Neural Networks `_ + + The implementation logic is referred to + https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py + + Args: + dataset: Dataset used for sampling. 
+ samples_per_gpu (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU. + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. + num_sample_class (int): The number of samples taken from each + per-label list. Default: 1 + """ + + def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0, num_sample_class=1): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + + self.dataset = dataset + self.num_replicas = num_replicas + self.samples_per_gpu = samples_per_gpu + self.rank = rank + self.epoch = 0 + # Must be the same across all workers. If None, will use a + # random seed shared among workers + # (require synchronization among all workers) + self.seed = sync_random_seed(seed) + + # The number of samples taken from each per-label list + assert num_sample_class > 0 and isinstance(num_sample_class, int) + self.num_sample_class = num_sample_class + # Get per-label image list from dataset + assert hasattr(dataset, "get_cat2imgs"), "dataset must have `get_cat2imgs` function" + self.cat_dict = dataset.get_cat2imgs() + + self.num_samples = ( + int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas / self.samples_per_gpu)) * self.samples_per_gpu + ) + self.total_size = self.num_samples * self.num_replicas + + # get number of images containing each category + self.num_cat_imgs = [len(x) for x in self.cat_dict.values()] + # filter labels without images + self.valid_cat_inds = [i for i, length in enumerate(self.num_cat_imgs) if length != 0] + self.num_classes = len(self.valid_cat_inds) + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + # initialize label list + label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g) + # initialize each per-label image list + data_iter_dict = dict() + for i in self.valid_cat_inds: + data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g) + + def gen_cat_img_inds(cls_list, data_dict, num_sample_cls): + """Traverse the categories and extract `num_sample_cls` image + indexes of the corresponding categories one by one.""" + id_indices = [] + for _ in range(len(cls_list)): + cls_idx = next(cls_list) + for _ in range(num_sample_cls): + id = next(data_dict[cls_idx]) + id_indices.append(id) + return id_indices + + # deterministically shuffle based on epoch + num_bins = int(math.ceil(self.total_size * 1.0 / self.num_classes / self.num_sample_class)) + indices = [] + for i in range(num_bins): + indices += gen_cat_img_inds(label_iter_list, data_iter_dict, self.num_sample_class) + + # fix extra samples to make it evenly divisible + if len(indices) >= self.total_size: + indices = indices[: self.total_size] + else: + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + +class 
RandomCycleIter: + """Shuffle the list and do it again after the list have traversed. + + The implementation logic is referred to + https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py + + Example: + >>> label_list = [0, 1, 2, 4, 5] + >>> g = torch.Generator() + >>> g.manual_seed(0) + >>> label_iter_list = RandomCycleIter(label_list, generator=g) + >>> index = next(label_iter_list) + Args: + data (list or ndarray): The data that needs to be shuffled. + generator: An torch.Generator object, which is used in setting the seed + for generating random numbers. + """ # noqa: W605 + + def __init__(self, data, generator=None): + self.data = data + self.length = len(data) + self.index = torch.randperm(self.length, generator=generator).numpy() + self.i = 0 + self.generator = generator + + def __iter__(self): + return self + + def __len__(self): + return len(self.data) + + def __next__(self): + if self.i == self.length: + self.index = torch.randperm(self.length, generator=self.generator).numpy() + self.i = 0 + idx = self.data[self.index[self.i]] + self.i += 1 + return idx diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py new file mode 100644 index 000000000..b9db3a0f2 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from mmdet.core.utils import sync_random_seed +from mmdet.utils import get_device +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + device = get_device() + self.seed = sync_random_seed(seed, device) + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + # When :attr:`shuffle=True`, this ensures all replicas + # use a different random ordering for each epoch. + # Otherwise, the next iteration of this sampler will + # yield the same ordering. 
+ g.manual_seed(self.epoch + self.seed) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * math.ceil(self.total_size / len(indices)))[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py new file mode 100644 index 000000000..923cf4bed --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import Sampler + + +class GroupSampler(Sampler): + def __init__(self, dataset, samples_per_gpu=1): + assert hasattr(dataset, "flag") + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.flag = dataset.flag.astype(np.int64) + self.group_sizes = np.bincount(self.flag) + self.num_samples = 0 + for i, size in enumerate(self.group_sizes): + self.num_samples += int(np.ceil(size / self.samples_per_gpu)) * self.samples_per_gpu + + def __iter__(self): + indices = [] + for i, size in enumerate(self.group_sizes): + if size == 0: + continue + indice = np.where(self.flag == i)[0] + assert len(indice) == size + np.random.shuffle(indice) + num_extra = int(np.ceil(size / self.samples_per_gpu)) * self.samples_per_gpu - len(indice) + indice = np.concatenate([indice, np.random.choice(indice, num_extra)]) + indices.append(indice) + indices = np.concatenate(indices) + indices = [ + indices[i * self.samples_per_gpu : (i + 1) * self.samples_per_gpu] + for i in np.random.permutation(range(len(indices) // self.samples_per_gpu)) + ] + indices = np.concatenate(indices) + indices = indices.astype(np.int64).tolist() + assert len(indices) == self.num_samples + return iter(indices) + + def __len__(self): + return self.num_samples + + +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. 
+ """ + + def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.seed = seed if seed is not None else 0 + + assert hasattr(self.dataset, "flag") + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += ( + int(math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) + * self.samples_per_gpu + ) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). + indice = indice[list(torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil(size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[: extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] + for i in list(torch.randperm(len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py new file mode 100644 index 000000000..11b4acd86 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import itertools + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from mmdet.core.utils import sync_random_seed +from torch.utils.data.sampler import Sampler + + +class InfiniteGroupBatchSampler(Sampler): + """Similar to `BatchSampler` warping a `GroupSampler. It is designed for + iteration-based runners like `IterBasedRunner` and yields a mini-batch + indices each time, all indices in a batch should be in the same group. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py + + Args: + dataset (object): The dataset. + batch_size (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU. + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + world_size (int, optional): Number of processes participating in + distributed training. Default: None. 
+ rank (int, optional): Rank of current process. Default: None. + seed (int): Random seed. Default: 0. + shuffle (bool): Whether shuffle the indices of a dummy `epoch`, it + should be noted that `shuffle` can not guarantee that you can + generate sequential indices because it need to ensure + that all indices in a batch is in a group. Default: True. + """ # noqa: W605 + + def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0, shuffle=True): + _rank, _world_size = get_dist_info() + if world_size is None: + world_size = _world_size + if rank is None: + rank = _rank + self.rank = rank + self.world_size = world_size + self.dataset = dataset + self.batch_size = batch_size + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + self.seed = sync_random_seed(seed) + self.shuffle = shuffle + + assert hasattr(self.dataset, "flag") + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + # buffer used to save indices of each group + self.buffer_per_group = {k: [] for k in range(len(self.group_sizes))} + + self.size = len(dataset) + self.indices = self._indices_of_rank() + + def _infinite_indices(self): + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(self.size, generator=g).tolist() + + else: + yield from torch.arange(self.size).tolist() + + def _indices_of_rank(self): + """Slice the infinite indices by rank.""" + yield from itertools.islice(self._infinite_indices(), self.rank, None, self.world_size) + + def __iter__(self): + # once batch size is reached, yield the indices + for idx in self.indices: + flag = self.flag[idx] + group_buffer = self.buffer_per_group[flag] + group_buffer.append(idx) + if len(group_buffer) == self.batch_size: + yield group_buffer[:] + del group_buffer[:] + + def __len__(self): + """Length of base dataset.""" + return self.size + + +class InfiniteBatchSampler(Sampler): + """Similar to `BatchSampler` warping a `DistributedSampler. It is designed + iteration-based runners like `IterBasedRunner` and yields a mini-batch + indices each time. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py + + Args: + dataset (object): The dataset. + batch_size (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU, + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + world_size (int, optional): Number of processes participating in + distributed training. Default: None. + rank (int, optional): Rank of current process. Default: None. + seed (int): Random seed. Default: 0. + shuffle (bool): Whether shuffle the dataset or not. Default: True. 
+ """ # noqa: W605 + + def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0, shuffle=True): + _rank, _world_size = get_dist_info() + if world_size is None: + world_size = _world_size + if rank is None: + rank = _rank + self.rank = rank + self.world_size = world_size + self.dataset = dataset + self.batch_size = batch_size + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + self.seed = sync_random_seed(seed) + self.shuffle = shuffle + self.size = len(dataset) + self.indices = self._indices_of_rank() + + def _infinite_indices(self): + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(self.size, generator=g).tolist() + + else: + yield from torch.arange(self.size).tolist() + + def _indices_of_rank(self): + """Slice the infinite indices by rank.""" + yield from itertools.islice(self._infinite_indices(), self.rank, None, self.world_size) + + def __iter__(self): + # once batch size is reached, yield the indices + batch_buffer = [] + for idx in self.indices: + batch_buffer.append(idx) + if len(batch_buffer) == self.batch_size: + yield batch_buffer + batch_buffer = [] + + def __len__(self): + """Length of base dataset.""" + return self.size diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py new file mode 100644 index 000000000..7281a4c13 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .builder import ( + BACKBONES, + DETECTORS, + HEADS, + LOSSES, + NECKS, + ROI_EXTRACTORS, + SHARED_HEADS, + build_backbone, + build_detector, + build_head, + build_loss, + build_neck, +) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py new file mode 100644 index 000000000..9048153df --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry + +MODELS = Registry("models", parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +ROI_EXTRACTORS = MODELS +SHARED_HEADS = MODELS +HEADS = MODELS +LOSSES = MODELS +DETECTORS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning) + assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field " + assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field " + + return DETECTORS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py new file mode 100644 index 000000000..d4b12417f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py @@ -0,0 +1,274 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import force_fp32 +from mmdet3d.core.bbox.coders import build_bbox_coder # multi_apply +from mmdet.core.anchor.point_generator import MlvlPointGenerator + +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .dense_test_mixins import BBoxTestMixin + + +@HEADS.register_module() +class AnchorFreeHead(BaseDenseHead, BBoxTestMixin): + """Anchor-free head (FCOS, Fovea, RepPoints, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (tuple): Downsample factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + bbox_coder (dict): Config of bbox coder. Defaults + 'DistancePointBBoxCoder'. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ # noqa: W605 + + _version = 1 + + def __init__( + self, + num_classes, + in_channels, + feat_channels=256, + stacked_convs=4, + strides=(4, 8, 16, 32, 64), + dcn_on_last_conv=False, + conv_bias="auto", + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="IoULoss", loss_weight=1.0), + bbox_coder=dict(type="DistancePointBBoxCoder"), + conv_cfg=None, + norm_cfg=None, + train_cfg=None, + test_cfg=None, + init_cfg=dict( + type="Normal", + layer="Conv2d", + std=0.01, + override=dict(type="Normal", name="conv_cls", std=0.01, bias_prob=0.01), + ), + ): + super(AnchorFreeHead, self).__init__(init_cfg) + self.num_classes = num_classes + self.use_sigmoid_cls = loss_cls.get("use_sigmoid", False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == "auto" or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.bbox_coder = build_bbox_coder(bbox_coder) + + self.prior_generator = MlvlPointGenerator(strides) + + # In order to keep a more general interface and be consistent with + # anchor_head. We can think of point like one anchor + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type="DCNv2") + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias, + ) + ) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type="DCNv2") + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias, + ) + ) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + """Hack some keys of the model state dict so that can load checkpoints + of previous version.""" + version = local_metadata.get("version", None) + if version is None: + # the key is different in early versions + # for example, 'fcos_cls' become 'conv_cls' now + bbox_head_keys = [k for k in state_dict.keys() if 
k.startswith(prefix)] + ori_predictor_keys = [] + new_predictor_keys = [] + # e.g. 'fcos_cls' or 'fcos_reg' + for key in bbox_head_keys: + ori_predictor_keys.append(key) + key = key.split(".") + conv_name = None + if key[1].endswith("cls"): + conv_name = "conv_cls" + elif key[1].endswith("reg"): + conv_name = "conv_reg" + elif key[1].endswith("centerness"): + conv_name = "conv_centerness" + else: + assert NotImplementedError + if conv_name is not None: + key[1] = conv_name + new_predictor_keys.append(".".join(key)) + else: + ori_predictor_keys.pop(-1) + for i in range(len(new_predictor_keys)): + state_dict[new_predictor_keys[i]] = state_dict.pop(ori_predictor_keys[i]) + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores and bbox predictions. + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + """ + return multi_apply(self.forward_single, feats)[:2] + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, features + after classification and regression conv layers, some + models needs these features like FCOS. + """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + return cls_score, bbox_pred, cls_feat, reg_feat + + @abstractmethod + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + """ + + raise NotImplementedError + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py new file mode 100644 index 000000000..af3369445 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py @@ -0,0 +1,520 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch +from mmcv.cnn.utils.weight_init import constant_init +from mmcv.ops import batched_nms +from mmcv.runner import BaseModule, force_fp32 + + +class BaseDenseHead(BaseModule, metaclass=ABCMeta): + """Base class for DenseHeads.""" + + def __init__(self, init_cfg=None): + super(BaseDenseHead, self).__init__(init_cfg) + + def init_weights(self): + super(BaseDenseHead, self).init_weights() + # avoid init_cfg overwrite the initialization of `conv_offset` + for m in self.modules(): + # DeformConv2dPack, ModulatedDeformConv2dPack + if hasattr(m, "conv_offset"): + constant_init(m.conv_offset, 0) + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def get_bboxes( + self, + cls_scores, + bbox_preds, + score_factors=None, + img_metas=None, + cfg=None, + rescale=False, + with_nms=True, + **kwargs + ): + """Transform network outputs of a batch into bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Default None. + img_metas (list[dict], Optional): Image meta info. Default None. + cfg (mmcv.Config, Optional): Test / postprocessing configuration, + if None, test_cfg would be used. Default None. + rescale (bool): If True, return boxes in original image space. + Default False. + with_nms (bool): If True, do nms before return boxes. + Default True. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of + the corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. 
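+ # NOTE: score factors (e.g. centerness in FCOS) are sigmoid-activated per
+ # level in `_get_bboxes_single` and multiplied into the classification
+ # scores in `_bbox_post_process` before NMS runs. The per-image slicing
+ # below relies on `select_single_mlvl`, which this trimmed copy does not
+ # define; in upstream mmdet it is provided by `mmdet.core.utils` and is
+ # assumed to be importable here as well.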
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device + ) + + result_list = [] + + for img_id in range(len(img_metas)): + img_meta = img_metas[img_id] + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + if with_score_factors: + score_factor_list = select_single_mlvl(score_factors, img_id) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._get_bboxes_single( + cls_score_list, + bbox_pred_list, + score_factor_list, + mlvl_priors, + img_meta, + cfg, + rescale, + with_nms, + **kwargs + ) + result_list.append(results) + return result_list + + def _get_bboxes_single( + self, + cls_score_list, + bbox_pred_list, + score_factor_list, + mlvl_priors, + img_meta, + cfg, + rescale=False, + with_nms=True, + **kwargs + ): + """Transform outputs of a single image into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
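+ # NOTE: the per-level filtering below uses `filter_scores_and_topk`, which
+ # is not defined in this trimmed copy; in upstream mmdet it lives in
+ # `mmdet.core.utils`, applies `cfg.score_thr`, keeps at most `nms_pre`
+ # candidates and returns (scores, labels, keep_idxs, filtered_results),
+ # exactly as unpacked a few lines further down.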
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta["img_shape"] + nms_pre = cfg.get("nms_pre", -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, score_factor_list, mlvl_priors) + ): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + if with_score_factors: + score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk(scores, cfg.score_thr, nms_pre, dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results["bbox_pred"] + priors = filtered_results["priors"] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + if with_score_factors: + mlvl_score_factors.append(score_factor) + + return self._bbox_post_process( + mlvl_scores, + mlvl_labels, + mlvl_bboxes, + img_meta["scale_factor"], + cfg, + rescale, + with_nms, + mlvl_score_factors, + **kwargs + ) + + def _bbox_post_process( + self, + mlvl_scores, + mlvl_labels, + mlvl_bboxes, + scale_factor, + cfg, + rescale=False, + with_nms=True, + mlvl_score_factors=None, + **kwargs + ): + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + mlvl_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_labels (list[Tensor]): Box class labels from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale + levels of a single image, each item has shape (num_bboxes, 4). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + mlvl_score_factors (list[Tensor], optional): Score factor from + all scale levels of a single image, each item has shape + (num_bboxes, ). Default: None. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. 
If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_labels = torch.cat(mlvl_labels) + + if mlvl_score_factors is not None: + # TODO: Add sqrt operation in order to be consistent with + # the paper. + mlvl_score_factors = torch.cat(mlvl_score_factors) + mlvl_scores = mlvl_scores * mlvl_score_factors + + if with_nms: + if mlvl_bboxes.numel() == 0: + det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1) + return det_bboxes, mlvl_labels + + det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores, mlvl_labels, cfg.nms) + det_bboxes = det_bboxes[: cfg.max_per_img] + det_labels = mlvl_labels[keep_idxs][: cfg.max_per_img] + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores, mlvl_labels + + def forward_train( + self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None, **kwargs + ): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas=img_metas, cfg=proposal_cfg) + return losses, proposal_list + + def simple_test(self, feats, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n, ). + """ + return self.simple_test_bboxes(feats, img_metas, rescale=rescale) + + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def onnx_export(self, cls_scores, bbox_preds, score_factors=None, img_metas=None, with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_points * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + score_factors (list[Tensor]): score_factors for each s + cale level with shape (N, num_points * 1, H, W). + Default: None. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. Default: None. + with_nms (bool): Whether apply nms to the bboxes. Default: True. + + Returns: + tuple[Tensor, Tensor] | list[tuple]: When `with_nms` is True, + it is tuple[Tensor, Tensor], first tensor bboxes with shape + [N, num_det, 5], 5 arrange as (x1, y1, x2, y2, score) + and second element is class labels of shape [N, num_det]. + When `with_nms` is False, first tensor is bboxes with + shape [N, num_det, 4], second tensor is raw score has + shape [N, num_det, num_classes]. + """ + assert len(cls_scores) == len(bbox_preds) + + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=bbox_preds[0].dtype, device=bbox_preds[0].device + ) + + mlvl_cls_scores = [cls_scores[i].detach() for i in range(num_levels)] + mlvl_bbox_preds = [bbox_preds[i].detach() for i in range(num_levels)] + + assert len(img_metas) == 1, "Only support one input image while in exporting to ONNX" + img_shape = img_metas[0]["img_shape_for_onnx"] + + cfg = self.test_cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_priors) + device = cls_scores[0].device + batch_size = cls_scores[0].shape[0] + # convert to tensor to keep tracing + nms_pre_tensor = torch.tensor(cfg.get("nms_pre", -1), device=device, dtype=torch.long) + + # e.g. Retina, FreeAnchor, etc. + if score_factors is None: + with_score_factors = False + mlvl_score_factor = [None for _ in range(num_levels)] + else: + # e.g. FCOS, PAA, ATSS, etc. + with_score_factors = True + mlvl_score_factor = [score_factors[i].detach() for i in range(num_levels)] + mlvl_score_factors = [] + + mlvl_batch_bboxes = [] + mlvl_scores = [] + + for cls_score, bbox_pred, score_factors, priors in zip( + mlvl_cls_scores, mlvl_bbox_preds, mlvl_score_factor, mlvl_priors + ): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + scores = cls_score.permute(0, 2, 3, 1).reshape(batch_size, -1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = scores.sigmoid() + nms_pre_score = scores + else: + scores = scores.softmax(-1) + nms_pre_score = scores + + if with_score_factors: + score_factors = score_factors.permute(0, 2, 3, 1).reshape(batch_size, -1).sigmoid() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) + priors = priors.expand(batch_size, -1, priors.size(-1)) + # Get top-k predictions + from mmdet.core.export import get_k_for_topk + + nms_pre = get_k_for_topk(nms_pre_tensor, bbox_pred.shape[1]) + if nms_pre > 0: + + if with_score_factors: + nms_pre_score = nms_pre_score * score_factors[..., None] + else: + nms_pre_score = nms_pre_score + + # Get maximum scores for foreground classes. 
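+ # With sigmoid classification every output channel is a foreground class,
+ # so the maximum is taken over all channels; with softmax the last channel
+ # is the background class (cat_id == num_classes since mmdet v2.0) and is
+ # dropped before computing the per-prior maximum used for top-k selection.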
+ if self.use_sigmoid_cls: + max_scores, _ = nms_pre_score.max(-1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = nms_pre_score[..., :-1].max(-1) + _, topk_inds = max_scores.topk(nms_pre) + + batch_inds = torch.arange(batch_size, device=bbox_pred.device).view(-1, 1).expand_as(topk_inds).long() + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + transformed_inds = bbox_pred.shape[1] * batch_inds + topk_inds + priors = priors.reshape(-1, priors.size(-1))[transformed_inds, :].reshape( + batch_size, -1, priors.size(-1) + ) + bbox_pred = bbox_pred.reshape(-1, 4)[transformed_inds, :].reshape(batch_size, -1, 4) + scores = scores.reshape(-1, self.cls_out_channels)[transformed_inds, :].reshape( + batch_size, -1, self.cls_out_channels + ) + if with_score_factors: + score_factors = score_factors.reshape(-1, 1)[transformed_inds].reshape(batch_size, -1) + + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + mlvl_batch_bboxes.append(bboxes) + mlvl_scores.append(scores) + if with_score_factors: + mlvl_score_factors.append(score_factors) + + batch_bboxes = torch.cat(mlvl_batch_bboxes, dim=1) + batch_scores = torch.cat(mlvl_scores, dim=1) + if with_score_factors: + batch_score_factors = torch.cat(mlvl_score_factors, dim=1) + + # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment + + from mmdet.core.export import add_dummy_nms_for_onnx + + if not self.use_sigmoid_cls: + batch_scores = batch_scores[..., : self.num_classes] + + if with_score_factors: + batch_scores = batch_scores * (batch_score_factors.unsqueeze(2)) + + if with_nms: + max_output_boxes_per_class = cfg.nms.get("max_output_boxes_per_class", 200) + iou_threshold = cfg.nms.get("iou_threshold", 0.5) + score_threshold = cfg.score_thr + nms_pre = cfg.get("deploy_nms_pre", -1) + return add_dummy_nms_for_onnx( + batch_bboxes, + batch_scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + nms_pre, + cfg.max_per_img, + ) + else: + return batch_bboxes, batch_scores diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py new file mode 100644 index 000000000..c1424e602 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from inspect import signature + +import torch + + +class BBoxTestMixin(object): + """Mixin class for testing det bboxes via DenseHead.""" + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
+ The shape of the second tensor in the tuple is ``labels`` + with shape (n,) + """ + outs = self.forward(feats) + results_list = self.get_bboxes(*outs, img_metas=img_metas, rescale=rescale) + return results_list + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,). The length of list should always be 1. + """ + # check with_nms argument + gb_sig = signature(self.get_bboxes) + gb_args = [p.name for p in gb_sig.parameters.values()] + gbs_sig = signature(self._get_bboxes_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ("with_nms" in gb_args) and ("with_nms" in gbs_args), ( + f"{self.__class__.__name__}" " does not support test-time augmentation" + ) + + aug_bboxes = [] + aug_scores = [] + aug_labels = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_outputs = self.get_bboxes(*outs, img_metas=img_meta, cfg=self.test_cfg, rescale=False, with_nms=False)[ + 0 + ] + aug_bboxes.append(bbox_outputs[0]) + aug_scores.append(bbox_outputs[1]) + if len(bbox_outputs) >= 3: + aug_labels.append(bbox_outputs[2]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes(aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None + + if merged_bboxes.numel() == 0: + det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1) + return [ + (det_bboxes, merged_labels), + ] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, merged_labels, self.test_cfg.nms) + det_bboxes = det_bboxes[: self.test_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][: self.test_cfg.max_per_img] + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor(img_metas[0][0]["scale_factor"]) + + return [ + (_det_bboxes, det_labels), + ] + + def simple_test_rpn(self, x, img_metas): + """Test without augmentation, only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + """ + rpn_outs = self(x) + proposal_list = self.get_bboxes(*rpn_outs, img_metas=img_metas) + return proposal_list + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. 
+ + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]["img_shape"] + scale_factor = img_info[0]["scale_factor"] + flip = img_info[0]["flip"] + flip_direction = img_info[0]["flip_direction"] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py b/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py new file mode 100644 index 000000000..6dd11bd8d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch +from mmcv.runner import BaseModule, auto_fp16 + + +class BaseDetector(BaseModule, metaclass=ABCMeta): + """Base class for detectors.""" + + def __init__(self, init_cfg=None): + super(BaseDetector, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the detector has a neck""" + return hasattr(self, "neck") and self.neck is not None + + # TODO: these properties need to be carefully handled + # for both single stage & two stage detectors + @property + def with_shared_head(self): + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, "roi_head") and self.roi_head.with_shared_head + + @property + def with_bbox(self): + """bool: whether the detector has a bbox head""" + return (hasattr(self, "roi_head") and self.roi_head.with_bbox) or ( + hasattr(self, "bbox_head") and self.bbox_head is not None + ) + + @property + def with_mask(self): + """bool: whether the detector has a mask head""" + return (hasattr(self, "roi_head") and self.roi_head.with_mask) or ( + hasattr(self, "mask_head") and self.mask_head is not None + ) + + @abstractmethod + def extract_feat(self, imgs): + """Extract features from images.""" + pass + + def extract_feats(self, imgs): + """Extract features from multiple images. + + Args: + imgs (list[torch.Tensor]): A list of images. The images are + augmented from the same image but in different ways. + + Returns: + list[torch.Tensor]: Features of different images + """ + assert isinstance(imgs, list) + return [self.extract_feat(img) for img in imgs] + + def forward_train(self, imgs, img_metas, **kwargs): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys, see + :class:`mmdet.datasets.pipelines.Collect`. + kwargs (keyword arguments): Specific to concrete implementation. + """ + # NOTE the batched image size information may be useful, e.g. 
+ # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + batch_input_shape = tuple(imgs[0].size()[-2:]) + for img_meta in img_metas: + img_meta["batch_input_shape"] = batch_input_shape + + @abstractmethod + def simple_test(self, img, img_metas, **kwargs): + pass + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError(f"{name} must be a list, but got {type(var)}") + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f"num of augmentations ({len(imgs)}) " f"!= num of image meta ({len(img_metas)})") + + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + for img, img_meta in zip(imgs, img_metas): + batch_size = len(img_meta) + for img_id in range(batch_size): + img_meta[img_id]["batch_input_shape"] = tuple(img.size()[-2:]) + + if num_augs == 1: + # proposals (List[List[Tensor]]): the outer list indicates + # test-time augs (multiscale, flip, etc.) and the inner list + # indicates images in a batch. + # The Tensor should have a shape Px4, where P is the number of + # proposals. + if "proposals" in kwargs: + kwargs["proposals"] = kwargs["proposals"][0] + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + assert imgs[0].size(0) == 1, "aug test does not support " "inference with batch size " f"{imgs[0].size(0)}" + # TODO: support test augmentation for predefined proposals + assert "proposals" not in kwargs + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=("img",)) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if torch.onnx.is_in_onnx_export(): + assert len(img_metas) == 1 + return self.onnx_export(img[0], img_metas[0]) + + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py new file mode 100644 index 000000000..2265daf99 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
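+# NOTE: this trimmed-down module keeps only the FocalLoss wrapper; the loss
+# kernels it dispatches to (`sigmoid_focal_loss`, `py_sigmoid_focal_loss` and
+# `py_focal_loss_with_prob`) are assumed to come from upstream mmdet/mmcv and
+# are not re-defined here. For reference, the sigmoid variant computes per
+# element
+#
+#   FL(p_t) = -alpha_t * (1 - p_t) ** gamma * log(p_t)
+#
+# where p_t is the predicted probability of the ground-truth class and
+# alpha_t balances positive against negative samples.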
+import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class FocalLoss(nn.Module): + def __init__(self, use_sigmoid=True, gamma=2.0, alpha=0.25, reduction="mean", loss_weight=1.0, activated=False): + """`Focal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalLoss, self).__init__() + assert use_sigmoid is True, "Only sigmoid focal loss supported now." + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if self.use_sigmoid: + if self.activated: + calculate_loss_func = py_focal_loss_with_prob + else: + if torch.cuda.is_available() and pred.is_cuda: + calculate_loss_func = sigmoid_focal_loss + else: + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, target, weight, gamma=self.gamma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor + ) + + else: + raise NotImplementedError + return loss_cls diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py new file mode 100644 index 000000000..9aa621834 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py @@ -0,0 +1,448 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def iou_loss(pred, target, linear=False, mode="log", eps=1e-6): + """IoU loss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + The loss is calculated as negative log of IoU. 
+ + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + linear (bool, optional): If True, use linear scale of loss instead of + log scale. Default: False. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + eps (float): Eps to avoid log(0). + + Return: + torch.Tensor: Loss tensor. + """ + assert mode in ["linear", "square", "log"] + if linear: + mode = "linear" + warnings.warn( + 'DeprecationWarning: Setting "linear=True" in ' + 'iou_loss is deprecated, please use "mode=`linear`" ' + "instead." + ) + ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps) + if mode == "linear": + loss = 1 - ious + elif mode == "square": + loss = 1 - ious**2 + elif mode == "log": + loss = -ious.log() + else: + raise NotImplementedError + return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3): +# """BIoULoss. + +# This is an implementation of paper +# `Improving Object Localization with Fitness NMS and Bounded IoU Loss. +# `_. + +# Args: +# pred (torch.Tensor): Predicted bboxes. +# target (torch.Tensor): Target bboxes. +# beta (float): beta parameter in smoothl1. +# eps (float): eps to avoid NaN. +# """ +# pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5 +# pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5 +# pred_w = pred[:, 2] - pred[:, 0] +# pred_h = pred[:, 3] - pred[:, 1] +# with torch.no_grad(): +# target_ctrx = (target[:, 0] + target[:, 2]) * 0.5 +# target_ctry = (target[:, 1] + target[:, 3]) * 0.5 +# target_w = target[:, 2] - target[:, 0] +# target_h = target[:, 3] - target[:, 1] + +# dx = target_ctrx - pred_ctrx +# dy = target_ctry - pred_ctry + +# loss_dx = 1 - torch.max( +# (target_w - 2 * dx.abs()) / +# (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx)) +# loss_dy = 1 - torch.max( +# (target_h - 2 * dy.abs()) / +# (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy)) +# loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w / +# (target_w + eps)) +# loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h / +# (target_h + eps)) +# # view(..., -1) does not work for empty tensor +# loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh], +# dim=-1).flatten(1) + +# loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta, +# loss_comb - 0.5 * beta) +# return loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def giou_loss(pred, target, eps=1e-7): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + gious = bbox_overlaps(pred, target, mode="giou", is_aligned=True, eps=eps) + loss = 1 - gious + return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def diou_loss(pred, target, eps=1e-7): +# r"""`Implementation of Distance-IoU Loss: Faster and Better +# Learning for Bounding Box Regression, https://arxiv.org/abs/1911.08287`_. + +# Code is modified from https://github.com/Zzh-tju/DIoU. + +# Args: +# pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), +# shape (n, 4). +# target (Tensor): Corresponding gt bboxes, shape (n, 4). +# eps (float): Eps to avoid log(0). +# Return: +# Tensor: Loss tensor. 
+# """ +# # overlap +# lt = torch.max(pred[:, :2], target[:, :2]) +# rb = torch.min(pred[:, 2:], target[:, 2:]) +# wh = (rb - lt).clamp(min=0) +# overlap = wh[:, 0] * wh[:, 1] + +# # union +# ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) +# ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) +# union = ap + ag - overlap + eps + +# # IoU +# ious = overlap / union + +# # enclose area +# enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) +# enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) +# enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + +# cw = enclose_wh[:, 0] +# ch = enclose_wh[:, 1] + +# c2 = cw**2 + ch**2 + eps + +# b1_x1, b1_y1 = pred[:, 0], pred[:, 1] +# b1_x2, b1_y2 = pred[:, 2], pred[:, 3] +# b2_x1, b2_y1 = target[:, 0], target[:, 1] +# b2_x2, b2_y2 = target[:, 2], target[:, 3] + +# left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 +# right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 +# rho2 = left + right + +# # DIoU +# dious = ious - rho2 / c2 +# loss = 1 - dious +# return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def ciou_loss(pred, target, eps=1e-7): +# r"""`Implementation of paper `Enhancing Geometric Factors into +# Model Learning and Inference for Object Detection and Instance +# Segmentation `_. + +# Code is modified from https://github.com/Zzh-tju/CIoU. + +# Args: +# pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), +# shape (n, 4). +# target (Tensor): Corresponding gt bboxes, shape (n, 4). +# eps (float): Eps to avoid log(0). +# Return: +# Tensor: Loss tensor. +# """ +# # overlap +# lt = torch.max(pred[:, :2], target[:, :2]) +# rb = torch.min(pred[:, 2:], target[:, 2:]) +# wh = (rb - lt).clamp(min=0) +# overlap = wh[:, 0] * wh[:, 1] + +# # union +# ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) +# ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) +# union = ap + ag - overlap + eps + +# # IoU +# ious = overlap / union + +# # enclose area +# enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) +# enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) +# enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + +# cw = enclose_wh[:, 0] +# ch = enclose_wh[:, 1] + +# c2 = cw**2 + ch**2 + eps + +# b1_x1, b1_y1 = pred[:, 0], pred[:, 1] +# b1_x2, b1_y2 = pred[:, 2], pred[:, 3] +# b2_x1, b2_y1 = target[:, 0], target[:, 1] +# b2_x2, b2_y2 = target[:, 2], target[:, 3] + +# w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps +# w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + +# left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 +# right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 +# rho2 = left + right + +# factor = 4 / math.pi**2 +# v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + +# with torch.no_grad(): +# alpha = (ious > 0.5).float() * v / (1 - ious + v) + +# # CIoU +# cious = ious - (rho2 / c2 + alpha * v) +# loss = 1 - cious.clamp(min=-1.0, max=1.0) +# return loss + + +@LOSSES.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + + Args: + linear (bool): If True, use linear scale of loss else determined + by mode. Default: False. + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + mode (str): Loss scaling mode, including "linear", "square", and "log". 
+ Default: 'log' + """ + + def __init__(self, linear=False, eps=1e-6, reduction="mean", loss_weight=1.0, mode="log"): + super(IoULoss, self).__init__() + assert mode in ["linear", "square", "log"] + if linear: + mode = "linear" + warnings.warn( + 'DeprecationWarning: Setting "linear=True" in ' + 'IOULoss is deprecated, please use "mode=`linear`" ' + "instead." + ) + self.mode = mode + self.linear = linear + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if (weight is not None) and (not torch.any(weight > 0)) and (reduction != "none"): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # iou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * iou_loss( + pred, target, weight, mode=self.mode, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs + ) + return loss + + +# @LOSSES.register_module() +# class BoundedIoULoss(nn.Module): + +# def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0): +# super(BoundedIoULoss, self).__init__() +# self.beta = beta +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# loss = self.loss_weight * bounded_iou_loss( +# pred, +# target, +# weight, +# beta=self.beta, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss + + +@LOSSES.register_module() +class GIoULoss(nn.Module): + def __init__(self, eps=1e-6, reduction="mean", loss_weight=1.0): + super(GIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert 
weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs + ) + return loss + + +# @LOSSES.register_module() +# class DIoULoss(nn.Module): + +# def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): +# super(DIoULoss, self).__init__() +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# if weight is not None and weight.dim() > 1: +# # TODO: remove this in the future +# # reduce the weight of shape (n, 4) to (n,) to match the +# # giou_loss of shape (n,) +# assert weight.shape == pred.shape +# weight = weight.mean(-1) +# loss = self.loss_weight * diou_loss( +# pred, +# target, +# weight, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss + + +# @LOSSES.register_module() +# class CIoULoss(nn.Module): + +# def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): +# super(CIoULoss, self).__init__() +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# if weight is not None and weight.dim() > 1: +# # TODO: remove this in the future +# # reduce the weight of shape (n, 4) to (n,) to match the +# # giou_loss of shape (n,) +# assert weight.shape == pred.shape +# weight = weight.mean(-1) +# loss = self.loss_weight * ciou_loss( +# pred, +# target, +# weight, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py new file mode 100644 index 000000000..82fda7704 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". 
+ loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction="mean", loss_weight=1.0): + super(L1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + loss_bbox = self.loss_weight * l1_loss(pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py new file mode 100644 index 000000000..7e79bd2b3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import functools + +import mmcv +import torch +import torch.nn.functional as F + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +@mmcv.jit(derivate=True, coderize=True) +def weight_reduce_loss(loss, weight=None, reduction="mean", avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == "mean": + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != "none": + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. 
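As the docstring above describes, weight_reduce_loss applies the element-wise weight first and then either reduces with the requested reduction or divides the summed loss by avg_factor. A self-contained re-implementation of those rules (illustrative only, so it does not depend on the mmcv.jit decorator), reproducing the numbers in the doctest further below:

import torch


def weight_reduce_loss_sketch(loss, weight=None, reduction="mean", avg_factor=None):
    """Illustrative re-implementation of the reduction rules described above."""
    if weight is not None:
        loss = loss * weight
    if avg_factor is None:
        if reduction == "mean":
            return loss.mean()
        if reduction == "sum":
            return loss.sum()
        return loss
    if reduction == "mean":
        return loss.sum() / (avg_factor + torch.finfo(torch.float32).eps)
    if reduction == "none":
        return loss
    raise ValueError('avg_factor can not be used with reduction="sum"')


elementwise = torch.tensor([1.0, 1.0, 2.0])
weight = torch.tensor([1.0, 0.0, 1.0])
print(weight_reduce_loss_sketch(elementwise, weight))                # tensor(1.)     -> (1 + 0 + 2) / 3
print(weight_reduce_loss_sketch(elementwise, weight, avg_factor=2))  # tensor(1.5000) -> (1 + 0 + 2) / 2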
The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, target, weight=None, reduction="mean", avg_factor=None, **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py new file mode 100644 index 000000000..0b288063d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. + +from .builder import build_transformer +from .res_layer import ResLayer, SimplifiedBasicBlock diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py new file mode 100644 index 000000000..d258773c6 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry("Transformer") +LINEAR_LAYERS = Registry("linear layers") + + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +LINEAR_LAYERS.register_module("Linear", module=nn.Linear) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py new file mode 100644 index 000000000..7e734c1eb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule, Sequential +from torch import nn as nn + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. 
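build_transformer above is a thin wrapper around mmcv's Registry/build_from_cfg, so any module registered in TRANSFORMER can be instantiated from a config dict. A rough usage sketch; DummyTransformer and the import path are illustrative assumptions, not part of this patch:

import torch
import torch.nn as nn

# the import path assumes the vendored package is exposed as `mmdet`; adjust if needed
from mmdet.models.utils.builder import TRANSFORMER, build_transformer


@TRANSFORMER.register_module()
class DummyTransformer(nn.Module):
    """Toy stand-in so the cfg -> object round trip can be shown."""

    def __init__(self, embed_dims=256):
        super().__init__()
        self.proj = nn.Linear(embed_dims, embed_dims)

    def forward(self, x):
        return self.proj(x)


cfg = dict(type="DummyTransformer", embed_dims=128)
transformer = build_transformer(cfg)  # instantiates DummyTransformer(embed_dims=128)
print(transformer(torch.randn(2, 128)).shape)  # torch.Size([2, 128])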
Default: True + """ + + def __init__( + self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type="BN"), + downsample_first=True, + **kwargs + ): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False) + ) + downsample.extend( + [ + build_conv_layer( + conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=conv_stride, bias=False + ), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ] + ) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs + ) + ) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block(inplanes=inplanes, planes=planes, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs) + ) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block(inplanes=inplanes, planes=inplanes, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs) + ) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs + ) + ) + super(ResLayer, self).__init__(*layers) + + +class SimplifiedBasicBlock(BaseModule): + """Simplified version of original basic residual block. This is used in + `SCNet `_. + + - Norm layer is now optional + - Last ReLU in forward function is removed + """ + + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style="pytorch", + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type="BN"), + dcn=None, + plugins=None, + init_fg=None, + ): + super(SimplifiedBasicBlock, self).__init__(init_fg) + assert dcn is None, "Not implemented yet." + assert plugins is None, "Not implemented yet." + assert not with_cp, "Not implemented yet." 
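ResLayer above expects an mmdet-style residual block: an expansion class attribute plus inplanes/planes/stride/downsample/conv_cfg/norm_cfg keyword arguments. A rough usage sketch with a toy block; ToyBasicBlock is illustrative only, and ResLayer refers to the class defined in res_layer.py above (import it from wherever the vendored module lives):

import torch
import torch.nn as nn
from mmcv.cnn import build_conv_layer, build_norm_layer


class ToyBasicBlock(nn.Module):
    """Minimal block with the interface ResLayer expects; illustrative only."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, conv_cfg=None, norm_cfg=dict(type="BN"), **kwargs):
        super().__init__()
        self.conv1 = build_conv_layer(conv_cfg, inplanes, planes, 3, stride=stride, padding=1, bias=False)
        self.bn1 = build_norm_layer(norm_cfg, planes)[1]
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = self.downsample(x) if self.downsample is not None else x
        return self.relu(self.bn1(self.conv1(x)) + identity)


# stride=2 with a channel change triggers the downsample branch built inside ResLayer
layer = ResLayer(ToyBasicBlock, inplanes=64, planes=128, num_blocks=2, stride=2)
print(layer(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 128, 16, 16])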
+ self.with_norm = norm_cfg is not None + with_bias = True if norm_cfg is None else False + self.conv1 = build_conv_layer( + conv_cfg, inplanes, planes, 3, stride=stride, padding=dilation, dilation=dilation, bias=with_bias + ) + if self.with_norm: + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer(conv_cfg, planes, planes, 3, padding=1, bias=with_bias) + if self.with_norm: + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) if self.with_norm else None + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) if self.with_norm else None + + def forward(self, x): + """Forward function.""" + + identity = x + + out = self.conv1(x) + if self.with_norm: + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + if self.with_norm: + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py new file mode 100644 index 000000000..ba1fa3973 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py new file mode 100644 index 000000000..d0d7b8f0b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .util_distribution import get_device diff --git a/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py b/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py new file mode 100644 index 000000000..1466423c4 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
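inverse_sigmoid in transformer.py above is the logit function with clamping for numerical safety. A quick sanity check that it inverts torch.sigmoid up to the eps clamping (the import path is an assumption about how the vendored package is exposed):

import torch

# the import path assumes the vendored package is exposed as `mmdet`; adjust if needed
from mmdet.models.utils.transformer import inverse_sigmoid

x = torch.tensor([0.0, 0.25, 0.5, 0.99, 1.0])
roundtrip = torch.sigmoid(inverse_sigmoid(x))
# equality holds only up to the eps clamping at the 0/1 endpoints
print(torch.allclose(x.clamp(1e-5, 1 - 1e-5), roundtrip, atol=1e-4))  # True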
+import torch + + +def get_device(): + """Returns an available device, cpu, cuda or mlu.""" + is_device_available = {"cuda": torch.cuda.is_available(), "mlu": is_mlu_available()} + device_list = [k for k, v in is_device_available.items() if v] + return device_list[0] if len(device_list) == 1 else "cpu" diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py new file mode 100644 index 000000000..bfcdeed3b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "trailer", + "bus", + "construction_vehicle", + "bicycle", + "motorcycle", + "pedestrian", + "traffic_cone", + "barrier", +] +dataset_type = "NuScenesDataset" +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict(use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) +file_client_args = dict(backend="disk") +test_pipeline = [ + dict(type="LoadPointsFromFile", coord_type="LIDAR", load_dim=5, use_dim=5, file_client_args=file_client_args), + dict(type="LoadPointsFromMultiSweeps", sweeps_num=10, file_client_args=file_client_args), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="GlobalRotScaleTrans", rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), + dict(type="RandomFlip3D"), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["points"]), + ], + ), +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py new file mode 100644 index 000000000..ed42a1e20 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict(interval=50, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]) +# yapf:enable +dist_params = dict(backend="nccl") +log_level = "INFO" +work_dir = None +load_from = None +resume_from = None +workflow = [("train", 1)] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py new file mode 
100644 index 000000000..9bc22e852 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .transforms import bbox3d2result diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py new file mode 100644 index 000000000..94fc09eb7 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.bbox import build_bbox_coder diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py new file mode 100644 index 000000000..1a3ab5953 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode +from .cam_box3d import CameraInstance3DBoxes +from .coord_3d_mode import Coord3DMode +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import ( + get_box_type, + get_proj_mat_by_coord_type, + limit_period, + mono_cam_box2vis, + points_cam2img, + rotation_3d_in_axis, + xywhr2xyxyr, +) + +__all__ = [ + "Box3DMode", + "BaseInstance3DBoxes", + "LiDARInstance3DBoxes", + "CameraInstance3DBoxes", + "DepthInstance3DBoxes", + "xywhr2xyxyr", + "get_box_type", + "rotation_3d_in_axis", + "limit_period", + "points_cam2img", + "Coord3DMode", + "mono_cam_box2vis", + "get_proj_mat_by_coord_type", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py new file mode 100644 index 000000000..073e57222 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py @@ -0,0 +1,338 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from abc import abstractmethod + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .utils import xywhr2xyxyr + + +class BaseInstance3DBoxes(object): + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in + the box is (0.5, 0.5, 0). + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. + box_dim (int): Number of the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw). + Default to 7. + with_yaw (bool): Whether the box is with yaw rotation. + If False, the value of yaw will be set to 0 as minmax boxes. + Default to True. + origin (tuple[float]): The relative position of origin in the box. + Default to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). 
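The origin argument described above re-centers incoming boxes to the bottom-center convention (0.5, 0.5, 0). A small sketch using LiDARInstance3DBoxes (defined later in this patch); the import path assumes the vendored mmdet3d package is importable as shown:

import torch

# assumes the vendored package is importable as `mmdet3d`; adjust the path as needed
from mmdet3d.core.bbox.structures import LiDARInstance3DBoxes

# one box given with a gravity-center origin (0.5, 0.5, 0.5): x, y, z, x_size, y_size, z_size, yaw
raw = torch.tensor([[10.0, 2.0, 1.0, 4.0, 2.0, 2.0, 0.0]])
boxes = LiDARInstance3DBoxes(raw, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0.5))

print(boxes.bottom_center)   # tensor([[10., 2., 0.]])  z shifted down by z_size / 2
print(boxes.gravity_center)  # tensor([[10., 2., 1.]])  back at the original center
print(boxes.volume)          # tensor([16.])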
+ with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def volume(self): + """torch.Tensor: A vector with volume of each box.""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self): + """torch.Tensor: Corners of each box with size (N, 8, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self): + """torch.Tensor: A vector with yaw of each box.""" + return self.tensor[:, 6] + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 5] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + return self.bottom_height + self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 2] + + @property + def center(self): + """Calculate the center of all the boxes. + + Note: + In the MMDetection3D's convention, the bottom center is + usually taken as the default center. + + The relative position of the centers in different kinds of + boxes are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. + It is recommended to use ``bottom_center`` or ``gravity_center`` + for more clear usage. + + Returns: + torch.Tensor: A tensor with center of each box. + """ + return self.bottom_center + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box.""" + return self.tensor[:, :3] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + pass + + @property + def corners(self): + """torch.Tensor: a tensor with 8 corners of each box.""" + pass + + @abstractmethod + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + """ + pass + + @abstractmethod + def flip(self, bev_direction="horizontal"): + """Flip the boxes in BEV along given BEV direction.""" + pass + + def scale(self, scale_factor): + """Scale the box with horizontal and vertical scaling factors. 
+ + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_boxes = boxes[3]`: + return a `Boxes` that contains only one box. + 2. `new_boxes = boxes[2:10]`: + return a slice of boxes. + 3. `new_boxes = boxes[vector]`: + where vector is a torch.BoolTensor with `length = len(boxes)`. + Nonzero elements in the vector will be selected. + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of \ + :class:`BaseInstances3DBoxes` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), box_dim=self.box_dim, with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, f"Indexing on Boxes with {item} failed to return a matrix!" + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self): + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + "(\n " + str(self.tensor) + ")" + + @classmethod + def cat(cls, boxes_list): + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated Boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls( + torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].tensor.shape[1], + with_yaw=boxes_list[0].with_yaw, + ) + return cat_boxes + + def to(self, device): + """Convert current boxes to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device), box_dim=self.box_dim, with_yaw=self.with_yaw) + + def clone(self): + """Clone the Boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) + + @property + def device(self): + """str: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a box as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A box of shape (4,). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between boxes1 and + boxes2, boxes1 and boxes2 should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes. 
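cat and __getitem__ above follow the usual torch semantics: concatenation copies storage, and indexing with an int or a boolean mask returns a new boxes object of the same class. A short sketch, with the same import-path assumption as the previous example:

import torch

from mmdet3d.core.bbox.structures import LiDARInstance3DBoxes

a = LiDARInstance3DBoxes(torch.tensor([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]]))
b = LiDARInstance3DBoxes(torch.tensor([[5.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]]))

both = LiDARInstance3DBoxes.cat([a, b])
print(len(both))                # 2
print(both[1].tensor.shape)     # torch.Size([1, 7])  int indexing keeps a 2D tensor
keep = both.tensor[:, 3] > 1.5  # boolean mask over the x_size column
print(len(both[keep]))          # 1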
+ """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), ( + '"boxes1" and "boxes2" should' f"be in the same type, got {type(boxes1)} and {type(boxes2)}." + ) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), ( + '"boxes1" and "boxes2" should' f"be in the same type, got {type(boxes1)} and {type(boxes2)}." + ) + + assert mode in ["iou", "iof"] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # obtain BEV boxes in XYXYR format + boxes1_bev = xywhr2xyxyr(boxes1.bev) + boxes2_bev = xywhr2xyxyr(boxes2.bev) + + # bev overlap + overlaps_bev = boxes1_bev.new_zeros((boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M) + iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(), boxes2_bev.contiguous().cuda(), overlaps_bev) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == "iou": + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp(volume1 + volume2 - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py new file mode 100644 index 000000000..3b3a9716b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py @@ -0,0 +1,165 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from enum import IntEnum, unique + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Box3DMode(IntEnum): + r"""Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. 
code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + "Box3DMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 7" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. 
+ x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError(f"Conversion from Box3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py new file mode 100644 index 000000000..d0073db07 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of z. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. 
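Box3DMode.convert above swaps the size components and applies a fixed rotation between sensor frames when no rt_mat is given. A toy round trip on a plain tensor (import path hedged as before; the yaw column is passed through unchanged, so only the first six columns are compared):

import torch

from mmdet3d.core.bbox.structures import Box3DMode

# LiDAR box: x front, y left, z up; columns are x, y, z, x_size, y_size, z_size, yaw
lidar = torch.tensor([[1.0, 2.0, 3.0, 4.0, 2.0, 1.5, 0.0]])

cam = Box3DMode.convert(lidar, Box3DMode.LIDAR, Box3DMode.CAM)
# center (1, 2, 3) -> (-2, -3, 1): x_right = -y_left, y_down = -z_up, z_front = x_front
# sizes (4, 2, 1.5) -> (2, 1.5, 4), i.e. (y_size, z_size, x_size)
print(cam)

back = Box3DMode.convert(cam, Box3DMode.CAM, Box3DMode.LIDAR)
print(torch.allclose(back[:, :6], lidar[:, :6]))  # True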
+ + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 1.0, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 4] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 1] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes in + shape (N, 8, 3). + + Convert the boxes to in clockwise order, in the form of + (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> x right + | (x1, y1, z0) + | + v + down y + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. 
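In camera coordinates +y points down, so top_height above is bottom_height minus height, and the gravity center sits at a smaller y than the bottom center. A quick numeric check with the same import-path assumption:

import torch

from mmdet3d.core.bbox.structures import CameraInstance3DBoxes

# x, y, z, x_size, y_size, z_size, yaw with the default bottom origin (0.5, 1.0, 0.5)
box = CameraInstance3DBoxes(torch.tensor([[0.0, 1.5, 10.0, 2.0, 1.8, 4.0, 0.0]]))

print(box.bottom_height)   # tensor([1.5000])  the y of the bottom face
print(box.top_height)      # tensor([-0.3000]) bottom - height, because +y points down
print(box.gravity_center)  # tensor([[ 0.0000,  0.6000, 10.0000]])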
+ assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 1, 0.5] + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around y axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + with rotation in XYWHR format.""" + return self.tensor[:, [0, 2, 3, 5, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 0] = -points[:, 0] + elif bev_direction == "vertical": + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate height overlaps of two boxes. 
+ + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # In camera coordinate system + # from up to down is the positive direction + heighest_of_bottom = torch.min(boxes1_bottom_height, boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py new file mode 100644 index 000000000..5acf9dc00 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py @@ -0,0 +1,270 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from enum import IntEnum, unique + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Coord3DMode(IntEnum): + r"""Enum of different ways to represent a box + and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input, src, dst, rt_mat=None): + """Convert boxes or points from `src` mode to `dst` mode.""" + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + else: + raise NotImplementedError + + @staticmethod + def convert_box(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`CoordMode`): The src Box mode. + dst (:obj:`CoordMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. 
Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + "CoordMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 7" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError(f"Conversion from Coord3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Coord3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Coord3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Coord3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr + + @staticmethod + def convert_point(point, src, dst, rt_mat=None): + """Convert points from `src` mode to `dst` mode. + + Args: + point (tuple | list | np.ndarray | + torch.Tensor | BasePoints): + Can be a k-tuple, k-list or an Nxk array/tensor. + src (:obj:`CoordMode`): The src Point mode. 
+ dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BasePoints): \ + The converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + "CoordMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 3" + ) + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + # TODO: LIDAR + # only implemented provided Rt matrix in cam-depth conversion + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError(f"Conversion from Coord3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[:, 3:] + arr = torch.cat([xyz[:, :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, points_dim=arr.size(-1), attribute_dims=point.attribute_dims) + else: + return arr diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py new file mode 100644 index 000000000..7753d025c --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
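Coord3DMode.convert_point above applies only the rotation part of the frame change, since points carry no size or yaw. A small check that a LiDAR point maps into the camera frame as expected (import path is again an assumption about the vendored package):

import numpy as np

from mmdet3d.core.bbox.structures import Coord3DMode

# LiDAR point: 1 m ahead, 2 m to the left, 3 m up
pt = np.array([[1.0, 2.0, 3.0]])
cam_pt = Coord3DMode.convert_point(pt, Coord3DMode.LIDAR, Coord3DMode.CAM)
print(cam_pt)  # [[-2. -3.  1.]]  x_right = -y_left, y_down = -z_up, z_front = x_front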
+import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (yaw=-0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. + Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]).T + else: + rot_mat_T = angle.T + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + if self.with_yaw: + self.tensor[:, 6] -= angle + else: + corners_rot = self.corners @ rot_mat_T + new_x_size = ( + corners_rot[..., 0].max(dim=1, keepdim=True)[0] - corners_rot[..., 0].min(dim=1, keepdim=True)[0] + ) + new_y_size = ( + corners_rot[..., 1].max(dim=1, keepdim=True)[0] - corners_rot[..., 1].min(dim=1, keepdim=True)[0] + ) + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # anti-clockwise + points.rotate(angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 0] = -points[:, 0] + elif bev_direction == "vertical": + points[:, 1] = -points[:, 1] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py new file mode 100644 index 000000000..9a6a4e79a --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=-0.5*pi) + ^ ^ + | / + | / + (yaw=-pi) left y <------ 0 -------- (yaw=0) + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. 
+ The yaw is 0 at the negative direction of y axis, and decreases from + the negative direction of y to the positive direction of x. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
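+        Example:
+            A minimal sketch with made-up values; for ``numpy`` points the
+            rotation matrix is returned as a ``numpy`` array as well.
+
+            >>> import numpy as np
+            >>> import torch
+            >>> boxes = LiDARInstance3DBoxes(
+            ...     torch.tensor([[10.0, 0.0, -1.0, 4.0, 2.0, 1.5, 0.0]]))
+            >>> pts = np.random.rand(8, 4).astype(np.float32)
+            >>> pts, rot_mat_T = boxes.rotate(np.pi / 2, pts)
+            >>> rot_mat_T.shape
+            (3, 3)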
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[1, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 1] = -points[:, 1] + elif bev_direction == "vertical": + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py new file mode 100644 index 000000000..0b1201e93 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from logging import warning + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def limit_period(val, offset=0.5, period=np.pi): + """Limit the value into a period for periodic function. + + Args: + val (torch.Tensor): The value to be converted. + offset (float, optional): Offset to set the value range. \ + Defaults to 0.5. + period ([type], optional): Period of the value. Defaults to np.pi. + + Returns: + torch.Tensor: Value in the range of \ + [-offset * period, (1-offset) * period] + """ + return val - torch.floor(val / period + offset) * period + + +def rotation_3d_in_axis(points, angles, axis=0): + """Rotate points by angles according to axis. + + Args: + points (torch.Tensor): Points of shape (N, M, 3). + angles (torch.Tensor): Vector of angles in shape (N,) + axis (int, optional): The axis to be rotated. Defaults to 0. 
+ + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will \ + raise value error. + + Returns: + torch.Tensor: Rotated points in shape (N, M, 3) + """ + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = torch.stack( + [ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]), + ] + ) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack( + [ + torch.stack([rot_cos, -rot_sin, zeros]), + torch.stack([rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]), + ] + ) + elif axis == 0: + rot_mat_T = torch.stack( + [ + torch.stack([zeros, rot_cos, -rot_sin]), + torch.stack([zeros, rot_sin, rot_cos]), + torch.stack([ones, zeros, zeros]), + ] + ) + else: + raise ValueError(f"axis should in range [0, 1, 2], got {axis}") + + return torch.einsum("aij,jka->aik", (points, rot_mat_T)) + + +def xywhr2xyxyr(boxes_xywhr): + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format. + + Returns: + torch.Tensor: Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[:, 2] / 2 + half_h = boxes_xywhr[:, 3] / 2 + + boxes[:, 0] = boxes_xywhr[:, 0] - half_w + boxes[:, 1] = boxes_xywhr[:, 1] - half_h + boxes[:, 2] = boxes_xywhr[:, 0] + half_w + boxes[:, 3] = boxes_xywhr[:, 1] + half_h + boxes[:, 4] = boxes_xywhr[:, 4] + return boxes + + +def get_box_type(box_type): + """Get the type and mode of box structure. + + Args: + box_type (str): The type of box structure. + The valid value are "LiDAR", "Camera", or "Depth". + + Returns: + tuple: Box type and box mode. + """ + from .box_3d_mode import ( + Box3DMode, + CameraInstance3DBoxes, + DepthInstance3DBoxes, + LiDARInstance3DBoxes, + ) + + box_type_lower = box_type.lower() + if box_type_lower == "lidar": + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == "camera": + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == "depth": + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + else: + raise ValueError('Only "box_type" of "camera", "lidar", "depth"' f" are supported, got {box_type}") + + return box_type_3d, box_mode_3d + + +def points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points from camera coordicates to image coordinates. + + Args: + points_3d (torch.Tensor): Points in shape (N, 3). + proj_mat (torch.Tensor): Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + torch.Tensor: Points in image coordinates with shape [N, 2]. + """ + points_num = list(points_3d.shape)[:-1] + + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + assert len(proj_mat.shape) == 2, ( + "The dimension of the projection" f" matrix should be 2 instead of {len(proj_mat.shape)}." + ) + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (d1 == 4 and d2 == 4), ( + "The shape of the projection matrix" f" ({d1}*{d2}) is not supported." 
+ ) + if d1 == 3: + proj_mat_expanded = torch.eye(4, device=proj_mat.device, dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yeilds better results + points_4 = torch.cat([points_3d, points_3d.new_ones(*points_shape)], dim=-1) + point_2d = torch.matmul(points_4, proj_mat.t()) + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + return point_2d_res + + +def mono_cam_box2vis(cam_box): + """This is a post-processing function on the bboxes from Mono-3D task. If + we want to perform projection visualization, we need to: + + 1. rotate the box along x-axis for np.pi / 2 (roll) + 2. change orientation from local yaw to global yaw + 3. convert yaw by (np.pi / 2 - yaw) + + After applying this function, we can project and draw it on 2D images. + + Args: + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate \ + system before conversion. Could be gt bbox loaded from dataset or \ + network prediction output. + + Returns: + :obj:`CameraInstance3DBoxes`: Box after conversion. + """ + warning.warn( + "DeprecationWarning: The hack of yaw and dimension in the " + "monocular 3D detection on nuScenes has been removed. The " + "function mono_cam_box2vis will be deprecated." + ) + from . import CameraInstance3DBoxes + + assert isinstance(cam_box, CameraInstance3DBoxes), "input bbox should be CameraInstance3DBoxes!" + + loc = cam_box.gravity_center + dim = cam_box.dims + yaw = cam_box.yaw + feats = cam_box.tensor[:, 7:] + # rotate along x-axis for np.pi / 2 + # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa + dim[:, [1, 2]] = dim[:, [2, 1]] + # change local yaw to global yaw for visualization + # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa + yaw += torch.atan2(loc[:, 0], loc[:, 2]) + # convert yaw by (-yaw - np.pi / 2) + # this is because mono 3D box class such as `NuScenesBox` has different + # definition of rotation with our `CameraInstance3DBoxes` + yaw = -yaw - np.pi / 2 + cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) + cam_box = CameraInstance3DBoxes(cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) + + return cam_box + + +def get_proj_mat_by_coord_type(img_meta, coord_type): + """Obtain image features using points. + + Args: + img_meta (dict): Meta info. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + Can be case-insensitive. + + Returns: + torch.Tensor: transformation matrix. + """ + coord_type = coord_type.upper() + mapping = {"LIDAR": "lidar2img", "DEPTH": "depth2img", "CAMERA": "cam2img"} + assert coord_type in mapping.keys() + return img_meta[mapping[coord_type]] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py new file mode 100644 index 000000000..3a23aebd7 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). 
+ labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict(boxes_3d=bboxes.to("cpu"), scores_3d=scores.cpu(), labels_3d=labels.cpu()) + + if attrs is not None: + result_dict["attrs_3d"] = attrs.cpu() + + return result_dict diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py new file mode 100644 index 000000000..bdded694d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py new file mode 100644 index 000000000..8c51c5adc --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py @@ -0,0 +1,335 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings +from abc import abstractmethod + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +class BasePoints(object): + """Base class for Points. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. 
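+    Example:
+        A minimal sketch with made-up values, using a hypothetical 4-dim
+        point layout whose last column is mapped to ``height``:
+
+        >>> import torch
+        >>> points = BasePoints(
+        ...     torch.rand(4, 4), points_dim=4,
+        ...     attribute_dims=dict(height=3))
+        >>> points.coord.shape
+        torch.Size([4, 3])
+        >>> points.height.shape
+        torch.Size([4])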
+ """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, points_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == points_dim, tensor.size() + + self.tensor = tensor + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self): + """torch.Tensor: Coordinates of each point with size (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor): + """Set the coordinates of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self): + """torch.Tensor: A vector with height of each point.""" + if self.attribute_dims is not None and "height" in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims["height"]] + else: + return None + + @height.setter + def height(self, tensor): + """Set the height of each point.""" + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and "height" in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims["height"]] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self): + """torch.Tensor: A vector with color of each point.""" + if self.attribute_dims is not None and "color" in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims["color"]] + else: + return None + + @color.setter + def color(self, tensor): + """Set the color of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn("point got color value beyond [0, 255]") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and "color" in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims["color"]] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update(dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self): + """torch.Shape: Shape of points.""" + return self.tensor.shape + + def shuffle(self): + """Shuffle the points. + + Returns: + torch.Tensor: The shuffled index. 
+ """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, rotation, axis=None): + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (float, np.ndarray, torch.Tensor): Rotation matrix + or angle. + axis (int): Axis to rotate at. Defaults to None. + """ + if not isinstance(rotation, torch.Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, f"invalid rotation shape {rotation.shape}" + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rot_sin = torch.sin(rotation) + rot_cos = torch.cos(rotation) + if axis == 1: + rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) + elif axis == 0: + rot_mat_T = rotation.new_tensor([[0, rot_cos, -rot_sin], [0, rot_sin, rot_cos], [1, 0, 0]]) + else: + raise ValueError("axis should in range") + rot_mat_T = rot_mat_T.T + elif rotation.numel() == 9: + rot_mat_T = rotation + else: + raise NotImplementedError + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction="horizontal"): + """Flip the points in BEV along given BEV direction.""" + pass + + def scale(self, scale_factor): + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_points = points[3]`: + return a `Points` that contains only one point. + 2. `new_points = points[2:10]`: + return a slice of points. + 3. `new_points = points[vector]`: + where vector is a torch.BoolTensor with `length = len(points)`. + Nonzero elements in the vector will be selected. + 4. `new_points = points[3:11, vector]`: + return a slice of points and attribute dims. + 5. `new_points = points[4:12, 2]`: + return a slice of points with single attribute. + Note that the returned Points might share storage with this Points, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of \ + :class:`BasePoints` after indexing. 
+ """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), points_dim=self.points_dim, attribute_dims=self.attribute_dims + ) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] if item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list(set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list(set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, torch.Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f"Invalid slice {item}!") + + assert p.dim() == 2, f"Indexing on Points with {item} failed to return a matrix!" + return original_type(p, points_dim=p.shape[1], attribute_dims=attribute_dims) + + def __len__(self): + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + "(\n " + str(self.tensor) + ")" + + @classmethod + def cat(cls, points_list): + """Concatenate a list of Points into a single Points. + + Args: + points_list (list[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated Points. + """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls( + torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].tensor.shape[1], + attribute_dims=points_list[0].attribute_dims, + ) + return cat_points + + def to(self, device): + """Convert current points to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device), points_dim=self.points_dim, attribute_dims=self.attribute_dims) + + def clone(self): + """Clone the Points. + + Returns: + :obj:`BasePoints`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), points_dim=self.points_dim, attribute_dims=self.attribute_dims) + + @property + def device(self): + """str: The device of the points are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a point as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A point of shape (4,). 
+ """ + yield from self.tensor + + def new_point(self, data): + """Create a new point object with data. + + The new point and its tensor has the similar properties \ + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, \ + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, points_dim=self.points_dim, attribute_dims=self.attribute_dims) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py new file mode 100644 index 000000000..d9127610f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets.builder import build_dataloader + +from .builder import DATASETS, build_dataset +from .custom_3d import Custom3DDataset +from .nuscenes_dataset import NuScenesDataset diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py new file mode 100644 index 000000000..8dbd95eb3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +from mmcv.utils import build_from_cfg +from mmdet.datasets import DATASETS + +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + + +def build_dataset(cfg, default_args=None): + + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py new file mode 100644 index 000000000..65bf42cfd --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py @@ -0,0 +1,222 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from os import path as osp + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmdet.datasets import DATASETS +from torch.utils.data import Dataset + +from ..core.bbox.structures import get_box_type +from .pipelines import Compose + + +@DATASETS.register_module() +class Custom3DDataset(Dataset): + """Customized 3D dataset. + + This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI + dataset. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. 
+ box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR'. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + + def __init__( + self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d="LiDAR", + filter_empty_gt=True, + test_mode=False, + ): + super().__init__() + self.data_root = data_root + self.ann_file = ann_file + self.test_mode = test_mode + self.modality = modality + self.filter_empty_gt = filter_empty_gt + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + + self.CLASSES = self.get_classes(classes) + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + self.data_infos = self.load_annotations(self.ann_file) + + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. + """ + return mmcv.load(ann_file) + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info["point_cloud"]["lidar_idx"] + pts_filename = osp.join(self.data_root, info["pts_path"]) + + input_dict = dict(pts_filename=pts_filename, sample_idx=sample_idx, file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict["ann_info"] = annos + if self.filter_empty_gt and ~(annos["gt_labels_3d"] != -1).any(): + return None + return input_dict + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results["img_fields"] = [] + results["bbox3d_fields"] = [] + results["pts_mask_fields"] = [] + results["pts_seg_fields"] = [] + results["bbox_fields"] = [] + results["mask_fields"] = [] + results["seg_fields"] = [] + results["box_type_3d"] = self.box_type_3d + results["box_mode_3d"] = self.box_mode_3d + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. 
+ """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f"Unsupported type {type(classes)} of classes.") + + return class_names + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.data_infos) + + def _rand_another(self, idx): + """Randomly get another item with the same flag. + + Returns: + int: Another index of item with the same flag. + """ + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. In 3D datasets, they are all the same, thus are all + zeros. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py new file mode 100644 index 000000000..f48c9daf6 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +from mmdet.datasets import DATASETS + +from .custom_3d import Custom3DDataset + + +@DATASETS.register_module() +class NuScenesDataset(Custom3DDataset): + r"""NuScenes Dataset. + + This class serves as the API for experiments on the NuScenes Dataset. + + Please refer to `NuScenes Dataset `_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. 
+ Defaults to 'LiDAR' in this dataset. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + eval_version (bool, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + """ + NameMapping = { + "movable_object.barrier": "barrier", + "vehicle.bicycle": "bicycle", + "vehicle.bus.bendy": "bus", + "vehicle.bus.rigid": "bus", + "vehicle.car": "car", + "vehicle.construction": "construction_vehicle", + "vehicle.motorcycle": "motorcycle", + "human.pedestrian.adult": "pedestrian", + "human.pedestrian.child": "pedestrian", + "human.pedestrian.construction_worker": "pedestrian", + "human.pedestrian.police_officer": "pedestrian", + "movable_object.trafficcone": "traffic_cone", + "vehicle.trailer": "trailer", + "vehicle.truck": "truck", + } + DefaultAttribute = { + "car": "vehicle.parked", + "pedestrian": "pedestrian.moving", + "trailer": "vehicle.parked", + "truck": "vehicle.parked", + "bus": "vehicle.moving", + "motorcycle": "cycle.without_rider", + "construction_vehicle": "vehicle.parked", + "bicycle": "cycle.without_rider", + "barrier": "", + "traffic_cone": "", + } + AttrMapping = { + "cycle.with_rider": 0, + "cycle.without_rider": 1, + "pedestrian.moving": 2, + "pedestrian.standing": 3, + "pedestrian.sitting_lying_down": 4, + "vehicle.moving": 5, + "vehicle.parked": 6, + "vehicle.stopped": 7, + } + AttrMapping_rev = [ + "cycle.with_rider", + "cycle.without_rider", + "pedestrian.moving", + "pedestrian.standing", + "pedestrian.sitting_lying_down", + "vehicle.moving", + "vehicle.parked", + "vehicle.stopped", + ] + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + "trans_err": "mATE", + "scale_err": "mASE", + "orient_err": "mAOE", + "vel_err": "mAVE", + "attr_err": "mAAE", + } + CLASSES = ( + "car", + "truck", + "trailer", + "bus", + "construction_vehicle", + "bicycle", + "motorcycle", + "pedestrian", + "traffic_cone", + "barrier", + ) + + def __init__( + self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d="LiDAR", + filter_empty_gt=True, + test_mode=False, + eval_version="detection_cvpr_2019", + use_valid_flag=False, + ): + self.load_interval = load_interval + self.use_valid_flag = use_valid_flag + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + ) + + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. 
+ + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + data = mmcv.load(ann_file) + data_infos = list(sorted(data["infos"], key=lambda e: e["timestamp"])) + data_infos = data_infos[:: self.load_interval] + self.metadata = data["metadata"] + self.version = self.metadata["version"] + return data_infos diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py new file mode 100644 index 000000000..843c9ff7c --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets.pipelines import Compose + +from .loading import LoadMultiViewImageFromFiles diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py new file mode 100644 index 000000000..e4e0f56c3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py @@ -0,0 +1,285 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC +from mmdet3d.core.bbox.structures import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import to_tensor + +PIPELINES._module_dict.pop("DefaultFormatBundle") + + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __init__( + self, + ): + return + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
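+        Example:
+            A minimal sketch on a synthetic ``results`` dict with a
+            made-up HWC image:
+
+            >>> import numpy as np
+            >>> bundle = DefaultFormatBundle()
+            >>> results = bundle(dict(img=np.zeros((4, 4, 3), np.float32)))
+            >>> results['img'].data.shape  # DataContainer holding a CHW tensor
+            torch.Size([3, 4, 4])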
+ """ + if "img" in results: + if isinstance(results["img"], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results["img"]] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results["img"] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results["img"].transpose(2, 0, 1)) + results["img"] = DC(to_tensor(img), stack=True) + for key in [ + "proposals", + "gt_bboxes", + "gt_bboxes_ignore", + "gt_labels", + "gt_labels_3d", + "attr_labels", + "pts_instance_mask", + "pts_semantic_mask", + "centers2d", + "depths", + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if "gt_bboxes_3d" in results: + if isinstance(results["gt_bboxes_3d"], BaseInstance3DBoxes): + results["gt_bboxes_3d"] = DC(results["gt_bboxes_3d"], cpu_only=True) + else: + results["gt_bboxes_3d"] = DC(to_tensor(results["gt_bboxes_3d"])) + + if "gt_masks" in results: + results["gt_masks"] = DC(results["gt_masks"], cpu_only=True) + if "gt_semantic_seg" in results: + results["gt_semantic_seg"] = DC(to_tensor(results["gt_semantic_seg"][None, ...]), stack=True) + + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect3D(object): + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
+ Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__( + self, + keys, + meta_keys=( + "filename", + "ori_shape", + "img_shape", + "lidar2img", + "depth2img", + "cam2img", + "pad_shape", + "scale_factor", + "flip", + "pcd_horizontal_flip", + "pcd_vertical_flip", + "box_mode_3d", + "box_type_3d", + "img_norm_cfg", + "pcd_trans", + "sample_idx", + "pcd_scale_factor", + "pcd_rotation", + "pts_filename", + "transformation_3d_flow", + ), + ): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + data = {} + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data["img_metas"] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + f"(keys={self.keys}, meta_keys={self.meta_keys})" + + +@PIPELINES.register_module() +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
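+        Example:
+            A minimal sketch on a synthetic ``results`` dict with made-up
+            points and hypothetical class names:
+
+            >>> import torch
+            >>> from mmdet3d.core.points import BasePoints
+            >>> bundle = DefaultFormatBundle3D(
+            ...     class_names=['car', 'truck'], with_gt=False)
+            >>> results = bundle(
+            ...     dict(points=BasePoints(torch.rand(6, 4), points_dim=4)))
+            >>> results['points'].data.shape
+            torch.Size([6, 4])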
+ """ + # Format 3D data + if "points" in results: + assert isinstance(results["points"], BasePoints) + results["points"] = DC(results["points"].tensor) + + for key in ["voxels", "coors", "voxel_centers", "num_points"]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if "gt_bboxes_3d_mask" in results: + gt_bboxes_3d_mask = results["gt_bboxes_3d_mask"] + results["gt_bboxes_3d"] = results["gt_bboxes_3d"][gt_bboxes_3d_mask] + if "gt_names_3d" in results: + results["gt_names_3d"] = results["gt_names_3d"][gt_bboxes_3d_mask] + if "centers2d" in results: + results["centers2d"] = results["centers2d"][gt_bboxes_3d_mask] + if "depths" in results: + results["depths"] = results["depths"][gt_bboxes_3d_mask] + if "gt_bboxes_mask" in results: + gt_bboxes_mask = results["gt_bboxes_mask"] + if "gt_bboxes" in results: + results["gt_bboxes"] = results["gt_bboxes"][gt_bboxes_mask] + results["gt_names"] = results["gt_names"][gt_bboxes_mask] + if self.with_label: + if "gt_names" in results and len(results["gt_names"]) == 0: + results["gt_labels"] = np.array([], dtype=np.int64) + results["attr_labels"] = np.array([], dtype=np.int64) + elif "gt_names" in results and isinstance(results["gt_names"][0], list): + # gt_labels might be a list of list in multi-view setting + results["gt_labels"] = [ + np.array([self.class_names.index(n) for n in res], dtype=np.int64) + for res in results["gt_names"] + ] + elif "gt_names" in results: + results["gt_labels"] = np.array( + [self.class_names.index(n) for n in results["gt_names"]], dtype=np.int64 + ) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + if "gt_names_3d" in results: + results["gt_labels_3d"] = np.array( + [self.class_names.index(n) for n in results["gt_names_3d"]], dtype=np.int64 + ) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(class_names={self.class_names}, " + repr_str += f"with_gt={self.with_gt}, with_label={self.with_label})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py new file mode 100644 index 000000000..75dc9dfc2 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class LoadMultiViewImageFromFiles(object): + """Load multi channel images from a list of separate channel files. + + Expects results['img_filename'] to be a list of filenames. + + Args: + to_float32 (bool): Whether to convert the img to float32. + Defaults to False. + color_type (str): Color type of the file. Defaults to 'unchanged'. + """ + + def __init__(self, to_float32=False, color_type="unchanged"): + self.to_float32 = to_float32 + self.color_type = color_type + + def __call__(self, results): + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. 
\ + Added keys and values are described below. + + - filename (str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. + - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. + """ + + filename = results["img_filename"] + + # img is of shape (h, w, c, num_views) + img = np.stack([mmcv.imread(name, self.color_type) for name in filename], axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results["filename"] = filename + # unravel to list, see `DefaultFormatBundle` in formating.py + # which will transpose each image separately and then stack into array + results["img"] = [img[..., i] for i in range(img.shape[-1])] + results["img_shape"] = img.shape + results["ori_shape"] = img.shape + # Set initial values for default meta_keys + results["pad_shape"] = img.shape + results["scale_factor"] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results["img_norm_cfg"] = dict( + mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False + ) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(to_float32={self.to_float32}, " + repr_str += f"color_type='{self.color_type}')" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py new file mode 100644 index 000000000..74fb057f4 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings +from copy import deepcopy + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug3D(object): + """Test-time augmentation with multiple scales and flipping. + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple]: Images scales for resizing. + pts_scale_ratio (float | list[float]): Points scale ratios for + resizing. + flip (bool): Whether apply flip augmentation. Defaults to False. + flip_direction (str | list[str]): Flip augmentation directions + for images, options are "horizontal" and "vertical". + If flip_direction is list, multiple flip augmentations will + be applied. It has no effect when ``flip == False``. + Defaults to "horizontal". + pcd_horizontal_flip (bool): Whether apply horizontal flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. + pcd_vertical_flip (bool): Whether apply vertical flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. 
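+    Example:
+        A minimal sketch with an empty transform list and made-up scales,
+        only to illustrate the dict-of-lists layout this wrapper produces:
+
+        >>> tta = MultiScaleFlipAug3D(
+        ...     transforms=[], img_scale=(1333, 800), pts_scale_ratio=1.0)
+        >>> out = tta(dict(sample_idx=0))
+        >>> out['scale'], out['pcd_scale_factor']
+        ([(1333, 800)], [1.0])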
+ """ + + def __init__( + self, + transforms, + img_scale, + pts_scale_ratio, + flip=False, + flip_direction="horizontal", + pcd_horizontal_flip=False, + pcd_vertical_flip=False, + ): + self.transforms = Compose(transforms) + self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] + self.pts_scale_ratio = pts_scale_ratio if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)] + + assert mmcv.is_list_of(self.img_scale, tuple) + assert mmcv.is_list_of(self.pts_scale_ratio, float) + + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + + self.flip_direction = flip_direction if isinstance(flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ["horizontal"]: + warnings.warn("flip_direction has no effect when flip is set to False") + if self.flip and not any([(t["type"] == "RandomFlip3D" or t["type"] == "RandomFlip") for t in transforms]): + warnings.warn("flip has no effect when RandomFlip is not in transforms") + + def __call__(self, results): + """Call function to augment common fields in results. + + Args: + results (dict): Result dict contains the data to augment. + + Returns: + dict: The result dict contains the data that is augmented with \ + different scales and flips. + """ + aug_data = [] + + # modified from `flip_aug = [False, True] if self.flip else [False]` + # to reduce unnecessary scenes when using double flip augmentation + # during test time + flip_aug = [True] if self.flip else [False] + pcd_horizontal_flip_aug = [False, True] if self.flip and self.pcd_horizontal_flip else [False] + pcd_vertical_flip_aug = [False, True] if self.flip and self.pcd_vertical_flip else [False] + for scale in self.img_scale: + for pts_scale_ratio in self.pts_scale_ratio: + for flip in flip_aug: + for pcd_horizontal_flip in pcd_horizontal_flip_aug: + for pcd_vertical_flip in pcd_vertical_flip_aug: + for direction in self.flip_direction: + # results.copy will cause bug + # since it is shallow copy + _results = deepcopy(results) + _results["scale"] = scale + _results["flip"] = flip + _results["pcd_scale_factor"] = pts_scale_ratio + _results["flip_direction"] = direction + _results["pcd_horizontal_flip"] = pcd_horizontal_flip + _results["pcd_vertical_flip"] = pcd_vertical_flip + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(transforms={self.transforms}, " + repr_str += f"img_scale={self.img_scale}, flip={self.flip}, " + repr_str += f"pts_scale_ratio={self.pts_scale_ratio}, " + repr_str += f"flip_direction={self.flip_direction})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py new file mode 100644 index 000000000..64f636518 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. 
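+# Trimmed-down subset of ``mmdet3d.models``: it vendors only the detector base
+# classes and builder helpers that the PETR tests in this patch rely on.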
+ +from .detectors import * diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py new file mode 100644 index 000000000..a73c30306 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmdet.models.builder import BACKBONES, DETECTORS, HEADS, LOSSES, MODELS, NECKS + +VOXEL_ENCODERS = MODELS +MIDDLE_ENCODERS = MODELS +FUSION_LAYERS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss function.""" + return LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning) + assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field " + assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field " + return DETECTORS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """A function warpper for building 3D detector according to + cfg. + + Should be deprecated in the future. + """ + return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + + +def build_voxel_encoder(cfg): + """Build voxel encoder.""" + return VOXEL_ENCODERS.build(cfg) + + +def build_middle_encoder(cfg): + """Build middle level encoder.""" + return MIDDLE_ENCODERS.build(cfg) + + +def build_fusion_layer(cfg): + """Build fusion layer.""" + return FUSION_LAYERS.build(cfg) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py new file mode 100644 index 000000000..5d1d0befa --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base import Base3DDetector diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py new file mode 100644 index 000000000..7ab88bea1 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import auto_fp16 +from mmdet.models.detectors.base import BaseDetector + + +class Base3DDetector(BaseDetector): + """Base class for detectors.""" + + def forward_test(self, points, img_metas, img=None, **kwargs): + """ + Args: + points (list[torch.Tensor]): the outer list indicates test-time + augmentations and inner torch.Tensor should have a shape NxC, + which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) 
and the inner list indicates + images in a batch + img (list[torch.Tensor], optional): the outer + list indicates test-time augmentations and inner + torch.Tensor should have a shape NxCxHxW, which contains + all images in the batch. Defaults to None. + """ + for var, name in [(points, "points"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError("{} must be a list, but got {}".format(name, type(var))) + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError("num of augmentations ({}) != num of image meta ({})".format(len(points), len(img_metas))) + + if num_augs == 1: + img = [img] if img is None else img + return self.simple_test(points[0], img_metas[0], img[0], **kwargs) + else: + return self.aug_test(points, img_metas, img, **kwargs) + + @auto_fp16(apply_to=("img", "points")) + def forward(self, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + + return self.forward_test(**kwargs) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py new file mode 100644 index 000000000..dcb498542 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py @@ -0,0 +1,418 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import force_fp32 +from mmdet.models import DETECTORS +from torch.nn import functional as F + +from .. 
import builder +from .base import Base3DDetector + + +@DETECTORS.register_module() +class MVXTwoStageDetector(Base3DDetector): + """Base class of Multi-modality VoxelNet.""" + + def __init__( + self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None, + ): + super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder(pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder(pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer(pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_roi_head is not None: + self.img_roi_head = builder.build_head(img_roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is None: + img_pretrained = None + pts_pretrained = None + elif isinstance(pretrained, dict): + img_pretrained = pretrained.get("img", None) + pts_pretrained = pretrained.get("pts", None) + else: + raise ValueError(f"pretrained should be a dict, got {type(pretrained)}") + + if self.with_img_backbone: + if img_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.img_backbone.init_cfg = dict(type="Pretrained", checkpoint=img_pretrained) + if self.with_img_roi_head: + if img_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.img_roi_head.init_cfg = dict(type="Pretrained", checkpoint=img_pretrained) + + if self.with_pts_backbone: + if pts_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.pts_backbone.init_cfg = dict(type="Pretrained", checkpoint=pts_pretrained) + + @property + def with_img_shared_head(self): + """bool: Whether the detector has a shared head in image branch.""" + return hasattr(self, "img_shared_head") and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + """bool: Whether the detector has a 3D box head.""" + return hasattr(self, "pts_bbox_head") and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + """bool: Whether the detector has a 2D image box head.""" + return hasattr(self, "img_bbox_head") and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + """bool: Whether the detector has a 2D image backbone.""" + return hasattr(self, "img_backbone") and self.img_backbone is not None + 
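+    # Each of the ``with_*`` capability properties reports whether the
+    # corresponding optional sub-module was built from the config. The PETR
+    # variants in this patch only build the image branch (img_backbone and
+    # img_neck) together with pts_bbox_head.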
+ @property + def with_pts_backbone(self): + """bool: Whether the detector has a 3D backbone.""" + return hasattr(self, "pts_backbone") and self.pts_backbone is not None + + @property + def with_fusion(self): + """bool: Whether the detector has a fusion layer.""" + return hasattr(self, "pts_fusion_layer") and self.fusion_layer is not None + + @property + def with_img_neck(self): + """bool: Whether the detector has a neck in image branch.""" + return hasattr(self, "img_neck") and self.img_neck is not None + + @property + def with_pts_neck(self): + """bool: Whether the detector has a neck in 3D detector branch.""" + return hasattr(self, "pts_neck") and self.pts_neck is not None + + @property + def with_img_rpn(self): + """bool: Whether the detector has a 2D RPN in image detector branch.""" + return hasattr(self, "img_rpn_head") and self.img_rpn_head is not None + + @property + def with_img_roi_head(self): + """bool: Whether the detector has a RoI Head in image branch.""" + return hasattr(self, "img_roi_head") and self.img_roi_head is not None + + @property + def with_voxel_encoder(self): + """bool: Whether the detector has a voxel encoder.""" + return hasattr(self, "voxel_encoder") and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + """bool: Whether the detector has a middle encoder.""" + return hasattr(self, "middle_encoder") and self.middle_encoder is not None + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if self.with_img_backbone and img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_metas): + """Extract features of points.""" + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, img_feats, img_metas) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + pts_feats = self.extract_pts_feat(points, img_feats, img_metas) + return (img_feats, pts_feats) + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points, number of points + per voxel, and coordinates. 
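+
+        Note:
+            Each returned coordinate row is prefixed with its sample index in
+            the batch (via ``F.pad(coor, (1, 0), mode="constant", value=i)``
+            below), so voxels from different samples can be concatenated into
+            a single tensor.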
+ """ + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode="constant", value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train( + self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + ): + """Forward training function. + + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + + Returns: + dict: Losses of different branches. + """ + img_feats, pts_feats = self.extract_feat(points, img=img, img_metas=img_metas) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals, + ) + losses.update(losses_img) + return losses + + def forward_pts_train(self, pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore=None): + """Forward function for point cloud branch. + + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.pts_bbox_head.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, proposals=None, **kwargs): + """Forward function for image branch. + + This function works similar to the forward function of Faster R-CNN. + + Args: + x (list[torch.Tensor]): Image features of shape (B, C, H, W) + of multiple levels. + img_metas (list[dict]): Meta information of images. + gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image + sample. 
+ gt_labels (list[torch.Tensor]): Ground truth labels of boxes. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + proposals (list[torch.Tensor], optional): Proposals of each sample. + Defaults to None. + + Returns: + dict: Losses of each branch. + """ + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get("img_rpn_proposal", self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # bbox head forward and loss + if self.with_img_bbox: + # bbox head forward and loss + img_roi_losses = self.img_roi_head.forward_train( + x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs + ) + losses.update(img_roi_losses) + + return losses + + def simple_test_img(self, x, img_metas, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_metas, self.test_cfg.img_rpn) + else: + proposal_list = proposals + + return self.img_roi_head.simple_test(x, proposal_list, img_metas, rescale=rescale) + + def simple_test_rpn(self, x, img_metas, rpn_test_cfg): + """RPN test function.""" + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes(*outs, img_metas, rescale=rescale) + bbox_results = [bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list] + return bbox_results + + def simple_test(self, points, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats, pts_feats = self.extract_feat(points, img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + if pts_feats and self.with_pts_bbox: + bbox_pts = self.simple_test_pts(pts_feats, img_metas, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict["pts_bbox"] = pts_bbox + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img(img_feats, img_metas, rescale=rescale) + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict["img_bbox"] = img_bbox + return bbox_list + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) + bbox_list.update(pts_bbox=bbox_pts) + return [bbox_list] + + def extract_feats(self, points, img_metas, imgs=None): + """Extract point and image features of multiple samples.""" + if imgs is None: + imgs = [None] * len(img_metas) + img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, img_metas) + return img_feats, pts_feats + + def aug_test_pts(self, feats, img_metas, rescale=False): + """Test function of point cloud branch with augmentaiton.""" + # only support aug_test for one sample + 
aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes(*outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.pts_bbox_head.test_cfg) + return merged_bboxes diff --git a/forge/test/models/pytorch/vision/petr/test_petr.py b/forge/test/models/pytorch/vision/petr/test_petr.py new file mode 100644 index 000000000..87f0c3372 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/test_petr.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import sys + +import torch + +sys.path.append("forge/test/models/pytorch/vision/petr") +import pytest +from mmcv.parallel import MMDataParallel +from mmdet3d.models.builder import build_model + +import forge +from forge.verify.verify import verify + +# Import necessary classes for model registration, ensuring availability even if not used directly +from utils import model_registry +from utils.utils import load_config, prepare_model_inputs + +from test.models.utils import Framework, Source, Task, build_module_name + + +class petr_wrapper(torch.nn.Module): + def __init__( + self, + model, + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + ): + super().__init__() + self.model = model + + self.filename = filename + self.ori_shape = ori_shape + self.img_shape = img_shape + self.pad_shape = pad_shape + self.scale_factor = scale_factor + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + self.box_mode_3d = box_mode_3d + self.box_type_3d = box_type_3d + self.to_rgb = to_rgb + self.sample_idx = sample_idx + self.pcd_scale_factor = pcd_scale_factor + self.pts_filename = pts_filename + + def forward(self, l0, l1, l2, l3, l4, l5, img, mean, std, masks): + + l0 = l0.squeeze(0) + l1 = l1.squeeze(0) + l2 = l2.squeeze(0) + l3 = l3.squeeze(0) + l4 = l4.squeeze(0) + l5 = l5.squeeze(0) + img = img.squeeze(0) + mean = mean.squeeze(0) + std = std.squeeze(0) + masks = masks.squeeze(0) + + data = { + "img_metas": [ + [ + { + "filename": self.filename, + "ori_shape": self.ori_shape, + "img_shape": self.img_shape, + "lidar2img": [l0, l1, l2, l3, l4, l5], + "pad_shape": self.pad_shape, + "scale_factor": self.scale_factor, + "flip": self.flip, + "pcd_horizontal_flip": self.pcd_horizontal_flip, + "pcd_vertical_flip": self.pcd_vertical_flip, + "box_mode_3d": self.box_mode_3d, + "box_type_3d": self.box_type_3d, + "img_norm_cfg": {"mean": mean, "std": std, "to_rgb": self.to_rgb}, + "sample_idx": self.sample_idx, + "pcd_scale_factor": self.pcd_scale_factor, + "pts_filename": self.pts_filename, + "masks": masks, + } + ] + ], + "img": [img], + } + + output = self.model(**data) + return (output["all_cls_scores"], output["all_bbox_preds"]) + + +variants = ["vovnet_gridmask_p4_800x320", "vovnet_gridmask_p4_1600x640"] + + +@pytest.mark.parametrize("variant", variants) +def test_petr(record_forge_property, variant): + + # Build Module Name + module_name = build_module_name( + framework=Framework.PYTORCH, model="petr", source=Source.GITHUB, task=Task.OBJECT_DETECTION, 
variant=variant + ) + + # Record Forge Property + record_forge_property("model_name", module_name) + + _ = model_registry # Prevents removal by linters/formatters + + # Load config + cfg = load_config(variant) + + # Prepare input + ( + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + inputs, + ) = prepare_model_inputs(cfg) + + # Load Model + model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"), train_cfg=cfg.get("train_cfg")) + model = MMDataParallel(model, device_ids=[0]) + model.eval() + + for param in model.parameters(): + param.requires_grad = False + + framework_model = petr_wrapper( + model, + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + ) + framework_model.eval() + + # Forge compile framework model + compiled_model = forge.compile(framework_model, sample_inputs=inputs, module_name=module_name) + + # Model Verification + verify(inputs, framework_model, compiled_model) diff --git a/forge/test/models/pytorch/vision/petr/utils/__init__.py b/forge/test/models/pytorch/vision/petr/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py b/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py new file mode 100644 index 000000000..8ebae7e5d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py @@ -0,0 +1,210 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from mmdetection (https://github.com/open-mmlab/mmdetection) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, auto_fp16 +from mmdet.models import NECKS + + +####This FPN remove the unused parameters which can used with checkpoint (with_cp = True in Backbone) +@NECKS.register_module() +class CPFPN(BaseModule): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. 
+ relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + init_cfg (dict or list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode="nearest"), + init_cfg=dict(type="Xavier", layer="Conv2d", distribution="uniform"), + ): + super(CPFPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ("on_input", "on_lateral", "on_output") + elif add_extra_convs: # True + self.add_extra_convs = "on_input" + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False, + ) + self.lateral_convs.append(l_conv) + if i == 0: + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False, + ) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == "on_input": + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + 
stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False, + ) + self.fpn_convs.append(extra_fpn_conv) + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs)] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if "scale_factor" in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate(laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i] for i in range(used_backbone_levels)] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == "on_input": + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == "on_lateral": + extra_source = laterals[-1] + elif self.add_extra_convs == "on_output": + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/forge/test/models/pytorch/vision/petr/utils/grid_mask.py b/forge/test/models/pytorch/vision/petr/utils/grid_mask.py new file mode 100644 index 000000000..c8fa6db4b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/grid_mask.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import torch +import torch.nn as nn +from PIL import Image + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n, c, h, w = x.size() + x = x.view(-1, h, w) + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(2, h) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 + if self.use_w: + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2 : (hh - h) // 2 + h, (ww - w) // 2 : (ww - w) // 2 + w] + + mask = 
torch.from_numpy(mask).float().cuda() + if self.mode == 1: + mask = 1 - mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float().cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n, c, h, w) diff --git a/forge/test/models/pytorch/vision/petr/utils/match_cost.py b/forge/test/models/pytorch/vision/petr/utils/match_cost.py new file mode 100644 index 000000000..dca25f395 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/match_cost.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from mmdet.core.bbox.match_costs.builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBox3DL1Cost(object): + """BBox3DL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.0): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight diff --git a/forge/test/models/pytorch/vision/petr/utils/model_registry.py b/forge/test/models/pytorch/vision/petr/utils/model_registry.py new file mode 100644 index 000000000..56cb667ed --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/model_registry.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from mmdet3d.datasets.pipelines.formating import DefaultFormatBundle3D +from mmdet3d.datasets.pipelines.test_time_aug import MultiScaleFlipAug3D +from mmdet.core.bbox.coder import distance_point_bbox_coder +from mmdet.models.losses import focal_loss, iou_loss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from utils.cp_fpn import CPFPN +from utils.grid_mask import GridMask +from utils.match_cost import BBox3DL1Cost +from utils.nms_free_coder import NMSFreeCoder +from utils.nuscenes_dataset import CustomNuScenesDataset +from utils.petr3d import Petr3D +from utils.petr_head import PETRHead +from utils.petr_transformer import PETRTransformer +from utils.positional_encoding import SinePositionalEncoding3D +from utils.transform_3d import ResizeCropFlipImage +from utils.vovnetcp import VoVNetCP + +__all__ = [ + "Petr3D", + "PETRHead", + "BBox3DL1Cost", + "focal_loss", + "iou_loss", + "L1Loss", + "distance_point_bbox_coder", + "SinePositionalEncoding3D", + "PETRTransformer", + "NMSFreeCoder", + "GridMask", + "CustomNuScenesDataset", + "ResizeCropFlipImage", + "MultiScaleFlipAug3D", + "DefaultFormatBundle3D", + "CPFPN", + "VoVNetCP", +] diff --git a/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py b/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py new file mode 100644 index 000000000..1a4f30544 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2021 megvii-model. All Rights Reserved. 
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+
+from mmdet.core.bbox import BaseBBoxCoder
+from mmdet.core.bbox.builder import BBOX_CODERS
+
+
+@BBOX_CODERS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9.
+    """
+
+    def __init__(
+        self, pc_range, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, num_classes=10
+    ):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
diff --git a/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py b/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py
new file mode 100644
index 000000000..bbb32d222
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import numpy as np
+from mmdet3d.datasets import NuScenesDataset
+from mmdet.datasets import DATASETS
+
+
+@DATASETS.register_module()
+class CustomNuScenesDataset(NuScenesDataset):
+    r"""NuScenes Dataset.
+    This dataset only adds camera intrinsics and extrinsics to the results.
+    """
+
+    def get_data_info(self, index):
+        """Get data info according to the given index.
+        Args:
+            index (int): Index of the sample data to get.
+        Returns:
+            dict: Data information that will be passed to the data \
+                preprocessing pipelines. It includes the following keys:
+
+                - sample_idx (str): Sample index.
+                - pts_filename (str): Filename of point clouds.
+                - sweeps (list[dict]): Infos of sweeps.
+                - timestamp (float): Sample timestamp.
+                - img_filename (str, optional): Image filename.
+                - lidar2img (list[np.ndarray], optional): Transformations \
+                    from lidar to different cameras.
+                - ann_info (dict): Annotation info.
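+
+        Note:
+            ``lidar2img`` is composed below as ``viewpad @ lidar2cam_rt.T``,
+            where ``viewpad`` is the camera intrinsic matrix padded to 4x4 and
+            ``lidar2cam_rt`` is assembled from the inverse of the
+            ``sensor2lidar`` rotation and translation stored in the info file.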
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info["token"], + pts_filename=info["lidar_path"], + sweeps=info["sweeps"], + timestamp=info["timestamp"] / 1e6, + ) + + if self.modality["use_camera"]: + image_paths = [] + lidar2img_rts = [] + intrinsics = [] + extrinsics = [] + img_timestamp = [] + for cam_type, cam_info in info["cams"].items(): + img_timestamp.append(cam_info["timestamp"] / 1e6) + image_paths.append(cam_info["data_path"]) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info["sensor2lidar_rotation"]) + lidar2cam_t = cam_info["sensor2lidar_translation"] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info["cam_intrinsic"] + viewpad = np.eye(4) + viewpad[: intrinsic.shape[0], : intrinsic.shape[1]] = intrinsic + lidar2img_rt = viewpad @ lidar2cam_rt.T + intrinsics.append(viewpad) + extrinsics.append( + lidar2cam_rt + ) ###The extrinsics mean the tranformation from lidar to camera. If anyone want to use the extrinsics as sensor to lidar, please use np.linalg.inv(lidar2cam_rt.T) and modify the ResizeCropFlipImage and LoadMultiViewImageFromMultiSweepsFiles. + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_timestamp=img_timestamp, + img_filename=image_paths, + lidar2img=lidar2img_rts, + intrinsics=intrinsics, + extrinsics=extrinsics, + ) + ) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict["ann_info"] = annos + return input_dict diff --git a/forge/test/models/pytorch/vision/petr/utils/petr3d.py b/forge/test/models/pytorch/vision/petr/utils/petr3d.py new file mode 100644 index 000000000..9e313af10 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr3d.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. 
+# ------------------------------------------------------------------------ + +import torch +from mmcv.runner import auto_fp16, force_fp32 +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmdet.models.builder import DETECTORS +from utils.grid_mask import GridMask + + +@DETECTORS.register_module() +class Petr3D(MVXTwoStageDetector): + """Petr3D.""" + + def __init__( + self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + ): + super(Petr3D, self).__init__( + pts_voxel_layer, + pts_voxel_encoder, + pts_middle_encoder, + pts_fusion_layer, + img_backbone, + pts_backbone, + img_neck, + pts_neck, + pts_bbox_head, + img_roi_head, + img_rpn_head, + train_cfg, + test_cfg, + pretrained, + ) + self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if isinstance(img, list): + img = torch.stack(img, dim=0) + + B = img.size(0) + if img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + if img.dim() == 5: + if img.size(0) == 1 and img.size(1) != 1: + img.squeeze_() + else: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=("img"), out_fp32=True) + def extract_feat(self, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + return img_feats + + @force_fp32(apply_to=("img", "points")) + def forward(self, **kwargs): + return self.forward_test(**kwargs) + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError("{} must be a list, but got {}".format(name, type(var))) + img = [img] if img is None else img + return self.simple_test(img_metas[0], img[0], **kwargs) + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x, img_metas) + return outs + + def simple_test(self, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) + + return bbox_pts diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_head.py b/forge/test/models/pytorch/vision/petr/utils/petr_head.py new file mode 100644 index 000000000..4a2398214 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_head.py @@ -0,0 +1,527 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import math + +import numpy as np + +# 
------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, Linear, bias_init_with_prob +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32 +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead +from mmdet.models.utils import build_transformer +from mmdet.models.utils.transformer import inverse_sigmoid + + +def pos2posemb3d(pos, num_pos_feats=128, temperature=10000): + scale = 2 * math.pi + pos = pos * scale + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + pos_x = pos[..., 0, None] / dim_t + pos_y = pos[..., 1, None] / dim_t + pos_z = pos[..., 2, None] / dim_t + pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) + pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) + pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) + posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1) + return posemb + + +@HEADS.register_module() +class PETRHead(AnchorFreeHead): + """Implements the DETR transformer head. + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + num_classes (int): Number of categories excluding the background. + in_channels (int): Number of channels in the input feature map. + num_query (int): Number of query in Transformer. + num_reg_fcs (int, optional): Number of fully-connected layers used in + `FFN`, which is then used for the regression head. Default 2. + transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. + Default: None. + sync_cls_avg_factor (bool): Whether to sync the avg_factor of + all ranks. Default to False. + positional_encoding (obj:`mmcv.ConfigDict`|dict): + Config for position encoding. + loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the + classification loss. Default `CrossEntropyLoss`. + loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the + regression loss. Default `L1Loss`. + loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the + regression iou loss. Default `GIoULoss`. + tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of + transformer head. + test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of + transformer head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + _version = 2 + + def __init__( + self, + num_classes, + in_channels, + num_query=100, + num_reg_fcs=2, + transformer=None, + sync_cls_avg_factor=False, + positional_encoding=dict(type="SinePositionalEncoding", num_feats=128, normalize=True), + code_weights=None, + bbox_coder=None, + loss_cls=dict(type="CrossEntropyLoss", bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + train_cfg=dict( + assigner=dict( + type="HungarianAssigner", + cls_cost=dict(type="ClassificationCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + ), + test_cfg=dict(max_per_img=100), + with_position=True, + with_multiview=False, + depth_step=0.8, + depth_num=64, + LID=False, + depth_start=1, + position_range=[-65, -65, -8.0, 65, 65, 8.0], + init_cfg=None, + normedlinear=False, + **kwargs, + ): + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since it brings inconvenience when the initialization of + # `AnchorFreeHead` is called. + if "code_size" in kwargs: + self.code_size = kwargs["code_size"] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + self.code_weights = self.code_weights[: self.code_size] + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get("class_weight", None) + if class_weight is not None and (self.__class__ is PETRHead): + assert isinstance(class_weight, float), ( + "Expected " "class_weight to have type float. Found " f"{type(class_weight)}." + ) + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get("bg_cls_weight", class_weight) + assert isinstance(bg_cls_weight, float), ( + "Expected " "bg_cls_weight to have type float. Found " f"{type(bg_cls_weight)}." + ) + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({"class_weight": class_weight}) + if "bg_cls_weight" in loss_cls: + loss_cls.pop("bg_cls_weight") + self.bg_cls_weight = bg_cls_weight + + self.num_query = num_query + self.num_classes = num_classes + self.in_channels = in_channels + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + self.embed_dims = 256 + self.depth_step = depth_step + self.depth_num = depth_num + self.position_dim = 3 * self.depth_num + self.position_range = position_range + self.LID = LID + self.depth_start = depth_start + self.position_level = 0 + self.with_position = with_position + self.with_multiview = with_multiview + assert "num_feats" in positional_encoding + num_feats = positional_encoding["num_feats"] + assert num_feats * 2 == self.embed_dims, ( + "embed_dims should" f" be exactly 2 times of num_feats. Found {self.embed_dims}" f" and {num_feats}." 
+ ) + self.act_cfg = transformer.get("act_cfg", dict(type="ReLU", inplace=True)) + self.num_pred = 6 + self.normedlinear = normedlinear + super(PETRHead, self).__init__(num_classes, in_channels, init_cfg=init_cfg) + + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_iou = build_loss(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.positional_encoding = build_positional_encoding(positional_encoding) + self.transformer = build_transformer(transformer) + + self.code_weights = nn.Parameter(torch.tensor(self.code_weights, requires_grad=False), requires_grad=False) + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self._init_layers() + + def _init_layers(self): + """Initialize layers of the transformer head.""" + if self.with_position: + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + else: + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + if self.normedlinear: + cls_branch.append(NormedLinear(self.embed_dims, self.cls_out_channels)) + else: + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + self.cls_branches = nn.ModuleList([fc_cls for _ in range(self.num_pred)]) + self.reg_branches = nn.ModuleList([reg_branch for _ in range(self.num_pred)]) + + if self.with_multiview: + self.adapt_pos3d = nn.Sequential( + nn.Conv2d(self.embed_dims * 3 // 2, self.embed_dims * 4, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims * 4, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + else: + self.adapt_pos3d = nn.Sequential( + nn.Conv2d(self.embed_dims, self.embed_dims, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + + if self.with_position: + self.position_encoder = nn.Sequential( + nn.Conv2d(self.position_dim, self.embed_dims * 4, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims * 4, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + + self.reference_points = nn.Embedding(self.num_query, 3) + self.query_embedding = nn.Sequential( + nn.Linear(self.embed_dims * 3 // 2, self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims), + ) + + def init_weights(self): + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + nn.init.uniform_(self.reference_points.weight.data, 0, 1) + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + def position_embeding(self, img_feats, img_metas, masks=None): + eps = 1e-5 + pad_h, pad_w, _ = img_metas[0]["pad_shape"][0] + B, N, C, H, W = img_feats[self.position_level].shape + coords_h = torch.arange(H, device=img_feats[0].device).float() * pad_h / H + coords_w = torch.arange(W, 
device=img_feats[0].device).float() * pad_w / W + + if self.LID: + index = torch.arange(start=0, end=self.depth_num, step=1, device=img_feats[0].device).float() + index_1 = index + 1 + bin_size = (self.position_range[3] - self.depth_start) / (self.depth_num * (1 + self.depth_num)) + coords_d = self.depth_start + bin_size * index * index_1 + else: + index = torch.arange(start=0, end=self.depth_num, step=1, device=img_feats[0].device).float() + bin_size = (self.position_range[3] - self.depth_start) / self.depth_num + coords_d = self.depth_start + bin_size * index + + D = coords_d.shape[0] + coords = torch.stack(torch.meshgrid([coords_w, coords_h, coords_d])).permute(1, 2, 3, 0) # W, H, D, 3 + coords = torch.cat((coords, torch.ones_like(coords[..., :1])), -1) + + # coords[..., :2] = coords[..., :2] * torch.maximum(coords[..., 2:3], torch.ones_like(coords[..., 2:3])*eps) + + updated_coords = coords[..., :2] * torch.maximum(coords[..., 2:3], torch.ones_like(coords[..., 2:3]) * eps) + coords = torch.cat((updated_coords, coords[..., 2:]), dim=-1) + + img2lidars = [] + for img_meta in img_metas: + img2lidar = [] + for i in range(len(img_meta["lidar2img"])): + img2lidar.append(np.linalg.inv(img_meta["lidar2img"][i])) + img2lidars.append(np.asarray(img2lidar)) + img2lidars = np.asarray(img2lidars) + img2lidars = coords.new_tensor(img2lidars) # (B, N, 4, 4) + + coords = coords.view(1, 1, W, H, D, 4, 1).repeat(B, N, 1, 1, 1, 1, 1) + img2lidars = img2lidars.view(B, N, 1, 1, 1, 4, 4).repeat(1, 1, W, H, D, 1, 1) + coords3d = torch.matmul(img2lidars, coords).squeeze(-1)[..., :3] + + # coords3d[..., 0:1] = (coords3d[..., 0:1] - self.position_range[0]) / (self.position_range[3] - self.position_range[0]) + # coords3d[..., 1:2] = (coords3d[..., 1:2] - self.position_range[1]) / (self.position_range[4] - self.position_range[1]) + # coords3d[..., 2:3] = (coords3d[..., 2:3] - self.position_range[2]) / (self.position_range[5] - self.position_range[2]) + + x = (coords3d[..., 0:1] - self.position_range[0]) / (self.position_range[3] - self.position_range[0]) + y = (coords3d[..., 1:2] - self.position_range[1]) / (self.position_range[4] - self.position_range[1]) + z = (coords3d[..., 2:3] - self.position_range[2]) / (self.position_range[5] - self.position_range[2]) + coords3d = torch.cat([x, y, z], dim=-1) + + coords_mask = (coords3d > 1.0) | (coords3d < 0.0) + coords_mask = coords_mask.flatten(-2).sum(-1) > (D * 0.5) + coords_mask = masks | coords_mask.permute(0, 1, 3, 2) + coords3d = coords3d.permute(0, 1, 4, 5, 3, 2).contiguous().view(B * N, -1, H, W) + coords3d = inverse_sigmoid(coords3d) + coords_position_embeding = self.position_encoder(coords3d) + + return coords_position_embeding.view(B, N, self.embed_dims, H, W), coords_mask + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + """load checkpoints.""" + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since `AnchorFreeHead._load_from_state_dict` should not be + # called here. Invoking the default `Module._load_from_state_dict` + # is enough. + + # Names of some parameters in has been changed. 
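+        # The version check below upgrades version-1 checkpoints whose attention
+        # and norm parameters were saved under the old DETR-style names
+        # (`.self_attn.`, `.multihead_attn.`, `.decoder.norm.`) to the current
+        # layout (`.attentions.0.`, `.attentions.1.`, `.decoder.post_norm.`).
+        # Illustrative (hypothetical) rename, not taken from a real checkpoint:
+        #   '...decoder.layers.0.self_attn.attn.in_proj_weight'
+        #   -> '...decoder.layers.0.attentions.0.attn.in_proj_weight'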
+ version = local_metadata.get("version", None) + if (version is None or version < 2) and self.__class__ is PETRHead: + convert_dict = { + ".self_attn.": ".attentions.0.", + # '.ffn.': '.ffns.0.', + ".multihead_attn.": ".attentions.1.", + ".decoder.norm.": ".decoder.post_norm.", + } + state_dict_keys = list(state_dict.keys()) + for k in state_dict_keys: + for ori_key, convert_key in convert_dict.items(): + if ori_key in k: + convert_key = k.replace(ori_key, convert_key) + state_dict[convert_key] = state_dict[k] + del state_dict[k] + + super(AnchorFreeHead, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, mlvl_feats, img_metas): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + """ + + x = mlvl_feats[0] + batch_size, num_cams = x.size(0), x.size(1) + input_img_h, input_img_w, _ = img_metas[0]["pad_shape"][0] + # masks = x.new_ones( + # (batch_size, num_cams, input_img_h, input_img_w)) + masks = img_metas[0]["masks"] + + for img_id in range(batch_size): + for cam_id in range(num_cams): + img_h, img_w, _ = img_metas[img_id]["img_shape"][cam_id] + masks[img_id, cam_id, :img_h, :img_w] = 0 + x = self.input_proj(x.flatten(0, 1)) + x = x.view(batch_size, num_cams, *x.shape[-3:]) + # interpolate masks to have the same spatial shape with x + masks = F.interpolate(masks, size=x.shape[-2:]).to(torch.bool) + + if self.with_position: + coords_position_embeding, _ = self.position_embeding(mlvl_feats, img_metas, masks) + pos_embed = coords_position_embeding + if self.with_multiview: + sin_embed = self.positional_encoding(masks) + sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(x.size()) + pos_embed = pos_embed + sin_embed + else: + pos_embeds = [] + for i in range(num_cams): + xy_embed = self.positional_encoding(masks[:, i, :, :]) + pos_embeds.append(xy_embed.unsqueeze(1)) + sin_embed = torch.cat(pos_embeds, 1) + sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(x.size()) + pos_embed = pos_embed + sin_embed + else: + if self.with_multiview: + pos_embed = self.positional_encoding(masks) + pos_embed = self.adapt_pos3d(pos_embed.flatten(0, 1)).view(x.size()) + else: + pos_embeds = [] + for i in range(num_cams): + pos_embed = self.positional_encoding(masks[:, i, :, :]) + pos_embeds.append(pos_embed.unsqueeze(1)) + pos_embed = torch.cat(pos_embeds, 1) + + reference_points = self.reference_points.weight + query_embeds = self.query_embedding(pos2posemb3d(reference_points)) + reference_points = reference_points.unsqueeze(0).repeat(batch_size, 1, 1) # .sigmoid() + outs_dec, _ = self.transformer(x, masks, query_embeds, pos_embed, self.reg_branches) + + outs_dec = torch.nan_to_num(outs_dec) + outputs_classes = [] + outputs_coords = [] + for lvl in range(outs_dec.shape[0]): + reference = inverse_sigmoid(reference_points.clone()) + assert reference.shape[-1] == 3 + outputs_class = self.cls_branches[lvl](outs_dec[lvl]) + tmp = self.reg_branches[lvl](outs_dec[lvl]) + + # tmp[..., 0:2] += reference[..., 0:2] + # tmp[..., 0:2] = tmp[..., 0:2].sigmoid() 
+ # tmp[..., 4:5] += reference[..., 2:3] + # tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + + xy = tmp[..., 0:2] + reference[..., 0:2] + xy = xy.sigmoid() + z = tmp[..., 4:5] + reference[..., 2:3] + z = z.sigmoid() + tmp = torch.cat([xy, tmp[..., 2:4], z, tmp[..., 5:]], dim=-1) + + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + all_cls_scores = torch.stack(outputs_classes) + all_bbox_preds = torch.stack(outputs_coords) + + # all_bbox_preds[..., 0:1] = (all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) + # all_bbox_preds[..., 1:2] = (all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) + # all_bbox_preds[..., 4:5] = (all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2]) + + updated_0_1 = all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0] + updated_1_2 = all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1] + updated_4_5 = all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2] + + all_bbox_preds = torch.cat( + [updated_0_1, updated_1_2, all_bbox_preds[..., 2:4], updated_4_5, all_bbox_preds[..., 5:]], dim=-1 + ) + + outs = { + "all_cls_scores": all_cls_scores, + "all_bbox_preds": all_bbox_preds, + "enc_cls_scores": None, + "enc_bbox_preds": None, + } + return outs + + @force_fp32(apply_to=("preds_dicts")) + def loss(self, gt_bboxes_list, gt_labels_list, preds_dicts, gt_bboxes_ignore=None): + """ "Loss function. + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, ( + f"{self.__class__.__name__} only supports " f"for gt_bboxes_ignore setting to None." 
+ ) + + all_cls_scores = preds_dicts["all_cls_scores"] + all_bbox_preds = preds_dicts["all_bbox_preds"] + enc_cls_scores = preds_dicts["enc_cls_scores"] + enc_bbox_preds = preds_dicts["enc_bbox_preds"] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + gt_bboxes_list = [ + torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) + for gt_bboxes in gt_bboxes_list + ] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [gt_bboxes_ignore for _ in range(num_dec_layers)] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, + all_cls_scores, + all_bbox_preds, + all_gt_bboxes_list, + all_gt_labels_list, + all_gt_bboxes_ignore_list, + ) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [torch.zeros_like(gt_labels_list[i]) for i in range(len(all_gt_labels_list))] + enc_loss_cls, enc_losses_bbox = self.loss_single( + enc_cls_scores, enc_bbox_preds, gt_bboxes_list, binary_labels_list, gt_bboxes_ignore + ) + loss_dict["enc_loss_cls"] = enc_loss_cls + loss_dict["enc_loss_bbox"] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict["loss_cls"] = losses_cls[-1] + loss_dict["loss_bbox"] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f"d{num_dec_layer}.loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.loss_bbox"] = loss_bbox_i + num_dec_layer += 1 + return loss_dict diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py b/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py new file mode 100644 index 000000000..b728d7f51 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py @@ -0,0 +1,447 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ + +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer, xavier_init +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.registry import ( + ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE, +) +from mmcv.cnn.bricks.transformer import ( + BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence, +) +from mmcv.runner.base_module import BaseModule +from mmcv.utils import deprecated_api_warning +from mmdet.models.utils.builder import TRANSFORMER + + +@TRANSFORMER.register_module() +class PETRTransformer(BaseModule): + """Implements the DETR transformer. 
+ Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. + """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): + super(PETRTransformer, self).__init__(init_cfg=init_cfg) + if encoder is not None: + self.encoder = build_transformer_layer_sequence(encoder) + else: + self.encoder = None + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.decoder.embed_dims + self.cross = cross + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, "weight") and m.weight.dim() > 1: + xavier_init(m, distribution="uniform") + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed, reg_branch=None): + """Forward function for `Transformer`. + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. + """ + bs, n, c, h, w = x.shape + memory = x.permute(1, 3, 4, 0, 2).reshape(-1, bs, c) # [bs, n, c, h, w] -> [n*h*w, bs, c] + pos_embed = pos_embed.permute(1, 3, 4, 0, 2).reshape(-1, bs, c) # [bs, n, c, h, w] -> [n*h*w, bs, c] + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, n, h, w] -> [bs, n*h*w] + + # target = torch.zeros_like(query_embed) + target = torch.zeros(query_embed.shape) + + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=mask, + reg_branch=reg_branch, + ) + + out_dec = out_dec.transpose(1, 2) + memory = memory.reshape(n, h, w, bs, c).permute(3, 0, 4, 1, 2) + return out_dec, memory + + +@TRANSFORMER_LAYER.register_module() +class PETRTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. 
Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__( + self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type="ReLU", inplace=True), + norm_cfg=dict(type="LN"), + ffn_num_fcs=2, + with_cp=True, + **kwargs, + ): + super(PETRTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs, + ) + assert len(operation_order) == 6 + assert set(operation_order) == set(["self_attn", "norm", "cross_attn", "ffn"]) + self.use_checkpoint = with_cp + + def _forward( + self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(PETRTransformerDecoderLayer, self).forward( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + ) + + return x + + def forward( + self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs, + ): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if self.use_checkpoint and self.training: + x = cp.checkpoint( + self._forward, + query, + key, + value, + query_pos, + key_pos, + attn_masks, + query_key_padding_mask, + key_padding_mask, + ) + else: + x = self._forward( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + ) + return x + + +@ATTENTION.register_module() +class PETRMultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
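+    Example (illustrative sketch only; the shapes below are assumed and do
+    not come from a PETR config)::
+
+        >>> attn = PETRMultiheadAttention(embed_dims=256, num_heads=8)
+        >>> query = torch.rand(900, 2, 256)  # [num_query, bs, embed_dims]
+        >>> key = torch.rand(6000, 2, 256)   # [num_key, bs, embed_dims]
+        >>> out = attn(query=query, key=key, value=key)
+        >>> out.shape
+        torch.Size([900, 2, 256])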
+ """ + + def __init__( + self, + embed_dims, + num_heads, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=dict(type="Dropout", drop_prob=0.0), + init_cfg=None, + batch_first=False, + **kwargs, + ): + super(PETRMultiheadAttention, self).__init__(init_cfg) + if "dropout" in kwargs: + warnings.warn( + "The arguments `dropout` in MultiheadAttention " + "has been deprecated, now you can separately " + "set `attn_drop`(float), proj_drop(float), " + "and `dropout_layer`(dict) ", + DeprecationWarning, + ) + attn_drop = kwargs["dropout"] + dropout_layer["drop_prob"] = kwargs.pop("dropout") + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({"residual": "identity"}, cls_name="MultiheadAttention") + def forward( + self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs, + ): + """Forward function for `MultiheadAttention`. + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f"position encoding of key is" f"missing in {self.__class__.__name__}.") + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn(query=query, key=key, value=value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class PETRTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), **kwargs): + super(PETRTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + assert not self.pre_norm, f"Use prenorm in " f"{self.__class__.__name__}," f"Please specify post_norm_cfg" + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(PETRTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class PETRTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), return_intermediate=False, **kwargs): + + super(PETRTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + + else: + intermediate.append(query) + return torch.stack(intermediate) diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py new file mode 100644 index 000000000..2417e73eb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py @@ -0,0 +1,242 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +backbone_norm_cfg = dict(type="LN", requires_grad=True) +plugin = True + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] +input_modality = dict(use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True) +model = dict( + type="Petr3D", + use_grid_mask=True, + img_backbone=dict( + type="VoVNetCP", + spec_name="V-99-eSE", + norm_eval=True, + frozen_stages=-1, + input_ch=3, + out_features=( + "stage4", + "stage5", + ), + ), + img_neck=dict(type="CPFPN", in_channels=[768, 1024], out_channels=256, num_outs=2), + pts_bbox_head=dict( + type="PETRHead", + num_classes=10, + in_channels=256, + num_query=900, + LID=True, + with_position=True, + with_multiview=True, + position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + normedlinear=False, + transformer=dict( + type="PETRTransformer", + decoder=dict( + type="PETRTransformerDecoder", + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type="PETRTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + dict(type="PETRMultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + ], + feedforward_channels=2048, + ffn_dropout=0.1, + with_cp=True, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + # type='NMSFreeClsCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict(type="SinePositionalEncoding3D", num_feats=128, normalize=True), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. 
This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) + +dataset_type = "CustomNuScenesDataset" +data_root = "../data/nuscenes/" +file_client_args = dict(backend="disk") + +db_sampler = dict( + data_root=data_root, + info_path=data_root + "nuscenes_dbinfos_train.pkl", + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ), + ), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2, + ), + points_loader=dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args, + ), +) +ida_aug_conf = { + "resize_lim": (0.94, 1.25), + "final_dim": (640, 1600), + "bot_pct_lim": (0.0, 0.0), + "rot_lim": (0.0, 0.0), + "H": 900, + "W": 1600, + # "rand_flip": False, + "rand_flip": True, +} +train_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilter", classes=class_names), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=True), + dict( + type="GlobalRotScaleTransImage", + rot_range=[-0.3925, 0.3925], + translation_std=[0, 0, 0], + scale_ratio_range=[0.95, 1.05], + reverse_angle=True, + training=True, + ), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict(type="Collect3D", keys=["gt_bboxes_3d", "gt_labels_3d", "img"]), +] +test_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=False), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["img"]), + ], + ), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + "nuscenes_infos_train.pkl", + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d="LiDAR", + ), + val=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), + test=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), +) + +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) + +optimizer_config = dict(type="Fp16OptimizerHook", loss_scale=512.0, grad_clip=dict(max_norm=35, norm_type=2)) + +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, + # by_epoch=False +) +total_epochs = 24 +evaluation = dict(interval=24, pipeline=test_pipeline) +find_unused_parameters = False + +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +resume_from = None diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py new file mode 100644 index 000000000..329e58094 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py @@ -0,0 +1,239 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +backbone_norm_cfg = dict(type="LN", requires_grad=True) +plugin = True + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] +input_modality = dict(use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True) +model = dict( + type="Petr3D", + use_grid_mask=True, + img_backbone=dict( + type="VoVNetCP", + spec_name="V-99-eSE", + norm_eval=True, + frozen_stages=-1, + input_ch=3, + out_features=( + "stage4", + "stage5", + ), + ), + img_neck=dict(type="CPFPN", in_channels=[768, 1024], out_channels=256, num_outs=2), + pts_bbox_head=dict( + type="PETRHead", + num_classes=10, + in_channels=256, + num_query=900, + LID=True, + with_position=True, + with_multiview=True, + position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + normedlinear=False, + transformer=dict( + type="PETRTransformer", + decoder=dict( + type="PETRTransformerDecoder", + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type="PETRTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + dict(type="PETRMultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict(type="SinePositionalEncoding3D", num_feats=128, normalize=True), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + # model 
training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) + +dataset_type = "CustomNuScenesDataset" +data_root = "../data/nuscenes/" + +file_client_args = dict(backend="disk") + +db_sampler = dict( + data_root=data_root, + info_path=data_root + "nuscenes_dbinfos_train.pkl", + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ), + ), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2, + ), + points_loader=dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args, + ), +) +ida_aug_conf = { + "resize_lim": (0.47, 0.625), + "final_dim": (320, 800), + "bot_pct_lim": (0.0, 0.0), + "rot_lim": (0.0, 0.0), + "H": 900, + "W": 1600, + "rand_flip": True, +} +train_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilter", classes=class_names), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=True), + dict( + type="GlobalRotScaleTransImage", + rot_range=[-0.3925, 0.3925], + translation_std=[0, 0, 0], + scale_ratio_range=[0.95, 1.05], + reverse_angle=True, + training=True, + ), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict(type="Collect3D", keys=["gt_bboxes_3d", "gt_labels_3d", "img"]), +] +test_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=False), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["img"]), + ], + ), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + "nuscenes_infos_train.pkl", + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d="LiDAR", + ), + val=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), + test=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), +) + +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) + +optimizer_config = dict(type="Fp16OptimizerHook", loss_scale=512.0, grad_clip=dict(max_norm=35, norm_type=2)) + +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 24 +evaluation = dict(interval=24, pipeline=test_pipeline) +find_unused_parameters = False + +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +resume_from = None diff --git a/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py b/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py new file mode 100644 index 000000000..dc6896208 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from mmdetection (https://github.com/open-mmlab/mmdetection) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding3D(BaseModule): + """Position encoding with sine and cosine functions. + See `End-to-End Object Detection with Transformers + `_ for details. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__( + self, num_feats, temperature=10000, normalize=False, scale=2 * math.pi, eps=1e-6, offset=0.0, init_cfg=None + ): + super(SinePositionalEncoding3D, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), ( + "when normalize is set," "scale should be provided and in float or int type, " f"found {type(scale)}" + ) + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. 
Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + n_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(2, dtype=torch.float32) + x_embed = not_mask.cumsum(3, dtype=torch.float32) + if self.normalize: + n_embed = (n_embed + self.offset) / (n_embed[:, -1:, :, :] + self.eps) * self.scale + y_embed = (y_embed + self.offset) / (y_embed[:, :, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / (x_embed[:, :, :, -1:] + self.eps) * self.scale + dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) + pos_n = n_embed[:, :, :, :, None] / dim_t + pos_x = x_embed[:, :, :, :, None] / dim_t + pos_y = y_embed[:, :, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, N, H, W = mask.size() + pos_n = torch.stack((pos_n[:, :, :, :, 0::2].sin(), pos_n[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos = torch.cat((pos_n, pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"temperature={self.temperature}, " + repr_str += f"normalize={self.normalize}, " + repr_str += f"scale={self.scale}, " + repr_str += f"eps={self.eps})" + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding3D(BaseModule): + """Position embedding with learnable embedding weights. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type="Uniform", layer="Embedding")): + super(LearnedPositionalEncoding3D, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
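+        Example (illustrative sketch; the mask size below is assumed)::
+
+            >>> pe = LearnedPositionalEncoding3D(num_feats=128)
+            >>> mask = torch.zeros(1, 20, 50)  # [bs, h, w]
+            >>> pe(mask).shape
+            torch.Size([1, 256, 20, 50])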
+ """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = ( + torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(1, w, 1)), dim=-1) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(mask.shape[0], 1, 1, 1) + ) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"row_num_embed={self.row_num_embed}, " + repr_str += f"col_num_embed={self.col_num_embed})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/utils/transform_3d.py b/forge/test/models/pytorch/vision/petr/utils/transform_3d.py new file mode 100644 index 000000000..429d157da --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/transform_3d.py @@ -0,0 +1,213 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import mmcv + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import numpy as np +import torch +from mmdet.datasets.builder import PIPELINES +from PIL import Image + + +@PIPELINES.register_module() +class PadMultiViewImage(object): + """Pad the multi-view image. + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. + """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [mmcv.impad(img, shape=self.size, pad_val=self.pad_val) for img in results["img"]] + elif self.size_divisor is not None: + padded_img = [ + mmcv.impad_to_multiple(img, self.size_divisor, pad_val=self.pad_val) for img in results["img"] + ] + results["img_shape"] = [img.shape for img in results["img"]] + results["img"] = padded_img + results["pad_shape"] = [img.shape for img in padded_img] + results["pad_fixed_size"] = self.size + results["pad_size_divisor"] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
+ """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(size={self.size}, " + repr_str += f"size_divisor={self.size_divisor}, " + repr_str += f"pad_val={self.pad_val})" + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + results["img"] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results["img"]] + results["img_norm_cfg"] = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})" + return repr_str + + +@PIPELINES.register_module() +class ResizeCropFlipImage(object): + """Random resize, Crop and flip the image + Args: + size (tuple, optional): Fixed padding size. + """ + + def __init__(self, data_aug_conf=None, training=True): + self.data_aug_conf = data_aug_conf + self.training = training + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
+ """ + + imgs = results["img"] + N = len(imgs) + new_imgs = [] + resize, resize_dims, crop, flip, rotate = self._sample_augmentation() + for i in range(N): + img = Image.fromarray(np.uint8(imgs[i])) + # augmentation (resize, crop, horizontal flip, rotate) + # resize, resize_dims, crop, flip, rotate = self._sample_augmentation() ###different view use different aug (BEV Det) + img, ida_mat = self._img_transform( + img, + resize=resize, + resize_dims=resize_dims, + crop=crop, + flip=flip, + rotate=rotate, + ) + new_imgs.append(np.array(img).astype(np.float32)) + results["intrinsics"][i][:3, :3] = ida_mat @ results["intrinsics"][i][:3, :3] + + results["img"] = new_imgs + results["lidar2img"] = [ + results["intrinsics"][i] @ results["extrinsics"][i].T for i in range(len(results["extrinsics"])) + ] + + return results + + def _get_rot(self, h): + + return torch.Tensor( + [ + [np.cos(h), np.sin(h)], + [-np.sin(h), np.cos(h)], + ] + ) + + def _img_transform(self, img, resize, resize_dims, crop, flip, rotate): + ida_rot = torch.eye(2) + ida_tran = torch.zeros(2) + # adjust image + img = img.resize(resize_dims) + img = img.crop(crop) + if flip: + img = img.transpose(method=Image.FLIP_LEFT_RIGHT) + img = img.rotate(rotate) + + # post-homography transformation + ida_rot *= resize + ida_tran -= torch.Tensor(crop[:2]) + if flip: + A = torch.Tensor([[-1, 0], [0, 1]]) + b = torch.Tensor([crop[2] - crop[0], 0]) + ida_rot = A.matmul(ida_rot) + ida_tran = A.matmul(ida_tran) + b + A = self._get_rot(rotate / 180 * np.pi) + b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 + b = A.matmul(-b) + b + ida_rot = A.matmul(ida_rot) + ida_tran = A.matmul(ida_tran) + b + ida_mat = torch.eye(3) + ida_mat[:2, :2] = ida_rot + ida_mat[:2, 2] = ida_tran + return img, ida_mat + + def _sample_augmentation(self): + H, W = self.data_aug_conf["H"], self.data_aug_conf["W"] + fH, fW = self.data_aug_conf["final_dim"] + if self.training: + resize = np.random.uniform(*self.data_aug_conf["resize_lim"]) + resize_dims = (int(W * resize), int(H * resize)) + newW, newH = resize_dims + crop_h = int((1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"])) * newH) - fH + crop_w = int(np.random.uniform(0, max(0, newW - fW))) + crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) + flip = False + if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]): + flip = True + rotate = np.random.uniform(*self.data_aug_conf["rot_lim"]) + else: + resize = max(fH / H, fW / W) + resize_dims = (int(W * resize), int(H * resize)) + newW, newH = resize_dims + crop_h = int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH) - fH + crop_w = int(max(0, newW - fW) / 2) + crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) + flip = False + rotate = 0 + return resize, resize_dims, crop, flip, rotate diff --git a/forge/test/models/pytorch/vision/petr/utils/utils.py b/forge/test/models/pytorch/vision/petr/utils/utils.py new file mode 100644 index 000000000..ab04e9365 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/utils.py @@ -0,0 +1,201 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from mmcv import Config +from mmdet3d.datasets import build_dataloader, build_dataset + +from test.models.pytorch.vision.petr.mmdet3d.core.bbox.transforms import bbox3d2result + + +def load_config(variant): + cfg = Config.fromfile(f"forge/test/models/pytorch/vision/petr/utils/petr_{variant}.py") + cfg.data.test.test_mode = True + cfg.data.test.ann_file = 
"forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl" + return cfg + + +def prepare_model_inputs(cfg): + + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=0, dist=False, shuffle=False) + dataset = data_loader.dataset + + for i, data in enumerate(data_loader): + + img_metas = data["img_metas"][0].data[0] + filename = img_metas[0]["filename"] + ori_shape = img_metas[0]["ori_shape"] + img_shape = img_metas[0]["img_shape"] + pad_shape = img_metas[0]["pad_shape"] + scale_factor = img_metas[0]["scale_factor"] + flip = img_metas[0]["flip"] + pcd_horizontal_flip = img_metas[0]["pcd_horizontal_flip"] + pcd_vertical_flip = img_metas[0]["pcd_vertical_flip"] + box_mode_3d = img_metas[0]["box_mode_3d"] + box_type_3d = img_metas[0]["box_type_3d"] + mean = torch.from_numpy(img_metas[0]["img_norm_cfg"]["mean"]) + std = torch.from_numpy(img_metas[0]["img_norm_cfg"]["std"]) + to_rgb = img_metas[0]["img_norm_cfg"]["to_rgb"] + sample_idx = img_metas[0]["sample_idx"] + pcd_scale_factor = img_metas[0]["pcd_scale_factor"] + pts_filename = img_metas[0]["pts_filename"] + img = data["img"][0].data[0] + lidar2img_list = img_metas[0]["lidar2img"] + + lidar2img_tensors_list = [] + + for idx, lidar2img_array in enumerate(lidar2img_list): + lidar2img_tensor = torch.from_numpy(lidar2img_array) + lidar2img_tensors_list.append(lidar2img_tensor) + + batch_size = 1 + num_cams = 6 + input_img_h, input_img_w, _ = pad_shape[0] + x = torch.rand(batch_size, num_cams, input_img_h, input_img_w) + masks = x.new_ones((batch_size, num_cams, input_img_h, input_img_w)) + + inputs = [ + lidar2img_tensors_list[0].unsqueeze(0), + lidar2img_tensors_list[1].unsqueeze(0), + lidar2img_tensors_list[2].unsqueeze(0), + lidar2img_tensors_list[3].unsqueeze(0), + lidar2img_tensors_list[4].unsqueeze(0), + lidar2img_tensors_list[5].unsqueeze(0), + img.unsqueeze(0), + mean.unsqueeze(0), + std.unsqueeze(0), + masks.unsqueeze(0), + ] + + for i, tensor in enumerate(inputs): + if tensor.dtype == torch.float64: + inputs[i] = tensor.to(torch.float32) + + return ( + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + inputs, + ) + + +def denormalize_bbox(normalized_bboxes, pc_range): + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes + + +def decode_single(cls_scores, bbox_preds): + + # post processing + pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] + voxel_size = [0.2, 0.2, 8] + post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0] + max_num = 300 + score_threshold = None + num_classes = 10 + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % num_classes + bbox_index = indexs // 
+def denormalize_bbox(normalized_bboxes, pc_range):
+    # rotation
+    rot_sine = normalized_bboxes[..., 6:7]
+
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+
+    # size
+    w = normalized_bboxes[..., 2:3]
+    l = normalized_bboxes[..., 3:4]
+    h = normalized_bboxes[..., 5:6]
+
+    w = w.exp()
+    l = l.exp()
+    h = h.exp()
+    if normalized_bboxes.size(-1) > 8:
+        # velocity
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
+    else:
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
+    return denormalized_bboxes
+
+
+def decode_single(cls_scores, bbox_preds):
+
+    # post processing
+    pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    voxel_size = [0.2, 0.2, 8]
+    post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
+    max_num = 300
+    score_threshold = None
+    num_classes = 10
+
+    cls_scores = cls_scores.sigmoid()
+    scores, indexs = cls_scores.view(-1).topk(max_num)
+    labels = indexs % num_classes
+    bbox_index = indexs // num_classes
+
+    bbox_preds = bbox_preds[bbox_index]
+
+    final_box_preds = denormalize_bbox(bbox_preds, pc_range)
+    final_scores = scores
+    final_preds = labels
+
+    if score_threshold is not None:
+        thresh_mask = final_scores > score_threshold
+    if post_center_range is not None:
+        post_center_range = torch.tensor(post_center_range, device=scores.device)
+
+        mask = (final_box_preds[..., :3] >= post_center_range[:3]).all(1)
+        mask &= (final_box_preds[..., :3] <= post_center_range[3:]).all(1)
+
+        if score_threshold:
+            mask &= thresh_mask
+
+        boxes3d = final_box_preds[mask]
+        scores = final_scores[mask]
+        labels = final_preds[mask]
+        predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels}
+
+    return predictions_dict
+
+
+def post_process(img_metas, all_class_scores, all_bbox_predictions):
+
+    all_cls_scores = all_class_scores[-1]
+    all_bbox_preds = all_bbox_predictions[-1]
+
+    batch_size = all_cls_scores.size()[0]
+    predictions_list = []
+    for i in range(batch_size):
+        predictions_list.append(decode_single(all_cls_scores[i], all_bbox_preds[i]))
+
+    preds_dicts = predictions_list
+    num_samples = len(preds_dicts)
+    ret_list = []
+    for i in range(num_samples):
+        preds = preds_dicts[i]
+        bboxes = preds["bboxes"]
+        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+        bboxes = img_metas[i]["box_type_3d"](bboxes, bboxes.size(-1))
+        scores = preds["scores"]
+        labels = preds["labels"]
+        ret_list.append([bboxes, scores, labels])
+
+    bbox_list = ret_list
+    bbox_results = [bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list]
+
+    bbox_list = [dict() for i in range(1)]
+    bbox_pts = bbox_results
+    for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+        result_dict["pts_bbox"] = pts_bbox
+
+    boxes_3d_tensor = bbox_list[0]["pts_bbox"]["boxes_3d"].tensor
+    scores_3d_tensor = bbox_list[0]["pts_bbox"]["scores_3d"]
+    labels_3d_tensor = bbox_list[0]["pts_bbox"]["labels_3d"]
+
+    output = (boxes_3d_tensor, scores_3d_tensor, labels_3d_tensor)
+
+    return output
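decode_single above hard-codes the 10-class nuScenes setup and a top-k of 300, and it only depends on torch, so it can be sanity-checked on random tensors. A minimal sketch (the 900-query shape is an assumption taken from the usual PETR head configuration, not something this file asserts):

    import torch

    # with decode_single and denormalize_bbox from this file in scope
    cls_scores = torch.randn(900, 10)  # (num_query, num_classes)
    bbox_preds = torch.randn(900, 10)  # (num_query, [cx, cy, log w, log l, cz, log h, sin, cos, vx, vy])
    out = decode_single(cls_scores, bbox_preds)
    print(out["bboxes"].shape, out["scores"].shape, out["labels"].shape)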
diff --git a/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py b/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py
new file mode 100644
index 000000000..08f9f41ae
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py
@@ -0,0 +1,394 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+import warnings
+
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Copyright (c) Youngwan Lee (ETRI) All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# ------------------------------------------------------------------------
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.runner import BaseModule
+from mmdet.models.builder import BACKBONES
+from torch.nn.modules.batchnorm import _BatchNorm
+
+VoVNet19_slim_dw_eSE = {
+    "stem": [64, 64, 64],
+    "stage_conv_ch": [64, 80, 96, 112],
+    "stage_out_ch": [112, 256, 384, 512],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True,
+}
+
+VoVNet19_dw_eSE = {
+    "stem": [64, 64, 64],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True,
+}
+
+VoVNet19_slim_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [64, 80, 96, 112],
+    "stage_out_ch": [112, 256, 384, 512],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet19_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet39_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 2, 2],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet57_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 4, 3],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet99_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 3, 9, 3],
+    "eSE": True,
+    "dw": False,
+}
+
+_STAGE_SPECS = {
+    "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE,
+    "V-19-dw-eSE": VoVNet19_dw_eSE,
+    "V-19-slim-eSE": VoVNet19_slim_eSE,
+    "V-19-eSE": VoVNet19_eSE,
+    "V-39-eSE": VoVNet39_eSE,
+    "V-57-eSE": VoVNet57_eSE,
+    "V-99-eSE": VoVNet99_eSE,
+}
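+
+
+# Channel bookkeeping, using "V-99-eSE" as an example (see _OSA_module below): stage 2
+# receives the 128-channel stem output, stacks layer_per_block = 5 convs with
+# stage_conv_ch[0] = 128 channels each, concatenates the input with every intermediate
+# output (128 + 5 * 128 = 768 channels), and projects back to stage_out_ch[0] = 256
+# with a 1x1 convolution.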
+
+
+def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            "{}_{}/dw_conv3x3".format(module_name, postfix),
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=out_channels,
+                bias=False,
+            ),
+        ),
+        (
+            "{}_{}/pw_conv1x1".format(module_name, postfix),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False),
+        ),
+        ("{}_{}/pw_norm".format(module_name, postfix), nn.BatchNorm2d(out_channels)),
+        ("{}_{}/pw_relu".format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0):
+    """1x1 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+    def __init__(
+        self,
+        in_ch,
+        stage_ch,
+        concat_ch,
+        layer_per_block,
+        module_name,
+        SE=False,
+        identity=False,
+        depthwise=False,
+        with_cp=True,
+    ):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.use_checkpoint = with_cp
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0"))
+            )
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat")))
+
+        self.ese = eSEModule(concat_ch)
+
+    def _forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+    def forward(self, x):
+
+        if self.use_checkpoint and self.training:
+            xt = cp.checkpoint(self._forward, x)
+        else:
+            xt = self._forward(x)
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+    def __init__(
+        self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False
+    ):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f"OSA{stage_num}_1"
+        self.add_module(
+            module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise)
+        )
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f"OSA{stage_num}_{i + 2}"
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, identity=True, depthwise=depthwise
+                ),
+            )
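+
+
+# Note on gradient checkpointing: with with_cp=True (the default), _OSA_module.forward wraps
+# _forward in torch.utils.checkpoint during training, trading extra recomputation in the
+# backward pass for lower activation memory; at inference (self.training is False) the plain
+# _forward path is used.
+
+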
+@BACKBONES.register_module()
+class VoVNetCP(BaseModule):
+    def __init__(
+        self, spec_name, input_ch=3, out_features=None, frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None
+    ):
+        """
+        Args:
+            input_ch (int): the number of input channels
+            out_features (list[str]): names of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNetCP, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead')
+            self.init_cfg = dict(type="Pretrained", checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs["stem"]
+        config_stage_ch = stage_specs["stage_conv_ch"]
+        config_concat_ch = stage_specs["stage_out_ch"]
+        block_per_stage = stage_specs["block_per_stage"]
+        layer_per_block = stage_specs["layer_per_block"]
+        SE = stage_specs["eSE"]
+        depthwise = stage_specs["dw"]
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2)
+        self.add_module("stem", nn.Sequential(OrderedDict(stem)))
+        current_stride = 4
+        self._out_feature_strides = {"stem": current_stride, "stage2": current_stride}
+        self._out_feature_channels = {"stem": stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = "stage%d" % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stride = int(current_stride * 2)
+
+    def forward(self, x):
+        outputs = []
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs.append(x)
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs.append(x)
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, "stem")
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f"stage{i+1}")
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the normalization
+        layers frozen."""
+        super(VoVNetCP, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval has an effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
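The backbone above only needs the mmcv/mmdet packages already pulled in by this patch, so it can be smoke-tested in isolation. A minimal sketch (illustrative only, not part of the patch): the "V-99-eSE" spec and the stage4/stage5 outputs follow the PETR VoVNet configs, while the 320x800 input resolution is just an example value.

    import torch

    # with VoVNetCP from this file in scope (mmcv / mmdet must be importable)
    backbone = VoVNetCP("V-99-eSE", out_features=["stage4", "stage5"])
    backbone.eval()
    with torch.no_grad():
        feats = backbone(torch.randn(1, 3, 320, 800))
    print([tuple(f.shape) for f in feats])  # stage4 at stride 16, stage5 at stride 32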