From 8ac9fffd9da2c2bc0f9ac2423d0188d16ff7ed21 Mon Sep 17 00:00:00 2001 From: kkannan Date: Thu, 2 Jan 2025 13:16:38 +0000 Subject: [PATCH] Add support for PETR Model(Vovnet based varaints) --- .gitattributes | 12 + env/core_requirements.txt | 2 + .../petr/data/nuscenes/nuscenes_infos_val.pkl | Bin 0 -> 8003 bytes ...16-36-0400__CAM_BACK__1533151603537558.jpg | 3 + ...-0400__CAM_BACK_LEFT__1533151603547405.jpg | 3 + ...0400__CAM_BACK_RIGHT__1533151603528113.jpg | 3 + ...6-36-0400__CAM_FRONT__1533151603512404.jpg | 3 + ...0400__CAM_FRONT_LEFT__1533151603504799.jpg | 3 + ...400__CAM_FRONT_RIGHT__1533151603520482.jpg | 3 + ...-0400__LIDAR_TOP__1533151603547590.pcd.bin | 3 + ...400__RADAR_BACK_LEFT__1533151603522238.pcd | 3 + ...00__RADAR_BACK_RIGHT__1533151603576423.pcd | 3 + ...36-0400__RADAR_FRONT__1533151603555991.pcd | 3 + ...00__RADAR_FRONT_LEFT__1533151603526348.pcd | 3 + ...0__RADAR_FRONT_RIGHT__1533151603512881.pcd | 3 + .../pytorch/vision/petr/mmdet/__init__.py | 0 .../vision/petr/mmdet/core/anchor/__init__.py | 33 + .../mmdet/core/anchor/anchor_generator.py | 660 ++++++++++++++++++ .../vision/petr/mmdet/core/anchor/builder.py | 21 + .../petr/mmdet/core/anchor/point_generator.py | 172 +++++ .../vision/petr/mmdet/core/anchor/utils.py | 75 ++ .../vision/petr/mmdet/core/bbox/__init__.py | 8 + .../mmdet/core/bbox/assigners/__init__.py | 7 + .../core/bbox/assigners/base_assigner.py | 10 + .../vision/petr/mmdet/core/bbox/builder.py | 15 + .../petr/mmdet/core/bbox/coder/__init__.py | 7 + .../mmdet/core/bbox/coder/base_bbox_coder.py | 13 + .../bbox/coder/distance_point_bbox_coder.py | 66 ++ .../mmdet/core/bbox/match_costs/__init__.py | 24 + .../mmdet/core/bbox/match_costs/builder.py | 13 + .../mmdet/core/bbox/match_costs/match_cost.py | 345 +++++++++ .../vision/petr/mmdet/core/utils/__init__.py | 6 + .../petr/mmdet/core/utils/dist_utils.py | 47 ++ .../vision/petr/mmdet/datasets/__init__.py | 6 + .../vision/petr/mmdet/datasets/builder.py | 214 ++++++ .../petr/mmdet/datasets/pipelines/__init__.py | 8 + .../petr/mmdet/datasets/pipelines/compose.py | 59 ++ .../mmdet/datasets/pipelines/formatting.py | 133 ++++ .../petr/mmdet/datasets/pipelines/loading.py | 289 ++++++++ .../petr/mmdet/datasets/samplers/__init__.py | 18 + .../datasets/samplers/class_aware_sampler.py | 162 +++++ .../datasets/samplers/distributed_sampler.py | 49 ++ .../mmdet/datasets/samplers/group_sampler.py | 136 ++++ .../datasets/samplers/infinite_sampler.py | 167 +++++ .../vision/petr/mmdet/models/__init__.py | 20 + .../vision/petr/mmdet/models/builder.py | 49 ++ .../models/dense_heads/anchor_free_head.py | 274 ++++++++ .../models/dense_heads/base_dense_head.py | 520 ++++++++++++++ .../models/dense_heads/dense_test_mixins.py | 149 ++++ .../petr/mmdet/models/detectors/base.py | 145 ++++ .../petr/mmdet/models/losses/__init__.py | 0 .../petr/mmdet/models/losses/focal_loss.py | 80 +++ .../petr/mmdet/models/losses/iou_loss.py | 448 ++++++++++++ .../mmdet/models/losses/smooth_l1_loss.py | 66 ++ .../vision/petr/mmdet/models/losses/utils.py | 104 +++ .../petr/mmdet/models/utils/__init__.py | 8 + .../vision/petr/mmdet/models/utils/builder.py | 18 + .../petr/mmdet/models/utils/res_layer.py | 179 +++++ .../petr/mmdet/models/utils/transformer.py | 25 + .../vision/petr/mmdet/utils/__init__.py | 7 + .../petr/mmdet/utils/util_distribution.py | 13 + .../pytorch/vision/petr/mmdet3d/__init__.py | 0 .../mmdet3d/configs/_base_/datasets/nus-3d.py | 42 ++ .../mmdet3d/configs/_base_/default_runtime.py | 17 + .../vision/petr/mmdet3d/core/__init__.py | 0 
.../vision/petr/mmdet3d/core/bbox/__init__.py | 7 + .../petr/mmdet3d/core/bbox/coders/__init__.py | 6 + .../mmdet3d/core/bbox/structures/__init__.py | 36 + .../core/bbox/structures/base_box3d.py | 338 +++++++++ .../core/bbox/structures/box_3d_mode.py | 165 +++++ .../mmdet3d/core/bbox/structures/cam_box3d.py | 256 +++++++ .../core/bbox/structures/coord_3d_mode.py | 270 +++++++ .../core/bbox/structures/depth_box3d.py | 187 +++++ .../core/bbox/structures/lidar_box3d.py | 179 +++++ .../mmdet3d/core/bbox/structures/utils.py | 229 ++++++ .../petr/mmdet3d/core/bbox/transforms.py | 29 + .../petr/mmdet3d/core/points/__init__.py | 6 + .../petr/mmdet3d/core/points/base_points.py | 335 +++++++++ .../vision/petr/mmdet3d/datasets/__init__.py | 10 + .../vision/petr/mmdet3d/datasets/builder.py | 26 + .../vision/petr/mmdet3d/datasets/custom_3d.py | 222 ++++++ .../petr/mmdet3d/datasets/nuscenes_dataset.py | 175 +++++ .../mmdet3d/datasets/pipelines/__init__.py | 8 + .../mmdet3d/datasets/pipelines/formating.py | 285 ++++++++ .../mmdet3d/datasets/pipelines/loading.py | 72 ++ .../datasets/pipelines/test_time_aug.py | 114 +++ .../vision/petr/mmdet3d/models/__init__.py | 7 + .../vision/petr/mmdet3d/models/builder.py | 65 ++ .../petr/mmdet3d/models/detectors/__init__.py | 6 + .../petr/mmdet3d/models/detectors/base.py | 54 ++ .../mmdet3d/models/detectors/mvx_two_stage.py | 418 +++++++++++ .../models/pytorch/vision/petr/test_petr.py | 173 +++++ .../pytorch/vision/petr/utils/__init__.py | 0 .../pytorch/vision/petr/utils/cp_fpn.py | 210 ++++++ .../pytorch/vision/petr/utils/grid_mask.py | 62 ++ .../pytorch/vision/petr/utils/match_cost.py | 31 + .../vision/petr/utils/model_registry.py | 39 ++ .../vision/petr/utils/nms_free_coder.py | 41 ++ .../vision/petr/utils/nuscenes_dataset.py | 89 +++ .../pytorch/vision/petr/utils/petr3d.py | 124 ++++ .../pytorch/vision/petr/utils/petr_head.py | 527 ++++++++++++++ .../vision/petr/utils/petr_transformer.py | 447 ++++++++++++ .../utils/petr_vovnet_gridmask_p4_1600x640.py | 242 +++++++ .../utils/petr_vovnet_gridmask_p4_800x320.py | 239 +++++++ .../vision/petr/utils/positional_encoding.py | 154 ++++ .../pytorch/vision/petr/utils/transform_3d.py | 213 ++++++ .../models/pytorch/vision/petr/utils/utils.py | 201 ++++++ .../pytorch/vision/petr/utils/vovnetcp.py | 394 +++++++++++ 108 files changed, 11668 insertions(+) create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151603537558.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151603547405.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151603528113.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151603504799.jpg create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151603520482.jpg create mode 100644 
forge/test/models/pytorch/vision/petr/data/nuscenes/samples/LIDAR_TOP/n008-2018-08-01-15-16-36-0400__LIDAR_TOP__1533151603547590.pcd.bin create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_LEFT__1533151603522238.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_RIGHT__1533151603576423.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT__1533151603555991.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_LEFT__1533151603526348.pcd create mode 100644 forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_RIGHT__1533151603512881.pcd create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/anchor_generator.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py create 
mode 100644 forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py create mode 100644 
forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py create mode 100644 forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py create mode 100644 forge/test/models/pytorch/vision/petr/test_petr.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/cp_fpn.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/grid_mask.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/match_cost.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/model_registry.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr3d.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_head.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_transformer.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/positional_encoding.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/transform_3d.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/utils.py create mode 100644 forge/test/models/pytorch/vision/petr/utils/vovnetcp.py diff --git a/.gitattributes b/.gitattributes index e69de29bb..7bc04f0f1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -0,0 +1,12 @@ +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK/n008-2018-08-01-15-16-36-0400__CAM_BACK__1533151603537558.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-01-15-16-36-0400__CAM_BACK_LEFT__1533151603547405.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_BACK_RIGHT__1533151603528113.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT/n008-2018-08-01-15-16-36-0400__CAM_FRONT__1533151603512404.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_LEFT__1533151603504799.jpg filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__CAM_FRONT_RIGHT__1533151603520482.jpg filter=lfs diff=lfs merge=lfs -text 
+forge/test/models/pytorch/vision/petr/data/nuscenes/samples/LIDAR_TOP/n008-2018-08-01-15-16-36-0400__LIDAR_TOP__1533151603547590.pcd.bin filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_LEFT__1533151603522238.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_BACK_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_BACK_RIGHT__1533151603576423.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT__1533151603555991.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_LEFT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_LEFT__1533151603526348.pcd filter=lfs diff=lfs merge=lfs -text +forge/test/models/pytorch/vision/petr/data/nuscenes/samples/RADAR_FRONT_RIGHT/n008-2018-08-01-15-16-36-0400__RADAR_FRONT_RIGHT__1533151603512881.pcd filter=lfs diff=lfs merge=lfs -text diff --git a/env/core_requirements.txt b/env/core_requirements.txt index 3144d29f4..1fc40915b 100644 --- a/env/core_requirements.txt +++ b/env/core_requirements.txt @@ -51,3 +51,5 @@ pytorch_forecasting==1.0.0 patool openpyxl==3.1.5 GitPython==3.1.44 +mmcv-full==1.7.2 +nuscenes-devkit==1.1.11 diff --git a/forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl b/forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl new file mode 100644 index 0000000000000000000000000000000000000000..033599a0a3fba05f14a812faef64de07470fc8c8 GIT binary patch literal 8003 zcmeI1cT`i!_rQa66%?=_7VL|wKq?U1JcPl0wBCw!{3j8LI1k`0$&e=V`@Ar@IobwnabLY<7na{m5?@gTE z1}AG0krpd$6#8O`FjS@tRH_R-g2e)^lp_yafJ>B$p+W&ljtm^92$*ysh0Y~2nM?}P z$w|QFIq_(?QqjGwK^RvqQYwxJL#08uy&RXx?L$HZc(BYqELt8a<%{ei#4>TH#6Ap{ zOYOTRCE&`r_L6WJAD7@Vdl@$*EEwg}Jl&jqIey+g_7XDL$(Bl{IN73{OtGafY$*=5 zGzVKUolNF%+66fr3WG+YFenaW8iVf0V3O^^_yRkg7#XOqHI#-sIWovXBqwmBGnqJ( z$DmMYI1}eNl7&>9s#F-rB5^!SJx>ZfJ~u>0Oqa2%^K_28ueX<9TTwk`PTO3p_$OAj ztNogl?Sh|KNukoobi4UsL1?V<=rCLkb%<}T&@G(flH@4@VQJn z1E-Q5ggmN{&ZJV5iheR&A`6vL@t{zST*{Tmg1K_EkZ2lgSm49jnRmu7&0zsq^kRn% zSH`eF=+B5V^0XK&I zmW?a=q3)A}Pzhc7G0#+N#BogZNlXjo=3TsGS<#~ta zvY<4)%CKlkGYiHsud2X~(=2e~WLU;;XRhE1W4Sm4p^HdUt~T^5&ORiN>MQzc3~A%M zU8$n4Bs?T6+KwM8#qC1EgXLnbRLYH3DtdCHI6qV(lS{+-a%C!Fj1W{R3?<03)FP=$ zHYq{rs>4$zDt(nUY#p{PTX&^WG+d!CAXW=C>LzqjCar3#*H!3DRC;-NO>ISq_`@d2 zmAqB19+zqkZ`NzA0cGOcD1k{5P|_@QW5SlhrjUDTKeV}6Xi8eLxlYU*1m3-=!cN~BI<`I(`guC-{GXry|`qS5UGMg_YhS4$FU z|LC{JqajJ);Qj{x9mHKM4H(J10||vD$W1t6iCij{$i#eQCt3)zksgn<#U&Y7YAU8( zN%Dn+@Ds&J?PYEKt={Rj`JBb=ziZ{&@>;@>a7C{UukiI`c@VxS>aAG+uXrSfP}z0A zB*Z`SPAZx1L?ygaWG-?RSty1waDfwD$fF2&IF*hI`8+Zi1s(?nE|n^9U4le3X2wXFi=!R6dCZIgGRYsvk~F zL=Izna(?)_lgMGt7;ih2)_{u!uQsf#C)`9dZkIMk5_oj*YWhDSJeP)2){*}N{G`^E|?ErpfeK)#gX9s}MffXF@ zbsYfuXzH{3jqz~^-e!td@!M%zC6EuKjpkq(|7E=*PO~u=t~IC zv+0KD+m~P|o$?aVH|5)qg5Vd3zATev46c-*Z(DX3=(`oyu>Sq%N}zwzWyvDH7hrF! z&)R@fJHR)(ZXZhm*20t;XYPdLpMkf&b>7g18hAVKu)%^KUcg1)E*9AhssxudU2ug~ zd%jLz+;)>li*iV+Z9aU$?-!u*zq+bY6Ab^JzN|50C(8D9(AQA9B&+(j4*In9Ni(Gt zd+feOpOKnA7iU+t`b>i2-m-tnomy8{h*Qgb#v09$!En;D7OrNE!X!HhaYq54%;eJ@ z={S$?fODxd9-U9&Gw5WBmbF8kjt}1(jbee7mLshR5($-2W46uUj*gUfx*7 zf(zYTNa7}>P4yV)yg1?V+%T5+%i(6~piFq!wLIdMK z{e>Nu&N(jwJ}1^FY=4S}O}<;}_nPhoC)N~YikfO*U*D31Z_MK1mRCpVGE|oDY;g0? 
z-`y6m{w~sy4ow@rXgMV1nCj?R)I#id^@dR^in|c$Uj9<{Wa!gX?Ga3Hce|Za(dzcH zw%(-p=o*`%uWhv^TGlw91UkjP0UV*Q%hdfT%pthQ3@v)eYV>eAl}2-K zFrAnVXxAkRDYz4z=IE$JueKmpH2+!<3zWVH-h1x;5ftWH=91&9UTM**S~vdY#Q|tT ztg?z6Pn~MOf;q>UQ%Xk%ut09^8uF~$ZXM|TaW^8n7=^jq#^uE$R(FD)kM7Cg`YafB z>b94g|Bnd0#J%+r7coNbQdHc>9zg_pCV^c*uZQQZbo=09C|tbrk0aAw1D&^ph5`F` z1JBwY=U<(&9=3d!;=b8*7nq#V_ogR8O*iO}Y=`e_C^%Qk7_-cj_50@?6WMUhnBwl%qg)hGq18>{{Ck_HB3rC81~XL zo9r>31*&{M-j6ubr=31{VdJ#(Ug}MjeWPghkWT0;OJ7^_9SVGp#`W?szK=H9ZaoJ~ z?88U&<%HB6Uq~V7J7(4e`d-BsN}fsY!GInYmh4EXhJ#-o9#}APA3R1EF;;LCz{=mv zef^-lU|QP#;?`UgRf2TR zPA@`x=E$d_4a3;cntP#zBn{f`VsWbfAx+%q!1jrnh^Iu^T+i$<5&pnwC@oy;-KVSs9POov7!dDMlYbxwH3&!;O34 zm1!j-hB~;Zjz>w)1P`KMkhUr8Os{%4wP&}niTx0BtcsXEN9sV^TO$cOyUf%=zef|u zxn!p@%O8Oa^%_~Kb9)jte`KPSw?~df|1+?tkGf?o_-T_mwSrZD^rYNvy$DJ@PWZiw zw8Cyd#+}`ZreK+#Vfqtfj{sxtSxlGQ9tcAvA1e_Lid4%o6zY!-2Zn)2lS=Q3lZDqOL!JxRgKm8Qc|++}dAP*JBiR z^sV1A{ecs(#;^?-aO{tGTH|mMQc7gV)+*y0}ObBex5 zI1)8^)IKk5oV9=f0oF(JtE-`#ygbL=d>FPSfAE{L6MeCuA9gJ`{DUq=VD{U`w(-6< z!`|A=Nk0GP9k6`xWYW{RLeP*H>wf{aQ0bMYm^AfZV>?aW4p9zj0BEt%idTYkiTQHX zwI|>*+cC*`-Ul!%cG%?NQ^u+ob6@L==SQgG5*pI4ogS~c=bHW^qm-lyClmF8^;nwr+HubsC;DGP_*r4Z-aMvqUz=&|*M__+} zy2BPQWZapoycx!-lYJ^#eTT7BKhLFGt?O-%qePPkJ7I_-u=29Z1+tL`!qA1@`N7^$6Yr zQsivr`IyyU&e_46UtT6-t~HGG+Zj_aD}Q=~+wymSaHFwLL7U&d07Un5nn7Cl0W(UP zp(=n%;tL|isK$+47d2qQMC_wpWNCFxC2WfiPeI%KJO{qS1iu4SInDjTR+_8wcWym9 zbJ{Rflwsk(gzYYv;wDpdVBAAMgrm7t8cv=98j)JTJW>3P!D%;$6A}?MC0z8qu(PO% zzDJBz=+2~&J3iux?9nGYwhdd4ZG^tICAp;Ql1O1F0^#TpXmT0K<>-;3M~ogWdL-!i zZ*~Dx&yN2e{~$hJ_dv&@2s}8HFP2Al=Hu*cZ6~8J9TF+> z<)Q8|Rt3;`*GJQv8)~8FVZ%d1X54|Xv+6BzlSWAM%W}30TL`Pyu393jcmur)XM2#g z)WO-w0Swyw^&oIWf?;C$EtnUGZz;Lj4B!5uKdINbcVMR7*71}d`(om!(g~4QE<;7! z3Ra(!Z$RS}{uN>TM!3R%MM6wb1DteXYTBN(i=gtt>3J!cjUd@;)%?~eXP~`Z;h2dl zhG9`oY`^-%3b-`J^v$sekKvb2>b`ULO`W`Y_}kb46#|b;kg-T-KePW`(0!E5`QDJb zFmsCadC#>Up!@Y5{reSJt4>`mP2a!!9Q?s2#^U*)dN3&2c}DB5%OG!q5qo>j_pr%j zNAToB5wIoZS1Y#(Z^2OUUcdCEjWBB9)g>l{SD}gUQlIRG1K`Gscx+zOXjN~W>L@+y zn_xFXl8`s<4VYFp($Ur=4YW4IZyzL{3uhEl4we+%hCT{f{KB|8;T1f` z_S)aIxxp9uwDLB8p+{TacF&@o2Ps#5ZyWZP^Y9#5sZmDK_Npy^nQKS+<_2}>> from mmdet.core import AnchorGenerator + >>> self = AnchorGenerator([16], [1.], [1.], [9]) + >>> all_anchors = self.grid_priors([(2, 2)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]])] + >>> self = AnchorGenerator([16, 32], [1.], [1.], [9, 18]) + >>> all_anchors = self.grid_priors([(2, 2), (1, 1)], device='cpu') + >>> print(all_anchors) + [tensor([[-4.5000, -4.5000, 4.5000, 4.5000], + [11.5000, -4.5000, 20.5000, 4.5000], + [-4.5000, 11.5000, 4.5000, 20.5000], + [11.5000, 11.5000, 20.5000, 20.5000]]), \ + tensor([[-9., -9., 9., 9.]])] + """ + + def __init__( + self, + strides, + ratios, + scales=None, + base_sizes=None, + scale_major=True, + octave_base_scale=None, + scales_per_octave=None, + centers=None, + center_offset=0.0, + ): + # check center and center_offset + if center_offset != 0: + assert centers is None, "center cannot be set when center_offset" f"!=0, {centers} is given." 
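+        # Worked example (mirrors the class-level doctest above): AnchorGenerator([16], [1.], [1.], [9])
+        # leaves centers=None and center_offset=0.0, so each base anchor is centered on its grid-cell
+        # origin; with base_size=9, scale=1.0 and ratio=1.0 the base anchor is [-4.5, -4.5, 4.5, 4.5],
+        # which grid_priors() later shifts by multiples of the 16-pixel stride.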
+ if not (0 <= center_offset <= 1): + raise ValueError("center_offset should be in range [0, 1], " f"{center_offset} is given.") + if centers is not None: + assert len(centers) == len(strides), ( + "The number of strides should be the same as centers, got " f"{strides} and {centers}" + ) + + # calculate base sizes of anchors + self.strides = [_pair(stride) for stride in strides] + self.base_sizes = [min(stride) for stride in self.strides] if base_sizes is None else base_sizes + assert len(self.base_sizes) == len(self.strides), ( + "The number of strides should be the same as base sizes, got " f"{self.strides} and {self.base_sizes}" + ) + + # calculate scales of anchors + assert (octave_base_scale is not None and scales_per_octave is not None) ^ (scales is not None), ( + "scales and octave_base_scale with scales_per_octave cannot" " be set at the same time" + ) + if scales is not None: + self.scales = torch.Tensor(scales) + elif octave_base_scale is not None and scales_per_octave is not None: + octave_scales = np.array([2 ** (i / scales_per_octave) for i in range(scales_per_octave)]) + scales = octave_scales * octave_base_scale + self.scales = torch.Tensor(scales) + else: + raise ValueError("Either scales or octave_base_scale with " "scales_per_octave should be set") + + self.octave_base_scale = octave_base_scale + self.scales_per_octave = scales_per_octave + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.centers = centers + self.center_offset = center_offset + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + """list[int]: total number of base anchors in a feature grid""" + return self.num_base_priors + + @property + def num_base_priors(self): + """list[int]: The number of priors (anchors) at a point + on the feature grid""" + return [base_anchors.size(0) for base_anchors in self.base_anchors] + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. + """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append( + self.gen_single_level_base_anchors(base_size, scales=self.scales, ratios=self.ratios, center=center) + ) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_size, scales, ratios, center=None): + """Generate base anchors of a single level. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. 
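+
+        Example (an illustrative sketch, assuming ``self`` was built as in the
+        class-level doctest, ``AnchorGenerator([16], [1.], [1.], [9])``):
+            >>> self.gen_single_level_base_anchors(9, torch.Tensor([1.]), torch.Tensor([1.]))
+            tensor([[-4.5000, -4.5000,  4.5000,  4.5000]])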
+ """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * w + y_center = self.center_offset * h + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [x_center - 0.5 * ws, y_center - 0.5 * hs, x_center + 0.5 * ws, y_center + 0.5 * hs] + base_anchors = torch.stack(base_anchors, dim=-1) + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + """Generate mesh grid of x and y. + + Args: + x (torch.Tensor): Grids of x dimension. + y (torch.Tensor): Grids of y dimension. + row_major (bool, optional): Whether to return y grids first. + Defaults to True. + + Returns: + tuple[torch.Tensor]: The mesh grids of x and y. + """ + # use shape instead of len to keep tracing while exporting to onnx + xx = x.repeat(y.shape[0]) + yy = y.view(-1, 1).repeat(1, x.shape[0]).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_priors(self, featmap_sizes, dtype=torch.float32, device="cuda"): + """Generate grid anchors in multiple feature levels. + + Args: + featmap_sizes (list[tuple]): List of feature map sizes in + multiple feature levels. + dtype (:obj:`torch.dtype`): Dtype of priors. + Default: torch.float32. + device (str): The device where the anchors will be put on. + + Return: + list[torch.Tensor]: Anchors in multiple feature levels. \ + The sizes of each tensor should be [N, 4], where \ + N = width * height * num_base_anchors, width and height \ + are the sizes of the corresponding feature level, \ + num_base_anchors is the number of anchors for that level. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_anchors = [] + for i in range(self.num_levels): + anchors = self.single_level_grid_priors(featmap_sizes[i], level_idx=i, dtype=dtype, device=device) + multi_level_anchors.append(anchors) + return multi_level_anchors + + def single_level_grid_priors(self, featmap_size, level_idx, dtype=torch.float32, device="cuda"): + """Generate grid anchors of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps. + level_idx (int): The index of corresponding feature map level. + dtype (obj:`torch.dtype`): Date type of points.Defaults to + ``torch.float32``. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: Anchors in the overall feature maps. + """ + + base_anchors = self.base_anchors[level_idx].to(device).to(dtype) + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + # First create Range with the default dtype, than convert to + # target `dtype` for onnx exporting. 
+ shift_x = torch.arange(0, feat_w, device=device).to(dtype) * stride_w + shift_y = torch.arange(0, feat_h, device=device).to(dtype) * stride_h + + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A anchors (1, A, 4) to K shifts (K, 1, 4) to get + # shifted anchors (K, A, 4), reshape to (K*A, 4) + + all_anchors = base_anchors[None, :, :] + shifts[:, None, :] + all_anchors = all_anchors.view(-1, 4) + # first A rows correspond to A anchors of (0, 0) in feature map, + # then (0, 1), (0, 2), ... + return all_anchors + + def valid_flags(self, featmap_sizes, pad_shape, device="cuda"): + """Generate valid flags of anchors in multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels. + pad_shape (tuple): The padded shape of the image. + device (str): Device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of anchors in multiple levels. + """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + anchor_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / anchor_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / anchor_stride[0])), feat_w) + flags = self.single_level_valid_flags( + (feat_h, feat_w), (valid_feat_h, valid_feat_w), self.num_base_anchors[i], device=device + ) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, featmap_size, valid_size, num_base_anchors, device="cuda"): + """Generate the valid flags of anchor in a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + num_base_anchors (int): The number of base anchors. + device (str, optional): Device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each anchor in a single level \ + feature map. 
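+
+        Example (an illustrative sketch: a 2x2 feature map with one base anchor
+        per location, where only the first row is inside the padded image):
+            >>> self.single_level_valid_flags((2, 2), (1, 2), 1, device='cpu')
+            tensor([ True,  True, False, False])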
+ """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + valid = valid[:, None].expand(valid.size(0), num_base_anchors).contiguous().view(-1) + return valid + + def __repr__(self): + """str: a string that describes the module""" + indent_str = " " + repr_str = self.__class__.__name__ + "(\n" + repr_str += f"{indent_str}strides={self.strides},\n" + repr_str += f"{indent_str}ratios={self.ratios},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}base_sizes={self.base_sizes},\n" + repr_str += f"{indent_str}scale_major={self.scale_major},\n" + repr_str += f"{indent_str}octave_base_scale=" + repr_str += f"{self.octave_base_scale},\n" + repr_str += f"{indent_str}scales_per_octave=" + repr_str += f"{self.scales_per_octave},\n" + repr_str += f"{indent_str}num_levels={self.num_levels}\n" + repr_str += f"{indent_str}centers={self.centers},\n" + repr_str += f"{indent_str}center_offset={self.center_offset})" + return repr_str + + +@PRIOR_GENERATORS.register_module() +class SSDAnchorGenerator(AnchorGenerator): + """Anchor generator for SSD. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + min_sizes (list[float]): The list of minimum anchor sizes on each + level. + max_sizes (list[float]): The list of maximum anchor sizes on each + level. + basesize_ratio_range (tuple(float)): Ratio range of anchors. Being + used when not setting min_sizes and max_sizes. + input_size (int): Size of feature map, 300 for SSD300, 512 for + SSD512. Being used when not setting min_sizes and max_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. It is always set to be False in SSD. 
+ """ + + def __init__( + self, + strides, + ratios, + min_sizes=None, + max_sizes=None, + basesize_ratio_range=(0.15, 0.9), + input_size=300, + scale_major=True, + ): + assert len(strides) == len(ratios) + assert not (min_sizes is None) ^ (max_sizes is None) + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2.0, stride[1] / 2.0) for stride in self.strides] + + if min_sizes is None and max_sizes is None: + # use hard code to generate SSD anchors + self.input_size = input_size + assert mmcv.is_tuple_of(basesize_ratio_range, float) + self.basesize_ratio_range = basesize_ratio_range + # calculate anchor ratios and sizes + min_ratio, max_ratio = basesize_ratio_range + min_ratio = int(min_ratio * 100) + max_ratio = int(max_ratio * 100) + step = int(np.floor(max_ratio - min_ratio) / (self.num_levels - 2)) + min_sizes = [] + max_sizes = [] + for ratio in range(int(min_ratio), int(max_ratio) + 1, step): + min_sizes.append(int(self.input_size * ratio / 100)) + max_sizes.append(int(self.input_size * (ratio + step) / 100)) + if self.input_size == 300: + if basesize_ratio_range[0] == 0.15: # SSD300 COCO + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + elif basesize_ratio_range[0] == 0.2: # SSD300 VOC + min_sizes.insert(0, int(self.input_size * 10 / 100)) + max_sizes.insert(0, int(self.input_size * 20 / 100)) + else: + raise ValueError( + "basesize_ratio_range[0] should be either 0.15" + "or 0.2 when input_size is 300, got " + f"{basesize_ratio_range[0]}." + ) + elif self.input_size == 512: + if basesize_ratio_range[0] == 0.1: # SSD512 COCO + min_sizes.insert(0, int(self.input_size * 4 / 100)) + max_sizes.insert(0, int(self.input_size * 10 / 100)) + elif basesize_ratio_range[0] == 0.15: # SSD512 VOC + min_sizes.insert(0, int(self.input_size * 7 / 100)) + max_sizes.insert(0, int(self.input_size * 15 / 100)) + else: + raise ValueError( + "When not setting min_sizes and max_sizes," + "basesize_ratio_range[0] should be either 0.1" + "or 0.15 when input_size is 512, got" + f" {basesize_ratio_range[0]}." + ) + else: + raise ValueError( + "Only support 300 or 512 in SSDAnchorGenerator when " + "not setting min_sizes and max_sizes, " + f"got {self.input_size}." + ) + + assert len(min_sizes) == len(max_sizes) == len(strides) + + anchor_ratios = [] + anchor_scales = [] + for k in range(len(self.strides)): + scales = [1.0, np.sqrt(max_sizes[k] / min_sizes[k])] + anchor_ratio = [1.0] + for r in ratios[k]: + anchor_ratio += [1 / r, r] # 4 or 6 ratio + anchor_ratios.append(torch.Tensor(anchor_ratio)) + anchor_scales.append(torch.Tensor(scales)) + + self.base_sizes = min_sizes + self.scales = anchor_scales + self.ratios = anchor_ratios + self.scale_major = scale_major + self.center_offset = 0 + self.base_anchors = self.gen_base_anchors() + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
+ """ + multi_level_base_anchors = [] + for i, base_size in enumerate(self.base_sizes): + base_anchors = self.gen_single_level_base_anchors( + base_size, scales=self.scales[i], ratios=self.ratios[i], center=self.centers[i] + ) + indices = list(range(len(self.ratios[i]))) + indices.insert(1, len(indices)) + base_anchors = torch.index_select(base_anchors, 0, torch.LongTensor(indices)) + multi_level_base_anchors.append(base_anchors) + return multi_level_base_anchors + + def __repr__(self): + """str: a string that describes the module""" + indent_str = " " + repr_str = self.__class__.__name__ + "(\n" + repr_str += f"{indent_str}strides={self.strides},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}scale_major={self.scale_major},\n" + repr_str += f"{indent_str}input_size={self.input_size},\n" + repr_str += f"{indent_str}scales={self.scales},\n" + repr_str += f"{indent_str}ratios={self.ratios},\n" + repr_str += f"{indent_str}num_levels={self.num_levels},\n" + repr_str += f"{indent_str}base_sizes={self.base_sizes},\n" + repr_str += f"{indent_str}basesize_ratio_range=" + repr_str += f"{self.basesize_ratio_range})" + return repr_str + + +@PRIOR_GENERATORS.register_module() +class LegacyAnchorGenerator(AnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + Note: + Difference to the V2.0 anchor generator: + + 1. The center offset of V1.x anchors are set to be 0.5 rather than 0. + 2. The width/height are minused by 1 when calculating the anchors' \ + centers and corners to meet the V1.x coordinate system. + 3. The anchors' corners are quantized. + + Args: + strides (list[int] | list[tuple[int]]): Strides of anchors + in multiple feature levels. + ratios (list[float]): The list of ratios between the height and width + of anchors in a single level. + scales (list[int] | None): Anchor scales for anchors in a single level. + It cannot be set at the same time if `octave_base_scale` and + `scales_per_octave` are set. + base_sizes (list[int]): The basic sizes of anchors in multiple levels. + If None is given, strides will be used to generate base_sizes. + scale_major (bool): Whether to multiply scales first when generating + base anchors. If true, the anchors in the same row will have the + same scales. By default it is True in V2.0 + octave_base_scale (int): The base scale of octave. + scales_per_octave (int): Number of scales for each octave. + `octave_base_scale` and `scales_per_octave` are usually used in + retinanet and the `scales` should be None when they are set. + centers (list[tuple[float, float]] | None): The centers of the anchor + relative to the feature grid center in multiple feature levels. + By default it is set to be None and not used. It a list of float + is given, this list will be used to shift the centers of anchors. + center_offset (float): The offset of center in proportion to anchors' + width and height. By default it is 0.5 in V2.0 but it should be 0.5 + in v1.x models. + + Examples: + >>> from mmdet.core import LegacyAnchorGenerator + >>> self = LegacyAnchorGenerator( + >>> [16], [1.], [1.], [9], center_offset=0.5) + >>> all_anchors = self.grid_anchors(((2, 2),), device='cpu') + >>> print(all_anchors) + [tensor([[ 0., 0., 8., 8.], + [16., 0., 24., 8.], + [ 0., 16., 8., 24.], + [16., 16., 24., 24.]])] + """ + + def gen_single_level_base_anchors(self, base_size, scales, ratios, center=None): + """Generate base anchors of a single level. 
+ + Note: + The width/height of anchors are minused by 1 when calculating \ + the centers and corners to meet the V1.x coordinate system. + + Args: + base_size (int | float): Basic size of an anchor. + scales (torch.Tensor): Scales of the anchor. + ratios (torch.Tensor): The ratio between between the height. + and width of anchors in a single level. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature map. + """ + w = base_size + h = base_size + if center is None: + x_center = self.center_offset * (w - 1) + y_center = self.center_offset * (h - 1) + else: + x_center, y_center = center + + h_ratios = torch.sqrt(ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * scales[None, :]).view(-1) + else: + ws = (w * scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * scales[:, None] * h_ratios[None, :]).view(-1) + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchors = [ + x_center - 0.5 * (ws - 1), + y_center - 0.5 * (hs - 1), + x_center + 0.5 * (ws - 1), + y_center + 0.5 * (hs - 1), + ] + base_anchors = torch.stack(base_anchors, dim=-1).round() + + return base_anchors + + +@PRIOR_GENERATORS.register_module() +class LegacySSDAnchorGenerator(SSDAnchorGenerator, LegacyAnchorGenerator): + """Legacy anchor generator used in MMDetection V1.x. + + The difference between `LegacySSDAnchorGenerator` and `SSDAnchorGenerator` + can be found in `LegacyAnchorGenerator`. + """ + + def __init__(self, strides, ratios, basesize_ratio_range, input_size=300, scale_major=True): + super(LegacySSDAnchorGenerator, self).__init__( + strides=strides, + ratios=ratios, + basesize_ratio_range=basesize_ratio_range, + input_size=input_size, + scale_major=scale_major, + ) + self.centers = [((stride - 1) / 2.0, (stride - 1) / 2.0) for stride in strides] + self.base_anchors = self.gen_base_anchors() + + +@PRIOR_GENERATORS.register_module() +class YOLOAnchorGenerator(AnchorGenerator): + """Anchor generator for YOLO. + + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels. + base_sizes (list[list[tuple[int, int]]]): The basic sizes + of anchors in multiple levels. + """ + + def __init__(self, strides, base_sizes): + self.strides = [_pair(stride) for stride in strides] + self.centers = [(stride[0] / 2.0, stride[1] / 2.0) for stride in self.strides] + self.base_sizes = [] + num_anchor_per_level = len(base_sizes[0]) + for base_sizes_per_level in base_sizes: + assert num_anchor_per_level == len(base_sizes_per_level) + self.base_sizes.append([_pair(base_size) for base_size in base_sizes_per_level]) + self.base_anchors = self.gen_base_anchors() + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.base_sizes) + + def gen_base_anchors(self): + """Generate base anchors. + + Returns: + list(torch.Tensor): Base anchors of a feature grid in multiple \ + feature levels. 
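+
+        Example (an illustrative sketch; the strides and YOLOv3-style base sizes
+        below are made-up values, not taken from this patch):
+            >>> yolo_gen = YOLOAnchorGenerator(
+            ...     strides=[32, 16],
+            ...     base_sizes=[[(116, 90), (156, 198)], [(30, 61), (62, 45)]])
+            >>> [anchors.shape for anchors in yolo_gen.gen_base_anchors()]
+            [torch.Size([2, 4]), torch.Size([2, 4])]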
+ """ + multi_level_base_anchors = [] + for i, base_sizes_per_level in enumerate(self.base_sizes): + center = None + if self.centers is not None: + center = self.centers[i] + multi_level_base_anchors.append(self.gen_single_level_base_anchors(base_sizes_per_level, center)) + return multi_level_base_anchors + + def gen_single_level_base_anchors(self, base_sizes_per_level, center=None): + """Generate base anchors of a single level. + + Args: + base_sizes_per_level (list[tuple[int, int]]): Basic sizes of + anchors. + center (tuple[float], optional): The center of the base anchor + related to a single feature grid. Defaults to None. + + Returns: + torch.Tensor: Anchors in a single-level feature maps. + """ + x_center, y_center = center + base_anchors = [] + for base_size in base_sizes_per_level: + w, h = base_size + + # use float anchor and the anchor's center is aligned with the + # pixel center + base_anchor = torch.Tensor([x_center - 0.5 * w, y_center - 0.5 * h, x_center + 0.5 * w, y_center + 0.5 * h]) + base_anchors.append(base_anchor) + base_anchors = torch.stack(base_anchors, dim=0) + + return base_anchors diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py new file mode 100644 index 000000000..ba002aca8 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/builder.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmcv.utils import Registry, build_from_cfg + +PRIOR_GENERATORS = Registry("Generator for anchors and points") + +ANCHOR_GENERATORS = PRIOR_GENERATORS + + +def build_prior_generator(cfg, default_args=None): + return build_from_cfg(cfg, PRIOR_GENERATORS, default_args) + + +def build_anchor_generator(cfg, default_args=None): + warnings.warn("``build_anchor_generator`` would be deprecated soon, please use " "``build_prior_generator`` ") + return build_prior_generator(cfg, default_args=default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py new file mode 100644 index 000000000..572665975 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/point_generator.py @@ -0,0 +1,172 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +import numpy as np +import torch +from torch.nn.modules.utils import _pair + +from .builder import PRIOR_GENERATORS + + +@PRIOR_GENERATORS.register_module() +class PointGenerator: + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def valid_flags(self, featmap_size, valid_size, device="cuda"): + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid + + +@PRIOR_GENERATORS.register_module() +class MlvlPointGenerator: + """Standard points generator for multi-level (Mlvl) feature maps in 2D + points-based detectors. 
+ + Args: + strides (list[int] | list[tuple[int, int]]): Strides of anchors + in multiple feature levels in order (w, h). + offset (float): The offset of points, the value is normalized with + corresponding stride. Defaults to 0.5. + """ + + def __init__(self, strides, offset=0.5): + self.strides = [_pair(stride) for stride in strides] + self.offset = offset + + @property + def num_levels(self): + """int: number of feature levels that the generator will be applied""" + return len(self.strides) + + @property + def num_base_priors(self): + """list[int]: The number of priors (points) at a point + on the feature grid""" + return [1 for _ in range(len(self.strides))] + + def _meshgrid(self, x, y, row_major=True): + yy, xx = torch.meshgrid(y, x) + if row_major: + # warning .flatten() would cause error in ONNX exporting + # have to use reshape here + return xx.reshape(-1), yy.reshape(-1) + + else: + return yy.reshape(-1), xx.reshape(-1) + + def single_level_grid_priors(self, featmap_size, level_idx, dtype=torch.float32, device="cuda", with_stride=False): + """Generate grid Points of a single level. + + Note: + This function is usually called by method ``self.grid_priors``. + + Args: + featmap_size (tuple[int]): Size of the feature maps, arrange as + (h, w). + level_idx (int): The index of corresponding feature map level. + dtype (:obj:`dtype`): Dtype of priors. Default: torch.float32. + device (str, optional): The device the tensor will be put on. + Defaults to 'cuda'. + with_stride (bool): Concatenate the stride to the last dimension + of points. + + Return: + Tensor: Points of single feature levels. + The shape of tensor should be (N, 2) when with stride is + ``False``, where N = width * height, width and height + are the sizes of the corresponding feature level, + and the last dimension 2 represent (coord_x, coord_y), + otherwise the shape should be (N, 4), + and the last dimension 4 represent + (coord_x, coord_y, stride_w, stride_h). + """ + feat_h, feat_w = featmap_size + stride_w, stride_h = self.strides[level_idx] + shift_x = (torch.arange(0, feat_w, device=device) + self.offset) * stride_w + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_x = shift_x.to(dtype) + + shift_y = (torch.arange(0, feat_h, device=device) + self.offset) * stride_h + # keep featmap_size as Tensor instead of int, so that we + # can convert to ONNX correctly + shift_y = shift_y.to(dtype) + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + if not with_stride: + shifts = torch.stack([shift_xx, shift_yy], dim=-1) + else: + # use `shape[0]` instead of `len(shift_xx)` for ONNX export + stride_w = shift_xx.new_full((shift_xx.shape[0],), stride_w).to(dtype) + stride_h = shift_xx.new_full((shift_yy.shape[0],), stride_h).to(dtype) + shifts = torch.stack([shift_xx, shift_yy, stride_w, stride_h], dim=-1) + all_points = shifts.to(device) + return all_points + + def valid_flags(self, featmap_sizes, pad_shape, device="cuda"): + """Generate valid flags of points of multiple feature levels. + + Args: + featmap_sizes (list(tuple)): List of feature map sizes in + multiple feature levels, each size arrange as + as (h, w). + pad_shape (tuple(int)): The padded shape of the image, + arrange as (h, w). + device (str): The device where the anchors will be put on. + + Return: + list(torch.Tensor): Valid flags of points of multiple levels. 
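+
+        Example (an illustrative sketch: a single 4x4 level with stride 8 and a
+        17x32 padded image, so one row of points is marked invalid):
+            >>> gen = MlvlPointGenerator(strides=[8])
+            >>> flags = gen.valid_flags([(4, 4)], (17, 32), device='cpu')
+            >>> int(flags[0].sum()), flags[0].numel()
+            (12, 16)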
+ """ + assert self.num_levels == len(featmap_sizes) + multi_level_flags = [] + for i in range(self.num_levels): + point_stride = self.strides[i] + feat_h, feat_w = featmap_sizes[i] + h, w = pad_shape[:2] + valid_feat_h = min(int(np.ceil(h / point_stride[1])), feat_h) + valid_feat_w = min(int(np.ceil(w / point_stride[0])), feat_w) + flags = self.single_level_valid_flags((feat_h, feat_w), (valid_feat_h, valid_feat_w), device=device) + multi_level_flags.append(flags) + return multi_level_flags + + def single_level_valid_flags(self, featmap_size, valid_size, device="cuda"): + """Generate the valid flags of points of a single feature map. + + Args: + featmap_size (tuple[int]): The size of feature maps, arrange as + as (h, w). + valid_size (tuple[int]): The valid size of the feature maps. + The size arrange as as (h, w). + device (str, optional): The device where the flags will be put on. + Defaults to 'cuda'. + + Returns: + torch.Tensor: The valid flags of each points in a single level \ + feature map. + """ + feat_h, feat_w = featmap_size + valid_h, valid_w = valid_size + assert valid_h <= feat_h and valid_w <= feat_w + valid_x = torch.zeros(feat_w, dtype=torch.bool, device=device) + valid_y = torch.zeros(feat_h, dtype=torch.bool, device=device) + valid_x[:valid_w] = 1 + valid_y[:valid_h] = 1 + valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) + valid = valid_xx & valid_yy + return valid diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py new file mode 100644 index 000000000..00988b218 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/anchor/utils.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def images_to_levels(target, num_levels): + """Convert targets by image to targets by feature level. + + [target_img0, target_img1] -> [target_level0, target_level1, ...] + """ + target = torch.stack(target, 0) + level_targets = [] + start = 0 + for n in num_levels: + end = start + n + # level_targets.append(target[:, start:end].squeeze(0)) + level_targets.append(target[:, start:end]) + start = end + return level_targets + + +def anchor_inside_flags(flat_anchors, valid_flags, img_shape, allowed_border=0): + """Check whether the anchors are inside the border. + + Args: + flat_anchors (torch.Tensor): Flatten anchors, shape (n, 4). + valid_flags (torch.Tensor): An existing valid flags of anchors. + img_shape (tuple(int)): Shape of current image. + allowed_border (int, optional): The border to allow the valid anchor. + Defaults to 0. + + Returns: + torch.Tensor: Flags indicating whether the anchors are inside a \ + valid range. + """ + img_h, img_w = img_shape[:2] + if allowed_border >= 0: + inside_flags = ( + valid_flags + & (flat_anchors[:, 0] >= -allowed_border) + & (flat_anchors[:, 1] >= -allowed_border) + & (flat_anchors[:, 2] < img_w + allowed_border) + & (flat_anchors[:, 3] < img_h + allowed_border) + ) + else: + inside_flags = valid_flags + return inside_flags + + +def calc_region(bbox, ratio, featmap_size=None): + """Calculate a proportional bbox region. + + The bbox center are fixed and the new h' and w' is h * ratio and w * ratio. + + Args: + bbox (Tensor): Bboxes to calculate regions, shape (n, 4). + ratio (float): Ratio of the output region. + featmap_size (tuple): Feature map size used for clipping the boundary. 
+ + Returns: + tuple: x1, y1, x2, y2 + """ + x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long() + y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long() + x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long() + y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long() + if featmap_size is not None: + x1 = x1.clamp(min=0, max=featmap_size[1]) + y1 = y1.clamp(min=0, max=featmap_size[0]) + x2 = x2.clamp(min=0, max=featmap_size[1]) + y2 = y2.clamp(min=0, max=featmap_size[0]) + return (x1, y1, x2, y2) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py new file mode 100644 index 000000000..ead3090af --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .builder import build_bbox_coder +from .coder import BaseBBoxCoder diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py new file mode 100644 index 000000000..f23da515f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .base_assigner import BaseAssigner diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py new file mode 100644 index 000000000..b013d933b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/assigners/base_assigner.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + + +class BaseAssigner(metaclass=ABCMeta): + """Base assigner that assigns boxes to ground truth boxes.""" diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py new file mode 100644 index 000000000..ae668c05f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/builder.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry, build_from_cfg + +BBOX_ASSIGNERS = Registry("bbox_assigner") +BBOX_SAMPLERS = Registry("bbox_sampler") +BBOX_CODERS = Registry("bbox_coder") + + +def build_bbox_coder(cfg, **default_args): + """Builder of box coder.""" + return build_from_cfg(cfg, BBOX_CODERS, default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py new file mode 100644 index 000000000..c56326d2e --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
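+# This vendored subpackage re-exports only the base coder and the distance-point coder.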
+from .base_bbox_coder import BaseBBoxCoder +from .distance_point_bbox_coder import DistancePointBBoxCoder diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py new file mode 100644 index 000000000..d56fe1e50 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/base_bbox_coder.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + + +class BaseBBoxCoder(metaclass=ABCMeta): + """Base bounding box coder.""" + + def __init__(self, **kwargs): + pass diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py new file mode 100644 index 000000000..b88be8af0 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/coder/distance_point_bbox_coder.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from ..builder import BBOX_CODERS +from .base_bbox_coder import BaseBBoxCoder + + +@BBOX_CODERS.register_module() +class DistancePointBBoxCoder(BaseBBoxCoder): + """Distance Point BBox coder. + + This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left, + right) and decode it back to the original. + + Args: + clip_border (bool, optional): Whether clip the objects outside the + border of the image. Defaults to True. + """ + + def __init__(self, clip_border=True): + super(BaseBBoxCoder, self).__init__() + self.clip_border = clip_border + + def encode(self, points, gt_bboxes, max_dis=None, eps=0.1): + """Encode bounding box to distances. + + Args: + points (Tensor): Shape (N, 2), The format is [x, y]. + gt_bboxes (Tensor): Shape (N, 4), The format is "xyxy" + max_dis (float): Upper bound of the distance. Default None. + eps (float): a small value to ensure target < max_dis, instead <=. + Default 0.1. + + Returns: + Tensor: Box transformation deltas. The shape is (N, 4). + """ + assert points.size(0) == gt_bboxes.size(0) + assert points.size(-1) == 2 + assert gt_bboxes.size(-1) == 4 + return bbox2distance(points, gt_bboxes, max_dis, eps) + + def decode(self, points, pred_bboxes, max_shape=None): + """Decode distance prediction to bounding box. + + Args: + points (Tensor): Shape (B, N, 2) or (N, 2). + pred_bboxes (Tensor): Distance from the given point to 4 + boundaries (left, top, right, bottom). Shape (B, N, 4) + or (N, 4) + max_shape (Sequence[int] or torch.Tensor or Sequence[ + Sequence[int]],optional): Maximum bounds for boxes, specifies + (H, W, C) or (H, W). If priors shape is (B, N, 4), then + the max_shape should be a Sequence[Sequence[int]], + and the length of max_shape should also be B. + Default None. 
+ Returns: + Tensor: Boxes with shape (N, 4) or (B, N, 4) + """ + assert points.size(0) == pred_bboxes.size(0) + assert points.size(-1) == 2 + assert pred_bboxes.size(-1) == 4 + if self.clip_border is False: + max_shape = None + return distance2bbox(points, pred_bboxes, max_shape) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py new file mode 100644 index 000000000..e3eae0288 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/__init__.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import build_match_cost +from .match_cost import ( + BBoxL1Cost, + ClassificationCost, + CrossEntropyLossCost, + DiceCost, + FocalLossCost, + IoUCost, +) + +__all__ = [ + "build_match_cost", + "ClassificationCost", + "BBoxL1Cost", + "IoUCost", + "FocalLossCost", + "DiceCost", + "CrossEntropyLossCost", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py new file mode 100644 index 000000000..341cee374 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/builder.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import Registry, build_from_cfg + +MATCH_COST = Registry("Match Cost") + + +def build_match_cost(cfg, default_args=None): + """Builder of IoU calculator.""" + return build_from_cfg(cfg, MATCH_COST, default_args) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py new file mode 100644 index 000000000..642f7530e --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/bbox/match_costs/match_cost.py @@ -0,0 +1,345 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F + +from .builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBoxL1Cost: + """BBoxL1Cost. + + Args: + weight (int | float, optional): loss_weight + box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost + >>> import torch + >>> self = BBoxL1Cost() + >>> bbox_pred = torch.rand(1, 4) + >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(bbox_pred, gt_bboxes, factor) + tensor([[1.6172, 1.6422]]) + """ + + def __init__(self, weight=1.0, box_format="xyxy"): + self.weight = weight + assert box_format in ["xyxy", "xywh"] + self.box_format = box_format + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + (num_query, 4). + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). 
+ + Returns: + torch.Tensor: bbox_cost value with weight + """ + if self.box_format == "xywh": + gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) + elif self.box_format == "xyxy": + bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight + + +@MATCH_COST.register_module() +class FocalLossCost: + """FocalLossCost. + + Args: + weight (int | float, optional): loss_weight + alpha (int | float, optional): focal_loss alpha + gamma (int | float, optional): focal_loss gamma + eps (float, optional): default 1e-12 + binary_input (bool, optional): Whether the input is binary, + default False. + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost + >>> import torch + >>> self = FocalLossCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3236, -0.3364, -0.2699], + [-0.3439, -0.3209, -0.4807], + [-0.4099, -0.3795, -0.2929], + [-0.1950, -0.1207, -0.2626]]) + """ + + def __init__(self, weight=1.0, alpha=0.25, gamma=2, eps=1e-12, binary_input=False): + self.weight = weight + self.alpha = alpha + self.gamma = gamma + self.eps = eps + self.binary_input = binary_input + + def _focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_query, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * (1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * (1 - cls_pred).pow(self.gamma) + + cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels] + return cls_cost * self.weight + + def _mask_focal_loss_cost(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits + in shape (num_query, d1, ..., dn), dtype=torch.float32. + gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), + dtype=torch.long. Labels should be binary. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1) + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + cls_pred = cls_pred.sigmoid() + neg_cost = -(1 - cls_pred + self.eps).log() * (1 - self.alpha) * cls_pred.pow(self.gamma) + pos_cost = -(cls_pred + self.eps).log() * self.alpha * (1 - cls_pred).pow(self.gamma) + + cls_cost = torch.einsum("nc,mc->nm", pos_cost, gt_labels) + torch.einsum("nc,mc->nm", neg_cost, (1 - gt_labels)) + return cls_cost / n * self.weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classfication logits. + gt_labels (Tensor)): Labels. + + Returns: + Tensor: Focal cost matrix with weight in shape\ + (num_query, num_gt). + """ + if self.binary_input: + return self._mask_focal_loss_cost(cls_pred, gt_labels) + else: + return self._focal_loss_cost(cls_pred, gt_labels) + + +@MATCH_COST.register_module() +class ClassificationCost: + """ClsSoftmaxCost. + + Args: + weight (int | float, optional): loss_weight + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import \ + ... 
ClassificationCost + >>> import torch + >>> self = ClassificationCost() + >>> cls_pred = torch.rand(4, 3) + >>> gt_labels = torch.tensor([0, 1, 2]) + >>> factor = torch.tensor([10, 8, 10, 8]) + >>> self(cls_pred, gt_labels) + tensor([[-0.3430, -0.3525, -0.3045], + [-0.3077, -0.2931, -0.3992], + [-0.3664, -0.3455, -0.2881], + [-0.3343, -0.2701, -0.3956]]) + """ + + def __init__(self, weight=1.0): + self.weight = weight + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits, shape + (num_query, num_class). + gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). + + Returns: + torch.Tensor: cls_cost value with weight + """ + # Following the official DETR repo, contrary to the loss that + # NLL is used, we approximate it in 1 - cls_score[gt_label]. + # The 1 is a constant that doesn't change the matching, + # so it can be omitted. + cls_score = cls_pred.softmax(-1) + cls_cost = -cls_score[:, gt_labels] + return cls_cost * self.weight + + +@MATCH_COST.register_module() +class IoUCost: + """IoUCost. + + Args: + iou_mode (str, optional): iou mode such as 'iou' | 'giou' + weight (int | float, optional): loss weight + + Examples: + >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost + >>> import torch + >>> self = IoUCost() + >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) + >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) + >>> self(bboxes, gt_bboxes) + tensor([[-0.1250, 0.1667], + [ 0.1667, -0.5000]]) + """ + + def __init__(self, iou_mode="giou", weight=1.0): + self.weight = weight + self.iou_mode = iou_mode + + def __call__(self, bboxes, gt_bboxes): + """ + Args: + bboxes (Tensor): Predicted boxes with unnormalized coordinates + (x1, y1, x2, y2). Shape (num_query, 4). + gt_bboxes (Tensor): Ground truth boxes with unnormalized + coordinates (x1, y1, x2, y2). Shape (num_gt, 4). + + Returns: + torch.Tensor: iou_cost value with weight + """ + # overlaps: [num_bboxes, num_gt] + overlaps = bbox_overlaps(bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) + # The 1 is a constant that doesn't change the matching, so omitted. + iou_cost = -overlaps + return iou_cost * self.weight + + +@MATCH_COST.register_module() +class DiceCost: + """Cost of mask assignments based on dice losses. + + Args: + weight (int | float, optional): loss_weight. Defaults to 1. + pred_act (bool, optional): Whether to apply sigmoid to mask_pred. + Defaults to False. + eps (float, optional): default 1e-12. + naive_dice (bool, optional): If True, use the naive dice loss + in which the power of the number in the denominator is + the first power. If Flase, use the second power that + is adopted by K-Net and SOLO. + Defaults to True. + """ + + def __init__(self, weight=1.0, pred_act=False, eps=1e-3, naive_dice=True): + self.weight = weight + self.pred_act = pred_act + self.eps = eps + self.naive_dice = naive_dice + + def binary_mask_dice_loss(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction in shape (num_query, *). + gt_masks (Tensor): Ground truth in shape (num_gt, *) + store 0 or 1, 0 for negative class and 1 for + positive class. + + Returns: + Tensor: Dice cost matrix in shape (num_query, num_gt). 
+ """ + mask_preds = mask_preds.flatten(1) + gt_masks = gt_masks.flatten(1).float() + numerator = 2 * torch.einsum("nc,mc->nm", mask_preds, gt_masks) + if self.naive_dice: + denominator = mask_preds.sum(-1)[:, None] + gt_masks.sum(-1)[None, :] + else: + denominator = mask_preds.pow(2).sum(1)[:, None] + gt_masks.pow(2).sum(1)[None, :] + loss = 1 - (numerator + self.eps) / (denominator + self.eps) + return loss + + def __call__(self, mask_preds, gt_masks): + """ + Args: + mask_preds (Tensor): Mask prediction logits in shape (num_query, *) + gt_masks (Tensor): Ground truth in shape (num_gt, *) + + Returns: + Tensor: Dice cost matrix with weight in shape (num_query, num_gt). + """ + if self.pred_act: + mask_preds = mask_preds.sigmoid() + dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks) + return dice_cost * self.weight + + +@MATCH_COST.register_module() +class CrossEntropyLossCost: + """CrossEntropyLossCost. + + Args: + weight (int | float, optional): loss weight. Defaults to 1. + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to True. + Examples: + >>> from mmdet.core.bbox.match_costs import CrossEntropyLossCost + >>> import torch + >>> bce = CrossEntropyLossCost(use_sigmoid=True) + >>> cls_pred = torch.tensor([[7.6, 1.2], [-1.3, 10]]) + >>> gt_labels = torch.tensor([[1, 1], [1, 0]]) + >>> print(bce(cls_pred, gt_labels)) + """ + + def __init__(self, weight=1.0, use_sigmoid=True): + assert use_sigmoid, "use_sigmoid = False is not supported yet." + self.weight = weight + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): The prediction with shape (num_query, 1, *) or + (num_query, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_query, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits(cls_pred, torch.ones_like(cls_pred), reduction="none") + neg = F.binary_cross_entropy_with_logits(cls_pred, torch.zeros_like(cls_pred), reduction="none") + cls_cost = torch.einsum("nc,mc->nm", pos, gt_labels) + torch.einsum("nc,mc->nm", neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, cls_pred, gt_labels): + """ + Args: + cls_pred (Tensor): Predicted classification logits. + gt_labels (Tensor): Labels. + + Returns: + Tensor: Cross entropy cost matrix with weight in + shape (num_query, num_gt). + """ + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(cls_pred, gt_labels) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py new file mode 100644 index 000000000..348904611 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
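For reference, a small self-contained sketch (plain PyTorch; the logits, labels, and hyper-parameter values are illustrative only) of the cost-matrix computation performed by `FocalLossCost._focal_loss_cost` above. The resulting (num_query, num_gt) matrix is the kind of input typically fed to a Hungarian-style assigner:

```python
import torch

# Toy classification logits for 4 queries over 3 classes, and 3 gt labels.
cls_pred = torch.tensor([[ 2.0, -1.0,  0.5],
                         [-0.5,  1.5,  0.0],
                         [ 0.0,  0.0,  0.0],
                         [ 1.0,  2.0, -2.0]])
gt_labels = torch.tensor([0, 1, 2])
alpha, gamma, eps = 0.25, 2.0, 1e-12

p = cls_pred.sigmoid()
neg_cost = -(1 - p + eps).log() * (1 - alpha) * p.pow(gamma)
pos_cost = -(p + eps).log() * alpha * (1 - p).pow(gamma)

# Row i, column j: cost of matching query i to ground-truth box j.
cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
print(cost.shape)  # torch.Size([4, 3])
```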
+from .dist_utils import sync_random_seed
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py
new file mode 100644
index 000000000..4b3eae461
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/core/utils/dist_utils.py
@@ -0,0 +1,48 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+
+def sync_random_seed(seed=None, device="cuda"):
+    """Make sure different ranks share the same seed.
+
+    All workers must call this function, otherwise it will deadlock.
+    This method is generally used in `DistributedSampler`,
+    because the seed should be identical across all processes
+    in the distributed group.
+
+    In distributed sampling, different ranks should sample non-overlapped
+    data in the dataset. Therefore, this function is used to make sure that
+    each rank shuffles the data indices in the same order based
+    on the same seed. Then different ranks could use different indices
+    to select non-overlapped data from the same data list.
+
+    Args:
+        seed (int, Optional): The seed. Default to None.
+        device (str): The device where the seed will be put on.
+            Default to 'cuda'.
+
+    Returns:
+        int: Seed to be used.
+    """
+    if seed is None:
+        seed = np.random.randint(2**31)
+    assert isinstance(seed, int)
+
+    rank, world_size = get_dist_info()
+
+    if world_size == 1:
+        return seed
+
+    if rank == 0:
+        random_num = torch.tensor(seed, dtype=torch.int32, device=device)
+    else:
+        random_num = torch.tensor(0, dtype=torch.int32, device=device)
+    dist.broadcast(random_num, src=0)
+    return random_num.item()
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py
new file mode 100644
index 000000000..ee50982e4
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py
new file mode 100644
index 000000000..ec9ee2ad2
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/builder.py
@@ -0,0 +1,214 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# Copyright (c) OpenMMLab. All rights reserved.
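The seed synchronisation in `dist_utils.py` boils down to a rank-0 broadcast. The sketch below (plain PyTorch; the function name and the single-process guard are mine, added so the snippet also runs without an initialised process group) illustrates the idea:

```python
import numpy as np
import torch
import torch.distributed as dist


def sync_seed_sketch(seed=None, device="cpu"):
    """Hypothetical helper mirroring the rank-0 seed broadcast above."""
    if seed is None:
        seed = np.random.randint(2**31)
    if not (dist.is_available() and dist.is_initialized()):
        # Nothing to synchronise outside an initialised process group.
        return seed
    rank = dist.get_rank()
    payload = torch.tensor(seed if rank == 0 else 0, dtype=torch.int32, device=device)
    dist.broadcast(payload, src=0)  # every rank now holds rank 0's value
    return int(payload.item())


print(sync_seed_sketch(42))  # 42 in a single-process run
```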
+import copy +import platform +import random +import warnings +from functools import partial + +import numpy as np +import torch +from mmcv.parallel import collate +from mmcv.runner import get_dist_info +from mmcv.utils import TORCH_VERSION, Registry, build_from_cfg, digit_version +from torch.utils.data import DataLoader + +from .samplers import ( + ClassAwareSampler, + DistributedGroupSampler, + DistributedSampler, + GroupSampler, + InfiniteBatchSampler, + InfiniteGroupBatchSampler, +) + +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + +DATASETS = Registry("dataset") +PIPELINES = Registry("pipeline") + + +def _concat_dataset(cfg, default_args=None): + from .dataset_wrappers import ConcatDataset + + ann_files = cfg["ann_file"] + img_prefixes = cfg.get("img_prefix", None) + seg_prefixes = cfg.get("seg_prefix", None) + proposal_files = cfg.get("proposal_file", None) + separate_eval = cfg.get("separate_eval", True) + + datasets = [] + num_dset = len(ann_files) + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + # pop 'separate_eval' since it is not a valid key for common datasets. + if "separate_eval" in data_cfg: + data_cfg.pop("separate_eval") + data_cfg["ann_file"] = ann_files[i] + if isinstance(img_prefixes, (list, tuple)): + data_cfg["img_prefix"] = img_prefixes[i] + if isinstance(seg_prefixes, (list, tuple)): + data_cfg["seg_prefix"] = seg_prefixes[i] + if isinstance(proposal_files, (list, tuple)): + data_cfg["proposal_file"] = proposal_files[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets, separate_eval) + + +def build_dataset(cfg, default_args=None): + from .dataset_wrappers import ( + ClassBalancedDataset, + ConcatDataset, + MultiImageMixDataset, + RepeatDataset, + ) + + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg["type"] == "ConcatDataset": + dataset = ConcatDataset( + [build_dataset(c, default_args) for c in cfg["datasets"]], cfg.get("separate_eval", True) + ) + elif cfg["type"] == "RepeatDataset": + dataset = RepeatDataset(build_dataset(cfg["dataset"], default_args), cfg["times"]) + elif cfg["type"] == "ClassBalancedDataset": + dataset = ClassBalancedDataset(build_dataset(cfg["dataset"], default_args), cfg["oversample_thr"]) + elif cfg["type"] == "MultiImageMixDataset": + cp_cfg = copy.deepcopy(cfg) + cp_cfg["dataset"] = build_dataset(cp_cfg["dataset"]) + cp_cfg.pop("type") + dataset = MultiImageMixDataset(**cp_cfg) + elif isinstance(cfg.get("ann_file"), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset + + +def build_dataloader( + dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + runner_type="EpochBasedRunner", + persistent_workers=False, + class_aware_sampler=None, + **kwargs +): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. + + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. 
+ workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + seed (int, Optional): Seed to be used. Default: None. + runner_type (str): Type of runner. Default: `EpochBasedRunner` + persistent_workers (bool): If True, the data loader will not shutdown + the worker processes after a dataset has been consumed once. + This allows to maintain the workers `Dataset` instances alive. + This argument is only valid when PyTorch>=1.7.0. Default: False. + class_aware_sampler (dict): Whether to use `ClassAwareSampler` + during training. Default: None. + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + + if dist: + # When model is :obj:`DistributedDataParallel`, + # `batch_size` of :obj:`dataloader` is the + # number of training samples on each GPU. + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + # When model is obj:`DataParallel` + # the batch size is samples on all the GPUS + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + if runner_type == "IterBasedRunner": + # this is a batch sampler, which can yield + # a mini-batch indices each time. + # it can be used in both `DataParallel` and + # `DistributedDataParallel` + if shuffle: + batch_sampler = InfiniteGroupBatchSampler(dataset, batch_size, world_size, rank, seed=seed) + else: + batch_sampler = InfiniteBatchSampler(dataset, batch_size, world_size, rank, seed=seed, shuffle=False) + batch_size = 1 + sampler = None + else: + if class_aware_sampler is not None: + # ClassAwareSampler can be used in both distributed and + # non-distributed training. 
+ num_sample_class = class_aware_sampler.get("num_sample_class", 1) + sampler = ClassAwareSampler( + dataset, samples_per_gpu, world_size, rank, seed=seed, num_sample_class=num_sample_class + ) + elif dist: + # DistributedGroupSampler will definitely shuffle the data to + # satisfy that images on each GPU are in the same group + if shuffle: + sampler = DistributedGroupSampler(dataset, samples_per_gpu, world_size, rank, seed=seed) + else: + sampler = DistributedSampler(dataset, world_size, rank, shuffle=False, seed=seed) + else: + sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None + batch_sampler = None + + init_fn = partial(worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None + + if TORCH_VERSION != "parrots" and digit_version(TORCH_VERSION) >= digit_version("1.7.0"): + kwargs["persistent_workers"] = persistent_workers + elif persistent_workers is True: + warnings.warn("persistent_workers is invalid because your pytorch " "version is lower than 1.7.0") + + data_loader = DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=kwargs.pop("pin_memory", False), + worker_init_fn=init_fn, + **kwargs + ) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + # The seed of each worker equals to + # num_worker * rank + worker_id + user_seed + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py new file mode 100644 index 000000000..7dde517f0 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .compose import Compose +from .formatting import to_tensor +from .loading import LoadAnnotations, LoadImageFromFile diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py new file mode 100644 index 000000000..16cb9d062 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/compose.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import collections + +from mmcv.utils import build_from_cfg + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose: + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. + """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError("transform must be callable or a dict") + + def __call__(self, data): + """Call function to apply transforms sequentially. 
+ + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + str_ = t.__repr__() + if "Compose(" in str_: + str_ = str_.replace("\n", "\n ") + format_string += "\n" + format_string += f" {str_}" + format_string += "\n)" + return format_string diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py new file mode 100644 index 000000000..55fcd087a --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/formatting.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Sequence + +import mmcv +import numpy as np +import torch + +from ..builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f"type {type(data)} cannot be converted to tensor.") + + +@PIPELINES.register_module() +class DefaultFormatBundle: + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + + Args: + img_to_float (bool): Whether to force the image to be converted to + float type. Default: True. + pad_val (dict): A dict for padding value in batch collating, + the default value is `dict(img=0, masks=0, seg=255)`. + Without this argument, the padding value of "gt_semantic_seg" + will be set to 0 by default, which should be 255. + """ + + def __init__(self, img_to_float=True, pad_val=dict(img=0, masks=0, seg=255)): + self.img_to_float = img_to_float + self.pad_val = pad_val + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with \ + default bundle. + """ + + if "img" in results: + img = results["img"] + if self.img_to_float is True and img.dtype == np.uint8: + # Normally, image is of uint8 type without normalization. 
+ # At this time, it needs to be forced to be converted to + # flot32, otherwise the model training and inference + # will be wrong. Only used for YOLOX currently . + img = img.astype(np.float32) + # add default meta keys + results = self._add_default_meta_keys(results) + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results["img"] = DC(to_tensor(img), padding_value=self.pad_val["img"], stack=True) + for key in ["proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels"]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key])) + if "gt_masks" in results: + results["gt_masks"] = DC(results["gt_masks"], padding_value=self.pad_val["masks"], cpu_only=True) + if "gt_semantic_seg" in results: + results["gt_semantic_seg"] = DC( + to_tensor(results["gt_semantic_seg"][None, ...]), padding_value=self.pad_val["seg"], stack=True + ) + return results + + def _add_default_meta_keys(self, results): + """Add default meta keys. + + We set default meta keys including `pad_shape`, `scale_factor` and + `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and + `Pad` are implemented during the whole pipeline. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + results (dict): Updated result dict contains the data to convert. + """ + img = results["img"] + results.setdefault("pad_shape", img.shape) + results.setdefault("scale_factor", 1.0) + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results.setdefault( + "img_norm_cfg", + dict( + mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False + ), + ) + return results + + def __repr__(self): + return self.__class__.__name__ + f"(img_to_float={self.img_to_float})" diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py new file mode 100644 index 000000000..35e2b9789 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/pipelines/loading.py @@ -0,0 +1,289 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp + +import mmcv +import numpy as np +import pycocotools.mask as maskUtils + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile: + """Load an image from file. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. 
+ """ + + def __init__( + self, to_float32=False, color_type="color", channel_order="bgr", file_client_args=dict(backend="disk") + ): + self.to_float32 = to_float32 + self.color_type = color_type + self.channel_order = channel_order + self.file_client_args = file_client_args.copy() + self.file_client = None + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results["img_prefix"] is not None: + filename = osp.join(results["img_prefix"], results["img_info"]["filename"]) + else: + filename = results["img_info"]["filename"] + + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes(img_bytes, flag=self.color_type, channel_order=self.channel_order) + if self.to_float32: + img = img.astype(np.float32) + + results["filename"] = filename + results["ori_filename"] = results["img_info"]["filename"] + results["img"] = img + results["img_shape"] = img.shape + results["ori_shape"] = img.shape + results["img_fields"] = ["img"] + return results + + def __repr__(self): + repr_str = ( + f"{self.__class__.__name__}(" + f"to_float32={self.to_float32}, " + f"color_type='{self.color_type}', " + f"channel_order='{self.channel_order}', " + f"file_client_args={self.file_client_args})" + ) + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations: + """Load multiple types of annotations. + + Args: + with_bbox (bool): Whether to parse and load the bbox annotation. + Default: True. + with_label (bool): Whether to parse and load the label annotation. + Default: True. + with_mask (bool): Whether to parse and load the mask annotation. + Default: False. + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Default: False. + poly2mask (bool): Whether to convert the instance masks from polygons + to bitmaps. Default: True. + denorm_bbox (bool): Whether to convert bbox from relative value to + absolute value. Only used in OpenImage Dataset. + Default: False. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + """ + + def __init__( + self, + with_bbox=True, + with_label=True, + with_mask=False, + with_seg=False, + poly2mask=True, + denorm_bbox=False, + file_client_args=dict(backend="disk"), + ): + self.with_bbox = with_bbox + self.with_label = with_label + self.with_mask = with_mask + self.with_seg = with_seg + self.poly2mask = poly2mask + self.denorm_bbox = denorm_bbox + self.file_client_args = file_client_args.copy() + self.file_client = None + + def _load_bboxes(self, results): + """Private function to load bounding box annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box annotations. 
+ """ + + ann_info = results["ann_info"] + results["gt_bboxes"] = ann_info["bboxes"].copy() + + if self.denorm_bbox: + bbox_num = results["gt_bboxes"].shape[0] + if bbox_num != 0: + h, w = results["img_shape"][:2] + results["gt_bboxes"][:, 0::2] *= w + results["gt_bboxes"][:, 1::2] *= h + + gt_bboxes_ignore = ann_info.get("bboxes_ignore", None) + if gt_bboxes_ignore is not None: + results["gt_bboxes_ignore"] = gt_bboxes_ignore.copy() + results["bbox_fields"].append("gt_bboxes_ignore") + results["bbox_fields"].append("gt_bboxes") + + gt_is_group_ofs = ann_info.get("gt_is_group_ofs", None) + if gt_is_group_ofs is not None: + results["gt_is_group_ofs"] = gt_is_group_ofs.copy() + + return results + + def _load_labels(self, results): + """Private function to load label annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded label annotations. + """ + + results["gt_labels"] = results["ann_info"]["labels"].copy() + return results + + def _poly2mask(self, mask_ann, img_h, img_w): + """Private function to convert masks represented with polygon to + bitmaps. + + Args: + mask_ann (list | dict): Polygon mask annotation input. + img_h (int): The height of output mask. + img_w (int): The width of output mask. + + Returns: + numpy.ndarray: The decode bitmap mask of shape (img_h, img_w). + """ + + if isinstance(mask_ann, list): + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) + rle = maskUtils.merge(rles) + elif isinstance(mask_ann["counts"], list): + # uncompressed RLE + rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) + else: + # rle + rle = mask_ann + mask = maskUtils.decode(rle) + return mask + + def process_polygons(self, polygons): + """Convert polygons to list of ndarray and filter invalid polygons. + + Args: + polygons (list[list]): Polygons of one instance. + + Returns: + list[numpy.ndarray]: Processed polygons. + """ + + polygons = [np.array(p) for p in polygons] + valid_polygons = [] + for polygon in polygons: + if len(polygon) % 2 == 0 and len(polygon) >= 6: + valid_polygons.append(polygon) + return valid_polygons + + def _load_masks(self, results): + """Private function to load mask annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded mask annotations. + If ``self.poly2mask`` is set ``True``, `gt_mask` will contain + :obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used. + """ + + h, w = results["img_info"]["height"], results["img_info"]["width"] + gt_masks = results["ann_info"]["masks"] + if self.poly2mask: + gt_masks = BitmapMasks([self._poly2mask(mask, h, w) for mask in gt_masks], h, w) + else: + gt_masks = PolygonMasks([self.process_polygons(polygons) for polygons in gt_masks], h, w) + results["gt_masks"] = gt_masks + results["mask_fields"].append("gt_masks") + return results + + def _load_semantic_seg(self, results): + """Private function to load semantic segmentation annotations. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. 
+ """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + filename = osp.join(results["seg_prefix"], results["ann_info"]["seg_map"]) + img_bytes = self.file_client.get(filename) + results["gt_semantic_seg"] = mmcv.imfrombytes(img_bytes, flag="unchanged").squeeze() + results["seg_fields"].append("gt_semantic_seg") + return results + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmdet.CustomDataset`. + + Returns: + dict: The dict contains loaded bounding box, label, mask and + semantic segmentation annotations. + """ + + if self.with_bbox: + results = self._load_bboxes(results) + if results is None: + return None + if self.with_label: + results = self._load_labels(results) + if self.with_mask: + results = self._load_masks(results) + if self.with_seg: + results = self._load_semantic_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(with_bbox={self.with_bbox}, " + repr_str += f"with_label={self.with_label}, " + repr_str += f"with_mask={self.with_mask}, " + repr_str += f"with_seg={self.with_seg}, " + repr_str += f"poly2mask={self.poly2mask}, " + repr_str += f"poly2mask={self.file_client_args})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py new file mode 100644 index 000000000..04ce19131 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/__init__.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .class_aware_sampler import ClassAwareSampler +from .distributed_sampler import DistributedSampler +from .group_sampler import DistributedGroupSampler, GroupSampler +from .infinite_sampler import InfiniteBatchSampler, InfiniteGroupBatchSampler + +__all__ = [ + "DistributedSampler", + "DistributedGroupSampler", + "GroupSampler", + "InfiniteGroupBatchSampler", + "InfiniteBatchSampler", + "ClassAwareSampler", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py new file mode 100644 index 000000000..393ef3feb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/class_aware_sampler.py @@ -0,0 +1,162 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from mmcv.runner import get_dist_info +from mmdet.core.utils import sync_random_seed +from torch.utils.data import Sampler + + +class ClassAwareSampler(Sampler): + r"""Sampler that restricts data loading to the label of the dataset. + + A class-aware sampling strategy to effectively tackle the + non-uniform class distribution. The length of the training data is + consistent with source data. Simple improvements based on `Relay + Backpropagation for Effective Learning of Deep Convolutional + Neural Networks `_ + + The implementation logic is referred to + https://github.com/Sense-X/TSD/blob/master/mmdet/datasets/samplers/distributed_classaware_sampler.py + + Args: + dataset: Dataset used for sampling. 
+ samples_per_gpu (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU. + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. + num_sample_class (int): The number of samples taken from each + per-label list. Default: 1 + """ + + def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0, num_sample_class=1): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + + self.dataset = dataset + self.num_replicas = num_replicas + self.samples_per_gpu = samples_per_gpu + self.rank = rank + self.epoch = 0 + # Must be the same across all workers. If None, will use a + # random seed shared among workers + # (require synchronization among all workers) + self.seed = sync_random_seed(seed) + + # The number of samples taken from each per-label list + assert num_sample_class > 0 and isinstance(num_sample_class, int) + self.num_sample_class = num_sample_class + # Get per-label image list from dataset + assert hasattr(dataset, "get_cat2imgs"), "dataset must have `get_cat2imgs` function" + self.cat_dict = dataset.get_cat2imgs() + + self.num_samples = ( + int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas / self.samples_per_gpu)) * self.samples_per_gpu + ) + self.total_size = self.num_samples * self.num_replicas + + # get number of images containing each category + self.num_cat_imgs = [len(x) for x in self.cat_dict.values()] + # filter labels without images + self.valid_cat_inds = [i for i, length in enumerate(self.num_cat_imgs) if length != 0] + self.num_classes = len(self.valid_cat_inds) + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + # initialize label list + label_iter_list = RandomCycleIter(self.valid_cat_inds, generator=g) + # initialize each per-label image list + data_iter_dict = dict() + for i in self.valid_cat_inds: + data_iter_dict[i] = RandomCycleIter(self.cat_dict[i], generator=g) + + def gen_cat_img_inds(cls_list, data_dict, num_sample_cls): + """Traverse the categories and extract `num_sample_cls` image + indexes of the corresponding categories one by one.""" + id_indices = [] + for _ in range(len(cls_list)): + cls_idx = next(cls_list) + for _ in range(num_sample_cls): + id = next(data_dict[cls_idx]) + id_indices.append(id) + return id_indices + + # deterministically shuffle based on epoch + num_bins = int(math.ceil(self.total_size * 1.0 / self.num_classes / self.num_sample_class)) + indices = [] + for i in range(num_bins): + indices += gen_cat_img_inds(label_iter_list, data_iter_dict, self.num_sample_class) + + # fix extra samples to make it evenly divisible + if len(indices) >= self.total_size: + indices = indices[: self.total_size] + else: + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + +class 
RandomCycleIter: + """Shuffle the list and do it again after the list have traversed. + + The implementation logic is referred to + https://github.com/wutong16/DistributionBalancedLoss/blob/master/mllt/datasets/loader/sampler.py + + Example: + >>> label_list = [0, 1, 2, 4, 5] + >>> g = torch.Generator() + >>> g.manual_seed(0) + >>> label_iter_list = RandomCycleIter(label_list, generator=g) + >>> index = next(label_iter_list) + Args: + data (list or ndarray): The data that needs to be shuffled. + generator: An torch.Generator object, which is used in setting the seed + for generating random numbers. + """ # noqa: W605 + + def __init__(self, data, generator=None): + self.data = data + self.length = len(data) + self.index = torch.randperm(self.length, generator=generator).numpy() + self.i = 0 + self.generator = generator + + def __iter__(self): + return self + + def __len__(self): + return len(self.data) + + def __next__(self): + if self.i == self.length: + self.index = torch.randperm(self.length, generator=self.generator).numpy() + self.i = 0 + idx = self.data[self.index[self.i]] + self.i += 1 + return idx diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py new file mode 100644 index 000000000..b9db3a0f2 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/distributed_sampler.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +from mmdet.core.utils import sync_random_seed +from mmdet.utils import get_device +from torch.utils.data import DistributedSampler as _DistributedSampler + + +class DistributedSampler(_DistributedSampler): + def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True, seed=0): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + device = get_device() + self.seed = sync_random_seed(seed, device) + + def __iter__(self): + # deterministically shuffle based on epoch + if self.shuffle: + g = torch.Generator() + # When :attr:`shuffle=True`, this ensures all replicas + # use a different random ordering for each epoch. + # Otherwise, the next iteration of this sampler will + # yield the same ordering. 
+ g.manual_seed(self.epoch + self.seed) + indices = torch.randperm(len(self.dataset), generator=g).tolist() + else: + indices = torch.arange(len(self.dataset)).tolist() + + # add extra samples to make it evenly divisible + # in case that indices is shorter than half of total_size + indices = (indices * math.ceil(self.total_size / len(indices)))[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py new file mode 100644 index 000000000..923cf4bed --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/group_sampler.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from torch.utils.data import Sampler + + +class GroupSampler(Sampler): + def __init__(self, dataset, samples_per_gpu=1): + assert hasattr(dataset, "flag") + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.flag = dataset.flag.astype(np.int64) + self.group_sizes = np.bincount(self.flag) + self.num_samples = 0 + for i, size in enumerate(self.group_sizes): + self.num_samples += int(np.ceil(size / self.samples_per_gpu)) * self.samples_per_gpu + + def __iter__(self): + indices = [] + for i, size in enumerate(self.group_sizes): + if size == 0: + continue + indice = np.where(self.flag == i)[0] + assert len(indice) == size + np.random.shuffle(indice) + num_extra = int(np.ceil(size / self.samples_per_gpu)) * self.samples_per_gpu - len(indice) + indice = np.concatenate([indice, np.random.choice(indice, num_extra)]) + indices.append(indice) + indices = np.concatenate(indices) + indices = [ + indices[i * self.samples_per_gpu : (i + 1) * self.samples_per_gpu] + for i in np.random.permutation(range(len(indices) // self.samples_per_gpu)) + ] + indices = np.concatenate(indices) + indices = indices.astype(np.int64).tolist() + assert len(indices) == self.num_samples + return iter(indices) + + def __len__(self): + return self.num_samples + + +class DistributedGroupSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + + .. note:: + Dataset is assumed to be of constant size. + + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + seed (int, optional): random seed used to shuffle the sampler if + ``shuffle=True``. This number should be identical across all + processes in the distributed group. Default: 0. 
+ """ + + def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): + _rank, _num_replicas = get_dist_info() + if num_replicas is None: + num_replicas = _num_replicas + if rank is None: + rank = _rank + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.seed = seed if seed is not None else 0 + + assert hasattr(self.dataset, "flag") + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += ( + int(math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) + * self.samples_per_gpu + ) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch + self.seed) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + # add .numpy() to avoid bug when selecting indice in parrots. + # TODO: check whether torch.randperm() can be replaced by + # numpy.random.permutation(). + indice = indice[list(torch.randperm(int(size), generator=g).numpy())].tolist() + extra = int( + math.ceil(size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + # pad indice + tmp = indice.copy() + for _ in range(extra // size): + indice.extend(tmp) + indice.extend(tmp[: extra % size]) + indices.extend(indice) + + assert len(indices) == self.total_size + + indices = [ + indices[j] + for i in list(torch.randperm(len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset : offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples diff --git a/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py new file mode 100644 index 000000000..11b4acd86 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/datasets/samplers/infinite_sampler.py @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import itertools + +import numpy as np +import torch +from mmcv.runner import get_dist_info +from mmdet.core.utils import sync_random_seed +from torch.utils.data.sampler import Sampler + + +class InfiniteGroupBatchSampler(Sampler): + """Similar to `BatchSampler` warping a `GroupSampler. It is designed for + iteration-based runners like `IterBasedRunner` and yields a mini-batch + indices each time, all indices in a batch should be in the same group. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py + + Args: + dataset (object): The dataset. + batch_size (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU. + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + world_size (int, optional): Number of processes participating in + distributed training. Default: None. 
+ rank (int, optional): Rank of current process. Default: None. + seed (int): Random seed. Default: 0. + shuffle (bool): Whether shuffle the indices of a dummy `epoch`, it + should be noted that `shuffle` can not guarantee that you can + generate sequential indices because it need to ensure + that all indices in a batch is in a group. Default: True. + """ # noqa: W605 + + def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0, shuffle=True): + _rank, _world_size = get_dist_info() + if world_size is None: + world_size = _world_size + if rank is None: + rank = _rank + self.rank = rank + self.world_size = world_size + self.dataset = dataset + self.batch_size = batch_size + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + self.seed = sync_random_seed(seed) + self.shuffle = shuffle + + assert hasattr(self.dataset, "flag") + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + # buffer used to save indices of each group + self.buffer_per_group = {k: [] for k in range(len(self.group_sizes))} + + self.size = len(dataset) + self.indices = self._indices_of_rank() + + def _infinite_indices(self): + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(self.size, generator=g).tolist() + + else: + yield from torch.arange(self.size).tolist() + + def _indices_of_rank(self): + """Slice the infinite indices by rank.""" + yield from itertools.islice(self._infinite_indices(), self.rank, None, self.world_size) + + def __iter__(self): + # once batch size is reached, yield the indices + for idx in self.indices: + flag = self.flag[idx] + group_buffer = self.buffer_per_group[flag] + group_buffer.append(idx) + if len(group_buffer) == self.batch_size: + yield group_buffer[:] + del group_buffer[:] + + def __len__(self): + """Length of base dataset.""" + return self.size + + +class InfiniteBatchSampler(Sampler): + """Similar to `BatchSampler` warping a `DistributedSampler. It is designed + iteration-based runners like `IterBasedRunner` and yields a mini-batch + indices each time. + + The implementation logic is referred to + https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/samplers/grouped_batch_sampler.py + + Args: + dataset (object): The dataset. + batch_size (int): When model is :obj:`DistributedDataParallel`, + it is the number of training samples on each GPU, + When model is :obj:`DataParallel`, it is + `num_gpus * samples_per_gpu`. + Default : 1. + world_size (int, optional): Number of processes participating in + distributed training. Default: None. + rank (int, optional): Rank of current process. Default: None. + seed (int): Random seed. Default: 0. + shuffle (bool): Whether shuffle the dataset or not. Default: True. 
+ """ # noqa: W605 + + def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0, shuffle=True): + _rank, _world_size = get_dist_info() + if world_size is None: + world_size = _world_size + if rank is None: + rank = _rank + self.rank = rank + self.world_size = world_size + self.dataset = dataset + self.batch_size = batch_size + # In distributed sampling, different ranks should sample + # non-overlapped data in the dataset. Therefore, this function + # is used to make sure that each rank shuffles the data indices + # in the same order based on the same seed. Then different ranks + # could use different indices to select non-overlapped data from the + # same data list. + self.seed = sync_random_seed(seed) + self.shuffle = shuffle + self.size = len(dataset) + self.indices = self._indices_of_rank() + + def _infinite_indices(self): + """Infinitely yield a sequence of indices.""" + g = torch.Generator() + g.manual_seed(self.seed) + while True: + if self.shuffle: + yield from torch.randperm(self.size, generator=g).tolist() + + else: + yield from torch.arange(self.size).tolist() + + def _indices_of_rank(self): + """Slice the infinite indices by rank.""" + yield from itertools.islice(self._infinite_indices(), self.rank, None, self.world_size) + + def __iter__(self): + # once batch size is reached, yield the indices + batch_buffer = [] + for idx in self.indices: + batch_buffer.append(idx) + if len(batch_buffer) == self.batch_size: + yield batch_buffer + batch_buffer = [] + + def __len__(self): + """Length of base dataset.""" + return self.size diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py new file mode 100644 index 000000000..7281a4c13 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/__init__.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .builder import ( + BACKBONES, + DETECTORS, + HEADS, + LOSSES, + NECKS, + ROI_EXTRACTORS, + SHARED_HEADS, + build_backbone, + build_detector, + build_head, + build_loss, + build_neck, +) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py new file mode 100644 index 000000000..9048153df --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/builder.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
+import warnings + +from mmcv.cnn import MODELS as MMCV_MODELS +from mmcv.utils import Registry + +MODELS = Registry("models", parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +ROI_EXTRACTORS = MODELS +SHARED_HEADS = MODELS +HEADS = MODELS +LOSSES = MODELS +DETECTORS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning) + assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field " + assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field " + + return DETECTORS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py new file mode 100644 index 000000000..d4b12417f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/anchor_free_head.py @@ -0,0 +1,274 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmcv.runner import force_fp32 +from mmdet3d.core.bbox.coders import build_bbox_coder # multi_apply +from mmdet.core.anchor.point_generator import MlvlPointGenerator + +from ..builder import HEADS, build_loss +from .base_dense_head import BaseDenseHead +from .dense_test_mixins import BBoxTestMixin + + +@HEADS.register_module() +class AnchorFreeHead(BaseDenseHead, BBoxTestMixin): + """Anchor-free head (FCOS, Fovea, RepPoints, etc.). + + Args: + num_classes (int): Number of categories excluding the background + category. + in_channels (int): Number of channels in the input feature map. + feat_channels (int): Number of hidden channels. Used in child classes. + stacked_convs (int): Number of stacking convs of the head. + strides (tuple): Downsample factor of each feature map. + dcn_on_last_conv (bool): If true, use dcn in the last layer of + towers. Default: False. + conv_bias (bool | str): If specified as `auto`, it will be decided by + the norm_cfg. Bias of conv will be set as True if `norm_cfg` is + None, otherwise False. Default: "auto". + loss_cls (dict): Config of classification loss. + loss_bbox (dict): Config of localization loss. + bbox_coder (dict): Config of bbox coder. Defaults + 'DistancePointBBoxCoder'. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + train_cfg (dict): Training config of anchor head. + test_cfg (dict): Testing config of anchor head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ """ # noqa: W605 + + _version = 1 + + def __init__( + self, + num_classes, + in_channels, + feat_channels=256, + stacked_convs=4, + strides=(4, 8, 16, 32, 64), + dcn_on_last_conv=False, + conv_bias="auto", + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), + loss_bbox=dict(type="IoULoss", loss_weight=1.0), + bbox_coder=dict(type="DistancePointBBoxCoder"), + conv_cfg=None, + norm_cfg=None, + train_cfg=None, + test_cfg=None, + init_cfg=dict( + type="Normal", + layer="Conv2d", + std=0.01, + override=dict(type="Normal", name="conv_cls", std=0.01, bias_prob=0.01), + ), + ): + super(AnchorFreeHead, self).__init__(init_cfg) + self.num_classes = num_classes + self.use_sigmoid_cls = loss_cls.get("use_sigmoid", False) + if self.use_sigmoid_cls: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.in_channels = in_channels + self.feat_channels = feat_channels + self.stacked_convs = stacked_convs + self.strides = strides + self.dcn_on_last_conv = dcn_on_last_conv + assert conv_bias == "auto" or isinstance(conv_bias, bool) + self.conv_bias = conv_bias + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.bbox_coder = build_bbox_coder(bbox_coder) + + self.prior_generator = MlvlPointGenerator(strides) + + # In order to keep a more general interface and be consistent with + # anchor_head. We can think of point like one anchor + self.num_base_priors = self.prior_generator.num_base_priors[0] + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.fp16_enabled = False + + self._init_layers() + + def _init_layers(self): + """Initialize layers of the head.""" + self._init_cls_convs() + self._init_reg_convs() + self._init_predictor() + + def _init_cls_convs(self): + """Initialize classification conv layers of the head.""" + self.cls_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type="DCNv2") + else: + conv_cfg = self.conv_cfg + self.cls_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias, + ) + ) + + def _init_reg_convs(self): + """Initialize bbox regression conv layers of the head.""" + self.reg_convs = nn.ModuleList() + for i in range(self.stacked_convs): + chn = self.in_channels if i == 0 else self.feat_channels + if self.dcn_on_last_conv and i == self.stacked_convs - 1: + conv_cfg = dict(type="DCNv2") + else: + conv_cfg = self.conv_cfg + self.reg_convs.append( + ConvModule( + chn, + self.feat_channels, + 3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=self.norm_cfg, + bias=self.conv_bias, + ) + ) + + def _init_predictor(self): + """Initialize predictor layers of the head.""" + self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 3, padding=1) + self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + """Hack some keys of the model state dict so that can load checkpoints + of previous version.""" + version = local_metadata.get("version", None) + if version is None: + # the key is different in early versions + # for example, 'fcos_cls' become 'conv_cls' now + bbox_head_keys = [k for k in state_dict.keys() if 
k.startswith(prefix)] + ori_predictor_keys = [] + new_predictor_keys = [] + # e.g. 'fcos_cls' or 'fcos_reg' + for key in bbox_head_keys: + ori_predictor_keys.append(key) + key = key.split(".") + conv_name = None + if key[1].endswith("cls"): + conv_name = "conv_cls" + elif key[1].endswith("reg"): + conv_name = "conv_reg" + elif key[1].endswith("centerness"): + conv_name = "conv_centerness" + else: + assert NotImplementedError + if conv_name is not None: + key[1] = conv_name + new_predictor_keys.append(".".join(key)) + else: + ori_predictor_keys.pop(-1) + for i in range(len(new_predictor_keys)): + state_dict[new_predictor_keys[i]] = state_dict.pop(ori_predictor_keys[i]) + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, feats): + """Forward features from the upstream network. + + Args: + feats (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + + Returns: + tuple: Usually contain classification scores and bbox predictions. + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + """ + return multi_apply(self.forward_single, feats)[:2] + + def forward_single(self, x): + """Forward features of a single scale level. + + Args: + x (Tensor): FPN feature maps of the specified stride. + + Returns: + tuple: Scores for each class, bbox predictions, features + after classification and regression conv layers, some + models needs these features like FCOS. + """ + cls_feat = x + reg_feat = x + + for cls_layer in self.cls_convs: + cls_feat = cls_layer(cls_feat) + cls_score = self.conv_cls(cls_feat) + + for reg_layer in self.reg_convs: + reg_feat = reg_layer(reg_feat) + bbox_pred = self.conv_reg(reg_feat) + return cls_score, bbox_pred, cls_feat, reg_feat + + @abstractmethod + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, img_metas, gt_bboxes_ignore=None): + """Compute loss of the head. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level, + each is a 4D-tensor, the channel number is + num_points * num_classes. + bbox_preds (list[Tensor]): Box energies / deltas for each scale + level, each is a 4D-tensor, the channel number is + num_points * 4. + gt_bboxes (list[Tensor]): Ground truth bboxes for each image with + shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels (list[Tensor]): class indices corresponding to each box + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes_ignore (None | list[Tensor]): specify which bounding + boxes can be ignored when computing the loss. + """ + + raise NotImplementedError + + def aug_test(self, feats, img_metas, rescale=False): + """Test function with test time augmentation. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. 
+ + Returns: + list[ndarray]: bbox results of each class + """ + return self.aug_test_bboxes(feats, img_metas, rescale=rescale) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py new file mode 100644 index 000000000..af3369445 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/base_dense_head.py @@ -0,0 +1,520 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch +from mmcv.cnn.utils.weight_init import constant_init +from mmcv.ops import batched_nms +from mmcv.runner import BaseModule, force_fp32 + + +class BaseDenseHead(BaseModule, metaclass=ABCMeta): + """Base class for DenseHeads.""" + + def __init__(self, init_cfg=None): + super(BaseDenseHead, self).__init__(init_cfg) + + def init_weights(self): + super(BaseDenseHead, self).init_weights() + # avoid init_cfg overwrite the initialization of `conv_offset` + for m in self.modules(): + # DeformConv2dPack, ModulatedDeformConv2dPack + if hasattr(m, "conv_offset"): + constant_init(m.conv_offset, 0) + + @abstractmethod + def loss(self, **kwargs): + """Compute losses of the head.""" + pass + + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def get_bboxes( + self, + cls_scores, + bbox_preds, + score_factors=None, + img_metas=None, + cfg=None, + rescale=False, + with_nms=True, + **kwargs + ): + """Transform network outputs of a batch into bbox results. + + Note: When score_factors is not None, the cls_scores are + usually multiplied by it then obtain the real score used in NMS, + such as CenterNess in FCOS, IoU branch in ATSS. + + Args: + cls_scores (list[Tensor]): Classification scores for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * num_classes, H, W). + bbox_preds (list[Tensor]): Box energies / deltas for all + scale levels, each is a 4D-tensor, has shape + (batch_size, num_priors * 4, H, W). + score_factors (list[Tensor], Optional): Score factor for + all scale level, each is a 4D-tensor, has shape + (batch_size, num_priors * 1, H, W). Default None. + img_metas (list[dict], Optional): Image meta info. Default None. + cfg (mmcv.Config, Optional): Test / postprocessing configuration, + if None, test_cfg would be used. Default None. + rescale (bool): If True, return boxes in original image space. + Default False. + with_nms (bool): If True, do nms before return boxes. + Default True. + + Returns: + list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is an (n, 5) tensor, where the first 4 columns + are bounding box positions (tl_x, tl_y, br_x, br_y) and the + 5-th column is a score between 0 and 1. The second item is a + (n,) tensor where each item is the predicted class label of + the corresponding box. + """ + assert len(cls_scores) == len(bbox_preds) + + if score_factors is None: + # e.g. Retina, FreeAnchor, Foveabox, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, AutoAssign, etc. 
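+ # NOTE: score factors (e.g. centerness in FCOS) are sigmoid-activated per
+ # level in `_get_bboxes_single` and multiplied into the classification
+ # scores in `_bbox_post_process` before NMS runs. The per-image slicing
+ # below relies on `select_single_mlvl`, which this trimmed copy does not
+ # define; in upstream mmdet it is provided by `mmdet.core.utils` and is
+ # assumed to be importable here as well.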
+ with_score_factors = True + assert len(cls_scores) == len(score_factors) + + num_levels = len(cls_scores) + + featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device + ) + + result_list = [] + + for img_id in range(len(img_metas)): + img_meta = img_metas[img_id] + cls_score_list = select_single_mlvl(cls_scores, img_id) + bbox_pred_list = select_single_mlvl(bbox_preds, img_id) + if with_score_factors: + score_factor_list = select_single_mlvl(score_factors, img_id) + else: + score_factor_list = [None for _ in range(num_levels)] + + results = self._get_bboxes_single( + cls_score_list, + bbox_pred_list, + score_factor_list, + mlvl_priors, + img_meta, + cfg, + rescale, + with_nms, + **kwargs + ) + result_list.append(results) + return result_list + + def _get_bboxes_single( + self, + cls_score_list, + bbox_pred_list, + score_factor_list, + mlvl_priors, + img_meta, + cfg, + rescale=False, + with_nms=True, + **kwargs + ): + """Transform outputs of a single image into bbox predictions. + + Args: + cls_score_list (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_priors * num_classes, H, W). + bbox_pred_list (list[Tensor]): Box energies / deltas from + all scale levels of a single image, each item has shape + (num_priors * 4, H, W). + score_factor_list (list[Tensor]): Score factor from all scale + levels of a single image, each item has shape + (num_priors * 1, H, W). + mlvl_priors (list[Tensor]): Each element in the list is + the priors of a single level in feature pyramid. In all + anchor-based methods, it has shape (num_priors, 4). In + all anchor-free methods, it has shape (num_priors, 2) + when `with_stride=True`, otherwise it still has shape + (num_priors, 4). + img_meta (dict): Image meta info. + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + if score_factor_list[0] is None: + # e.g. Retina, FreeAnchor, etc. + with_score_factors = False + else: + # e.g. FCOS, PAA, ATSS, etc. 
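+ # NOTE: the per-level filtering below uses `filter_scores_and_topk`, which
+ # is not defined in this trimmed copy; in upstream mmdet it lives in
+ # `mmdet.core.utils`, applies `cfg.score_thr`, keeps at most `nms_pre`
+ # candidates and returns (scores, labels, keep_idxs, filtered_results),
+ # exactly as unpacked a few lines further down.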
+ with_score_factors = True + + cfg = self.test_cfg if cfg is None else cfg + img_shape = img_meta["img_shape"] + nms_pre = cfg.get("nms_pre", -1) + + mlvl_bboxes = [] + mlvl_scores = [] + mlvl_labels = [] + if with_score_factors: + mlvl_score_factors = [] + else: + mlvl_score_factors = None + for level_idx, (cls_score, bbox_pred, score_factor, priors) in enumerate( + zip(cls_score_list, bbox_pred_list, score_factor_list, mlvl_priors) + ): + + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4) + if with_score_factors: + score_factor = score_factor.permute(1, 2, 0).reshape(-1).sigmoid() + cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = cls_score.sigmoid() + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + scores = cls_score.softmax(-1)[:, :-1] + + # After https://github.com/open-mmlab/mmdetection/pull/6268/, + # this operation keeps fewer bboxes under the same `nms_pre`. + # There is no difference in performance for most models. If you + # find a slight drop in performance, you can set a larger + # `nms_pre` than before. + results = filter_scores_and_topk(scores, cfg.score_thr, nms_pre, dict(bbox_pred=bbox_pred, priors=priors)) + scores, labels, keep_idxs, filtered_results = results + + bbox_pred = filtered_results["bbox_pred"] + priors = filtered_results["priors"] + + if with_score_factors: + score_factor = score_factor[keep_idxs] + + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + mlvl_bboxes.append(bboxes) + mlvl_scores.append(scores) + mlvl_labels.append(labels) + if with_score_factors: + mlvl_score_factors.append(score_factor) + + return self._bbox_post_process( + mlvl_scores, + mlvl_labels, + mlvl_bboxes, + img_meta["scale_factor"], + cfg, + rescale, + with_nms, + mlvl_score_factors, + **kwargs + ) + + def _bbox_post_process( + self, + mlvl_scores, + mlvl_labels, + mlvl_bboxes, + scale_factor, + cfg, + rescale=False, + with_nms=True, + mlvl_score_factors=None, + **kwargs + ): + """bbox post-processing method. + + The boxes would be rescaled to the original image scale and do + the nms operation. Usually `with_nms` is False is used for aug test. + + Args: + mlvl_scores (list[Tensor]): Box scores from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_labels (list[Tensor]): Box class labels from all scale + levels of a single image, each item has shape + (num_bboxes, ). + mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale + levels of a single image, each item has shape (num_bboxes, 4). + scale_factor (ndarray, optional): Scale factor of the image arange + as (w_scale, h_scale, w_scale, h_scale). + cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used. + rescale (bool): If True, return boxes in original image space. + Default: False. + with_nms (bool): If True, do nms before return boxes. + Default: True. + mlvl_score_factors (list[Tensor], optional): Score factor from + all scale levels of a single image, each item has shape + (num_bboxes, ). Default: None. + + Returns: + tuple[Tensor]: Results of detected bboxes and labels. If with_nms + is False and mlvl_score_factor is None, return mlvl_bboxes and + mlvl_scores, else return mlvl_bboxes, mlvl_scores and + mlvl_score_factor. Usually with_nms is False is used for aug + test. 
If with_nms is True, then return the following format + + - det_bboxes (Tensor): Predicted bboxes with shape \ + [num_bboxes, 5], where the first 4 columns are bounding \ + box positions (tl_x, tl_y, br_x, br_y) and the 5-th \ + column are scores between 0 and 1. + - det_labels (Tensor): Predicted labels of the corresponding \ + box with shape [num_bboxes]. + """ + assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels) + + mlvl_bboxes = torch.cat(mlvl_bboxes) + if rescale: + mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor) + mlvl_scores = torch.cat(mlvl_scores) + mlvl_labels = torch.cat(mlvl_labels) + + if mlvl_score_factors is not None: + # TODO: Add sqrt operation in order to be consistent with + # the paper. + mlvl_score_factors = torch.cat(mlvl_score_factors) + mlvl_scores = mlvl_scores * mlvl_score_factors + + if with_nms: + if mlvl_bboxes.numel() == 0: + det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1) + return det_bboxes, mlvl_labels + + det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores, mlvl_labels, cfg.nms) + det_bboxes = det_bboxes[: cfg.max_per_img] + det_labels = mlvl_labels[keep_idxs][: cfg.max_per_img] + return det_bboxes, det_labels + else: + return mlvl_bboxes, mlvl_scores, mlvl_labels + + def forward_train( + self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=None, proposal_cfg=None, **kwargs + ): + """ + Args: + x (list[Tensor]): Features from FPN. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + gt_bboxes (Tensor): Ground truth bboxes of the image, + shape (num_gts, 4). + gt_labels (Tensor): Ground truth labels of each box, + shape (num_gts,). + gt_bboxes_ignore (Tensor): Ground truth bboxes to be + ignored, shape (num_ignored_gts, 4). + proposal_cfg (mmcv.Config): Test / postprocessing configuration, + if None, test_cfg would be used + + Returns: + tuple: + losses: (dict[str, Tensor]): A dictionary of loss components. + proposal_list (list[Tensor]): Proposals of each image. + """ + outs = self(x) + if gt_labels is None: + loss_inputs = outs + (gt_bboxes, img_metas) + else: + loss_inputs = outs + (gt_bboxes, gt_labels, img_metas) + losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + if proposal_cfg is None: + return losses + else: + proposal_list = self.get_bboxes(*outs, img_metas=img_metas, cfg=proposal_cfg) + return losses, proposal_list + + def simple_test(self, feats, img_metas, rescale=False): + """Test function without test-time augmentation. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n, ). + """ + return self.simple_test_bboxes(feats, img_metas, rescale=rescale) + + @force_fp32(apply_to=("cls_scores", "bbox_preds")) + def onnx_export(self, cls_scores, bbox_preds, score_factors=None, img_metas=None, with_nms=True): + """Transform network output for a batch into bbox predictions. + + Args: + cls_scores (list[Tensor]): Box scores for each scale level + with shape (N, num_points * num_classes, H, W). 
+ bbox_preds (list[Tensor]): Box energies / deltas for each scale + level with shape (N, num_points * 4, H, W). + score_factors (list[Tensor]): score_factors for each s + cale level with shape (N, num_points * 1, H, W). + Default: None. + img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. Default: None. + with_nms (bool): Whether apply nms to the bboxes. Default: True. + + Returns: + tuple[Tensor, Tensor] | list[tuple]: When `with_nms` is True, + it is tuple[Tensor, Tensor], first tensor bboxes with shape + [N, num_det, 5], 5 arrange as (x1, y1, x2, y2, score) + and second element is class labels of shape [N, num_det]. + When `with_nms` is False, first tensor is bboxes with + shape [N, num_det, 4], second tensor is raw score has + shape [N, num_det, num_classes]. + """ + assert len(cls_scores) == len(bbox_preds) + + num_levels = len(cls_scores) + + featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] + mlvl_priors = self.prior_generator.grid_priors( + featmap_sizes, dtype=bbox_preds[0].dtype, device=bbox_preds[0].device + ) + + mlvl_cls_scores = [cls_scores[i].detach() for i in range(num_levels)] + mlvl_bbox_preds = [bbox_preds[i].detach() for i in range(num_levels)] + + assert len(img_metas) == 1, "Only support one input image while in exporting to ONNX" + img_shape = img_metas[0]["img_shape_for_onnx"] + + cfg = self.test_cfg + assert len(cls_scores) == len(bbox_preds) == len(mlvl_priors) + device = cls_scores[0].device + batch_size = cls_scores[0].shape[0] + # convert to tensor to keep tracing + nms_pre_tensor = torch.tensor(cfg.get("nms_pre", -1), device=device, dtype=torch.long) + + # e.g. Retina, FreeAnchor, etc. + if score_factors is None: + with_score_factors = False + mlvl_score_factor = [None for _ in range(num_levels)] + else: + # e.g. FCOS, PAA, ATSS, etc. + with_score_factors = True + mlvl_score_factor = [score_factors[i].detach() for i in range(num_levels)] + mlvl_score_factors = [] + + mlvl_batch_bboxes = [] + mlvl_scores = [] + + for cls_score, bbox_pred, score_factors, priors in zip( + mlvl_cls_scores, mlvl_bbox_preds, mlvl_score_factor, mlvl_priors + ): + assert cls_score.size()[-2:] == bbox_pred.size()[-2:] + + scores = cls_score.permute(0, 2, 3, 1).reshape(batch_size, -1, self.cls_out_channels) + if self.use_sigmoid_cls: + scores = scores.sigmoid() + nms_pre_score = scores + else: + scores = scores.softmax(-1) + nms_pre_score = scores + + if with_score_factors: + score_factors = score_factors.permute(0, 2, 3, 1).reshape(batch_size, -1).sigmoid() + bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 4) + priors = priors.expand(batch_size, -1, priors.size(-1)) + # Get top-k predictions + from mmdet.core.export import get_k_for_topk + + nms_pre = get_k_for_topk(nms_pre_tensor, bbox_pred.shape[1]) + if nms_pre > 0: + + if with_score_factors: + nms_pre_score = nms_pre_score * score_factors[..., None] + else: + nms_pre_score = nms_pre_score + + # Get maximum scores for foreground classes. 
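+ # With sigmoid classification every output channel is a foreground class,
+ # so the maximum is taken over all channels; with softmax the last channel
+ # is the background class (cat_id == num_classes since mmdet v2.0) and is
+ # dropped before computing the per-prior maximum used for top-k selection.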
+ if self.use_sigmoid_cls: + max_scores, _ = nms_pre_score.max(-1) + else: + # remind that we set FG labels to [0, num_class-1] + # since mmdet v2.0 + # BG cat_id: num_class + max_scores, _ = nms_pre_score[..., :-1].max(-1) + _, topk_inds = max_scores.topk(nms_pre) + + batch_inds = torch.arange(batch_size, device=bbox_pred.device).view(-1, 1).expand_as(topk_inds).long() + # Avoid onnx2tensorrt issue in https://github.com/NVIDIA/TensorRT/issues/1134 # noqa: E501 + transformed_inds = bbox_pred.shape[1] * batch_inds + topk_inds + priors = priors.reshape(-1, priors.size(-1))[transformed_inds, :].reshape( + batch_size, -1, priors.size(-1) + ) + bbox_pred = bbox_pred.reshape(-1, 4)[transformed_inds, :].reshape(batch_size, -1, 4) + scores = scores.reshape(-1, self.cls_out_channels)[transformed_inds, :].reshape( + batch_size, -1, self.cls_out_channels + ) + if with_score_factors: + score_factors = score_factors.reshape(-1, 1)[transformed_inds].reshape(batch_size, -1) + + bboxes = self.bbox_coder.decode(priors, bbox_pred, max_shape=img_shape) + + mlvl_batch_bboxes.append(bboxes) + mlvl_scores.append(scores) + if with_score_factors: + mlvl_score_factors.append(score_factors) + + batch_bboxes = torch.cat(mlvl_batch_bboxes, dim=1) + batch_scores = torch.cat(mlvl_scores, dim=1) + if with_score_factors: + batch_score_factors = torch.cat(mlvl_score_factors, dim=1) + + # Replace multiclass_nms with ONNX::NonMaxSuppression in deployment + + from mmdet.core.export import add_dummy_nms_for_onnx + + if not self.use_sigmoid_cls: + batch_scores = batch_scores[..., : self.num_classes] + + if with_score_factors: + batch_scores = batch_scores * (batch_score_factors.unsqueeze(2)) + + if with_nms: + max_output_boxes_per_class = cfg.nms.get("max_output_boxes_per_class", 200) + iou_threshold = cfg.nms.get("iou_threshold", 0.5) + score_threshold = cfg.score_thr + nms_pre = cfg.get("deploy_nms_pre", -1) + return add_dummy_nms_for_onnx( + batch_bboxes, + batch_scores, + max_output_boxes_per_class, + iou_threshold, + score_threshold, + nms_pre, + cfg.max_per_img, + ) + else: + return batch_bboxes, batch_scores diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py new file mode 100644 index 000000000..c1424e602 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/dense_heads/dense_test_mixins.py @@ -0,0 +1,149 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from inspect import signature + +import torch + + +class BBoxTestMixin(object): + """Mixin class for testing det bboxes via DenseHead.""" + + def simple_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes without test-time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (tuple[torch.Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + img_metas (list[dict]): List of image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). 
+ The shape of the second tensor in the tuple is ``labels`` + with shape (n,) + """ + outs = self.forward(feats) + results_list = self.get_bboxes(*outs, img_metas=img_metas, rescale=rescale) + return results_list + + def aug_test_bboxes(self, feats, img_metas, rescale=False): + """Test det bboxes with test time augmentation, can be applied in + DenseHead except for ``RPNHead`` and its variants, e.g., ``GARPNHead``, + etc. + + Args: + feats (list[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains features for all images in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. each dict has image information. + rescale (bool, optional): Whether to rescale the results. + Defaults to False. + + Returns: + list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. + The first item is ``bboxes`` with shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + The shape of the second tensor in the tuple is ``labels`` + with shape (n,). The length of list should always be 1. + """ + # check with_nms argument + gb_sig = signature(self.get_bboxes) + gb_args = [p.name for p in gb_sig.parameters.values()] + gbs_sig = signature(self._get_bboxes_single) + gbs_args = [p.name for p in gbs_sig.parameters.values()] + assert ("with_nms" in gb_args) and ("with_nms" in gbs_args), ( + f"{self.__class__.__name__}" " does not support test-time augmentation" + ) + + aug_bboxes = [] + aug_scores = [] + aug_labels = [] + for x, img_meta in zip(feats, img_metas): + # only one image in the batch + outs = self.forward(x) + bbox_outputs = self.get_bboxes(*outs, img_metas=img_meta, cfg=self.test_cfg, rescale=False, with_nms=False)[ + 0 + ] + aug_bboxes.append(bbox_outputs[0]) + aug_scores.append(bbox_outputs[1]) + if len(bbox_outputs) >= 3: + aug_labels.append(bbox_outputs[2]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = self.merge_aug_bboxes(aug_bboxes, aug_scores, img_metas) + merged_labels = torch.cat(aug_labels, dim=0) if aug_labels else None + + if merged_bboxes.numel() == 0: + det_bboxes = torch.cat([merged_bboxes, merged_scores[:, None]], -1) + return [ + (det_bboxes, merged_labels), + ] + + det_bboxes, keep_idxs = batched_nms(merged_bboxes, merged_scores, merged_labels, self.test_cfg.nms) + det_bboxes = det_bboxes[: self.test_cfg.max_per_img] + det_labels = merged_labels[keep_idxs][: self.test_cfg.max_per_img] + + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= det_bboxes.new_tensor(img_metas[0][0]["scale_factor"]) + + return [ + (_det_bboxes, det_labels), + ] + + def simple_test_rpn(self, x, img_metas): + """Test without augmentation, only for ``RPNHead`` and its variants, + e.g., ``GARPNHead``, etc. + + Args: + x (tuple[Tensor]): Features from the upstream network, each is + a 4D-tensor. + img_metas (list[dict]): Meta info of each image. + + Returns: + list[Tensor]: Proposals of each image, each item has shape (n, 5), + where 5 represent (tl_x, tl_y, br_x, br_y, score). + """ + rpn_outs = self(x) + proposal_list = self.get_bboxes(*rpn_outs, img_metas=img_metas) + return proposal_list + + def merge_aug_bboxes(self, aug_bboxes, aug_scores, img_metas): + """Merge augmented detection bboxes and scores. 
+ + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + + Returns: + tuple[Tensor]: ``bboxes`` with shape (n,4), where + 4 represent (tl_x, tl_y, br_x, br_y) + and ``scores`` with shape (n,). + """ + recovered_bboxes = [] + for bboxes, img_info in zip(aug_bboxes, img_metas): + img_shape = img_info[0]["img_shape"] + scale_factor = img_info[0]["scale_factor"] + flip = img_info[0]["flip"] + flip_direction = img_info[0]["flip_direction"] + bboxes = bbox_mapping_back(bboxes, img_shape, scale_factor, flip, flip_direction) + recovered_bboxes.append(bboxes) + bboxes = torch.cat(recovered_bboxes, dim=0) + if aug_scores is None: + return bboxes + else: + scores = torch.cat(aug_scores, dim=0) + return bboxes, scores diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py b/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py new file mode 100644 index 000000000..6dd11bd8d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/detectors/base.py @@ -0,0 +1,145 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + +import torch +from mmcv.runner import BaseModule, auto_fp16 + + +class BaseDetector(BaseModule, metaclass=ABCMeta): + """Base class for detectors.""" + + def __init__(self, init_cfg=None): + super(BaseDetector, self).__init__(init_cfg) + self.fp16_enabled = False + + @property + def with_neck(self): + """bool: whether the detector has a neck""" + return hasattr(self, "neck") and self.neck is not None + + # TODO: these properties need to be carefully handled + # for both single stage & two stage detectors + @property + def with_shared_head(self): + """bool: whether the detector has a shared head in the RoI Head""" + return hasattr(self, "roi_head") and self.roi_head.with_shared_head + + @property + def with_bbox(self): + """bool: whether the detector has a bbox head""" + return (hasattr(self, "roi_head") and self.roi_head.with_bbox) or ( + hasattr(self, "bbox_head") and self.bbox_head is not None + ) + + @property + def with_mask(self): + """bool: whether the detector has a mask head""" + return (hasattr(self, "roi_head") and self.roi_head.with_mask) or ( + hasattr(self, "mask_head") and self.mask_head is not None + ) + + @abstractmethod + def extract_feat(self, imgs): + """Extract features from images.""" + pass + + def extract_feats(self, imgs): + """Extract features from multiple images. + + Args: + imgs (list[torch.Tensor]): A list of images. The images are + augmented from the same image but in different ways. + + Returns: + list[torch.Tensor]: Features of different images + """ + assert isinstance(imgs, list) + return [self.extract_feat(img) for img in imgs] + + def forward_train(self, imgs, img_metas, **kwargs): + """ + Args: + img (Tensor): of shape (N, C, H, W) encoding input images. + Typically these should be mean centered and std scaled. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys, see + :class:`mmdet.datasets.pipelines.Collect`. + kwargs (keyword arguments): Specific to concrete implementation. + """ + # NOTE the batched image size information may be useful, e.g. 
+ # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + batch_input_shape = tuple(imgs[0].size()[-2:]) + for img_meta in img_metas: + img_meta["batch_input_shape"] = batch_input_shape + + @abstractmethod + def simple_test(self, img, img_metas, **kwargs): + pass + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, "imgs"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError(f"{name} must be a list, but got {type(var)}") + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f"num of augmentations ({len(imgs)}) " f"!= num of image meta ({len(img_metas)})") + + # NOTE the batched image size information may be useful, e.g. + # in DETR, this is needed for the construction of masks, which is + # then used for the transformer_head. + for img, img_meta in zip(imgs, img_metas): + batch_size = len(img_meta) + for img_id in range(batch_size): + img_meta[img_id]["batch_input_shape"] = tuple(img.size()[-2:]) + + if num_augs == 1: + # proposals (List[List[Tensor]]): the outer list indicates + # test-time augs (multiscale, flip, etc.) and the inner list + # indicates images in a batch. + # The Tensor should have a shape Px4, where P is the number of + # proposals. + if "proposals" in kwargs: + kwargs["proposals"] = kwargs["proposals"][0] + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + assert imgs[0].size(0) == 1, "aug test does not support " "inference with batch size " f"{imgs[0].size(0)}" + # TODO: support test augmentation for predefined proposals + assert "proposals" not in kwargs + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=("img",)) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if torch.onnx.is_in_onnx_export(): + assert len(img_metas) == 1 + return self.onnx_export(img[0], img_metas[0]) + + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py new file mode 100644 index 000000000..2265daf99 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/focal_loss.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
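+# NOTE: this trimmed-down module keeps only the FocalLoss wrapper; the loss
+# kernels it dispatches to (`sigmoid_focal_loss`, `py_sigmoid_focal_loss` and
+# `py_focal_loss_with_prob`) are assumed to come from upstream mmdet/mmcv and
+# are not re-defined here. For reference, the sigmoid variant computes per
+# element
+#
+#   FL(p_t) = -alpha_t * (1 - p_t) ** gamma * log(p_t)
+#
+# where p_t is the predicted probability of the ground-truth class and
+# alpha_t balances positive against negative samples.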
+import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES + + +@LOSSES.register_module() +class FocalLoss(nn.Module): + def __init__(self, use_sigmoid=True, gamma=2.0, alpha=0.25, reduction="mean", loss_weight=1.0, activated=False): + """`Focal Loss `_ + + Args: + use_sigmoid (bool, optional): Whether to the prediction is + used for sigmoid or softmax. Defaults to True. + gamma (float, optional): The gamma for calculating the modulating + factor. Defaults to 2.0. + alpha (float, optional): A balanced form for Focal Loss. + Defaults to 0.25. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. Options are "none", "mean" and + "sum". + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + activated (bool, optional): Whether the input is activated. + If True, it means the input has been activated and can be + treated as probabilities. Else, it should be treated as logits. + Defaults to False. + """ + super(FocalLoss, self).__init__() + assert use_sigmoid is True, "Only sigmoid focal loss supported now." + self.use_sigmoid = use_sigmoid + self.gamma = gamma + self.alpha = alpha + self.reduction = reduction + self.loss_weight = loss_weight + self.activated = activated + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if self.use_sigmoid: + if self.activated: + calculate_loss_func = py_focal_loss_with_prob + else: + if torch.cuda.is_available() and pred.is_cuda: + calculate_loss_func = sigmoid_focal_loss + else: + num_classes = pred.size(1) + target = F.one_hot(target, num_classes=num_classes + 1) + target = target[:, :num_classes] + calculate_loss_func = py_sigmoid_focal_loss + + loss_cls = self.loss_weight * calculate_loss_func( + pred, target, weight, gamma=self.gamma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor + ) + + else: + raise NotImplementedError + return loss_cls diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py new file mode 100644 index 000000000..9aa621834 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/iou_loss.py @@ -0,0 +1,448 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import mmcv +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def iou_loss(pred, target, linear=False, mode="log", eps=1e-6): + """IoU loss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + The loss is calculated as negative log of IoU. 
+ + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + linear (bool, optional): If True, use linear scale of loss instead of + log scale. Default: False. + mode (str): Loss scaling mode, including "linear", "square", and "log". + Default: 'log' + eps (float): Eps to avoid log(0). + + Return: + torch.Tensor: Loss tensor. + """ + assert mode in ["linear", "square", "log"] + if linear: + mode = "linear" + warnings.warn( + 'DeprecationWarning: Setting "linear=True" in ' + 'iou_loss is deprecated, please use "mode=`linear`" ' + "instead." + ) + ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps) + if mode == "linear": + loss = 1 - ious + elif mode == "square": + loss = 1 - ious**2 + elif mode == "log": + loss = -ious.log() + else: + raise NotImplementedError + return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3): +# """BIoULoss. + +# This is an implementation of paper +# `Improving Object Localization with Fitness NMS and Bounded IoU Loss. +# `_. + +# Args: +# pred (torch.Tensor): Predicted bboxes. +# target (torch.Tensor): Target bboxes. +# beta (float): beta parameter in smoothl1. +# eps (float): eps to avoid NaN. +# """ +# pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5 +# pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5 +# pred_w = pred[:, 2] - pred[:, 0] +# pred_h = pred[:, 3] - pred[:, 1] +# with torch.no_grad(): +# target_ctrx = (target[:, 0] + target[:, 2]) * 0.5 +# target_ctry = (target[:, 1] + target[:, 3]) * 0.5 +# target_w = target[:, 2] - target[:, 0] +# target_h = target[:, 3] - target[:, 1] + +# dx = target_ctrx - pred_ctrx +# dy = target_ctry - pred_ctry + +# loss_dx = 1 - torch.max( +# (target_w - 2 * dx.abs()) / +# (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx)) +# loss_dy = 1 - torch.max( +# (target_h - 2 * dy.abs()) / +# (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy)) +# loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w / +# (target_w + eps)) +# loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h / +# (target_h + eps)) +# # view(..., -1) does not work for empty tensor +# loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh], +# dim=-1).flatten(1) + +# loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta, +# loss_comb - 0.5 * beta) +# return loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def giou_loss(pred, target, eps=1e-7): + r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding + Box Regression `_. + + Args: + pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2), + shape (n, 4). + target (torch.Tensor): Corresponding gt bboxes, shape (n, 4). + eps (float): Eps to avoid log(0). + + Return: + Tensor: Loss tensor. + """ + gious = bbox_overlaps(pred, target, mode="giou", is_aligned=True, eps=eps) + loss = 1 - gious + return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def diou_loss(pred, target, eps=1e-7): +# r"""`Implementation of Distance-IoU Loss: Faster and Better +# Learning for Bounding Box Regression, https://arxiv.org/abs/1911.08287`_. + +# Code is modified from https://github.com/Zzh-tju/DIoU. + +# Args: +# pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), +# shape (n, 4). +# target (Tensor): Corresponding gt bboxes, shape (n, 4). +# eps (float): Eps to avoid log(0). +# Return: +# Tensor: Loss tensor. 
+# """ +# # overlap +# lt = torch.max(pred[:, :2], target[:, :2]) +# rb = torch.min(pred[:, 2:], target[:, 2:]) +# wh = (rb - lt).clamp(min=0) +# overlap = wh[:, 0] * wh[:, 1] + +# # union +# ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) +# ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) +# union = ap + ag - overlap + eps + +# # IoU +# ious = overlap / union + +# # enclose area +# enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) +# enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) +# enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + +# cw = enclose_wh[:, 0] +# ch = enclose_wh[:, 1] + +# c2 = cw**2 + ch**2 + eps + +# b1_x1, b1_y1 = pred[:, 0], pred[:, 1] +# b1_x2, b1_y2 = pred[:, 2], pred[:, 3] +# b2_x1, b2_y1 = target[:, 0], target[:, 1] +# b2_x2, b2_y2 = target[:, 2], target[:, 3] + +# left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 +# right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 +# rho2 = left + right + +# # DIoU +# dious = ious - rho2 / c2 +# loss = 1 - dious +# return loss + + +# @mmcv.jit(derivate=True, coderize=True) +# @weighted_loss +# def ciou_loss(pred, target, eps=1e-7): +# r"""`Implementation of paper `Enhancing Geometric Factors into +# Model Learning and Inference for Object Detection and Instance +# Segmentation `_. + +# Code is modified from https://github.com/Zzh-tju/CIoU. + +# Args: +# pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2), +# shape (n, 4). +# target (Tensor): Corresponding gt bboxes, shape (n, 4). +# eps (float): Eps to avoid log(0). +# Return: +# Tensor: Loss tensor. +# """ +# # overlap +# lt = torch.max(pred[:, :2], target[:, :2]) +# rb = torch.min(pred[:, 2:], target[:, 2:]) +# wh = (rb - lt).clamp(min=0) +# overlap = wh[:, 0] * wh[:, 1] + +# # union +# ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1]) +# ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1]) +# union = ap + ag - overlap + eps + +# # IoU +# ious = overlap / union + +# # enclose area +# enclose_x1y1 = torch.min(pred[:, :2], target[:, :2]) +# enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:]) +# enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0) + +# cw = enclose_wh[:, 0] +# ch = enclose_wh[:, 1] + +# c2 = cw**2 + ch**2 + eps + +# b1_x1, b1_y1 = pred[:, 0], pred[:, 1] +# b1_x2, b1_y2 = pred[:, 2], pred[:, 3] +# b2_x1, b2_y1 = target[:, 0], target[:, 1] +# b2_x2, b2_y2 = target[:, 2], target[:, 3] + +# w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps +# w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps + +# left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 +# right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 +# rho2 = left + right + +# factor = 4 / math.pi**2 +# v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2) + +# with torch.no_grad(): +# alpha = (ious > 0.5).float() * v / (1 - ious + v) + +# # CIoU +# cious = ious - (rho2 / c2 + alpha * v) +# loss = 1 - cious.clamp(min=-1.0, max=1.0) +# return loss + + +@LOSSES.register_module() +class IoULoss(nn.Module): + """IoULoss. + + Computing the IoU loss between a set of predicted bboxes and target bboxes. + + Args: + linear (bool): If True, use linear scale of loss else determined + by mode. Default: False. + eps (float): Eps to avoid log(0). + reduction (str): Options are "none", "mean" and "sum". + loss_weight (float): Weight of loss. + mode (str): Loss scaling mode, including "linear", "square", and "log". 
+ Default: 'log' + """ + + def __init__(self, linear=False, eps=1e-6, reduction="mean", loss_weight=1.0, mode="log"): + super(IoULoss, self).__init__() + assert mode in ["linear", "square", "log"] + if linear: + mode = "linear" + warnings.warn( + 'DeprecationWarning: Setting "linear=True" in ' + 'IOULoss is deprecated, please use "mode=`linear`" ' + "instead." + ) + self.mode = mode + self.linear = linear + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. Options are "none", "mean" and "sum". + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if (weight is not None) and (not torch.any(weight > 0)) and (reduction != "none"): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # iou_loss of shape (n,) + assert weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * iou_loss( + pred, target, weight, mode=self.mode, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs + ) + return loss + + +# @LOSSES.register_module() +# class BoundedIoULoss(nn.Module): + +# def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0): +# super(BoundedIoULoss, self).__init__() +# self.beta = beta +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# loss = self.loss_weight * bounded_iou_loss( +# pred, +# target, +# weight, +# beta=self.beta, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss + + +@LOSSES.register_module() +class GIoULoss(nn.Module): + def __init__(self, eps=1e-6, reduction="mean", loss_weight=1.0): + super(GIoULoss, self).__init__() + self.eps = eps + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): + if weight is not None and not torch.any(weight > 0): + if pred.dim() == weight.dim() + 1: + weight = weight.unsqueeze(1) + return (pred * weight).sum() # 0 + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + if weight is not None and weight.dim() > 1: + # TODO: remove this in the future + # reduce the weight of shape (n, 4) to (n,) to match the + # giou_loss of shape (n,) + assert 
weight.shape == pred.shape + weight = weight.mean(-1) + loss = self.loss_weight * giou_loss( + pred, target, weight, eps=self.eps, reduction=reduction, avg_factor=avg_factor, **kwargs + ) + return loss + + +# @LOSSES.register_module() +# class DIoULoss(nn.Module): + +# def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): +# super(DIoULoss, self).__init__() +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# if weight is not None and weight.dim() > 1: +# # TODO: remove this in the future +# # reduce the weight of shape (n, 4) to (n,) to match the +# # giou_loss of shape (n,) +# assert weight.shape == pred.shape +# weight = weight.mean(-1) +# loss = self.loss_weight * diou_loss( +# pred, +# target, +# weight, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss + + +# @LOSSES.register_module() +# class CIoULoss(nn.Module): + +# def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0): +# super(CIoULoss, self).__init__() +# self.eps = eps +# self.reduction = reduction +# self.loss_weight = loss_weight + +# def forward(self, +# pred, +# target, +# weight=None, +# avg_factor=None, +# reduction_override=None, +# **kwargs): +# if weight is not None and not torch.any(weight > 0): +# if pred.dim() == weight.dim() + 1: +# weight = weight.unsqueeze(1) +# return (pred * weight).sum() # 0 +# assert reduction_override in (None, 'none', 'mean', 'sum') +# reduction = ( +# reduction_override if reduction_override else self.reduction) +# if weight is not None and weight.dim() > 1: +# # TODO: remove this in the future +# # reduce the weight of shape (n, 4) to (n,) to match the +# # giou_loss of shape (n,) +# assert weight.shape == pred.shape +# weight = weight.mean(-1) +# loss = self.loss_weight * ciou_loss( +# pred, +# target, +# weight, +# eps=self.eps, +# reduction=reduction, +# avg_factor=avg_factor, +# **kwargs) +# return loss diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py new file mode 100644 index 000000000..82fda7704 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/smooth_l1_loss.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import torch +import torch.nn as nn + +from ..builder import LOSSES +from .utils import weighted_loss + + +@mmcv.jit(derivate=True, coderize=True) +@weighted_loss +def l1_loss(pred, target): + """L1 loss. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + + Returns: + torch.Tensor: Calculated loss + """ + if target.numel() == 0: + return pred.sum() * 0 + + assert pred.size() == target.size() + loss = torch.abs(pred - target) + return loss + + +@LOSSES.register_module() +class L1Loss(nn.Module): + """L1 loss. + + Args: + reduction (str, optional): The method to reduce the loss. + Options are "none", "mean" and "sum". 
+ loss_weight (float, optional): The weight of loss. + """ + + def __init__(self, reduction="mean", loss_weight=1.0): + super(L1Loss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): + """Forward function. + + Args: + pred (torch.Tensor): The prediction. + target (torch.Tensor): The learning target of the prediction. + weight (torch.Tensor, optional): The weight of loss for each + prediction. Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Defaults to None. + """ + assert reduction_override in (None, "none", "mean", "sum") + reduction = reduction_override if reduction_override else self.reduction + loss_bbox = self.loss_weight * l1_loss(pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss_bbox diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py new file mode 100644 index 000000000..7e79bd2b3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/losses/utils.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import functools + +import mmcv +import torch +import torch.nn.functional as F + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +@mmcv.jit(derivate=True, coderize=True) +def weight_reduce_loss(loss, weight=None, reduction="mean", avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. + avg_factor (float): Average factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == "mean": + # Avoid causing ZeroDivisionError when avg_factor is 0.0, + # i.e., all labels of an image belong to ignore index. + eps = torch.finfo(torch.float32).eps + loss = loss.sum() / (avg_factor + eps) + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != "none": + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. 
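As the docstring above describes, weight_reduce_loss applies the element-wise weight first and then either reduces with the requested reduction or divides the summed loss by avg_factor. A self-contained re-implementation of those rules (illustrative only, so it does not depend on the mmcv.jit decorator), reproducing the numbers in the doctest further below:

import torch


def weight_reduce_loss_sketch(loss, weight=None, reduction="mean", avg_factor=None):
    """Illustrative re-implementation of the reduction rules described above."""
    if weight is not None:
        loss = loss * weight
    if avg_factor is None:
        if reduction == "mean":
            return loss.mean()
        if reduction == "sum":
            return loss.sum()
        return loss
    if reduction == "mean":
        return loss.sum() / (avg_factor + torch.finfo(torch.float32).eps)
    if reduction == "none":
        return loss
    raise ValueError('avg_factor can not be used with reduction="sum"')


elementwise = torch.tensor([1.0, 1.0, 2.0])
weight = torch.tensor([1.0, 0.0, 1.0])
print(weight_reduce_loss_sketch(elementwise, weight))                # tensor(1.)     -> (1 + 0 + 2) / 3
print(weight_reduce_loss_sketch(elementwise, weight, avg_factor=2))  # tensor(1.5000) -> (1 + 0 + 2) / 2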
The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, target, weight=None, reduction="mean", avg_factor=None, **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py new file mode 100644 index 000000000..0b288063d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. + +from .builder import build_transformer +from .res_layer import ResLayer, SimplifiedBasicBlock diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py new file mode 100644 index 000000000..d258773c6 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/builder.py @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.utils import Registry, build_from_cfg + +TRANSFORMER = Registry("Transformer") +LINEAR_LAYERS = Registry("linear layers") + + +def build_transformer(cfg, default_args=None): + """Builder for Transformer.""" + return build_from_cfg(cfg, TRANSFORMER, default_args) + + +LINEAR_LAYERS.register_module("Linear", module=nn.Linear) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py new file mode 100644 index 000000000..7e734c1eb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/res_layer.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.cnn import build_conv_layer, build_norm_layer +from mmcv.runner import BaseModule, Sequential +from torch import nn as nn + + +class ResLayer(Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + downsample_first (bool): Downsample at the first block or last block. + False for Hourglass, True for ResNet. 
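build_transformer above is a thin wrapper around mmcv's Registry/build_from_cfg, so any module registered in TRANSFORMER can be instantiated from a config dict. A rough usage sketch; DummyTransformer and the import path are illustrative assumptions, not part of this patch:

import torch
import torch.nn as nn

# the import path assumes the vendored package is exposed as `mmdet`; adjust if needed
from mmdet.models.utils.builder import TRANSFORMER, build_transformer


@TRANSFORMER.register_module()
class DummyTransformer(nn.Module):
    """Toy stand-in so the cfg -> object round trip can be shown."""

    def __init__(self, embed_dims=256):
        super().__init__()
        self.proj = nn.Linear(embed_dims, embed_dims)

    def forward(self, x):
        return self.proj(x)


cfg = dict(type="DummyTransformer", embed_dims=128)
transformer = build_transformer(cfg)  # instantiates DummyTransformer(embed_dims=128)
print(transformer(torch.randn(2, 128)).shape)  # torch.Size([2, 128])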
Default: True + """ + + def __init__( + self, + block, + inplanes, + planes, + num_blocks, + stride=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type="BN"), + downsample_first=True, + **kwargs + ): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False) + ) + downsample.extend( + [ + build_conv_layer( + conv_cfg, inplanes, planes * block.expansion, kernel_size=1, stride=conv_stride, bias=False + ), + build_norm_layer(norm_cfg, planes * block.expansion)[1], + ] + ) + downsample = nn.Sequential(*downsample) + + layers = [] + if downsample_first: + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs + ) + ) + inplanes = planes * block.expansion + for _ in range(1, num_blocks): + layers.append( + block(inplanes=inplanes, planes=planes, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs) + ) + + else: # downsample_first=False is for HourglassModule + for _ in range(num_blocks - 1): + layers.append( + block(inplanes=inplanes, planes=inplanes, stride=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs) + ) + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs + ) + ) + super(ResLayer, self).__init__(*layers) + + +class SimplifiedBasicBlock(BaseModule): + """Simplified version of original basic residual block. This is used in + `SCNet `_. + + - Norm layer is now optional + - Last ReLU in forward function is removed + """ + + expansion = 1 + + def __init__( + self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style="pytorch", + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type="BN"), + dcn=None, + plugins=None, + init_fg=None, + ): + super(SimplifiedBasicBlock, self).__init__(init_fg) + assert dcn is None, "Not implemented yet." + assert plugins is None, "Not implemented yet." + assert not with_cp, "Not implemented yet." 
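ResLayer above expects an mmdet-style residual block: an expansion class attribute plus inplanes/planes/stride/downsample/conv_cfg/norm_cfg keyword arguments. A rough usage sketch with a toy block; ToyBasicBlock is illustrative only, and ResLayer refers to the class defined in res_layer.py above (import it from wherever the vendored module lives):

import torch
import torch.nn as nn
from mmcv.cnn import build_conv_layer, build_norm_layer


class ToyBasicBlock(nn.Module):
    """Minimal block with the interface ResLayer expects; illustrative only."""

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, conv_cfg=None, norm_cfg=dict(type="BN"), **kwargs):
        super().__init__()
        self.conv1 = build_conv_layer(conv_cfg, inplanes, planes, 3, stride=stride, padding=1, bias=False)
        self.bn1 = build_norm_layer(norm_cfg, planes)[1]
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = self.downsample(x) if self.downsample is not None else x
        return self.relu(self.bn1(self.conv1(x)) + identity)


# stride=2 with a channel change triggers the downsample branch built inside ResLayer
layer = ResLayer(ToyBasicBlock, inplanes=64, planes=128, num_blocks=2, stride=2)
print(layer(torch.randn(1, 64, 32, 32)).shape)  # torch.Size([1, 128, 16, 16])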
+ self.with_norm = norm_cfg is not None + with_bias = True if norm_cfg is None else False + self.conv1 = build_conv_layer( + conv_cfg, inplanes, planes, 3, stride=stride, padding=dilation, dilation=dilation, bias=with_bias + ) + if self.with_norm: + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer(conv_cfg, planes, planes, 3, padding=1, bias=with_bias) + if self.with_norm: + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) if self.with_norm else None + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) if self.with_norm else None + + def forward(self, x): + """Forward function.""" + + identity = x + + out = self.conv1(x) + if self.with_norm: + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + if self.with_norm: + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out diff --git a/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py new file mode 100644 index 000000000..ba1fa3973 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/models/utils/transformer.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + + +def inverse_sigmoid(x, eps=1e-5): + """Inverse function of sigmoid. + + Args: + x (Tensor): The tensor to do the + inverse. + eps (float): EPS avoid numerical + overflow. Defaults 1e-5. + Returns: + Tensor: The x has passed the inverse + function of sigmoid, has same + shape with input. + """ + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) diff --git a/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py new file mode 100644 index 000000000..d0d7b8f0b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/utils/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .util_distribution import get_device diff --git a/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py b/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py new file mode 100644 index 000000000..1466423c4 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet/utils/util_distribution.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
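inverse_sigmoid in transformer.py above is the logit function with clamping for numerical safety. A quick sanity check that it inverts torch.sigmoid up to the eps clamping (the import path is an assumption about how the vendored package is exposed):

import torch

# the import path assumes the vendored package is exposed as `mmdet`; adjust if needed
from mmdet.models.utils.transformer import inverse_sigmoid

x = torch.tensor([0.0, 0.25, 0.5, 0.99, 1.0])
roundtrip = torch.sigmoid(inverse_sigmoid(x))
# equality holds only up to the eps clamping at the 0/1 endpoints
print(torch.allclose(x.clamp(1e-5, 1 - 1e-5), roundtrip, atol=1e-4))  # True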
+import torch + + +def get_device(): + """Returns an available device, cpu, cuda or mlu.""" + is_device_available = {"cuda": torch.cuda.is_available(), "mlu": is_mlu_available()} + device_list = [k for k, v in is_device_available.items() if v] + return device_list[0] if len(device_list) == 1 else "cpu" diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py new file mode 100644 index 000000000..bfcdeed3b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/datasets/nus-3d.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-50, -50, -5, 50, 50, 3] +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "trailer", + "bus", + "construction_vehicle", + "bicycle", + "motorcycle", + "pedestrian", + "traffic_cone", + "barrier", +] +dataset_type = "NuScenesDataset" +# Input modality for nuScenes dataset, this is consistent with the submission +# format which requires the information in input_modality. +input_modality = dict(use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) +file_client_args = dict(backend="disk") +test_pipeline = [ + dict(type="LoadPointsFromFile", coord_type="LIDAR", load_dim=5, use_dim=5, file_client_args=file_client_args), + dict(type="LoadPointsFromMultiSweeps", sweeps_num=10, file_client_args=file_client_args), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="GlobalRotScaleTrans", rot_range=[0, 0], scale_ratio_range=[1.0, 1.0], translation_std=[0, 0, 0]), + dict(type="RandomFlip3D"), + dict(type="PointsRangeFilter", point_cloud_range=point_cloud_range), + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["points"]), + ], + ), +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py new file mode 100644 index 000000000..ed42a1e20 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/configs/_base_/default_runtime.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +checkpoint_config = dict(interval=1) +# yapf:disable push +# By default we use textlogger hook and tensorboard +# For more loggers see +# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook +log_config = dict(interval=50, hooks=[dict(type="TextLoggerHook"), dict(type="TensorboardLoggerHook")]) +# yapf:enable +dist_params = dict(backend="nccl") +log_level = "INFO" +work_dir = None +load_from = None +resume_from = None +workflow = [("train", 1)] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py new file mode 
100644 index 000000000..9bc22e852 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. + +from .transforms import bbox3d2result diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py new file mode 100644 index 000000000..94fc09eb7 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/coders/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.core.bbox import build_bbox_coder diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py new file mode 100644 index 000000000..1a3ab5953 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/__init__.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base_box3d import BaseInstance3DBoxes +from .box_3d_mode import Box3DMode +from .cam_box3d import CameraInstance3DBoxes +from .coord_3d_mode import Coord3DMode +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes +from .utils import ( + get_box_type, + get_proj_mat_by_coord_type, + limit_period, + mono_cam_box2vis, + points_cam2img, + rotation_3d_in_axis, + xywhr2xyxyr, +) + +__all__ = [ + "Box3DMode", + "BaseInstance3DBoxes", + "LiDARInstance3DBoxes", + "CameraInstance3DBoxes", + "DepthInstance3DBoxes", + "xywhr2xyxyr", + "get_box_type", + "rotation_3d_in_axis", + "limit_period", + "points_cam2img", + "Coord3DMode", + "mono_cam_box2vis", + "get_proj_mat_by_coord_type", +] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py new file mode 100644 index 000000000..073e57222 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/base_box3d.py @@ -0,0 +1,338 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from abc import abstractmethod + +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .utils import xywhr2xyxyr + + +class BaseInstance3DBoxes(object): + """Base class for 3D Boxes. + + Note: + The box is bottom centered, i.e. the relative position of origin in + the box is (0.5, 0.5, 0). + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. + box_dim (int): Number of the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw). + Default to 7. + with_yaw (bool): Whether the box is with yaw rotation. + If False, the value of yaw will be set to 0 as minmax boxes. + Default to True. + origin (tuple[float]): The relative position of origin in the box. + Default to (0.5, 0.5, 0). This will guide the box be converted to + (0.5, 0.5, 0) mode. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). 
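The origin argument described above re-centers incoming boxes to the bottom-center convention (0.5, 0.5, 0). A small sketch using LiDARInstance3DBoxes (defined later in this patch); the import path assumes the vendored mmdet3d package is importable as shown:

import torch

# assumes the vendored package is importable as `mmdet3d`; adjust the path as needed
from mmdet3d.core.bbox.structures import LiDARInstance3DBoxes

# one box given with a gravity-center origin (0.5, 0.5, 0.5): x, y, z, x_size, y_size, z_size, yaw
raw = torch.tensor([[10.0, 2.0, 1.0, 4.0, 2.0, 2.0, 0.0]])
boxes = LiDARInstance3DBoxes(raw, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0.5))

print(boxes.bottom_center)   # tensor([[10., 2., 0.]])  z shifted down by z_size / 2
print(boxes.gravity_center)  # tensor([[10., 2., 1.]])  back at the original center
print(boxes.volume)          # tensor([16.])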
+ with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 0.5, 0): + dst = self.tensor.new_tensor((0.5, 0.5, 0)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def volume(self): + """torch.Tensor: A vector with volume of each box.""" + return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] + + @property + def dims(self): + """torch.Tensor: Corners of each box with size (N, 8, 3).""" + return self.tensor[:, 3:6] + + @property + def yaw(self): + """torch.Tensor: A vector with yaw of each box.""" + return self.tensor[:, 6] + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 5] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + return self.bottom_height + self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 2] + + @property + def center(self): + """Calculate the center of all the boxes. + + Note: + In the MMDetection3D's convention, the bottom center is + usually taken as the default center. + + The relative position of the centers in different kinds of + boxes are different, e.g., the relative center of a boxes is + (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. + It is recommended to use ``bottom_center`` or ``gravity_center`` + for more clear usage. + + Returns: + torch.Tensor: A tensor with center of each box. + """ + return self.bottom_center + + @property + def bottom_center(self): + """torch.Tensor: A tensor with center of each box.""" + return self.tensor[:, :3] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + pass + + @property + def corners(self): + """torch.Tensor: a tensor with 8 corners of each box.""" + pass + + @abstractmethod + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + """ + pass + + @abstractmethod + def flip(self, bev_direction="horizontal"): + """Flip the boxes in BEV along given BEV direction.""" + pass + + def scale(self, scale_factor): + """Scale the box with horizontal and vertical scaling factors. 
+ + Args: + scale_factors (float): Scale factors to scale the boxes. + """ + self.tensor[:, :6] *= scale_factor + self.tensor[:, 7:] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_boxes = boxes[3]`: + return a `Boxes` that contains only one box. + 2. `new_boxes = boxes[2:10]`: + return a slice of boxes. + 3. `new_boxes = boxes[vector]`: + where vector is a torch.BoolTensor with `length = len(boxes)`. + Nonzero elements in the vector will be selected. + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BaseInstance3DBoxes`: A new object of \ + :class:`BaseInstances3DBoxes` after indexing. + """ + original_type = type(self) + if isinstance(item, int): + return original_type(self.tensor[item].view(1, -1), box_dim=self.box_dim, with_yaw=self.with_yaw) + b = self.tensor[item] + assert b.dim() == 2, f"Indexing on Boxes with {item} failed to return a matrix!" + return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) + + def __len__(self): + """int: Number of boxes in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + "(\n " + str(self.tensor) + ")" + + @classmethod + def cat(cls, boxes_list): + """Concatenate a list of Boxes into a single Boxes. + + Args: + boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: The concatenated Boxes. + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, cls) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) + # so the returned boxes never share storage with input + cat_boxes = cls( + torch.cat([b.tensor for b in boxes_list], dim=0), + box_dim=boxes_list[0].tensor.shape[1], + with_yaw=boxes_list[0].with_yaw, + ) + return cat_boxes + + def to(self, device): + """Convert current boxes to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BaseInstance3DBoxes`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device), box_dim=self.box_dim, with_yaw=self.with_yaw) + + def clone(self): + """Clone the Boxes. + + Returns: + :obj:`BaseInstance3DBoxes`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) + + @property + def device(self): + """str: The device of the boxes are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a box as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A box of shape (4,). + """ + yield from self.tensor + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate height overlaps of two boxes. + + Note: + This function calculates the height overlaps between boxes1 and + boxes2, boxes1 and boxes2 should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes. 
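cat and __getitem__ above follow the usual torch semantics: concatenation copies storage, and indexing with an int or a boolean mask returns a new boxes object of the same class. A short sketch, with the same import-path assumption as the previous example:

import torch

from mmdet3d.core.bbox.structures import LiDARInstance3DBoxes

a = LiDARInstance3DBoxes(torch.tensor([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]]))
b = LiDARInstance3DBoxes(torch.tensor([[5.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]]))

both = LiDARInstance3DBoxes.cat([a, b])
print(len(both))                # 2
print(both[1].tensor.shape)     # torch.Size([1, 7])  int indexing keeps a 2D tensor
keep = both.tensor[:, 3] > 1.5  # boolean mask over the x_size column
print(len(both[keep]))          # 1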
+ """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), ( + '"boxes1" and "boxes2" should' f"be in the same type, got {type(boxes1)} and {type(boxes2)}." + ) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + heighest_of_bottom = torch.max(boxes1_bottom_height, boxes2_bottom_height) + lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) + return overlaps_h + + @classmethod + def overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate 3D overlaps of two boxes. + + Note: + This function calculates the overlaps between ``boxes1`` and + ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, BaseInstance3DBoxes) + assert isinstance(boxes2, BaseInstance3DBoxes) + assert type(boxes1) == type(boxes2), ( + '"boxes1" and "boxes2" should' f"be in the same type, got {type(boxes1)} and {type(boxes2)}." + ) + + assert mode in ["iou", "iof"] + + rows = len(boxes1) + cols = len(boxes2) + if rows * cols == 0: + return boxes1.tensor.new(rows, cols) + + # height overlap + overlaps_h = cls.height_overlaps(boxes1, boxes2) + + # obtain BEV boxes in XYXYR format + boxes1_bev = xywhr2xyxyr(boxes1.bev) + boxes2_bev = xywhr2xyxyr(boxes2.bev) + + # bev overlap + overlaps_bev = boxes1_bev.new_zeros((boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M) + iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(), boxes2_bev.contiguous().cuda(), overlaps_bev) + + # 3d overlaps + overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h + + volume1 = boxes1.volume.view(-1, 1) + volume2 = boxes2.volume.view(1, -1) + + if mode == "iou": + # the clamp func is used to avoid division of 0 + iou3d = overlaps_3d / torch.clamp(volume1 + volume2 - overlaps_3d, min=1e-8) + else: + iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) + + return iou3d diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py new file mode 100644 index 000000000..3b3a9716b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/box_3d_mode.py @@ -0,0 +1,165 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from enum import IntEnum, unique + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Box3DMode(IntEnum): + r"""Enum of different ways to represent a box. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. 
code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`Box3DMode`): The src Box mode. + dst (:obj:`Box3DMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + "Box3DMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 7" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. 
+ x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError(f"Conversion from Box3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Box3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Box3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Box3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py new file mode 100644 index 000000000..d0073db07 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/cam_box3d.py @@ -0,0 +1,256 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class CameraInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in CAM coordinates. + + Coordinates in camera: + + .. code-block:: none + + z front (yaw=-0.5*pi) + / + / + 0 ------> x right (yaw=0) + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), + and the yaw is around the y axis, thus the rotation axis=1. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of z. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. 
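Box3DMode.convert above swaps the size components and applies a fixed rotation between sensor frames when no rt_mat is given. A toy round trip on a plain tensor (import path hedged as before; the yaw column is passed through unchanged, so only the first six columns are compared):

import torch

from mmdet3d.core.bbox.structures import Box3DMode

# LiDAR box: x front, y left, z up; columns are x, y, z, x_size, y_size, z_size, yaw
lidar = torch.tensor([[1.0, 2.0, 3.0, 4.0, 2.0, 1.5, 0.0]])

cam = Box3DMode.convert(lidar, Box3DMode.LIDAR, Box3DMode.CAM)
# center (1, 2, 3) -> (-2, -3, 1): x_right = -y_left, y_down = -z_up, z_front = x_front
# sizes (4, 2, 1.5) -> (2, 1.5, 4), i.e. (y_size, z_size, x_size)
print(cam)

back = Box3DMode.convert(cam, Box3DMode.CAM, Box3DMode.LIDAR)
print(torch.allclose(back[:, :6], lidar[:, :6]))  # True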
+ + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 1.0, 0.5)): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, box_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() + + if tensor.shape[-1] == 6: + # If the dimension of boxes is 6, we expand box_dim by padding + # 0 as a fake yaw and set with_yaw to False. + assert box_dim == 6 + fake_rot = tensor.new_zeros(tensor.shape[0], 1) + tensor = torch.cat((tensor, fake_rot), dim=-1) + self.box_dim = box_dim + 1 + self.with_yaw = False + else: + self.box_dim = box_dim + self.with_yaw = with_yaw + self.tensor = tensor.clone() + + if origin != (0.5, 1.0, 0.5): + dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) + src = self.tensor.new_tensor(origin) + self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) + + @property + def height(self): + """torch.Tensor: A vector with height of each box.""" + return self.tensor[:, 4] + + @property + def top_height(self): + """torch.Tensor: A vector with the top height of each box.""" + # the positive direction is down rather than up + return self.bottom_height - self.height + + @property + def bottom_height(self): + """torch.Tensor: A vector with bottom's height of each box.""" + return self.tensor[:, 1] + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] + gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes in + shape (N, 8, 3). + + Convert the boxes to in clockwise order, in the form of + (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) + + .. code-block:: none + + front z + / + / + (x0, y0, z1) + ----------- + (x1, y0, z1) + /| / | + / | / | + (x0, y0, z0) + ----------- + + (x1, y1, z1) + | / . | / + | / origin | / + (x0, y1, z0) + ----------- + -------> x right + | (x1, y1, z0) + | + v + down y + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. 
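In camera coordinates +y points down, so top_height above is bottom_height minus height, and the gravity center sits at a smaller y than the bottom center. A quick numeric check with the same import-path assumption:

import torch

from mmdet3d.core.bbox.structures import CameraInstance3DBoxes

# x, y, z, x_size, y_size, z_size, yaw with the default bottom origin (0.5, 1.0, 0.5)
box = CameraInstance3DBoxes(torch.tensor([[0.0, 1.5, 10.0, 2.0, 1.8, 4.0, 0.0]]))

print(box.bottom_height)   # tensor([1.5000])  the y of the bottom face
print(box.top_height)      # tensor([-0.3000]) bottom - height, because +y points down
print(box.gravity_center)  # tensor([[ 0.0000,  0.6000, 10.0000]])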
+ assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 1, 0.5] + corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around y axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + with rotation in XYWHR format.""" + return self.tensor[:, [0, 2, 3, 5, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. + """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[2, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 2::7] = -self.tensor[:, 2::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 0] = -points[:, 0] + elif bev_direction == "vertical": + points[:, 2] = -points[:, 2] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points + + @classmethod + def height_overlaps(cls, boxes1, boxes2, mode="iou"): + """Calculate height overlaps of two boxes. 
+ + This function calculates the height overlaps between ``boxes1`` and + ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. + + Args: + boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. + boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. + mode (str, optional): Mode of iou calculation. Defaults to 'iou'. + + Returns: + torch.Tensor: Calculated iou of boxes' heights. + """ + assert isinstance(boxes1, CameraInstance3DBoxes) + assert isinstance(boxes2, CameraInstance3DBoxes) + + boxes1_top_height = boxes1.top_height.view(-1, 1) + boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) + boxes2_top_height = boxes2.top_height.view(1, -1) + boxes2_bottom_height = boxes2.bottom_height.view(1, -1) + + # In camera coordinate system + # from up to down is the positive direction + heighest_of_bottom = torch.min(boxes1_bottom_height, boxes2_bottom_height) + lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) + overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) + return overlaps_h diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py new file mode 100644 index 000000000..5acf9dc00 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/coord_3d_mode.py @@ -0,0 +1,270 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from enum import IntEnum, unique + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .cam_box3d import CameraInstance3DBoxes +from .depth_box3d import DepthInstance3DBoxes +from .lidar_box3d import LiDARInstance3DBoxes + + +@unique +class Coord3DMode(IntEnum): + r"""Enum of different ways to represent a box + and point cloud. + + Coordinates in LiDAR: + + .. code-block:: none + + up z + ^ x front + | / + | / + left y <------ 0 + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + + Coordinates in camera: + + .. code-block:: none + + z front + / + / + 0 ------> x right + | + | + v + down y + + The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], + and the yaw is around the y axis, thus the rotation axis=1. + + Coordinates in Depth mode: + + .. code-block:: none + + up z + ^ y front + | / + | / + 0 ------> x right + + The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + """ + + LIDAR = 0 + CAM = 1 + DEPTH = 2 + + @staticmethod + def convert(input, src, dst, rt_mat=None): + """Convert boxes or points from `src` mode to `dst` mode.""" + if isinstance(input, BaseInstance3DBoxes): + return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat) + elif isinstance(input, BasePoints): + return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) + else: + raise NotImplementedError + + @staticmethod + def convert_box(box, src, dst, rt_mat=None): + """Convert boxes from `src` mode to `dst` mode. + + Args: + box (tuple | list | np.ndarray | + torch.Tensor | BaseInstance3DBoxes): + Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. + src (:obj:`CoordMode`): The src Box mode. + dst (:obj:`CoordMode`): The target Box mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. 
Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \ + The converted box of the same type. + """ + if src == dst: + return box + + is_numpy = isinstance(box, np.ndarray) + is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) >= 7, ( + "CoordMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 7" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + elif is_Instance3DBoxes: + arr = box.tensor.clone() + else: + arr = box.clone() + + # convert box from `src` mode to `dst` mode. + x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + xyz_size = torch.cat([y_size, z_size, x_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + xyz_size = torch.cat([z_size, x_size, y_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + xyz_size = torch.cat([y_size, x_size, z_size], dim=-1) + else: + raise NotImplementedError(f"Conversion from Coord3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[..., 6:] + arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1) + + # convert arr to the original type + original_type = type(box) + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_Instance3DBoxes: + if dst == Coord3DMode.CAM: + target_type = CameraInstance3DBoxes + elif dst == Coord3DMode.LIDAR: + target_type = LiDARInstance3DBoxes + elif dst == Coord3DMode.DEPTH: + target_type = DepthInstance3DBoxes + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, box_dim=arr.size(-1), with_yaw=box.with_yaw) + else: + return arr + + @staticmethod + def convert_point(point, src, dst, rt_mat=None): + """Convert points from `src` mode to `dst` mode. + + Args: + point (tuple | list | np.ndarray | + torch.Tensor | BasePoints): + Can be a k-tuple, k-list or an Nxk array/tensor. + src (:obj:`CoordMode`): The src Point mode. 
+ dst (:obj:`CoordMode`): The target Point mode. + rt_mat (np.ndarray | torch.Tensor): The rotation and translation + matrix between different coordinates. Defaults to None. + The conversion from `src` coordinates to `dst` coordinates + usually comes along the change of sensors, e.g., from camera + to LiDAR. This requires a transformation matrix. + + Returns: + (tuple | list | np.ndarray | torch.Tensor | BasePoints): \ + The converted point of the same type. + """ + if src == dst: + return point + + is_numpy = isinstance(point, np.ndarray) + is_InstancePoints = isinstance(point, BasePoints) + single_point = isinstance(point, (list, tuple)) + if single_point: + assert len(point) >= 3, ( + "CoordMode.convert takes either a k-tuple/list or " "an Nxk array/tensor, where k >= 3" + ) + arr = torch.tensor(point)[None, :] + else: + # avoid modifying the input point + if is_numpy: + arr = torch.from_numpy(np.asarray(point)).clone() + elif is_InstancePoints: + arr = point.tensor.clone() + else: + arr = point.clone() + + # convert point from `src` mode to `dst` mode. + # TODO: LIDAR + # only implemented provided Rt matrix in cam-depth conversion + if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) + elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) + elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) + elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: + if rt_mat is None: + rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) + else: + raise NotImplementedError(f"Conversion from Coord3DMode {src} to {dst} " "is not supported yet") + + if not isinstance(rt_mat, torch.Tensor): + rt_mat = arr.new_tensor(rt_mat) + if rt_mat.size(1) == 4: + extended_xyz = torch.cat([arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1) + xyz = extended_xyz @ rt_mat.t() + else: + xyz = arr[:, :3] @ rt_mat.t() + + remains = arr[:, 3:] + arr = torch.cat([xyz[:, :3], remains], dim=-1) + + # convert arr to the original type + original_type = type(point) + if single_point: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + elif is_InstancePoints: + if dst == Coord3DMode.CAM: + target_type = CameraPoints + elif dst == Coord3DMode.LIDAR: + target_type = LiDARPoints + elif dst == Coord3DMode.DEPTH: + target_type = DepthPoints + else: + raise NotImplementedError(f"Conversion to {dst} through {original_type}" " is not supported yet") + return target_type(arr, points_dim=arr.size(-1), attribute_dims=point.attribute_dims) + else: + return arr diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py new file mode 100644 index 000000000..7753d025c --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/depth_box3d.py @@ -0,0 +1,187 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. 
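Coord3DMode.convert_point above applies only the rotation part of the frame change, since points carry no size or yaw. A small check that a LiDAR point maps into the camera frame as expected (import path is again an assumption about the vendored package):

import numpy as np

from mmdet3d.core.bbox.structures import Coord3DMode

# LiDAR point: 1 m ahead, 2 m to the left, 3 m up
pt = np.array([[1.0, 2.0, 3.0]])
cam_pt = Coord3DMode.convert_point(pt, Coord3DMode.LIDAR, Coord3DMode.CAM)
print(cam_pt)  # [[-2. -3.  1.]]  x_right = -y_left, y_down = -z_up, z_front = x_front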
+import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class DepthInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in Depth coordinates. + + Coordinates in Depth: + + .. code-block:: none + + up z y front (yaw=-0.5*pi) + ^ ^ + | / + | / + 0 ------> x right (yaw=0) + + The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. + The yaw is 0 at the positive direction of x axis, and decreases from + the positive direction of x to the positive direction of y. + Also note that rotation of DepthInstance3DBoxes is counterclockwise, + which is reverse to the definition of the yaw angle (clockwise). + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicates the dimension of a box + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front y ^ + / | + / | + (x0, y1, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + (x0, y0, z0) + ----------- + --------> right x + (x1, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin (0.5, 0.5, 0) + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: A n x 5 tensor of 2D BEV box of each box + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angle (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]).T + else: + rot_mat_T = angle.T + rot_sin = rot_mat_T[0, 1] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T + if self.with_yaw: + self.tensor[:, 6] -= angle + else: + corners_rot = self.corners @ rot_mat_T + new_x_size = ( + corners_rot[..., 0].max(dim=1, keepdim=True)[0] - corners_rot[..., 0].min(dim=1, keepdim=True)[0] + ) + new_y_size = ( + corners_rot[..., 1].max(dim=1, keepdim=True)[0] - corners_rot[..., 1].min(dim=1, keepdim=True)[0] + ) + self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # anti-clockwise + points.rotate(angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In Depth coordinates, it flips x (horizontal) or y (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 0] = -points[:, 0] + elif bev_direction == "vertical": + points[:, 1] = -points[:, 1] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py new file mode 100644 index 000000000..9a6a4e79a --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/lidar_box3d.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from .base_box3d import BaseInstance3DBoxes +from .utils import rotation_3d_in_axis + + +class LiDARInstance3DBoxes(BaseInstance3DBoxes): + """3D boxes of instances in LIDAR coordinates. + + Coordinates in LiDAR: + + .. code-block:: none + + up z x front (yaw=-0.5*pi) + ^ ^ + | / + | / + (yaw=-pi) left y <------ 0 -------- (yaw=0) + + The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), + and the yaw is around the z axis, thus the rotation axis=2. 
+ The yaw is 0 at the negative direction of y axis, and decreases from + the negative direction of y to the positive direction of x. + + A refactor is ongoing to make the three coordinate systems + easier to understand and convert between each other. + + Attributes: + tensor (torch.Tensor): Float matrix of N x box_dim. + box_dim (int): Integer indicating the dimension of a box. + Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). + with_yaw (bool): If True, the value of yaw will be set to 0 as minmax + boxes. + """ + + @property + def gravity_center(self): + """torch.Tensor: A tensor with center of each box.""" + bottom_center = self.bottom_center + gravity_center = torch.zeros_like(bottom_center) + gravity_center[:, :2] = bottom_center[:, :2] + gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 + return gravity_center + + @property + def corners(self): + """torch.Tensor: Coordinates of corners of all the boxes + in shape (N, 8, 3). + + Convert the boxes to corners in clockwise order, in form of + ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` + + .. code-block:: none + + up z + front x ^ + / | + / | + (x1, y0, z1) + ----------- + (x1, y1, z1) + /| / | + / | / | + (x0, y0, z1) + ----------- + + (x1, y1, z0) + | / . | / + | / origin | / + left y<-------- + ----------- + (x0, y1, z0) + (x0, y0, z0) + """ + # TODO: rotation_3d_in_axis function do not support + # empty tensor currently. + assert len(self.tensor) != 0 + dims = self.dims + corners_norm = torch.from_numpy(np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( + device=dims.device, dtype=dims.dtype + ) + + corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] + # use relative origin [0.5, 0.5, 0] + corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) + corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) + + # rotate around z axis + corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2) + corners += self.tensor[:, :3].view(-1, 1, 3) + return corners + + @property + def bev(self): + """torch.Tensor: 2D BEV box of each box with rotation + in XYWHR format.""" + return self.tensor[:, [0, 1, 3, 4, 6]] + + def rotate(self, angle, points=None): + """Rotate boxes with points (optional) with the given angle or \ + rotation matrix. + + Args: + angles (float | torch.Tensor | np.ndarray): + Rotation angle or rotation matrix. + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional): + Points to rotate. Defaults to None. + + Returns: + tuple or None: When ``points`` is None, the function returns \ + None, otherwise it returns the rotated points and the \ + rotation matrix ``rot_mat_T``. 
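+        Example:
+            A minimal sketch with made-up values; for ``numpy`` points the
+            rotation matrix is returned as a ``numpy`` array as well.
+
+            >>> import numpy as np
+            >>> import torch
+            >>> boxes = LiDARInstance3DBoxes(
+            ...     torch.tensor([[10.0, 0.0, -1.0, 4.0, 2.0, 1.5, 0.0]]))
+            >>> pts = np.random.rand(8, 4).astype(np.float32)
+            >>> pts, rot_mat_T = boxes.rotate(np.pi / 2, pts)
+            >>> rot_mat_T.shape
+            (3, 3)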
+ """ + if not isinstance(angle, torch.Tensor): + angle = self.tensor.new_tensor(angle) + assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, f"invalid rotation angle shape {angle.shape}" + + if angle.numel() == 1: + rot_sin = torch.sin(angle) + rot_cos = torch.cos(angle) + rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) + else: + rot_mat_T = angle + rot_sin = rot_mat_T[1, 0] + rot_cos = rot_mat_T[0, 0] + angle = np.arctan2(rot_sin, rot_cos) + + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + self.tensor[:, 6] += angle + + if self.tensor.shape[1] == 9: + # rotate velo vector + self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] + + if points is not None: + if isinstance(points, torch.Tensor): + points[:, :3] = points[:, :3] @ rot_mat_T + elif isinstance(points, np.ndarray): + rot_mat_T = rot_mat_T.numpy() + points[:, :3] = np.dot(points[:, :3], rot_mat_T) + elif isinstance(points, BasePoints): + # clockwise + points.rotate(-angle) + else: + raise ValueError + return points, rot_mat_T + + def flip(self, bev_direction="horizontal", points=None): + """Flip the boxes in BEV along given BEV direction. + + In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. + + Args: + bev_direction (str): Flip direction (horizontal or vertical). + points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None): + Points to flip. Defaults to None. + + Returns: + torch.Tensor, numpy.ndarray or None: Flipped points. + """ + assert bev_direction in ("horizontal", "vertical") + if bev_direction == "horizontal": + self.tensor[:, 1::7] = -self.tensor[:, 1::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + np.pi + elif bev_direction == "vertical": + self.tensor[:, 0::7] = -self.tensor[:, 0::7] + if self.with_yaw: + self.tensor[:, 6] = -self.tensor[:, 6] + + if points is not None: + assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) + if isinstance(points, (torch.Tensor, np.ndarray)): + if bev_direction == "horizontal": + points[:, 1] = -points[:, 1] + elif bev_direction == "vertical": + points[:, 0] = -points[:, 0] + elif isinstance(points, BasePoints): + points.flip(bev_direction) + return points diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py new file mode 100644 index 000000000..0b1201e93 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/structures/utils.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from logging import warning + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +def limit_period(val, offset=0.5, period=np.pi): + """Limit the value into a period for periodic function. + + Args: + val (torch.Tensor): The value to be converted. + offset (float, optional): Offset to set the value range. \ + Defaults to 0.5. + period ([type], optional): Period of the value. Defaults to np.pi. + + Returns: + torch.Tensor: Value in the range of \ + [-offset * period, (1-offset) * period] + """ + return val - torch.floor(val / period + offset) * period + + +def rotation_3d_in_axis(points, angles, axis=0): + """Rotate points by angles according to axis. + + Args: + points (torch.Tensor): Points of shape (N, M, 3). + angles (torch.Tensor): Vector of angles in shape (N,) + axis (int, optional): The axis to be rotated. Defaults to 0. 
+ + Raises: + ValueError: when the axis is not in range [0, 1, 2], it will \ + raise value error. + + Returns: + torch.Tensor: Rotated points in shape (N, M, 3) + """ + rot_sin = torch.sin(angles) + rot_cos = torch.cos(angles) + ones = torch.ones_like(rot_cos) + zeros = torch.zeros_like(rot_cos) + if axis == 1: + rot_mat_T = torch.stack( + [ + torch.stack([rot_cos, zeros, -rot_sin]), + torch.stack([zeros, ones, zeros]), + torch.stack([rot_sin, zeros, rot_cos]), + ] + ) + elif axis == 2 or axis == -1: + rot_mat_T = torch.stack( + [ + torch.stack([rot_cos, -rot_sin, zeros]), + torch.stack([rot_sin, rot_cos, zeros]), + torch.stack([zeros, zeros, ones]), + ] + ) + elif axis == 0: + rot_mat_T = torch.stack( + [ + torch.stack([zeros, rot_cos, -rot_sin]), + torch.stack([zeros, rot_sin, rot_cos]), + torch.stack([ones, zeros, zeros]), + ] + ) + else: + raise ValueError(f"axis should in range [0, 1, 2], got {axis}") + + return torch.einsum("aij,jka->aik", (points, rot_mat_T)) + + +def xywhr2xyxyr(boxes_xywhr): + """Convert a rotated boxes in XYWHR format to XYXYR format. + + Args: + boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format. + + Returns: + torch.Tensor: Converted boxes in XYXYR format. + """ + boxes = torch.zeros_like(boxes_xywhr) + half_w = boxes_xywhr[:, 2] / 2 + half_h = boxes_xywhr[:, 3] / 2 + + boxes[:, 0] = boxes_xywhr[:, 0] - half_w + boxes[:, 1] = boxes_xywhr[:, 1] - half_h + boxes[:, 2] = boxes_xywhr[:, 0] + half_w + boxes[:, 3] = boxes_xywhr[:, 1] + half_h + boxes[:, 4] = boxes_xywhr[:, 4] + return boxes + + +def get_box_type(box_type): + """Get the type and mode of box structure. + + Args: + box_type (str): The type of box structure. + The valid value are "LiDAR", "Camera", or "Depth". + + Returns: + tuple: Box type and box mode. + """ + from .box_3d_mode import ( + Box3DMode, + CameraInstance3DBoxes, + DepthInstance3DBoxes, + LiDARInstance3DBoxes, + ) + + box_type_lower = box_type.lower() + if box_type_lower == "lidar": + box_type_3d = LiDARInstance3DBoxes + box_mode_3d = Box3DMode.LIDAR + elif box_type_lower == "camera": + box_type_3d = CameraInstance3DBoxes + box_mode_3d = Box3DMode.CAM + elif box_type_lower == "depth": + box_type_3d = DepthInstance3DBoxes + box_mode_3d = Box3DMode.DEPTH + else: + raise ValueError('Only "box_type" of "camera", "lidar", "depth"' f" are supported, got {box_type}") + + return box_type_3d, box_mode_3d + + +def points_cam2img(points_3d, proj_mat, with_depth=False): + """Project points from camera coordicates to image coordinates. + + Args: + points_3d (torch.Tensor): Points in shape (N, 3). + proj_mat (torch.Tensor): Transformation matrix between coordinates. + with_depth (bool, optional): Whether to keep depth in the output. + Defaults to False. + + Returns: + torch.Tensor: Points in image coordinates with shape [N, 2]. + """ + points_num = list(points_3d.shape)[:-1] + + points_shape = np.concatenate([points_num, [1]], axis=0).tolist() + assert len(proj_mat.shape) == 2, ( + "The dimension of the projection" f" matrix should be 2 instead of {len(proj_mat.shape)}." + ) + d1, d2 = proj_mat.shape[:2] + assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (d1 == 4 and d2 == 4), ( + "The shape of the projection matrix" f" ({d1}*{d2}) is not supported." 
+ ) + if d1 == 3: + proj_mat_expanded = torch.eye(4, device=proj_mat.device, dtype=proj_mat.dtype) + proj_mat_expanded[:d1, :d2] = proj_mat + proj_mat = proj_mat_expanded + + # previous implementation use new_zeros, new_one yeilds better results + points_4 = torch.cat([points_3d, points_3d.new_ones(*points_shape)], dim=-1) + point_2d = torch.matmul(points_4, proj_mat.t()) + point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] + + if with_depth: + return torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) + return point_2d_res + + +def mono_cam_box2vis(cam_box): + """This is a post-processing function on the bboxes from Mono-3D task. If + we want to perform projection visualization, we need to: + + 1. rotate the box along x-axis for np.pi / 2 (roll) + 2. change orientation from local yaw to global yaw + 3. convert yaw by (np.pi / 2 - yaw) + + After applying this function, we can project and draw it on 2D images. + + Args: + cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate \ + system before conversion. Could be gt bbox loaded from dataset or \ + network prediction output. + + Returns: + :obj:`CameraInstance3DBoxes`: Box after conversion. + """ + warning.warn( + "DeprecationWarning: The hack of yaw and dimension in the " + "monocular 3D detection on nuScenes has been removed. The " + "function mono_cam_box2vis will be deprecated." + ) + from . import CameraInstance3DBoxes + + assert isinstance(cam_box, CameraInstance3DBoxes), "input bbox should be CameraInstance3DBoxes!" + + loc = cam_box.gravity_center + dim = cam_box.dims + yaw = cam_box.yaw + feats = cam_box.tensor[:, 7:] + # rotate along x-axis for np.pi / 2 + # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa + dim[:, [1, 2]] = dim[:, [2, 1]] + # change local yaw to global yaw for visualization + # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa + yaw += torch.atan2(loc[:, 0], loc[:, 2]) + # convert yaw by (-yaw - np.pi / 2) + # this is because mono 3D box class such as `NuScenesBox` has different + # definition of rotation with our `CameraInstance3DBoxes` + yaw = -yaw - np.pi / 2 + cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) + cam_box = CameraInstance3DBoxes(cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) + + return cam_box + + +def get_proj_mat_by_coord_type(img_meta, coord_type): + """Obtain image features using points. + + Args: + img_meta (dict): Meta info. + coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. + Can be case-insensitive. + + Returns: + torch.Tensor: transformation matrix. + """ + coord_type = coord_type.upper() + mapping = {"LIDAR": "lidar2img", "DEPTH": "depth2img", "CAMERA": "cam2img"} + assert coord_type in mapping.keys() + return img_meta[mapping[coord_type]] diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py new file mode 100644 index 000000000..3a23aebd7 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/bbox/transforms.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +def bbox3d2result(bboxes, scores, labels, attrs=None): + """Convert detection results to a list of numpy arrays. + + Args: + bboxes (torch.Tensor): Bounding boxes with shape of (n, 5). 
+ labels (torch.Tensor): Labels with shape of (n, ). + scores (torch.Tensor): Scores with shape of (n, ). + attrs (torch.Tensor, optional): Attributes with shape of (n, ). \ + Defaults to None. + + Returns: + dict[str, torch.Tensor]: Bounding box results in cpu mode. + + - boxes_3d (torch.Tensor): 3D boxes. + - scores (torch.Tensor): Prediction scores. + - labels_3d (torch.Tensor): Box labels. + - attrs_3d (torch.Tensor, optional): Box attributes. + """ + result_dict = dict(boxes_3d=bboxes.to("cpu"), scores_3d=scores.cpu(), labels_3d=labels.cpu()) + + if attrs is not None: + result_dict["attrs_3d"] = attrs.cpu() + + return result_dict diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py new file mode 100644 index 000000000..bdded694d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base_points import BasePoints diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py new file mode 100644 index 000000000..8c51c5adc --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/core/points/base_points.py @@ -0,0 +1,335 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings +from abc import abstractmethod + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + + +class BasePoints(object): + """Base class for Points. + + Args: + tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. + points_dim (int): Number of the dimension of a point. + Each row is (x, y, z). Default to 3. + attribute_dims (dict): Dictionary to indicate the meaning of extra + dimension. Default to None. + + Attributes: + tensor (torch.Tensor): Float matrix of N x points_dim. + points_dim (int): Integer indicating the dimension of a point. + Each row is (x, y, z, ...). + attribute_dims (bool): Dictionary to indicate the meaning of extra + dimension. Default to None. + rotation_axis (int): Default rotation axis for points rotation. 
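+    Example:
+        A minimal sketch with made-up values, using a hypothetical 4-dim
+        point layout whose last column is mapped to ``height``:
+
+        >>> import torch
+        >>> points = BasePoints(
+        ...     torch.rand(4, 4), points_dim=4,
+        ...     attribute_dims=dict(height=3))
+        >>> points.coord.shape
+        torch.Size([4, 3])
+        >>> points.height.shape
+        torch.Size([4])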
+ """ + + def __init__(self, tensor, points_dim=3, attribute_dims=None): + if isinstance(tensor, torch.Tensor): + device = tensor.device + else: + device = torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that + # does not depend on the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, points_dim)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == points_dim, tensor.size() + + self.tensor = tensor + self.points_dim = points_dim + self.attribute_dims = attribute_dims + self.rotation_axis = 0 + + @property + def coord(self): + """torch.Tensor: Coordinates of each point with size (N, 3).""" + return self.tensor[:, :3] + + @coord.setter + def coord(self, tensor): + """Set the coordinates of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + self.tensor[:, :3] = tensor + + @property + def height(self): + """torch.Tensor: A vector with height of each point.""" + if self.attribute_dims is not None and "height" in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims["height"]] + else: + return None + + @height.setter + def height(self, tensor): + """Set the height of each point.""" + try: + tensor = tensor.reshape(self.shape[0]) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and "height" in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims["height"]] = tensor + else: + # add height attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) + self.attribute_dims.update(dict(height=attr_dim)) + self.points_dim += 1 + + @property + def color(self): + """torch.Tensor: A vector with color of each point.""" + if self.attribute_dims is not None and "color" in self.attribute_dims.keys(): + return self.tensor[:, self.attribute_dims["color"]] + else: + return None + + @color.setter + def color(self, tensor): + """Set the color of each point.""" + try: + tensor = tensor.reshape(self.shape[0], 3) + except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray + raise ValueError(f"got unexpected shape {tensor.shape}") + if tensor.max() >= 256 or tensor.min() < 0: + warnings.warn("point got color value beyond [0, 255]") + if not isinstance(tensor, torch.Tensor): + tensor = self.tensor.new_tensor(tensor) + if self.attribute_dims is not None and "color" in self.attribute_dims.keys(): + self.tensor[:, self.attribute_dims["color"]] = tensor + else: + # add color attribute + if self.attribute_dims is None: + self.attribute_dims = dict() + attr_dim = self.shape[1] + self.tensor = torch.cat([self.tensor, tensor], dim=1) + self.attribute_dims.update(dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) + self.points_dim += 3 + + @property + def shape(self): + """torch.Shape: Shape of points.""" + return self.tensor.shape + + def shuffle(self): + """Shuffle the points. + + Returns: + torch.Tensor: The shuffled index. 
+ """ + idx = torch.randperm(self.__len__(), device=self.tensor.device) + self.tensor = self.tensor[idx] + return idx + + def rotate(self, rotation, axis=None): + """Rotate points with the given rotation matrix or angle. + + Args: + rotation (float, np.ndarray, torch.Tensor): Rotation matrix + or angle. + axis (int): Axis to rotate at. Defaults to None. + """ + if not isinstance(rotation, torch.Tensor): + rotation = self.tensor.new_tensor(rotation) + assert rotation.shape == torch.Size([3, 3]) or rotation.numel() == 1, f"invalid rotation shape {rotation.shape}" + + if axis is None: + axis = self.rotation_axis + + if rotation.numel() == 1: + rot_sin = torch.sin(rotation) + rot_cos = torch.cos(rotation) + if axis == 1: + rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]]) + elif axis == 2 or axis == -1: + rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) + elif axis == 0: + rot_mat_T = rotation.new_tensor([[0, rot_cos, -rot_sin], [0, rot_sin, rot_cos], [1, 0, 0]]) + else: + raise ValueError("axis should in range") + rot_mat_T = rot_mat_T.T + elif rotation.numel() == 9: + rot_mat_T = rotation + else: + raise NotImplementedError + self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T + + return rot_mat_T + + @abstractmethod + def flip(self, bev_direction="horizontal"): + """Flip the points in BEV along given BEV direction.""" + pass + + def scale(self, scale_factor): + """Scale the points with horizontal and vertical scaling factors. + + Args: + scale_factors (float): Scale factors to scale the points. + """ + self.tensor[:, :3] *= scale_factor + + def __getitem__(self, item): + """ + Note: + The following usage are allowed: + 1. `new_points = points[3]`: + return a `Points` that contains only one point. + 2. `new_points = points[2:10]`: + return a slice of points. + 3. `new_points = points[vector]`: + where vector is a torch.BoolTensor with `length = len(points)`. + Nonzero elements in the vector will be selected. + 4. `new_points = points[3:11, vector]`: + return a slice of points and attribute dims. + 5. `new_points = points[4:12, 2]`: + return a slice of points with single attribute. + Note that the returned Points might share storage with this Points, + subject to Pytorch's indexing semantics. + + Returns: + :obj:`BasePoints`: A new object of \ + :class:`BasePoints` after indexing. 
+ """ + original_type = type(self) + if isinstance(item, int): + return original_type( + self.tensor[item].view(1, -1), points_dim=self.points_dim, attribute_dims=self.attribute_dims + ) + elif isinstance(item, tuple) and len(item) == 2: + if isinstance(item[1], slice): + start = 0 if item[1].start is None else item[1].start + stop = self.tensor.shape[1] if item[1].stop is None else item[1].stop + step = 1 if item[1].step is None else item[1].step + item = list(item) + item[1] = list(range(start, stop, step)) + item = tuple(item) + elif isinstance(item[1], int): + item = list(item) + item[1] = [item[1]] + item = tuple(item) + p = self.tensor[item[0], item[1]] + + keep_dims = list(set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) + if self.attribute_dims is not None: + attribute_dims = self.attribute_dims.copy() + for key in self.attribute_dims.keys(): + cur_attribute_dims = attribute_dims[key] + if isinstance(cur_attribute_dims, int): + cur_attribute_dims = [cur_attribute_dims] + intersect_attr = list(set(cur_attribute_dims).intersection(set(keep_dims))) + if len(intersect_attr) == 1: + attribute_dims[key] = intersect_attr[0] + elif len(intersect_attr) > 1: + attribute_dims[key] = intersect_attr + else: + attribute_dims.pop(key) + else: + attribute_dims = None + elif isinstance(item, (slice, np.ndarray, torch.Tensor)): + p = self.tensor[item] + attribute_dims = self.attribute_dims + else: + raise NotImplementedError(f"Invalid slice {item}!") + + assert p.dim() == 2, f"Indexing on Points with {item} failed to return a matrix!" + return original_type(p, points_dim=p.shape[1], attribute_dims=attribute_dims) + + def __len__(self): + """int: Number of points in the current object.""" + return self.tensor.shape[0] + + def __repr__(self): + """str: Return a strings that describes the object.""" + return self.__class__.__name__ + "(\n " + str(self.tensor) + ")" + + @classmethod + def cat(cls, points_list): + """Concatenate a list of Points into a single Points. + + Args: + points_list (list[:obj:`BasePoints`]): List of points. + + Returns: + :obj:`BasePoints`: The concatenated Points. + """ + assert isinstance(points_list, (list, tuple)) + if len(points_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(points, cls) for points in points_list) + + # use torch.cat (v.s. layers.cat) + # so the returned points never share storage with input + cat_points = cls( + torch.cat([p.tensor for p in points_list], dim=0), + points_dim=points_list[0].tensor.shape[1], + attribute_dims=points_list[0].attribute_dims, + ) + return cat_points + + def to(self, device): + """Convert current points to a specific device. + + Args: + device (str | :obj:`torch.device`): The name of the device. + + Returns: + :obj:`BasePoints`: A new boxes object on the \ + specific device. + """ + original_type = type(self) + return original_type(self.tensor.to(device), points_dim=self.points_dim, attribute_dims=self.attribute_dims) + + def clone(self): + """Clone the Points. + + Returns: + :obj:`BasePoints`: Box object with the same properties \ + as self. + """ + original_type = type(self) + return original_type(self.tensor.clone(), points_dim=self.points_dim, attribute_dims=self.attribute_dims) + + @property + def device(self): + """str: The device of the points are on.""" + return self.tensor.device + + def __iter__(self): + """Yield a point as a Tensor of shape (4,) at a time. + + Returns: + torch.Tensor: A point of shape (4,). 
+ """ + yield from self.tensor + + def new_point(self, data): + """Create a new point object with data. + + The new point and its tensor has the similar properties \ + as self and self.tensor, respectively. + + Args: + data (torch.Tensor | numpy.array | list): Data to be copied. + + Returns: + :obj:`BasePoints`: A new point object with ``data``, \ + the object's other properties are similar to ``self``. + """ + new_tensor = self.tensor.new_tensor(data) if not isinstance(data, torch.Tensor) else data.to(self.device) + original_type = type(self) + return original_type(new_tensor, points_dim=self.points_dim, attribute_dims=self.attribute_dims) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py new file mode 100644 index 000000000..d9127610f --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/__init__.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets.builder import build_dataloader + +from .builder import DATASETS, build_dataset +from .custom_3d import Custom3DDataset +from .nuscenes_dataset import NuScenesDataset diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py new file mode 100644 index 000000000..8dbd95eb3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/builder.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import platform + +from mmcv.utils import build_from_cfg +from mmdet.datasets import DATASETS + +if platform.system() != "Windows": + # https://github.com/pytorch/pytorch/issues/973 + import resource + + rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) + base_soft_limit = rlimit[0] + hard_limit = rlimit[1] + soft_limit = min(max(4096, base_soft_limit), hard_limit) + resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) + + +def build_dataset(cfg, default_args=None): + + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py new file mode 100644 index 000000000..65bf42cfd --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/custom_3d.py @@ -0,0 +1,222 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +from os import path as osp + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmdet.datasets import DATASETS +from torch.utils.data import Dataset + +from ..core.bbox.structures import get_box_type +from .pipelines import Compose + + +@DATASETS.register_module() +class Custom3DDataset(Dataset): + """Customized 3D dataset. + + This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI + dataset. + + Args: + data_root (str): Path of dataset root. + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. 
+ box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. + Defaults to 'LiDAR'. Available options includes + + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + """ + + def __init__( + self, + data_root, + ann_file, + pipeline=None, + classes=None, + modality=None, + box_type_3d="LiDAR", + filter_empty_gt=True, + test_mode=False, + ): + super().__init__() + self.data_root = data_root + self.ann_file = ann_file + self.test_mode = test_mode + self.modality = modality + self.filter_empty_gt = filter_empty_gt + self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) + + self.CLASSES = self.get_classes(classes) + self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} + self.data_infos = self.load_annotations(self.ann_file) + + if pipeline is not None: + self.pipeline = Compose(pipeline) + + # set group flag for the sampler + if not self.test_mode: + self._set_group_flag() + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. + + Returns: + list[dict]: List of annotations. + """ + return mmcv.load(ann_file) + + def get_data_info(self, index): + """Get data info according to the given index. + + Args: + index (int): Index of the sample data to get. + + Returns: + dict: Data information that will be passed to the data \ + preprocessing pipelines. It includes the following keys: + + - sample_idx (str): Sample index. + - pts_filename (str): Filename of point clouds. + - file_name (str): Filename of point clouds. + - ann_info (dict): Annotation info. + """ + info = self.data_infos[index] + sample_idx = info["point_cloud"]["lidar_idx"] + pts_filename = osp.join(self.data_root, info["pts_path"]) + + input_dict = dict(pts_filename=pts_filename, sample_idx=sample_idx, file_name=pts_filename) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict["ann_info"] = annos + if self.filter_empty_gt and ~(annos["gt_labels_3d"] != -1).any(): + return None + return input_dict + + def pre_pipeline(self, results): + """Initialization before data preparation. + + Args: + results (dict): Dict before data preprocessing. + + - img_fields (list): Image fields. + - bbox3d_fields (list): 3D bounding boxes fields. + - pts_mask_fields (list): Mask fields of points. + - pts_seg_fields (list): Mask fields of point segments. + - bbox_fields (list): Fields of bounding boxes. + - mask_fields (list): Fields of masks. + - seg_fields (list): Segment fields. + - box_type_3d (str): 3D box type. + - box_mode_3d (str): 3D box mode. + """ + results["img_fields"] = [] + results["bbox3d_fields"] = [] + results["pts_mask_fields"] = [] + results["pts_seg_fields"] = [] + results["bbox_fields"] = [] + results["mask_fields"] = [] + results["seg_fields"] = [] + results["box_type_3d"] = self.box_type_3d + results["box_mode_3d"] = self.box_mode_3d + + def prepare_test_data(self, index): + """Prepare data for testing. + + Args: + index (int): Index for accessing the target data. + + Returns: + dict: Testing data dict of the corresponding index. 
+ """ + input_dict = self.get_data_info(index) + self.pre_pipeline(input_dict) + example = self.pipeline(input_dict) + return example + + @classmethod + def get_classes(cls, classes=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + + Return: + list[str]: A list of class names. + """ + if classes is None: + return cls.CLASSES + + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f"Unsupported type {type(classes)} of classes.") + + return class_names + + def __len__(self): + """Return the length of data infos. + + Returns: + int: Length of data infos. + """ + return len(self.data_infos) + + def _rand_another(self, idx): + """Randomly get another item with the same flag. + + Returns: + int: Another index of item with the same flag. + """ + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + """Get item from infos according to the given index. + + Returns: + dict: Data dictionary of the corresponding index. + """ + if self.test_mode: + return self.prepare_test_data(idx) + while True: + data = self.prepare_train_data(idx) + if data is None: + idx = self._rand_another(idx) + continue + return data + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. In 3D datasets, they are all the same, thus are all + zeros. + """ + self.flag = np.zeros(len(self), dtype=np.uint8) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py new file mode 100644 index 000000000..f48c9daf6 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/nuscenes_dataset.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +from mmdet.datasets import DATASETS + +from .custom_3d import Custom3DDataset + + +@DATASETS.register_module() +class NuScenesDataset(Custom3DDataset): + r"""NuScenes Dataset. + + This class serves as the API for experiments on the NuScenes Dataset. + + Please refer to `NuScenes Dataset `_ + for data downloading. + + Args: + ann_file (str): Path of annotation file. + pipeline (list[dict], optional): Pipeline used for data processing. + Defaults to None. + data_root (str): Path of dataset root. + classes (tuple[str], optional): Classes used in the dataset. + Defaults to None. + load_interval (int, optional): Interval of loading the dataset. It is + used to uniformly sample the dataset. Defaults to 1. + with_velocity (bool, optional): Whether include velocity prediction + into the experiments. Defaults to True. + modality (dict, optional): Modality to specify the sensor data used + as input. Defaults to None. + box_type_3d (str, optional): Type of 3D box of this dataset. + Based on the `box_type_3d`, the dataset will encapsulate the box + to its original format then converted them to `box_type_3d`. 
+ Defaults to 'LiDAR' in this dataset. Available options includes. + - 'LiDAR': Box in LiDAR coordinates. + - 'Depth': Box in depth coordinates, usually for indoor dataset. + - 'Camera': Box in camera coordinates. + filter_empty_gt (bool, optional): Whether to filter empty GT. + Defaults to True. + test_mode (bool, optional): Whether the dataset is in test mode. + Defaults to False. + eval_version (bool, optional): Configuration version of evaluation. + Defaults to 'detection_cvpr_2019'. + use_valid_flag (bool): Whether to use `use_valid_flag` key in the info + file as mask to filter gt_boxes and gt_names. Defaults to False. + """ + NameMapping = { + "movable_object.barrier": "barrier", + "vehicle.bicycle": "bicycle", + "vehicle.bus.bendy": "bus", + "vehicle.bus.rigid": "bus", + "vehicle.car": "car", + "vehicle.construction": "construction_vehicle", + "vehicle.motorcycle": "motorcycle", + "human.pedestrian.adult": "pedestrian", + "human.pedestrian.child": "pedestrian", + "human.pedestrian.construction_worker": "pedestrian", + "human.pedestrian.police_officer": "pedestrian", + "movable_object.trafficcone": "traffic_cone", + "vehicle.trailer": "trailer", + "vehicle.truck": "truck", + } + DefaultAttribute = { + "car": "vehicle.parked", + "pedestrian": "pedestrian.moving", + "trailer": "vehicle.parked", + "truck": "vehicle.parked", + "bus": "vehicle.moving", + "motorcycle": "cycle.without_rider", + "construction_vehicle": "vehicle.parked", + "bicycle": "cycle.without_rider", + "barrier": "", + "traffic_cone": "", + } + AttrMapping = { + "cycle.with_rider": 0, + "cycle.without_rider": 1, + "pedestrian.moving": 2, + "pedestrian.standing": 3, + "pedestrian.sitting_lying_down": 4, + "vehicle.moving": 5, + "vehicle.parked": 6, + "vehicle.stopped": 7, + } + AttrMapping_rev = [ + "cycle.with_rider", + "cycle.without_rider", + "pedestrian.moving", + "pedestrian.standing", + "pedestrian.sitting_lying_down", + "vehicle.moving", + "vehicle.parked", + "vehicle.stopped", + ] + # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa + ErrNameMapping = { + "trans_err": "mATE", + "scale_err": "mASE", + "orient_err": "mAOE", + "vel_err": "mAVE", + "attr_err": "mAAE", + } + CLASSES = ( + "car", + "truck", + "trailer", + "bus", + "construction_vehicle", + "bicycle", + "motorcycle", + "pedestrian", + "traffic_cone", + "barrier", + ) + + def __init__( + self, + ann_file, + pipeline=None, + data_root=None, + classes=None, + load_interval=1, + with_velocity=True, + modality=None, + box_type_3d="LiDAR", + filter_empty_gt=True, + test_mode=False, + eval_version="detection_cvpr_2019", + use_valid_flag=False, + ): + self.load_interval = load_interval + self.use_valid_flag = use_valid_flag + super().__init__( + data_root=data_root, + ann_file=ann_file, + pipeline=pipeline, + classes=classes, + modality=modality, + box_type_3d=box_type_3d, + filter_empty_gt=filter_empty_gt, + test_mode=test_mode, + ) + + self.with_velocity = with_velocity + self.eval_version = eval_version + from nuscenes.eval.detection.config import config_factory + + self.eval_detection_configs = config_factory(self.eval_version) + if self.modality is None: + self.modality = dict( + use_camera=False, + use_lidar=True, + use_radar=False, + use_map=False, + use_external=False, + ) + + def load_annotations(self, ann_file): + """Load annotations from ann_file. + + Args: + ann_file (str): Path of the annotation file. 
+ + Returns: + list[dict]: List of annotations sorted by timestamps. + """ + data = mmcv.load(ann_file) + data_infos = list(sorted(data["infos"], key=lambda e: e["timestamp"])) + data_infos = data_infos[:: self.load_interval] + self.metadata = data["metadata"] + self.version = self.metadata["version"] + return data_infos diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py new file mode 100644 index 000000000..843c9ff7c --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmdet.datasets.pipelines import Compose + +from .loading import LoadMultiViewImageFromFiles diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py new file mode 100644 index 000000000..e4e0f56c3 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/formating.py @@ -0,0 +1,285 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +from mmcv.parallel import DataContainer as DC +from mmdet3d.core.bbox.structures import BaseInstance3DBoxes +from mmdet3d.core.points import BasePoints +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import to_tensor + +PIPELINES._module_dict.pop("DefaultFormatBundle") + + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img", + "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \ + (3)to DataContainer (stack=True) + """ + + def __init__( + self, + ): + return + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
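+        Example:
+            A minimal sketch on a synthetic ``results`` dict with a
+            made-up HWC image:
+
+            >>> import numpy as np
+            >>> bundle = DefaultFormatBundle()
+            >>> results = bundle(dict(img=np.zeros((4, 4, 3), np.float32)))
+            >>> results['img'].data.shape  # DataContainer holding a CHW tensor
+            torch.Size([3, 4, 4])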
+ """ + if "img" in results: + if isinstance(results["img"], list): + # process multiple imgs in single frame + imgs = [img.transpose(2, 0, 1) for img in results["img"]] + imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) + results["img"] = DC(to_tensor(imgs), stack=True) + else: + img = np.ascontiguousarray(results["img"].transpose(2, 0, 1)) + results["img"] = DC(to_tensor(img), stack=True) + for key in [ + "proposals", + "gt_bboxes", + "gt_bboxes_ignore", + "gt_labels", + "gt_labels_3d", + "attr_labels", + "pts_instance_mask", + "pts_semantic_mask", + "centers2d", + "depths", + ]: + if key not in results: + continue + if isinstance(results[key], list): + results[key] = DC([to_tensor(res) for res in results[key]]) + else: + results[key] = DC(to_tensor(results[key])) + if "gt_bboxes_3d" in results: + if isinstance(results["gt_bboxes_3d"], BaseInstance3DBoxes): + results["gt_bboxes_3d"] = DC(results["gt_bboxes_3d"], cpu_only=True) + else: + results["gt_bboxes_3d"] = DC(to_tensor(results["gt_bboxes_3d"])) + + if "gt_masks" in results: + results["gt_masks"] = DC(results["gt_masks"], cpu_only=True) + if "gt_semantic_seg" in results: + results["gt_semantic_seg"] = DC(to_tensor(results["gt_semantic_seg"][None, ...]), stack=True) + + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect3D(object): + """Collect data from the loader relevant to the specific task. + + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "proposals", "gt_bboxes", + "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - 'img_shape': shape of the image input to the network as a tuple \ + (h, w, c). Note that images may be zero padded on the \ + bottom/right if the batch tensor is larger than this shape. + - 'scale_factor': a float indicating the preprocessing scale + - 'flip': a boolean indicating if image flip transform was used + - 'filename': path to the image file + - 'ori_shape': original shape of the image as a tuple (h, w, c) + - 'pad_shape': image shape after padding + - 'lidar2img': transform from lidar to image + - 'depth2img': transform from depth to image + - 'cam2img': transform from camera to image + - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ + flipped horizontally + - 'pcd_vertical_flip': a boolean indicating if point cloud is \ + flipped vertically + - 'box_mode_3d': 3D box mode + - 'box_type_3d': 3D box type + - 'img_norm_cfg': a dict of normalization information: + - mean: per channel mean subtraction + - std: per channel std divisor + - to_rgb: bool indicating if bgr was converted to rgb + - 'pcd_trans': point cloud transformations + - 'sample_idx': sample index + - 'pcd_scale_factor': point cloud scale factor + - 'pcd_rotation': rotation applied to point cloud + - 'pts_filename': path to point cloud file. + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
+ Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', + 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', + 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', + 'box_type_3d', 'img_norm_cfg', 'pcd_trans', + 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') + """ + + def __init__( + self, + keys, + meta_keys=( + "filename", + "ori_shape", + "img_shape", + "lidar2img", + "depth2img", + "cam2img", + "pad_shape", + "scale_factor", + "flip", + "pcd_horizontal_flip", + "pcd_vertical_flip", + "box_mode_3d", + "box_type_3d", + "img_norm_cfg", + "pcd_trans", + "sample_idx", + "pcd_scale_factor", + "pcd_rotation", + "pts_filename", + "transformation_3d_flow", + ), + ): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + - keys in ``self.keys`` + - ``img_metas`` + """ + data = {} + img_metas = {} + for key in self.meta_keys: + if key in results: + img_metas[key] = results[key] + + data["img_metas"] = DC(img_metas, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + """str: Return a string that describes the module.""" + return self.__class__.__name__ + f"(keys={self.keys}, meta_keys={self.meta_keys})" + + +@PIPELINES.register_module() +class DefaultFormatBundle3D(DefaultFormatBundle): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields for voxels, + including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and + "gt_semantic_seg". + These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - proposals: (1)to tensor, (2)to DataContainer + - gt_bboxes: (1)to tensor, (2)to DataContainer + - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer + - gt_labels: (1)to tensor, (2)to DataContainer + """ + + def __init__(self, class_names, with_gt=True, with_label=True): + super(DefaultFormatBundle3D, self).__init__() + self.class_names = class_names + self.with_gt = with_gt + self.with_label = with_label + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. 
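+        Example:
+            A minimal sketch on a synthetic ``results`` dict with made-up
+            points and hypothetical class names:
+
+            >>> import torch
+            >>> from mmdet3d.core.points import BasePoints
+            >>> bundle = DefaultFormatBundle3D(
+            ...     class_names=['car', 'truck'], with_gt=False)
+            >>> results = bundle(
+            ...     dict(points=BasePoints(torch.rand(6, 4), points_dim=4)))
+            >>> results['points'].data.shape
+            torch.Size([6, 4])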
+ """ + # Format 3D data + if "points" in results: + assert isinstance(results["points"], BasePoints) + results["points"] = DC(results["points"].tensor) + + for key in ["voxels", "coors", "voxel_centers", "num_points"]: + if key not in results: + continue + results[key] = DC(to_tensor(results[key]), stack=False) + + if self.with_gt: + # Clean GT bboxes in the final + if "gt_bboxes_3d_mask" in results: + gt_bboxes_3d_mask = results["gt_bboxes_3d_mask"] + results["gt_bboxes_3d"] = results["gt_bboxes_3d"][gt_bboxes_3d_mask] + if "gt_names_3d" in results: + results["gt_names_3d"] = results["gt_names_3d"][gt_bboxes_3d_mask] + if "centers2d" in results: + results["centers2d"] = results["centers2d"][gt_bboxes_3d_mask] + if "depths" in results: + results["depths"] = results["depths"][gt_bboxes_3d_mask] + if "gt_bboxes_mask" in results: + gt_bboxes_mask = results["gt_bboxes_mask"] + if "gt_bboxes" in results: + results["gt_bboxes"] = results["gt_bboxes"][gt_bboxes_mask] + results["gt_names"] = results["gt_names"][gt_bboxes_mask] + if self.with_label: + if "gt_names" in results and len(results["gt_names"]) == 0: + results["gt_labels"] = np.array([], dtype=np.int64) + results["attr_labels"] = np.array([], dtype=np.int64) + elif "gt_names" in results and isinstance(results["gt_names"][0], list): + # gt_labels might be a list of list in multi-view setting + results["gt_labels"] = [ + np.array([self.class_names.index(n) for n in res], dtype=np.int64) + for res in results["gt_names"] + ] + elif "gt_names" in results: + results["gt_labels"] = np.array( + [self.class_names.index(n) for n in results["gt_names"]], dtype=np.int64 + ) + # we still assume one pipeline for one frame LiDAR + # thus, the 3D name is list[string] + if "gt_names_3d" in results: + results["gt_labels_3d"] = np.array( + [self.class_names.index(n) for n in results["gt_names_3d"]], dtype=np.int64 + ) + results = super(DefaultFormatBundle3D, self).__call__(results) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(class_names={self.class_names}, " + repr_str += f"with_gt={self.with_gt}, with_label={self.with_label})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py new file mode 100644 index 000000000..75dc9dfc2 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/loading.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmdet.datasets.builder import PIPELINES + + +@PIPELINES.register_module() +class LoadMultiViewImageFromFiles(object): + """Load multi channel images from a list of separate channel files. + + Expects results['img_filename'] to be a list of filenames. + + Args: + to_float32 (bool): Whether to convert the img to float32. + Defaults to False. + color_type (str): Color type of the file. Defaults to 'unchanged'. + """ + + def __init__(self, to_float32=False, color_type="unchanged"): + self.to_float32 = to_float32 + self.color_type = color_type + + def __call__(self, results): + """Call function to load multi-view image from files. + + Args: + results (dict): Result dict containing multi-view image filenames. + + Returns: + dict: The result dict containing the multi-view image data. 
\ + Added keys and values are described below. + + - filename (str): Multi-view image filenames. + - img (np.ndarray): Multi-view image arrays. + - img_shape (tuple[int]): Shape of multi-view image arrays. + - ori_shape (tuple[int]): Shape of original image arrays. + - pad_shape (tuple[int]): Shape of padded image arrays. + - scale_factor (float): Scale factor. + - img_norm_cfg (dict): Normalization configuration of images. + """ + + filename = results["img_filename"] + + # img is of shape (h, w, c, num_views) + img = np.stack([mmcv.imread(name, self.color_type) for name in filename], axis=-1) + if self.to_float32: + img = img.astype(np.float32) + results["filename"] = filename + # unravel to list, see `DefaultFormatBundle` in formating.py + # which will transpose each image separately and then stack into array + results["img"] = [img[..., i] for i in range(img.shape[-1])] + results["img_shape"] = img.shape + results["ori_shape"] = img.shape + # Set initial values for default meta_keys + results["pad_shape"] = img.shape + results["scale_factor"] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results["img_norm_cfg"] = dict( + mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False + ) + return results + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(to_float32={self.to_float32}, " + repr_str += f"color_type='{self.color_type}')" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py new file mode 100644 index 000000000..74fb057f4 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/datasets/pipelines/test_time_aug.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings +from copy import deepcopy + +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +from mmdet.datasets.builder import PIPELINES +from mmdet.datasets.pipelines import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug3D(object): + """Test-time augmentation with multiple scales and flipping. + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (tuple | list[tuple]: Images scales for resizing. + pts_scale_ratio (float | list[float]): Points scale ratios for + resizing. + flip (bool): Whether apply flip augmentation. Defaults to False. + flip_direction (str | list[str]): Flip augmentation directions + for images, options are "horizontal" and "vertical". + If flip_direction is list, multiple flip augmentations will + be applied. It has no effect when ``flip == False``. + Defaults to "horizontal". + pcd_horizontal_flip (bool): Whether apply horizontal flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. + pcd_vertical_flip (bool): Whether apply vertical flip augmentation + to point cloud. Defaults to True. Note that it works only when + 'flip' is turned on. 
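+    Example:
+        A minimal sketch with an empty transform list and made-up scales,
+        only to illustrate the dict-of-lists layout this wrapper produces:
+
+        >>> tta = MultiScaleFlipAug3D(
+        ...     transforms=[], img_scale=(1333, 800), pts_scale_ratio=1.0)
+        >>> out = tta(dict(sample_idx=0))
+        >>> out['scale'], out['pcd_scale_factor']
+        ([(1333, 800)], [1.0])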
+ """ + + def __init__( + self, + transforms, + img_scale, + pts_scale_ratio, + flip=False, + flip_direction="horizontal", + pcd_horizontal_flip=False, + pcd_vertical_flip=False, + ): + self.transforms = Compose(transforms) + self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] + self.pts_scale_ratio = pts_scale_ratio if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)] + + assert mmcv.is_list_of(self.img_scale, tuple) + assert mmcv.is_list_of(self.pts_scale_ratio, float) + + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + + self.flip_direction = flip_direction if isinstance(flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ["horizontal"]: + warnings.warn("flip_direction has no effect when flip is set to False") + if self.flip and not any([(t["type"] == "RandomFlip3D" or t["type"] == "RandomFlip") for t in transforms]): + warnings.warn("flip has no effect when RandomFlip is not in transforms") + + def __call__(self, results): + """Call function to augment common fields in results. + + Args: + results (dict): Result dict contains the data to augment. + + Returns: + dict: The result dict contains the data that is augmented with \ + different scales and flips. + """ + aug_data = [] + + # modified from `flip_aug = [False, True] if self.flip else [False]` + # to reduce unnecessary scenes when using double flip augmentation + # during test time + flip_aug = [True] if self.flip else [False] + pcd_horizontal_flip_aug = [False, True] if self.flip and self.pcd_horizontal_flip else [False] + pcd_vertical_flip_aug = [False, True] if self.flip and self.pcd_vertical_flip else [False] + for scale in self.img_scale: + for pts_scale_ratio in self.pts_scale_ratio: + for flip in flip_aug: + for pcd_horizontal_flip in pcd_horizontal_flip_aug: + for pcd_vertical_flip in pcd_vertical_flip_aug: + for direction in self.flip_direction: + # results.copy will cause bug + # since it is shallow copy + _results = deepcopy(results) + _results["scale"] = scale + _results["flip"] = flip + _results["pcd_scale_factor"] = pts_scale_ratio + _results["flip_direction"] = direction + _results["pcd_horizontal_flip"] = pcd_horizontal_flip + _results["pcd_vertical_flip"] = pcd_vertical_flip + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + """str: Return a string that describes the module.""" + repr_str = self.__class__.__name__ + repr_str += f"(transforms={self.transforms}, " + repr_str += f"img_scale={self.img_scale}, flip={self.flip}, " + repr_str += f"pts_scale_ratio={self.pts_scale_ratio}, " + repr_str += f"flip_direction={self.flip_direction})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py new file mode 100644 index 000000000..64f636518 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# # Copyright (c) OpenMMLab. All rights reserved. 
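+# Trimmed-down subset of ``mmdet3d.models``: it vendors only the detector base
+# classes and builder helpers that the PETR tests in this patch rely on.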
+ +from .detectors import * diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py new file mode 100644 index 000000000..a73c30306 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/builder.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +from mmdet.models.builder import BACKBONES, DETECTORS, HEADS, LOSSES, MODELS, NECKS + +VOXEL_ENCODERS = MODELS +MIDDLE_ENCODERS = MODELS +FUSION_LAYERS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss function.""" + return LOSSES.build(cfg) + + +def build_detector(cfg, train_cfg=None, test_cfg=None): + """Build detector.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn("train_cfg and test_cfg is deprecated, " "please specify them in model", UserWarning) + assert cfg.get("train_cfg") is None or train_cfg is None, "train_cfg specified in both outer field and model field " + assert cfg.get("test_cfg") is None or test_cfg is None, "test_cfg specified in both outer field and model field " + return DETECTORS.build(cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) + + +def build_model(cfg, train_cfg=None, test_cfg=None): + """A function warpper for building 3D detector according to + cfg. + + Should be deprecated in the future. + """ + return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) + + +def build_voxel_encoder(cfg): + """Build voxel encoder.""" + return VOXEL_ENCODERS.build(cfg) + + +def build_middle_encoder(cfg): + """Build middle level encoder.""" + return MIDDLE_ENCODERS.build(cfg) + + +def build_fusion_layer(cfg): + """Build fusion layer.""" + return FUSION_LAYERS.build(cfg) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py new file mode 100644 index 000000000..5d1d0befa --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from .base import Base3DDetector diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py new file mode 100644 index 000000000..7ab88bea1 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/base.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import auto_fp16 +from mmdet.models.detectors.base import BaseDetector + + +class Base3DDetector(BaseDetector): + """Base class for detectors.""" + + def forward_test(self, points, img_metas, img=None, **kwargs): + """ + Args: + points (list[torch.Tensor]): the outer list indicates test-time + augmentations and inner torch.Tensor should have a shape NxC, + which contains all points in the batch. + img_metas (list[list[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) 
and the inner list indicates + images in a batch + img (list[torch.Tensor], optional): the outer + list indicates test-time augmentations and inner + torch.Tensor should have a shape NxCxHxW, which contains + all images in the batch. Defaults to None. + """ + for var, name in [(points, "points"), (img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError("{} must be a list, but got {}".format(name, type(var))) + + num_augs = len(points) + if num_augs != len(img_metas): + raise ValueError("num of augmentations ({}) != num of image meta ({})".format(len(points), len(img_metas))) + + if num_augs == 1: + img = [img] if img is None else img + return self.simple_test(points[0], img_metas[0], img[0], **kwargs) + else: + return self.aug_test(points, img_metas, img, **kwargs) + + @auto_fp16(apply_to=("img", "points")) + def forward(self, **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. + + Note this setting will change the expected inputs. When + `return_loss=True`, img and img_metas are single-nested (i.e. + torch.Tensor and list[dict]), and when `resturn_loss=False`, img and + img_metas should be double nested (i.e. list[torch.Tensor], + list[list[dict]]), with the outer list indicating test time + augmentations. + """ + + return self.forward_test(**kwargs) diff --git a/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py new file mode 100644 index 000000000..dcb498542 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/mmdet3d/models/detectors/mvx_two_stage.py @@ -0,0 +1,418 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import warnings + +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmcv.runner import force_fp32 +from mmdet.models import DETECTORS +from torch.nn import functional as F + +from .. 
import builder +from .base import Base3DDetector + + +@DETECTORS.register_module() +class MVXTwoStageDetector(Base3DDetector): + """Base class of Multi-modality VoxelNet.""" + + def __init__( + self, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + init_cfg=None, + ): + super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) + + if pts_voxel_layer: + self.pts_voxel_layer = Voxelization(**pts_voxel_layer) + if pts_voxel_encoder: + self.pts_voxel_encoder = builder.build_voxel_encoder(pts_voxel_encoder) + if pts_middle_encoder: + self.pts_middle_encoder = builder.build_middle_encoder(pts_middle_encoder) + if pts_backbone: + self.pts_backbone = builder.build_backbone(pts_backbone) + if pts_fusion_layer: + self.pts_fusion_layer = builder.build_fusion_layer(pts_fusion_layer) + if pts_neck is not None: + self.pts_neck = builder.build_neck(pts_neck) + if pts_bbox_head: + pts_train_cfg = train_cfg.pts if train_cfg else None + pts_bbox_head.update(train_cfg=pts_train_cfg) + pts_test_cfg = test_cfg.pts if test_cfg else None + pts_bbox_head.update(test_cfg=pts_test_cfg) + self.pts_bbox_head = builder.build_head(pts_bbox_head) + + if img_backbone: + self.img_backbone = builder.build_backbone(img_backbone) + if img_neck is not None: + self.img_neck = builder.build_neck(img_neck) + if img_rpn_head is not None: + self.img_rpn_head = builder.build_head(img_rpn_head) + if img_roi_head is not None: + self.img_roi_head = builder.build_head(img_roi_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + if pretrained is None: + img_pretrained = None + pts_pretrained = None + elif isinstance(pretrained, dict): + img_pretrained = pretrained.get("img", None) + pts_pretrained = pretrained.get("pts", None) + else: + raise ValueError(f"pretrained should be a dict, got {type(pretrained)}") + + if self.with_img_backbone: + if img_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.img_backbone.init_cfg = dict(type="Pretrained", checkpoint=img_pretrained) + if self.with_img_roi_head: + if img_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.img_roi_head.init_cfg = dict(type="Pretrained", checkpoint=img_pretrained) + + if self.with_pts_backbone: + if pts_pretrained is not None: + warnings.warn( + "DeprecationWarning: pretrained is a deprecated \ + key, please consider using init_cfg" + ) + self.pts_backbone.init_cfg = dict(type="Pretrained", checkpoint=pts_pretrained) + + @property + def with_img_shared_head(self): + """bool: Whether the detector has a shared head in image branch.""" + return hasattr(self, "img_shared_head") and self.img_shared_head is not None + + @property + def with_pts_bbox(self): + """bool: Whether the detector has a 3D box head.""" + return hasattr(self, "pts_bbox_head") and self.pts_bbox_head is not None + + @property + def with_img_bbox(self): + """bool: Whether the detector has a 2D image box head.""" + return hasattr(self, "img_bbox_head") and self.img_bbox_head is not None + + @property + def with_img_backbone(self): + """bool: Whether the detector has a 2D image backbone.""" + return hasattr(self, "img_backbone") and self.img_backbone is not None + 
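+    # Each of the ``with_*`` capability properties reports whether the
+    # corresponding optional sub-module was built from the config. The PETR
+    # variants in this patch only build the image branch (img_backbone and
+    # img_neck) together with pts_bbox_head.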
+ @property + def with_pts_backbone(self): + """bool: Whether the detector has a 3D backbone.""" + return hasattr(self, "pts_backbone") and self.pts_backbone is not None + + @property + def with_fusion(self): + """bool: Whether the detector has a fusion layer.""" + return hasattr(self, "pts_fusion_layer") and self.fusion_layer is not None + + @property + def with_img_neck(self): + """bool: Whether the detector has a neck in image branch.""" + return hasattr(self, "img_neck") and self.img_neck is not None + + @property + def with_pts_neck(self): + """bool: Whether the detector has a neck in 3D detector branch.""" + return hasattr(self, "pts_neck") and self.pts_neck is not None + + @property + def with_img_rpn(self): + """bool: Whether the detector has a 2D RPN in image detector branch.""" + return hasattr(self, "img_rpn_head") and self.img_rpn_head is not None + + @property + def with_img_roi_head(self): + """bool: Whether the detector has a RoI Head in image branch.""" + return hasattr(self, "img_roi_head") and self.img_roi_head is not None + + @property + def with_voxel_encoder(self): + """bool: Whether the detector has a voxel encoder.""" + return hasattr(self, "voxel_encoder") and self.voxel_encoder is not None + + @property + def with_middle_encoder(self): + """bool: Whether the detector has a middle encoder.""" + return hasattr(self, "middle_encoder") and self.middle_encoder is not None + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if self.with_img_backbone and img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + + if img.dim() == 5 and img.size(0) == 1: + img.squeeze_() + elif img.dim() == 5 and img.size(0) > 1: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + img_feats = self.img_backbone(img) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + return img_feats + + def extract_pts_feat(self, pts, img_feats, img_metas): + """Extract features of points.""" + if not self.with_pts_bbox: + return None + voxels, num_points, coors = self.voxelize(pts) + voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, img_feats, img_metas) + batch_size = coors[-1, 0] + 1 + x = self.pts_middle_encoder(voxel_features, coors, batch_size) + x = self.pts_backbone(x) + if self.with_pts_neck: + x = self.pts_neck(x) + return x + + def extract_feat(self, points, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + pts_feats = self.extract_pts_feat(points, img_feats, img_metas) + return (img_feats, pts_feats) + + @torch.no_grad() + @force_fp32() + def voxelize(self, points): + """Apply dynamic voxelization to points. + + Args: + points (list[torch.Tensor]): Points of each sample. + + Returns: + tuple[torch.Tensor]: Concatenated points, number of points + per voxel, and coordinates. 
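+
+        Note:
+            Each returned coordinate row is prefixed with its sample index in
+            the batch (via ``F.pad(coor, (1, 0), mode="constant", value=i)``
+            below), so voxels from different samples can be concatenated into
+            a single tensor.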
+ """ + voxels, coors, num_points = [], [], [] + for res in points: + res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) + voxels.append(res_voxels) + coors.append(res_coors) + num_points.append(res_num_points) + voxels = torch.cat(voxels, dim=0) + num_points = torch.cat(num_points, dim=0) + coors_batch = [] + for i, coor in enumerate(coors): + coor_pad = F.pad(coor, (1, 0), mode="constant", value=i) + coors_batch.append(coor_pad) + coors_batch = torch.cat(coors_batch, dim=0) + return voxels, num_points, coors_batch + + def forward_train( + self, + points=None, + img_metas=None, + gt_bboxes_3d=None, + gt_labels_3d=None, + gt_labels=None, + gt_bboxes=None, + img=None, + proposals=None, + gt_bboxes_ignore=None, + ): + """Forward training function. + + Args: + points (list[torch.Tensor], optional): Points of each sample. + Defaults to None. + img_metas (list[dict], optional): Meta information of each sample. + Defaults to None. + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): + Ground truth 3D boxes. Defaults to None. + gt_labels_3d (list[torch.Tensor], optional): Ground truth labels + of 3D boxes. Defaults to None. + gt_labels (list[torch.Tensor], optional): Ground truth labels + of 2D boxes in images. Defaults to None. + gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in + images. Defaults to None. + img (torch.Tensor optional): Images of each sample with shape + (N, C, H, W). Defaults to None. + proposals ([list[torch.Tensor], optional): Predicted proposals + used for training Fast RCNN. Defaults to None. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + 2D boxes in images to be ignored. Defaults to None. + + Returns: + dict: Losses of different branches. + """ + img_feats, pts_feats = self.extract_feat(points, img=img, img_metas=img_metas) + losses = dict() + if pts_feats: + losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) + losses.update(losses_pts) + if img_feats: + losses_img = self.forward_img_train( + img_feats, + img_metas=img_metas, + gt_bboxes=gt_bboxes, + gt_labels=gt_labels, + gt_bboxes_ignore=gt_bboxes_ignore, + proposals=proposals, + ) + losses.update(losses_img) + return losses + + def forward_pts_train(self, pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore=None): + """Forward function for point cloud branch. + + Args: + pts_feats (list[torch.Tensor]): Features of point cloud branch + gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth + boxes for each sample. + gt_labels_3d (list[torch.Tensor]): Ground truth labels for + boxes of each sampole + img_metas (list[dict]): Meta information of samples. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + + Returns: + dict: Losses of each branch. + """ + outs = self.pts_bbox_head(pts_feats) + loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) + losses = self.pts_bbox_head.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + return losses + + def forward_img_train(self, x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, proposals=None, **kwargs): + """Forward function for image branch. + + This function works similar to the forward function of Faster R-CNN. + + Args: + x (list[torch.Tensor]): Image features of shape (B, C, H, W) + of multiple levels. + img_metas (list[dict]): Meta information of images. + gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image + sample. 
+ gt_labels (list[torch.Tensor]): Ground truth labels of boxes. + gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth + boxes to be ignored. Defaults to None. + proposals (list[torch.Tensor], optional): Proposals of each sample. + Defaults to None. + + Returns: + dict: Losses of each branch. + """ + losses = dict() + # RPN forward and loss + if self.with_img_rpn: + rpn_outs = self.img_rpn_head(x) + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.img_rpn) + rpn_losses = self.img_rpn_head.loss(*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) + losses.update(rpn_losses) + + proposal_cfg = self.train_cfg.get("img_rpn_proposal", self.test_cfg.img_rpn) + proposal_inputs = rpn_outs + (img_metas, proposal_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + else: + proposal_list = proposals + + # bbox head forward and loss + if self.with_img_bbox: + # bbox head forward and loss + img_roi_losses = self.img_roi_head.forward_train( + x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs + ) + losses.update(img_roi_losses) + + return losses + + def simple_test_img(self, x, img_metas, proposals=None, rescale=False): + """Test without augmentation.""" + if proposals is None: + proposal_list = self.simple_test_rpn(x, img_metas, self.test_cfg.img_rpn) + else: + proposal_list = proposals + + return self.img_roi_head.simple_test(x, proposal_list, img_metas, rescale=rescale) + + def simple_test_rpn(self, x, img_metas, rpn_test_cfg): + """RPN test function.""" + rpn_outs = self.img_rpn_head(x) + proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) + proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) + return proposal_list + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes(*outs, img_metas, rescale=rescale) + bbox_results = [bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list] + return bbox_results + + def simple_test(self, points, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats, pts_feats = self.extract_feat(points, img=img, img_metas=img_metas) + + bbox_list = [dict() for i in range(len(img_metas))] + if pts_feats and self.with_pts_bbox: + bbox_pts = self.simple_test_pts(pts_feats, img_metas, rescale=rescale) + for result_dict, pts_bbox in zip(bbox_list, bbox_pts): + result_dict["pts_bbox"] = pts_bbox + if img_feats and self.with_img_bbox: + bbox_img = self.simple_test_img(img_feats, img_metas, rescale=rescale) + for result_dict, img_bbox in zip(bbox_list, bbox_img): + result_dict["img_bbox"] = img_bbox + return bbox_list + + def aug_test(self, points, img_metas, imgs=None, rescale=False): + """Test function with augmentaiton.""" + img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) + + bbox_list = dict() + if pts_feats and self.with_pts_bbox: + bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) + bbox_list.update(pts_bbox=bbox_pts) + return [bbox_list] + + def extract_feats(self, points, img_metas, imgs=None): + """Extract point and image features of multiple samples.""" + if imgs is None: + imgs = [None] * len(img_metas) + img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, img_metas) + return img_feats, pts_feats + + def aug_test_pts(self, feats, img_metas, rescale=False): + """Test function of point cloud branch with augmentaiton.""" + # only support aug_test for one sample + 
aug_bboxes = [] + for x, img_meta in zip(feats, img_metas): + outs = self.pts_bbox_head(x) + bbox_list = self.pts_bbox_head.get_bboxes(*outs, img_meta, rescale=rescale) + bbox_list = [ + dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list + ] + aug_bboxes.append(bbox_list[0]) + + # after merging, bboxes will be rescaled to the original image size + merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.pts_bbox_head.test_cfg) + return merged_bboxes diff --git a/forge/test/models/pytorch/vision/petr/test_petr.py b/forge/test/models/pytorch/vision/petr/test_petr.py new file mode 100644 index 000000000..87f0c3372 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/test_petr.py @@ -0,0 +1,173 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import sys + +import torch + +sys.path.append("forge/test/models/pytorch/vision/petr") +import pytest +from mmcv.parallel import MMDataParallel +from mmdet3d.models.builder import build_model + +import forge +from forge.verify.verify import verify + +# Import necessary classes for model registration, ensuring availability even if not used directly +from utils import model_registry +from utils.utils import load_config, prepare_model_inputs + +from test.models.utils import Framework, Source, Task, build_module_name + + +class petr_wrapper(torch.nn.Module): + def __init__( + self, + model, + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + ): + super().__init__() + self.model = model + + self.filename = filename + self.ori_shape = ori_shape + self.img_shape = img_shape + self.pad_shape = pad_shape + self.scale_factor = scale_factor + self.flip = flip + self.pcd_horizontal_flip = pcd_horizontal_flip + self.pcd_vertical_flip = pcd_vertical_flip + self.box_mode_3d = box_mode_3d + self.box_type_3d = box_type_3d + self.to_rgb = to_rgb + self.sample_idx = sample_idx + self.pcd_scale_factor = pcd_scale_factor + self.pts_filename = pts_filename + + def forward(self, l0, l1, l2, l3, l4, l5, img, mean, std, masks): + + l0 = l0.squeeze(0) + l1 = l1.squeeze(0) + l2 = l2.squeeze(0) + l3 = l3.squeeze(0) + l4 = l4.squeeze(0) + l5 = l5.squeeze(0) + img = img.squeeze(0) + mean = mean.squeeze(0) + std = std.squeeze(0) + masks = masks.squeeze(0) + + data = { + "img_metas": [ + [ + { + "filename": self.filename, + "ori_shape": self.ori_shape, + "img_shape": self.img_shape, + "lidar2img": [l0, l1, l2, l3, l4, l5], + "pad_shape": self.pad_shape, + "scale_factor": self.scale_factor, + "flip": self.flip, + "pcd_horizontal_flip": self.pcd_horizontal_flip, + "pcd_vertical_flip": self.pcd_vertical_flip, + "box_mode_3d": self.box_mode_3d, + "box_type_3d": self.box_type_3d, + "img_norm_cfg": {"mean": mean, "std": std, "to_rgb": self.to_rgb}, + "sample_idx": self.sample_idx, + "pcd_scale_factor": self.pcd_scale_factor, + "pts_filename": self.pts_filename, + "masks": masks, + } + ] + ], + "img": [img], + } + + output = self.model(**data) + return (output["all_cls_scores"], output["all_bbox_preds"]) + + +variants = ["vovnet_gridmask_p4_800x320", "vovnet_gridmask_p4_1600x640"] + + +@pytest.mark.parametrize("variant", variants) +def test_petr(record_forge_property, variant): + + # Build Module Name + module_name = build_module_name( + framework=Framework.PYTORCH, model="petr", source=Source.GITHUB, task=Task.OBJECT_DETECTION, 
variant=variant + ) + + # Record Forge Property + record_forge_property("model_name", module_name) + + _ = model_registry # Prevents removal by linters/formatters + + # Load config + cfg = load_config(variant) + + # Prepare input + ( + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + inputs, + ) = prepare_model_inputs(cfg) + + # Load Model + model = build_model(cfg.model, test_cfg=cfg.get("test_cfg"), train_cfg=cfg.get("train_cfg")) + model = MMDataParallel(model, device_ids=[0]) + model.eval() + + for param in model.parameters(): + param.requires_grad = False + + framework_model = petr_wrapper( + model, + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + ) + framework_model.eval() + + # Forge compile framework model + compiled_model = forge.compile(framework_model, sample_inputs=inputs, module_name=module_name) + + # Model Verification + verify(inputs, framework_model, compiled_model) diff --git a/forge/test/models/pytorch/vision/petr/utils/__init__.py b/forge/test/models/pytorch/vision/petr/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py b/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py new file mode 100644 index 000000000..8ebae7e5d --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/cp_fpn.py @@ -0,0 +1,210 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from mmdetection (https://github.com/open-mmlab/mmdetection) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ + +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmcv.runner import BaseModule, auto_fp16 +from mmdet.models import NECKS + + +####This FPN remove the unused parameters which can used with checkpoint (with_cp = True in Backbone) +@NECKS.register_module() +class CPFPN(BaseModule): + r"""Feature Pyramid Network. + + This is an implementation of paper `Feature Pyramid Networks for Object + Detection `_. + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, it is equivalent to `add_extra_convs='on_input'`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. 
+ relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + init_cfg (dict or list[dict], optional): Initialization config dict. + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__( + self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode="nearest"), + init_cfg=dict(type="Xavier", layer="Conv2d", distribution="uniform"), + ): + super(CPFPN, self).__init__(init_cfg) + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ("on_input", "on_lateral", "on_output") + elif add_extra_convs: # True + self.add_extra_convs = "on_input" + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False, + ) + self.lateral_convs.append(l_conv) + if i == 0: + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False, + ) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == "on_input": + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + 
stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False, + ) + self.fpn_convs.append(extra_fpn_conv) + + @auto_fp16() + def forward(self, inputs): + """Forward function.""" + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs)] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if "scale_factor" in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate(laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [self.fpn_convs[i](laterals[i]) if i == 0 else laterals[i] for i in range(used_backbone_levels)] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == "on_input": + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == "on_lateral": + extra_source = laterals[-1] + elif self.add_extra_convs == "on_output": + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/forge/test/models/pytorch/vision/petr/utils/grid_mask.py b/forge/test/models/pytorch/vision/petr/utils/grid_mask.py new file mode 100644 index 000000000..c8fa6db4b --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/grid_mask.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import torch +import torch.nn as nn +from PIL import Image + + +class GridMask(nn.Module): + def __init__(self, use_h, use_w, rotate=1, offset=False, ratio=0.5, mode=0, prob=1.0): + super(GridMask, self).__init__() + self.use_h = use_h + self.use_w = use_w + self.rotate = rotate + self.offset = offset + self.ratio = ratio + self.mode = mode + self.st_prob = prob + self.prob = prob + + def forward(self, x): + if np.random.rand() > self.prob or not self.training: + return x + n, c, h, w = x.size() + x = x.view(-1, h, w) + hh = int(1.5 * h) + ww = int(1.5 * w) + d = np.random.randint(2, h) + self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) + mask = np.ones((hh, ww), np.float32) + st_h = np.random.randint(d) + st_w = np.random.randint(d) + if self.use_h: + for i in range(hh // d): + s = d * i + st_h + t = min(s + self.l, hh) + mask[s:t, :] *= 0 + if self.use_w: + for i in range(ww // d): + s = d * i + st_w + t = min(s + self.l, ww) + mask[:, s:t] *= 0 + + r = np.random.randint(self.rotate) + mask = Image.fromarray(np.uint8(mask)) + mask = mask.rotate(r) + mask = np.asarray(mask) + mask = mask[(hh - h) // 2 : (hh - h) // 2 + h, (ww - w) // 2 : (ww - w) // 2 + w] + + mask = 
torch.from_numpy(mask).float().cuda() + if self.mode == 1: + mask = 1 - mask + mask = mask.expand_as(x) + if self.offset: + offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float().cuda() + x = x * mask + offset * (1 - mask) + else: + x = x * mask + + return x.view(n, c, h, w) diff --git a/forge/test/models/pytorch/vision/petr/utils/match_cost.py b/forge/test/models/pytorch/vision/petr/utils/match_cost.py new file mode 100644 index 000000000..dca25f395 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/match_cost.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from mmdet.core.bbox.match_costs.builder import MATCH_COST + + +@MATCH_COST.register_module() +class BBox3DL1Cost(object): + """BBox3DL1Cost. + Args: + weight (int | float, optional): loss_weight + """ + + def __init__(self, weight=1.0): + self.weight = weight + + def __call__(self, bbox_pred, gt_bboxes): + """ + Args: + bbox_pred (Tensor): Predicted boxes with normalized coordinates + (cx, cy, w, h), which are all in range [0, 1]. Shape + [num_query, 4]. + gt_bboxes (Tensor): Ground truth boxes with normalized + coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. + Returns: + torch.Tensor: bbox_cost value with weight + """ + bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) + return bbox_cost * self.weight diff --git a/forge/test/models/pytorch/vision/petr/utils/model_registry.py b/forge/test/models/pytorch/vision/petr/utils/model_registry.py new file mode 100644 index 000000000..56cb667ed --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/model_registry.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +from mmdet3d.datasets.pipelines.formating import DefaultFormatBundle3D +from mmdet3d.datasets.pipelines.test_time_aug import MultiScaleFlipAug3D +from mmdet.core.bbox.coder import distance_point_bbox_coder +from mmdet.models.losses import focal_loss, iou_loss +from mmdet.models.losses.smooth_l1_loss import L1Loss +from utils.cp_fpn import CPFPN +from utils.grid_mask import GridMask +from utils.match_cost import BBox3DL1Cost +from utils.nms_free_coder import NMSFreeCoder +from utils.nuscenes_dataset import CustomNuScenesDataset +from utils.petr3d import Petr3D +from utils.petr_head import PETRHead +from utils.petr_transformer import PETRTransformer +from utils.positional_encoding import SinePositionalEncoding3D +from utils.transform_3d import ResizeCropFlipImage +from utils.vovnetcp import VoVNetCP + +__all__ = [ + "Petr3D", + "PETRHead", + "BBox3DL1Cost", + "focal_loss", + "iou_loss", + "L1Loss", + "distance_point_bbox_coder", + "SinePositionalEncoding3D", + "PETRTransformer", + "NMSFreeCoder", + "GridMask", + "CustomNuScenesDataset", + "ResizeCropFlipImage", + "MultiScaleFlipAug3D", + "DefaultFormatBundle3D", + "CPFPN", + "VoVNetCP", +] diff --git a/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py b/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py new file mode 100644 index 000000000..1a4f30544 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/nms_free_coder.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2021 megvii-model. All Rights Reserved. 
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+
+from mmdet.core.bbox import BaseBBoxCoder
+from mmdet.core.bbox.builder import BBOX_CODERS
+
+
+@BBOX_CODERS.register_module()
+class NMSFreeCoder(BaseBBoxCoder):
+    """Bbox coder for NMS-free detector.
+    Args:
+        pc_range (list[float]): Range of point cloud.
+        post_center_range (list[float]): Limit of the center.
+            Default: None.
+        max_num (int): Max number to be kept. Default: 100.
+        score_threshold (float): Threshold to filter boxes based on score.
+            Default: None.
+        code_size (int): Code size of bboxes. Default: 9.
+    """
+
+    def __init__(
+        self, pc_range, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, num_classes=10
+    ):
+
+        self.pc_range = pc_range
+        self.voxel_size = voxel_size
+        self.post_center_range = post_center_range
+        self.max_num = max_num
+        self.score_threshold = score_threshold
+        self.num_classes = num_classes
diff --git a/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py b/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py
new file mode 100644
index 000000000..bbb32d222
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/utils/nuscenes_dataset.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d)
+# Copyright (c) OpenMMLab. All rights reserved.
+# ------------------------------------------------------------------------
+import numpy as np
+from mmdet3d.datasets import NuScenesDataset
+from mmdet.datasets import DATASETS
+
+
+@DATASETS.register_module()
+class CustomNuScenesDataset(NuScenesDataset):
+    r"""NuScenes Dataset.
+    This dataset only adds camera intrinsics and extrinsics to the results.
+    """
+
+    def get_data_info(self, index):
+        """Get data info according to the given index.
+        Args:
+            index (int): Index of the sample data to get.
+        Returns:
+            dict: Data information that will be passed to the data \
+                preprocessing pipelines. It includes the following keys:
+
+                - sample_idx (str): Sample index.
+                - pts_filename (str): Filename of point clouds.
+                - sweeps (list[dict]): Infos of sweeps.
+                - timestamp (float): Sample timestamp.
+                - img_filename (str, optional): Image filename.
+                - lidar2img (list[np.ndarray], optional): Transformations \
+                    from lidar to different cameras.
+                - ann_info (dict): Annotation info.
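+
+        Note:
+            ``lidar2img`` is composed below as ``viewpad @ lidar2cam_rt.T``,
+            where ``viewpad`` is the camera intrinsic matrix padded to 4x4 and
+            ``lidar2cam_rt`` is assembled from the inverse of the
+            ``sensor2lidar`` rotation and translation stored in the info file.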
+ """ + info = self.data_infos[index] + # standard protocal modified from SECOND.Pytorch + input_dict = dict( + sample_idx=info["token"], + pts_filename=info["lidar_path"], + sweeps=info["sweeps"], + timestamp=info["timestamp"] / 1e6, + ) + + if self.modality["use_camera"]: + image_paths = [] + lidar2img_rts = [] + intrinsics = [] + extrinsics = [] + img_timestamp = [] + for cam_type, cam_info in info["cams"].items(): + img_timestamp.append(cam_info["timestamp"] / 1e6) + image_paths.append(cam_info["data_path"]) + # obtain lidar to image transformation matrix + lidar2cam_r = np.linalg.inv(cam_info["sensor2lidar_rotation"]) + lidar2cam_t = cam_info["sensor2lidar_translation"] @ lidar2cam_r.T + lidar2cam_rt = np.eye(4) + lidar2cam_rt[:3, :3] = lidar2cam_r.T + lidar2cam_rt[3, :3] = -lidar2cam_t + intrinsic = cam_info["cam_intrinsic"] + viewpad = np.eye(4) + viewpad[: intrinsic.shape[0], : intrinsic.shape[1]] = intrinsic + lidar2img_rt = viewpad @ lidar2cam_rt.T + intrinsics.append(viewpad) + extrinsics.append( + lidar2cam_rt + ) ###The extrinsics mean the tranformation from lidar to camera. If anyone want to use the extrinsics as sensor to lidar, please use np.linalg.inv(lidar2cam_rt.T) and modify the ResizeCropFlipImage and LoadMultiViewImageFromMultiSweepsFiles. + lidar2img_rts.append(lidar2img_rt) + + input_dict.update( + dict( + img_timestamp=img_timestamp, + img_filename=image_paths, + lidar2img=lidar2img_rts, + intrinsics=intrinsics, + extrinsics=extrinsics, + ) + ) + + if not self.test_mode: + annos = self.get_ann_info(index) + input_dict["ann_info"] = annos + return input_dict diff --git a/forge/test/models/pytorch/vision/petr/utils/petr3d.py b/forge/test/models/pytorch/vision/petr/utils/petr3d.py new file mode 100644 index 000000000..9e313af10 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr3d.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. 
+# ------------------------------------------------------------------------ + +import torch +from mmcv.runner import auto_fp16, force_fp32 +from mmdet3d.models.detectors.mvx_two_stage import MVXTwoStageDetector +from mmdet.models.builder import DETECTORS +from utils.grid_mask import GridMask + + +@DETECTORS.register_module() +class Petr3D(MVXTwoStageDetector): + """Petr3D.""" + + def __init__( + self, + use_grid_mask=False, + pts_voxel_layer=None, + pts_voxel_encoder=None, + pts_middle_encoder=None, + pts_fusion_layer=None, + img_backbone=None, + pts_backbone=None, + img_neck=None, + pts_neck=None, + pts_bbox_head=None, + img_roi_head=None, + img_rpn_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + ): + super(Petr3D, self).__init__( + pts_voxel_layer, + pts_voxel_encoder, + pts_middle_encoder, + pts_fusion_layer, + img_backbone, + pts_backbone, + img_neck, + pts_neck, + pts_bbox_head, + img_roi_head, + img_rpn_head, + train_cfg, + test_cfg, + pretrained, + ) + self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) + self.use_grid_mask = use_grid_mask + + def extract_img_feat(self, img, img_metas): + """Extract features of images.""" + if isinstance(img, list): + img = torch.stack(img, dim=0) + + B = img.size(0) + if img is not None: + input_shape = img.shape[-2:] + # update real input shape of each single img + for img_meta in img_metas: + img_meta.update(input_shape=input_shape) + if img.dim() == 5: + if img.size(0) == 1 and img.size(1) != 1: + img.squeeze_() + else: + B, N, C, H, W = img.size() + img = img.view(B * N, C, H, W) + if self.use_grid_mask: + img = self.grid_mask(img) + + img_feats = self.img_backbone(img) + if isinstance(img_feats, dict): + img_feats = list(img_feats.values()) + else: + return None + if self.with_img_neck: + img_feats = self.img_neck(img_feats) + img_feats_reshaped = [] + for img_feat in img_feats: + BN, C, H, W = img_feat.size() + img_feats_reshaped.append(img_feat.view(B, int(BN / B), C, H, W)) + return img_feats_reshaped + + @auto_fp16(apply_to=("img"), out_fp32=True) + def extract_feat(self, img, img_metas): + """Extract features from images and points.""" + img_feats = self.extract_img_feat(img, img_metas) + return img_feats + + @force_fp32(apply_to=("img", "points")) + def forward(self, **kwargs): + return self.forward_test(**kwargs) + + def forward_test(self, img_metas, img=None, **kwargs): + for var, name in [(img_metas, "img_metas")]: + if not isinstance(var, list): + raise TypeError("{} must be a list, but got {}".format(name, type(var))) + img = [img] if img is None else img + return self.simple_test(img_metas[0], img[0], **kwargs) + + def simple_test_pts(self, x, img_metas, rescale=False): + """Test function of point cloud branch.""" + outs = self.pts_bbox_head(x, img_metas) + return outs + + def simple_test(self, img_metas, img=None, rescale=False): + """Test function without augmentaiton.""" + img_feats = self.extract_feat(img=img, img_metas=img_metas) + + bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) + + return bbox_pts diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_head.py b/forge/test/models/pytorch/vision/petr/utils/petr_head.py new file mode 100644 index 000000000..4a2398214 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_head.py @@ -0,0 +1,527 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import math + +import numpy as np + +# 
------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import Conv2d, Linear, bias_init_with_prob +from mmcv.cnn.bricks.transformer import build_positional_encoding +from mmcv.runner import force_fp32 +from mmdet3d.core.bbox.coders import build_bbox_coder +from mmdet.models import HEADS, build_loss +from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead +from mmdet.models.utils import build_transformer +from mmdet.models.utils.transformer import inverse_sigmoid + + +def pos2posemb3d(pos, num_pos_feats=128, temperature=10000): + scale = 2 * math.pi + pos = pos * scale + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) + dim_t = temperature ** (2 * (dim_t // 2) / num_pos_feats) + pos_x = pos[..., 0, None] / dim_t + pos_y = pos[..., 1, None] / dim_t + pos_z = pos[..., 2, None] / dim_t + pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) + pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) + pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) + posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1) + return posemb + + +@HEADS.register_module() +class PETRHead(AnchorFreeHead): + """Implements the DETR transformer head. + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + num_classes (int): Number of categories excluding the background. + in_channels (int): Number of channels in the input feature map. + num_query (int): Number of query in Transformer. + num_reg_fcs (int, optional): Number of fully-connected layers used in + `FFN`, which is then used for the regression head. Default 2. + transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. + Default: None. + sync_cls_avg_factor (bool): Whether to sync the avg_factor of + all ranks. Default to False. + positional_encoding (obj:`mmcv.ConfigDict`|dict): + Config for position encoding. + loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the + classification loss. Default `CrossEntropyLoss`. + loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the + regression loss. Default `L1Loss`. + loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the + regression iou loss. Default `GIoULoss`. + tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of + transformer head. + test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of + transformer head. + init_cfg (dict or list[dict], optional): Initialization config dict. 
+ Default: None + """ + + _version = 2 + + def __init__( + self, + num_classes, + in_channels, + num_query=100, + num_reg_fcs=2, + transformer=None, + sync_cls_avg_factor=False, + positional_encoding=dict(type="SinePositionalEncoding", num_feats=128, normalize=True), + code_weights=None, + bbox_coder=None, + loss_cls=dict(type="CrossEntropyLoss", bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), + loss_bbox=dict(type="L1Loss", loss_weight=5.0), + loss_iou=dict(type="GIoULoss", loss_weight=2.0), + train_cfg=dict( + assigner=dict( + type="HungarianAssigner", + cls_cost=dict(type="ClassificationCost", weight=1.0), + reg_cost=dict(type="BBoxL1Cost", weight=5.0), + iou_cost=dict(type="IoUCost", iou_mode="giou", weight=2.0), + ) + ), + test_cfg=dict(max_per_img=100), + with_position=True, + with_multiview=False, + depth_step=0.8, + depth_num=64, + LID=False, + depth_start=1, + position_range=[-65, -65, -8.0, 65, 65, 8.0], + init_cfg=None, + normedlinear=False, + **kwargs, + ): + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since it brings inconvenience when the initialization of + # `AnchorFreeHead` is called. + if "code_size" in kwargs: + self.code_size = kwargs["code_size"] + else: + self.code_size = 10 + if code_weights is not None: + self.code_weights = code_weights + else: + self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] + self.code_weights = self.code_weights[: self.code_size] + self.bg_cls_weight = 0 + self.sync_cls_avg_factor = sync_cls_avg_factor + class_weight = loss_cls.get("class_weight", None) + if class_weight is not None and (self.__class__ is PETRHead): + assert isinstance(class_weight, float), ( + "Expected " "class_weight to have type float. Found " f"{type(class_weight)}." + ) + # NOTE following the official DETR rep0, bg_cls_weight means + # relative classification weight of the no-object class. + bg_cls_weight = loss_cls.get("bg_cls_weight", class_weight) + assert isinstance(bg_cls_weight, float), ( + "Expected " "bg_cls_weight to have type float. Found " f"{type(bg_cls_weight)}." + ) + class_weight = torch.ones(num_classes + 1) * class_weight + # set background class as the last indice + class_weight[num_classes] = bg_cls_weight + loss_cls.update({"class_weight": class_weight}) + if "bg_cls_weight" in loss_cls: + loss_cls.pop("bg_cls_weight") + self.bg_cls_weight = bg_cls_weight + + self.num_query = num_query + self.num_classes = num_classes + self.in_channels = in_channels + self.num_reg_fcs = num_reg_fcs + self.train_cfg = train_cfg + self.test_cfg = test_cfg + self.fp16_enabled = False + self.embed_dims = 256 + self.depth_step = depth_step + self.depth_num = depth_num + self.position_dim = 3 * self.depth_num + self.position_range = position_range + self.LID = LID + self.depth_start = depth_start + self.position_level = 0 + self.with_position = with_position + self.with_multiview = with_multiview + assert "num_feats" in positional_encoding + num_feats = positional_encoding["num_feats"] + assert num_feats * 2 == self.embed_dims, ( + "embed_dims should" f" be exactly 2 times of num_feats. Found {self.embed_dims}" f" and {num_feats}." 
+ ) + self.act_cfg = transformer.get("act_cfg", dict(type="ReLU", inplace=True)) + self.num_pred = 6 + self.normedlinear = normedlinear + super(PETRHead, self).__init__(num_classes, in_channels, init_cfg=init_cfg) + + self.loss_cls = build_loss(loss_cls) + self.loss_bbox = build_loss(loss_bbox) + self.loss_iou = build_loss(loss_iou) + + if self.loss_cls.use_sigmoid: + self.cls_out_channels = num_classes + else: + self.cls_out_channels = num_classes + 1 + self.positional_encoding = build_positional_encoding(positional_encoding) + self.transformer = build_transformer(transformer) + + self.code_weights = nn.Parameter(torch.tensor(self.code_weights, requires_grad=False), requires_grad=False) + self.bbox_coder = build_bbox_coder(bbox_coder) + self.pc_range = self.bbox_coder.pc_range + self._init_layers() + + def _init_layers(self): + """Initialize layers of the transformer head.""" + if self.with_position: + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + else: + self.input_proj = Conv2d(self.in_channels, self.embed_dims, kernel_size=1) + + cls_branch = [] + for _ in range(self.num_reg_fcs): + cls_branch.append(Linear(self.embed_dims, self.embed_dims)) + cls_branch.append(nn.LayerNorm(self.embed_dims)) + cls_branch.append(nn.ReLU(inplace=True)) + if self.normedlinear: + cls_branch.append(NormedLinear(self.embed_dims, self.cls_out_channels)) + else: + cls_branch.append(Linear(self.embed_dims, self.cls_out_channels)) + fc_cls = nn.Sequential(*cls_branch) + + reg_branch = [] + for _ in range(self.num_reg_fcs): + reg_branch.append(Linear(self.embed_dims, self.embed_dims)) + reg_branch.append(nn.ReLU()) + reg_branch.append(Linear(self.embed_dims, self.code_size)) + reg_branch = nn.Sequential(*reg_branch) + + self.cls_branches = nn.ModuleList([fc_cls for _ in range(self.num_pred)]) + self.reg_branches = nn.ModuleList([reg_branch for _ in range(self.num_pred)]) + + if self.with_multiview: + self.adapt_pos3d = nn.Sequential( + nn.Conv2d(self.embed_dims * 3 // 2, self.embed_dims * 4, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims * 4, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + else: + self.adapt_pos3d = nn.Sequential( + nn.Conv2d(self.embed_dims, self.embed_dims, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + + if self.with_position: + self.position_encoder = nn.Sequential( + nn.Conv2d(self.position_dim, self.embed_dims * 4, kernel_size=1, stride=1, padding=0), + nn.ReLU(), + nn.Conv2d(self.embed_dims * 4, self.embed_dims, kernel_size=1, stride=1, padding=0), + ) + + self.reference_points = nn.Embedding(self.num_query, 3) + self.query_embedding = nn.Sequential( + nn.Linear(self.embed_dims * 3 // 2, self.embed_dims), + nn.ReLU(), + nn.Linear(self.embed_dims, self.embed_dims), + ) + + def init_weights(self): + """Initialize weights of the transformer head.""" + # The initialization for transformer is important + self.transformer.init_weights() + nn.init.uniform_(self.reference_points.weight.data, 0, 1) + if self.loss_cls.use_sigmoid: + bias_init = bias_init_with_prob(0.01) + for m in self.cls_branches: + nn.init.constant_(m[-1].bias, bias_init) + + def position_embeding(self, img_feats, img_metas, masks=None): + eps = 1e-5 + pad_h, pad_w, _ = img_metas[0]["pad_shape"][0] + B, N, C, H, W = img_feats[self.position_level].shape + coords_h = torch.arange(H, device=img_feats[0].device).float() * pad_h / H + coords_w = torch.arange(W, 
device=img_feats[0].device).float() * pad_w / W + + if self.LID: + index = torch.arange(start=0, end=self.depth_num, step=1, device=img_feats[0].device).float() + index_1 = index + 1 + bin_size = (self.position_range[3] - self.depth_start) / (self.depth_num * (1 + self.depth_num)) + coords_d = self.depth_start + bin_size * index * index_1 + else: + index = torch.arange(start=0, end=self.depth_num, step=1, device=img_feats[0].device).float() + bin_size = (self.position_range[3] - self.depth_start) / self.depth_num + coords_d = self.depth_start + bin_size * index + + D = coords_d.shape[0] + coords = torch.stack(torch.meshgrid([coords_w, coords_h, coords_d])).permute(1, 2, 3, 0) # W, H, D, 3 + coords = torch.cat((coords, torch.ones_like(coords[..., :1])), -1) + + # coords[..., :2] = coords[..., :2] * torch.maximum(coords[..., 2:3], torch.ones_like(coords[..., 2:3])*eps) + + updated_coords = coords[..., :2] * torch.maximum(coords[..., 2:3], torch.ones_like(coords[..., 2:3]) * eps) + coords = torch.cat((updated_coords, coords[..., 2:]), dim=-1) + + img2lidars = [] + for img_meta in img_metas: + img2lidar = [] + for i in range(len(img_meta["lidar2img"])): + img2lidar.append(np.linalg.inv(img_meta["lidar2img"][i])) + img2lidars.append(np.asarray(img2lidar)) + img2lidars = np.asarray(img2lidars) + img2lidars = coords.new_tensor(img2lidars) # (B, N, 4, 4) + + coords = coords.view(1, 1, W, H, D, 4, 1).repeat(B, N, 1, 1, 1, 1, 1) + img2lidars = img2lidars.view(B, N, 1, 1, 1, 4, 4).repeat(1, 1, W, H, D, 1, 1) + coords3d = torch.matmul(img2lidars, coords).squeeze(-1)[..., :3] + + # coords3d[..., 0:1] = (coords3d[..., 0:1] - self.position_range[0]) / (self.position_range[3] - self.position_range[0]) + # coords3d[..., 1:2] = (coords3d[..., 1:2] - self.position_range[1]) / (self.position_range[4] - self.position_range[1]) + # coords3d[..., 2:3] = (coords3d[..., 2:3] - self.position_range[2]) / (self.position_range[5] - self.position_range[2]) + + x = (coords3d[..., 0:1] - self.position_range[0]) / (self.position_range[3] - self.position_range[0]) + y = (coords3d[..., 1:2] - self.position_range[1]) / (self.position_range[4] - self.position_range[1]) + z = (coords3d[..., 2:3] - self.position_range[2]) / (self.position_range[5] - self.position_range[2]) + coords3d = torch.cat([x, y, z], dim=-1) + + coords_mask = (coords3d > 1.0) | (coords3d < 0.0) + coords_mask = coords_mask.flatten(-2).sum(-1) > (D * 0.5) + coords_mask = masks | coords_mask.permute(0, 1, 3, 2) + coords3d = coords3d.permute(0, 1, 4, 5, 3, 2).contiguous().view(B * N, -1, H, W) + coords3d = inverse_sigmoid(coords3d) + coords_position_embeding = self.position_encoder(coords3d) + + return coords_position_embeding.view(B, N, self.embed_dims, H, W), coords_mask + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + """load checkpoints.""" + # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, + # since `AnchorFreeHead._load_from_state_dict` should not be + # called here. Invoking the default `Module._load_from_state_dict` + # is enough. + + # Names of some parameters in has been changed. 
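+        # The version check below upgrades version-1 checkpoints whose attention
+        # and norm parameters were saved under the old DETR-style names
+        # (`.self_attn.`, `.multihead_attn.`, `.decoder.norm.`) to the current
+        # layout (`.attentions.0.`, `.attentions.1.`, `.decoder.post_norm.`).
+        # Illustrative (hypothetical) rename, not taken from a real checkpoint:
+        #   '...decoder.layers.0.self_attn.attn.in_proj_weight'
+        #   -> '...decoder.layers.0.attentions.0.attn.in_proj_weight'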
+ version = local_metadata.get("version", None) + if (version is None or version < 2) and self.__class__ is PETRHead: + convert_dict = { + ".self_attn.": ".attentions.0.", + # '.ffn.': '.ffns.0.', + ".multihead_attn.": ".attentions.1.", + ".decoder.norm.": ".decoder.post_norm.", + } + state_dict_keys = list(state_dict.keys()) + for k in state_dict_keys: + for ori_key, convert_key in convert_dict.items(): + if ori_key in k: + convert_key = k.replace(ori_key, convert_key) + state_dict[convert_key] = state_dict[k] + del state_dict[k] + + super(AnchorFreeHead, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, mlvl_feats, img_metas): + """Forward function. + Args: + mlvl_feats (tuple[Tensor]): Features from the upstream + network, each is a 5D-tensor with shape + (B, N, C, H, W). + Returns: + all_cls_scores (Tensor): Outputs from the classification head, \ + shape [nb_dec, bs, num_query, cls_out_channels]. Note \ + cls_out_channels should includes background. + all_bbox_preds (Tensor): Sigmoid outputs from the regression \ + head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ + Shape [nb_dec, bs, num_query, 9]. + """ + + x = mlvl_feats[0] + batch_size, num_cams = x.size(0), x.size(1) + input_img_h, input_img_w, _ = img_metas[0]["pad_shape"][0] + # masks = x.new_ones( + # (batch_size, num_cams, input_img_h, input_img_w)) + masks = img_metas[0]["masks"] + + for img_id in range(batch_size): + for cam_id in range(num_cams): + img_h, img_w, _ = img_metas[img_id]["img_shape"][cam_id] + masks[img_id, cam_id, :img_h, :img_w] = 0 + x = self.input_proj(x.flatten(0, 1)) + x = x.view(batch_size, num_cams, *x.shape[-3:]) + # interpolate masks to have the same spatial shape with x + masks = F.interpolate(masks, size=x.shape[-2:]).to(torch.bool) + + if self.with_position: + coords_position_embeding, _ = self.position_embeding(mlvl_feats, img_metas, masks) + pos_embed = coords_position_embeding + if self.with_multiview: + sin_embed = self.positional_encoding(masks) + sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(x.size()) + pos_embed = pos_embed + sin_embed + else: + pos_embeds = [] + for i in range(num_cams): + xy_embed = self.positional_encoding(masks[:, i, :, :]) + pos_embeds.append(xy_embed.unsqueeze(1)) + sin_embed = torch.cat(pos_embeds, 1) + sin_embed = self.adapt_pos3d(sin_embed.flatten(0, 1)).view(x.size()) + pos_embed = pos_embed + sin_embed + else: + if self.with_multiview: + pos_embed = self.positional_encoding(masks) + pos_embed = self.adapt_pos3d(pos_embed.flatten(0, 1)).view(x.size()) + else: + pos_embeds = [] + for i in range(num_cams): + pos_embed = self.positional_encoding(masks[:, i, :, :]) + pos_embeds.append(pos_embed.unsqueeze(1)) + pos_embed = torch.cat(pos_embeds, 1) + + reference_points = self.reference_points.weight + query_embeds = self.query_embedding(pos2posemb3d(reference_points)) + reference_points = reference_points.unsqueeze(0).repeat(batch_size, 1, 1) # .sigmoid() + outs_dec, _ = self.transformer(x, masks, query_embeds, pos_embed, self.reg_branches) + + outs_dec = torch.nan_to_num(outs_dec) + outputs_classes = [] + outputs_coords = [] + for lvl in range(outs_dec.shape[0]): + reference = inverse_sigmoid(reference_points.clone()) + assert reference.shape[-1] == 3 + outputs_class = self.cls_branches[lvl](outs_dec[lvl]) + tmp = self.reg_branches[lvl](outs_dec[lvl]) + + # tmp[..., 0:2] += reference[..., 0:2] + # tmp[..., 0:2] = tmp[..., 0:2].sigmoid() 
+ # tmp[..., 4:5] += reference[..., 2:3] + # tmp[..., 4:5] = tmp[..., 4:5].sigmoid() + + xy = tmp[..., 0:2] + reference[..., 0:2] + xy = xy.sigmoid() + z = tmp[..., 4:5] + reference[..., 2:3] + z = z.sigmoid() + tmp = torch.cat([xy, tmp[..., 2:4], z, tmp[..., 5:]], dim=-1) + + outputs_coord = tmp + outputs_classes.append(outputs_class) + outputs_coords.append(outputs_coord) + + all_cls_scores = torch.stack(outputs_classes) + all_bbox_preds = torch.stack(outputs_coords) + + # all_bbox_preds[..., 0:1] = (all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) + # all_bbox_preds[..., 1:2] = (all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) + # all_bbox_preds[..., 4:5] = (all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2]) + + updated_0_1 = all_bbox_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0] + updated_1_2 = all_bbox_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1] + updated_4_5 = all_bbox_preds[..., 4:5] * (self.pc_range[5] - self.pc_range[2]) + self.pc_range[2] + + all_bbox_preds = torch.cat( + [updated_0_1, updated_1_2, all_bbox_preds[..., 2:4], updated_4_5, all_bbox_preds[..., 5:]], dim=-1 + ) + + outs = { + "all_cls_scores": all_cls_scores, + "all_bbox_preds": all_bbox_preds, + "enc_cls_scores": None, + "enc_bbox_preds": None, + } + return outs + + @force_fp32(apply_to=("preds_dicts")) + def loss(self, gt_bboxes_list, gt_labels_list, preds_dicts, gt_bboxes_ignore=None): + """ "Loss function. + Args: + gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image + with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. + gt_labels_list (list[Tensor]): Ground truth class indices for each + image with shape (num_gts, ). + preds_dicts: + all_cls_scores (Tensor): Classification score of all + decoder layers, has shape + [nb_dec, bs, num_query, cls_out_channels]. + all_bbox_preds (Tensor): Sigmoid regression + outputs of all decode layers. Each is a 4D-tensor with + normalized coordinate format (cx, cy, w, h) and shape + [nb_dec, bs, num_query, 4]. + enc_cls_scores (Tensor): Classification scores of + points on encode feature map , has shape + (N, h*w, num_classes). Only be passed when as_two_stage is + True, otherwise is None. + enc_bbox_preds (Tensor): Regression results of each points + on the encode feature map, has shape (N, h*w, 4). Only be + passed when as_two_stage is True, otherwise is None. + gt_bboxes_ignore (list[Tensor], optional): Bounding boxes + which can be ignored for each image. Default None. + Returns: + dict[str, Tensor]: A dictionary of loss components. + """ + assert gt_bboxes_ignore is None, ( + f"{self.__class__.__name__} only supports " f"for gt_bboxes_ignore setting to None." 
+ ) + + all_cls_scores = preds_dicts["all_cls_scores"] + all_bbox_preds = preds_dicts["all_bbox_preds"] + enc_cls_scores = preds_dicts["enc_cls_scores"] + enc_bbox_preds = preds_dicts["enc_bbox_preds"] + + num_dec_layers = len(all_cls_scores) + device = gt_labels_list[0].device + gt_bboxes_list = [ + torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) + for gt_bboxes in gt_bboxes_list + ] + + all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] + all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] + all_gt_bboxes_ignore_list = [gt_bboxes_ignore for _ in range(num_dec_layers)] + + losses_cls, losses_bbox = multi_apply( + self.loss_single, + all_cls_scores, + all_bbox_preds, + all_gt_bboxes_list, + all_gt_labels_list, + all_gt_bboxes_ignore_list, + ) + + loss_dict = dict() + # loss of proposal generated from encode feature map. + if enc_cls_scores is not None: + binary_labels_list = [torch.zeros_like(gt_labels_list[i]) for i in range(len(all_gt_labels_list))] + enc_loss_cls, enc_losses_bbox = self.loss_single( + enc_cls_scores, enc_bbox_preds, gt_bboxes_list, binary_labels_list, gt_bboxes_ignore + ) + loss_dict["enc_loss_cls"] = enc_loss_cls + loss_dict["enc_loss_bbox"] = enc_losses_bbox + + # loss from the last decoder layer + loss_dict["loss_cls"] = losses_cls[-1] + loss_dict["loss_bbox"] = losses_bbox[-1] + + # loss from other decoder layers + num_dec_layer = 0 + for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): + loss_dict[f"d{num_dec_layer}.loss_cls"] = loss_cls_i + loss_dict[f"d{num_dec_layer}.loss_bbox"] = loss_bbox_i + num_dec_layer += 1 + return loss_dict diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py b/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py new file mode 100644 index 000000000..b728d7f51 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_transformer.py @@ -0,0 +1,447 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ + +import warnings + +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp +from mmcv.cnn import build_norm_layer, xavier_init +from mmcv.cnn.bricks.drop import build_dropout +from mmcv.cnn.bricks.registry import ( + ATTENTION, + TRANSFORMER_LAYER, + TRANSFORMER_LAYER_SEQUENCE, +) +from mmcv.cnn.bricks.transformer import ( + BaseTransformerLayer, + TransformerLayerSequence, + build_transformer_layer_sequence, +) +from mmcv.runner.base_module import BaseModule +from mmcv.utils import deprecated_api_warning +from mmdet.models.utils.builder import TRANSFORMER + + +@TRANSFORMER.register_module() +class PETRTransformer(BaseModule): + """Implements the DETR transformer. 
+ Following the official DETR implementation, this module copy-paste + from torch.nn.Transformer with modifications: + * positional encodings are passed in MultiheadAttention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers + See `paper: End-to-End Object Detection with Transformers + `_ for details. + Args: + encoder (`mmcv.ConfigDict` | Dict): Config of + TransformerEncoder. Defaults to None. + decoder ((`mmcv.ConfigDict` | Dict)): Config of + TransformerDecoder. Defaults to None + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Defaults to None. + """ + + def __init__(self, encoder=None, decoder=None, init_cfg=None, cross=False): + super(PETRTransformer, self).__init__(init_cfg=init_cfg) + if encoder is not None: + self.encoder = build_transformer_layer_sequence(encoder) + else: + self.encoder = None + self.decoder = build_transformer_layer_sequence(decoder) + self.embed_dims = self.decoder.embed_dims + self.cross = cross + + def init_weights(self): + # follow the official DETR to init parameters + for m in self.modules(): + if hasattr(m, "weight") and m.weight.dim() > 1: + xavier_init(m, distribution="uniform") + self._is_init = True + + def forward(self, x, mask, query_embed, pos_embed, reg_branch=None): + """Forward function for `Transformer`. + Args: + x (Tensor): Input query with shape [bs, c, h, w] where + c = embed_dims. + mask (Tensor): The key_padding_mask used for encoder and decoder, + with shape [bs, h, w]. + query_embed (Tensor): The query embedding for decoder, with shape + [num_query, c]. + pos_embed (Tensor): The positional encoding for encoder and + decoder, with the same shape as `x`. + Returns: + tuple[Tensor]: results of decoder containing the following tensor. + - out_dec: Output from decoder. If return_intermediate_dec \ + is True output has shape [num_dec_layers, bs, + num_query, embed_dims], else has shape [1, bs, \ + num_query, embed_dims]. + - memory: Output results from encoder, with shape \ + [bs, embed_dims, h, w]. + """ + bs, n, c, h, w = x.shape + memory = x.permute(1, 3, 4, 0, 2).reshape(-1, bs, c) # [bs, n, c, h, w] -> [n*h*w, bs, c] + pos_embed = pos_embed.permute(1, 3, 4, 0, 2).reshape(-1, bs, c) # [bs, n, c, h, w] -> [n*h*w, bs, c] + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) # [num_query, dim] -> [num_query, bs, dim] + mask = mask.view(bs, -1) # [bs, n, h, w] -> [bs, n*h*w] + + # target = torch.zeros_like(query_embed) + target = torch.zeros(query_embed.shape) + + # out_dec: [num_layers, num_query, bs, dim] + out_dec = self.decoder( + query=target, + key=memory, + value=memory, + key_pos=pos_embed, + query_pos=query_embed, + key_padding_mask=mask, + reg_branch=reg_branch, + ) + + out_dec = out_dec.transpose(1, 2) + memory = memory.reshape(n, h, w, bs, c).permute(3, 0, 4, 1, 2) + return out_dec, memory + + +@TRANSFORMER_LAYER.register_module() +class PETRTransformerDecoderLayer(BaseTransformerLayer): + """Implements decoder layer in DETR transformer. + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): + Configs for self_attention or cross_attention, the order + should be consistent with it in `operation_order`. If it is + a dict, it would be expand to the number of attention in + `operation_order`. + feedforward_channels (int): The hidden dimension for FFNs. + ffn_dropout (float): Probability of an element to be zeroed + in ffn. Default 0.0. + operation_order (tuple[str]): The execution order of operation + in transformer. 
Such as ('self_attn', 'norm', 'ffn', 'norm'). + Default:None + act_cfg (dict): The activation config for FFNs. Default: `LN` + norm_cfg (dict): Config dict for normalization layer. + Default: `LN`. + ffn_num_fcs (int): The number of fully-connected layers in FFNs. + Default:2. + """ + + def __init__( + self, + attn_cfgs, + feedforward_channels, + ffn_dropout=0.0, + operation_order=None, + act_cfg=dict(type="ReLU", inplace=True), + norm_cfg=dict(type="LN"), + ffn_num_fcs=2, + with_cp=True, + **kwargs, + ): + super(PETRTransformerDecoderLayer, self).__init__( + attn_cfgs=attn_cfgs, + feedforward_channels=feedforward_channels, + ffn_dropout=ffn_dropout, + operation_order=operation_order, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + ffn_num_fcs=ffn_num_fcs, + **kwargs, + ) + assert len(operation_order) == 6 + assert set(operation_order) == set(["self_attn", "norm", "cross_attn", "ffn"]) + self.use_checkpoint = with_cp + + def _forward( + self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + ): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(PETRTransformerDecoderLayer, self).forward( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + ) + + return x + + def forward( + self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs, + ): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if self.use_checkpoint and self.training: + x = cp.checkpoint( + self._forward, + query, + key, + value, + query_pos, + key_pos, + attn_masks, + query_key_padding_mask, + key_padding_mask, + ) + else: + x = self._forward( + query, + key=key, + value=value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + ) + return x + + +@ATTENTION.register_module() +class PETRMultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
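+    Example (illustrative sketch only; the shapes below are assumed and do
+    not come from a PETR config)::
+
+        >>> attn = PETRMultiheadAttention(embed_dims=256, num_heads=8)
+        >>> query = torch.rand(900, 2, 256)  # [num_query, bs, embed_dims]
+        >>> key = torch.rand(6000, 2, 256)   # [num_key, bs, embed_dims]
+        >>> out = attn(query=query, key=key, value=key)
+        >>> out.shape
+        torch.Size([900, 2, 256])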
+ """ + + def __init__( + self, + embed_dims, + num_heads, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=dict(type="Dropout", drop_prob=0.0), + init_cfg=None, + batch_first=False, + **kwargs, + ): + super(PETRMultiheadAttention, self).__init__(init_cfg) + if "dropout" in kwargs: + warnings.warn( + "The arguments `dropout` in MultiheadAttention " + "has been deprecated, now you can separately " + "set `attn_drop`(float), proj_drop(float), " + "and `dropout_layer`(dict) ", + DeprecationWarning, + ) + attn_drop = kwargs["dropout"] + dropout_layer["drop_prob"] = kwargs.pop("dropout") + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout(dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({"residual": "identity"}, cls_name="MultiheadAttention") + def forward( + self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs, + ): + """Forward function for `MultiheadAttention`. + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f"position encoding of key is" f"missing in {self.__class__.__name__}.") + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn(query=query, key=key, value=value, attn_mask=attn_mask, key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class PETRTransformerEncoder(TransformerLayerSequence): + """TransformerEncoder of DETR. + Args: + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. Only used when `self.pre_norm` is `True` + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), **kwargs): + super(PETRTransformerEncoder, self).__init__(*args, **kwargs) + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None + else: + assert not self.pre_norm, f"Use prenorm in " f"{self.__class__.__name__}," f"Please specify post_norm_cfg" + self.post_norm = None + + def forward(self, *args, **kwargs): + """Forward function for `TransformerCoder`. + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + x = super(PETRTransformerEncoder, self).forward(*args, **kwargs) + if self.post_norm is not None: + x = self.post_norm(x) + return x + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class PETRTransformerDecoder(TransformerLayerSequence): + """Implements the decoder in DETR transformer. + Args: + return_intermediate (bool): Whether to return intermediate outputs. + post_norm_cfg (dict): Config of last normalization layer. Default: + `LN`. + """ + + def __init__(self, *args, post_norm_cfg=dict(type="LN"), return_intermediate=False, **kwargs): + + super(PETRTransformerDecoder, self).__init__(*args, **kwargs) + self.return_intermediate = return_intermediate + if post_norm_cfg is not None: + self.post_norm = build_norm_layer(post_norm_cfg, self.embed_dims)[1] + else: + self.post_norm = None + + def forward(self, query, *args, **kwargs): + """Forward function for `TransformerDecoder`. + Args: + query (Tensor): Input query with shape + `(num_query, bs, embed_dims)`. + Returns: + Tensor: Results with shape [1, num_query, bs, embed_dims] when + return_intermediate is `False`, otherwise it has shape + [num_layers, num_query, bs, embed_dims]. 
+ """ + if not self.return_intermediate: + x = super().forward(query, *args, **kwargs) + if self.post_norm: + x = self.post_norm(x)[None] + return x + + intermediate = [] + for layer in self.layers: + query = layer(query, *args, **kwargs) + if self.return_intermediate: + if self.post_norm is not None: + intermediate.append(self.post_norm(query)) + + else: + intermediate.append(query) + return torch.stack(intermediate) diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py new file mode 100644 index 000000000..2417e73eb --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_1600x640.py @@ -0,0 +1,242 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +backbone_norm_cfg = dict(type="LN", requires_grad=True) +plugin = True + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] +input_modality = dict(use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True) +model = dict( + type="Petr3D", + use_grid_mask=True, + img_backbone=dict( + type="VoVNetCP", + spec_name="V-99-eSE", + norm_eval=True, + frozen_stages=-1, + input_ch=3, + out_features=( + "stage4", + "stage5", + ), + ), + img_neck=dict(type="CPFPN", in_channels=[768, 1024], out_channels=256, num_outs=2), + pts_bbox_head=dict( + type="PETRHead", + num_classes=10, + in_channels=256, + num_query=900, + LID=True, + with_position=True, + with_multiview=True, + position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + normedlinear=False, + transformer=dict( + type="PETRTransformer", + decoder=dict( + type="PETRTransformerDecoder", + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type="PETRTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + dict(type="PETRMultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + ], + feedforward_channels=2048, + ffn_dropout=0.1, + with_cp=True, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + # type='NMSFreeClsCoder', + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict(type="SinePositionalEncoding3D", num_feats=128, normalize=True), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + # model training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. 
This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) + +dataset_type = "CustomNuScenesDataset" +data_root = "../data/nuscenes/" +file_client_args = dict(backend="disk") + +db_sampler = dict( + data_root=data_root, + info_path=data_root + "nuscenes_dbinfos_train.pkl", + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ), + ), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2, + ), + points_loader=dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args, + ), +) +ida_aug_conf = { + "resize_lim": (0.94, 1.25), + "final_dim": (640, 1600), + "bot_pct_lim": (0.0, 0.0), + "rot_lim": (0.0, 0.0), + "H": 900, + "W": 1600, + # "rand_flip": False, + "rand_flip": True, +} +train_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilter", classes=class_names), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=True), + dict( + type="GlobalRotScaleTransImage", + rot_range=[-0.3925, 0.3925], + translation_std=[0, 0, 0], + scale_ratio_range=[0.95, 1.05], + reverse_angle=True, + training=True, + ), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict(type="Collect3D", keys=["gt_bboxes_3d", "gt_labels_3d", "img"]), +] +test_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=False), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["img"]), + ], + ), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + "nuscenes_infos_train.pkl", + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d="LiDAR", + ), + val=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), + test=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), +) + +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) + +optimizer_config = dict(type="Fp16OptimizerHook", loss_scale=512.0, grad_clip=dict(max_norm=35, norm_type=2)) + +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, + # by_epoch=False +) +total_epochs = 24 +evaluation = dict(interval=24, pipeline=test_pipeline) +find_unused_parameters = False + +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +resume_from = None diff --git a/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py new file mode 100644 index 000000000..329e58094 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/petr_vovnet_gridmask_p4_800x320.py @@ -0,0 +1,239 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +backbone_norm_cfg = dict(type="LN", requires_grad=True) +plugin = True + +# If point cloud range is changed, the models should also change their point +# cloud range accordingly +point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] +voxel_size = [0.2, 0.2, 8] +img_norm_cfg = dict(mean=[103.530, 116.280, 123.675], std=[57.375, 57.120, 58.395], to_rgb=False) +# For nuScenes we usually do 10-class detection +class_names = [ + "car", + "truck", + "construction_vehicle", + "bus", + "trailer", + "barrier", + "motorcycle", + "bicycle", + "pedestrian", + "traffic_cone", +] +input_modality = dict(use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=True) +model = dict( + type="Petr3D", + use_grid_mask=True, + img_backbone=dict( + type="VoVNetCP", + spec_name="V-99-eSE", + norm_eval=True, + frozen_stages=-1, + input_ch=3, + out_features=( + "stage4", + "stage5", + ), + ), + img_neck=dict(type="CPFPN", in_channels=[768, 1024], out_channels=256, num_outs=2), + pts_bbox_head=dict( + type="PETRHead", + num_classes=10, + in_channels=256, + num_query=900, + LID=True, + with_position=True, + with_multiview=True, + position_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + normedlinear=False, + transformer=dict( + type="PETRTransformer", + decoder=dict( + type="PETRTransformerDecoder", + return_intermediate=True, + num_layers=6, + transformerlayers=dict( + type="PETRTransformerDecoderLayer", + attn_cfgs=[ + dict(type="MultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + dict(type="PETRMultiheadAttention", embed_dims=256, num_heads=8, dropout=0.1), + ], + feedforward_channels=2048, + ffn_dropout=0.1, + operation_order=("self_attn", "norm", "cross_attn", "norm", "ffn", "norm"), + ), + ), + ), + bbox_coder=dict( + type="NMSFreeCoder", + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + pc_range=point_cloud_range, + max_num=300, + voxel_size=voxel_size, + num_classes=10, + ), + positional_encoding=dict(type="SinePositionalEncoding3D", num_feats=128, normalize=True), + loss_cls=dict(type="FocalLoss", use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), + loss_bbox=dict(type="L1Loss", loss_weight=0.25), + loss_iou=dict(type="GIoULoss", loss_weight=0.0), + ), + # model 
training and testing settings + train_cfg=dict( + pts=dict( + grid_size=[512, 512, 1], + voxel_size=voxel_size, + point_cloud_range=point_cloud_range, + out_size_factor=4, + assigner=dict( + type="HungarianAssigner3D", + cls_cost=dict(type="FocalLossCost", weight=2.0), + reg_cost=dict(type="BBox3DL1Cost", weight=0.25), + iou_cost=dict( + type="IoUCost", weight=0.0 + ), # Fake cost. This is just to make it compatible with DETR head. + pc_range=point_cloud_range, + ), + ) + ), +) + +dataset_type = "CustomNuScenesDataset" +data_root = "../data/nuscenes/" + +file_client_args = dict(backend="disk") + +db_sampler = dict( + data_root=data_root, + info_path=data_root + "nuscenes_dbinfos_train.pkl", + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5, + ), + ), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2, + ), + points_loader=dict( + type="LoadPointsFromFile", + coord_type="LIDAR", + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + file_client_args=file_client_args, + ), +) +ida_aug_conf = { + "resize_lim": (0.47, 0.625), + "final_dim": (320, 800), + "bot_pct_lim": (0.0, 0.0), + "rot_lim": (0.0, 0.0), + "H": 900, + "W": 1600, + "rand_flip": True, +} +train_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="LoadAnnotations3D", with_bbox_3d=True, with_label_3d=True, with_attr_label=False), + dict(type="ObjectRangeFilter", point_cloud_range=point_cloud_range), + dict(type="ObjectNameFilter", classes=class_names), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=True), + dict( + type="GlobalRotScaleTransImage", + rot_range=[-0.3925, 0.3925], + translation_std=[0, 0, 0], + scale_ratio_range=[0.95, 1.05], + reverse_angle=True, + training=True, + ), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict(type="DefaultFormatBundle3D", class_names=class_names), + dict(type="Collect3D", keys=["gt_bboxes_3d", "gt_labels_3d", "img"]), +] +test_pipeline = [ + dict(type="LoadMultiViewImageFromFiles", to_float32=True), + dict(type="ResizeCropFlipImage", data_aug_conf=ida_aug_conf, training=False), + dict(type="NormalizeMultiviewImage", **img_norm_cfg), + dict(type="PadMultiViewImage", size_divisor=32), + dict( + type="MultiScaleFlipAug3D", + img_scale=(1333, 800), + pts_scale_ratio=1, + flip=False, + transforms=[ + dict(type="DefaultFormatBundle3D", class_names=class_names, with_label=False), + dict(type="Collect3D", keys=["img"]), + ], + ), +] + +data = dict( + samples_per_gpu=1, + workers_per_gpu=4, + train=dict( + type=dataset_type, + data_root=data_root, + ann_file=data_root + "nuscenes_infos_train.pkl", + pipeline=train_pipeline, + classes=class_names, + modality=input_modality, + test_mode=False, + use_valid_flag=True, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
+ box_type_3d="LiDAR", + ), + val=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), + test=dict(type=dataset_type, pipeline=test_pipeline, classes=class_names, modality=input_modality), +) + +optimizer = dict( + type="AdamW", + lr=2e-4, + paramwise_cfg=dict( + custom_keys={ + "img_backbone": dict(lr_mult=0.1), + } + ), + weight_decay=0.01, +) + +optimizer_config = dict(type="Fp16OptimizerHook", loss_scale=512.0, grad_clip=dict(max_norm=35, norm_type=2)) + +# learning policy +lr_config = dict( + policy="CosineAnnealing", + warmup="linear", + warmup_iters=500, + warmup_ratio=1.0 / 3, + min_lr_ratio=1e-3, +) +total_epochs = 24 +evaluation = dict(interval=24, pipeline=test_pipeline) +find_unused_parameters = False + +runner = dict(type="EpochBasedRunner", max_epochs=total_epochs) +resume_from = None diff --git a/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py b/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py new file mode 100644 index 000000000..dc6896208 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/positional_encoding.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from mmdetection (https://github.com/open-mmlab/mmdetection) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import math + +import torch +import torch.nn as nn +from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING +from mmcv.runner import BaseModule + + +@POSITIONAL_ENCODING.register_module() +class SinePositionalEncoding3D(BaseModule): + """Position encoding with sine and cosine functions. + See `End-to-End Object Detection with Transformers + `_ for details. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. Note the final returned dimension + for each position is 2 times of this value. + temperature (int, optional): The temperature used for scaling + the position embedding. Defaults to 10000. + normalize (bool, optional): Whether to normalize the position + embedding. Defaults to False. + scale (float, optional): A scale factor that scales the position + embedding. The scale will be used only when `normalize` is True. + Defaults to 2*pi. + eps (float, optional): A value added to the denominator for + numerical stability. Defaults to 1e-6. + offset (float): offset add to embed when do the normalization. + Defaults to 0. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None + """ + + def __init__( + self, num_feats, temperature=10000, normalize=False, scale=2 * math.pi, eps=1e-6, offset=0.0, init_cfg=None + ): + super(SinePositionalEncoding3D, self).__init__(init_cfg) + if normalize: + assert isinstance(scale, (float, int)), ( + "when normalize is set," "scale should be provided and in float or int type, " f"found {type(scale)}" + ) + self.num_feats = num_feats + self.temperature = temperature + self.normalize = normalize + self.scale = scale + self.eps = eps + self.offset = offset + + def forward(self, mask): + """Forward function for `SinePositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. 
Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. + """ + # For convenience of exporting to ONNX, it's required to convert + # `masks` from bool to int. + mask = mask.to(torch.int) + not_mask = 1 - mask # logical_not + n_embed = not_mask.cumsum(1, dtype=torch.float32) + y_embed = not_mask.cumsum(2, dtype=torch.float32) + x_embed = not_mask.cumsum(3, dtype=torch.float32) + if self.normalize: + n_embed = (n_embed + self.offset) / (n_embed[:, -1:, :, :] + self.eps) * self.scale + y_embed = (y_embed + self.offset) / (y_embed[:, :, -1:, :] + self.eps) * self.scale + x_embed = (x_embed + self.offset) / (x_embed[:, :, :, -1:] + self.eps) * self.scale + dim_t = torch.arange(self.num_feats, dtype=torch.float32, device=mask.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_feats) + pos_n = n_embed[:, :, :, :, None] / dim_t + pos_x = x_embed[:, :, :, :, None] / dim_t + pos_y = y_embed[:, :, :, :, None] / dim_t + # use `view` instead of `flatten` for dynamically exporting to ONNX + B, N, H, W = mask.size() + pos_n = torch.stack((pos_n[:, :, :, :, 0::2].sin(), pos_n[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos_x = torch.stack((pos_x[:, :, :, :, 0::2].sin(), pos_x[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos_y = torch.stack((pos_y[:, :, :, :, 0::2].sin(), pos_y[:, :, :, :, 1::2].cos()), dim=4).view(B, N, H, W, -1) + pos = torch.cat((pos_n, pos_y, pos_x), dim=4).permute(0, 1, 4, 2, 3) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"temperature={self.temperature}, " + repr_str += f"normalize={self.normalize}, " + repr_str += f"scale={self.scale}, " + repr_str += f"eps={self.eps})" + return repr_str + + +@POSITIONAL_ENCODING.register_module() +class LearnedPositionalEncoding3D(BaseModule): + """Position embedding with learnable embedding weights. + Args: + num_feats (int): The feature dimension for each position + along x-axis or y-axis. The final returned dimension for + each position is 2 times of this value. + row_num_embed (int, optional): The dictionary size of row embeddings. + Default 50. + col_num_embed (int, optional): The dictionary size of col embeddings. + Default 50. + init_cfg (dict or list[dict], optional): Initialization config dict. + """ + + def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type="Uniform", layer="Embedding")): + super(LearnedPositionalEncoding3D, self).__init__(init_cfg) + self.row_embed = nn.Embedding(row_num_embed, num_feats) + self.col_embed = nn.Embedding(col_num_embed, num_feats) + self.num_feats = num_feats + self.row_num_embed = row_num_embed + self.col_num_embed = col_num_embed + + def forward(self, mask): + """Forward function for `LearnedPositionalEncoding`. + Args: + mask (Tensor): ByteTensor mask. Non-zero values representing + ignored positions, while zero values means valid positions + for this image. Shape [bs, h, w]. + Returns: + pos (Tensor): Returned position embedding with shape + [bs, num_feats*2, h, w]. 
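+        Example (illustrative sketch; the mask size below is assumed)::
+
+            >>> pe = LearnedPositionalEncoding3D(num_feats=128)
+            >>> mask = torch.zeros(1, 20, 50)  # [bs, h, w]
+            >>> pe(mask).shape
+            torch.Size([1, 256, 20, 50])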
+ """ + h, w = mask.shape[-2:] + x = torch.arange(w, device=mask.device) + y = torch.arange(h, device=mask.device) + x_embed = self.col_embed(x) + y_embed = self.row_embed(y) + pos = ( + torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(1, w, 1)), dim=-1) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(mask.shape[0], 1, 1, 1) + ) + return pos + + def __repr__(self): + """str: a string that describes the module""" + repr_str = self.__class__.__name__ + repr_str += f"(num_feats={self.num_feats}, " + repr_str += f"row_num_embed={self.row_num_embed}, " + repr_str += f"col_num_embed={self.col_num_embed})" + return repr_str diff --git a/forge/test/models/pytorch/vision/petr/utils/transform_3d.py b/forge/test/models/pytorch/vision/petr/utils/transform_3d.py new file mode 100644 index 000000000..429d157da --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/transform_3d.py @@ -0,0 +1,213 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import mmcv + +# ------------------------------------------------------------------------ +# Copyright (c) 2022 megvii-model. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from DETR3D (https://github.com/WangYueFt/detr3d) +# Copyright (c) 2021 Wang, Yue +# ------------------------------------------------------------------------ +# Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------ +import numpy as np +import torch +from mmdet.datasets.builder import PIPELINES +from PIL import Image + + +@PIPELINES.register_module() +class PadMultiViewImage(object): + """Pad the multi-view image. + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value, 0 by default. + """ + + def __init__(self, size=None, size_divisor=None, pad_val=0): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = [mmcv.impad(img, shape=self.size, pad_val=self.pad_val) for img in results["img"]] + elif self.size_divisor is not None: + padded_img = [ + mmcv.impad_to_multiple(img, self.size_divisor, pad_val=self.pad_val) for img in results["img"] + ] + results["img_shape"] = [img.shape for img in results["img"]] + results["img"] = padded_img + results["pad_shape"] = [img.shape for img in padded_img] + results["pad_fixed_size"] = self.size + results["pad_size_divisor"] = self.size_divisor + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
+ """ + self._pad_img(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(size={self.size}, " + repr_str += f"size_divisor={self.size_divisor}, " + repr_str += f"pad_val={self.pad_val})" + return repr_str + + +@PIPELINES.register_module() +class NormalizeMultiviewImage(object): + """Normalize the image. + Added key is "img_norm_cfg". + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + results["img"] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results["img"]] + results["img_norm_cfg"] = dict(mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f"(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})" + return repr_str + + +@PIPELINES.register_module() +class ResizeCropFlipImage(object): + """Random resize, Crop and flip the image + Args: + size (tuple, optional): Fixed padding size. + """ + + def __init__(self, data_aug_conf=None, training=True): + self.data_aug_conf = data_aug_conf + self.training = training + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Updated result dict. 
+ """ + + imgs = results["img"] + N = len(imgs) + new_imgs = [] + resize, resize_dims, crop, flip, rotate = self._sample_augmentation() + for i in range(N): + img = Image.fromarray(np.uint8(imgs[i])) + # augmentation (resize, crop, horizontal flip, rotate) + # resize, resize_dims, crop, flip, rotate = self._sample_augmentation() ###different view use different aug (BEV Det) + img, ida_mat = self._img_transform( + img, + resize=resize, + resize_dims=resize_dims, + crop=crop, + flip=flip, + rotate=rotate, + ) + new_imgs.append(np.array(img).astype(np.float32)) + results["intrinsics"][i][:3, :3] = ida_mat @ results["intrinsics"][i][:3, :3] + + results["img"] = new_imgs + results["lidar2img"] = [ + results["intrinsics"][i] @ results["extrinsics"][i].T for i in range(len(results["extrinsics"])) + ] + + return results + + def _get_rot(self, h): + + return torch.Tensor( + [ + [np.cos(h), np.sin(h)], + [-np.sin(h), np.cos(h)], + ] + ) + + def _img_transform(self, img, resize, resize_dims, crop, flip, rotate): + ida_rot = torch.eye(2) + ida_tran = torch.zeros(2) + # adjust image + img = img.resize(resize_dims) + img = img.crop(crop) + if flip: + img = img.transpose(method=Image.FLIP_LEFT_RIGHT) + img = img.rotate(rotate) + + # post-homography transformation + ida_rot *= resize + ida_tran -= torch.Tensor(crop[:2]) + if flip: + A = torch.Tensor([[-1, 0], [0, 1]]) + b = torch.Tensor([crop[2] - crop[0], 0]) + ida_rot = A.matmul(ida_rot) + ida_tran = A.matmul(ida_tran) + b + A = self._get_rot(rotate / 180 * np.pi) + b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 + b = A.matmul(-b) + b + ida_rot = A.matmul(ida_rot) + ida_tran = A.matmul(ida_tran) + b + ida_mat = torch.eye(3) + ida_mat[:2, :2] = ida_rot + ida_mat[:2, 2] = ida_tran + return img, ida_mat + + def _sample_augmentation(self): + H, W = self.data_aug_conf["H"], self.data_aug_conf["W"] + fH, fW = self.data_aug_conf["final_dim"] + if self.training: + resize = np.random.uniform(*self.data_aug_conf["resize_lim"]) + resize_dims = (int(W * resize), int(H * resize)) + newW, newH = resize_dims + crop_h = int((1 - np.random.uniform(*self.data_aug_conf["bot_pct_lim"])) * newH) - fH + crop_w = int(np.random.uniform(0, max(0, newW - fW))) + crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) + flip = False + if self.data_aug_conf["rand_flip"] and np.random.choice([0, 1]): + flip = True + rotate = np.random.uniform(*self.data_aug_conf["rot_lim"]) + else: + resize = max(fH / H, fW / W) + resize_dims = (int(W * resize), int(H * resize)) + newW, newH = resize_dims + crop_h = int((1 - np.mean(self.data_aug_conf["bot_pct_lim"])) * newH) - fH + crop_w = int(max(0, newW - fW) / 2) + crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) + flip = False + rotate = 0 + return resize, resize_dims, crop, flip, rotate diff --git a/forge/test/models/pytorch/vision/petr/utils/utils.py b/forge/test/models/pytorch/vision/petr/utils/utils.py new file mode 100644 index 000000000..ab04e9365 --- /dev/null +++ b/forge/test/models/pytorch/vision/petr/utils/utils.py @@ -0,0 +1,201 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from mmcv import Config +from mmdet3d.datasets import build_dataloader, build_dataset + +from test.models.pytorch.vision.petr.mmdet3d.core.bbox.transforms import bbox3d2result + + +def load_config(variant): + cfg = Config.fromfile(f"forge/test/models/pytorch/vision/petr/utils/petr_{variant}.py") + cfg.data.test.test_mode = True + cfg.data.test.ann_file = 
"forge/test/models/pytorch/vision/petr/data/nuscenes/nuscenes_infos_val.pkl" + return cfg + + +def prepare_model_inputs(cfg): + + dataset = build_dataset(cfg.data.test) + data_loader = build_dataloader(dataset, samples_per_gpu=1, workers_per_gpu=0, dist=False, shuffle=False) + dataset = data_loader.dataset + + for i, data in enumerate(data_loader): + + img_metas = data["img_metas"][0].data[0] + filename = img_metas[0]["filename"] + ori_shape = img_metas[0]["ori_shape"] + img_shape = img_metas[0]["img_shape"] + pad_shape = img_metas[0]["pad_shape"] + scale_factor = img_metas[0]["scale_factor"] + flip = img_metas[0]["flip"] + pcd_horizontal_flip = img_metas[0]["pcd_horizontal_flip"] + pcd_vertical_flip = img_metas[0]["pcd_vertical_flip"] + box_mode_3d = img_metas[0]["box_mode_3d"] + box_type_3d = img_metas[0]["box_type_3d"] + mean = torch.from_numpy(img_metas[0]["img_norm_cfg"]["mean"]) + std = torch.from_numpy(img_metas[0]["img_norm_cfg"]["std"]) + to_rgb = img_metas[0]["img_norm_cfg"]["to_rgb"] + sample_idx = img_metas[0]["sample_idx"] + pcd_scale_factor = img_metas[0]["pcd_scale_factor"] + pts_filename = img_metas[0]["pts_filename"] + img = data["img"][0].data[0] + lidar2img_list = img_metas[0]["lidar2img"] + + lidar2img_tensors_list = [] + + for idx, lidar2img_array in enumerate(lidar2img_list): + lidar2img_tensor = torch.from_numpy(lidar2img_array) + lidar2img_tensors_list.append(lidar2img_tensor) + + batch_size = 1 + num_cams = 6 + input_img_h, input_img_w, _ = pad_shape[0] + x = torch.rand(batch_size, num_cams, input_img_h, input_img_w) + masks = x.new_ones((batch_size, num_cams, input_img_h, input_img_w)) + + inputs = [ + lidar2img_tensors_list[0].unsqueeze(0), + lidar2img_tensors_list[1].unsqueeze(0), + lidar2img_tensors_list[2].unsqueeze(0), + lidar2img_tensors_list[3].unsqueeze(0), + lidar2img_tensors_list[4].unsqueeze(0), + lidar2img_tensors_list[5].unsqueeze(0), + img.unsqueeze(0), + mean.unsqueeze(0), + std.unsqueeze(0), + masks.unsqueeze(0), + ] + + for i, tensor in enumerate(inputs): + if tensor.dtype == torch.float64: + inputs[i] = tensor.to(torch.float32) + + return ( + filename, + ori_shape, + img_shape, + pad_shape, + scale_factor, + flip, + pcd_horizontal_flip, + pcd_vertical_flip, + box_mode_3d, + box_type_3d, + to_rgb, + sample_idx, + pcd_scale_factor, + pts_filename, + inputs, + ) + + +def denormalize_bbox(normalized_bboxes, pc_range): + # rotation + rot_sine = normalized_bboxes[..., 6:7] + + rot_cosine = normalized_bboxes[..., 7:8] + rot = torch.atan2(rot_sine, rot_cosine) + + # center in the bev + cx = normalized_bboxes[..., 0:1] + cy = normalized_bboxes[..., 1:2] + cz = normalized_bboxes[..., 4:5] + + # size + w = normalized_bboxes[..., 2:3] + l = normalized_bboxes[..., 3:4] + h = normalized_bboxes[..., 5:6] + + w = w.exp() + l = l.exp() + h = h.exp() + if normalized_bboxes.size(-1) > 8: + # velocity + vx = normalized_bboxes[:, 8:9] + vy = normalized_bboxes[:, 9:10] + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) + else: + denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) + return denormalized_bboxes + + +def decode_single(cls_scores, bbox_preds): + + # post processing + pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] + voxel_size = [0.2, 0.2, 8] + post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0] + max_num = 300 + score_threshold = None + num_classes = 10 + + cls_scores = cls_scores.sigmoid() + scores, indexs = cls_scores.view(-1).topk(max_num) + labels = indexs % num_classes + bbox_index = indexs // 
+def denormalize_bbox(normalized_bboxes, pc_range):
+    # rotation
+    rot_sine = normalized_bboxes[..., 6:7]
+
+    rot_cosine = normalized_bboxes[..., 7:8]
+    rot = torch.atan2(rot_sine, rot_cosine)
+
+    # center in the bev
+    cx = normalized_bboxes[..., 0:1]
+    cy = normalized_bboxes[..., 1:2]
+    cz = normalized_bboxes[..., 4:5]
+
+    # size
+    w = normalized_bboxes[..., 2:3]
+    l = normalized_bboxes[..., 3:4]
+    h = normalized_bboxes[..., 5:6]
+
+    w = w.exp()
+    l = l.exp()
+    h = h.exp()
+    if normalized_bboxes.size(-1) > 8:
+        # velocity
+        vx = normalized_bboxes[:, 8:9]
+        vy = normalized_bboxes[:, 9:10]
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1)
+    else:
+        denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1)
+    return denormalized_bboxes
+
+
+def decode_single(cls_scores, bbox_preds):
+
+    # post processing
+    pc_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
+    voxel_size = [0.2, 0.2, 8]
+    post_center_range = [-61.2, -61.2, -10.0, 61.2, 61.2, 10.0]
+    max_num = 300
+    score_threshold = None
+    num_classes = 10
+
+    cls_scores = cls_scores.sigmoid()
+    scores, indexs = cls_scores.view(-1).topk(max_num)
+    labels = indexs % num_classes
+    bbox_index = indexs // num_classes
+
+    bbox_preds = bbox_preds[bbox_index]
+
+    final_box_preds = denormalize_bbox(bbox_preds, pc_range)
+    final_scores = scores
+    final_preds = labels
+
+    if score_threshold is not None:
+        thresh_mask = final_scores > score_threshold
+    if post_center_range is not None:
+        post_center_range = torch.tensor(post_center_range, device=scores.device)
+
+        mask = (final_box_preds[..., :3] >= post_center_range[:3]).all(1)
+        mask &= (final_box_preds[..., :3] <= post_center_range[3:]).all(1)
+
+        if score_threshold:
+            mask &= thresh_mask
+
+        boxes3d = final_box_preds[mask]
+        scores = final_scores[mask]
+        labels = final_preds[mask]
+        predictions_dict = {"bboxes": boxes3d, "scores": scores, "labels": labels}
+
+    return predictions_dict
+
+
+def post_process(img_metas, all_class_scores, all_bbox_predictions):
+
+    all_cls_scores = all_class_scores[-1]
+    all_bbox_preds = all_bbox_predictions[-1]
+
+    batch_size = all_cls_scores.size()[0]
+    predictions_list = []
+    for i in range(batch_size):
+        predictions_list.append(decode_single(all_cls_scores[i], all_bbox_preds[i]))
+
+    preds_dicts = predictions_list
+    num_samples = len(preds_dicts)
+    ret_list = []
+    for i in range(num_samples):
+        preds = preds_dicts[i]
+        bboxes = preds["bboxes"]
+        bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
+        bboxes = img_metas[i]["box_type_3d"](bboxes, bboxes.size(-1))
+        scores = preds["scores"]
+        labels = preds["labels"]
+        ret_list.append([bboxes, scores, labels])
+
+    bbox_list = ret_list
+    bbox_results = [bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list]
+
+    bbox_list = [dict() for i in range(1)]
+    bbox_pts = bbox_results
+    for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
+        result_dict["pts_bbox"] = pts_bbox
+
+    boxes_3d_tensor = bbox_list[0]["pts_bbox"]["boxes_3d"].tensor
+    scores_3d_tensor = bbox_list[0]["pts_bbox"]["scores_3d"]
+    labels_3d_tensor = bbox_list[0]["pts_bbox"]["labels_3d"]
+
+    output = (boxes_3d_tensor, scores_3d_tensor, labels_3d_tensor)
+
+    return output
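decode_single above hard-codes the 10-class nuScenes setup and a top-k of 300, and it only depends on torch, so it can be sanity-checked on random tensors. A minimal sketch (the 900-query shape is an assumption taken from the usual PETR head configuration, not something this file asserts):

    import torch

    # with decode_single and denormalize_bbox from this file in scope
    cls_scores = torch.randn(900, 10)  # (num_query, num_classes)
    bbox_preds = torch.randn(900, 10)  # (num_query, [cx, cy, log w, log l, cz, log h, sin, cos, vx, vy])
    out = decode_single(cls_scores, bbox_preds)
    print(out["bboxes"].shape, out["scores"].shape, out["labels"].shape)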
diff --git a/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py b/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py
new file mode 100644
index 000000000..08f9f41ae
--- /dev/null
+++ b/forge/test/models/pytorch/vision/petr/utils/vovnetcp.py
@@ -0,0 +1,394 @@
+# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+
+# SPDX-License-Identifier: Apache-2.0
+
+import warnings
+
+# ------------------------------------------------------------------------
+# Copyright (c) 2022 megvii-model. All Rights Reserved.
+# ------------------------------------------------------------------------
+# Modified from DETR3D (https://github.com/WangYueFt/detr3d)
+# Copyright (c) 2021 Wang, Yue
+# ------------------------------------------------------------------------
+# Copyright (c) Youngwan Lee (ETRI) All Rights Reserved.
+# Copyright 2021 Toyota Research Institute. All rights reserved.
+# ------------------------------------------------------------------------
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from mmcv.runner import BaseModule
+from mmdet.models.builder import BACKBONES
+from torch.nn.modules.batchnorm import _BatchNorm
+
+VoVNet19_slim_dw_eSE = {
+    "stem": [64, 64, 64],
+    "stage_conv_ch": [64, 80, 96, 112],
+    "stage_out_ch": [112, 256, 384, 512],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True,
+}
+
+VoVNet19_dw_eSE = {
+    "stem": [64, 64, 64],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": True,
+}
+
+VoVNet19_slim_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [64, 80, 96, 112],
+    "stage_out_ch": [112, 256, 384, 512],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet19_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 3,
+    "block_per_stage": [1, 1, 1, 1],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet39_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 2, 2],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet57_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 1, 4, 3],
+    "eSE": True,
+    "dw": False,
+}
+
+VoVNet99_eSE = {
+    "stem": [64, 64, 128],
+    "stage_conv_ch": [128, 160, 192, 224],
+    "stage_out_ch": [256, 512, 768, 1024],
+    "layer_per_block": 5,
+    "block_per_stage": [1, 3, 9, 3],
+    "eSE": True,
+    "dw": False,
+}
+
+_STAGE_SPECS = {
+    "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE,
+    "V-19-dw-eSE": VoVNet19_dw_eSE,
+    "V-19-slim-eSE": VoVNet19_slim_eSE,
+    "V-19-eSE": VoVNet19_eSE,
+    "V-39-eSE": VoVNet39_eSE,
+    "V-57-eSE": VoVNet57_eSE,
+    "V-99-eSE": VoVNet99_eSE,
+}
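+
+
+# Channel bookkeeping, using "V-99-eSE" as an example (see _OSA_module below): stage 2
+# receives the 128-channel stem output, stacks layer_per_block = 5 convs with
+# stage_conv_ch[0] = 128 channels each, concatenates the input with every intermediate
+# output (128 + 5 * 128 = 768 channels), and projects back to stage_out_ch[0] = 256
+# with a 1x1 convolution.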
+
+
+def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            "{}_{}/dw_conv3x3".format(module_name, postfix),
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=out_channels,
+                bias=False,
+            ),
+        ),
+        (
+            "{}_{}/pw_conv1x1".format(module_name, postfix),
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False),
+        ),
+        ("{}_{}/pw_norm".format(module_name, postfix), nn.BatchNorm2d(out_channels)),
+        ("{}_{}/pw_relu".format(module_name, postfix), nn.ReLU(inplace=True)),
+    ]
+
+
+def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1):
+    """3x3 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0):
+    """1x1 convolution with padding"""
+    return [
+        (
+            f"{module_name}_{postfix}/conv",
+            nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                groups=groups,
+                bias=False,
+            ),
+        ),
+        (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)),
+        (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)),
+    ]
+
+
+class Hsigmoid(nn.Module):
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0
+
+
+class eSEModule(nn.Module):
+    def __init__(self, channel, reduction=4):
+        super(eSEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
+        self.hsigmoid = Hsigmoid()
+
+    def forward(self, x):
+        input = x
+        x = self.avg_pool(x)
+        x = self.fc(x)
+        x = self.hsigmoid(x)
+        return input * x
+
+
+class _OSA_module(nn.Module):
+    def __init__(
+        self,
+        in_ch,
+        stage_ch,
+        concat_ch,
+        layer_per_block,
+        module_name,
+        SE=False,
+        identity=False,
+        depthwise=False,
+        with_cp=True,
+    ):
+
+        super(_OSA_module, self).__init__()
+
+        self.identity = identity
+        self.depthwise = depthwise
+        self.isReduced = False
+        self.use_checkpoint = with_cp
+        self.layers = nn.ModuleList()
+        in_channel = in_ch
+        if self.depthwise and in_channel != stage_ch:
+            self.isReduced = True
+            self.conv_reduction = nn.Sequential(
+                OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0"))
+            )
+        for i in range(layer_per_block):
+            if self.depthwise:
+                self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i))))
+            else:
+                self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i))))
+            in_channel = stage_ch
+
+        # feature aggregation
+        in_channel = in_ch + layer_per_block * stage_ch
+        self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat")))
+
+        self.ese = eSEModule(concat_ch)
+
+    def _forward(self, x):
+
+        identity_feat = x
+
+        output = []
+        output.append(x)
+        if self.depthwise and self.isReduced:
+            x = self.conv_reduction(x)
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+
+        x = torch.cat(output, dim=1)
+        xt = self.concat(x)
+
+        xt = self.ese(xt)
+
+        if self.identity:
+            xt = xt + identity_feat
+
+        return xt
+
+    def forward(self, x):
+
+        if self.use_checkpoint and self.training:
+            xt = cp.checkpoint(self._forward, x)
+        else:
+            xt = self._forward(x)
+
+        return xt
+
+
+class _OSA_stage(nn.Sequential):
+    def __init__(
+        self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False
+    ):
+
+        super(_OSA_stage, self).__init__()
+
+        if not stage_num == 2:
+            self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))
+
+        if block_per_stage != 1:
+            SE = False
+        module_name = f"OSA{stage_num}_1"
+        self.add_module(
+            module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise)
+        )
+        for i in range(block_per_stage - 1):
+            if i != block_per_stage - 2:  # last block
+                SE = False
+            module_name = f"OSA{stage_num}_{i + 2}"
+            self.add_module(
+                module_name,
+                _OSA_module(
+                    concat_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, identity=True, depthwise=depthwise
+                ),
+            )
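+
+
+# Note on gradient checkpointing: with with_cp=True (the default), _OSA_module.forward wraps
+# _forward in torch.utils.checkpoint during training, trading extra recomputation in the
+# backward pass for lower activation memory; at inference (self.training is False) the plain
+# _forward path is used.
+
+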
+@BACKBONES.register_module()
+class VoVNetCP(BaseModule):
+    def __init__(
+        self, spec_name, input_ch=3, out_features=None, frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None
+    ):
+        """
+        Args:
+            input_ch (int): the number of input channels
+            out_features (list[str]): names of the layers whose outputs should
+                be returned in forward. Can be anything in "stem", "stage2" ...
+        """
+        super(VoVNetCP, self).__init__(init_cfg)
+        self.frozen_stages = frozen_stages
+        self.norm_eval = norm_eval
+
+        if isinstance(pretrained, str):
+            warnings.warn('DeprecationWarning: pretrained is deprecated, please use "init_cfg" instead')
+            self.init_cfg = dict(type="Pretrained", checkpoint=pretrained)
+        stage_specs = _STAGE_SPECS[spec_name]
+
+        stem_ch = stage_specs["stem"]
+        config_stage_ch = stage_specs["stage_conv_ch"]
+        config_concat_ch = stage_specs["stage_out_ch"]
+        block_per_stage = stage_specs["block_per_stage"]
+        layer_per_block = stage_specs["layer_per_block"]
+        SE = stage_specs["eSE"]
+        depthwise = stage_specs["dw"]
+
+        self._out_features = out_features
+
+        # Stem module
+        conv_type = dw_conv3x3 if depthwise else conv3x3
+        stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2)
+        stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1)
+        stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2)
+        self.add_module("stem", nn.Sequential(OrderedDict(stem)))
+        current_stride = 4
+        self._out_feature_strides = {"stem": current_stride, "stage2": current_stride}
+        self._out_feature_channels = {"stem": stem_ch[2]}
+
+        stem_out_ch = [stem_ch[2]]
+        in_ch_list = stem_out_ch + config_concat_ch[:-1]
+        # OSA stages
+        self.stage_names = []
+        for i in range(4):  # num_stages
+            name = "stage%d" % (i + 2)  # stage 2 ... stage 5
+            self.stage_names.append(name)
+            self.add_module(
+                name,
+                _OSA_stage(
+                    in_ch_list[i],
+                    config_stage_ch[i],
+                    config_concat_ch[i],
+                    block_per_stage[i],
+                    layer_per_block,
+                    i + 2,
+                    SE,
+                    depthwise,
+                ),
+            )
+
+            self._out_feature_channels[name] = config_concat_ch[i]
+            if not i == 0:
+                self._out_feature_strides[name] = current_stride = int(current_stride * 2)
+
+    def forward(self, x):
+        outputs = []
+        x = self.stem(x)
+        if "stem" in self._out_features:
+            outputs.append(x)
+        for name in self.stage_names:
+            x = getattr(self, name)(x)
+            if name in self._out_features:
+                outputs.append(x)
+
+        return outputs
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            m = getattr(self, "stem")
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+        for i in range(1, self.frozen_stages + 1):
+            m = getattr(self, f"stage{i+1}")
+            m.eval()
+            for param in m.parameters():
+                param.requires_grad = False
+
+    def train(self, mode=True):
+        """Convert the model into training mode while keeping the normalization
+        layers frozen."""
+        super(VoVNetCP, self).train(mode)
+        self._freeze_stages()
+        if mode and self.norm_eval:
+            for m in self.modules():
+                # trick: eval has an effect on BatchNorm only
+                if isinstance(m, _BatchNorm):
+                    m.eval()
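The backbone above only needs the mmcv/mmdet packages already pulled in by this patch, so it can be smoke-tested in isolation. A minimal sketch (illustrative only, not part of the patch): the "V-99-eSE" spec and the stage4/stage5 outputs follow the PETR VoVNet configs, while the 320x800 input resolution is just an example value.

    import torch

    # with VoVNetCP from this file in scope (mmcv / mmdet must be importable)
    backbone = VoVNetCP("V-99-eSE", out_features=["stage4", "stage5"])
    backbone.eval()
    with torch.no_grad():
        feats = backbone(torch.randn(1, 3, 320, 800))
    print([tuple(f.shape) for f in feats])  # stage4 at stride 16, stage5 at stride 32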