From d1073224b0db857bc92e2bdadbfe21d45b6f1dbc Mon Sep 17 00:00:00 2001 From: kkannan Date: Wed, 5 Feb 2025 04:18:59 +0000 Subject: [PATCH] Add support for complex_yolov4 model --- env/core_requirements.txt | 1 + .../pytorch/vision/complex_yolov4/__init__.py | 0 .../complex_yolov4/test_complex_yolov4.py | 47 + .../vision/complex_yolov4/utils/__init__.py | 0 .../utils/cal_intersection_rotated_boxes.py | 175 +++ .../complex_yolov4/utils/complex_yolov4.cfg | 1164 +++++++++++++++++ .../utils/complex_yolov4_tiny.cfg | 285 ++++ .../complex_yolov4/utils/darknet2pytorch.py | 416 ++++++ .../complex_yolov4/utils/darknet_utils.py | 310 +++++ .../utils/iou_rotated_boxes_utils.py | 221 ++++ .../complex_yolov4/utils/model_utils.py | 16 + .../complex_yolov4/utils/torch_utils.py | 32 + .../vision/complex_yolov4/utils/yolo_layer.py | 301 +++++ forge/test/models/utils.py | 1 + 14 files changed, 2969 insertions(+) create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/__init__.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/test_complex_yolov4.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/__init__.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/cal_intersection_rotated_boxes.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4.cfg create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4_tiny.cfg create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/darknet2pytorch.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/darknet_utils.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/iou_rotated_boxes_utils.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/model_utils.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/torch_utils.py create mode 100644 forge/test/models/pytorch/vision/complex_yolov4/utils/yolo_layer.py diff --git a/env/core_requirements.txt b/env/core_requirements.txt index 3144d29f4..ab8b26f52 100644 --- a/env/core_requirements.txt +++ b/env/core_requirements.txt @@ -51,3 +51,4 @@ pytorch_forecasting==1.0.0 patool openpyxl==3.1.5 GitPython==3.1.44 +easydict==1.13 diff --git a/forge/test/models/pytorch/vision/complex_yolov4/__init__.py b/forge/test/models/pytorch/vision/complex_yolov4/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/complex_yolov4/test_complex_yolov4.py b/forge/test/models/pytorch/vision/complex_yolov4/test_complex_yolov4.py new file mode 100644 index 000000000..119ac6b3d --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/test_complex_yolov4.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +from easydict import EasyDict as edict + +import forge +from forge.verify.verify import verify + +from test.models.pytorch.vision.complex_yolov4.utils.model_utils import create_model +from test.models.utils import Framework, Source, Task, build_module_name + + +@pytest.mark.parametrize("variant", ["complex_yolov4_tiny", "complex_yolov4"]) +def test_compelx_yolov4(record_forge_property, variant): + + # Build Module Name + module_name = build_module_name( + framework=Framework.PYTORCH, + model="complex_yolov4", + variant=variant, + source=Source.GITHUB, + task=Task.OBJECT_DETECTION_3D, + ) + + # Record Forge Property + record_forge_property("model_name", 
module_name) + + # Load model + configs = edict( + { + "arch": "darknet", + "cfgfile": f"forge/test/models/pytorch/vision/complex_yolov4/utils/{variant}.cfg", + } + ) + model = create_model(configs) + model.eval() + + # prepare sample input + inputs = [torch.randn((1, 3, 608, 608))] + + # Forge compile framework model + compiled_model = forge.compile(model, inputs, module_name=module_name) + + # Model Verification + verify(inputs, model, compiled_model) diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/__init__.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/cal_intersection_rotated_boxes.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/cal_intersection_rotated_boxes.py new file mode 100644 index 000000000..a0f29f1e9 --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/cal_intersection_rotated_boxes.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +""" +# -*- coding: utf-8 -*- +----------------------------------------------------------------------------------- +# Author: Nguyen Mau Dung +# DoC: 2020.07.20 +# email: nguyenmaudung93.kstn@gmail.com +----------------------------------------------------------------------------------- +# Description: This script for intersection calculation of rotated boxes (on GPU) + +Refer from # https://stackoverflow.com/questions/44797713/calculate-the-area-of-intersection-of-two-rotated-rectangles-in-python?noredirect=1&lq=1 +""" + +import torch + + +class Line: + # ax + by + c = 0 + def __init__(self, p1, p2): + """ + + Args: + p1: (x, y) + p2: (x, y) + """ + self.a = p2[1] - p1[1] + self.b = p1[0] - p2[0] + self.c = p2[0] * p1[1] - p2[1] * p1[0] # cross + self.device = p1.device + + def cal_values(self, pts): + return self.a * pts[:, 0] + self.b * pts[:, 1] + self.c + + def find_intersection(self, other): + # See e.g. https://en.wikipedia.org/wiki/Line%E2%80%93line_intersection#Using_homogeneous_coordinates + if not isinstance(other, Line): + return NotImplemented + w = self.a * other.b - self.b * other.a + return torch.tensor( + [(self.b * other.c - self.c * other.b) / w, (self.c * other.a - self.a * other.c) / w], device=self.device + ) + + +def intersection_area(rect1, rect2): + """Calculate the inter + + Args: + rect1: vertices of the rectangles (4, 2) + rect2: vertices of the rectangles (4, 2) + + Returns: + + """ + + # Use the vertices of the first rectangle as, starting vertices of the intersection polygon. + intersection = rect1 + + # Loop over the edges of the second rectangle + roll_rect2 = torch.roll(rect2, -1, dims=0) + for p, q in zip(rect2, roll_rect2): + if len(intersection) <= 2: + break # No intersection + + line = Line(p, q) + + # Any point p with line(p) <= 0 is on the "inside" (or on the boundary), + # any point p with line(p) > 0 is on the "outside". + # Loop over the edges of the intersection polygon, + # and determine which part is inside and which is outside. + new_intersection = [] + line_values = line.cal_values(intersection) + roll_intersection = torch.roll(intersection, -1, dims=0) + roll_line_values = torch.roll(line_values, -1, dims=0) + for s, t, s_value, t_value in zip(intersection, roll_intersection, line_values, roll_line_values): + if s_value <= 0: + new_intersection.append(s) + if s_value * t_value < 0: + # Points are on opposite sides. 
+ # Add the intersection of the lines to new_intersection. + intersection_point = line.find_intersection(Line(s, t)) + new_intersection.append(intersection_point) + + if len(new_intersection) > 0: + intersection = torch.stack(new_intersection) + else: + break + + # Calculate area + if len(intersection) <= 2: + return 0.0 + + return PolyArea2D(intersection) + + +def PolyArea2D(pts): + roll_pts = torch.roll(pts, -1, dims=0) + area = (pts[:, 0] * roll_pts[:, 1] - pts[:, 1] * roll_pts[:, 0]).sum().abs() * 0.5 + return area + + +if __name__ == "__main__": + import cv2 + import numpy as np + from shapely.geometry import Polygon + + def cvt_box_2_polygon(box): + """ + :param array: an array of shape [num_conners, 2] + :return: a shapely.geometry.Polygon object + """ + # use .buffer(0) to fix a line polygon + # more infor: https://stackoverflow.com/questions/13062334/polygon-intersection-error-in-shapely-shapely-geos-topologicalerror-the-opera + return Polygon([(box[i, 0], box[i, 1]) for i in range(len(box))]).buffer(0) + + def get_corners_torch(x, y, w, l, yaw): + device = x.device + bev_corners = torch.zeros((4, 2), dtype=torch.float, device=device) + cos_yaw = torch.cos(yaw) + sin_yaw = torch.sin(yaw) + # front left + bev_corners[0, 0] = x - w / 2 * cos_yaw - l / 2 * sin_yaw + bev_corners[0, 1] = y - w / 2 * sin_yaw + l / 2 * cos_yaw + + # rear left + bev_corners[1, 0] = x - w / 2 * cos_yaw + l / 2 * sin_yaw + bev_corners[1, 1] = y - w / 2 * sin_yaw - l / 2 * cos_yaw + + # rear right + bev_corners[2, 0] = x + w / 2 * cos_yaw + l / 2 * sin_yaw + bev_corners[2, 1] = y + w / 2 * sin_yaw - l / 2 * cos_yaw + + # front right + bev_corners[3, 0] = x + w / 2 * cos_yaw - l / 2 * sin_yaw + bev_corners[3, 1] = y + w / 2 * sin_yaw + l / 2 * cos_yaw + + return bev_corners + + # Show convex in an image + + img_size = 300 + img = np.zeros((img_size, img_size, 3)) + img = cv2.resize(img, (img_size, img_size)) + + box1 = torch.tensor([100, 100, 40, 10, np.pi / 2], dtype=torch.float).cuda() + box2 = torch.tensor([100, 100, 40, 20, 0], dtype=torch.float).cuda() + + box1_conners = get_corners_torch(box1[0], box1[1], box1[2], box1[3], box1[4]) + box1_polygon = cvt_box_2_polygon(box1_conners) + box1_area = box1_polygon.area + + box2_conners = get_corners_torch(box2[0], box2[1], box2[2], box2[3], box2[4]) + box2_polygon = cvt_box_2_polygon(box2_conners) + box2_area = box2_polygon.area + + intersection = box2_polygon.intersection(box1_polygon).area + union = box1_area + box2_area - intersection + iou = intersection / (union + 1e-16) + + print( + "Shapely- box1_area: {:.2f}, box2_area: {:.2f}, inter: {:.2f}, iou: {:.4f}".format( + box1_area, box2_area, intersection, iou + ) + ) + + print("intersection from intersection_area(): {}".format(intersection_area(box1_conners, box2_conners))) + + img = cv2.polylines(img, [box1_conners.cpu().numpy().astype(np.int)], True, (255, 0, 0), 2) + img = cv2.polylines(img, [box2_conners.cpu().numpy().astype(np.int)], True, (0, 255, 0), 2) + + while True: + cv2.imshow("img", img) + if cv2.waitKey(0) & 0xFF == 27: + break diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4.cfg b/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4.cfg new file mode 100644 index 000000000..1debcfcf7 --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4.cfg @@ -0,0 +1,1164 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +[net] +batch=64 +subdivisions=8 +# Training 
+#width=512 +#height=512 +width=608 +height=608 +channels=3 +momentum=0.949 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0013 +burn_in=1000 +max_batches = 500500 +policy=steps +steps=400000,450000 +scales=.1,.1 + +#cutmix=1 +mosaic=1 + +#:104x104 54:52x52 85:26x26 104:13x13 for 416 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=32 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-7 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-10 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 
+activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-28 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +# Downsample + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=2 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -2 + 
+[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=mish + +[shortcut] +from=-3 +activation=linear + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=mish + +[route] +layers = -1,-16 + +[convolutional] +batch_normalize=1 +filters=1024 +size=1 +stride=1 +pad=1 +activation=mish + +########################## + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +### SPP ### +[maxpool] +stride=1 +size=5 + +[route] +layers=-2 + +[maxpool] +stride=1 +size=9 + +[route] +layers=-4 + +[maxpool] +stride=1 +size=13 + +[route] +layers=-1,-3,-5,-6 +### End SPP ### + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 85 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = 54 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1, -3 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +########################## + +[convolutional] +batch_normalize=1 
+size=3 +stride=1 +pad=1 +filters=256 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=30 +activation=linear + + +[yolo] +mask = 0,1,2 +#anchors = 11,14,-3.14, 11,14,0, 11,14,3.14, 11,25,-3.14, 11,25,0, 11,25,3.14, 23,51,-3.14, 23,51,0, 23,51,3.14 +anchors = 11, 15, 0, 10, 24, 0, 11, 25, 0, 23, 49, 0, 23, 55, 0, 24, 53, 0, 24, 60, 0, 27, 63, 0, 29, 74, 0 +classes=3 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.2 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=256 +activation=leaky + +[route] +layers = -1, -16 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=512 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=30 +activation=linear + + +[yolo] +mask = 3,4,5 +#anchors = 11,14,-3.14, 11,14,0, 11,14,3.14, 11,25,-3.14, 11,25,0, 11,25,3.14, 23,51,-3.14, 23,51,0, 23,51,3.14 +anchors = 11, 15, 0, 10, 24, 0, 11, 25, 0, 23, 49, 0, 23, 55, 0, 24, 53, 0, 24, 60, 0, 27, 63, 0, 29, 74, 0 +classes=3 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +scale_x_y = 1.1 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 + + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +size=3 +stride=2 +pad=1 +filters=512 +activation=leaky + +[route] +layers = -1, -37 + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=30 +activation=linear + + +[yolo] +mask = 6,7,8 +#anchors = 11,14,-3.14, 11,14,0, 11,14,3.14, 11,25,-3.14, 11,25,0, 11,25,3.14, 23,51,-3.14, 23,51,0, 23,51,3.14 +anchors = 11, 15, 0, 10, 24, 0, 11, 25, 0, 23, 49, 0, 23, 55, 0, 24, 53, 0, 24, 60, 0, 27, 63, 0, 29, 74, 0 +classes=3 +num=9 +jitter=.3 +ignore_thresh = .7 +truth_thresh = 1 +random=1 +scale_x_y = 1.05 +iou_thresh=0.213 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +nms_kind=greedynms +beta_nms=0.6 +max_delta=5 diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4_tiny.cfg b/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4_tiny.cfg new file mode 100644 index 000000000..f29f86e41 --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/complex_yolov4_tiny.cfg @@ -0,0 +1,285 @@ +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +[net] +# Testing +#batch=1 
+#subdivisions=1 +# Training +batch=64 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.00261 +burn_in=1000 +max_batches = 500200 +policy=steps +steps=400000,450000 +scales=.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=2 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers=-1 +groups=2 +group_id=1 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -1,-2 + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[route] +layers = -6,-1 + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +################################## + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=30 +activation=linear + + + +[yolo] +mask = 3,4,5 +anchors = 11, 15, 0, 11, 25, 0, 23, 49, 0, 23, 55, 0, 24, 53, 0, 25, 61, 0 +classes=3 +num=6 +jitter=.3 +scale_x_y = 1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +ignore_thresh = .7 +truth_thresh = 1 +random=0 +resize=1.5 +nms_kind=greedynms +beta_nms=0.6 + +[route] +layers = -4 + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[upsample] +stride=2 + +[route] +layers = -1, 23 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=30 +activation=linear + +[yolo] +mask = 0,1,2 +anchors = 11, 15, 0, 11, 25, 0, 23, 49, 0, 23, 55, 0, 24, 53, 0, 25, 61, 0 +classes=3 +num=6 +jitter=.3 +scale_x_y = 1.05 +cls_normalizer=1.0 +iou_normalizer=0.07 +iou_loss=ciou +ignore_thresh = .7 +truth_thresh = 1 +random=0 +resize=1.5 +nms_kind=greedynms +beta_nms=0.6 diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet2pytorch.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet2pytorch.py new file mode 100644 index 000000000..0918e6f3e --- /dev/null +++ 
b/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet2pytorch.py @@ -0,0 +1,416 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +""" +# -*- coding: utf-8 -*- +----------------------------------------------------------------------------------- +# Refer: https://github.com/Tianxiaomo/pytorch-YOLOv4 +""" + +import math +import sys + +import torch +import torch.nn as nn +import torch.nn.functional as F + +sys.path.append("../") + +from test.models.pytorch.vision.complex_yolov4.utils.darknet_utils import ( + parse_cfg, + print_cfg, +) +from test.models.pytorch.vision.complex_yolov4.utils.torch_utils import to_cpu +from test.models.pytorch.vision.complex_yolov4.utils.yolo_layer import YoloLayer + + +class Mish(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x * (torch.tanh(F.softplus(x))) + return x + + +class MaxPoolDark(nn.Module): + def __init__(self, size=2, stride=1): + super(MaxPoolDark, self).__init__() + self.size = size + self.stride = stride + + def forward(self, x): + """ + darknet output_size = (input_size + p - k) / s +1 + p : padding = k - 1 + k : size + s : stride + torch output_size = (input_size + 2*p -k) / s +1 + p : padding = k//2 + """ + p = self.size // 2 + if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): + padding1 = (self.size - 1) // 2 + padding2 = padding1 + 1 + else: + padding1 = (self.size - 1) // 2 + padding2 = padding1 + if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): + padding3 = (self.size - 1) // 2 + padding4 = padding3 + 1 + else: + padding3 = (self.size - 1) // 2 + padding4 = padding3 + x = F.max_pool2d( + F.pad(x, (padding3, padding4, padding1, padding2), mode="replicate"), self.size, stride=self.stride + ) + return x + + +class Upsample_expand(nn.Module): + def __init__(self, stride=2): + super(Upsample_expand, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert x.data.dim() == 4 + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + ws = stride + hs = stride + x = x.view(B, C, H, 1, W, 1).expand(B, C, H, stride, W, stride).contiguous().view(B, C, H * stride, W * stride) + return x + + +class Upsample_interpolate(nn.Module): + def __init__(self, stride): + super(Upsample_interpolate, self).__init__() + self.stride = stride + + def forward(self, x): + x_numpy = x.cpu().detach().numpy() + H = x_numpy.shape[2] + W = x_numpy.shape[3] + + H = H * self.stride + W = W * self.stride + + out = F.interpolate(x, size=(H, W), mode="nearest") + return out + + +class Reorg(nn.Module): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert x.data.dim() == 4 + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + assert H % stride == 0 + assert W % stride == 0 + ws = stride + hs = stride + x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() + x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() + x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() + x = x.view(B, hs * ws * C, H / hs, W / ws) + return x + + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + super(GlobalAvgPool2d, self).__init__() + + def forward(self, x): + N = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + x = F.avg_pool2d(x, (H, W)) + x 
= x.view(N, C) + return x + + +# for route and shortcut +class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + +# support route shortcut and reorg +class Darknet(nn.Module): + def __init__(self, cfgfile): + super(Darknet, self).__init__() + self.blocks = parse_cfg(cfgfile) + self.width = int(self.blocks[0]["width"]) + self.height = int(self.blocks[0]["height"]) + + self.models = self.create_network(self.blocks) # merge conv, bn,leaky + self.yolo_layers = [layer for layer in self.models if layer.__class__.__name__ == "YoloLayer"] + + self.loss = self.models[len(self.models) - 1] + + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def forward(self, x, targets=None): + # batch_size, c, h, w + img_size = x.size(2) + ind = -2 + self.loss = None + outputs = dict() + loss = 0.0 + yolo_outputs = [] + for block in self.blocks: + ind = ind + 1 + # if ind > 0: + # return x + + if block["type"] == "net": + continue + elif block["type"] in ["convolutional", "maxpool", "reorg", "upsample", "avgpool", "softmax", "connected"]: + x = self.models[ind](x) + outputs[ind] = x + elif block["type"] == "route": + layers = block["layers"].split(",") + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if "groups" not in block.keys() or int(block["groups"]) == 1: + x = outputs[layers[0]] + outputs[ind] = x + else: + groups = int(block["groups"]) + group_id = int(block["group_id"]) + _, b, _, _ = outputs[layers[0]].shape + x = outputs[layers[0]][:, b // groups * group_id : b // groups * (group_id + 1)] + outputs[ind] = x + elif len(layers) == 2: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x = torch.cat((x1, x2), 1) + outputs[ind] = x + elif len(layers) == 4: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x3 = outputs[layers[2]] + x4 = outputs[layers[3]] + x = torch.cat((x1, x2, x3, x4), 1) + outputs[ind] = x + else: + print("rounte number > 2 ,is {}".format(len(layers))) + + elif block["type"] == "shortcut": + from_layer = int(block["from"]) + activation = block["activation"] + from_layer = from_layer if from_layer > 0 else from_layer + ind + x1 = outputs[from_layer] + x2 = outputs[ind - 1] + x = x1 + x2 + if activation == "leaky": + x = F.leaky_relu(x, 0.1, inplace=True) + elif activation == "relu": + x = F.relu(x, inplace=True) + outputs[ind] = x + elif block["type"] == "yolo": + x, layer_loss = self.models[ind](x, targets, img_size) + loss += layer_loss + yolo_outputs.append(x) + elif block["type"] == "cost": + continue + else: + print("unknown type %s" % (block["type"])) + yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1)) + + return yolo_outputs if targets is None else (loss, yolo_outputs) + + def print_network(self): + print_cfg(self.blocks) + + def create_network(self, blocks): + models = nn.ModuleList() + + prev_filters = 3 + out_filters = [] + prev_stride = 1 + out_strides = [] + conv_id = 0 + for block in blocks: + if block["type"] == "net": + prev_filters = int(block["channels"]) + continue + elif block["type"] == "convolutional": + conv_id = conv_id + 1 + batch_normalize = int(block["batch_normalize"]) + filters = int(block["filters"]) + kernel_size = int(block["size"]) + stride = int(block["stride"]) + is_pad = int(block["pad"]) + pad = (kernel_size - 1) // 2 if is_pad else 0 + activation = block["activation"] + model = nn.Sequential() + if batch_normalize: + model.add_module( + "conv{0}".format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, 
stride, pad, bias=False), + ) + model.add_module("bn{0}".format(conv_id), nn.BatchNorm2d(filters)) + # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) + else: + model.add_module( + "conv{0}".format(conv_id), nn.Conv2d(prev_filters, filters, kernel_size, stride, pad) + ) + if activation == "leaky": + model.add_module("leaky{0}".format(conv_id), nn.LeakyReLU(0.1, inplace=True)) + elif activation == "relu": + model.add_module("relu{0}".format(conv_id), nn.ReLU(inplace=True)) + elif activation == "mish": + model.add_module("mish{0}".format(conv_id), Mish()) + else: + print("[INFO] No error, the convolution haven't activate {}".format(activation)) + + prev_filters = filters + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block["type"] == "maxpool": + pool_size = int(block["size"]) + stride = int(block["stride"]) + if stride == 1 and pool_size % 2: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=3 stride=1 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) + elif stride == pool_size: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=2 stride=2 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0) + else: + model = MaxPoolDark(pool_size, stride) + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block["type"] == "avgpool": + model = GlobalAvgPool2d() + out_filters.append(prev_filters) + models.append(model) + elif block["type"] == "softmax": + model = nn.Softmax() + out_strides.append(prev_stride) + out_filters.append(prev_filters) + models.append(model) + elif block["type"] == "cost": + if block["_type"] == "sse": + model = nn.MSELoss(size_average=True) + elif block["_type"] == "L1": + model = nn.L1Loss(size_average=True) + elif block["_type"] == "smooth": + model = nn.SmoothL1Loss(size_average=True) + out_filters.append(1) + out_strides.append(prev_stride) + models.append(model) + elif block["type"] == "reorg": + stride = int(block["stride"]) + prev_filters = stride * stride * prev_filters + out_filters.append(prev_filters) + prev_stride = prev_stride * stride + out_strides.append(prev_stride) + models.append(Reorg(stride)) + elif block["type"] == "upsample": + stride = int(block["stride"]) + out_filters.append(prev_filters) + prev_stride = prev_stride // stride + out_strides.append(prev_stride) + + models.append(Upsample_expand(stride)) + # models.append(Upsample_interpolate(stride)) + + elif block["type"] == "route": + layers = block["layers"].split(",") + ind = len(models) + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if "groups" not in block.keys() or int(block["groups"]) == 1: + prev_filters = out_filters[layers[0]] + prev_stride = out_strides[layers[0]] + else: + prev_filters = out_filters[layers[0]] // int(block["groups"]) + prev_stride = out_strides[layers[0]] // int(block["groups"]) + elif len(layers) == 2: + assert layers[0] == ind - 1 or layers[1] == ind - 1 + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + prev_stride = out_strides[layers[0]] + elif len(layers) == 4: + assert layers[0] == ind - 1 + prev_filters = ( + out_filters[layers[0]] + + out_filters[layers[1]] + + out_filters[layers[2]] + + out_filters[layers[3]] + ) + prev_stride = out_strides[layers[0]] + else: + print("route 
error!!!") + + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block["type"] == "shortcut": + ind = len(models) + prev_filters = out_filters[ind - 1] + out_filters.append(prev_filters) + prev_stride = out_strides[ind - 1] + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block["type"] == "connected": + filters = int(block["output"]) + if block["activation"] == "linear": + model = nn.Linear(prev_filters, filters) + elif block["activation"] == "leaky": + model = nn.Sequential(nn.Linear(prev_filters, filters), nn.LeakyReLU(0.1, inplace=True)) + elif block["activation"] == "relu": + model = nn.Sequential(nn.Linear(prev_filters, filters), nn.ReLU(inplace=True)) + prev_filters = filters + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(model) + elif block["type"] == "yolo": + anchor_masks = [int(i) for i in block["mask"].split(",")] + anchors = [float(i) for i in block["anchors"].split(",")] + anchors = [ + (anchors[i], anchors[i + 1], math.sin(anchors[i + 2]), math.cos(anchors[i + 2])) + for i in range(0, len(anchors), 3) + ] + anchors = [anchors[i] for i in anchor_masks] + + num_classes = int(block["classes"]) + self.num_classes = num_classes + scale_x_y = float(block["scale_x_y"]) + ignore_thresh = float(block["ignore_thresh"]) + + yolo_layer = YoloLayer( + num_classes=num_classes, + anchors=anchors, + stride=prev_stride, + scale_x_y=scale_x_y, + ignore_thresh=ignore_thresh, + ) + + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(yolo_layer) + else: + print("unknown type %s" % (block["type"])) + + return models diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet_utils.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet_utils.py new file mode 100644 index 000000000..dc37cf280 --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/darknet_utils.py @@ -0,0 +1,310 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +""" +# -*- coding: utf-8 -*- +----------------------------------------------------------------------------------- +# Refer: https://github.com/Tianxiaomo/pytorch-YOLOv4 +""" + +import sys + +import torch + +sys.path.append("../") +from test.models.pytorch.vision.complex_yolov4.utils.torch_utils import convert2cpu + +__all__ = ["parse_cfg", "print_cfg", "load_conv", "load_conv_bn", "save_conv", "save_conv_bn", "load_fc", "save_fc"] + + +def parse_cfg(cfgfile): + blocks = [] + fp = open(cfgfile, "r") + block = None + line = fp.readline() + while line != "": + line = line.rstrip() + if line == "" or line[0] == "#": + line = fp.readline() + continue + elif line[0] == "[": + if block: + blocks.append(block) + block = dict() + block["type"] = line.lstrip("[").rstrip("]") + # set default value + if block["type"] == "convolutional": + block["batch_normalize"] = 0 + else: + key, value = line.split("=") + key = key.strip() + if key == "type": + key = "_type" + value = value.strip() + block[key] = value + line = fp.readline() + + if block: + blocks.append(block) + fp.close() + return blocks + + +def print_cfg(blocks): + print("layer filters size input output") + prev_width = 416 + prev_height = 416 + prev_filters = 3 + out_filters = [] + out_widths = [] + out_heights = [] + ind = -2 + for block in blocks: + ind = ind + 1 + if block["type"] == "net": + prev_width = int(block["width"]) + prev_height = int(block["height"]) + continue + elif 
block["type"] == "convolutional": + filters = int(block["filters"]) + kernel_size = int(block["size"]) + stride = int(block["stride"]) + is_pad = int(block["pad"]) + pad = (kernel_size - 1) // 2 if is_pad else 0 + width = (prev_width + 2 * pad - kernel_size) // stride + 1 + height = (prev_height + 2 * pad - kernel_size) // stride + 1 + print( + "%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d" + % ( + ind, + "conv", + filters, + kernel_size, + kernel_size, + stride, + prev_width, + prev_height, + prev_filters, + width, + height, + filters, + ) + ) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "maxpool": + pool_size = int(block["size"]) + stride = int(block["stride"]) + width = prev_width // stride + height = prev_height // stride + print( + "%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d" + % ( + ind, + "max", + pool_size, + pool_size, + stride, + prev_width, + prev_height, + prev_filters, + width, + height, + filters, + ) + ) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "avgpool": + width = 1 + height = 1 + print( + "%5d %-6s %3d x %3d x%4d -> %3d" + % (ind, "avg", prev_width, prev_height, prev_filters, prev_filters) + ) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "softmax": + print("%5d %-6s -> %3d" % (ind, "softmax", prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "cost": + print("%5d %-6s -> %3d" % (ind, "cost", prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "reorg": + stride = int(block["stride"]) + filters = stride * stride * prev_filters + width = prev_width // stride + height = prev_height // stride + print( + "%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d" + % (ind, "reorg", stride, prev_width, prev_height, prev_filters, width, height, filters) + ) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "upsample": + stride = int(block["stride"]) + filters = prev_filters + width = prev_width * stride + height = prev_height * stride + print( + "%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d" + % (ind, "upsample", stride, prev_width, prev_height, prev_filters, width, height, filters) + ) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "route": + layers = block["layers"].split(",") + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + print("%5d %-6s %d" % (ind, "route", layers[0])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + prev_filters = out_filters[layers[0]] + elif len(layers) == 2: + print("%5d %-6s %d %d" % (ind, "route", layers[0], layers[1])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert prev_width == out_widths[layers[1]] + assert prev_height == 
out_heights[layers[1]] + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + elif len(layers) == 4: + print("%5d %-6s %d %d %d %d" % (ind, "route", layers[0], layers[1], layers[2], layers[3])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]] + assert prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]] + prev_filters = ( + out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[layers[3]] + ) + else: + print( + "route error !!! {} {} {}".format( + sys._getframe().f_code.co_filename, sys._getframe().f_code.co_name, sys._getframe().f_lineno + ) + ) + + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] in ["region", "yolo"]: + print("%5d %-6s" % (ind, "detection")) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "shortcut": + from_id = int(block["from"]) + from_id = from_id if from_id > 0 else from_id + ind + print("%5d %-6s %d" % (ind, "shortcut", from_id)) + prev_width = out_widths[from_id] + prev_height = out_heights[from_id] + prev_filters = out_filters[from_id] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block["type"] == "connected": + filters = int(block["output"]) + print("%5d %-6s %d -> %3d" % (ind, "connected", prev_filters, filters)) + prev_filters = filters + out_widths.append(1) + out_heights.append(1) + out_filters.append(prev_filters) + else: + print("unknown type %s" % (block["type"])) + + +def load_conv(buf, start, conv_model): + num_w = conv_model.weight.numel() + num_b = conv_model.bias.numel() + conv_model.bias.data.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start : start + num_w]).reshape(conv_model.weight.data.shape)) + start = start + num_w + return start + + +def save_conv(fp, conv_model): + if conv_model.bias.is_cuda: + convert2cpu(conv_model.bias.data).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + conv_model.bias.data.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_conv_bn(buf, start, conv_model, bn_model): + num_w = conv_model.weight.numel() + num_b = bn_model.bias.numel() + bn_model.bias.data.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + bn_model.weight.data.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + bn_model.running_mean.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + bn_model.running_var.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start : start + num_w]).reshape(conv_model.weight.data.shape)) + start = start + num_w + return start + + +def save_conv_bn(fp, conv_model, bn_model): + if bn_model.bias.is_cuda: + convert2cpu(bn_model.bias.data).numpy().tofile(fp) + convert2cpu(bn_model.weight.data).numpy().tofile(fp) + convert2cpu(bn_model.running_mean).numpy().tofile(fp) + convert2cpu(bn_model.running_var).numpy().tofile(fp) + convert2cpu(conv_model.weight.data).numpy().tofile(fp) + else: + bn_model.bias.data.numpy().tofile(fp) + bn_model.weight.data.numpy().tofile(fp) + bn_model.running_mean.numpy().tofile(fp) + 
bn_model.running_var.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_fc(buf, start, fc_model): + num_w = fc_model.weight.numel() + num_b = fc_model.bias.numel() + fc_model.bias.data.copy_(torch.from_numpy(buf[start : start + num_b])) + start = start + num_b + fc_model.weight.data.copy_(torch.from_numpy(buf[start : start + num_w])) + start = start + num_w + return start + + +def save_fc(fp, fc_model): + fc_model.bias.data.numpy().tofile(fp) + fc_model.weight.data.numpy().tofile(fp) + + +if __name__ == "__main__": + import sys + + blocks = parse_cfg("cfg/yolo.cfg") + if len(sys.argv) == 2: + blocks = parse_cfg(sys.argv[1]) + print_cfg(blocks) diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/iou_rotated_boxes_utils.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/iou_rotated_boxes_utils.py new file mode 100644 index 000000000..43049380d --- /dev/null +++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/iou_rotated_boxes_utils.py @@ -0,0 +1,221 @@ +# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +""" +# -*- coding: utf-8 -*- +----------------------------------------------------------------------------------- +# Author: Nguyen Mau Dung +# DoC: 2020.07.20 +# email: nguyenmaudung93.kstn@gmail.com +----------------------------------------------------------------------------------- +# Description: This script for iou calculation of rotated boxes (on GPU) + +""" + +from __future__ import division + +import torch +from scipy.spatial import ConvexHull +from shapely.geometry import Polygon + +from test.models.pytorch.vision.complex_yolov4.utils.cal_intersection_rotated_boxes import ( + PolyArea2D, + intersection_area, +) + +# sys.path.append('../') + + +def cvt_box_2_polygon(box): + """ + :param array: an array of shape [num_conners, 2] + :return: a shapely.geometry.Polygon object + """ + # use .buffer(0) to fix a line polygon + # more infor: https://stackoverflow.com/questions/13062334/polygon-intersection-error-in-shapely-shapely-geos-topologicalerror-the-opera + return Polygon([(box[i, 0], box[i, 1]) for i in range(len(box))]).buffer(0) + + +def get_corners_vectorize(x, y, w, l, yaw): + """bev image coordinates format - vectorization + + :param x, y, w, l, yaw: [num_boxes,] + :return: num_boxes x (x,y) of 4 conners + """ + device = x.device + bbox2 = torch.zeros((x.size(0), 4, 2), device=device, dtype=torch.float) + cos_yaw = torch.cos(yaw) + sin_yaw = torch.sin(yaw) + + # front left + bbox2[:, 0, 0] = x - w / 2 * cos_yaw - l / 2 * sin_yaw + bbox2[:, 0, 1] = y - w / 2 * sin_yaw + l / 2 * cos_yaw + + # rear left + bbox2[:, 1, 0] = x - w / 2 * cos_yaw + l / 2 * sin_yaw + bbox2[:, 1, 1] = y - w / 2 * sin_yaw - l / 2 * cos_yaw + + # rear right + bbox2[:, 2, 0] = x + w / 2 * cos_yaw + l / 2 * sin_yaw + bbox2[:, 2, 1] = y + w / 2 * sin_yaw - l / 2 * cos_yaw + + # front right + bbox2[:, 3, 0] = x + w / 2 * cos_yaw - l / 2 * sin_yaw + bbox2[:, 3, 1] = y + w / 2 * sin_yaw + l / 2 * cos_yaw + + return bbox2 + + +def get_polygons_areas_fix_xy(boxes, fix_xy=100.0): + """ + Args: + box: (num_boxes, 4) --> w, l, im, re + """ + device = boxes.device + n_boxes = boxes.size(0) + x = torch.full(size=(n_boxes,), fill_value=fix_xy, device=device, dtype=torch.float) + y = torch.full(size=(n_boxes,), fill_value=fix_xy, device=device, dtype=torch.float) + w, l, im, re = boxes.t() + yaw = torch.atan2(im, re) + boxes_conners = get_corners_vectorize(x, y, w, l, yaw) + boxes_polygons = [cvt_box_2_polygon(box_) for 
box_ in boxes_conners] + boxes_areas = w * l + + return boxes_polygons, boxes_areas + + +def iou_rotated_boxes_targets_vs_anchors(anchors_polygons, anchors_areas, targets_polygons, targets_areas): + device = anchors_areas.device + num_anchors = len(anchors_areas) + num_targets_boxes = len(targets_areas) + + ious = torch.zeros(size=(num_anchors, num_targets_boxes), device=device, dtype=torch.float) + + for a_idx in range(num_anchors): + for tg_idx in range(num_targets_boxes): + intersection = anchors_polygons[a_idx].intersection(targets_polygons[tg_idx]).area + iou = intersection / (anchors_areas[a_idx] + targets_areas[tg_idx] - intersection + 1e-16) + ious[a_idx, tg_idx] = iou + + return ious + + +def iou_pred_vs_target_boxes(pred_boxes, target_boxes, GIoU=False, DIoU=False, CIoU=False): + assert pred_boxes.size() == target_boxes.size(), "Unmatch size of pred_boxes and target_boxes" + device = pred_boxes.device + n_boxes = pred_boxes.size(0) + + t_x, t_y, t_w, t_l, t_im, t_re = target_boxes.t() + t_yaw = torch.atan2(t_im, t_re) + t_conners = get_corners_vectorize(t_x, t_y, t_w, t_l, t_yaw) + t_areas = t_w * t_l + + p_x, p_y, p_w, p_l, p_im, p_re = pred_boxes.t() + p_yaw = torch.atan2(p_im, p_re) + p_conners = get_corners_vectorize(p_x, p_y, p_w, p_l, p_yaw) + p_areas = p_w * p_l + + ious = [] + giou_loss = torch.tensor([0.0], device=device, dtype=torch.float) + # Thinking to apply vectorization this step + for box_idx in range(n_boxes): + p_cons, t_cons = p_conners[box_idx], t_conners[box_idx] + if not GIoU: + p_poly, t_poly = cvt_box_2_polygon(p_cons), cvt_box_2_polygon(t_cons) + intersection = p_poly.intersection(t_poly).area + else: + intersection = intersection_area(p_cons, t_cons) + + p_area, t_area = p_areas[box_idx], t_areas[box_idx] + union = p_area + t_area - intersection + iou = intersection / (union + 1e-16) + + if GIoU: + convex_conners = torch.cat((p_cons, t_cons), dim=0) + hull = ConvexHull(convex_conners.clone().detach().cpu().numpy()) # done on cpu, just need indices output + convex_conners = convex_conners[hull.vertices] + convex_area = PolyArea2D(convex_conners) + giou_loss += 1.0 - (iou - (convex_area - union) / (convex_area + 1e-16)) + else: + giou_loss += 1.0 - iou + + if DIoU or CIoU: + raise NotImplementedError + + ious.append(iou) + + return torch.tensor(ious, device=device, dtype=torch.float), giou_loss + + +if __name__ == "__main__": + import cv2 + import numpy as np + + def get_corners_torch(x, y, w, l, yaw): + device = x.device + bev_corners = torch.zeros((4, 2), dtype=torch.float, device=device) + cos_yaw = torch.cos(yaw) + sin_yaw = torch.sin(yaw) + # front left + bev_corners[0, 0] = x - w / 2 * cos_yaw - l / 2 * sin_yaw + bev_corners[0, 1] = y - w / 2 * sin_yaw + l / 2 * cos_yaw + + # rear left + bev_corners[1, 0] = x - w / 2 * cos_yaw + l / 2 * sin_yaw + bev_corners[1, 1] = y - w / 2 * sin_yaw - l / 2 * cos_yaw + + # rear right + bev_corners[2, 0] = x + w / 2 * cos_yaw + l / 2 * sin_yaw + bev_corners[2, 1] = y + w / 2 * sin_yaw - l / 2 * cos_yaw + + # front right + bev_corners[3, 0] = x + w / 2 * cos_yaw - l / 2 * sin_yaw + bev_corners[3, 1] = y + w / 2 * sin_yaw + l / 2 * cos_yaw + + return bev_corners + + # Show convex in an image + + img_size = 300 + img = np.zeros((img_size, img_size, 3)) + img = cv2.resize(img, (img_size, img_size)) + + box1 = torch.tensor([100, 100, 60, 10, 0.5], dtype=torch.float).cuda() + box2 = torch.tensor([100, 100, 40, 20, 0], dtype=torch.float).cuda() + + box1_conners = get_corners_torch(box1[0], box1[1], box1[2], box1[3], 
box1[4])
+    box1_polygon = cvt_box_2_polygon(box1_conners)
+    box1_area = box1_polygon.area
+
+    box2_conners = get_corners_torch(box2[0], box2[1], box2[2], box2[3], box2[4])
+    box2_polygon = cvt_box_2_polygon(box2_conners)
+    box2_area = box2_polygon.area
+
+    intersection = box2_polygon.intersection(box1_polygon).area
+    union = box1_area + box2_area - intersection
+    iou = intersection / (union + 1e-16)
+
+    convex_conners = torch.cat((box1_conners, box2_conners), dim=0)
+    hull = ConvexHull(convex_conners.clone().detach().cpu().numpy())  # done on cpu, just need indices output
+    convex_conners = convex_conners[hull.vertices]
+    convex_polygon = cvt_box_2_polygon(convex_conners)
+    convex_area = convex_polygon.area
+    # GIoU = IoU - (convex_area - union) / convex_area, and the loss is 1 - GIoU
+    giou_loss = 1.0 - (iou - (convex_area - union) / (convex_area + 1e-16))
+
+    print(
+        "box1_area: {:.2f}, box2_area: {:.2f}, intersection: {:.2f}, iou: {:.4f}, convex_area: {:.4f}, giou_loss: {}".format(
+            box1_area, box2_area, intersection, iou, convex_area, giou_loss
+        )
+    )
+
+    print("intersection_area: {}".format(intersection_area(box1_conners, box2_conners)))
+    print("convex_area using PolyArea2D: {}".format(PolyArea2D(convex_conners)))
+
+    img = cv2.polylines(img, [box1_conners.cpu().numpy().astype(int)], True, (255, 0, 0), 2)
+    img = cv2.polylines(img, [box2_conners.cpu().numpy().astype(int)], True, (0, 255, 0), 2)
+    img = cv2.polylines(img, [convex_conners.cpu().numpy().astype(int)], True, (0, 0, 255), 2)
+
+    while True:
+        cv2.imshow("img", img)
+        if cv2.waitKey(0) & 0xFF == 27:
+            break
diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/model_utils.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/model_utils.py
new file mode 100644
index 000000000..f120f172b
--- /dev/null
+++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/model_utils.py
@@ -0,0 +1,16 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+# import sys
+from test.models.pytorch.vision.complex_yolov4.utils.darknet2pytorch import Darknet
+
+
+def create_model(configs):
+    """Create model based on architecture name"""
+    if (configs.arch == "darknet") and (configs.cfgfile is not None):
+        print("using darknet")
+        model = Darknet(cfgfile=configs.cfgfile)
+    else:
+        assert False, "Undefined model backbone"
+
+    return model
diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/torch_utils.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/torch_utils.py
new file mode 100644
index 000000000..fcca1b30e
--- /dev/null
+++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/torch_utils.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+"""
+# -*- coding: utf-8 -*-
+-----------------------------------------------------------------------------------
+# Author: Nguyen Mau Dung
+# DoC: 2020.07.05
+# email: nguyenmaudung93.kstn@gmail.com
+-----------------------------------------------------------------------------------
+# Description: some utilities of torch (conversion)
+-----------------------------------------------------------------------------------
+# Refer: https://github.com/Tianxiaomo/pytorch-YOLOv4
+"""
+
+import torch
+
+__all__ = ["convert2cpu", "convert2cpu_long", "to_cpu"]
+
+
+def convert2cpu(gpu_matrix):
+    return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix)
+
+
+def convert2cpu_long(gpu_matrix):
+    return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix)
+
+
+def to_cpu(tensor):
+    return tensor.detach().cpu()
diff --git a/forge/test/models/pytorch/vision/complex_yolov4/utils/yolo_layer.py b/forge/test/models/pytorch/vision/complex_yolov4/utils/yolo_layer.py
new file mode 100644
index 000000000..34c612835
--- /dev/null
+++ b/forge/test/models/pytorch/vision/complex_yolov4/utils/yolo_layer.py
@@ -0,0 +1,301 @@
+# SPDX-FileCopyrightText: (c) 2025 Tenstorrent AI ULC
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from test.models.pytorch.vision.complex_yolov4.utils.iou_rotated_boxes_utils import (
+    get_polygons_areas_fix_xy,
+    iou_pred_vs_target_boxes,
+    iou_rotated_boxes_targets_vs_anchors,
+)
+from test.models.pytorch.vision.complex_yolov4.utils.torch_utils import to_cpu
+
+# sys.path.append('../')
+
+
+class YoloLayer(nn.Module):
+    """Yolo layer"""
+
+    def __init__(self, num_classes, anchors, stride, scale_x_y, ignore_thresh):
+        super(YoloLayer, self).__init__()
+        # These attributes are updated when parsing the cfg while creating the darknet
+        self.num_classes = num_classes
+        self.anchors = anchors
+        self.num_anchors = len(anchors)
+        self.stride = stride
+        self.scale_x_y = scale_x_y
+        self.ignore_thresh = ignore_thresh
+
+        self.noobj_scale = 100
+        self.obj_scale = 1
+        self.lgiou_scale = 3.54
+        self.leular_scale = 3.54
+        self.lobj_scale = 64.3
+        self.lcls_scale = 37.4
+
+        self.seen = 0
+        # Initialize dummy variables
+        self.grid_size = 0
+        self.img_size = 0
+        self.metrics = {}
+
+    def compute_grid_offsets(self, grid_size):
+        self.grid_size = grid_size
+        g = self.grid_size
+        self.stride = self.img_size / self.grid_size
+        # Calculate offsets for each grid
+        self.grid_x = torch.arange(g, device=self.device, dtype=torch.float).repeat(g, 1).view([1, 1, g, g])
+        self.grid_y = torch.arange(g, device=self.device, dtype=torch.float).repeat(g, 1).t().view([1, 1, g, g])
+        self.scaled_anchors = torch.tensor(
+            [(a_w / self.stride, a_h / self.stride, im, re) for a_w, a_h, im, re in self.anchors],
+            device=self.device,
+            dtype=torch.float,
+        )
+        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
+        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))
+
+        # Pre-compute polygons and areas of anchors
+        self.scaled_anchors_polygons, self.scaled_anchors_areas = get_polygons_areas_fix_xy(self.scaled_anchors)
+
+    def build_targets(self, pred_boxes, pred_cls, target, anchors):
+        """Build yolo targets to compute loss
+        :param pred_boxes: [num_samples or batch, num_anchors, grid_size, grid_size, 6]
+        :param pred_cls: [num_samples or batch, num_anchors, grid_size, grid_size, num_classes]
+        :param target: [num_boxes, 8]
+        :param anchors: [num_anchors, 4]
+        :return:
+        """
+        nB, nA, nG, _, nC = pred_cls.size()
+        n_target_boxes = target.size(0)
+
+        # Create output tensors on "device"
+        obj_mask = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.uint8)
+        noobj_mask = torch.full(size=(nB, nA, nG, nG), fill_value=1, device=self.device, dtype=torch.uint8)
+        class_mask = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        iou_scores = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        tx = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        ty = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        tw = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        th = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        tim = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        tre = torch.full(size=(nB, nA, nG, nG), fill_value=0, device=self.device, dtype=torch.float)
+        tcls = torch.full(size=(nB, nA, nG, nG, nC), fill_value=0, device=self.device, dtype=torch.float)
+        tconf = obj_mask.float()
+        giou_loss = torch.tensor([0.0], device=self.device, dtype=torch.float)
+
+        if n_target_boxes > 0:  # Make sure that there is at least 1 box
+            b, target_labels = target[:, :2].long().t()
+            target_boxes = torch.cat((target[:, 2:6] * nG, target[:, 6:8]), dim=-1)  # scale up x, y, w, h
+
+            gxy = target_boxes[:, :2]
+            gwh = target_boxes[:, 2:4]
+            gimre = target_boxes[:, 4:6]
+
+            targets_polygons, targets_areas = get_polygons_areas_fix_xy(target_boxes[:, 2:6])
+            # Get anchors with best iou
+            ious = iou_rotated_boxes_targets_vs_anchors(
+                self.scaled_anchors_polygons, self.scaled_anchors_areas, targets_polygons, targets_areas
+            )
+            best_ious, best_n = ious.max(0)
+
+            gx, gy = gxy.t()
+            gw, gh = gwh.t()
+            gim, gre = gimre.t()
+            gi, gj = gxy.long().t()
+            # Set masks
+            obj_mask[b, best_n, gj, gi] = 1
+            noobj_mask[b, best_n, gj, gi] = 0
+
+            # Set noobj mask to zero where iou exceeds ignore threshold
+            for i, anchor_ious in enumerate(ious.t()):
+                noobj_mask[b[i], anchor_ious > self.ignore_thresh, gj[i], gi[i]] = 0
+
+            # Coordinates
+            tx[b, best_n, gj, gi] = gx - gx.floor()
+            ty[b, best_n, gj, gi] = gy - gy.floor()
+            # Width and height
+            tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
+            th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
+            # Im and real part
+            tim[b, best_n, gj, gi] = gim
+            tre[b, best_n, gj, gi] = gre
+
+            # One-hot encoding of label
+            tcls[b, best_n, gj, gi, target_labels] = 1
+            class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
+            ious, giou_loss = iou_pred_vs_target_boxes(
+                pred_boxes[b, best_n, gj, gi], target_boxes, GIoU=self.use_giou_loss
+            )
+            iou_scores[b, best_n, gj, gi] = ious
+            if self.reduction == "mean":
+                giou_loss /= n_target_boxes
+            tconf = obj_mask.float()
+
+        return (
+            iou_scores,
+            giou_loss,
+            class_mask,
+            obj_mask.type(torch.bool),
+            noobj_mask.type(torch.bool),
+            tx,
+            ty,
+            tw,
+            th,
+            tim,
+            tre,
+            tcls,
+            tconf,
+        )
+
+    def forward(self, x, targets=None, img_size=608, use_giou_loss=False):
+        """
+        :param x: [num_samples or batch, num_anchors * (6 + 1 + num_classes), grid_size, grid_size]
+        :param targets: [num boxes, 8] (box_idx, class, x, y, w, l, sin(yaw), cos(yaw))
+        :param img_size: default 608
+        :return:
+        """
+        self.img_size = img_size
+        self.use_giou_loss = use_giou_loss
+        self.device = x.device
+        num_samples, _, _, grid_size = x.size()
+
+        prediction = x.view(num_samples, self.num_anchors, self.num_classes + 7, grid_size, grid_size)
+        prediction = prediction.permute(0, 1, 3, 4, 2).contiguous()
+        # prediction size: [num_samples, num_anchors, grid_size, grid_size, num_classes + 7]
+
+        # Get outputs
+        pred_x = torch.sigmoid(prediction[..., 0])
+        pred_y = torch.sigmoid(prediction[..., 1])
+        pred_w = prediction[..., 2]  # Width
+        pred_h = prediction[..., 3]  # Height
+        pred_im = prediction[..., 4]  # angle imaginary part
+        pred_re = prediction[..., 5]  # angle real part
+        pred_conf = torch.sigmoid(prediction[..., 6])  # Conf
+        pred_cls = torch.sigmoid(prediction[..., 7:])  # Cls pred.
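+        # Per-anchor layout of the last dimension (7 + num_classes channels):
+        #   [x, y, w, h, im, re, conf, class scores...], where (im, re) encode
+        #   sin(yaw) and cos(yaw) of the rotated box (see the targets docstring above)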
+
+        # If grid size does not match current we compute new offsets
+        if grid_size != self.grid_size:
+            self.compute_grid_offsets(grid_size)
+
+        # # Add offset and scale with anchors
+        # # pred_boxes size: [num_samples, num_anchors, grid_size, grid_size, 6]
+        # pred_boxes = torch.empty(prediction[..., :6].shape, device=self.device, dtype=torch.float)
+        # pred_boxes[..., 0] = pred_x + self.grid_x
+        # pred_boxes[..., 1] = pred_y + self.grid_y
+        # pred_boxes[..., 2] = torch.exp(pred_w).clamp(max=1E3) * self.anchor_w
+        # pred_boxes[..., 3] = torch.exp(pred_h).clamp(max=1E3) * self.anchor_h
+        # pred_boxes[..., 4] = pred_im
+        # pred_boxes[..., 5] = pred_re
+
+        # ================ my line ===================
+        pred_boxes = torch.stack(
+            [
+                pred_x + self.grid_x,
+                pred_y + self.grid_y,
+                torch.exp(pred_w).clamp(max=1e3) * self.anchor_w,
+                torch.exp(pred_h).clamp(max=1e3) * self.anchor_h,
+                pred_im,
+                pred_re,
+            ],
+            dim=-1,
+        )
+        # ==========================================
+
+        output = torch.cat(
+            (
+                pred_boxes[..., :4].view(num_samples, -1, 4) * self.stride,
+                pred_boxes[..., 4:6].view(num_samples, -1, 2),
+                pred_conf.view(num_samples, -1, 1),
+                pred_cls.view(num_samples, -1, self.num_classes),
+            ),
+            dim=-1,
+        )
+        # output size: [num_samples, num boxes, 7 + num_classes]
+
+        if targets is None:
+            return output, 0
+        else:
+            self.reduction = "mean"
+            (
+                iou_scores,
+                giou_loss,
+                class_mask,
+                obj_mask,
+                noobj_mask,
+                tx,
+                ty,
+                tw,
+                th,
+                tim,
+                tre,
+                tcls,
+                tconf,
+            ) = self.build_targets(
+                pred_boxes=pred_boxes, pred_cls=pred_cls, target=targets, anchors=self.scaled_anchors
+            )
+
+            loss_x = F.mse_loss(pred_x[obj_mask], tx[obj_mask], reduction=self.reduction)
+            loss_y = F.mse_loss(pred_y[obj_mask], ty[obj_mask], reduction=self.reduction)
+            loss_w = F.mse_loss(pred_w[obj_mask], tw[obj_mask], reduction=self.reduction)
+            loss_h = F.mse_loss(pred_h[obj_mask], th[obj_mask], reduction=self.reduction)
+            loss_im = F.mse_loss(pred_im[obj_mask], tim[obj_mask], reduction=self.reduction)
+            loss_re = F.mse_loss(pred_re[obj_mask], tre[obj_mask], reduction=self.reduction)
+            loss_im_re = (
+                1.0 - torch.sqrt(pred_im[obj_mask] ** 2 + pred_re[obj_mask] ** 2)
+            ) ** 2  # as tim^2 + tre^2 = 1
+            loss_im_re_red = loss_im_re.sum() if self.reduction == "sum" else loss_im_re.mean()
+            loss_eular = loss_im + loss_re + loss_im_re_red
+
+            loss_conf_obj = F.binary_cross_entropy(pred_conf[obj_mask], tconf[obj_mask], reduction=self.reduction)
+            loss_conf_noobj = F.binary_cross_entropy(pred_conf[noobj_mask], tconf[noobj_mask], reduction=self.reduction)
+            loss_cls = F.binary_cross_entropy(pred_cls[obj_mask], tcls[obj_mask], reduction=self.reduction)
+
+            if self.use_giou_loss:
+                loss_obj = loss_conf_obj + loss_conf_noobj
+                total_loss = (
+                    giou_loss * self.lgiou_scale
+                    + loss_eular * self.leular_scale
+                    + loss_obj * self.lobj_scale
+                    + loss_cls * self.lcls_scale
+                )
+            else:
+                loss_obj = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj
+                total_loss = loss_x + loss_y + loss_w + loss_h + loss_eular + loss_obj + loss_cls
+
+            # Metrics (store loss values using tensorboard)
+            cls_acc = 100 * class_mask[obj_mask].mean()
+            conf_obj = pred_conf[obj_mask].mean()
+            conf_noobj = pred_conf[noobj_mask].mean()
+            conf50 = (pred_conf > 0.5).float()
+            iou50 = (iou_scores > 0.5).float()
+            iou75 = (iou_scores > 0.75).float()
+            detected_mask = conf50 * class_mask * tconf
+            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
+            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
+            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)
+
+            self.metrics = {
+                "loss": to_cpu(total_loss).item(),
+                "iou_score": to_cpu(iou_scores[obj_mask].mean()).item(),
+                "giou_loss": to_cpu(giou_loss).item(),
+                "loss_x": to_cpu(loss_x).item(),
+                "loss_y": to_cpu(loss_y).item(),
+                "loss_w": to_cpu(loss_w).item(),
+                "loss_h": to_cpu(loss_h).item(),
+                "loss_eular": to_cpu(loss_eular).item(),
+                "loss_im": to_cpu(loss_im).item(),
+                "loss_re": to_cpu(loss_re).item(),
+                "loss_obj": to_cpu(loss_obj).item(),
+                "loss_cls": to_cpu(loss_cls).item(),
+                "cls_acc": to_cpu(cls_acc).item(),
+                "recall50": to_cpu(recall50).item(),
+                "recall75": to_cpu(recall75).item(),
+                "precision": to_cpu(precision).item(),
+                "conf_obj": to_cpu(conf_obj).item(),
+                "conf_noobj": to_cpu(conf_noobj).item(),
+            }
+
+            return output, total_loss
diff --git a/forge/test/models/utils.py b/forge/test/models/utils.py
index 3789f36a7..95008f897 100644
--- a/forge/test/models/utils.py
+++ b/forge/test/models/utils.py
@@ -36,6 +36,7 @@ class Task(StrEnum):
     CONDITIONAL_GENERATION = "cond_gen"
     IMAGE_ENCODING = "img_enc"
     VISUAL_BACKBONE = "visual_bb"
+    OBJECT_DETECTION_3D = "object_detection_3d"
 
 
 class Source(StrEnum):
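
For reference, a minimal usage sketch of the new YoloLayer in inference mode. It is not part of the patch; the class count and (w, h, im, re) anchor values below are illustrative only (in the patch they are parsed from the darknet cfg when the model is built):

    import torch

    from test.models.pytorch.vision.complex_yolov4.utils.yolo_layer import YoloLayer

    num_classes = 3
    anchors = [(11, 14, 0.0, 1.0), (16, 17, 0.0, 1.0), (23, 22, 0.0, 1.0)]  # illustrative (w, h, im, re) values
    layer = YoloLayer(num_classes=num_classes, anchors=anchors, stride=32, scale_x_y=1.0, ignore_thresh=0.7)

    grid = 19  # 608 / 32
    x = torch.randn(1, len(anchors) * (7 + num_classes), grid, grid)
    output, loss = layer(x, targets=None, img_size=608)
    print(output.shape)  # [1, len(anchors) * grid * grid, 7 + num_classes]; loss is 0 without targets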