CoffeeVampir3 · May 19, 2026 20:40
diff --git a/yolo.py b/yolo.py
 """
 YOLO26-small detect — modules + graph.

 All modules are faithful ports of their upstream counterparts. The branches we
 collapsed:
  * Conv:    dropped forward_fuse (deploy-time), dropped activation overrides.
  * C3k2:    kept BOTH the bottleneck and PSABlock paths — the PSABlock path is
             used only at layer 22 (attn=True). Dropped forward_split.
  * Detect:  dropped legacy v3/v5/v8 head shape, dropped export/dynamic/format,
             dropped postprocess/_inference/decode_bboxes (inference-only).
             reg_max=1, so DFL is identity; we drop the DFL class entirely.
  * Everything fuse-related, MPS guards, etc.

 The `YOLO26s` class hardcodes the 24-layer graph at scale s (width=0.5,
 depth=0.5, max_channels=1024) — channel widths verified against
 `parse_model(ultralytics/cfg/models/26/yolo26.yaml, scale='s')`.
 """

 from __future__ import annotations

 import math

 import torch
 import torch.nn as nn

 from .ops import dist2bbox, make_anchors


 # ---------------------------------------------------------------------------
 # Basic building blocks (ports from ultralytics/nn/modules/conv.py)
 # ---------------------------------------------------------------------------


 def autopad(k: int, p: int | None = None, d: int = 1) -> int:
    """Return the padding for a kernel of size ``k``, honouring an explicit override.

    Args:
        k: Kernel size.
        p: Explicit padding; returned unchanged whenever it is not ``None``.
        d: Dilation. For ``d > 1`` the kernel is first widened to its
            effective extent ``d * (k - 1) + 1``.

    Returns:
        ``p`` when given, otherwise half the (effective) kernel size, rounded down.
    """
    if p is not None:
        return p
    effective_k = d * (k - 1) + 1 if d > 1 else k
    return effective_k // 2


 class Conv(nn.Module):
    """Convolution -> batch norm -> SiLU; the unit the other modules in this file are built from.

    Args:
        c1: Input channels.
        c2: Output channels.
        k: Kernel size.
        s: Stride.
        p: Explicit padding; ``None`` lets ``autopad`` derive it from ``k`` and ``d``.
        g: Convolution groups.
        d: Dilation.
    """

    def __init__(self, c1: int, c2: int, k: int = 1, s: int = 1, p: int | None = None, g: int = 1, d: int = 1):
        super().__init__()
        padding = autopad(k, p, d)
        # The convolution carries no bias term; it is followed directly by BatchNorm2d.
        self.conv = nn.Conv2d(c1, c2, kernel_size=k, stride=s, padding=padding, dilation=d, groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the convolution, then batch norm, then the activation."""
        out = self.conv(x)
        out = self.bn(out)
        return self.act(out)


 class Concat(nn.Module):
    """Join a list of tensors along one dimension (dimension 1 by default)."""

    def __init__(self, dim: int = 1):
        super().__init__()
        self.d = dim  # dimension handed to torch.cat

    def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
        """Concatenate ``xs`` along ``self.d``."""
        return torch.cat(xs, dim=self.d)


 # ---------------------------------------------------------------------------
 # CSP family (ports from ultralytics/nn/modules/block.py)
 # ---------------------------------------------------------------------------


 class Bottleneck(nn.Module):
    """Two stacked Convs with an optional identity shortcut.

    The first Conv maps ``c1`` to a hidden width of ``int(c2 * e)`` and the
    second maps that to ``c2``. The input is added back only when ``shortcut``
    is set and ``c1 == c2``.

    Args:
        c1: Input channels.
        c2: Output channels.
        shortcut: Request the residual connection.
        g: Groups of the second Conv.
        k: Kernel sizes of the first and second Conv.
        e: Hidden-width ratio relative to ``c2``.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        shortcut: bool = True,
        g: int = 1,
        k: tuple[int, int] = (3, 3),
        e: float = 0.5,
    ):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, k=k[0], s=1)
        self.cv2 = Conv(hidden, c2, k=k[1], s=1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run both Convs and add the input back when the shortcut is active."""
        out = self.cv2(self.cv1(x))
        if self.add:
            return x + out
        return out


 class C3k(nn.Module):
    """CSP block: a Bottleneck-stack branch and a plain 1x1 branch fused by a third 1x1 Conv.

    Args:
        c1: Input channels.
        c2: Output channels.
        n: Number of Bottlenecks in the stacked branch.
        shortcut: Forwarded to every Bottleneck.
        g: Forwarded to every Bottleneck.
        e: Hidden-width ratio relative to ``c2``.
        k: Kernel size used for both Convs of every Bottleneck.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)
        # e=1.0 keeps each Bottleneck at the full hidden width.
        blocks = [Bottleneck(hidden, hidden, shortcut, g, k=(k, k), e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Concatenate the stacked branch with the 1x1 branch and project to ``c2``."""
        deep = self.m(self.cv1(x))
        skip = self.cv2(x)
        return self.cv3(torch.cat((deep, skip), 1))


 class Attention(nn.Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    One 1x1 Conv produces queries, keys and values for all heads at once. Per
    head, queries and keys are ``key_dim`` wide and values are ``head_dim``
    wide. A depthwise 3x3 Conv over the values (``pe``) is added to the
    attention output before the final 1x1 projection. All three Convs have
    their activation replaced by ``nn.Identity``.

    Args:
        dim: Channel count of the input and of the output.
        num_heads: Number of heads; ``head_dim`` is ``dim // num_heads``.
        attn_ratio: Ratio of ``key_dim`` to ``head_dim``.
    """

    def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.key_dim = int(self.head_dim * attn_ratio)
        self.scale = self.key_dim**-0.5
        # Width of the fused projection: `dim` channels for the values plus
        # key_dim channels each of queries and keys for every head.
        qkv_channels = dim + 2 * self.key_dim * num_heads
        self.qkv = Conv(dim, qkv_channels, 1)
        self.qkv.act = nn.Identity()
        self.proj = Conv(dim, dim, 1)
        self.proj.act = nn.Identity()
        self.pe = Conv(dim, dim, 3, 1, g=dim)
        self.pe.act = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend across all ``H * W`` positions and return a tensor shaped like ``x``."""
        batch, channels, height, width = x.shape
        tokens = height * width
        per_head = self.key_dim * 2 + self.head_dim
        fused = self.qkv(x).view(batch, self.num_heads, per_head, tokens)
        q, k, v = fused.split([self.key_dim, self.key_dim, self.head_dim], dim=2)
        scores = (q.transpose(-2, -1) @ k) * self.scale
        weights = scores.softmax(dim=-1)
        attended = (v @ weights.transpose(-2, -1)).view(batch, channels, height, width)
        positional = self.pe(v.reshape(batch, channels, height, width))
        return self.proj(attended + positional)


 class PSABlock(nn.Module):
    """Attention followed by a two-Conv feed-forward stage, each optionally residual.

    Args:
        c: Channel count of the input and of the output.
        attn_ratio: Forwarded to ``Attention``.
        num_heads: Forwarded to ``Attention``.
        shortcut: When true, both stages add their input to their output.
    """

    def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True):
        super().__init__()
        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
        expand = Conv(c, c * 2, 1)
        project = Conv(c * 2, c, 1)
        project.act = nn.Identity()  # the second FFN conv has no activation
        self.ffn = nn.Sequential(expand, project)
        self.add = shortcut

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run attention, then the feed-forward stage."""
        if self.add:
            x = x + self.attn(x)
            return x + self.ffn(x)
        x = self.attn(x)
        return self.ffn(x)


 class C2PSA(nn.Module):
    """CSP shell around ``n`` stacked PSABlocks.

    A 1x1 Conv expands the input to ``2 * c`` channels (``c = int(c1 * e)``),
    which are split into two halves. Only the second half goes through the
    PSABlock stack; both halves are then concatenated and projected back to
    ``c1`` channels.

    Args:
        c1: Input channels.
        c2: Output channels; must equal ``c1``.
        n: Number of PSABlocks.
        e: Hidden-width ratio relative to ``c1``.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
        super().__init__()
        assert c1 == c2, f"C2PSA requires c1 == c2, got c1={c1} and c2={c2}"
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)
        # One head per 64 channels, but never fewer than one: a bare `c // 64`
        # is 0 for c < 64, and Attention would then divide by zero. C3k2 uses
        # the same floor for its PSABlock.
        num_heads = max(self.c // 64, 1)
        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=num_heads) for _ in range(n)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split, run attention on the second half, re-join and project."""
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        b = self.m(b)
        return self.cv2(torch.cat((a, b), 1))


 class C3k2(nn.Module):
    """The workhorse CSP block of YOLO26.

    ``cv1`` produces ``2 * c`` channels (``c = int(c2 * e)``) that are chunked
    into two halves. ``n`` inner blocks are chained starting from the second
    half, every intermediate output is kept, and all ``2 + n`` pieces are
    concatenated and projected to ``c2``.

    The inner block is chosen in this order of precedence:
      * ``attn=True``: a Bottleneck followed by a PSABlock.
      * ``c3k=True``: a nested C3k with two Bottlenecks.
      * otherwise: a plain Bottleneck.

    Args:
        c1: Input channels.
        c2: Output channels.
        n: Number of chained inner blocks.
        c3k: Select the nested-C3k inner block (ignored when ``attn`` is set).
        e: Hidden-width ratio relative to ``c2``.
        attn: Select the Bottleneck + PSABlock inner block.
        g: Groups forwarded to the inner blocks.
        shortcut: Shortcut flag forwarded to the inner blocks.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        n: int = 1,
        c3k: bool = False,
        e: float = 0.5,
        attn: bool = False,
        g: int = 1,
        shortcut: bool = True,
    ):
        super().__init__()
        self.c = int(c2 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv((2 + n) * self.c, c2, 1)

        def make_inner() -> nn.Module:
            """Build one inner block according to the attn / c3k flags."""
            if attn:
                return nn.Sequential(
                    Bottleneck(self.c, self.c, shortcut, g),
                    PSABlock(self.c, attn_ratio=0.5, num_heads=max(self.c // 64, 1)),
                )
            if c3k:
                return C3k(self.c, self.c, 2, shortcut, g)
            return Bottleneck(self.c, self.c, shortcut, g)

        self.m = nn.ModuleList(make_inner() for _ in range(n))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Chain the inner blocks on the second half and fuse every intermediate."""
        pieces = list(self.cv1(x).chunk(2, 1))
        for block in self.m:
            # Each block consumes the most recently produced piece.
            pieces.append(block(pieces[-1]))
        return self.cv2(torch.cat(pieces, 1))


 class SPPF(nn.Module):
    """Spatial Pyramid Pooling, fast variant: one max-pool applied ``n`` times in series.

    The reduced input and each successive pooling result are concatenated
    (``n + 1`` tensors) and projected to ``c2``.

    Args:
        c1: Input channels.
        c2: Output channels.
        k: Max-pool kernel size (stride 1, padding ``k // 2``).
        n: Number of serial pooling steps.
        shortcut: Add the input to the output; only effective when ``c1 == c2``.
    """

    def __init__(self, c1: int, c2: int, k: int = 5, n: int = 3, shortcut: bool = False):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv1.act = nn.Identity()  # the reducing conv has no activation
        self.cv2 = Conv(hidden * (n + 1), c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.n = n
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pool serially, fuse all levels, and optionally add the input back."""
        level = self.cv1(x)
        pyramid = [level]
        for _ in range(self.n):
            level = self.m(level)
            pyramid.append(level)
        fused = self.cv2(torch.cat(pyramid, 1))
        if self.add:
            return fused + x
        return fused


 # ---------------------------------------------------------------------------
 # Detection head (port from ultralytics/nn/modules/head.py, end2end=True, reg_max=1)
 # ---------------------------------------------------------------------------


 class Detect(nn.Module):
    """YOLO26 detection head: end2end (one2many + one2one), reg_max=1 (no DFL).

    For each input feature level the head predicts ``4 * reg_max``
    box-regression channels and ``nc`` class-logit channels.

    Two structurally identical heads are built: ``cv2``/``cv3`` (one2many) and
    deep copies of them, ``one2one_cv2``/``one2one_cv3`` (one2one). ``forward``
    runs both; the one2one head receives detached features, so its loss cannot
    propagate gradients into the layers that produced them.

    Args:
        nc: Number of classes.
        reg_max: Must be 1; any other value trips the assertion.
        ch: Channel count of each input feature level. Must be non-empty,
            because the hidden widths are derived from ``ch[0]``.
    """

    def __init__(self, nc: int = 80, reg_max: int = 1, ch: tuple[int, ...] = ()):
        super().__init__()
        assert reg_max == 1, "this repro hardcodes reg_max=1 (the YOLO26 default)"
        self.nc = nc
        self.nl = len(ch)  # number of feature levels
        self.reg_max = reg_max
        self.no = nc + reg_max * 4  # output channels per location
        # Placeholder; YOLO26s overwrites it with measured strides before bias_init().
        self.stride = torch.zeros(self.nl)

        c2, c3 = max(16, ch[0] // 4, reg_max * 4), max(ch[0], min(nc, 100))

        # Box-distance head: two 3x3 Convs + 1x1 -> 4*reg_max logits.
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * reg_max, 1)) for x in ch
        )
        # Class head: the depthwise-separable variant (legacy v3/v5/v8 branch dropped).
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),  # depthwise 3x3 -> 1x1
                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, nc, 1),
            )
            for x in ch
        )
        # DFL with reg_max=1 is the identity. Inlined; the DFL class is omitted.
        self.dfl = nn.Identity()

        # end2end is always True for YOLO26 — build the parallel one2one head.
        import copy

        self.one2one_cv2 = copy.deepcopy(self.cv2)
        self.one2one_cv3 = copy.deepcopy(self.cv3)

    # The upstream Detect uses `one2many`/`one2one` @property dicts so OBB/Pose can
    # extend with extra branches. For detect-only we keep the same shape.
    @property
    def one2many(self):
        """Keyword arguments selecting the one2many head for ``forward_head``."""
        return dict(box_head=self.cv2, cls_head=self.cv3)

    @property
    def one2one(self):
        """Keyword arguments selecting the one2one head for ``forward_head``."""
        return dict(box_head=self.one2one_cv2, cls_head=self.one2one_cv3)

    def forward_head(self, x: list[torch.Tensor], box_head: nn.ModuleList, cls_head: nn.ModuleList) -> dict:
        """Run a single (one2many or one2one) head over all feature levels.

        Returns:
            A dict with ``boxes`` of shape ``(bs, 4 * reg_max, A)`` and ``scores``
            of shape ``(bs, nc, A)``, where ``A`` is the total number of spatial
            locations across levels, plus ``feats``, the input list itself.
        """
        bs = x[0].shape[0]
        boxes = torch.cat([box_head[i](x[i]).view(bs, 4 * self.reg_max, -1) for i in range(self.nl)], dim=-1)
        scores = torch.cat([cls_head[i](x[i]).view(bs, self.nc, -1) for i in range(self.nl)], dim=-1)
        return dict(boxes=boxes, scores=scores, feats=x)

    def forward(self, x: list[torch.Tensor]) -> dict:
        """Training-mode forward: returns {"one2many": {...}, "one2one": {...}}.

        Inference-time decode (anchors -> xyxy, sigmoid scores, topk postprocess)
        is intentionally omitted — this repro is about the training data path.
        """
        preds_o2m = self.forward_head(x, **self.one2many)
        # Detach so the one2one branch cannot back-propagate into the features.
        x_detach = [xi.detach() for xi in x]
        preds_o2o = self.forward_head(x_detach, **self.one2one)
        return {"one2many": preds_o2m, "one2one": preds_o2o}

    def bias_init(self) -> None:
        """Set the final-conv biases of both heads (call once, after ``stride`` is set).

        The box-conv bias becomes 2.0 and the cls-conv bias becomes
        ``log(5 / nc / (640 / stride) ** 2)`` for the level's stride.

        Raises:
            ValueError: If ``stride`` does not hold one positive value per level.
                The check runs before any bias is written, so a premature call
                leaves the head untouched instead of half-initialised.
        """
        strides = torch.as_tensor(self.stride)
        if strides.numel() != self.nl or not bool((strides > 0).all()):
            raise ValueError(
                "Detect.stride must hold one positive stride per feature level before bias_init() is called"
            )
        # In-place parameter writes are done under no_grad rather than through `.data`.
        with torch.no_grad():
            for heads in (self.one2many, self.one2one):
                for i, (box_seq, cls_seq) in enumerate(zip(heads["box_head"], heads["cls_head"])):
                    box_seq[-1].bias.fill_(2.0)
                    cls_seq[-1].bias[: self.nc] = math.log(5 / self.nc / (640 / self.stride[i]) ** 2)


 # ---------------------------------------------------------------------------
 # YOLO26-small graph
 # ---------------------------------------------------------------------------


 class YOLO26s(nn.Module):
    """Hardcoded YOLO26-s detect graph (no YAML, no parse_model).

    Every channel width below is written out literally for scale s, and every
    C3k2/C2PSA is built with ``n=1``. The stride notes follow from the number
    of stride-2 Convs preceding each layer.

    Layer reference (idx | module | note):
       0   Conv(3, 32, k=3, s=2)                          # stride 2
       1   Conv(32, 64, k=3, s=2)                         # stride 4
       2   C3k2(64, 128, n=1, c3k=False, e=0.25)
       3   Conv(128, 128, k=3, s=2)                       # stride 8
       4   C3k2(128, 256, n=1, c3k=False, e=0.25)         <- saved (feeds Concat 15)
       5   Conv(256, 256, k=3, s=2)                       # stride 16
       6   C3k2(256, 256, n=1, c3k=True)                  <- saved (feeds Concat 12)
       7   Conv(256, 512, k=3, s=2)                       # stride 32
       8   C3k2(512, 512, n=1, c3k=True)
       9   SPPF(512, 512, k=5, n=3, shortcut=True)
      10   C2PSA(512, 512, n=1, e=0.5)                    <- saved (feeds Concat 21)
      11   Upsample x2 nearest
      12   Concat([11, 6])      -> 768
      13   C3k2(768, 256, n=1, c3k=True)                  <- saved (feeds Concat 18)
      14   Upsample x2 nearest
      15   Concat([14, 4])      -> 512
      16   C3k2(512, 128, n=1, c3k=True)                  -> Detect P3 (stride 8)
      17   Conv(128, 128, k=3, s=2)
      18   Concat([17, 13])     -> 384
      19   C3k2(384, 256, n=1, c3k=True)                  -> Detect P4 (stride 16)
      20   Conv(256, 256, k=3, s=2)
      21   Concat([20, 10])     -> 768
      22   C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)   -> Detect P5 (stride 32)
      23   Detect(nc=nc, reg_max=1, ch=(128, 256, 512))

    Args:
        nc: Number of classes predicted by the head.
    """

    def __init__(self, nc: int = 80):
        super().__init__()
        self.nc = nc

        # Backbone (layers 0-10). Attribute names carry the layer index.
        self.conv0 = Conv(3, 32, 3, 2)
        self.conv1 = Conv(32, 64, 3, 2)
        self.c3k2_2 = C3k2(64, 128, n=1, c3k=False, e=0.25)
        self.conv3 = Conv(128, 128, 3, 2)
        self.c3k2_4 = C3k2(128, 256, n=1, c3k=False, e=0.25)
        self.conv5 = Conv(256, 256, 3, 2)
        self.c3k2_6 = C3k2(256, 256, n=1, c3k=True)
        self.conv7 = Conv(256, 512, 3, 2)
        self.c3k2_8 = C3k2(512, 512, n=1, c3k=True)
        self.sppf = SPPF(512, 512, k=5, n=3, shortcut=True)
        self.c2psa = C2PSA(512, 512, n=1, e=0.5)

        # Neck (layers 11-22): top-down upsampling path, then bottom-up strided path.
        self.up_a = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_13 = C3k2(768, 256, n=1, c3k=True)
        self.up_b = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_16 = C3k2(512, 128, n=1, c3k=True)
        self.conv17 = Conv(128, 128, 3, 2)
        self.c3k2_19 = C3k2(384, 256, n=1, c3k=True)
        self.conv20 = Conv(256, 256, 3, 2)
        self.c3k2_22 = C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)

        # Head (layer 23).
        self.detect = Detect(nc=nc, reg_max=1, ch=(128, 256, 512))

        self._init_stride_and_bias()

    def _init_stride_and_bias(self) -> None:
        """Measure the per-level stride with a dummy forward, then initialise the head biases.

        A 256x256 zero image is pushed through backbone and neck in eval mode
        without gradients; each level's stride is 256 divided by its feature
        height. ``Detect.bias_init`` needs those strides, so it runs afterwards.
        The module is put back into training mode if it was in it on entry.
        """
        restore_train = self.training
        self.eval()
        with torch.no_grad():
            pyramid = self._forward_features(torch.zeros(1, 3, 256, 256))
        self.detect.stride = torch.tensor([256.0 / level.shape[-2] for level in pyramid])
        self.detect.bias_init()
        if restore_train:
            self.train()

    def _forward_features(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Backbone + neck. Returns the three feature maps (P3, P4, P5) the head consumes."""
        # Backbone; c4, c6 and c10 are kept for the neck's concatenations.
        x = self.conv1(self.conv0(x))
        x = self.conv3(self.c3k2_2(x))
        c4 = self.c3k2_4(x)
        c6 = self.c3k2_6(self.conv5(c4))
        x = self.sppf(self.c3k2_8(self.conv7(c6)))
        c10 = self.c2psa(x)

        # Top-down path: upsample and merge with the shallower backbone maps.
        n13 = self.c3k2_13(torch.cat([self.up_a(c10), c6], 1))
        p3 = self.c3k2_16(torch.cat([self.up_b(n13), c4], 1))

        # Bottom-up path: downsample and merge with the deeper maps.
        p4 = self.c3k2_19(torch.cat([self.conv17(p3), n13], 1))
        p5 = self.c3k2_22(torch.cat([self.conv20(p4), c10], 1))

        return [p3, p4, p5]

    def forward(self, x: torch.Tensor) -> dict:
        """Return the Detect head's output dict for an image batch ``x``."""
        return self.detect(self._forward_features(x))
	"""
	YOLO26-small detect — modules + graph.

	All modules are faithful ports of their upstream counterparts. The branches we
	collapsed:
	* Conv: dropped forward_fuse (deploy-time), dropped activation overrides.
	* C3k2: kept BOTH the bottleneck and PSABlock paths — the PSABlock path is
	used only at layer 22 (attn=True). Dropped forward_split.
	* Detect: dropped legacy v3/v5/v8 head shape, dropped export/dynamic/format,
	dropped postprocess/_inference/decode_bboxes (inference-only).
	reg_max=1, so DFL is identity; we drop the DFL class entirely.
	* Everything fuse-related, MPS guards, etc.

	The `YOLO26s` class hardcodes the 24-layer graph at scale s (width=0.5,
	depth=0.5, max_channels=1024) — channel widths verified against
	`parse_model(ultralytics/cfg/models/26/yolo26.yaml, scale='s')`.
	"""

	from __future__ import annotations

	import math

	import torch
	import torch.nn as nn

	from .ops import dist2bbox, make_anchors


	# ---------------------------------------------------------------------------
	# Basic building blocks (ports from ultralytics/nn/modules/conv.py)
	# ---------------------------------------------------------------------------


	def autopad(k: int, p: int | None = None, d: int = 1) -> int:
	    """Return the padding for a kernel of size ``k``, honouring an explicit override.

	    Args:
	        k: Kernel size.
	        p: Explicit padding; returned unchanged whenever it is not ``None``.
	        d: Dilation. For ``d > 1`` the kernel is first widened to its
	            effective extent ``d * (k - 1) + 1``.

	    Returns:
	        ``p`` when given, otherwise half the (effective) kernel size, rounded down.
	    """
	    if p is not None:
	        return p
	    effective_k = d * (k - 1) + 1 if d > 1 else k
	    return effective_k // 2


	class Conv(nn.Module):
	    """Convolution -> batch norm -> SiLU; the unit the other modules in this file are built from.

	    Args:
	        c1: Input channels.
	        c2: Output channels.
	        k: Kernel size.
	        s: Stride.
	        p: Explicit padding; ``None`` lets ``autopad`` derive it from ``k`` and ``d``.
	        g: Convolution groups.
	        d: Dilation.
	    """

	    def __init__(self, c1: int, c2: int, k: int = 1, s: int = 1, p: int | None = None, g: int = 1, d: int = 1):
	        super().__init__()
	        # The convolution carries no bias term; it is followed directly by BatchNorm2d.
	        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
	        self.bn = nn.BatchNorm2d(c2)
	        self.act = nn.SiLU()

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Apply the convolution, then batch norm, then the activation."""
	        return self.act(self.bn(self.conv(x)))


	class Concat(nn.Module):
	    """Join a list of tensors along one dimension (dimension 1 by default)."""

	    def __init__(self, dim: int = 1):
	        super().__init__()
	        self.d = dim  # dimension handed to torch.cat

	    def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
	        """Concatenate ``xs`` along ``self.d``."""
	        return torch.cat(xs, self.d)


	# ---------------------------------------------------------------------------
	# CSP family (ports from ultralytics/nn/modules/block.py)
	# ---------------------------------------------------------------------------


	class Bottleneck(nn.Module):
	    """Two stacked Convs with an optional identity shortcut.

	    The first Conv maps ``c1`` to a hidden width of ``int(c2 * e)`` and the
	    second maps that to ``c2``. The input is added back only when ``shortcut``
	    is set and ``c1 == c2``.

	    Args:
	        c1: Input channels.
	        c2: Output channels.
	        shortcut: Request the residual connection.
	        g: Groups of the second Conv.
	        k: Kernel sizes of the first and second Conv.
	        e: Hidden-width ratio relative to ``c2``.
	    """

	    def __init__(
	        self,
	        c1: int,
	        c2: int,
	        shortcut: bool = True,
	        g: int = 1,
	        k: tuple[int, int] = (3, 3),
	        e: float = 0.5,
	    ):
	        super().__init__()
	        c_ = int(c2 * e)
	        self.cv1 = Conv(c1, c_, k[0], 1)
	        self.cv2 = Conv(c_, c2, k[1], 1, g=g)
	        self.add = shortcut and c1 == c2

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Run both Convs and add the input back when the shortcut is active."""
	        y = self.cv2(self.cv1(x))
	        return x + y if self.add else y


	class C3k(nn.Module):
	    """CSP block: a Bottleneck-stack branch and a plain 1x1 branch fused by a third 1x1 Conv.

	    Args:
	        c1: Input channels.
	        c2: Output channels.
	        n: Number of Bottlenecks in the stacked branch.
	        shortcut: Forwarded to every Bottleneck.
	        g: Forwarded to every Bottleneck.
	        e: Hidden-width ratio relative to ``c2``.
	        k: Kernel size used for both Convs of every Bottleneck.
	    """

	    def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
	        super().__init__()
	        c_ = int(c2 * e)
	        self.cv1 = Conv(c1, c_, 1, 1)
	        self.cv2 = Conv(c1, c_, 1, 1)
	        self.cv3 = Conv(2 * c_, c2, 1)
	        # e=1.0 keeps each Bottleneck at the full hidden width.
	        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Concatenate the stacked branch with the 1x1 branch and project to ``c2``."""
	        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))


	class Attention(nn.Module):
	    """Multi-head self-attention over the spatial positions of a feature map.

	    One 1x1 Conv produces queries, keys and values for all heads at once. Per
	    head, queries and keys are ``key_dim`` wide and values are ``head_dim``
	    wide. A depthwise 3x3 Conv over the values (``pe``) is added to the
	    attention output before the final 1x1 projection. All three Convs have
	    their activation replaced by ``nn.Identity``.

	    Args:
	        dim: Channel count of the input and of the output.
	        num_heads: Number of heads; ``head_dim`` is ``dim // num_heads``.
	        attn_ratio: Ratio of ``key_dim`` to ``head_dim``.
	    """

	    def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
	        super().__init__()
	        self.num_heads = num_heads
	        self.head_dim = dim // num_heads
	        self.key_dim = int(self.head_dim * attn_ratio)
	        self.scale = self.key_dim**-0.5
	        nh_kd = self.key_dim * num_heads
	        h = dim + nh_kd * 2  # one fused projection: q + k + v widths
	        self.qkv = Conv(dim, h, 1)
	        self.qkv.act = nn.Identity()  # the source uses act=False here
	        self.proj = Conv(dim, dim, 1)
	        self.proj.act = nn.Identity()
	        self.pe = Conv(dim, dim, 3, 1, g=dim)
	        self.pe.act = nn.Identity()

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Attend across all ``H * W`` positions and return a tensor shaped like ``x``."""
	        B, C, H, W = x.shape
	        N = H * W
	        qkv = self.qkv(x)
	        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
	            [self.key_dim, self.key_dim, self.head_dim], dim=2
	        )
	        attn = (q.transpose(-2, -1) @ k) * self.scale
	        attn = attn.softmax(dim=-1)
	        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
	        return self.proj(x)


	class PSABlock(nn.Module):
	    """Attention followed by a two-Conv feed-forward stage, each optionally residual.

	    Args:
	        c: Channel count of the input and of the output.
	        attn_ratio: Forwarded to ``Attention``.
	        num_heads: Forwarded to ``Attention``.
	        shortcut: When true, both stages add their input to their output.
	    """

	    def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True):
	        super().__init__()
	        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
	        self.ffn = nn.Sequential(Conv(c, c * 2, 1), Conv(c * 2, c, 1))
	        self.ffn[1].act = nn.Identity()  # source: act=False on the second FFN conv
	        self.add = shortcut

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Run attention, then the feed-forward stage."""
	        x = x + self.attn(x) if self.add else self.attn(x)
	        x = x + self.ffn(x) if self.add else self.ffn(x)
	        return x


	class C2PSA(nn.Module):
	    """CSP shell around ``n`` stacked PSABlocks.

	    A 1x1 Conv expands the input to ``2 * c`` channels (``c = int(c1 * e)``),
	    which are split into two halves. Only the second half goes through the
	    PSABlock stack; both halves are then concatenated and projected back to
	    ``c1`` channels.

	    Args:
	        c1: Input channels.
	        c2: Output channels; must equal ``c1``.
	        n: Number of PSABlocks.
	        e: Hidden-width ratio relative to ``c1``.
	    """

	    def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
	        super().__init__()
	        assert c1 == c2, f"C2PSA requires c1 == c2, got c1={c1} and c2={c2}"
	        self.c = int(c1 * e)
	        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
	        self.cv2 = Conv(2 * self.c, c1, 1)
	        # One head per 64 channels, but never fewer than one: a bare `c // 64`
	        # is 0 for c < 64, and Attention would then divide by zero.
	        num_heads = max(self.c // 64, 1)
	        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=num_heads) for _ in range(n)))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Split, run attention on the second half, re-join and project."""
	        a, b = self.cv1(x).split((self.c, self.c), dim=1)
	        b = self.m(b)
	        return self.cv2(torch.cat((a, b), 1))


	class C3k2(nn.Module):
	    """The workhorse CSP block of YOLO26.

	    ``cv1`` produces ``2 * c`` channels (``c = int(c2 * e)``) that are chunked
	    into two halves. ``n`` inner blocks are chained starting from the second
	    half, every intermediate output is kept, and all ``2 + n`` pieces are
	    concatenated and projected to ``c2``.

	    The inner block is chosen in this order of precedence:
	      * ``attn=True``: a Bottleneck followed by a PSABlock.
	      * ``c3k=True``: a nested C3k with two Bottlenecks.
	      * otherwise: a plain Bottleneck.

	    Args:
	        c1: Input channels.
	        c2: Output channels.
	        n: Number of chained inner blocks.
	        c3k: Select the nested-C3k inner block (ignored when ``attn`` is set).
	        e: Hidden-width ratio relative to ``c2``.
	        attn: Select the Bottleneck + PSABlock inner block.
	        g: Groups forwarded to the inner blocks.
	        shortcut: Shortcut flag forwarded to the inner blocks.
	    """

	    def __init__(
	        self,
	        c1: int,
	        c2: int,
	        n: int = 1,
	        c3k: bool = False,
	        e: float = 0.5,
	        attn: bool = False,
	        g: int = 1,
	        shortcut: bool = True,
	    ):
	        super().__init__()
	        self.c = int(c2 * e)
	        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
	        self.cv2 = Conv((2 + n) * self.c, c2, 1)
	        if attn:
	            self.m = nn.ModuleList(
	                nn.Sequential(
	                    Bottleneck(self.c, self.c, shortcut, g),
	                    PSABlock(self.c, attn_ratio=0.5, num_heads=max(self.c // 64, 1)),
	                )
	                for _ in range(n)
	            )
	        elif c3k:
	            self.m = nn.ModuleList(C3k(self.c, self.c, 2, shortcut, g) for _ in range(n))
	        else:
	            self.m = nn.ModuleList(Bottleneck(self.c, self.c, shortcut, g) for _ in range(n))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Chain the inner blocks on the second half and fuse every intermediate."""
	        y = list(self.cv1(x).chunk(2, 1))
	        for m in self.m:
	            # Each block consumes the most recently produced piece.
	            y.append(m(y[-1]))
	        return self.cv2(torch.cat(y, 1))


	class SPPF(nn.Module):
	    """Spatial Pyramid Pooling, fast variant: one max-pool applied ``n`` times in series.

	    The reduced input and each successive pooling result are concatenated
	    (``n + 1`` tensors) and projected to ``c2``.

	    Args:
	        c1: Input channels.
	        c2: Output channels.
	        k: Max-pool kernel size (stride 1, padding ``k // 2``).
	        n: Number of serial pooling steps.
	        shortcut: Add the input to the output; only effective when ``c1 == c2``.
	    """

	    def __init__(self, c1: int, c2: int, k: int = 5, n: int = 3, shortcut: bool = False):
	        super().__init__()
	        c_ = c1 // 2
	        self.cv1 = Conv(c1, c_, 1, 1)
	        self.cv1.act = nn.Identity()  # source: act=False
	        self.cv2 = Conv(c_ * (n + 1), c2, 1, 1)
	        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
	        self.n = n
	        self.add = shortcut and c1 == c2

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Pool serially, fuse all levels, and optionally add the input back."""
	        y = [self.cv1(x)]
	        for _ in range(self.n):
	            y.append(self.m(y[-1]))
	        out = self.cv2(torch.cat(y, 1))
	        return out + x if self.add else out


	# ---------------------------------------------------------------------------
	# Detection head (port from ultralytics/nn/modules/head.py, end2end=True, reg_max=1)
	# ---------------------------------------------------------------------------


	class Detect(nn.Module):
	    """YOLO26 detection head: end2end (one2many + one2one), reg_max=1 (no DFL).

	    For each input feature level the head predicts ``4 * reg_max``
	    box-regression channels and ``nc`` class-logit channels.

	    Two structurally identical heads are built: ``cv2``/``cv3`` (one2many) and
	    deep copies of them, ``one2one_cv2``/``one2one_cv3`` (one2one). ``forward``
	    runs both; the one2one head receives detached features, so its loss cannot
	    propagate gradients into the layers that produced them.

	    Args:
	        nc: Number of classes.
	        reg_max: Must be 1; any other value trips the assertion.
	        ch: Channel count of each input feature level. Must be non-empty,
	            because the hidden widths are derived from ``ch[0]``.
	    """

	    def __init__(self, nc: int = 80, reg_max: int = 1, ch: tuple[int, ...] = ()):
	        super().__init__()
	        assert reg_max == 1, "this repro hardcodes reg_max=1 (the YOLO26 default)"
	        self.nc = nc
	        self.nl = len(ch)  # number of feature levels
	        self.reg_max = reg_max
	        self.no = nc + reg_max * 4  # output channels per location
	        # Placeholder; must be replaced with real strides before bias_init().
	        self.stride = torch.zeros(self.nl)

	        c2, c3 = max(16, ch[0] // 4, reg_max * 4), max(ch[0], min(nc, 100))

	        # Box-distance head: two 3x3 Convs + 1x1 -> 4*reg_max logits.
	        self.cv2 = nn.ModuleList(
	            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * reg_max, 1)) for x in ch
	        )
	        # Class head: the depthwise-separable variant (legacy v3/v5/v8 branch dropped).
	        self.cv3 = nn.ModuleList(
	            nn.Sequential(
	                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),  # depthwise 3x3 -> 1x1
	                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
	                nn.Conv2d(c3, nc, 1),
	            )
	            for x in ch
	        )
	        # DFL with reg_max=1 is the identity. Inlined; the DFL class is omitted.
	        self.dfl = nn.Identity()

	        # end2end is always True for YOLO26 — build the parallel one2one head.
	        import copy

	        self.one2one_cv2 = copy.deepcopy(self.cv2)
	        self.one2one_cv3 = copy.deepcopy(self.cv3)

	    # The upstream Detect uses `one2many`/`one2one` @property dicts so OBB/Pose can
	    # extend with extra branches. For detect-only we keep the same shape.
	    @property
	    def one2many(self):
	        """Keyword arguments selecting the one2many head for ``forward_head``."""
	        return dict(box_head=self.cv2, cls_head=self.cv3)

	    @property
	    def one2one(self):
	        """Keyword arguments selecting the one2one head for ``forward_head``."""
	        return dict(box_head=self.one2one_cv2, cls_head=self.one2one_cv3)

	    def forward_head(self, x: list[torch.Tensor], box_head: nn.ModuleList, cls_head: nn.ModuleList) -> dict:
	        """Run a single (one2many or one2one) head over all feature levels.

	        Returns:
	            A dict with ``boxes`` of shape ``(bs, 4 * reg_max, A)`` and ``scores``
	            of shape ``(bs, nc, A)``, where ``A`` is the total number of spatial
	            locations across levels, plus ``feats``, the input list itself.
	        """
	        bs = x[0].shape[0]
	        boxes = torch.cat([box_head[i](x[i]).view(bs, 4 * self.reg_max, -1) for i in range(self.nl)], dim=-1)
	        scores = torch.cat([cls_head[i](x[i]).view(bs, self.nc, -1) for i in range(self.nl)], dim=-1)
	        return dict(boxes=boxes, scores=scores, feats=x)

	    def forward(self, x: list[torch.Tensor]) -> dict:
	        """Training-mode forward: returns {"one2many": {...}, "one2one": {...}}.

	        Inference-time decode (anchors -> xyxy, sigmoid scores, topk postprocess)
	        is intentionally omitted — this repro is about the training data path.
	        """
	        preds_o2m = self.forward_head(x, **self.one2many)
	        # Detach so the one2one branch cannot back-propagate into the features.
	        x_detach = [xi.detach() for xi in x]
	        preds_o2o = self.forward_head(x_detach, **self.one2one)
	        return {"one2many": preds_o2m, "one2one": preds_o2o}

	    def bias_init(self) -> None:
	        """Set the final-conv biases of both heads (call once, after ``stride`` is set).

	        The box-conv bias becomes 2.0 and the cls-conv bias becomes
	        ``log(5 / nc / (640 / stride) ** 2)`` for the level's stride.

	        Raises:
	            ValueError: If ``stride`` does not hold one positive value per level.
	                The check runs before any bias is written, so a premature call
	                leaves the head untouched instead of half-initialised.
	        """
	        strides = torch.as_tensor(self.stride)
	        if strides.numel() != self.nl or not bool((strides > 0).all()):
	            raise ValueError(
	                "Detect.stride must hold one positive stride per feature level before bias_init() is called"
	            )
	        # In-place parameter writes are done under no_grad rather than through `.data`.
	        with torch.no_grad():
	            for heads in (self.one2many, self.one2one):
	                for i, (a, b) in enumerate(zip(heads["box_head"], heads["cls_head"])):
	                    a[-1].bias.fill_(2.0)
	                    b[-1].bias[: self.nc] = math.log(5 / self.nc / (640 / self.stride[i]) ** 2)


	# ---------------------------------------------------------------------------
	# YOLO26-small graph
	# ---------------------------------------------------------------------------


	class YOLO26s(nn.Module):
	    """Hardcoded YOLO26-s detect graph (no YAML, no parse_model).

	    Every channel width below is written out literally for scale s, and every
	    C3k2/C2PSA is built with ``n=1``. The stride notes follow from the number
	    of stride-2 Convs preceding each layer.

	    Layer reference (idx | module | note):
	       0   Conv(3, 32, k=3, s=2)                          # stride 2
	       1   Conv(32, 64, k=3, s=2)                         # stride 4
	       2   C3k2(64, 128, n=1, c3k=False, e=0.25)
	       3   Conv(128, 128, k=3, s=2)                       # stride 8
	       4   C3k2(128, 256, n=1, c3k=False, e=0.25)         <- saved (feeds Concat 15)
	       5   Conv(256, 256, k=3, s=2)                       # stride 16
	       6   C3k2(256, 256, n=1, c3k=True)                  <- saved (feeds Concat 12)
	       7   Conv(256, 512, k=3, s=2)                       # stride 32
	       8   C3k2(512, 512, n=1, c3k=True)
	       9   SPPF(512, 512, k=5, n=3, shortcut=True)
	      10   C2PSA(512, 512, n=1, e=0.5)                    <- saved (feeds Concat 21)
	      11   Upsample x2 nearest
	      12   Concat([11, 6])      -> 768
	      13   C3k2(768, 256, n=1, c3k=True)                  <- saved (feeds Concat 18)
	      14   Upsample x2 nearest
	      15   Concat([14, 4])      -> 512
	      16   C3k2(512, 128, n=1, c3k=True)                  -> Detect P3 (stride 8)
	      17   Conv(128, 128, k=3, s=2)
	      18   Concat([17, 13])     -> 384
	      19   C3k2(384, 256, n=1, c3k=True)                  -> Detect P4 (stride 16)
	      20   Conv(256, 256, k=3, s=2)
	      21   Concat([20, 10])     -> 768
	      22   C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)   -> Detect P5 (stride 32)
	      23   Detect(nc=nc, reg_max=1, ch=(128, 256, 512))

	    Args:
	        nc: Number of classes predicted by the head.
	    """

	    def __init__(self, nc: int = 80):
	        super().__init__()
	        self.nc = nc

	        # Backbone
	        self.conv0 = Conv(3, 32, k=3, s=2)
	        self.conv1 = Conv(32, 64, k=3, s=2)
	        self.c3k2_2 = C3k2(64, 128, n=1, c3k=False, e=0.25)
	        self.conv3 = Conv(128, 128, k=3, s=2)
	        self.c3k2_4 = C3k2(128, 256, n=1, c3k=False, e=0.25)
	        self.conv5 = Conv(256, 256, k=3, s=2)
	        self.c3k2_6 = C3k2(256, 256, n=1, c3k=True)
	        self.conv7 = Conv(256, 512, k=3, s=2)
	        self.c3k2_8 = C3k2(512, 512, n=1, c3k=True)
	        self.sppf = SPPF(512, 512, k=5, n=3, shortcut=True)
	        self.c2psa = C2PSA(512, 512, n=1, e=0.5)

	        # Neck (PAN-FPN)
	        self.up_a = nn.Upsample(scale_factor=2, mode="nearest")
	        self.c3k2_13 = C3k2(768, 256, n=1, c3k=True)
	        self.up_b = nn.Upsample(scale_factor=2, mode="nearest")
	        self.c3k2_16 = C3k2(512, 128, n=1, c3k=True)
	        self.conv17 = Conv(128, 128, k=3, s=2)
	        self.c3k2_19 = C3k2(384, 256, n=1, c3k=True)
	        self.conv20 = Conv(256, 256, k=3, s=2)
	        self.c3k2_22 = C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)

	        # Head
	        self.detect = Detect(nc=nc, reg_max=1, ch=(128, 256, 512))

	        self._init_stride_and_bias()

	    def _init_stride_and_bias(self) -> None:
	        """Measure the per-level stride with a dummy forward, then initialise the head biases.

	        A 256x256 zero image is pushed through backbone and neck in eval mode
	        without gradients; each level's stride is 256 divided by its feature
	        height. ``Detect.bias_init`` needs those strides, so it runs afterwards.
	        The module is put back into training mode if it was in it on entry.
	        """
	        was_training = self.training
	        self.eval()
	        with torch.no_grad():
	            feats = self._forward_features(torch.zeros(1, 3, 256, 256))
	            self.detect.stride = torch.tensor([256.0 / f.shape[-2] for f in feats])
	        self.detect.bias_init()
	        if was_training:
	            self.train()

	    def _forward_features(self, x: torch.Tensor) -> list[torch.Tensor]:
	        """Backbone + neck. Returns the three feature maps (P3, P4, P5) the head consumes."""
	        x = self.conv0(x)
	        x = self.conv1(x)
	        x = self.c3k2_2(x)
	        x = self.conv3(x)
	        x4 = self.c3k2_4(x)  # saved
	        x = self.conv5(x4)
	        x6 = self.c3k2_6(x)  # saved
	        x = self.conv7(x6)
	        x = self.c3k2_8(x)
	        x = self.sppf(x)
	        x10 = self.c2psa(x)  # saved

	        x = self.up_a(x10)
	        x = torch.cat([x, x6], 1)
	        x13 = self.c3k2_13(x)  # saved

	        x = self.up_b(x13)
	        x = torch.cat([x, x4], 1)
	        p3 = self.c3k2_16(x)  # Detect P3

	        x = self.conv17(p3)
	        x = torch.cat([x, x13], 1)
	        p4 = self.c3k2_19(x)  # Detect P4

	        x = self.conv20(p4)
	        x = torch.cat([x, x10], 1)
	        p5 = self.c3k2_22(x)  # Detect P5

	        return [p3, p4, p5]

	    def forward(self, x: torch.Tensor) -> dict:
	        """Return the Detect head's output dict for an image batch ``x``."""
	        feats = self._forward_features(x)
	        return self.detect(feats)
No results found