Skip to content

Instantly share code, notes, and snippets.

@CoffeeVampir3
Created May 19, 2026 20:40
Show Gist options
  • Select an option

  • Save CoffeeVampir3/7b8a029f659e811da2c65762818f1b09 to your computer and use it in GitHub Desktop.

Select an option

Save CoffeeVampir3/7b8a029f659e811da2c65762818f1b09 to your computer and use it in GitHub Desktop.
yolo
"""
YOLO26-small detect — modules + graph.
All modules are faithful ports of their upstream counterparts. The branches we
collapsed:
* Conv: dropped forward_fuse (deploy-time), dropped activation overrides.
* C3k2: kept BOTH the bottleneck and PSABlock paths — the PSABlock path is
used only at layer 22 (attn=True). Dropped forward_split.
* Detect: dropped legacy v3/v5/v8 head shape, dropped export/dynamic/format,
dropped postprocess/_inference/decode_bboxes (inference-only).
reg_max=1, so DFL is identity; we drop the DFL class entirely.
* Also dropped: everything fuse-related, MPS guards, etc.
The `YOLO26s` class hardcodes the 24-layer graph at scale s (width=0.5,
depth=0.5, max_channels=1024) — channel widths verified against
`parse_model(ultralytics/cfg/models/26/yolo26.yaml, scale='s')`.
"""
from __future__ import annotations
import math
import torch
import torch.nn as nn
from .ops import dist2bbox, make_anchors
# ---------------------------------------------------------------------------
# Basic building blocks (ports from ultralytics/nn/modules/conv.py)
# ---------------------------------------------------------------------------
def autopad(
    k: int | tuple[int, ...],
    p: int | tuple[int, ...] | None = None,
    d: int = 1,
) -> int | tuple[int, ...]:
    """Same-padding for a conv with kernel size ``k`` and dilation ``d``.

    Args:
        k: Kernel size. Either a single int, or a sequence with one int per
            spatial dimension (handled element-wise).
        p: Explicit padding. When it is not ``None`` it is returned unchanged.
        d: Dilation. For ``d > 1`` each kernel side is first widened to its
            effective size ``d * (k - 1) + 1``.

    Returns:
        ``p`` when given; otherwise half of the (effective) kernel size, as an
        int for an int ``k`` and as a tuple of ints for a sequence ``k``.
    """
    if p is not None:
        return p

    def _half(side: int) -> int:
        # Dilation spreads the taps apart; pad for the span they actually cover.
        effective = d * (side - 1) + 1 if d > 1 else side
        return effective // 2

    if isinstance(k, int):
        return _half(k)
    return tuple(_half(side) for side in k)
class Conv(nn.Module):
    """Bias-free ``Conv2d`` followed by ``BatchNorm2d`` and ``SiLU``.

    Args:
        c1: Input channels.
        c2: Output channels.
        k: Kernel size.
        s: Stride.
        p: Explicit padding; ``None`` lets ``autopad`` derive it from ``k``/``d``.
        g: Convolution groups.
        d: Dilation.

    The activation lives in ``self.act`` so callers can swap it out after
    construction (other blocks in this file replace it with ``nn.Identity``).
    """

    def __init__(self, c1: int, c2: int, k: int = 1, s: int = 1, p: int | None = None, g: int = 1, d: int = 1):
        super().__init__()
        padding = autopad(k, p, d)
        self.conv = nn.Conv2d(
            in_channels=c1,
            out_channels=c2,
            kernel_size=k,
            stride=s,
            padding=padding,
            dilation=d,
            groups=g,
            bias=False,  # the BatchNorm that follows supplies the shift term
        )
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply convolution, then batch norm, then the activation."""
        x = self.conv(x)
        x = self.bn(x)
        return self.act(x)
class Concat(nn.Module):
    """Join a list of tensors along one fixed dimension (channels by default)."""

    def __init__(self, dim: int = 1):
        super().__init__()
        # Dimension handed to ``torch.cat`` on every call.
        self.d = dim

    def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
        """Return ``xs`` concatenated along ``self.d``."""
        joined = torch.cat(xs, dim=self.d)
        return joined
# ---------------------------------------------------------------------------
# CSP family (ports from ultralytics/nn/modules/block.py)
# ---------------------------------------------------------------------------
class Bottleneck(nn.Module):
    """Two stacked ``Conv`` units with an optional identity shortcut.

    Args:
        c1: Input channels.
        c2: Output channels.
        shortcut: Request a residual connection; it is only used when
            ``c1 == c2``.
        g: Groups for the second conv.
        k: Kernel sizes of the first and second conv.
        e: Hidden width as a fraction of ``c2``.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        shortcut: bool = True,
        g: int = 1,
        k: tuple[int, int] = (3, 3),
        e: float = 0.5,
    ):
        super().__init__()
        hidden = int(c2 * e)
        k_first, k_second = k[0], k[1]
        self.cv1 = Conv(c1, hidden, k_first, 1)
        self.cv2 = Conv(hidden, c2, k_second, 1, g=g)
        # The sum below needs matching channel counts.
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run both convs; add the input back when the shortcut is active."""
        out = self.cv1(x)
        out = self.cv2(out)
        if self.add:
            return x + out
        return out
class C3k(nn.Module):
    """CSP block built from three 1x1 convs and a stack of ``n`` Bottlenecks.

    ``cv1`` feeds the Bottleneck stack, ``cv2`` is a parallel 1x1 branch on the
    same input, and ``cv3`` fuses the channel-concatenation of both.

    Args:
        c1: Input channels.
        c2: Output channels.
        n: Number of Bottlenecks in the stack.
        shortcut: Forwarded to each Bottleneck.
        g: Groups forwarded to each Bottleneck.
        e: Hidden width as a fraction of ``c2``.
        k: Kernel size used for both convs of every Bottleneck.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)
        # e=1.0 keeps every Bottleneck at the full hidden width.
        stack = [Bottleneck(hidden, hidden, shortcut, g, k=(k, k), e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Fuse the Bottleneck branch with the plain 1x1 branch."""
        deep = self.m(self.cv1(x))
        skip = self.cv2(x)
        return self.cv3(torch.cat((deep, skip), 1))
class Attention(nn.Module):
    """Multi-head self-attention over the spatial positions of a 4-D feature map.

    Queries, keys and values come from one fused 1x1 projection. A depthwise
    3x3 conv on the values (``pe``) is added to the attention output before the
    final 1x1 projection.

    Args:
        dim: Channel count of the input (and output).
        num_heads: Number of attention heads; ``dim // num_heads`` channels each.
        attn_ratio: Query/key width per head as a fraction of the head width.
    """

    def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.key_dim = int(self.head_dim * attn_ratio)
        self.scale = self.key_dim**-0.5
        qk_width = self.key_dim * num_heads
        # Fused projection width: v takes `dim`, q and k take `qk_width` each.
        fused_width = dim + 2 * qk_width
        self.qkv = Conv(dim, fused_width, 1)
        self.qkv.act = nn.Identity()  # the source uses act=False here
        self.proj = Conv(dim, dim, 1)
        self.proj.act = nn.Identity()
        self.pe = Conv(dim, dim, 3, 1, g=dim)
        self.pe.act = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend across all ``H * W`` positions; output has the input's shape."""
        B, C, H, W = x.shape
        tokens = H * W
        per_head = self.key_dim * 2 + self.head_dim
        fused = self.qkv(x).view(B, self.num_heads, per_head, tokens)
        q, k, v = fused.split([self.key_dim, self.key_dim, self.head_dim], dim=2)
        # (B, heads, tokens, tokens): row = query position, column = key position.
        weights = (q.transpose(-2, -1) @ k) * self.scale
        weights = weights.softmax(dim=-1)
        attended = (v @ weights.transpose(-2, -1)).view(B, C, H, W)
        positional = self.pe(v.reshape(B, C, H, W))
        return self.proj(attended + positional)
class PSABlock(nn.Module):
    """Attention followed by a two-conv feed-forward, each optionally residual.

    Args:
        c: Channel count (input and output).
        attn_ratio: Forwarded to ``Attention``.
        num_heads: Forwarded to ``Attention``.
        shortcut: When true, both sub-layers add their input back.
    """

    def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True):
        super().__init__()
        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
        expand = Conv(c, c * 2, 1)
        squeeze = Conv(c * 2, c, 1)
        squeeze.act = nn.Identity()  # source: act=False on the second FFN conv
        self.ffn = nn.Sequential(expand, squeeze)
        self.add = shortcut

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply attention then the FFN, with residuals when enabled."""
        if not self.add:
            return self.ffn(self.attn(x))
        x = x + self.attn(x)
        return x + self.ffn(x)
class C2PSA(nn.Module):
    """CSP shell around ``n`` stacked PSABlocks.

    ``cv1`` projects the input to ``2 * self.c`` channels, which are split into
    two halves; the second half runs through the PSABlock stack, then both
    halves are concatenated and projected back to ``c1`` channels by ``cv2``.

    Args:
        c1: Input channels.
        c2: Output channels; must equal ``c1``.
        n: Number of PSABlocks.
        e: Width of each half as a fraction of ``c1``.

    Raises:
        ValueError: If ``c1 != c2``.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
        super().__init__()
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if c1 != c2:
            raise ValueError(f"C2PSA requires c1 == c2, got c1={c1}, c2={c2}")
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)
        # One head per 64 channels, but never zero: `self.c // 64` alone is 0
        # for narrow blocks and makes Attention divide by zero. Same clamp as
        # the attention path of C3k2.
        num_heads = max(self.c // 64, 1)
        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=num_heads) for _ in range(n)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend on one half of the projected features and fuse with the other."""
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        b = self.m(b)
        return self.cv2(torch.cat((a, b), 1))
class C3k2(nn.Module):
    """The workhorse CSP block of YOLO26.

    ``cv1`` output is split into two halves; ``n`` inner blocks are chained on
    the second half and every intermediate is kept; all pieces are concatenated
    and projected by ``cv2``.

    The inner block is selected by the flags, in this priority:
      * ``attn=True``: a ``Bottleneck`` followed by a ``PSABlock``.
      * ``c3k=True``: a nested ``C3k`` holding two Bottlenecks.
      * otherwise: a plain ``Bottleneck``.

    Args:
        c1: Input channels.
        c2: Output channels.
        n: Number of chained inner blocks.
        c3k: Use ``C3k`` inner blocks (ignored when ``attn`` is true).
        e: Width of each half as a fraction of ``c2``.
        attn: Use Bottleneck + PSABlock inner blocks.
        g: Groups forwarded to the Bottlenecks / C3k.
        shortcut: Shortcut flag forwarded to the Bottlenecks / C3k.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        n: int = 1,
        c3k: bool = False,
        e: float = 0.5,
        attn: bool = False,
        g: int = 1,
        shortcut: bool = True,
    ):
        super().__init__()
        self.c = int(c2 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        # cv2 sees the two halves plus one output per inner block.
        self.cv2 = Conv((2 + n) * self.c, c2, 1)
        width = self.c

        def make_inner() -> nn.Module:
            """Build one inner block of the kind selected by the flags."""
            if attn:
                return nn.Sequential(
                    Bottleneck(width, width, shortcut, g),
                    PSABlock(width, attn_ratio=0.5, num_heads=max(width // 64, 1)),
                )
            if c3k:
                return C3k(width, width, 2, shortcut, g)
            return Bottleneck(width, width, shortcut, g)

        self.m = nn.ModuleList(make_inner() for _ in range(n))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Chain the inner blocks on the second half and fuse every stage."""
        pieces = list(self.cv1(x).chunk(2, 1))
        for inner in self.m:
            # Each block consumes the newest piece, so the chain is sequential.
            pieces.append(inner(pieces[-1]))
        return self.cv2(torch.cat(pieces, 1))
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast: one max-pool applied ``n`` times in series.

    ``cv1`` (no activation) halves the channels, the same stride-1 max-pool is
    applied repeatedly, and ``cv2`` fuses the un-pooled map with all ``n``
    pooled maps.

    Args:
        c1: Input channels.
        c2: Output channels.
        k: Max-pool kernel size (padding ``k // 2``, stride 1).
        n: Number of serial pooling steps.
        shortcut: Add the input to the output; only used when ``c1 == c2``.
    """

    def __init__(self, c1: int, c2: int, k: int = 5, n: int = 3, shortcut: bool = False):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv1.act = nn.Identity()  # source: act=False
        self.cv2 = Conv(hidden * (n + 1), c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.n = n
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pool repeatedly, concatenate every stage, project, optionally add ``x``."""
        current = self.cv1(x)
        stages = [current]
        for _ in range(self.n):
            current = self.m(current)
            stages.append(current)
        fused = self.cv2(torch.cat(stages, 1))
        if self.add:
            return fused + x
        return fused
# ---------------------------------------------------------------------------
# Detection head (port from ultralytics/nn/modules/head.py, end2end=True, reg_max=1)
# ---------------------------------------------------------------------------
class Detect(nn.Module):
    """YOLO26 detection head: end2end (one2many + one2one), reg_max=1 (no DFL).

    For each input feature level the head predicts:
      * ``4 * reg_max`` box-regression channels (per the original author:
        left, top, right, bottom distances from the anchor),
      * ``nc`` per-class logits.

    A second, structurally identical "one2one" head is created as a deep copy
    of the first. ``forward`` runs both heads; the one2one head is fed detached
    features, so gradients from it cannot reach the layers that produced them.

    Args:
        nc: Number of classes.
        reg_max: Must be 1 in this port.
        ch: Channel count of each input feature level; must be non-empty.

    Raises:
        ValueError: If ``reg_max != 1`` or ``ch`` is empty.
    """

    def __init__(self, nc: int = 80, reg_max: int = 1, ch: tuple[int, ...] = ()):
        super().__init__()
        # Explicit raises instead of `assert` (stripped under `python -O`) and
        # instead of the bare IndexError that `ch[0]` gives for an empty `ch`.
        if reg_max != 1:
            raise ValueError("this repro hardcodes reg_max=1 (the YOLO26 default)")
        if len(ch) == 0:
            raise ValueError("Detect needs at least one input level: `ch` must be non-empty")
        self.nc = nc
        self.nl = len(ch)
        self.reg_max = reg_max
        self.no = nc + reg_max * 4
        # Placeholder; the owning model must overwrite this before bias_init().
        self.stride = torch.zeros(self.nl)
        c2, c3 = max(16, ch[0] // 4, reg_max * 4), max(ch[0], min(nc, 100))
        # Box-distance head: two 3x3 Convs + 1x1 -> 4*reg_max logits.
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * reg_max, 1)) for x in ch
        )
        # Class head: the modern depthwise-separable variant (legacy v3/v5/v8 branch dropped).
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),  # DWConv -> 1x1
                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, nc, 1),
            )
            for x in ch
        )
        # DFL with reg_max=1 is the identity. Inlined; the DFL class is omitted.
        self.dfl = nn.Identity()
        # end2end is always True for YOLO26 — build the parallel one2one head.
        import copy

        self.one2one_cv2 = copy.deepcopy(self.cv2)
        self.one2one_cv3 = copy.deepcopy(self.cv3)

    # The upstream Detect uses `one2many`/`one2one` @property dicts so OBB/Pose can
    # extend with extra branches. For detect-only we keep the same shape.
    @property
    def one2many(self) -> dict:
        """Box and class module lists of the one2many head."""
        return dict(box_head=self.cv2, cls_head=self.cv3)

    @property
    def one2one(self) -> dict:
        """Box and class module lists of the one2one head."""
        return dict(box_head=self.one2one_cv2, cls_head=self.one2one_cv3)

    def forward_head(self, x: list[torch.Tensor], box_head: nn.ModuleList, cls_head: nn.ModuleList) -> dict:
        """Run a single (one2many or one2one) head over all pyramid levels.

        Returns:
            A dict with ``boxes`` of shape ``(bs, 4 * reg_max, A)`` and
            ``scores`` of shape ``(bs, nc, A)``, where ``A`` is the total number
            of spatial positions over all levels, plus ``feats`` (the input
            list, passed through unchanged).
        """
        bs = x[0].shape[0]
        # Flatten each level's spatial dims and join the levels along that axis.
        boxes = torch.cat([box_head[i](x[i]).view(bs, 4 * self.reg_max, -1) for i in range(self.nl)], dim=-1)
        scores = torch.cat([cls_head[i](x[i]).view(bs, self.nc, -1) for i in range(self.nl)], dim=-1)
        return dict(boxes=boxes, scores=scores, feats=x)

    def forward(self, x: list[torch.Tensor]) -> dict:
        """Training-mode forward: returns {"one2many": {...}, "one2one": {...}}.

        Inference-time decode (anchors -> xyxy, sigmoid scores, topk postprocess)
        is intentionally omitted — this repro is about the training data path.
        """
        preds_o2m = self.forward_head(x, **self.one2many)
        # Detach so the one2one head cannot backpropagate into the features.
        x_detach = [xi.detach() for xi in x]
        preds_o2o = self.forward_head(x_detach, **self.one2one)
        return {"one2many": preds_o2m, "one2one": preds_o2o}

    def bias_init(self) -> None:
        """Set the final-conv biases of both heads (call once, after ``stride`` is set).

        The last box conv gets bias 2.0. The last class conv gets
        ``log(5 / nc / (640 / stride) ** 2)`` per level (the original author's
        note: assumes ~5 positives per image at 640px input).

        Raises:
            ValueError: If any per-level stride is not positive, i.e. ``stride``
                still holds its zero placeholder.
        """
        for i in range(self.nl):
            # A zero stride would otherwise surface as an opaque
            # "math domain error" from math.log below.
            if float(self.stride[i]) <= 0:
                raise ValueError(
                    f"Detect.stride[{i}] must be positive before bias_init(); got {float(self.stride[i])}"
                )
        for heads in (self.one2many, self.one2one):
            for i, (a, b) in enumerate(zip(heads["box_head"], heads["cls_head"])):
                a[-1].bias.data[:] = 2.0
                b[-1].bias.data[: self.nc] = math.log(5 / self.nc / (640 / self.stride[i]) ** 2)
# ---------------------------------------------------------------------------
# YOLO26-small graph
# ---------------------------------------------------------------------------
class YOLO26s(nn.Module):
    """Hardcoded YOLO26-s detect graph (no YAML, no parse_model).

    Channel widths at scale s (width=0.5, max_channels=1024, divisor=8) and
    depth=0.5 (so every C3k2/C2PSA with n=2 in the YAML becomes n=1 here).
    The original author reports the widths as verified by running upstream
    parse_model with scale='s'.

    Layer reference (idx | module | note). The P-level tags follow the
    cumulative stride of the stride-2 convs below:
         0 Conv(3, 32, k=3, s=2)                       # P1/2
         1 Conv(32, 64, k=3, s=2)                      # P2/4
         2 C3k2(64, 128, n=1, c3k=False, e=0.25)
         3 Conv(128, 128, k=3, s=2)                    # P3/8
         4 C3k2(128, 256, n=1, c3k=False, e=0.25)      <- saved (feeds Concat 15)
         5 Conv(256, 256, k=3, s=2)                    # P4/16
         6 C3k2(256, 256, n=1, c3k=True)               <- saved (feeds Concat 12)
         7 Conv(256, 512, k=3, s=2)                    # P5/32
         8 C3k2(512, 512, n=1, c3k=True)
         9 SPPF(512, 512, k=5, n=3, shortcut=True)
        10 C2PSA(512, 512, n=1, e=0.5)                 <- saved (feeds Concat 21)
        11 Upsample x2 nearest
        12 Concat([11, 6]) -> 768
        13 C3k2(768, 256, n=1, c3k=True)               <- saved (feeds Concat 18)
        14 Upsample x2 nearest
        15 Concat([14, 4]) -> 512
        16 C3k2(512, 128, n=1, c3k=True)               -> Detect P3
        17 Conv(128, 128, k=3, s=2)
        18 Concat([17, 13]) -> 384
        19 C3k2(384, 256, n=1, c3k=True)               -> Detect P4
        20 Conv(256, 256, k=3, s=2)
        21 Concat([20, 10]) -> 768
        22 C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True) -> Detect P5
        23 Detect(nc=80, reg_max=1, ch=(128, 256, 512))

    Args:
        nc: Number of classes predicted by the detection head.
    """

    def __init__(self, nc: int = 80):
        super().__init__()
        self.nc = nc
        # Backbone
        self.conv0 = Conv(3, 32, k=3, s=2)
        self.conv1 = Conv(32, 64, k=3, s=2)
        self.c3k2_2 = C3k2(64, 128, n=1, c3k=False, e=0.25)
        self.conv3 = Conv(128, 128, k=3, s=2)
        self.c3k2_4 = C3k2(128, 256, n=1, c3k=False, e=0.25)
        self.conv5 = Conv(256, 256, k=3, s=2)
        self.c3k2_6 = C3k2(256, 256, n=1, c3k=True)
        self.conv7 = Conv(256, 512, k=3, s=2)
        self.c3k2_8 = C3k2(512, 512, n=1, c3k=True)
        self.sppf = SPPF(512, 512, k=5, n=3, shortcut=True)
        self.c2psa = C2PSA(512, 512, n=1, e=0.5)
        # Neck (PAN-FPN)
        self.up_a = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_13 = C3k2(768, 256, n=1, c3k=True)
        self.up_b = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_16 = C3k2(512, 128, n=1, c3k=True)
        self.conv17 = Conv(128, 128, k=3, s=2)
        self.c3k2_19 = C3k2(384, 256, n=1, c3k=True)
        self.conv20 = Conv(256, 256, k=3, s=2)
        self.c3k2_22 = C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)
        # Head
        self.detect = Detect(nc=nc, reg_max=1, ch=(128, 256, 512))
        self._init_stride_and_bias()

    def _init_stride_and_bias(self) -> None:
        """Mirror DetectionModel._init in ultralytics/nn/tasks.py:410-415.

        Run a dummy forward at 256x256 to discover the per-level stride, then
        initialize Detect biases (which require strides to be set).
        """
        probe_size = 256
        was_training = self.training
        # Build the probe where the weights live so this also works if it is
        # re-run after the model has been moved or cast.
        ref = next(self.parameters())
        dummy = torch.zeros(1, 3, probe_size, probe_size, device=ref.device, dtype=ref.dtype)
        self.eval()
        try:
            with torch.no_grad():
                feats = self._forward_features(dummy)
        finally:
            # Restore the caller's mode even if the probe forward raises.
            self.train(was_training)
        self.detect.stride = torch.tensor([float(probe_size) / f.shape[-2] for f in feats])
        self.detect.bias_init()

    def _forward_features(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Backbone + neck. Returns the three feature maps (P3, P4, P5) the head consumes."""
        x = self.conv0(x)
        x = self.conv1(x)
        x = self.c3k2_2(x)
        x = self.conv3(x)
        x4 = self.c3k2_4(x)  # saved
        x = self.conv5(x4)
        x6 = self.c3k2_6(x)  # saved
        x = self.conv7(x6)
        x = self.c3k2_8(x)
        x = self.sppf(x)
        x10 = self.c2psa(x)  # saved
        # Top-down path: upsample and merge with the shallower backbone maps.
        x = self.up_a(x10)
        x = torch.cat([x, x6], 1)
        x13 = self.c3k2_13(x)  # saved
        x = self.up_b(x13)
        x = torch.cat([x, x4], 1)
        p3 = self.c3k2_16(x)  # Detect P3
        # Bottom-up path: downsample and merge with the saved deeper maps.
        x = self.conv17(p3)
        x = torch.cat([x, x13], 1)
        p4 = self.c3k2_19(x)  # Detect P4
        x = self.conv20(p4)
        x = torch.cat([x, x10], 1)
        p5 = self.c3k2_22(x)  # Detect P5
        return [p3, p4, p5]

    def forward(self, x: torch.Tensor) -> dict:
        """Run backbone + neck, then the detection head; returns the head's dict."""
        feats = self._forward_features(x)
        return self.detect(feats)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment