Created
May 19, 2026 20:40
-
-
Save CoffeeVampir3/7b8a029f659e811da2c65762818f1b09 to your computer and use it in GitHub Desktop.
yolo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| YOLO26-small detect — modules + graph. | |
| All modules are faithful ports of their upstream counterparts. The branches we | |
| collapsed: | |
| * Conv: dropped forward_fuse (deploy-time), dropped activation overrides. | |
| * C3k2: kept BOTH the bottleneck and PSABlock paths — the PSABlock path is | |
| used only at layer 22 (attn=True). Dropped forward_split. | |
| * Detect: dropped legacy v3/v5/v8 head shape, dropped export/dynamic/format, | |
| dropped postprocess/_inference/decode_bboxes (inference-only). | |
| reg_max=1, so DFL is identity; we drop the DFL class entirely. | |
| * Everything fuse-related, MPS guards, etc. | |
| The `YOLO26s` class hardcodes the 24-layer graph at scale s (width=0.5, | |
| depth=0.5, max_channels=1024) — channel widths verified against | |
| `parse_model(ultralytics/cfg/models/26/yolo26.yaml, scale='s')`. | |
| """ | |
| from __future__ import annotations | |
| import math | |
| import torch | |
| import torch.nn as nn | |
| from .ops import dist2bbox, make_anchors | |
| # --------------------------------------------------------------------------- | |
| # Basic building blocks (ports from ultralytics/nn/modules/conv.py) | |
| # --------------------------------------------------------------------------- | |
def autopad(k: int, p: int | None = None, d: int = 1) -> int:
    """Return the padding for a conv with kernel size ``k`` and dilation ``d``.

    An explicit ``p`` is returned untouched. Otherwise the result is half of the
    effective kernel size, where a dilation ``d > 1`` widens the kernel to
    ``d * (k - 1) + 1``.
    """
    if p is not None:
        return p
    effective_k = d * (k - 1) + 1 if d > 1 else k
    return effective_k // 2
class Conv(nn.Module):
    """Basic conv unit: bias-free Conv2d, followed by BatchNorm2d, followed by SiLU.

    Padding comes from ``autopad(k, p, d)``; ``g`` is the group count and ``d``
    the dilation of the convolution. Other modules in this file swap ``act``
    for ``nn.Identity()`` when they need a conv without activation.
    """

    def __init__(self, c1: int, c2: int, k: int = 1, s: int = 1, p: int | None = None, g: int = 1, d: int = 1):
        super().__init__()
        padding = autopad(k, p, d)
        self.conv = nn.Conv2d(
            c1,
            c2,
            kernel_size=k,
            stride=s,
            padding=padding,
            dilation=d,
            groups=g,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply conv, then batch norm, then the activation."""
        out = self.conv(x)
        out = self.bn(out)
        return self.act(out)
class Concat(nn.Module):
    """Join a list of tensors along one dimension (the channel dim by default)."""

    def __init__(self, dim: int = 1):
        super().__init__()
        self.d = dim  # dimension handed to torch.cat

    def forward(self, xs: list[torch.Tensor]) -> torch.Tensor:
        """Concatenate ``xs`` along ``self.d``."""
        joined = torch.cat(xs, dim=self.d)
        return joined
| # --------------------------------------------------------------------------- | |
| # CSP family (ports from ultralytics/nn/modules/block.py) | |
| # --------------------------------------------------------------------------- | |
class Bottleneck(nn.Module):
    """Two stacked convs (kernel sizes ``k[0]`` then ``k[1]``) with an optional skip.

    ``e`` scales the hidden width relative to ``c2`` and ``g`` is the group
    count of the second conv. The input is added back onto the output only
    when ``shortcut`` is set and ``c1 == c2``.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        shortcut: bool = True,
        g: int = 1,
        k: tuple[int, int] = (3, 3),
        e: float = 0.5,
    ):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, k[0], 1)
        self.cv2 = Conv(hidden, c2, k[1], 1, g=g)
        # A residual sum is only shape-compatible when channels are unchanged.
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run both convs; add the input back when the skip is enabled."""
        out = self.cv1(x)
        out = self.cv2(out)
        if self.add:
            return x + out
        return out
class C3k(nn.Module):
    """CSP block built from three 1x1 convs around a stack of ``n`` Bottlenecks.

    ``cv1`` feeds the Bottleneck stack (each using ``k`` x ``k`` kernels and
    ``e=1.0``), ``cv2`` is a parallel 1x1 branch, and ``cv3`` projects the
    channel-concatenation of both branches to ``c2`` channels.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, shortcut: bool = True, g: int = 1, e: float = 0.5, k: int = 3):
        super().__init__()
        hidden = int(c2 * e)
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        self.cv3 = Conv(2 * hidden, c2, 1)
        stack = [Bottleneck(hidden, hidden, shortcut, g, k=(k, k), e=1.0) for _ in range(n)]
        self.m = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Concatenate the Bottleneck branch with the bypass branch and project."""
        deep = self.m(self.cv1(x))
        bypass = self.cv2(x)
        merged = torch.cat((deep, bypass), 1)
        return self.cv3(merged)
class Attention(nn.Module):
    """Multi-head self-attention over the spatial positions of a feature map.

    A single 1x1 conv (``qkv``) produces queries, keys and values for all
    heads at once. Per head, queries and keys have ``key_dim`` channels and
    values have ``head_dim`` channels, where ``head_dim = dim // num_heads``
    and ``key_dim = int(head_dim * attn_ratio)``. A depthwise 3x3 conv (``pe``)
    applied to the values is added to the attention output before the final
    1x1 projection (``proj``). None of the three convs uses an activation.

    Args:
        dim: Number of input (and output) channels.
        num_heads: Number of attention heads; must be positive and divide ``dim``.
        attn_ratio: Ratio of the query/key width to the per-head value width.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide ``dim``.
    """

    def __init__(self, dim: int, num_heads: int = 8, attn_ratio: float = 0.5):
        super().__init__()
        # forward() reshapes the fused projection into (heads, 2*key_dim + head_dim)
        # and the values back into `dim` channels; both only work when
        # num_heads * head_dim == dim. Fail here rather than on an opaque view error.
        if num_heads < 1 or dim % num_heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by a positive num_heads ({num_heads})")
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.key_dim = int(self.head_dim * attn_ratio)
        self.scale = self.key_dim**-0.5
        nh_kd = self.key_dim * num_heads
        h = dim + nh_kd * 2  # one fused projection: q + k + v widths
        self.qkv = Conv(dim, h, 1)
        self.qkv.act = nn.Identity()  # the source uses act=False here
        self.proj = Conv(dim, dim, 1)
        self.proj.act = nn.Identity()
        self.pe = Conv(dim, dim, 3, 1, g=dim)  # depthwise: one group per channel
        self.pe.act = nn.Identity()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to a ``(B, C, H, W)`` tensor; the output has the same shape."""
        B, C, H, W = x.shape
        N = H * W  # number of spatial positions attended over
        qkv = self.qkv(x)
        # Per head: q and k are (B, heads, key_dim, N), v is (B, heads, head_dim, N).
        q, k, v = qkv.view(B, self.num_heads, self.key_dim * 2 + self.head_dim, N).split(
            [self.key_dim, self.key_dim, self.head_dim], dim=2
        )
        # (B, heads, N, N) scaled similarities, normalised over the last dim.
        attn = (q.transpose(-2, -1) @ k) * self.scale
        attn = attn.softmax(dim=-1)
        # Weighted sum of values, folded back to a feature map, plus the
        # depthwise-conv term computed from the values themselves.
        x = (v @ attn.transpose(-2, -1)).view(B, C, H, W) + self.pe(v.reshape(B, C, H, W))
        return self.proj(x)
class PSABlock(nn.Module):
    """Attention followed by a two-conv feed-forward stage, each optionally residual.

    The feed-forward stage widens to ``2 * c`` channels with a 1x1 conv and
    projects back to ``c`` with a second 1x1 conv that has no activation.
    When ``shortcut`` is true both stages add their input to their output.
    """

    def __init__(self, c: int, attn_ratio: float = 0.5, num_heads: int = 4, shortcut: bool = True):
        super().__init__()
        self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
        widen = Conv(c, c * 2, 1)
        narrow = Conv(c * 2, c, 1)
        narrow.act = nn.Identity()  # source: act=False on the second FFN conv
        self.ffn = nn.Sequential(widen, narrow)
        self.add = shortcut

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the attention stage, then the feed-forward stage."""
        attended = self.attn(x)
        if self.add:
            attended = x + attended
        fed = self.ffn(attended)
        if self.add:
            fed = attended + fed
        return fed
class C2PSA(nn.Module):
    """CSP shell around ``n`` stacked PSABlocks.

    ``cv1`` projects the input to ``2 * c`` channels (``c = int(c1 * e)``), which
    are split into two halves of ``c`` channels. The second half runs through
    the PSABlock stack; both halves are then concatenated and projected back
    to ``c1`` channels by ``cv2``.

    Args:
        c1: Input channels.
        c2: Output channels; must equal ``c1``.
        n: Number of stacked PSABlocks.
        e: Ratio of the per-half width to ``c1``.
    """

    def __init__(self, c1: int, c2: int, n: int = 1, e: float = 0.5):
        super().__init__()
        assert c1 == c2, f"C2PSA requires c1 == c2, got c1={c1}, c2={c2}"
        self.c = int(c1 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv(2 * self.c, c1, 1)
        # One head per 64 channels, but never fewer than one: a plain `c // 64`
        # is 0 for narrow blocks. C3k2 in this file applies the same floor.
        num_heads = max(self.c // 64, 1)
        self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=num_heads) for _ in range(n)))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split, attend on the second half, concatenate and project."""
        a, b = self.cv1(x).split((self.c, self.c), dim=1)
        b = self.m(b)
        return self.cv2(torch.cat((a, b), 1))
class C3k2(nn.Module):
    """CSP block that chains ``n`` inner blocks and keeps every intermediate.

    ``cv1`` projects to ``2 * c`` channels (``c = int(c2 * e)``), which are split
    into two halves. Each inner block consumes the most recent tensor in the
    list, its output is appended, and ``cv2`` projects the concatenation of
    all ``2 + n`` tensors to ``c2`` channels.

    The inner block type is chosen in this order:
        * ``attn=True``: a Bottleneck followed by a PSABlock (takes priority
          over ``c3k``).
        * ``c3k=True``: a nested C3k holding two Bottlenecks.
        * otherwise: a plain Bottleneck.
    """

    def __init__(
        self,
        c1: int,
        c2: int,
        n: int = 1,
        c3k: bool = False,
        e: float = 0.5,
        attn: bool = False,
        g: int = 1,
        shortcut: bool = True,
    ):
        super().__init__()
        self.c = int(c2 * e)
        self.cv1 = Conv(c1, 2 * self.c, 1, 1)
        self.cv2 = Conv((2 + n) * self.c, c2, 1)

        width = self.c
        inner = []
        for _ in range(n):
            if attn:
                block = nn.Sequential(
                    Bottleneck(width, width, shortcut, g),
                    PSABlock(width, attn_ratio=0.5, num_heads=max(width // 64, 1)),
                )
            elif c3k:
                block = C3k(width, width, 2, shortcut, g)
            else:
                block = Bottleneck(width, width, shortcut, g)
            inner.append(block)
        self.m = nn.ModuleList(inner)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split the projection, chain the inner blocks, concatenate and project."""
        parts = list(self.cv1(x).chunk(2, 1))
        for block in self.m:
            # Each block reads the newest tensor, i.e. the previous block's output.
            parts.append(block(parts[-1]))
        return self.cv2(torch.cat(parts, 1))
class SPPF(nn.Module):
    """Fast spatial pyramid pooling: one max-pool applied ``n`` times in series.

    ``cv1`` (no activation) halves the channels, the same stride-1 ``k`` x ``k``
    max-pool is applied ``n`` times with every intermediate kept, and ``cv2``
    projects the concatenation of the ``n + 1`` maps to ``c2`` channels. The
    input is added to the output only when ``shortcut`` is set and ``c1 == c2``.
    """

    def __init__(self, c1: int, c2: int, k: int = 5, n: int = 3, shortcut: bool = False):
        super().__init__()
        hidden = c1 // 2
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv1.act = nn.Identity()  # source: act=False
        self.cv2 = Conv(hidden * (n + 1), c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
        self.n = n
        self.add = shortcut and c1 == c2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Pool serially, concatenate all stages, project, and optionally add the input."""
        current = self.cv1(x)
        stages = [current]
        for _ in range(self.n):
            current = self.m(current)
            stages.append(current)
        out = self.cv2(torch.cat(stages, 1))
        if self.add:
            return out + x
        return out
| # --------------------------------------------------------------------------- | |
| # Detection head (port from ultralytics/nn/modules/head.py, end2end=True, reg_max=1) | |
| # --------------------------------------------------------------------------- | |
class Detect(nn.Module):
    """YOLO26 detection head: end2end (one2many + one2one), reg_max=1 (no DFL).

    For each input feature level it predicts:
        * ``4 * reg_max`` channels of box regression,
        * ``nc`` channels of per-class logits.

    A parallel one2one head with identical structure is built by deep-copying
    the one2many head. ``forward`` always runs BOTH heads; the features are
    detached before entering the one2one branch, so gradients from that branch
    do not reach the layers that produced the features.

    Args:
        nc: Number of classes.
        reg_max: Must be 1 (enforced by an assert).
        ch: Channel count of each input feature level; must be non-empty
            because ``ch[0]`` sizes the hidden widths.
    """

    def __init__(self, nc: int = 80, reg_max: int = 1, ch: tuple[int, ...] = ()):
        super().__init__()
        assert reg_max == 1, "this repro hardcodes reg_max=1 (the YOLO26 default)"
        self.nc = nc
        self.nl = len(ch)  # number of feature levels
        self.reg_max = reg_max
        self.no = nc + reg_max * 4  # outputs per anchor
        # Placeholder strides; the owning model overwrites them before bias_init().
        self.stride = torch.zeros(self.nl)
        # Hidden widths of the box branch (c2) and the class branch (c3).
        c2, c3 = max(16, ch[0] // 4, reg_max * 4), max(ch[0], min(nc, 100))
        # Box-distance head: two 3x3 Convs + 1x1 -> 4*reg_max logits.
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * reg_max, 1)) for x in ch
        )
        # Class head: the modern depthwise-separable variant (legacy v3/v5/v8 branch dropped).
        self.cv3 = nn.ModuleList(
            nn.Sequential(
                nn.Sequential(Conv(x, x, 3, g=x), Conv(x, c3, 1)),  # DWConv -> 1x1
                nn.Sequential(Conv(c3, c3, 3, g=c3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, nc, 1),
            )
            for x in ch
        )
        # DFL with reg_max=1 is the identity. Inlined; the DFL class is omitted.
        self.dfl = nn.Identity()
        # end2end is always True for YOLO26 — build the parallel one2one head.
        import copy

        self.one2one_cv2 = copy.deepcopy(self.cv2)
        self.one2one_cv3 = copy.deepcopy(self.cv3)

    # The upstream Detect uses `one2many`/`one2one` @property dicts so OBB/Pose can
    # extend with extra branches. For detect-only we keep the same shape.
    @property
    def one2many(self):
        """Keyword arguments for ``forward_head`` selecting the one2many branch."""
        return dict(box_head=self.cv2, cls_head=self.cv3)

    @property
    def one2one(self):
        """Keyword arguments for ``forward_head`` selecting the one2one branch."""
        return dict(box_head=self.one2one_cv2, cls_head=self.one2one_cv3)

    def forward_head(self, x: list[torch.Tensor], box_head: nn.ModuleList, cls_head: nn.ModuleList) -> dict:
        """Run a single (one2many or one2one) head over all pyramid levels.

        Returns a dict with ``boxes`` of shape ``(bs, 4 * reg_max, A)`` and
        ``scores`` of shape ``(bs, nc, A)``, where ``A`` is the total number of
        spatial positions summed over levels, plus ``feats`` (the input list).
        """
        bs = x[0].shape[0]
        # Flatten each level's spatial dims and join the levels along the last dim.
        boxes = torch.cat([box_head[i](x[i]).view(bs, 4 * self.reg_max, -1) for i in range(self.nl)], dim=-1)
        scores = torch.cat([cls_head[i](x[i]).view(bs, self.nc, -1) for i in range(self.nl)], dim=-1)
        return dict(boxes=boxes, scores=scores, feats=x)

    def forward(self, x: list[torch.Tensor]) -> dict:
        """Training-mode forward: returns {"one2many": {...}, "one2one": {...}}.

        Inference-time decode (anchors -> xyxy, sigmoid scores, topk postprocess)
        is intentionally omitted — this repro is about the training data path.
        """
        preds_o2m = self.forward_head(x, **self.one2many)
        # Detach so the one2one branch cannot backpropagate into the feature extractor.
        x_detach = [xi.detach() for xi in x]
        preds_o2o = self.forward_head(x_detach, **self.one2one)
        return {"one2many": preds_o2m, "one2one": preds_o2o}

    def bias_init(self) -> None:
        """Initialise the biases of the final convs of both heads.

        Must be called after ``self.stride`` has been set to the real per-level
        strides. Box-conv bias is set to 2.0. Cls-conv bias is set to
        ``log(5 / nc / (640 / stride) ** 2)`` for the level's stride.

        Raises:
            ValueError: If any stride is still zero (strides not initialised).
        """
        # A zero stride would make the log argument 0 and fail with an opaque
        # "math domain error" after some biases were already overwritten; check
        # everything up front so no partial initialisation happens.
        if any(s == 0 for s in self.stride):
            raise ValueError("Detect.stride must be set to non-zero per-level strides before calling bias_init()")
        for heads in (self.one2many, self.one2one):
            for i, (a, b) in enumerate(zip(heads["box_head"], heads["cls_head"])):
                a[-1].bias.data[:] = 2.0
                b[-1].bias.data[: self.nc] = math.log(5 / self.nc / (640 / self.stride[i]) ** 2)
| # --------------------------------------------------------------------------- | |
| # YOLO26-small graph | |
| # --------------------------------------------------------------------------- | |
class YOLO26s(nn.Module):
    """Hardcoded YOLO26-s detect graph (no YAML, no parse_model).

    Channel widths at scale s (width=0.5, max_channels=1024, divisor=8) and
    depth=0.5 (so every C3k2/C2PSA with n=2 in the YAML becomes n=1 here).
    Per the original author's note, the widths were checked against upstream
    parse_model with scale='s'.

    Layer reference (idx | module | notes). The P-levels follow from counting
    the stride-2 convs: layers 0, 1, 3, 5 and 7 each halve the resolution.
        0  Conv(3, 32, k=3, s=2)                      # P1/2
        1  Conv(32, 64, k=3, s=2)                     # P2/4
        2  C3k2(64, 128, n=1, c3k=False, e=0.25)
        3  Conv(128, 128, k=3, s=2)                   # P3/8
        4  C3k2(128, 256, n=1, c3k=False, e=0.25)     <- saved (feeds Concat 15)
        5  Conv(256, 256, k=3, s=2)                   # P4/16
        6  C3k2(256, 256, n=1, c3k=True)              <- saved (feeds Concat 12)
        7  Conv(256, 512, k=3, s=2)                   # P5/32
        8  C3k2(512, 512, n=1, c3k=True)
        9  SPPF(512, 512, k=5, n=3, shortcut=True)
        10 C2PSA(512, 512, n=1, e=0.5)                <- saved (feeds Concat 21)
        11 Upsample x2 nearest
        12 Concat([11, 6]) -> 768
        13 C3k2(768, 256, n=1, c3k=True)              <- saved (feeds Concat 18)
        14 Upsample x2 nearest
        15 Concat([14, 4]) -> 512
        16 C3k2(512, 128, n=1, c3k=True)              -> Detect P3
        17 Conv(128, 128, k=3, s=2)
        18 Concat([17, 13]) -> 384
        19 C3k2(384, 256, n=1, c3k=True)              -> Detect P4
        20 Conv(256, 256, k=3, s=2)
        21 Concat([20, 10]) -> 768
        22 C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True) -> Detect P5
        23 Detect(nc=80, reg_max=1, ch=(128, 256, 512))

    Args:
        nc: Number of classes predicted by the detection head.
    """

    def __init__(self, nc: int = 80):
        super().__init__()
        self.nc = nc
        # Backbone
        self.conv0 = Conv(3, 32, k=3, s=2)
        self.conv1 = Conv(32, 64, k=3, s=2)
        self.c3k2_2 = C3k2(64, 128, n=1, c3k=False, e=0.25)
        self.conv3 = Conv(128, 128, k=3, s=2)
        self.c3k2_4 = C3k2(128, 256, n=1, c3k=False, e=0.25)
        self.conv5 = Conv(256, 256, k=3, s=2)
        self.c3k2_6 = C3k2(256, 256, n=1, c3k=True)
        self.conv7 = Conv(256, 512, k=3, s=2)
        self.c3k2_8 = C3k2(512, 512, n=1, c3k=True)
        self.sppf = SPPF(512, 512, k=5, n=3, shortcut=True)
        self.c2psa = C2PSA(512, 512, n=1, e=0.5)
        # Neck (PAN-FPN)
        self.up_a = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_13 = C3k2(768, 256, n=1, c3k=True)
        self.up_b = nn.Upsample(scale_factor=2, mode="nearest")
        self.c3k2_16 = C3k2(512, 128, n=1, c3k=True)
        self.conv17 = Conv(128, 128, k=3, s=2)
        self.c3k2_19 = C3k2(384, 256, n=1, c3k=True)
        self.conv20 = Conv(256, 256, k=3, s=2)
        self.c3k2_22 = C3k2(768, 512, n=1, c3k=True, e=0.5, attn=True)
        # Head
        self.detect = Detect(nc=nc, reg_max=1, ch=(128, 256, 512))
        self._init_stride_and_bias()

    def _init_stride_and_bias(self) -> None:
        """Mirror DetectionModel._init in ultralytics/nn/tasks.py:410-415.

        Run a dummy forward at 256x256 to discover the per-level stride, then
        initialize Detect biases (which require strides to be set).

        The forward runs in eval mode under ``no_grad`` so BatchNorm running
        statistics are not touched. The previous train/eval mode is restored
        even if the dummy forward raises.
        """
        was_training = self.training
        # Build the dummy on the parameters' device/dtype so this also works if
        # it is re-run after the model has been moved or cast.
        ref = next(self.parameters())
        dummy = torch.zeros(1, 3, 256, 256, device=ref.device, dtype=ref.dtype)
        self.eval()
        try:
            with torch.no_grad():
                feats = self._forward_features(dummy)
            # Stride of a level = input height / feature-map height.
            self.detect.stride = torch.tensor([256.0 / f.shape[-2] for f in feats])
            self.detect.bias_init()
        finally:
            if was_training:
                self.train()

    def _forward_features(self, x: torch.Tensor) -> list[torch.Tensor]:
        """Backbone + neck. Returns the three feature maps (P3, P4, P5) the head consumes."""
        # Backbone
        x = self.conv0(x)
        x = self.conv1(x)
        x = self.c3k2_2(x)
        x = self.conv3(x)
        x4 = self.c3k2_4(x)  # saved
        x = self.conv5(x4)
        x6 = self.c3k2_6(x)  # saved
        x = self.conv7(x6)
        x = self.c3k2_8(x)
        x = self.sppf(x)
        x10 = self.c2psa(x)  # saved
        # Top-down path: upsample and fuse with the saved backbone maps.
        x = self.up_a(x10)
        x = torch.cat([x, x6], 1)
        x13 = self.c3k2_13(x)  # saved
        x = self.up_b(x13)
        x = torch.cat([x, x4], 1)
        p3 = self.c3k2_16(x)  # Detect P3
        # Bottom-up path: downsample and fuse with the saved neck maps.
        x = self.conv17(p3)
        x = torch.cat([x, x13], 1)
        p4 = self.c3k2_19(x)  # Detect P4
        x = self.conv20(p4)
        x = torch.cat([x, x10], 1)
        p5 = self.c3k2_22(x)  # Detect P5
        return [p3, p4, p5]

    def forward(self, x: torch.Tensor) -> dict:
        """Run backbone, neck and head; returns the Detect output dict."""
        feats = self._forward_features(x)
        return self.detect(feats)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment