@wolfecameron
Created March 6, 2025 22:24
MoE block for an MoE-based decoder-only transformer model in PyTorch.
from torch import nn


class MoEBlock(nn.Module):
    def __init__(
        self,
        d,
        H,
        C,
        n_exp,
        top_k,
        use_noisy_top_k=True,
        capacity_factor=1.25,
        bias=False,
        dropout=0.2,
    ):
        """
        Arguments:
            d: size of embedding dimension
            H: number of attention heads
            C: maximum length of input sequences (in tokens)
            n_exp: the number of experts to create in the expert layer
            top_k: the number of active experts for each token
            use_noisy_top_k: whether to add noise to the router logits when selecting the top-k experts
            capacity_factor: used to compute expert capacity
            bias: whether or not to use bias in linear layers
            dropout: probability of dropout
        """
        super().__init__()
        # CausalSelfAttention and MOELayer are defined elsewhere in the accompanying code
        self.ln_1 = nn.LayerNorm(d)
        self.attn = CausalSelfAttention(d, H, C, bias, dropout)  # C is the max sequence length
        self.ln_2 = nn.LayerNorm(d)
        self.mlp = MOELayer(
            d,
            n_exp,
            top_k,
            use_noisy_top_k,
            capacity_factor,
            bias,
            dropout,
        )

    def forward(self, x):
        # pre-norm residual connections around attention and the MoE feed-forward layer
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
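
The snippet references MOELayer (and CausalSelfAttention) without defining them; they live elsewhere in the author's code. For context, below is a minimal sketch of what a noisy top-k MOELayer with this signature could look like. The 4*d expert MLP width, the learned softplus noise scale, and the choice to accept but not enforce capacity_factor are illustrative assumptions, not the gist author's implementation.

import torch
from torch import nn
import torch.nn.functional as F


class MOELayer(nn.Module):
    """Sketch of a noisy top-k MoE feed-forward layer (token-choice routing).

    Illustrative stand-in for the MOELayer used above. For brevity it accepts
    capacity_factor but does not enforce an expert capacity limit.
    """

    def __init__(self, d, n_exp, top_k, use_noisy_top_k=True,
                 capacity_factor=1.25, bias=False, dropout=0.2):
        super().__init__()
        self.top_k = top_k
        self.use_noisy_top_k = use_noisy_top_k
        self.capacity_factor = capacity_factor  # accepted but unused in this sketch

        # router produces one logit per expert for each token
        self.router = nn.Linear(d, n_exp, bias=False)
        # learned scale for the Gaussian routing noise (noisy top-k gating)
        self.noise = nn.Linear(d, n_exp, bias=False)

        # each expert is a standard two-layer feed-forward network
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d, 4 * d, bias=bias),
                nn.GELU(),
                nn.Linear(4 * d, d, bias=bias),
                nn.Dropout(dropout),
            )
            for _ in range(n_exp)
        ])

    def forward(self, x):
        B, T, d = x.shape
        tokens = x.view(B * T, d)                                   # [B*T, d]

        logits = self.router(tokens)                                # [B*T, n_exp]
        if self.use_noisy_top_k and self.training:
            # add Gaussian noise scaled by a learned, softplus-activated factor
            logits = logits + torch.randn_like(logits) * F.softplus(self.noise(tokens))

        # keep only the top-k experts per token and renormalize their weights
        topk_vals, topk_idx = logits.topk(self.top_k, dim=-1)       # [B*T, k]
        topk_weights = F.softmax(topk_vals, dim=-1)                 # [B*T, k]

        out = torch.zeros_like(tokens)
        for e, expert in enumerate(self.experts):
            # tokens that route to expert e in any of their top-k slots
            token_idx, slot_idx = (topk_idx == e).nonzero(as_tuple=True)
            if token_idx.numel() == 0:
                continue
            expert_out = expert(tokens[token_idx])                  # [n_e, d]
            out[token_idx] += topk_weights[token_idx, slot_idx].unsqueeze(-1) * expert_out

        return out.view(B, T, d)

With this sketch in place, the MoE feed-forward layer can be exercised on its own, e.g. MOELayer(d=256, n_exp=8, top_k=2) applied to a [batch, seq_len, 256] tensor returns a tensor of the same shape; MoEBlock additionally needs a CausalSelfAttention implementation.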