MoE block for a decoder-only transformer model in PyTorch.
from torch import nn

# NOTE: CausalSelfAttention and MOELayer are assumed to be defined elsewhere
# (they are not part of this gist).

class MoEBlock(nn.Module):
    def __init__(
        self,
        d,
        H,
        C,
        n_exp,
        top_k,
        use_noisy_top_k=True,
        capacity_factor=1.25,
        bias=False,
        dropout=0.2,
    ):
        """
        Arguments:
        d: size of embedding dimension
        H: number of attention heads
        C: maximum length of input sequences (in tokens)
        n_exp: the number of experts to create in the expert layer
        top_k: the number of active experts for each token
        use_noisy_top_k: whether to add noise to the router logits when selecting experts
        capacity_factor: used to compute expert capacity
        bias: whether or not to use bias in linear layers
        dropout: probability of dropout
        """
        super().__init__()
        self.ln_1 = nn.LayerNorm(d)
        self.attn = CausalSelfAttention(d, H, C, bias, dropout)  # C (max sequence length), not an undefined T
        self.ln_2 = nn.LayerNorm(d)
        self.mlp = MOELayer(
            d,
            n_exp,
            top_k,
            use_noisy_top_k,
            capacity_factor,
            bias,
            dropout,
        )

    def forward(self, x):
        # pre-norm residual connections around attention and the MoE feed-forward layer
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
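
A minimal usage sketch, assuming CausalSelfAttention and MOELayer are defined elsewhere; the hyperparameter values below are illustrative only and not taken from the gist.

# Usage sketch (illustrative hyperparameters; requires CausalSelfAttention and MOELayer).
import torch

d, H, C = 256, 8, 128          # embedding dim, attention heads, max sequence length
block = MoEBlock(d=d, H=H, C=C, n_exp=8, top_k=2)

x = torch.randn(4, C, d)       # batch of 4 sequences of C tokens, each a d-dim embedding
out = block(x)                 # output has the same shape as the input
assert out.shape == (4, C, d)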