Expert layer for a MoE-based transformer.
"""
Based upon ColossalAI OpenMoE
"""
import torch
from torch import nn
class MLPExperts(nn.Module):
def __init__(
self,
d,
n_exp=8,
bias=False,
dropout=0.2,
):
"""
Arguments:
d: size of embedding dimension
n_exp: the number of experts to create in the expert layer
bias: whether or not to use bias in linear layers
dropout: probability of dropout
"""
super().__init__()
self.bias = bias
self.c_fc = nn.Parameter(torch.empty(n_exp, d, 4 * d))
self.c_proj = nn.Parameter(torch.empty(n_exp, 4 * d, d))
self.fc_bias = nn.Parameter(torch.empty(n_exp, 1, 4 * d)) if self.bias else None
self.proj_bias = nn.Parameter(torch.empty(n_exp, 1, d)) if self.bias else None
self.gelu = nn.GELU()
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = torch.bmm(x, self.c_fc)
if self.bias:
x += self.fc_bias
x = self.gelu(x)
x = torch.bmm(x, self.c_proj)
if self.bias:
x += self.proj_bias
x = self.dropout(x)
return x
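
A minimal usage sketch (not part of the original gist): it assumes tokens have already been routed and grouped by expert, so the input has shape (n_exp, tokens_per_expert, d). The dimensions and the weight initialization below are illustrative.

# Illustrative example: feed one batch of pre-grouped tokens through the experts.
experts = MLPExperts(d=64, n_exp=8)

# the layer leaves its parameters uninitialized (torch.empty),
# so initialize them here just for this demo
for p in experts.parameters():
    nn.init.normal_(p, mean=0.0, std=0.02)

x = torch.randn(8, 16, 64)  # (n_exp, tokens_per_expert, d), shapes chosen arbitrarily
out = experts(x)
print(out.shape)            # torch.Size([8, 16, 64])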