@wolfecameron
Created March 6, 2025 22:24
MoE block for an MoE-based decoder-only transformer model in PyTorch.
from torch import nn


class MoEBlock(nn.Module):
    def __init__(
        self,
        d,
        H,
        C,
        n_exp,
        top_k,
        use_noisy_top_k=True,
        capacity_factor=1.25,
        bias=False,
        dropout=0.2,
    ):
        """
        Arguments:
            d: size of embedding dimension
            H: number of attention heads
            C: maximum length of input sequences (in tokens)
            n_exp: the number of experts to create in the expert layer
            top_k: the number of active experts for each token
            use_noisy_top_k: whether to add noise to the router logits when selecting the top-k experts
            capacity_factor: used to compute expert capacity
            bias: whether or not to use bias in linear layers
            dropout: probability of dropout
        """
        super().__init__()
        # CausalSelfAttention and MOELayer are defined elsewhere in the accompanying code
        self.ln_1 = nn.LayerNorm(d)
        self.attn = CausalSelfAttention(d, H, C, bias, dropout)  # C is the max sequence length
        self.ln_2 = nn.LayerNorm(d)
        self.mlp = MOELayer(
            d,
            n_exp,
            top_k,
            use_noisy_top_k,
            capacity_factor,
            bias,
            dropout,
        )

    def forward(self, x):
        # pre-norm residual connections around attention and the MoE feed-forward layer
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
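
The snippet references MOELayer (and CausalSelfAttention) without defining them; they live elsewhere in the author's code. For context, below is a minimal sketch of what a noisy top-k MOELayer with this signature could look like. The 4*d expert MLP width, the learned softplus noise scale, and the choice to accept but not enforce capacity_factor are illustrative assumptions, not the gist author's implementation.

import torch
from torch import nn
import torch.nn.functional as F


class MOELayer(nn.Module):
    """Sketch of a noisy top-k MoE feed-forward layer (token-choice routing).

    Illustrative stand-in for the MOELayer used above. For brevity it accepts
    capacity_factor but does not enforce an expert capacity limit.
    """

    def __init__(self, d, n_exp, top_k, use_noisy_top_k=True,
                 capacity_factor=1.25, bias=False, dropout=0.2):
        super().__init__()
        self.top_k = top_k
        self.use_noisy_top_k = use_noisy_top_k
        self.capacity_factor = capacity_factor  # accepted but unused in this sketch

        # router produces one logit per expert for each token
        self.router = nn.Linear(d, n_exp, bias=False)
        # learned scale for the Gaussian routing noise (noisy top-k gating)
        self.noise = nn.Linear(d, n_exp, bias=False)

        # each expert is a standard two-layer feed-forward network
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d, 4 * d, bias=bias),
                nn.GELU(),
                nn.Linear(4 * d, d, bias=bias),
                nn.Dropout(dropout),
            )
            for _ in range(n_exp)
        ])

    def forward(self, x):
        B, T, d = x.shape
        tokens = x.view(B * T, d)                                   # [B*T, d]

        logits = self.router(tokens)                                # [B*T, n_exp]
        if self.use_noisy_top_k and self.training:
            # add Gaussian noise scaled by a learned, softplus-activated factor
            logits = logits + torch.randn_like(logits) * F.softplus(self.noise(tokens))

        # keep only the top-k experts per token and renormalize their weights
        topk_vals, topk_idx = logits.topk(self.top_k, dim=-1)       # [B*T, k]
        topk_weights = F.softmax(topk_vals, dim=-1)                 # [B*T, k]

        out = torch.zeros_like(tokens)
        for e, expert in enumerate(self.experts):
            # tokens that route to expert e in any of their top-k slots
            token_idx, slot_idx = (topk_idx == e).nonzero(as_tuple=True)
            if token_idx.numel() == 0:
                continue
            expert_out = expert(tokens[token_idx])                  # [n_e, d]
            out[token_idx] += topk_weights[token_idx, slot_idx].unsqueeze(-1) * expert_out

        return out.view(B, T, d)

With this sketch in place, the MoE feed-forward layer can be exercised on its own, e.g. MOELayer(d=256, n_exp=8, top_k=2) applied to a [batch, seq_len, 256] tensor returns a tensor of the same shape; MoEBlock additionally needs a CausalSelfAttention implementation.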