burn_nn/activation/swiglu.rs

use burn_core as burn;

use burn::config::Config;
use burn::module::{Content, DisplaySettings, Initializer, Module, ModuleDisplay};
use burn::tensor::activation::silu;
use burn::tensor::{Tensor, backend::Backend};

use crate::{Linear, LinearConfig, LinearLayout};

/// Configuration to create a [SwiGlu](SwiGlu) activation layer using the [init function](SwiGluConfig::init).
#[derive(Config, Debug)]
pub struct SwiGluConfig {
    /// The size of the input features.
    pub d_input: usize,
    /// The size of the output features.
    pub d_output: usize,
    /// Whether a bias should be applied during the linear transformations.
    /// Defaults to `false`, the usual choice in SwiGLU implementations.
    #[config(default = false)]
    pub bias: bool,
    /// The type of function used to initialize the linear layer parameters.
    #[config(
        default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}"
    )]
    pub initializer: Initializer,
    /// The layout in which the linear parameters are stored.
    #[config(default = "LinearLayout::Row")]
    pub layout: LinearLayout,
}

/// Applies the SwiGLU or Swish Gated Linear Unit to the input tensor.
/// The SwiGLU activation function is defined as:
/// `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`
///
/// Should be created with [SwiGluConfig].
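///
/// # Example
///
/// A minimal usage sketch mirroring the tests below; the `burn::nn` re-export
/// path and the `Backend`/`device` bindings are assumed for illustration.
///
/// ```ignore
/// use burn::nn::{SwiGlu, SwiGluConfig};
/// use burn::tensor::Tensor;
///
/// // 3 input features, 3 output features, no bias (the default).
/// let swiglu: SwiGlu<Backend> = SwiGluConfig::new(3, 3).init(&device);
///
/// // Any-rank input whose last dimension is `d_input`; the output keeps the
/// // same shape except for the last dimension, which becomes `d_output`.
/// let input = Tensor::<Backend, 2>::from_data([[1.0, 2.0, 3.0]], &device);
/// let output = swiglu.forward(input); // shape [1, 3]
/// ```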
#[derive(Module, Debug)]
#[module(custom_display)]
pub struct SwiGlu<B: Backend> {
    /// The inner linear layer, whose output is passed through the Swish (SiLU) activation,
    /// with `d_input` input features and `d_output` output features.
    pub linear_inner: Linear<B>,
    /// The outer linear layer, used for the element-wise gating multiplication,
    /// with `d_input` input features and `d_output` output features.
    pub linear_outer: Linear<B>,
}

impl<B: Backend> ModuleDisplay for SwiGlu<B> {
    fn custom_settings(&self) -> Option<DisplaySettings> {
        DisplaySettings::new()
            .with_new_line_after_attribute(false)
            .optional()
    }

    fn custom_content(&self, content: Content) -> Option<Content> {
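        // The inner and outer layers share the same shape and bias setting, so
        // the displayed values are read from the inner layer alone.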
        let [d_input, d_output] = self.linear_inner.weight.shape().dims();
        content
            .add("d_input", &d_input)
            .add("d_output", &d_output)
            .add("bias", &self.linear_inner.bias.is_some())
            .optional()
    }
}

impl SwiGluConfig {
    /// Initialize a new [SwiGLU](SwiGlu) activation layer.
    pub fn init<B: Backend>(&self, device: &B::Device) -> SwiGlu<B> {
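        // Both projections are built from the same configuration; after
        // initialization they differ only in the parameters that were drawn.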
        SwiGlu {
            linear_inner: LinearConfig::new(self.d_input, self.d_output)
                .with_bias(self.bias)
                .with_initializer(self.initializer.clone())
                .with_layout(self.layout)
                .init(device),
            linear_outer: LinearConfig::new(self.d_input, self.d_output)
                .with_bias(self.bias)
                .with_initializer(self.initializer.clone())
                .with_layout(self.layout)
                .init(device),
        }
    }
}

impl<B: Backend> SwiGlu<B> {
    /// Applies the Swish Gated Linear Unit to the input tensor.
    ///
    /// # Shapes
    ///
    /// - input: `[batch_size, seq_length, d_input]`
    /// - output: `[batch_size, seq_length, d_output]`
    pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
        // Gate path: Swish(W_inner * x + b_inner).
        let x = self.linear_inner.forward(input.clone());
        let x = silu(x);
        // Multiply element-wise with the outer projection: W_outer * x + b_outer.
        x.mul(self.linear_outer.forward(input))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::TestBackend;
    use burn::tensor::{Tolerance, ops::FloatElem};
    type FT = FloatElem<TestBackend>;

    #[test]
    fn test_swiglu_forward_no_bias() {
        let device = Default::default();
        TestBackend::seed(&device, 0);

        let config = SwiGluConfig::new(3, 3).with_initializer(Initializer::Constant { value: 0.5 });
        let swiglu = config.init(&device);
        let input =
            Tensor::<TestBackend, 2>::from_data([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], &device);
        let output = swiglu.forward(input);
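        // With constant weights of 0.5 and no bias, each linear output feature is
        // 0.5 * (sum of the input row). For [1, 2, 3] that is 3.0, giving
        // silu(3.0) * 3.0 ≈ 8.5732; for [4, 5, 6] it is 7.5, giving
        // silu(7.5) * 7.5 ≈ 56.2189.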
        let expected_output = Tensor::<TestBackend, 2>::from_data(
            [[8.5732, 8.5732, 8.5732], [56.2189, 56.2189, 56.2189]],
            &device,
        );
        output
            .to_data()
            .assert_approx_eq::<FT>(&expected_output.to_data(), Tolerance::default());
    }

    #[test]
    fn test_swiglu_forward_with_bias() {
        let device = Default::default();
        TestBackend::seed(&device, 0);

        let config = SwiGluConfig::new(3, 3)
            .with_bias(true)
            .with_initializer(Initializer::Constant { value: 0.5 });
        let swiglu = config.init(&device);
        let input =
            Tensor::<TestBackend, 2>::from_data([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], &device);
        let output = swiglu.forward(input);
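        // The constant bias of 0.5 shifts each pre-activation to 0.5 * sum + 0.5:
        // for [1, 2, 3] that is 3.5, giving silu(3.5) * 3.5 ≈ 11.8909; for
        // [4, 5, 6] it is 8.0, giving silu(8.0) * 8.0 ≈ 63.9785.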
        let expected_output = Tensor::<TestBackend, 2>::from_data(
            [[11.8909, 11.8909, 11.8909], [63.9785, 63.9785, 63.9785]],
            &device,
        );
        output
            .to_data()
            .assert_approx_eq::<FT>(&expected_output.to_data(), Tolerance::default());
    }

    #[test]
    fn display() {
        let config = SwiGluConfig::new(3, 5);
        let swiglu = config.init::<TestBackend>(&Default::default());

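        // 30 parameters: two 3 x 5 weight matrices (15 each) and no bias terms.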
        assert_eq!(
            alloc::format!("{swiglu}"),
            "SwiGlu {d_input: 3, d_output: 5, bias: false, params: 30}"
        );
    }
}