burn_core/nn/swiglu.rs

use crate as burn;

use crate::config::Config;
use crate::module::{Content, DisplaySettings, Module, ModuleDisplay};
use crate::tensor::activation::silu;
use crate::tensor::{Tensor, backend::Backend};

use super::{Initializer, Linear, LinearConfig};

/// Configuration to create a [SwiGlu](SwiGlu) activation layer using the [init function](SwiGluConfig::init).
#[derive(Config, Debug)]
pub struct SwiGluConfig {
    /// The size of the input features.
    pub d_input: usize,
    /// The size of the output features.
    pub d_output: usize,
    /// If a bias should be applied during the linear transformations. Defaults to `false`,
    /// which is the common choice in SwiGLU implementations.
    #[config(default = false)]
    pub bias: bool,
    /// The type of function used to initialize the linear layer parameters.
    #[config(
        default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}"
    )]
    pub initializer: Initializer,
}

/// Applies the SwiGLU or Swish Gated Linear Unit to the input tensor.
/// The SwiGLU activation function is defined as:
/// `SwiGLU(x) = Swish(W_inner * x + b_inner) * (W_outer * x + b_outer)`
///
/// Should be created with [SwiGluConfig].
#[derive(Module, Debug)]
#[module(custom_display)]
pub struct SwiGlu<B: Backend> {
    /// The inner linear layer, whose output is passed through the Swish (SiLU) activation,
    /// with `d_input` input features and `d_output` output features.
    pub linear_inner: Linear<B>,
    /// The outer linear layer, whose output gates the activated inner branch by element-wise
    /// multiplication, with `d_input` input features and `d_output` output features.
    pub linear_outer: Linear<B>,
}

impl<B: Backend> ModuleDisplay for SwiGlu<B> {
    fn custom_settings(&self) -> Option<DisplaySettings> {
        DisplaySettings::new()
            .with_new_line_after_attribute(false)
            .optional()
    }

    fn custom_content(&self, content: Content) -> Option<Content> {
        let [d_input, d_output] = self.linear_inner.weight.shape().dims();
        content
            .add("d_input", &d_input)
            .add("d_output", &d_output)
            .add("bias", &self.linear_inner.bias.is_some())
            .optional()
    }
}

impl SwiGluConfig {
    /// Initialize a new [SwiGLU](SwiGlu) activation layer.
    pub fn init<B: Backend>(&self, device: &B::Device) -> SwiGlu<B> {
        SwiGlu {
            linear_inner: LinearConfig::new(self.d_input, self.d_output)
                .with_bias(self.bias)
                .with_initializer(self.initializer.clone())
                .init(device),
            linear_outer: LinearConfig::new(self.d_input, self.d_output)
                .with_bias(self.bias)
                .with_initializer(self.initializer.clone())
                .init(device),
        }
    }
}

impl<B: Backend> SwiGlu<B> {
    /// Applies the Swish Gated Linear Unit to the input tensor.
    ///
    /// # Shapes
    ///
    /// - input: `[batch_size, seq_length, d_input]`
    /// - output: `[batch_size, seq_length, d_output]`
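    ///
    /// # Example
    ///
    /// A minimal usage sketch (illustrative only, hence marked `ignore`); `MyBackend`
    /// stands in for any concrete [Backend] implementation.
    ///
    /// ```rust,ignore
    /// let device = Default::default();
    /// let swiglu = SwiGluConfig::new(64, 256).init::<MyBackend>(&device);
    /// let input = Tensor::<MyBackend, 3>::zeros([8, 32, 64], &device);
    /// let output = swiglu.forward(input); // shape: [8, 32, 256]
    /// ```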
    pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
        // Gate: the SiLU-activated inner projection is multiplied element-wise
        // by the outer projection of the same input.
        let x = self.linear_inner.forward(input.clone());
        let x = silu(x);
        x.mul(self.linear_outer.forward(input))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::TestBackend;
    use burn_tensor::{Tolerance, ops::FloatElem};
    type FT = FloatElem<TestBackend>;

    #[test]
    fn test_swiglu_forward_no_bias() {
        TestBackend::seed(0);
        let device = Default::default();
        let config = SwiGluConfig::new(3, 3).with_initializer(Initializer::Constant { value: 0.5 });
        let swiglu = config.init(&device);
        let input =
            Tensor::<TestBackend, 2>::from_data([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], &device);
        let output = swiglu.forward(input);
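        // With every weight fixed at 0.5 and no bias, both linear branches produce
        // 0.5 * (1 + 2 + 3) = 3.0 for the first row and 0.5 * (4 + 5 + 6) = 7.5 for
        // the second, so each output element is silu(3.0) * 3.0 ≈ 8.5732 and
        // silu(7.5) * 7.5 ≈ 56.2189 respectively.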
        let expected_output = Tensor::<TestBackend, 2>::from_data(
            [[8.5732, 8.5732, 8.5732], [56.2189, 56.2189, 56.2189]],
            &device,
        );
        output
            .to_data()
            .assert_approx_eq::<FT>(&expected_output.to_data(), Tolerance::default());
    }

    #[test]
    fn test_swiglu_forward_with_bias() {
        TestBackend::seed(0);
        let device = Default::default();
        let config = SwiGluConfig::new(3, 3)
            .with_bias(true)
            .with_initializer(Initializer::Constant { value: 0.5 });
        let swiglu = config.init(&device);
        let input =
            Tensor::<TestBackend, 2>::from_data([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], &device);
        let output = swiglu.forward(input);
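        // With the constant initializer the bias parameters are also 0.5, so the
        // pre-activations become 0.5 * (1 + 2 + 3) + 0.5 = 3.5 and
        // 0.5 * (4 + 5 + 6) + 0.5 = 8.0, giving silu(3.5) * 3.5 ≈ 11.8909 and
        // silu(8.0) * 8.0 ≈ 63.9785 per element.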
        let expected_output = Tensor::<TestBackend, 2>::from_data(
            [[11.8909, 11.8909, 11.8909], [63.9785, 63.9785, 63.9785]],
            &device,
        );
        output
            .to_data()
            .assert_approx_eq::<FT>(&expected_output.to_data(), Tolerance::default());
    }
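
    // A small additional check (not part of the original suite): the forward pass
    // is rank-generic, so a rank-3 input should map [batch, seq, d_input] to
    // [batch, seq, d_output]. Only the output shape is asserted here.
    #[test]
    fn test_swiglu_forward_shape_rank_3() {
        let device = Default::default();
        let config = SwiGluConfig::new(3, 6);
        let swiglu = config.init::<TestBackend>(&device);
        let input = Tensor::<TestBackend, 3>::zeros([2, 4, 3], &device);
        let output = swiglu.forward(input);
        assert_eq!(output.dims(), [2, 4, 6]);
    }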

    #[test]
    fn display() {
        let config = SwiGluConfig::new(3, 5);
        let swiglu = config.init::<TestBackend>(&Default::default());

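        // `params: 30` comes from the two bias-free 3 x 5 linear layers:
        // 2 * (3 * 5) = 30 parameters.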
        assert_eq!(
            alloc::format!("{swiglu}"),
            "SwiGlu {d_input: 3, d_output: 5, bias: false, params: 30}"
        );
    }
}