1use crate as burn;
2
3use crate::config::Config;
4use crate::module::{Content, DisplaySettings, Module, ModuleDisplay};
5use crate::tensor::activation::silu;
6use crate::tensor::{Tensor, backend::Backend};
7
8use super::{Initializer, Linear, LinearConfig};
9
/// Configuration used to build a [`SwiGlu`] layer through [`SwiGluConfig::init`].
#[derive(Config, Debug)]
pub struct SwiGluConfig {
    /// Input feature size handed to both linear layers of the module.
    pub d_input: usize,
    /// Output feature size handed to both linear layers of the module.
    pub d_output: usize,
    /// Whether the two linear layers are created with a bias term. Defaults to `false`.
    #[config(default = false)]
    pub bias: bool,
    /// Initializer forwarded to both linear layers. Defaults to Kaiming uniform
    /// with a gain of `1 / sqrt(3)` and `fan_out_only` set to `false`.
    #[config(
        default = "Initializer::KaimingUniform{gain:1.0/num_traits::Float::sqrt(3.0), fan_out_only:false}"
    )]
    pub initializer: Initializer,
}
27
/// SwiGLU (Swish-gated linear unit) layer built from two linear projections.
///
/// [`SwiGlu::forward`] computes `silu(linear_inner(x)) * linear_outer(x)`.
/// Instances are created with [`SwiGluConfig::init`].
#[derive(Module, Debug)]
#[module(custom_display)]
pub struct SwiGlu<B: Backend> {
    /// Linear projection whose output is passed through `silu` (the gate branch).
    pub linear_inner: Linear<B>,
    /// Linear projection whose output is multiplied with the activated gate branch.
    pub linear_outer: Linear<B>,
}
43
44impl<B: Backend> ModuleDisplay for SwiGlu<B> {
45 fn custom_settings(&self) -> Option<DisplaySettings> {
46 DisplaySettings::new()
47 .with_new_line_after_attribute(false)
48 .optional()
49 }
50
51 fn custom_content(&self, content: Content) -> Option<Content> {
52 let [d_input, d_output] = self.linear_inner.weight.shape().dims();
53 content
54 .add("d_input", &d_input)
55 .add("d_output", &d_output)
56 .add("bias", &self.linear_inner.bias.is_some())
57 .optional()
58 }
59}
60
61impl SwiGluConfig {
62 pub fn init<B: Backend>(&self, device: &B::Device) -> SwiGlu<B> {
64 SwiGlu {
65 linear_inner: LinearConfig::new(self.d_input, self.d_output)
66 .with_bias(self.bias)
67 .with_initializer(self.initializer.clone())
68 .init(device),
69 linear_outer: LinearConfig::new(self.d_input, self.d_output)
70 .with_bias(self.bias)
71 .with_initializer(self.initializer.clone())
72 .init(device),
73 }
74 }
75}
76
77impl<B: Backend> SwiGlu<B> {
78 pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
85 let x = self.linear_inner.forward(input.clone());
86 let x = silu(x);
87 x.mul(self.linear_outer.forward(input))
88 }
89}
90
#[cfg(test)]
mod tests {
    use super::*;
    use crate::TestBackend;
    use burn_tensor::{Tolerance, ops::FloatElem};
    type FT = FloatElem<TestBackend>;

    /// Builds a 3 -> 3 layer with the constant `0.5` initializer, feeds it a
    /// fixed 2 x 3 input and compares the output with `expected`.
    fn assert_forward(bias: bool, expected: [[f64; 3]; 2]) {
        TestBackend::seed(0);
        let device = Default::default();
        let swiglu = SwiGluConfig::new(3, 3)
            .with_bias(bias)
            .with_initializer(Initializer::Constant { value: 0.5 })
            .init::<TestBackend>(&device);

        let input =
            Tensor::<TestBackend, 2>::from_data([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], &device);
        let actual = swiglu.forward(input).to_data();
        let expected = Tensor::<TestBackend, 2>::from_data(expected, &device).to_data();

        actual.assert_approx_eq::<FT>(&expected, Tolerance::default());
    }

    #[test]
    fn test_swiglu_forward_no_bias() {
        assert_forward(
            false,
            [[8.5732, 8.5732, 8.5732], [56.2189, 56.2189, 56.2189]],
        );
    }

    #[test]
    fn test_swiglu_forward_with_bias() {
        assert_forward(
            true,
            [[11.8909, 11.8909, 11.8909], [63.9785, 63.9785, 63.9785]],
        );
    }

    #[test]
    fn display() {
        let swiglu = SwiGluConfig::new(3, 5).init::<TestBackend>(&Default::default());
        let rendered = alloc::format!("{swiglu}");

        assert_eq!(
            rendered,
            "SwiGlu {d_input: 3, d_output: 5, bias: false, params: 30}"
        );
    }
}