scirs2_sparse/neural_adaptive_sparse/transformer.rs

use super::neural_network::{ActivationFunction, AttentionHead, LayerNorm};
use crate::error::SparseResult;
use scirs2_core::random::Rng;

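/// Lightweight transformer encoder that embeds flattened matrix feature
/// vectors into fixed-size representations (see `encode_matrix_pattern`).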
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub(crate) struct TransformerModel {
    pub encoder_layers: Vec<TransformerEncoderLayer>,
    pub positional_encoding: Vec<Vec<f64>>,
    pub embedding_dim: usize,
}

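/// Single post-norm encoder layer: multi-head self-attention followed by a
/// position-wise feed-forward network, each wrapped in a residual connection
/// and layer normalization (`dropout_rate` is stored but not yet applied in
/// `forward`).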
#[derive(Debug, Clone)]
pub(crate) struct TransformerEncoderLayer {
    pub self_attention: MultiHeadAttention,
    pub feed_forward: FeedForwardNetwork,
    pub layer_norm1: LayerNorm,
    pub layer_norm2: LayerNorm,
    pub dropout_rate: f64,
}

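/// Multi-head self-attention: each `AttentionHead` processes the sequence
/// independently, and the concatenated per-position head outputs are projected
/// back to `embedding_dim` through `output_projection`.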
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub(crate) struct MultiHeadAttention {
    pub heads: Vec<AttentionHead>,
    pub output_projection: Vec<Vec<f64>>,
    pub num_heads: usize,
    pub head_dim: usize,
}

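/// Position-wise two-layer feed-forward network
/// (`embedding_dim -> ff_dim -> embedding_dim`) with a configurable activation.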
#[derive(Debug, Clone)]
pub(crate) struct FeedForwardNetwork {
    pub layer1: Vec<Vec<f64>>,
    pub layer1_bias: Vec<f64>,
    pub layer2: Vec<Vec<f64>>,
    pub layer2_bias: Vec<f64>,
    pub activation: ActivationFunction,
}

impl TransformerModel {
    pub fn new(
        embedding_dim: usize,
        num_layers: usize,
        num_heads: usize,
        ff_dim: usize,
        max_sequence_length: usize,
    ) -> Self {
        let mut encoder_layers = Vec::new();

        for _ in 0..num_layers {
            encoder_layers.push(TransformerEncoderLayer::new(
                embedding_dim,
                num_heads,
                ff_dim,
            ));
        }

        let positional_encoding =
            Self::create_positional_encoding(max_sequence_length, embedding_dim);

        Self {
            encoder_layers,
            positional_encoding,
            embedding_dim,
        }
    }

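    /// Standard sinusoidal positional encoding: `sin` on even dimensions and
    /// `cos` on odd dimensions, with wavelengths scaled by
    /// `10000^(i / embedding_dim)` for even `i`.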
    fn create_positional_encoding(max_length: usize, embedding_dim: usize) -> Vec<Vec<f64>> {
        let mut pos_encoding = vec![vec![0.0; embedding_dim]; max_length];

        for pos in 0..max_length {
            for i in (0..embedding_dim).step_by(2) {
                let angle = pos as f64 / 10000.0_f64.powf(i as f64 / embedding_dim as f64);
                pos_encoding[pos][i] = angle.sin();
                if i + 1 < embedding_dim {
                    pos_encoding[pos][i + 1] = angle.cos();
                }
            }
        }

        pos_encoding
    }

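    /// Adds positional encodings to the input sequence and passes it through
    /// every encoder layer in order.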
    pub fn forward(&self, input: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let mut x = input.to_vec();

        for (i, sequence) in x.iter_mut().enumerate() {
            if i < self.positional_encoding.len() {
                for (j, val) in sequence.iter_mut().enumerate() {
                    if j < self.positional_encoding[i].len() {
                        *val += self.positional_encoding[i][j];
                    }
                }
            }
        }

        for layer in &self.encoder_layers {
            x = layer.forward(&x);
        }

        x
    }

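    /// Reshapes a flat feature vector into a sequence of `embedding_dim`-sized
    /// tokens, runs the encoder, and mean-pools the outputs into a single
    /// fixed-size embedding.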
    pub fn encode_matrix_pattern(&self, matrix_features: &[f64]) -> Vec<f64> {
        let sequence_length = (matrix_features.len() / self.embedding_dim).max(1);
        let mut sequence = vec![vec![0.0; self.embedding_dim]; sequence_length];

        let mut idx = 0;
        for i in 0..sequence_length {
            for j in 0..self.embedding_dim {
                if idx < matrix_features.len() {
                    sequence[i][j] = matrix_features[idx];
                    idx += 1;
                }
            }
        }

        let encoded = self.forward(&sequence);

        let mut pooled = vec![0.0; self.embedding_dim];
        for sequence_step in &encoded {
            for (i, &val) in sequence_step.iter().enumerate() {
                if i < pooled.len() {
                    pooled[i] += val / encoded.len() as f64;
                }
            }
        }

        pooled
    }

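    /// Applies the matching `LayerGradients` entry to each encoder layer;
    /// layers without a corresponding entry are left unchanged.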
    pub fn update_parameters(&mut self, gradients: &TransformerGradients, learning_rate: f64) {
        for (layer_idx, layer) in self.encoder_layers.iter_mut().enumerate() {
            if layer_idx < gradients.layer_gradients.len() {
                layer.update_parameters(&gradients.layer_gradients[layer_idx], learning_rate);
            }
        }
    }
}

impl TransformerEncoderLayer {
    pub fn new(embedding_dim: usize, num_heads: usize, ff_dim: usize) -> Self {
        Self {
            self_attention: MultiHeadAttention::new(embedding_dim, num_heads),
            feed_forward: FeedForwardNetwork::new(embedding_dim, ff_dim),
            layer_norm1: LayerNorm::new(embedding_dim),
            layer_norm2: LayerNorm::new(embedding_dim),
            dropout_rate: 0.1,
        }
    }

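    /// Post-norm encoder block: `LayerNorm(x + Attention(x))` followed by
    /// `LayerNorm(h + FeedForward(h))`.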
    pub fn forward(&self, input: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let attention_output = self.self_attention.forward(input);
        let mut norm1_input = Vec::new();

        for (i, attention_seq) in attention_output.iter().enumerate() {
            let mut residual = attention_seq.clone();
            if i < input.len() {
                for (j, &input_val) in input[i].iter().enumerate() {
                    if j < residual.len() {
                        residual[j] += input_val;
                    }
                }
            }
            norm1_input.push(residual);
        }

        let norm1_output: Vec<Vec<f64>> = norm1_input
            .iter()
            .map(|seq| self.layer_norm1.normalize(seq))
            .collect();

        let ff_output = self.feed_forward.forward(&norm1_output);
        let mut norm2_input = Vec::new();

        for (i, ff_seq) in ff_output.iter().enumerate() {
            let mut residual = ff_seq.clone();
            if i < norm1_output.len() {
                for (j, &norm_val) in norm1_output[i].iter().enumerate() {
                    if j < residual.len() {
                        residual[j] += norm_val;
                    }
                }
            }
            norm2_input.push(residual);
        }

        norm2_input
            .iter()
            .map(|seq| self.layer_norm2.normalize(seq))
            .collect()
    }

    pub fn update_parameters(&mut self, gradients: &LayerGradients, learning_rate: f64) {
        self.self_attention
            .update_parameters(&gradients.attention_gradients, learning_rate);
        self.feed_forward
            .update_parameters(&gradients.ff_gradients, learning_rate);
    }
}

impl MultiHeadAttention {
    pub fn new(embedding_dim: usize, num_heads: usize) -> Self {
        let head_dim = embedding_dim / num_heads;
        let mut heads = Vec::new();

        for _ in 0..num_heads {
            heads.push(AttentionHead::new(embedding_dim));
        }

        let output_projection = Self::initialize_weights(embedding_dim, embedding_dim);

        Self {
            heads,
            output_projection,
            num_heads,
            head_dim,
        }
    }

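    /// Xavier/Glorot uniform initialization with bound `sqrt(6 / (fan_in + fan_out))`.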
    fn initialize_weights(input_dim: usize, output_dim: usize) -> Vec<Vec<f64>> {
        let mut rng = scirs2_core::random::thread_rng();
        let bound = (6.0 / (input_dim + output_dim) as f64).sqrt();

        (0..output_dim)
            .map(|_| {
                (0..input_dim)
                    .map(|_| rng.gen_range(-bound..bound))
                    .collect()
            })
            .collect()
    }

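    /// Runs every head over the full sequence, concatenates the per-position
    /// head outputs, and applies the output projection.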
    pub fn forward(&self, input: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let mut all_head_outputs = Vec::new();

        for head in &self.heads {
            let mut head_output = Vec::new();
            for sequence in input {
                head_output.push(head.forward(sequence));
            }
            all_head_outputs.push(head_output);
        }

        let mut result = Vec::new();
        for seq_idx in 0..input.len() {
            let mut concatenated = Vec::new();
            for head_output in &all_head_outputs {
                if seq_idx < head_output.len() {
                    concatenated.extend(&head_output[seq_idx]);
                }
            }

            let projected = self.linear_transform(&concatenated, &self.output_projection);
            result.push(projected);
        }

        result
    }

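    /// Row-major matrix-vector product: `output[i] = sum_j weights[i][j] * input[j]`,
    /// ignoring input entries beyond each row's length.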
    fn linear_transform(&self, input: &[f64], weights: &[Vec<f64>]) -> Vec<f64> {
        let mut output = vec![0.0; weights.len()];

        for (i, neuron_weights) in weights.iter().enumerate() {
            let mut sum = 0.0;
            for (j, &input_val) in input.iter().enumerate() {
                if j < neuron_weights.len() {
                    sum += neuron_weights[j] * input_val;
                }
            }
            output[i] = sum;
        }

        output
    }

    pub fn update_parameters(&mut self, gradients: &AttentionGradients, learning_rate: f64) {
        // Plain SGD step on the output projection (assumed update rule:
        // `w -= learning_rate * grad`); per-head query/key/value updates are
        // left to `AttentionHead` itself.
        let rows = self.output_projection.iter_mut();
        for (row, grads) in rows.zip(&gradients.output_projection_gradients) {
            for (w, &g) in row.iter_mut().zip(grads) {
                *w -= learning_rate * g;
            }
        }
    }
}

impl FeedForwardNetwork {
    pub fn new(embedding_dim: usize, ff_dim: usize) -> Self {
        Self {
            layer1: Self::initialize_weights(embedding_dim, ff_dim),
            layer1_bias: vec![0.0; ff_dim],
            layer2: Self::initialize_weights(ff_dim, embedding_dim),
            layer2_bias: vec![0.0; embedding_dim],
            activation: ActivationFunction::ReLU,
        }
    }

    fn initialize_weights(input_dim: usize, output_dim: usize) -> Vec<Vec<f64>> {
        let mut rng = scirs2_core::random::thread_rng();
        let bound = (6.0 / (input_dim + output_dim) as f64).sqrt();

        (0..output_dim)
            .map(|_| {
                (0..input_dim)
                    .map(|_| rng.gen_range(-bound..bound))
                    .collect()
            })
            .collect()
    }

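    /// Applies the two-layer network position-wise:
    /// `activation(W1 · x + b1)` followed by `W2 · h + b2` for each sequence element.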
    pub fn forward(&self, input: &[Vec<f64>]) -> Vec<Vec<f64>> {
        let mut result = Vec::new();

        for sequence in input {
            let mut layer1_output = vec![0.0; self.layer1.len()];
            for (i, neuron_weights) in self.layer1.iter().enumerate() {
                let mut sum = self.layer1_bias[i];
                for (j, &input_val) in sequence.iter().enumerate() {
                    if j < neuron_weights.len() {
                        sum += neuron_weights[j] * input_val;
                    }
                }
                layer1_output[i] = self.apply_activation(sum);
            }

            let mut layer2_output = vec![0.0; self.layer2.len()];
            for (i, neuron_weights) in self.layer2.iter().enumerate() {
                let mut sum = self.layer2_bias[i];
                for (j, &layer1_val) in layer1_output.iter().enumerate() {
                    if j < neuron_weights.len() {
                        sum += neuron_weights[j] * layer1_val;
                    }
                }
                layer2_output[i] = sum;
            }

            result.push(layer2_output);
        }

        result
    }

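    /// Scalar activation; `Gelu` uses a tanh-based approximation,
    /// `0.5 · x · (1 + tanh(sqrt(2/pi) · x))`, without the usual
    /// `0.044715 · x^3` correction term.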
    fn apply_activation(&self, x: f64) -> f64 {
        match self.activation {
            ActivationFunction::ReLU => x.max(0.0),
            ActivationFunction::Sigmoid => 1.0 / (1.0 + (-x).exp()),
            ActivationFunction::Tanh => x.tanh(),
            ActivationFunction::Swish => x * (1.0 / (1.0 + (-x).exp())),
            ActivationFunction::Gelu => 0.5 * x * (1.0 + (x * 0.7978845608028654).tanh()),
        }
    }

    pub fn update_parameters(&mut self, gradients: &FFGradients, learning_rate: f64) {
        // Plain SGD step (assumed update rule: parameter -= learning_rate * gradient)
        // over both weight matrices and both bias vectors.
        for (row, grads) in self.layer1.iter_mut().zip(&gradients.layer1_weight_gradients) {
            for (w, &g) in row.iter_mut().zip(grads) {
                *w -= learning_rate * g;
            }
        }
        for (b, &g) in self.layer1_bias.iter_mut().zip(&gradients.layer1_bias_gradients) {
            *b -= learning_rate * g;
        }
        for (row, grads) in self.layer2.iter_mut().zip(&gradients.layer2_weight_gradients) {
            for (w, &g) in row.iter_mut().zip(grads) {
                *w -= learning_rate * g;
            }
        }
        for (b, &g) in self.layer2_bias.iter_mut().zip(&gradients.layer2_bias_gradients) {
            *b -= learning_rate * g;
        }
    }
}

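/// Gradients for the full model, mirroring the parameter layout of
/// `TransformerModel`: one `LayerGradients` entry per encoder layer.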
#[derive(Debug, Clone)]
pub struct TransformerGradients {
    pub layer_gradients: Vec<LayerGradients>,
}

#[derive(Debug, Clone)]
pub struct LayerGradients {
    pub attention_gradients: AttentionGradients,
    pub ff_gradients: FFGradients,
}

#[derive(Debug, Clone)]
pub struct AttentionGradients {
    pub head_gradients: Vec<HeadGradients>,
    pub output_projection_gradients: Vec<Vec<f64>>,
}

#[derive(Debug, Clone)]
pub struct HeadGradients {
    pub query_gradients: Vec<Vec<f64>>,
    pub key_gradients: Vec<Vec<f64>>,
    pub value_gradients: Vec<Vec<f64>>,
}

#[derive(Debug, Clone)]
pub struct FFGradients {
    pub layer1_weight_gradients: Vec<Vec<f64>>,
    pub layer1_bias_gradients: Vec<f64>,
    pub layer2_weight_gradients: Vec<Vec<f64>>,
    pub layer2_bias_gradients: Vec<f64>,
}
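
// Shape-level smoke tests: a minimal sketch that relies only on the
// `AttentionHead` and `LayerNorm` interfaces exactly as they are used above,
// not on their numerical behavior.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn forward_preserves_sequence_length() {
        let model = TransformerModel::new(8, 2, 2, 16, 4);
        let input = vec![vec![0.0_f64; 8]; 3];
        let output = model.forward(&input);
        assert_eq!(output.len(), 3);
    }

    #[test]
    fn encode_matrix_pattern_returns_embedding_dim_values() {
        // embedding_dim = 8, so 16 features become a sequence of two tokens
        // and the pooled embedding has 8 entries.
        let model = TransformerModel::new(8, 2, 2, 16, 4);
        let features: Vec<f64> = (0..16).map(|i| f64::from(i) / 16.0).collect();
        assert_eq!(model.encode_matrix_pattern(&features).len(), 8);
    }
}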