god-graph 0.6.0-alpha

A graph-based LLM white-box optimization toolbox: topology validation, Lie group orthogonalization, tensor ring compression
//! LLaMA model implementation

use crate::tensor::DenseTensor;
use crate::tensor::traits::{TensorBase, TensorOps};
use super::layers::{MultiHeadAttention, FeedForward, RMSNorm, RoPE};
pub use super::loader::LlamaConfig;

/// LLaMA decoder layer
#[derive(Debug, Clone)]
pub struct LlamaDecoderLayer {
    /// Self-attention layer
    pub self_attn: MultiHeadAttention,
    /// Feed-forward network (SwiGLU)
    pub mlp: FeedForward,
    /// Input layer normalization
    pub input_layernorm: RMSNorm,
    /// Post-attention layer normalization
    pub post_attention_layernorm: RMSNorm,
}

impl LlamaDecoderLayer {
    /// Create a new LLaMA decoder layer
    pub fn new(
        self_attn: MultiHeadAttention,
        mlp: FeedForward,
        input_layernorm: RMSNorm,
        post_attention_layernorm: RMSNorm,
    ) -> Self {
        Self {
            self_attn,
            mlp,
            input_layernorm,
            post_attention_layernorm,
        }
    }

    /// Forward pass
    ///
    /// # Arguments
    /// * `x` - Input tensor [batch_size, seq_len, hidden_dim]
    /// * `mask` - Optional attention mask
    ///
    /// # Returns
    /// Output tensor [batch_size, seq_len, hidden_dim]
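    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a `layer` assembled from real weights (see
    /// the tests at the bottom of this file for a toy construction):
    ///
    /// ```no_run
    /// # use god_graph::transformer::model::LlamaDecoderLayer;
    /// # use god_graph::tensor::DenseTensor;
    /// # use god_graph::tensor::traits::TensorBase;
    /// # fn demo(layer: &LlamaDecoderLayer) {
    /// let x = DenseTensor::ones(vec![2, 4, 4096]); // [batch, seq_len, hidden_dim]
    /// let out = layer.forward(&x, None);           // mask omitted for brevity
    /// assert_eq!(out.shape(), &[2, 4, 4096]);
    /// # }
    /// ```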
    pub fn forward(&self, x: &DenseTensor, mask: Option<&DenseTensor>) -> DenseTensor {
        // Pre-norm residual architecture (LLaMA uses pre-LN)
        
        // 1. Input normalization
        let normed = self.input_layernorm.forward(x);
        
        // 2. Self-attention with residual
        let attn_output = self.self_attn.forward_with_mask(&normed, mask);
        let hidden = x.add(&attn_output);
        
        // 3. Post-attention normalization
        let normed = self.post_attention_layernorm.forward(&hidden);
        
        // 4. FFN with residual
        let mlp_output = self.mlp.forward(&normed);

        hidden.add(&mlp_output)
    }

    /// Forward pass with KV cache
    ///
    /// # Arguments
    /// * `x` - Input tensor [batch_size, seq_len, hidden_dim]
    /// * `kv_cache` - Optional KV cache for this layer
    /// * `mask` - Optional attention mask
    ///
    /// # Returns
    /// Output tensor and updated KV cache
    pub fn forward_with_cache(
        &self,
        x: &DenseTensor,
        kv_cache: Option<(&DenseTensor, &DenseTensor)>,
        mask: Option<&DenseTensor>,
    ) -> (DenseTensor, Option<(DenseTensor, DenseTensor)>) {
        // Simplified KV-cache path: a full implementation would append the new
        // keys/values to the cache; here we run the regular forward pass and
        // return the existing cache unchanged (cloned).
        let output = self.forward(x, mask);
        (output, kv_cache.map(|(k, v)| (k.clone(), v.clone())))
    }

    /// Get the number of parameters in this layer
    pub fn num_parameters(&self) -> usize {
        let mut total = 0;

        // Attention parameters
        total += self.self_attn.num_parameters();

        // MLP parameters
        total += self.mlp.num_parameters();

        // Layer norm parameters (2 * hidden_dim)
        total += self.input_layernorm.weight.shape().iter().product::<usize>();
        total += self.post_attention_layernorm.weight.shape().iter().product::<usize>();

        total
    }
}

/// Complete LLaMA model
#[derive(Debug, Clone)]
pub struct LlamaModel {
    /// Model configuration
    pub config: LlamaConfig,
    /// Token embeddings [vocab_size, hidden_dim]
    pub embed_tokens: DenseTensor,
    /// Decoder layers
    pub layers: Vec<LlamaDecoderLayer>,
    /// Final layer normalization
    pub norm: RMSNorm,
    /// LM head (optional, may be tied with embed_tokens)
    pub lm_head: Option<DenseTensor>,
    /// RoPE module
    pub rope: RoPE,
}

impl LlamaModel {
    /// Create a new LLaMA model
    pub fn new(
        config: LlamaConfig,
        embed_tokens: DenseTensor,
        layers: Vec<LlamaDecoderLayer>,
        norm: RMSNorm,
        lm_head: Option<DenseTensor>,
    ) -> Self {
        let rope = RoPE::new(
            config.head_dim(),
            config.max_position_embeddings,
            config.rope_theta,
        );
        
        Self {
            config,
            embed_tokens,
            layers,
            norm,
            lm_head,
            rope,
        }
    }

    /// Forward pass
    ///
    /// # Arguments
    /// * `input_ids` - Input token IDs [batch_size, seq_len]
    /// * `mask` - Optional attention mask [batch_size, seq_len, seq_len]
    ///
    /// # Returns
    /// Logits tensor [batch_size, seq_len, vocab_size]
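    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a constructed `model` (see [`LlamaModel::new`]):
    ///
    /// ```no_run
    /// # use god_graph::transformer::model::LlamaModel;
    /// # use god_graph::tensor::traits::TensorBase;
    /// # fn demo(model: &LlamaModel) {
    /// let input_ids = vec![vec![1usize, 2, 3, 4]]; // one sequence of 4 tokens
    /// let logits = model.forward(&input_ids, None);
    /// assert_eq!(logits.shape(), &[1, 4, model.vocab_size()]);
    /// # }
    /// ```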
    pub fn forward(&self, input_ids: &[Vec<usize>], mask: Option<&DenseTensor>) -> DenseTensor {
        let batch_size = input_ids.len();
        let seq_len = input_ids[0].len();

        // 1. Get token embeddings
        let mut hidden = self.embed_tokens_batch(input_ids);

        // 2. Position indices for RoPE (the rotation is expected to be applied
        //    inside the attention layers; positions are unused here)
        let _positions: Vec<usize> = (0..seq_len).collect();

        // 3. Pass through decoder layers
        for layer in &self.layers {
            hidden = layer.forward(&hidden, mask);
        }

        // 4. Final normalization
        hidden = self.norm.forward(&hidden);

        // 5. LM head projection
        // hidden: [batch, seq_len, hidden_dim], lm_head: [vocab_size, hidden_dim]
        // Need to compute: hidden @ lm_head.T for each (batch, seq) position
        let lm_head = self.lm_head.as_ref().unwrap_or(&self.embed_tokens);
        let lm_head_t = lm_head.transpose(None); // [hidden_dim, vocab_size]
        
        // Reshape hidden to [batch*seq_len, hidden_dim] for matmul
        let hidden_data = hidden.data().to_vec();
        let hidden_dim = self.config.hidden_size;
        let flat_hidden = DenseTensor::new(hidden_data, vec![batch_size * seq_len, hidden_dim]);
        
        // Matmul: [batch*seq, hidden] @ [hidden, vocab] = [batch*seq, vocab]
        let logits_flat = flat_hidden.matmul(&lm_head_t);
        
        // Reshape back to [batch, seq_len, vocab_size]
        let vocab_size = self.config.vocab_size;
        let logits_data = logits_flat.data().to_vec();
        
        DenseTensor::new(logits_data, vec![batch_size, seq_len, vocab_size])
    }

    /// Forward pass for a single sequence
    pub fn forward_single(&self, input_ids: &[usize], mask: Option<&DenseTensor>) -> DenseTensor {
        self.forward(&[input_ids.to_vec()], mask)
    }

    /// Embed tokens in batch
    fn embed_tokens_batch(&self, input_ids: &[Vec<usize>]) -> DenseTensor {
        let batch_size = input_ids.len();
        let seq_len = input_ids[0].len();
        let hidden_dim = self.config.hidden_size;
        
        let mut data = Vec::with_capacity(batch_size * seq_len * hidden_dim);
        
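        // Gather one embedding row per token: `embed_tokens` is a row-major
        // [vocab_size, hidden_dim] matrix, so token t occupies
        // data[t * hidden_dim .. (t + 1) * hidden_dim].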
        for batch in input_ids {
            for &token_id in batch {
                let start = token_id * hidden_dim;
                let end = start + hidden_dim;
                data.extend_from_slice(&self.embed_tokens.data()[start..end]);
            }
        }
        
        DenseTensor::new(data, vec![batch_size, seq_len, hidden_dim])
    }

    /// Get the hidden dimension
    pub fn hidden_dim(&self) -> usize {
        self.config.hidden_size
    }

    /// Get the vocabulary size
    pub fn vocab_size(&self) -> usize {
        self.config.vocab_size
    }

    /// Get number of layers
    pub fn num_layers(&self) -> usize {
        self.layers.len()
    }

    /// Get the number of parameters in the model
    pub fn num_parameters(&self) -> usize {
        let mut total = 0;

        // Embeddings
        total += self.embed_tokens.shape().iter().product::<usize>();

        // Each decoder layer
        for layer in &self.layers {
            total += layer.num_parameters();
        }

        // Final norm
        total += self.norm.weight.shape().iter().product::<usize>();

        // LM head
        if let Some(lm_head) = &self.lm_head {
            total += lm_head.shape().iter().product::<usize>();
        }

        total
    }

    /// Get model size in MB (assuming f64)
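    ///
    /// For LLaMA-7B (roughly 6.7e9 parameters) this works out to about
    /// 6.7e9 * 8 / 1024^2 ≈ 51,000 MB (~50 GB) at f64 precision.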
    pub fn size_mb(&self) -> f64 {
        (self.num_parameters() * 8) as f64 / (1024.0 * 1024.0)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tensor::DenseTensor;
    use crate::tensor::traits::TensorBase;

    fn create_test_layer(config: &LlamaConfig) -> LlamaDecoderLayer {
        let hidden_dim = config.hidden_size;
        let num_heads = config.num_attention_heads;

        // Create attention weights
        let w_q = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_k = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_v = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_o = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let self_attn = MultiHeadAttention::standard(w_q, w_k, w_v, w_o, num_heads);

        // Create FFN (SwiGLU)
        let gate_proj = DenseTensor::ones(vec![hidden_dim, config.intermediate_size]);
        let up_proj = DenseTensor::ones(vec![hidden_dim, config.intermediate_size]);
        let down_proj = DenseTensor::ones(vec![config.intermediate_size, hidden_dim]);
        let mlp = FeedForward::swiglu(gate_proj, up_proj, down_proj);

        // Create norms
        let input_layernorm = RMSNorm::default(hidden_dim);
        let post_attention_layernorm = RMSNorm::default(hidden_dim);

        LlamaDecoderLayer::new(self_attn, mlp, input_layernorm, post_attention_layernorm)
    }

    #[test]
    fn test_decoder_layer() {
        let config = LlamaConfig::llama_7b();
        let layer = create_test_layer(&config);

        let batch_size = 2;
        let seq_len = 4;
        let x = DenseTensor::ones(vec![batch_size, seq_len, config.hidden_size]);

        let output = layer.forward(&x, None);

        assert_eq!(output.shape(), &[batch_size, seq_len, config.hidden_size]);
    }

    #[test]
    fn test_llama_model_creation() {
        let config = LlamaConfig::llama_7b();

        let embed_tokens = DenseTensor::ones(vec![config.vocab_size, config.hidden_size]);
        let layers = vec![create_test_layer(&config); config.num_hidden_layers];
        let norm = RMSNorm::default(config.hidden_size);
        let lm_head = None; // Tied with embeddings

        let model = LlamaModel::new(config, embed_tokens, layers, norm, lm_head);

        assert_eq!(model.num_layers(), 32);
        assert_eq!(model.vocab_size(), 32000);
        assert_eq!(model.hidden_dim(), 4096);
    }
}

// ============================================================================
// LlamaModel Graph Builder
// ============================================================================

use crate::transformer::graph_transformer::GraphTransformer;

/// LlamaModel graph builder for constructing graph-structured Llama models
///
/// This builder converts a standard LlamaModel into a graph-structured representation
/// that can leverage god-graph's graph algorithms for optimization and analysis.
///
/// # Example
///
/// ```no_run
/// use god_graph::transformer::model::{LlamaModel, LlamaConfig, LlamaModelGraphBuilder};
/// use god_graph::transformer::layers::RMSNorm;
/// use god_graph::tensor::DenseTensor;
///
/// let config = LlamaConfig::llama_7b();
/// let embed_tokens = DenseTensor::ones(vec![config.vocab_size, config.hidden_size]);
/// let layers = vec![]; // Add your layers here
/// let norm = RMSNorm::default(config.hidden_size);
/// let model = LlamaModel::new(config, embed_tokens, layers, norm, None);
///
/// let builder = LlamaModelGraphBuilder::new(&model);
/// let graph_transformer = builder.build_graph();
/// ```
pub struct LlamaModelGraphBuilder<'a> {
    model: &'a LlamaModel,
}

impl<'a> LlamaModelGraphBuilder<'a> {
    /// Create a new graph builder from a LlamaModel
    pub fn new(model: &'a LlamaModel) -> Self {
        Self { model }
    }

    /// Build graph-structured transformer from the model
    pub fn build_graph(&self) -> GraphTransformer {
        let mut transformer = GraphTransformer::new(
            self.model.num_layers(),
            self.model.config.num_attention_heads,
            self.model.config.hidden_size,
        );

        // Build the graph structure.
        // Note: a full implementation would attach the actual weights; for now
        // we only construct the graph topology.
        let dummy_input = vec![0; 1]; // a single token suffices to lay out the graph
        transformer.build_graph(&dummy_input);

        transformer
    }

    /// Build graph with specific input sequence
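    ///
    /// # Example
    ///
    /// A minimal sketch, assuming a constructed `model`:
    ///
    /// ```no_run
    /// # use god_graph::transformer::model::{LlamaModel, LlamaModelGraphBuilder};
    /// # fn demo(model: &LlamaModel) {
    /// let builder = LlamaModelGraphBuilder::new(model);
    /// let mut transformer = builder.build_graph_for_input(&[1, 2, 3, 4, 5]);
    /// let _output = transformer.forward(&[1, 2, 3, 4, 5]);
    /// # }
    /// ```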
    pub fn build_graph_for_input(&self, input_ids: &[usize]) -> GraphTransformer {
        let mut transformer = GraphTransformer::new(
            self.model.num_layers(),
            self.model.config.num_attention_heads,
            self.model.config.hidden_size,
        );

        transformer.build_graph(input_ids);
        transformer
    }

    /// Export graph to DOT format for visualization
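    ///
    /// A minimal sketch, assuming a constructed `model`; the resulting file
    /// can be rendered with Graphviz (e.g. `dot -Tsvg llama_graph.dot`):
    ///
    /// ```no_run
    /// # use god_graph::transformer::model::{LlamaModel, LlamaModelGraphBuilder};
    /// # fn demo(model: &LlamaModel) {
    /// let builder = LlamaModelGraphBuilder::new(model);
    /// let transformer = builder.build_graph();
    /// std::fs::write("llama_graph.dot", builder.export_to_dot(&transformer)).unwrap();
    /// # }
    /// ```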
    pub fn export_to_dot(&self, transformer: &GraphTransformer) -> String {
        transformer.to_dot()
    }
}

#[cfg(test)]
mod graph_builder_tests {
    use super::*;
    use crate::transformer::layers::{MultiHeadAttention, FeedForward, RMSNorm};

    fn create_test_layer(config: &LlamaConfig) -> LlamaDecoderLayer {
        let hidden_dim = config.hidden_size;
        let num_heads = config.num_attention_heads;

        let w_q = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_k = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_v = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let w_o = DenseTensor::ones(vec![hidden_dim, hidden_dim]);
        let self_attn = MultiHeadAttention::standard(w_q, w_k, w_v, w_o, num_heads);

        let gate_proj = DenseTensor::ones(vec![hidden_dim, config.intermediate_size]);
        let up_proj = DenseTensor::ones(vec![hidden_dim, config.intermediate_size]);
        let down_proj = DenseTensor::ones(vec![config.intermediate_size, hidden_dim]);
        let mlp = FeedForward::swiglu(gate_proj, up_proj, down_proj);

        let input_layernorm = RMSNorm::default(hidden_dim);
        let post_attention_layernorm = RMSNorm::default(hidden_dim);

        LlamaDecoderLayer::new(self_attn, mlp, input_layernorm, post_attention_layernorm)
    }

    #[test]
    fn test_llama_model_graph_builder() {
        let config = LlamaConfig::llama_7b();
        let embed_tokens = DenseTensor::ones(vec![config.vocab_size, config.hidden_size]);
        let layers = vec![create_test_layer(&config); 2]; // Use 2 layers for test
        let norm = RMSNorm::default(config.hidden_size);
        let lm_head = None;

        let model = LlamaModel::new(config.clone(), embed_tokens, layers, norm, lm_head);

        let builder = LlamaModelGraphBuilder::new(&model);
        let transformer = builder.build_graph();

        // Verify graph was built
        assert!(transformer.num_nodes() > 0);
        assert!(transformer.num_edges() > 0);
    }

    #[test]
    fn test_llama_model_graph_builder_with_input() {
        let config = LlamaConfig::llama_7b();
        let embed_tokens = DenseTensor::ones(vec![config.vocab_size, config.hidden_size]);
        let layers = vec![create_test_layer(&config); 1];
        let norm = RMSNorm::default(config.hidden_size);
        let lm_head = None;

        let model = LlamaModel::new(config.clone(), embed_tokens, layers, norm, lm_head);

        let builder = LlamaModelGraphBuilder::new(&model);
        let input_ids = vec![1, 2, 3, 4, 5];
        let mut transformer = builder.build_graph_for_input(&input_ids);

        // Verify graph structure
        assert!(transformer.num_nodes() > 0);
        assert!(transformer.num_edges() > 0);

        // Test forward pass
        let output = transformer.forward(&input_ids);
        assert!(!output.data().is_empty());
    }

    #[test]
    fn test_graph_export_to_dot() {
        let config = LlamaConfig::llama_7b();
        let embed_tokens = DenseTensor::ones(vec![config.vocab_size, config.hidden_size]);
        let layers = vec![create_test_layer(&config); 1];
        let norm = RMSNorm::default(config.hidden_size);
        let lm_head = None;

        let model = LlamaModel::new(config.clone(), embed_tokens, layers, norm, lm_head);

        let builder = LlamaModelGraphBuilder::new(&model);
        let transformer = builder.build_graph();
        let dot = builder.export_to_dot(&transformer);

        // Verify DOT format
        assert!(dot.contains("digraph Transformer"));
        assert!(dot.contains("rankdir=TB"));
    }
}