entrenar 0.7.10

Training & Optimization library with autograd, LoRA, quantization, and model merging
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
//! Memory benchmarks: LoRA vs QLoRA
//!
//! Compares memory usage between full-precision LoRA and 4-bit quantized QLoRA
//! across realistic transformer model dimensions.

use crate::lora::{LoRALayer, QLoRALayer};
use crate::Tensor;

/// Memory usage statistics for a single layer.
///
/// Built by [`LayerMemoryStats::from_lora`] for a full-precision LoRA layer or
/// by [`LayerMemoryStats::from_qlora`] for a quantized QLoRA layer. All sizes
/// are expressed in bytes.
#[derive(Debug, Clone)]
pub struct LayerMemoryStats {
    /// Layer name/identifier
    pub name: String,
    /// Base weight memory when stored unquantized (bytes)
    pub base_unquantized_bytes: usize,
    /// Base weight memory after quantization (bytes); `None` for plain LoRA
    pub base_quantized_bytes: Option<usize>,
    /// LoRA adapter memory (bytes)
    pub lora_adapters_bytes: usize,
    /// Total memory usage of the layer (bytes)
    pub total_bytes: usize,
    /// Compression ratio of the base weights; `None` for plain LoRA
    pub compression_ratio: Option<f32>,
}

impl LayerMemoryStats {
    /// Build statistics for a full-precision [`LoRALayer`].
    ///
    /// The base weight (`d_out * d_in` values) and both adapter matrices
    /// (`rank * d_in` and `d_out * rank` values) are counted as `f32`.
    /// The quantization-only fields (`base_quantized_bytes`,
    /// `compression_ratio`) are left as `None`.
    pub fn from_lora(name: String, layer: &LoRALayer) -> Self {
        // Every value is accounted for as a 32-bit float.
        const F32_BYTES: usize = std::mem::size_of::<f32>();

        let (d_out, d_in, rank) = (layer.d_out(), layer.d_in(), layer.rank());
        let base_bytes = d_out * d_in * F32_BYTES;
        let lora_a_bytes = rank * d_in * F32_BYTES;
        let lora_b_bytes = d_out * rank * F32_BYTES;
        let lora_bytes = lora_a_bytes + lora_b_bytes;

        Self {
            name,
            base_unquantized_bytes: base_bytes,
            base_quantized_bytes: None,
            lora_adapters_bytes: lora_bytes,
            total_bytes: base_bytes + lora_bytes,
            compression_ratio: None,
        }
    }

    /// Build statistics for a [`QLoRALayer`].
    ///
    /// All figures are copied from the layer's own `memory_stats()` report;
    /// nothing is recomputed here.
    pub fn from_qlora(name: String, layer: &QLoRALayer) -> Self {
        let stats = layer.memory_stats();

        Self {
            name,
            base_unquantized_bytes: stats.base_unquantized_bytes,
            base_quantized_bytes: Some(stats.base_quantized_bytes),
            lora_adapters_bytes: stats.lora_bytes,
            total_bytes: stats.total_bytes,
            compression_ratio: Some(stats.compression_ratio),
        }
    }

    /// Calculate memory savings percentage vs unquantized.
    ///
    /// The reference size is `base_unquantized_bytes + lora_adapters_bytes`.
    ///
    /// # Returns
    /// * a positive percentage when `total_bytes` is below the reference,
    /// * a negative percentage when `total_bytes` exceeds it (for example when
    ///   quantization overhead outweighs the gain),
    /// * `0.0` when the reference size is zero, instead of NaN.
    pub fn savings_percent(&self) -> f32 {
        let unquantized_total = self.base_unquantized_bytes + self.lora_adapters_bytes;
        if unquantized_total == 0 {
            // Nothing to compare against; avoid a 0/0 division.
            return 0.0;
        }

        // `usize` subtraction would underflow (panic in debug builds) when the
        // layer is larger than its unquantized reference, so take the absolute
        // difference and re-apply the sign afterwards.
        let magnitude = (unquantized_total.abs_diff(self.total_bytes) as f32
            / unquantized_total as f32)
            * 100.0;

        if self.total_bytes > unquantized_total {
            -magnitude
        } else {
            magnitude
        }
    }
}

/// Benchmark results comparing LoRA and QLoRA for one model configuration.
///
/// The aggregate fields (`total_*`, `savings_*`) are derived from the per-layer
/// statistics by [`BenchmarkResults::new`].
#[derive(Debug, Clone)]
pub struct BenchmarkResults {
    /// Model configuration name
    pub model_name: String,
    /// Per-layer LoRA statistics
    pub lora_stats: Vec<LayerMemoryStats>,
    /// Per-layer QLoRA statistics
    pub qlora_stats: Vec<LayerMemoryStats>,
    /// Total LoRA memory: sum of `total_bytes` over `lora_stats` (bytes)
    pub total_lora_bytes: usize,
    /// Total QLoRA memory: sum of `total_bytes` over `qlora_stats` (bytes)
    pub total_qlora_bytes: usize,
    /// Overall memory savings, LoRA total minus QLoRA total, floored at zero (bytes)
    pub savings_bytes: usize,
    /// Overall savings as a percentage of the LoRA total
    pub savings_percent: f32,
}

impl BenchmarkResults {
    /// Aggregate per-layer statistics into benchmark results.
    ///
    /// # Arguments
    /// * `model_name` - Name of the model configuration
    /// * `lora_stats` - Per-layer statistics for the LoRA variant
    /// * `qlora_stats` - Per-layer statistics for the QLoRA variant
    ///
    /// `savings_bytes` saturates at zero when QLoRA uses more memory than
    /// LoRA, and `savings_percent` is `0.0` (not NaN) when the LoRA total is
    /// zero, e.g. for an empty layer list.
    pub fn new(
        model_name: String,
        lora_stats: Vec<LayerMemoryStats>,
        qlora_stats: Vec<LayerMemoryStats>,
    ) -> Self {
        let total_lora_bytes: usize = lora_stats.iter().map(|s| s.total_bytes).sum();
        let total_qlora_bytes: usize = qlora_stats.iter().map(|s| s.total_bytes).sum();
        let savings_bytes = total_lora_bytes.saturating_sub(total_qlora_bytes);

        // Guard the division: 0.0 / 0.0 would store NaN in the results.
        let savings_percent = if total_lora_bytes == 0 {
            0.0
        } else {
            (savings_bytes as f32 / total_lora_bytes as f32) * 100.0
        };

        Self {
            model_name,
            lora_stats,
            qlora_stats,
            total_lora_bytes,
            total_qlora_bytes,
            savings_bytes,
            savings_percent,
        }
    }

    /// Generate human-readable report.
    ///
    /// The report has a header, a per-layer section and an overall summary.
    /// Sizes are shown in whole KB (bytes divided by 1024, truncated). Layers
    /// are paired by position; if the two statistics lists differ in length,
    /// only the common prefix is listed.
    pub fn report(&self) -> String {
        const RULE_WIDTH: usize = 70;
        let heavy_rule = "=".repeat(RULE_WIDTH);
        let light_rule = "-".repeat(RULE_WIDTH);

        let mut report = String::new();

        // Header
        report.push_str(&format!("\n{heavy_rule}\n"));
        report.push_str(&format!("Memory Benchmark: {}\n", self.model_name));
        report.push_str(&format!("{heavy_rule}\n\n"));

        // Per-layer breakdown
        report.push_str("Layer-by-Layer Comparison:\n");
        report.push_str(&format!("{light_rule}\n"));

        for (lora, qlora) in self.lora_stats.iter().zip(self.qlora_stats.iter()) {
            report.push_str(&format!("\n{}:\n", lora.name));
            report.push_str(&format!("  LoRA:  {:>8} KB total\n", lora.total_bytes / 1024));
            report.push_str(&format!(
                "  QLoRA: {:>8} KB total ({:.1}% savings)\n",
                qlora.total_bytes / 1024,
                qlora.savings_percent()
            ));

            if let Some(ratio) = qlora.compression_ratio {
                report.push_str(&format!("  Base weight compression: {ratio:.1}x\n"));
            }
        }

        // Overall summary
        report.push_str(&format!("\n{light_rule}\n"));
        report.push_str(&format!("Total LoRA memory:  {:>8} KB\n", self.total_lora_bytes / 1024));
        report.push_str(&format!("Total QLoRA memory: {:>8} KB\n", self.total_qlora_bytes / 1024));
        report.push_str(&format!(
            "Memory savings:     {:>8} KB ({:.1}%)\n",
            self.savings_bytes / 1024,
            self.savings_percent
        ));
        report.push_str(&format!("{heavy_rule}\n"));

        report
    }
}

/// Measure one layer configuration in both variants.
///
/// An all-ones `d_out * d_in` base weight is wrapped first in a `LoRALayer`
/// and then in a `QLoRALayer`; the returned tuple is
/// `(LoRA stats, QLoRA stats)`, both labelled with `name`.
fn benchmark_single_layer(
    name: &str,
    d_out: usize,
    d_in: usize,
    rank: usize,
    alpha: f32,
) -> (LayerMemoryStats, LayerMemoryStats) {
    let element_count = d_out * d_in;
    let weights = Tensor::from_vec(vec![1.0; element_count], false);

    // Full-precision variant works on a copy so the original tensor can be
    // handed to the quantized variant below.
    let full_precision = {
        let layer = LoRALayer::new(weights.clone(), d_out, d_in, rank, alpha);
        LayerMemoryStats::from_lora(name.to_string(), &layer)
    };

    let quantized = {
        let layer = QLoRALayer::new(weights, d_out, d_in, rank, alpha);
        LayerMemoryStats::from_qlora(name.to_string(), &layer)
    };

    (full_precision, quantized)
}

/// Benchmark every layer of a model configuration and aggregate the results.
///
/// # Arguments
/// * `model_name` - Name of the model configuration
/// * `layers` - Layer configurations as `(name, d_out, d_in, rank, alpha)` tuples
///
/// # Returns
/// A `BenchmarkResults` holding the per-layer LoRA and QLoRA statistics, in
/// the same order as `layers`, together with the derived totals.
pub fn benchmark_model(
    model_name: &str,
    layers: &[(&str, usize, usize, usize, f32)],
) -> BenchmarkResults {
    // Each layer yields a (LoRA, QLoRA) pair; split the pairs into two lists.
    let (lora_stats, qlora_stats): (Vec<_>, Vec<_>) = layers
        .iter()
        .map(|&(name, d_out, d_in, rank, alpha)| {
            benchmark_single_layer(name, d_out, d_in, rank, alpha)
        })
        .unzip();

    BenchmarkResults::new(model_name.to_string(), lora_stats, qlora_stats)
}

/// Run the built-in benchmark suite over three transformer sizes.
///
/// Returns one `BenchmarkResults` per configuration, in the order
/// small (256-dim), medium (768-dim), large (4096-dim).
pub fn run_transformer_benchmarks() -> Vec<BenchmarkResults> {
    /// `(name, d_out, d_in, rank, alpha)` for one benchmarked layer.
    type LayerSpec = (&'static str, usize, usize, usize, f32);

    // Small model: one 256x256 entry per layer, rank 8.
    const SMALL: &[LayerSpec] = &[
        ("layer_0_qkvo", 256, 256, 8, 16.0),
        ("layer_1_qkvo", 256, 256, 8, 16.0),
        ("layer_2_qkvo", 256, 256, 8, 16.0),
        ("layer_3_qkvo", 256, 256, 8, 16.0),
        ("layer_4_qkvo", 256, 256, 8, 16.0),
        ("layer_5_qkvo", 256, 256, 8, 16.0),
    ];

    // Medium model: q/k/v/o projections of layers 0, 6 and 11 at 768x768, rank 16.
    const MEDIUM: &[LayerSpec] = &[
        ("layer_0_q", 768, 768, 16, 32.0),
        ("layer_0_k", 768, 768, 16, 32.0),
        ("layer_0_v", 768, 768, 16, 32.0),
        ("layer_0_o", 768, 768, 16, 32.0),
        ("layer_6_q", 768, 768, 16, 32.0),
        ("layer_6_k", 768, 768, 16, 32.0),
        ("layer_6_v", 768, 768, 16, 32.0),
        ("layer_6_o", 768, 768, 16, 32.0),
        ("layer_11_q", 768, 768, 16, 32.0),
        ("layer_11_k", 768, 768, 16, 32.0),
        ("layer_11_v", 768, 768, 16, 32.0),
        ("layer_11_o", 768, 768, 16, 32.0),
    ];

    // Large model: q/k/v/o projections of two layers at 4096x4096, rank 64.
    const LARGE: &[LayerSpec] = &[
        ("layer_0_q", 4096, 4096, 64, 128.0),
        ("layer_0_k", 4096, 4096, 64, 128.0),
        ("layer_0_v", 4096, 4096, 64, 128.0),
        ("layer_0_o", 4096, 4096, 64, 128.0),
        ("layer_1_q", 4096, 4096, 64, 128.0),
        ("layer_1_k", 4096, 4096, 64, 128.0),
        ("layer_1_v", 4096, 4096, 64, 128.0),
        ("layer_1_o", 4096, 4096, 64, 128.0),
    ];

    const SUITE: [(&str, &[LayerSpec]); 3] = [
        ("Small Transformer (256-dim, 6 layers)", SMALL),
        ("Medium Transformer (768-dim, 12 layers)", MEDIUM),
        ("Large Transformer (4096-dim, 2 layers)", LARGE),
    ];

    SUITE
        .iter()
        .map(|&(title, specs)| benchmark_model(title, specs))
        .collect()
}

// Unit and property tests for the LoRA vs QLoRA memory benchmark helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    // ========================================================================
    // PROPERTY TESTS - Memory calculation correctness
    // ========================================================================

    proptest! {
        #![proptest_config(proptest::test_runner::Config::with_cases(100))]

        /// QLoRA should always use less memory than LoRA for large enough dimensions
        /// (square `d x d` weights with `16 <= d < 128`).
        #[test]
        fn prop_qlora_always_less_memory(
            d in 16usize..128,
            rank in 1usize..8,
            alpha in 1.0f32..32.0,
        ) {
            let size = d * d;
            let base_weight = Tensor::from_vec(vec![1.0; size], false);

            let lora = LoRALayer::new(base_weight.clone(), d, d, rank, alpha);
            let qlora = QLoRALayer::new(base_weight, d, d, rank, alpha);

            let lora_stats = LayerMemoryStats::from_lora("test".to_string(), &lora);
            let qlora_stats = LayerMemoryStats::from_qlora("test".to_string(), &qlora);

            // QLoRA total should be less than LoRA for matrices >= 16x16
            prop_assert!(
                qlora_stats.total_bytes < lora_stats.total_bytes,
                "QLoRA {} should be < LoRA {}",
                qlora_stats.total_bytes,
                lora_stats.total_bytes
            );
        }

        /// Savings percentage should be positive for non-trivial sizes
        /// (square `d x d` weights with `16 <= d < 64`).
        #[test]
        fn prop_savings_percent_positive(
            d in 16usize..64,
            rank in 1usize..4,
        ) {
            let size = d * d;
            let base_weight = Tensor::from_vec(vec![1.0; size], false);
            let qlora = QLoRALayer::new(base_weight, d, d, rank, 4.0);

            let stats = LayerMemoryStats::from_qlora("test".to_string(), &qlora);
            let savings = stats.savings_percent();

            // For d >= 16, should always have positive savings
            prop_assert!(
                savings > 0.0,
                "Savings {} should be positive for d={}",
                savings, d
            );
        }

        /// Benchmark results should be internally consistent:
        /// totals equal the sum of the per-layer figures.
        #[test]
        fn prop_benchmark_results_consistent(
            d in 16usize..64,
            num_layers in 1usize..4,
            rank in 1usize..4,
        ) {
            let layers: Vec<(&str, usize, usize, usize, f32)> =
                (0..num_layers)
                    .map(|i| {
                        // Use string literals so the names are `&'static str`
                        // and need no owner. With `num_layers < 4` only the
                        // first three arms are ever taken.
                        match i % 4 {
                            0 => ("q_proj", d, d, rank, rank as f32 * 2.0),
                            1 => ("k_proj", d, d, rank, rank as f32 * 2.0),
                            2 => ("v_proj", d, d, rank, rank as f32 * 2.0),
                            _ => ("o_proj", d, d, rank, rank as f32 * 2.0),
                        }
                    })
                    .collect();

            let results = benchmark_model("Test", &layers);

            // Total should equal sum of individual
            let sum_lora: usize = results.lora_stats.iter().map(|s| s.total_bytes).sum();
            let sum_qlora: usize = results.qlora_stats.iter().map(|s| s.total_bytes).sum();

            prop_assert_eq!(results.total_lora_bytes, sum_lora);
            prop_assert_eq!(results.total_qlora_bytes, sum_qlora);

            // Savings should be non-negative
            prop_assert!(results.savings_percent >= 0.0);
        }
    }

    // ========================================================================
    // UNIT TESTS
    // ========================================================================

    // `from_lora` reports f32-sized base and adapter memory and leaves the
    // quantization-only fields empty.
    #[test]
    fn test_layer_memory_stats_lora() {
        let base_weight = Tensor::from_vec(vec![1.0; 256 * 256], false);
        let lora = LoRALayer::new(base_weight, 256, 256, 16, 32.0);

        let stats = LayerMemoryStats::from_lora("test_layer".to_string(), &lora);

        assert_eq!(stats.name, "test_layer");
        assert_eq!(stats.base_unquantized_bytes, 256 * 256 * 4); // 256 KiB
        assert!(stats.base_quantized_bytes.is_none());
        assert_eq!(stats.lora_adapters_bytes, (16 * 256 + 256 * 16) * 4); // 32 KiB
        assert!(stats.compression_ratio.is_none());
    }

    // `from_qlora` fills in the quantized size and a compression ratio above 6x.
    #[test]
    fn test_layer_memory_stats_qlora() {
        let base_weight = Tensor::from_vec(vec![1.0; 256 * 256], false);
        let qlora = QLoRALayer::new(base_weight, 256, 256, 16, 32.0);

        let stats = LayerMemoryStats::from_qlora("test_layer".to_string(), &qlora);

        assert_eq!(stats.name, "test_layer");
        assert_eq!(stats.base_unquantized_bytes, 256 * 256 * 4);
        assert!(stats.base_quantized_bytes.is_some());
        assert!(
            stats.base_quantized_bytes.expect("operation should succeed")
                < stats.base_unquantized_bytes
        );
        assert!(stats.compression_ratio.is_some());
        assert!(stats.compression_ratio.expect("operation should succeed") > 6.0);
    }

    // Per-layer savings for a 256x256 QLoRA layer with rank 16.
    #[test]
    fn test_savings_percent() {
        let base_weight = Tensor::from_vec(vec![1.0; 256 * 256], false);
        let qlora = QLoRALayer::new(base_weight, 256, 256, 16, 32.0);

        let stats = LayerMemoryStats::from_qlora("test".to_string(), &qlora);
        let savings = stats.savings_percent();

        // Should save significant memory (>60%)
        assert!(savings > 60.0, "Expected >60% savings, got {savings:.1}%");
    }

    // `benchmark_model` keeps one stats entry per configured layer in each list.
    #[test]
    fn test_benchmark_model_small() {
        let results = benchmark_model(
            "Test Small Model",
            &[("layer_0", 256, 256, 8, 16.0), ("layer_1", 256, 256, 8, 16.0)],
        );

        assert_eq!(results.model_name, "Test Small Model");
        assert_eq!(results.lora_stats.len(), 2);
        assert_eq!(results.qlora_stats.len(), 2);
        assert!(results.total_qlora_bytes < results.total_lora_bytes);
        assert!(results.savings_percent > 50.0);
    }

    // The textual report contains every expected section heading and label.
    #[test]
    fn test_benchmark_report_format() {
        let results = benchmark_model("Test Model", &[("layer_0", 256, 256, 8, 16.0)]);

        let report = results.report();

        // Report should contain key sections
        assert!(report.contains("Memory Benchmark: Test Model"));
        assert!(report.contains("Layer-by-Layer Comparison"));
        assert!(report.contains("layer_0"));
        assert!(report.contains("LoRA:"));
        assert!(report.contains("QLoRA:"));
        assert!(report.contains("Memory savings:"));
    }

    // The built-in suite returns its three configurations in a fixed order.
    #[test]
    fn test_transformer_benchmarks() {
        let benchmarks = run_transformer_benchmarks();

        assert_eq!(benchmarks.len(), 3); // Small, Medium, Large

        // All should show significant savings
        for benchmark in &benchmarks {
            assert!(
                benchmark.savings_percent > 50.0,
                "{} should save >50% memory, got {:.1}%",
                benchmark.model_name,
                benchmark.savings_percent
            );
        }

        // Verify model names
        assert!(benchmarks[0].model_name.contains("Small"));
        assert!(benchmarks[1].model_name.contains("Medium"));
        assert!(benchmarks[2].model_name.contains("Large"));
    }

    // Aggregation arithmetic checked against hand-built per-layer statistics:
    // 2 x 1200 bytes LoRA vs 2 x 350 bytes QLoRA.
    #[test]
    fn test_benchmark_results_calculations() {
        let lora_stats = vec![
            LayerMemoryStats {
                name: "layer_0".to_string(),
                base_unquantized_bytes: 1000,
                base_quantized_bytes: None,
                lora_adapters_bytes: 200,
                total_bytes: 1200,
                compression_ratio: None,
            },
            LayerMemoryStats {
                name: "layer_1".to_string(),
                base_unquantized_bytes: 1000,
                base_quantized_bytes: None,
                lora_adapters_bytes: 200,
                total_bytes: 1200,
                compression_ratio: None,
            },
        ];

        let qlora_stats = vec![
            LayerMemoryStats {
                name: "layer_0".to_string(),
                base_unquantized_bytes: 1000,
                base_quantized_bytes: Some(150),
                lora_adapters_bytes: 200,
                total_bytes: 350,
                compression_ratio: Some(6.67),
            },
            LayerMemoryStats {
                name: "layer_1".to_string(),
                base_unquantized_bytes: 1000,
                base_quantized_bytes: Some(150),
                lora_adapters_bytes: 200,
                total_bytes: 350,
                compression_ratio: Some(6.67),
            },
        ];

        let results = BenchmarkResults::new("Test".to_string(), lora_stats, qlora_stats);

        assert_eq!(results.total_lora_bytes, 2400);
        assert_eq!(results.total_qlora_bytes, 700);
        assert_eq!(results.savings_bytes, 1700);
        // 1700 / 2400 = 70.83%
        assert!((results.savings_percent - 70.83).abs() < 0.1);
    }

    // End-to-end check at 4096x4096, the largest size used by the suite.
    #[test]
    fn test_large_model_memory_savings() {
        // Test realistic large model scenario
        let results = benchmark_model(
            "Large Model Test",
            &[("layer_0_q", 4096, 4096, 64, 128.0), ("layer_0_v", 4096, 4096, 64, 128.0)],
        );

        // 4096x4096 = 16M values = 64MB per layer unquantized
        // With 2 layers = 128MB base + adapters
        // QLoRA should reduce this significantly

        let lora_mb = results.total_lora_bytes as f32 / (1024.0 * 1024.0);
        let qlora_mb = results.total_qlora_bytes as f32 / (1024.0 * 1024.0);

        assert!(lora_mb > 100.0, "LoRA should use >100MB");
        assert!(qlora_mb < 50.0, "QLoRA should use <50MB");
        assert!(results.savings_percent > 60.0, "Should save >60% memory on large models");
    }
}