realizar 0.8.5

Pure Rust ML inference engine built from scratch - model serving for GGUF and safetensors
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
//! CUDA GQA Parity Tests - Phase 54
//!
//! Tests for CPU/GPU parity with Grouped Query Attention (GQA) models.
//! Validates that GPU path produces same results as CPU path for GQA configs.
//!
//! ## Five-Whys Root Cause Analysis (PMAT-802)
//!
//! **Problem**: BUG-GGUF-001 (Q4_0 layout) and Q5_0 GQA bugs weren't caught by tests.
//!
//! 1. **Why?** No kernel-level parity tests comparing CPU vs GPU
//! 2. **Why?** Test infra focused on end-to-end, not isolated components
//! 3. **Why?** Setup/teardown requires full model files
//! 4. **Why?** No synthetic weight generators for isolated testing
//! 5. **Why?** Never designed ModelFixture pattern for standardized testing
//!
//! **Solution**: Layer 2 Kernel Parity Tests with synthetic weight generators
//!
//! ## Test Layers (Probar-style)
//!
//! - **Layer 1**: Unit tests (pure functions, no GPU)
//! - **Layer 2**: Kernel parity tests (CPU vs GPU for single ops) ← THIS MODULE
//! - **Layer 3**: Component tests (attention, FFN, etc.)
//! - **Layer 4**: Integration tests (full model inference)

#![cfg(feature = "cuda")]

use super::test_fixtures::{generate_q4_0_weights, generate_q5_0_weights};
use crate::cuda::CudaExecutor;
use crate::gguf::ops;
use crate::quantize::dequant::{dequantize_q4_0, dequantize_q5_0};
use serial_test::serial;
use trueno_gpu::driver::GpuBuffer;

// ============================================================================
// RMSNorm Parity Tests
// ============================================================================

/// Test RMSNorm parity between CPU and GPU.
///
/// RMSNorm is the first operation after embedding, so if this diverges,
/// everything downstream will too.
///
/// The GPU result of `rmsnorm_gpu` is compared against the CPU result of
/// `ops::layer_norm` in two ways:
/// - the relative difference of the output sums must be below 1%;
/// - the largest element-wise difference must be within 1% of the largest
///   CPU output magnitude, so errors that cancel out in the sum (for example
///   a permuted output) are still detected.
///
/// Returns early without failing when CUDA is not available.
///
/// NOTE(review): assumes `ops::layer_norm` with `None` as its third argument is
/// the RMSNorm reference for this kernel — confirm.
#[test]
#[serial]
#[ignore] // Parity thresholds (1%) may differ across GPU architectures; validated separately
fn test_gqa_rmsnorm_cpu_gpu_parity() {
    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Hidden size of the Qwen-style GQA config (14 heads, 2 kv_heads);
    // the head counts themselves are not used by RMSNorm.
    let hidden_dim = 896usize;
    let epsilon = 1e-6f32;

    // Create test input (simulated embedding)
    let input: Vec<f32> = (0..hidden_dim)
        .map(|i| ((i as f32 * 0.01) - 4.0).sin())
        .collect();

    // Create gamma weights (RMSNorm weights)
    let gamma: Vec<f32> = (0..hidden_dim).map(|i| 1.0 + (i as f32 * 0.001)).collect();

    // CPU reference
    let cpu_output = ops::layer_norm(&input, &gamma, None, epsilon);

    // GPU RMSNorm
    let input_buf = GpuBuffer::from_host(&executor.context, &input).expect("upload input");
    let gamma_buf = GpuBuffer::from_host(&executor.context, &gamma).expect("upload gamma");

    let gpu_output_buf = executor
        .rmsnorm_gpu(&input_buf, &gamma_buf, hidden_dim as u32, epsilon)
        .expect("GPU RMSNorm");

    // Wait for the kernel before reading the result back
    executor.stream.synchronize().expect("sync");

    let mut gpu_output = vec![0.0f32; hidden_dim];
    gpu_output_buf
        .copy_to_host(&mut gpu_output)
        .expect("download");

    // Compare
    let cpu_sum: f32 = cpu_output.iter().sum();
    let gpu_sum: f32 = gpu_output.iter().sum();

    println!("=== RMSNorm Parity Test ===");
    println!("CPU first 5: {:?}", &cpu_output[..5]);
    println!("GPU first 5: {:?}", &gpu_output[..5]);
    println!("CPU sum: {:.6}", cpu_sum);
    println!("GPU sum: {:.6}", gpu_sum);

    // Largest element-wise deviation. `f32::max` skips NaN operands, so NaN
    // outputs are caught by the sum check below rather than here.
    let max_diff = cpu_output
        .iter()
        .zip(gpu_output.iter())
        .map(|(c, g)| (c - g).abs())
        .fold(0.0f32, f32::max);

    println!("Max element diff: {:.6}", max_diff);

    // Should be within 1% for RMSNorm
    let sum_diff = (cpu_sum - gpu_sum).abs() / cpu_sum.abs().max(1e-6);
    assert!(
        sum_diff < 0.01,
        "RMSNorm sum differs by {:.2}%: CPU={:.6}, GPU={:.6}",
        sum_diff * 100.0,
        cpu_sum,
        gpu_sum
    );

    // The sum alone cannot detect compensating errors, so also bound the
    // worst element by 1% of the largest CPU magnitude.
    let peak = cpu_output.iter().fold(0.0f32, |acc, v| acc.max(v.abs()));
    let tolerance = 0.01 * peak.max(1e-6);
    assert!(
        max_diff <= tolerance,
        "RMSNorm max element diff {:.6} exceeds tolerance {:.6}",
        max_diff,
        tolerance
    );
}

/// Test RMSNorm parity using `rmsnorm_into` (pre-allocated output buffer).
///
/// The GPU result is compared against the CPU result of `ops::layer_norm`
/// in two ways:
/// - the relative difference of the output sums must be below 1%;
/// - the largest element-wise difference must be within 1% of the largest
///   CPU output magnitude, so errors that cancel out in the sum (for example
///   a permuted output) are still detected.
///
/// Returns early without failing when CUDA is not available.
///
/// NOTE(review): assumes `ops::layer_norm` with `None` as its third argument is
/// the RMSNorm reference for this kernel — confirm.
#[test]
#[serial]
#[ignore] // Parity thresholds (1%) may differ across GPU architectures; validated separately
fn test_gqa_rmsnorm_into_parity() {
    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    let hidden_dim = 896usize;
    let epsilon = 1e-6f32;

    // Deterministic synthetic input and gamma weights
    let input: Vec<f32> = (0..hidden_dim)
        .map(|i| ((i as f32 * 0.01) - 4.0).sin())
        .collect();
    let gamma: Vec<f32> = (0..hidden_dim).map(|i| 1.0 + (i as f32 * 0.001)).collect();

    // CPU reference
    let cpu_output = ops::layer_norm(&input, &gamma, None, epsilon);

    // GPU with rmsnorm_into: the caller owns the output buffer
    let input_buf = GpuBuffer::from_host(&executor.context, &input).expect("upload input");
    let gamma_buf = GpuBuffer::from_host(&executor.context, &gamma).expect("upload gamma");
    let output_buf = GpuBuffer::<f32>::new(&executor.context, hidden_dim).expect("output buf");

    executor
        .rmsnorm_into(
            &input_buf,
            &gamma_buf,
            &output_buf,
            hidden_dim as u32,
            epsilon,
        )
        .expect("GPU RMSNorm into");

    // Wait for the kernel before reading the result back
    executor.stream.synchronize().expect("sync");

    let mut gpu_output = vec![0.0f32; hidden_dim];
    output_buf.copy_to_host(&mut gpu_output).expect("download");

    // Compare
    let cpu_sum: f32 = cpu_output.iter().sum();
    let gpu_sum: f32 = gpu_output.iter().sum();

    println!("=== RMSNorm Into Parity Test ===");
    println!("CPU first 5: {:?}", &cpu_output[..5]);
    println!("GPU first 5: {:?}", &gpu_output[..5]);
    println!("CPU sum: {:.6}", cpu_sum);
    println!("GPU sum: {:.6}", gpu_sum);

    // Report the operands on failure, matching the sibling rmsnorm_gpu test
    let sum_diff = (cpu_sum - gpu_sum).abs() / cpu_sum.abs().max(1e-6);
    assert!(
        sum_diff < 0.01,
        "RMSNorm sum differs by {:.2}%: CPU={:.6}, GPU={:.6}",
        sum_diff * 100.0,
        cpu_sum,
        gpu_sum
    );

    // The sum alone cannot detect compensating errors, so also bound the
    // worst element by 1% of the largest CPU magnitude. `f32::max` skips NaN
    // operands; NaN outputs are caught by the sum check above.
    let max_diff = cpu_output
        .iter()
        .zip(gpu_output.iter())
        .map(|(c, g)| (c - g).abs())
        .fold(0.0f32, f32::max);
    let peak = cpu_output.iter().fold(0.0f32, |acc, v| acc.max(v.abs()));
    let tolerance = 0.01 * peak.max(1e-6);
    assert!(
        max_diff <= tolerance,
        "RMSNorm max element diff {:.6} exceeds tolerance {:.6}",
        max_diff,
        tolerance
    );
}

// ============================================================================
// Q4K GEMV Dimension Checks for GQA (KV cache and workspace sizing; no GEMV is run)
// ============================================================================

/// Checks the Q and K/V projection widths implied by a GQA KV cache setup.
///
/// With 14 query heads sharing 2 KV heads (head width 64), the head counts the
/// executor stores after `init_kv_cache_gpu` must yield `q_dim == hidden_dim`,
/// `kv_dim == num_kv_heads * head_dim`, and `kv_dim < q_dim`.
/// No GEMV kernel is launched. Returns early without failing when CUDA is
/// not available.
#[test]
#[serial]
fn test_gqa_qkv_dimension_correctness() {
    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Model shape: 896 hidden units over 14 query heads, with 2 KV heads
    let (hidden_dim, num_heads, num_kv_heads) = (896usize, 14usize, 2usize);
    let head_dim = hidden_dim / num_heads; // 64
    let (num_layers, max_seq_len) = (1usize, 128usize);

    // Widths the executor is expected to report
    let want_q_dim = num_heads * head_dim;
    let want_kv_dim = num_kv_heads * head_dim;

    // Size the KV cache for the GQA shape above
    executor
        .init_kv_cache_gpu(num_layers, num_heads, num_kv_heads, head_dim, max_seq_len)
        .expect("init kv cache");

    // Recompute both widths from what the executor actually stored
    let q_dim = executor.kv_num_heads * executor.kv_head_dim;
    let kv_dim = executor.kv_num_kv_heads * executor.kv_head_dim;

    println!("=== GQA Dimension Check ===");
    println!("num_heads: {}", num_heads);
    println!("num_kv_heads: {}", num_kv_heads);
    println!("head_dim: {}", head_dim);
    println!("Expected q_dim: {} (num_heads * head_dim)", want_q_dim);
    println!("Actual q_dim: {}", q_dim);
    println!("Expected kv_dim: {} (num_kv_heads * head_dim)", want_kv_dim);
    println!("Actual kv_dim: {}", kv_dim);

    // For this config the query width coincides with the hidden size
    assert_eq!(
        q_dim, hidden_dim,
        "q_dim should equal hidden_dim: {} != {}",
        q_dim, hidden_dim
    );

    // The K/V width is driven by the KV head count, not the query head count
    assert_eq!(
        kv_dim, want_kv_dim,
        "kv_dim should be {}: {} != {}",
        want_kv_dim, kv_dim, want_kv_dim
    );

    // Fewer KV heads than query heads means a strictly narrower K/V projection
    assert!(
        q_dim > kv_dim,
        "GQA: kv_dim ({}) should be < q_dim ({})",
        kv_dim,
        q_dim
    );

    println!(
        "GQA dimensions VERIFIED: q_dim={}, kv_dim={}",
        q_dim, kv_dim
    );
}

/// Checks that the workspace is sized from the GQA head configuration.
///
/// After `init_kv_cache_gpu` and `init_workspace`, the workspace must report
/// `q_dim == num_heads * head_dim` and `kv_dim == num_kv_heads * head_dim`,
/// and `q_buf` / `k_buf` must hold those widths rounded up to a multiple of
/// 256 elements. Returns early without failing when CUDA is not available.
#[test]
#[serial]
fn test_gqa_workspace_allocation() {
    // GH-215: buffers are padded to Q4K super-block boundary (256 elements)
    const SUPER_BLOCK: usize = 256;

    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // GQA shape: 14 query heads, 2 KV heads, Qwen FFN width of 4864
    let (hidden_dim, intermediate_dim) = (896usize, 4864usize);
    let (num_heads, num_kv_heads) = (14usize, 2usize);
    let head_dim = hidden_dim / num_heads;
    let (num_layers, max_seq_len) = (1usize, 128usize);

    // The KV cache is initialized before the workspace
    executor
        .init_kv_cache_gpu(num_layers, num_heads, num_kv_heads, head_dim, max_seq_len)
        .expect("init kv cache");
    executor
        .init_workspace(hidden_dim, intermediate_dim)
        .expect("init workspace");

    // Unpadded widths the workspace should record
    let want_q = num_heads * head_dim;
    let want_kv = num_kv_heads * head_dim;

    println!("=== Workspace Buffer Check ===");
    println!("Expected q_buf size: {}", want_q);
    println!("Expected k_buf size: {}", want_kv);
    println!("Expected v_buf size: {}", want_kv);
    println!("Workspace q_dim: {}", executor.workspace.q_dim);
    println!("Workspace kv_dim: {}", executor.workspace.kv_dim);

    assert_eq!(
        executor.workspace.q_dim, want_q,
        "Workspace q_dim mismatch"
    );
    assert_eq!(
        executor.workspace.kv_dim, want_kv,
        "Workspace kv_dim mismatch"
    );

    // Round a width up to the next super-block multiple
    let padded = |dim: usize| (dim + SUPER_BLOCK - 1) / SUPER_BLOCK * SUPER_BLOCK;

    // Both buffers must exist before their lengths are compared
    let q_buf = executor.workspace.q_buf.as_ref().expect("q_buf");
    let k_buf = executor.workspace.k_buf.as_ref().expect("k_buf");

    assert_eq!(
        q_buf.len(),
        padded(want_q),
        "q_buf size mismatch (padded to 256)"
    );
    assert_eq!(
        k_buf.len(),
        padded(want_kv),
        "k_buf size mismatch (padded to 256)"
    );

    println!("Workspace buffers VERIFIED for GQA");
}

// ============================================================================
// Transformer Layer Configuration Smoke Test (no end-to-end parity without model weights)
// ============================================================================

/// Smoke test: GQA setup for a transformer layer initializes without error.
///
/// Initializes the KV cache and workspace for a 14-head / 2-KV-head config and
/// checks that the executor stored the head counts and head width it was given,
/// and that the resulting `kv_dim` is smaller than `q_dim`. No layer is
/// executed here; real parity needs full model weights.
/// Returns early without failing when CUDA is not available.
#[test]
#[serial]
fn test_gqa_transformer_layer_no_crash() {
    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // GQA shape under test
    let (hidden_dim, intermediate_dim) = (896usize, 4864usize);
    let (num_heads, num_kv_heads) = (14usize, 2usize);
    let head_dim = hidden_dim / num_heads;
    let (num_layers, max_seq_len) = (1usize, 128usize);
    let epsilon = 1e-6f32; // only reported below, never passed to a kernel

    // KV cache first, then the workspace
    executor
        .init_kv_cache_gpu(num_layers, num_heads, num_kv_heads, head_dim, max_seq_len)
        .expect("init kv cache");
    executor
        .init_workspace(hidden_dim, intermediate_dim)
        .expect("init workspace");

    // The executor must have kept exactly the head layout it was given
    assert_eq!(executor.kv_num_heads, num_heads);
    assert_eq!(executor.kv_num_kv_heads, num_kv_heads);
    assert_eq!(executor.kv_head_dim, head_dim);

    // Projection widths derived from the stored layout
    let q_dim = executor.kv_num_heads * executor.kv_head_dim;
    let kv_dim = executor.kv_num_kv_heads * executor.kv_head_dim;

    println!("=== GQA Transformer Layer Smoke Test ===");
    println!("Hidden dim: {}", hidden_dim);
    println!("Intermediate dim: {}", intermediate_dim);
    println!(
        "num_heads: {}, num_kv_heads: {}, head_dim: {}",
        num_heads, num_kv_heads, head_dim
    );
    println!("Q dim: {}, KV dim: {}", q_dim, kv_dim);
    println!("Epsilon: {}", epsilon);

    // Only the configuration is validated; inference itself is out of scope
    // because it would need model weights.
    assert!(
        q_dim > kv_dim,
        "GQA should have kv_dim < q_dim: {} < {}",
        kv_dim,
        q_dim
    );

    println!("GQA transformer layer configuration VERIFIED");
}

// ============================================================================
// Layer 2: Kernel Parity Tests (Five-Whys Root Cause Fix)
// ============================================================================
// These tests validate individual kernel numerical correctness.
// They would have caught BUG-GGUF-001 (Q4_0 layout) and the Q5_0 GQA bug.
//
// Synthetic weight generators are now in test_fixtures.rs for reuse.

/// Q4_0 GEMV parity: CPU dequantize + dot product vs. the GPU Q4_0 GEMV kernel.
///
/// Synthetic Q4_0 weights for a single 128-element row are dequantized on the
/// CPU and dotted with a sine-wave input; the GPU kernel consumes the same
/// quantized bytes directly. The two scalars must agree to within 1% relative
/// difference. This is the kind of check that would have caught BUG-GGUF-001
/// before it caused runtime failures.
/// Returns early without failing when CUDA is not available.
#[test]
#[serial]
fn test_q4_0_gemv_parity() {
    if !CudaExecutor::is_available() {
        eprintln!("[SKIP] CUDA not available");
        return;
    }

    let mut executor = CudaExecutor::new(0).expect("CUDA executor");

    // Problem size: one output row over 4 blocks of 32 elements
    let num_blocks = 4usize;
    let k = num_blocks * 32; // input length (128)
    let n = 1usize; // output length (single row => GEMV)

    let quantized = generate_q4_0_weights(num_blocks);

    // CPU reference, step 1: expand the quantized row to f32
    let dequantized = dequantize_q4_0(&quantized).expect("dequantize Q4_0");
    assert_eq!(dequantized.len(), k, "Dequantized length mismatch");

    // Deterministic input vector
    let input: Vec<f32> = (0..k).map(|i| (i as f32 * 0.01).sin()).collect();

    // CPU reference, step 2: the single-row matmul is just a dot product
    let cpu_output: f32 = dequantized
        .iter()
        .zip(input.iter())
        .map(|(w, x)| w * x)
        .sum();

    // GPU path: weights go up as raw quantized bytes
    let weights_buf =
        GpuBuffer::from_host(&executor.context, &quantized).expect("upload weights");
    let input_buf = GpuBuffer::from_host(&executor.context, &input).expect("upload input");
    let output_buf = GpuBuffer::<f32>::new(&executor.context, n).expect("output buffer");

    // The _into variant takes the weights as a raw device pointer
    executor
        .q4_0_gemv_into(
            weights_buf.as_ptr(),
            &input_buf,
            &output_buf,
            n as u32,
            k as u32,
        )
        .expect("Q4_0 GEMV");

    // Wait for the kernel before reading the result back
    executor.stream.synchronize().expect("sync");

    let mut gpu_output = vec![0.0f32; n];
    output_buf.copy_to_host(&mut gpu_output).expect("download");

    // Absolute and relative deviation; the 1e-6 floor avoids dividing by zero
    let diff = (cpu_output - gpu_output[0]).abs();
    let rel_diff = diff / cpu_output.abs().max(1e-6);

    println!("=== Q4_0 GEMV Parity Test ===");
    println!("CPU output: {:.6}", cpu_output);
    println!("GPU output: {:.6}", gpu_output[0]);
    println!("Absolute diff: {:.6}", diff);
    println!("Relative diff: {:.4}%", rel_diff * 100.0);

    // Quantized GEMV is expected to agree to within 1%
    assert!(
        rel_diff < 0.01,
        "Q4_0 GEMV parity failed: CPU={:.6}, GPU={:.6}, diff={:.4}%",
        cpu_output,
        gpu_output[0],
        rel_diff * 100.0
    );

    println!("Q4_0 GEMV parity VERIFIED");
}

// Textually include `gqa_parity_tests_02.rs` (presumably the continuation of
// these parity tests) so its items compile as part of this module and share
// the imports declared above.
include!("gqa_parity_tests_02.rs");