1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
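//! CORRECTNESS-011: Trace CPU hidden state BEFORE output_norm
//!
//! Prints the CPU embedding statistics and final logits for a single token,
//! followed by the GPU reference values and an RMSNorm sanity analysis. The CPU
//! hidden state before output_norm is not extracted yet; doing so requires a
//! custom or instrumented forward pass.
//!
//! Compare with GPU: sum=466.2486, rms=39.4793
//!
//! Set the `MODEL_PATH` environment variable to override the default model file.
//!
//! Run with: cargo run --example cpu_hidden_state_trace --release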
/// Prints CPU-side embedding statistics and final logits for a single token,
/// then the GPU reference numbers and a rough RMSNorm analysis of them.
///
/// The model file comes from the `MODEL_PATH` environment variable; a
/// hard-coded fallback path is used when the variable is unset or is not valid
/// Unicode.
///
/// # Errors
///
/// Propagates any error returned while mapping the GGUF file, building the
/// owned quantized model, or running the forward pass.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    use realizar::gguf::{MappedGGUFModel, OwnedQuantizedModel};
    use std::cmp::Ordering;

    // GPU reference figures (rounded) taken from the GPU debug output.
    const GPU_HIDDEN_RMS: f32 = 39.48;
    const GPU_NORMED_RMS: f32 = 4.66;
    // Epsilon added to the mean square in the RMSNorm estimate.
    const NORM_EPS: f32 = 1e-5;
    // Hidden width assumed by the analysis section.
    const ANALYSIS_HIDDEN_DIM: f32 = 1536.0;
    // How many leading values to show when previewing a vector.
    const PREVIEW_LEN: usize = 5;

    let model_path = match std::env::var("MODEL_PATH") {
        Ok(value) => value,
        Err(_) => String::from("/home/noah/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf"),
    };

    println!("CORRECTNESS-011: CPU Hidden State Before Output Norm");
    println!("=====================================================");
    println!("Model: {}", model_path);

    let mapped_file = MappedGGUFModel::from_path(&model_path)?;
    let cpu_model = OwnedQuantizedModel::from_mapped(&mapped_file)?;

    let token_id = 791u32;
    println!("\nToken ID: {}", token_id);
    println!("Hidden dim: {}", cpu_model.config().hidden_dim);

    // Step 1: the raw token embedding, i.e. the input to the first layer.
    let embedding = cpu_model.embed(&[token_id]);
    let embedding_len = embedding.len();
    let embedding_sum: f32 = embedding.iter().sum();
    let embedding_sq_sum = embedding.iter().map(|v| v * v).sum::<f32>();
    let embedding_rms: f32 = (embedding_sq_sum / embedding_len as f32).sqrt();

    println!("\n=== Initial Embedding ===");
    println!("first 5: {:?}", &embedding[..embedding_len.min(PREVIEW_LEN)]);
    println!("sum={:.4}, rms={:.4}", embedding_sum, embedding_rms);

    // Step 2: forward() exposes no intermediate activations, so for now only
    // the final logits are inspected.
    let logits = cpu_model.forward(&[token_id])?;
    // Incomparable pairs (NaN) are treated as equal, as in a plain partial_cmp fallback.
    let top_logit = logits
        .iter()
        .copied()
        .enumerate()
        .max_by(|lhs, rhs| lhs.1.partial_cmp(&rhs.1).unwrap_or(Ordering::Equal));

    println!("\n=== Final Logits ===");
    println!("Argmax: {:?}", top_logit);
    println!("first 5: {:?}", &logits[..logits.len().min(PREVIEW_LEN)]);

    // Step 3: GPU reference values, reproduced verbatim for side-by-side reading.
    for line in [
        "\n=== GPU Hidden State (from debug output) ===",
        "Hidden before output_norm:",
        " first 5: [1.2728, 7.7476, -18.4799, 22.1341, -23.2289]",
        " sum=466.2486, rms=39.4793",
        "Normed hidden:",
        " first 5: [0.1421, 0.9015, -1.5506, 2.5930, -2.6661]",
        " sum=107.5945, rms=4.6616",
    ] {
        println!("{}", line);
    }

    // Step 4: sanity-check the GPU numbers against the RMSNorm formula
    //   normed = hidden * rms_inv * weight,  rms_inv = 1 / sqrt(mean_sq + eps).
    // With rms = 39.48, mean_sq = rms^2 ≈ 1559 and sum_sq = mean_sq * 1536.
    let gpu_mean_sq = GPU_HIDDEN_RMS.powi(2);

    println!("\n=== Analysis ===");
    println!(
        "GPU hidden RMS = 39.48 suggests mean_sq = {:.2}",
        gpu_mean_sq
    );
    println!(
        "For hidden_dim=1536, sum_sq ≈ {:.2}",
        gpu_mean_sq * ANALYSIS_HIDDEN_DIM
    );

    // rms_inv ≈ 0.0253 for the GPU figures.
    let rms_inv = (gpu_mean_sq + NORM_EPS).sqrt().recip();
    println!(
        "Expected rms_inv = 1/sqrt({:.2} + 1e-5) = {:.6}",
        gpu_mean_sq, rms_inv
    );

    // With unit weights the normalized RMS would be ~1.0. The GPU reports 4.66,
    // so the output_norm weights must scale the result by roughly that factor.
    let unit_weight_rms = GPU_HIDDEN_RMS * rms_inv;
    println!(
        "\nGPU normed_rms = 4.66, expected if weight=1: {:.4}",
        unit_weight_rms
    );
    println!(
        "This implies output_norm weights have mean ≈ {:.2}",
        GPU_NORMED_RMS / unit_weight_rms
    );

    // Step 5: the open question is whether the CPU hidden state matches the GPU
    // one; the answer says which side of output_norm the bug is on.
    for line in [
        "\n=== CONCLUSION ===",
        "To determine root cause, need to compare:",
        "1. CPU hidden state sum/rms BEFORE output_norm",
        "2. GPU hidden state sum/rms = 466.25/39.48",
        "\nIf they match → bug is in GPU output_norm or LM head",
        "If they differ → bug is in GPU transformer layers (per spec: RoPE/Cache)",
    ] {
        println!("{}", line);
    }

    Ok(())
}