Skip to main content

ferrum_testkit/op_diff/
mod.rs

1//! Cross-backend op-diff harness — PLAYBOOK § 3 L1.
2//!
3//! Runs the same op on CPU (reference) and on each available accelerator
4//! (Metal / CUDA), then reports the **NMSE** (normalized mean-squared
5//! error) of each accelerator's output relative to CPU's. Modelled on
6//! llama.cpp's `tests/test-backend-ops.cpp` `NMSE = mse(a,b) / mse(a,0)`
7//! comparison rather than naive max-abs-diff: NMSE is invariant to the
8//! magnitude of the reference output, so a well-tuned kernel will sit
9//! at the same NMSE regardless of input scaling.
10//!
11//! # Usage
12//!
13//! ```ignore
14//! use ferrum_testkit::op_diff::{compare_backends, NMSE_FP16_TOL, rms_norm::RmsNormOp};
15//!
16//! let report = compare_backends(&RmsNormOp { tokens: 4, dim: 4096, eps: 1e-6 }, 42);
17//! if let Some(nmse) = report.metal_nmse {
18//!     assert!(nmse < NMSE_FP16_TOL, "metal rms_norm NMSE {nmse} exceeds fp16 tol");
19//! }
20//! ```
21//!
22//! # Tolerance buckets (PLAYBOOK § 3.1)
23//!
24//! - `NMSE_FP32_TOL = 1e-7` — fp32 kernels must agree with CPU below this.
25//! - `NMSE_FP16_TOL = 1e-6` — fp16 kernels (Metal default storage).
26//!
27//! Tighter bucketing per op is welcome — define op-specific constants
28//! once empirical baselines are stable.
29
30pub mod activation_bridge;
31pub mod argmax_rows;
32pub mod embedding_lookup;
33pub mod flash_attention;
34pub mod fused_add_rms_norm;
35pub mod gemm;
36pub mod kv_cache_append;
37pub mod marlin_matmul; // stub — see file docs
38pub mod paged_varlen_attn; // stub — see file docs
39pub mod qk_norm_rope;
40pub mod residual_add;
41pub mod rms_norm;
42pub mod silu_mul;
43pub mod split_qkv;
44pub mod transpose_head_to_token;
45
/// NMSE ceiling for fp32 kernels: an accelerator computing in fp32 should
/// agree with the CPU reference below this value.
pub const NMSE_FP32_TOL: f64 = 1.0e-7;
/// NMSE ceiling for the fp16 bucket (fp16 storage / Metal accumulation),
/// which is given a slightly looser bound than the fp32 one.
pub const NMSE_FP16_TOL: f64 = 1.0e-6;
50
/// Normalized mean-squared error of `b` measured against the reference `a`.
///
/// Computes `mse(a, b) / mse(a, 0)`, accumulating in `f64`.
///
/// Special cases:
/// - two empty slices yield `0.0`;
/// - when the reference energy `mse(a, 0)` is below `1e-30` (in practice an
///   all-zero reference) the un-normalized `mse(a, b)` is returned instead,
///   so ops that legitimately output zeros never divide by zero.
///
/// # Panics
/// Panics if `a.len() != b.len()`.
pub fn nmse(a: &[f32], b: &[f32]) -> f64 {
    assert_eq!(a.len(), b.len(), "nmse: length mismatch");
    if a.is_empty() {
        return 0.0;
    }

    // One pass over both slices, keeping two independent running sums:
    // squared error against the reference, and squared reference magnitude.
    let (err_sq_sum, ref_sq_sum) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64),
        |(err_acc, ref_acc), (&reference, &candidate)| {
            let reference = f64::from(reference);
            let diff = reference - f64::from(candidate);
            (err_acc + diff * diff, ref_acc + reference * reference)
        },
    );

    let count = a.len() as f64;
    let err_mean = err_sq_sum / count;
    let ref_mean = ref_sq_sum / count;

    if ref_mean < 1e-30 {
        // Degenerate reference: report the raw MSE rather than dividing by ~0.
        err_mean
    } else {
        err_mean / ref_mean
    }
}
87
/// Host-side result of one op invocation: a flat `Vec<f32>` that each
/// backend produces by `to_vec()`-ing its output buffer back to the host.
pub type Output = Vec<f32>;

/// One op under test, able to execute itself on every backend compiled
/// into the current build.
///
/// The accelerator methods exist only under their respective `cfg`s, so an
/// implementor must gate its `run_metal` / `run_cuda` definitions the same
/// way.
pub trait OpUnderTest {
    /// Human-readable op name; surfaces in test failure messages.
    fn name(&self) -> &str;

    /// Execute on the CPU. This is the reference result and is always
    /// available.
    fn run_cpu(&self, seed: u64) -> Output;

    #[cfg(all(target_os = "macos", feature = "metal"))]
    /// Execute on Metal. Present only with
    /// `cfg(all(target_os = "macos", feature = "metal"))`.
    fn run_metal(&self, seed: u64) -> Output;

    #[cfg(feature = "cuda")]
    /// Execute on CUDA. Present only with `cfg(feature = "cuda")`.
    fn run_cuda(&self, seed: u64) -> Output;
}
108
/// Outcome of running one op on CPU and on every accelerator in the build.
///
/// `cpu` holds the reference output. `metal_nmse` / `cuda_nmse` carry the
/// accelerator's NMSE relative to that reference, and are `None` on builds
/// that do not include the corresponding backend.
#[derive(Debug)]
pub struct NmseReport {
    pub op: String,
    pub seed: u64,
    pub cpu: Vec<f32>,
    pub metal_nmse: Option<f64>,
    pub cuda_nmse: Option<f64>,
}

impl NmseReport {
    /// Returns `true` when every accelerator that produced a measurement is
    /// strictly below `tol`.
    ///
    /// Backends that were not run (`None`) are skipped, so a report with no
    /// accelerator measurements passes. A NaN measurement never satisfies
    /// `< tol` and therefore fails the check.
    pub fn within_tol(&self, tol: f64) -> bool {
        [self.metal_nmse, self.cuda_nmse]
            .iter()
            .flatten()
            .all(|measured| *measured < tol)
    }
}
128
129/// Run `op` on every backend the current build supports and assemble
130/// the comparison report.
131pub fn compare_backends(op: &dyn OpUnderTest, seed: u64) -> NmseReport {
132    let cpu = op.run_cpu(seed);
133    let metal_nmse = run_metal_nmse(op, &cpu, seed);
134    let cuda_nmse = run_cuda_nmse(op, &cpu, seed);
135    NmseReport {
136        op: op.name().to_string(),
137        seed,
138        cpu,
139        metal_nmse,
140        cuda_nmse,
141    }
142}
143
144#[cfg(all(target_os = "macos", feature = "metal"))]
145fn run_metal_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
146    Some(nmse(cpu, &op.run_metal(seed)))
147}
148
149#[cfg(not(all(target_os = "macos", feature = "metal")))]
150fn run_metal_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
151    None
152}
153
154#[cfg(feature = "cuda")]
155fn run_cuda_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
156    Some(nmse(cpu, &op.run_cuda(seed)))
157}
158
159#[cfg(not(feature = "cuda"))]
160fn run_cuda_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
161    None
162}
163
164/// Convenience: deterministic uniform-random `Vec<f32>` in `[lo, hi)`.
165pub fn random_vec(n: usize, lo: f32, hi: f32, seed: u64) -> Vec<f32> {
166    use rand::{Rng, SeedableRng};
167    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
168    (0..n).map(|_| rng.random_range(lo..hi)).collect()
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a report with the given accelerator measurements and filler
    /// values for the fields `within_tol` does not read.
    fn report(metal_nmse: Option<f64>, cuda_nmse: Option<f64>) -> NmseReport {
        NmseReport {
            op: String::from("dummy"),
            seed: 0,
            cpu: Vec::new(),
            metal_nmse,
            cuda_nmse,
        }
    }

    #[test]
    fn nmse_identical_is_zero() {
        let a = vec![1.0, 2.0, 3.0];
        assert!(nmse(&a, &a) < 1e-30);
    }

    #[test]
    fn nmse_scaled_b_proportional() {
        // b = 1.01 * a → relative error 0.01, NMSE ≈ 0.0001
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b: Vec<f32> = a.iter().map(|x| x * 1.01).collect();
        let n = nmse(&a, &b);
        // NMSE = mse(0.01*a, 0) / mse(a, 0) = 0.0001
        assert!((n - 1e-4).abs() < 1e-5);
    }

    #[test]
    fn nmse_zero_reference_falls_back() {
        // a all-zero: NMSE returns raw MSE.
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![0.1, 0.1, 0.1];
        let n = nmse(&a, &b);
        assert!((n - 0.01).abs() < 1e-9);
    }

    #[test]
    fn nmse_empty_is_zero() {
        // Documented early-out: nothing to compare yields 0.0, not NaN.
        let empty: Vec<f32> = Vec::new();
        assert_eq!(nmse(&empty, &empty), 0.0);
    }

    #[test]
    #[should_panic(expected = "nmse: length mismatch")]
    fn nmse_length_mismatch_panics() {
        // Pins the `# Panics` contract in the `nmse` docs.
        let _ = nmse(&[1.0], &[1.0, 2.0]);
    }

    #[test]
    fn nmse_nan_output_propagates() {
        // A NaN from an accelerator must surface, never read as "zero error".
        assert!(nmse(&[1.0, 2.0], &[1.0, f32::NAN]).is_nan());
    }

    #[test]
    fn within_tol_skips_missing_backends() {
        assert!(report(None, None).within_tol(NMSE_FP32_TOL));
        assert!(report(Some(1e-9), None).within_tol(NMSE_FP32_TOL));
        assert!(report(None, Some(1e-9)).within_tol(NMSE_FP32_TOL));
    }

    #[test]
    fn within_tol_is_strict_and_rejects_nan() {
        // One failing backend fails the whole report.
        assert!(!report(Some(1e-9), Some(1e-3)).within_tol(NMSE_FP16_TOL));
        // The bound is exclusive.
        assert!(!report(Some(NMSE_FP16_TOL), None).within_tol(NMSE_FP16_TOL));
        // NaN compares false against any tolerance.
        assert!(!report(Some(f64::NAN), None).within_tol(1.0));
    }

    #[test]
    fn random_vec_determinism() {
        let a = random_vec(100, -1.0, 1.0, 42);
        let b = random_vec(100, -1.0, 1.0, 42);
        assert_eq!(a, b);
    }
}