ferrum_testkit/op_diff/
mod.rs

1//! Cross-backend op-diff harness — PLAYBOOK § 3 L1.
2//!
//! Runs the same op on CPU (reference) and on each available accelerator
//! (Metal / CUDA), then reports the **NMSE** (normalized mean-squared
//! error) of each accelerator's output relative to the CPU's. Modelled on
//! the `NMSE = mse(a,b) / mse(a,0)` comparison in llama.cpp's
//! `tests/test-backend-ops.cpp` rather than a naive max-abs-diff: NMSE is
//! invariant to the magnitude of the reference output, so a well-tuned
//! kernel will sit at the same NMSE regardless of input scaling.
10//!
11//! # Usage
12//!
13//! ```ignore
14//! use ferrum_testkit::op_diff::{compare_backends, NMSE_FP16_TOL, rms_norm::RmsNormOp};
15//!
16//! let report = compare_backends(&RmsNormOp { tokens: 4, dim: 4096, eps: 1e-6 }, 42);
17//! if let Some(nmse) = report.metal_nmse {
18//!     assert!(nmse < NMSE_FP16_TOL, "metal rms_norm NMSE {nmse} exceeds fp16 tol");
19//! }
20//! ```
21//!
22//! # Tolerance buckets (PLAYBOOK § 3.1)
23//!
24//! - `NMSE_FP32_TOL = 1e-7` — fp32 kernels must agree with CPU below this.
25//! - `NMSE_FP16_TOL = 1e-6` — fp16 kernels (Metal default storage).
26//!
27//! Tighter bucketing per op is welcome — define op-specific constants
28//! once empirical baselines are stable.
29
30pub mod gemm;
31pub mod marlin_matmul; // stub — see file docs
32pub mod paged_varlen_attn; // stub — see file docs
33pub mod qk_norm_rope;
34pub mod rms_norm;
35pub mod silu_mul;
36
/// NMSE ceiling for fp32 kernels: an accelerator result is expected to
/// land strictly below this value when compared against the CPU output.
pub const NMSE_FP32_TOL: f64 = 1.0e-7;
/// NMSE ceiling for fp16 storage / Metal accumulation — one order of
/// magnitude looser than [`NMSE_FP32_TOL`].
pub const NMSE_FP16_TOL: f64 = 1.0e-6;
41
/// Normalized mean-squared error of `b` measured against the reference `a`.
///
/// Computes `mse(a, b) / mse(a, 0)`, widening every element to `f64`
/// before any arithmetic. Two special cases avoid a meaningless ratio:
///
/// - empty inputs yield `0.0`;
/// - when the reference energy `mse(a, 0)` is below `1e-30` (for example
///   an all-zero reference) the un-normalized `mse(a, b)` is returned
///   instead, so ops that legitimately output zeros never divide by zero.
///
/// A NaN anywhere in either input propagates into the result.
///
/// # Panics
/// Panics if `a.len() != b.len()`.
pub fn nmse(a: &[f32], b: &[f32]) -> f64 {
    assert_eq!(a.len(), b.len(), "nmse: length mismatch");
    if a.is_empty() {
        return 0.0;
    }

    // Accumulate both squared-error sums in one in-order pass.
    let mut err_energy = 0.0_f64;
    let mut ref_energy = 0.0_f64;
    for (&reference, &candidate) in a.iter().zip(b) {
        let reference = f64::from(reference);
        let diff = reference - f64::from(candidate);
        err_energy += diff * diff;
        ref_energy += reference * reference;
    }

    let count = a.len() as f64;
    let mse_err = err_energy / count;
    let mse_ref = ref_energy / count;

    // Degenerate reference: report the raw MSE rather than dividing by ~0.
    if mse_ref < 1e-30 {
        mse_err
    } else {
        mse_err / mse_ref
    }
}
78
/// Host-side result of one op invocation: the flat `f32` values a backend
/// hands back once its buffer has been copied to the host with `to_vec()`.
pub type Output = Vec<f32>;
82
83/// A single op-under-test: knows how to run itself on each backend.
84pub trait OpUnderTest {
85    /// Display name (used in test failure messages).
86    fn name(&self) -> &str;
87
88    /// Run on CPU (reference). Always available.
89    fn run_cpu(&self, seed: u64) -> Output;
90
91    /// Run on Metal. Only available with `cfg(all(target_os = "macos", feature = "metal"))`.
92    #[cfg(all(target_os = "macos", feature = "metal"))]
93    fn run_metal(&self, seed: u64) -> Output;
94
95    /// Run on CUDA. Only available with `cfg(feature = "cuda")`.
96    #[cfg(feature = "cuda")]
97    fn run_cuda(&self, seed: u64) -> Output;
98}
99
/// Result of comparing one op across backends.
///
/// `cpu` holds the reference output. Each `*_nmse` field is the NMSE of
/// that accelerator's output against `cpu`, or `None` when the backend is
/// not part of the current build.
#[derive(Debug)]
pub struct NmseReport {
    /// Name of the op that was compared.
    pub op: String,
    /// Seed that was passed to every backend run.
    pub seed: u64,
    /// Reference output produced by the CPU backend.
    pub cpu: Vec<f32>,
    /// NMSE of the Metal output vs. `cpu`, if Metal was run.
    pub metal_nmse: Option<f64>,
    /// NMSE of the CUDA output vs. `cpu`, if CUDA was run.
    pub cuda_nmse: Option<f64>,
}

impl NmseReport {
    /// Returns `true` when every accelerator that produced an NMSE is
    /// strictly below `tol`.
    ///
    /// Backends recorded as `None` are skipped, so a report with no
    /// accelerator results is trivially within tolerance. A NaN NMSE never
    /// compares below `tol` and therefore fails the check.
    pub fn within_tol(&self, tol: f64) -> bool {
        let recorded = [self.metal_nmse, self.cuda_nmse];
        recorded.iter().flatten().all(|&value| value < tol)
    }
}
119
120/// Run `op` on every backend the current build supports and assemble
121/// the comparison report.
122pub fn compare_backends(op: &dyn OpUnderTest, seed: u64) -> NmseReport {
123    let cpu = op.run_cpu(seed);
124    let metal_nmse = run_metal_nmse(op, &cpu, seed);
125    let cuda_nmse = run_cuda_nmse(op, &cpu, seed);
126    NmseReport {
127        op: op.name().to_string(),
128        seed,
129        cpu,
130        metal_nmse,
131        cuda_nmse,
132    }
133}
134
135#[cfg(all(target_os = "macos", feature = "metal"))]
136fn run_metal_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
137    Some(nmse(cpu, &op.run_metal(seed)))
138}
139
140#[cfg(not(all(target_os = "macos", feature = "metal")))]
141fn run_metal_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
142    None
143}
144
145#[cfg(feature = "cuda")]
146fn run_cuda_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
147    Some(nmse(cpu, &op.run_cuda(seed)))
148}
149
150#[cfg(not(feature = "cuda"))]
151fn run_cuda_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
152    None
153}
154
155/// Convenience: deterministic uniform-random `Vec<f32>` in `[lo, hi)`.
156pub fn random_vec(n: usize, lo: f32, hi: f32, seed: u64) -> Vec<f32> {
157    use rand::{Rng, SeedableRng};
158    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
159    (0..n).map(|_| rng.random_range(lo..hi)).collect()
160}
161
/// Unit tests for the harness helpers themselves (`nmse`, `NmseReport`,
/// `random_vec`); per-op backend comparisons live in the op submodules.
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a report with only the NMSE fields varied.
    fn report(metal_nmse: Option<f64>, cuda_nmse: Option<f64>) -> NmseReport {
        NmseReport {
            op: "dummy".to_string(),
            seed: 0,
            cpu: vec![1.0, 2.0],
            metal_nmse,
            cuda_nmse,
        }
    }

    #[test]
    fn nmse_identical_is_zero() {
        let a = vec![1.0, 2.0, 3.0];
        assert!(nmse(&a, &a) < 1e-30);
    }

    #[test]
    fn nmse_scaled_b_proportional() {
        // b = 1.01 * a → relative error 0.01, NMSE ≈ 0.0001
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b: Vec<f32> = a.iter().map(|x| x * 1.01).collect();
        let n = nmse(&a, &b);
        // NMSE = mse(0.01*a, 0) / mse(a, 0) = 0.0001
        assert!((n - 1e-4).abs() < 1e-5);
    }

    #[test]
    fn nmse_zero_reference_falls_back() {
        // a all-zero: NMSE returns raw MSE.
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![0.1, 0.1, 0.1];
        let n = nmse(&a, &b);
        assert!((n - 0.01).abs() < 1e-9);
    }

    #[test]
    fn nmse_empty_is_zero() {
        // Empty inputs short-circuit to 0.0 instead of computing 0/0.
        let empty: Vec<f32> = Vec::new();
        assert_eq!(nmse(&empty, &empty), 0.0);
    }

    #[test]
    #[should_panic(expected = "nmse: length mismatch")]
    fn nmse_length_mismatch_panics() {
        nmse(&[1.0, 2.0], &[1.0]);
    }

    #[test]
    fn within_tol_ignores_missing_backends() {
        // No accelerator compiled in → nothing can violate the tolerance.
        assert!(report(None, None).within_tol(NMSE_FP16_TOL));
        assert!(report(Some(1e-9), None).within_tol(NMSE_FP16_TOL));
        assert!(report(None, Some(1e-9)).within_tol(NMSE_FP16_TOL));
    }

    #[test]
    fn within_tol_requires_every_backend_below_tol() {
        assert!(report(Some(1e-9), Some(1e-8)).within_tol(NMSE_FP16_TOL));
        assert!(!report(Some(1e-3), Some(1e-8)).within_tol(NMSE_FP16_TOL));
        assert!(!report(Some(1e-9), Some(1e-3)).within_tol(NMSE_FP16_TOL));
        // The comparison is strict: exactly `tol` does not pass.
        assert!(!report(Some(NMSE_FP16_TOL), None).within_tol(NMSE_FP16_TOL));
    }

    #[test]
    fn within_tol_rejects_nan() {
        // A NaN NMSE (e.g. from a kernel emitting NaN) must never pass.
        assert!(!report(Some(f64::NAN), None).within_tol(NMSE_FP16_TOL));
        assert!(!report(None, Some(f64::NAN)).within_tol(NMSE_FP16_TOL));
    }

    #[test]
    fn random_vec_determinism() {
        let a = random_vec(100, -1.0, 1.0, 42);
        let b = random_vec(100, -1.0, 1.0, 42);
        assert_eq!(a, b);
    }

    #[test]
    fn random_vec_len_and_range() {
        let v = random_vec(256, -1.0, 1.0, 7);
        assert_eq!(v.len(), 256);
        assert!(v.iter().all(|x| (-1.0..1.0).contains(x)));
        assert!(random_vec(0, -1.0, 1.0, 7).is_empty());
    }
}
ferrum_testkit/op_diff/mod.rs

ferrum_testkit/op_diff/
mod.rs