ferrum_testkit/op_diff/
mod.rs1pub mod activation_bridge;
31pub mod argmax_rows;
32pub mod embedding_lookup;
33pub mod flash_attention;
34pub mod fused_add_rms_norm;
35pub mod gemm;
36pub mod kv_cache_append;
37pub mod marlin_matmul; pub mod paged_varlen_attn; pub mod qk_norm_rope;
40pub mod residual_add;
41pub mod rms_norm;
42pub mod silu_mul;
43pub mod split_qkv;
44pub mod transpose_head_to_token;
45
/// NMSE tolerance named for fp32 comparisons (`1e-7`).
///
/// NOTE(review): presumably intended as the `tol` argument of
/// [`NmseReport::within_tol`] for fp32 kernels — confirm against callers.
pub const NMSE_FP32_TOL: f64 = 1e-7;

/// NMSE tolerance named for fp16 comparisons (`1e-6`), ten times looser than
/// [`NMSE_FP32_TOL`].
pub const NMSE_FP16_TOL: f64 = 1e-6;
50
/// Normalized mean squared error of `b` measured against the reference `a`.
///
/// Computes `mean((a - b)^2) / mean(a^2)`, with every element widened to
/// `f64` before any arithmetic.
///
/// Special cases:
/// * two empty slices yield `0.0`;
/// * when the reference energy `mean(a^2)` is below `1e-30` the division is
///   skipped and the plain (un-normalized) MSE is returned, so an all-zero
///   reference does not produce a division by (almost) zero.
///
/// # Panics
///
/// Panics with `"nmse: length mismatch"` if `a` and `b` differ in length.
pub fn nmse(a: &[f32], b: &[f32]) -> f64 {
    assert_eq!(a.len(), b.len(), "nmse: length mismatch");
    if a.is_empty() {
        return 0.0;
    }
    let n = a.len() as f64;

    // Mean squared difference between reference and candidate.
    let mse_ab: f64 = a
        .iter()
        .zip(b.iter())
        .map(|(x, y)| {
            let d = f64::from(*x) - f64::from(*y);
            d * d
        })
        .sum::<f64>()
        / n;

    // Mean squared magnitude of the reference (its distance from zero).
    let mse_a0: f64 = a
        .iter()
        .map(|x| {
            let d = f64::from(*x);
            d * d
        })
        .sum::<f64>()
        / n;

    // Reference is effectively zero: report the absolute error instead.
    if mse_a0 < 1e-30 {
        return mse_ab;
    }
    mse_ab / mse_a0
}
87
/// Flat `f32` buffer produced by a single backend run of an op.
pub type Output = Vec<f32>;

/// An operation that can be run on the CPU and, when the matching backend is
/// compiled in, on Metal and/or CUDA.
///
/// Every `run_*` method receives the same `seed`.
/// NOTE(review): presumably implementors derive their inputs from `seed` so
/// that all backends see identical data — confirm against implementors.
pub trait OpUnderTest {
    /// Name of the op; [`compare_backends`] copies it into the report.
    fn name(&self) -> &str;

    /// Runs the op on the CPU. [`compare_backends`] uses this output as the
    /// reference the other backends are measured against.
    fn run_cpu(&self, seed: u64) -> Output;

    /// Runs the op on Metal. Only part of the trait on macOS builds with the
    /// `metal` feature enabled.
    #[cfg(all(target_os = "macos", feature = "metal"))]
    fn run_metal(&self, seed: u64) -> Output;

    /// Runs the op on CUDA. Only part of the trait when the `cuda` feature is
    /// enabled.
    #[cfg(feature = "cuda")]
    fn run_cuda(&self, seed: u64) -> Output;
}
108
/// Result of comparing the available backends against the CPU output for one
/// op and one seed.
#[derive(Debug)]
pub struct NmseReport {
    /// Name of the op (filled from `OpUnderTest::name` by `compare_backends`).
    pub op: String,
    /// Seed that was passed to every backend run.
    pub seed: u64,
    /// CPU output that served as the reference.
    pub cpu: Vec<f32>,
    /// NMSE of the Metal output against `cpu`; `None` when the Metal backend
    /// is not compiled in.
    pub metal_nmse: Option<f64>,
    /// NMSE of the CUDA output against `cpu`; `None` when the CUDA backend is
    /// not compiled in.
    pub cuda_nmse: Option<f64>,
}

impl NmseReport {
    /// Returns `true` when every backend that reported an NMSE is strictly
    /// below `tol`.
    ///
    /// Backends whose entry is `None` are ignored, so a report with no
    /// backend results passes. A NaN NMSE never compares below `tol` and
    /// therefore fails.
    pub fn within_tol(&self, tol: f64) -> bool {
        [self.metal_nmse, self.cuda_nmse]
            .iter()
            .flatten()
            .all(|&n| n < tol)
    }
}
128
129pub fn compare_backends(op: &dyn OpUnderTest, seed: u64) -> NmseReport {
132 let cpu = op.run_cpu(seed);
133 let metal_nmse = run_metal_nmse(op, &cpu, seed);
134 let cuda_nmse = run_cuda_nmse(op, &cpu, seed);
135 NmseReport {
136 op: op.name().to_string(),
137 seed,
138 cpu,
139 metal_nmse,
140 cuda_nmse,
141 }
142}
143
144#[cfg(all(target_os = "macos", feature = "metal"))]
145fn run_metal_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
146 Some(nmse(cpu, &op.run_metal(seed)))
147}
148
149#[cfg(not(all(target_os = "macos", feature = "metal")))]
150fn run_metal_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
151 None
152}
153
154#[cfg(feature = "cuda")]
155fn run_cuda_nmse(op: &dyn OpUnderTest, cpu: &[f32], seed: u64) -> Option<f64> {
156 Some(nmse(cpu, &op.run_cuda(seed)))
157}
158
159#[cfg(not(feature = "cuda"))]
160fn run_cuda_nmse(_op: &dyn OpUnderTest, _cpu: &[f32], _seed: u64) -> Option<f64> {
161 None
162}
163
164pub fn random_vec(n: usize, lo: f32, hi: f32, seed: u64) -> Vec<f32> {
166 use rand::{Rng, SeedableRng};
167 let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
168 (0..n).map(|_| rng.random_range(lo..hi)).collect()
169}
170
#[cfg(test)]
mod tests {
    use super::*;

    /// Comparing a buffer with itself yields (numerically) zero error.
    #[test]
    fn nmse_identical_is_zero() {
        let a = vec![1.0, 2.0, 3.0];
        assert!(nmse(&a, &a) < 1e-30);
    }

    /// Scaling the reference by 1.01 gives a relative error of 0.01 per
    /// element, so the NMSE is about 0.01^2 = 1e-4.
    #[test]
    fn nmse_scaled_b_proportional() {
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b: Vec<f32> = a.iter().map(|x| x * 1.01).collect();
        let n = nmse(&a, &b);
        assert!((n - 1e-4).abs() < 1e-5);
    }

    /// With an all-zero reference the result is the plain MSE:
    /// 0.1^2 = 0.01.
    #[test]
    fn nmse_zero_reference_falls_back() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![0.1, 0.1, 0.1];
        let n = nmse(&a, &b);
        assert!((n - 0.01).abs() < 1e-9);
    }

    /// The same seed and range reproduce the same vector.
    #[test]
    fn random_vec_determinism() {
        let a = random_vec(100, -1.0, 1.0, 42);
        let b = random_vec(100, -1.0, 1.0, 42);
        assert_eq!(a, b);
    }
}