dsfb_gpu_debug_core/fixed.rs
1//! Q16.16 signed fixed-point arithmetic.
2//!
3//! Why fixed-point: the CPU reference path and the CUDA kernels must produce
4//! byte-identical outputs cell-for-cell so the hash chain in the case file is
5//! the same regardless of whether evidence was produced on the host or the
6//! device. Floating point makes byte-equivalence fragile across compilers,
7//! drivers, fused-multiply-add behavior, and reduction order. Q16.16 with
8//! explicit rounding sidesteps all of that.
9//!
10//! Why Q16.16 specifically: 16.16 strikes a balance between dynamic range
11//! (i32 covers roughly ±32_767 in the integer half) and precision (1/65_536
12//! in the fractional half) that fits this workload — latencies in
13//! milliseconds, error-rate fractions, and smoothed-residual magnitudes all
14//! sit comfortably inside that band once inputs are clamped at the boundary.
15//!
16//! Why these specific operations: `sat_add`, `sat_sub`, `sat_mul`, `sat_div`,
17//! and `abs` are the only operations the downstream pipeline needs. No FMA,
18//! no SIMD intrinsics. The same scalar code must run on the CPU and inside a
19//! CUDA kernel, so the implementation is restricted to plain integer math
20//! that both backends can express identically.
21//!
22//! Rounding rule: multiplication widens to i64, then applies round-half-to-
23//! even (banker's rounding) at bit 15 before shifting right by 16. This rule
24//! is symmetric, deterministic across architectures, and reduces the bias
25//! that round-half-up would inject into long EWMA recurrences.
26
27#![allow(clippy::module_name_repetitions)]
28
29/// Signed Q16.16 fixed-point value.
30///
31/// Layout: a single `i32` where the high 16 bits are the integer part and
32/// the low 16 bits are the fractional part. `#[repr(transparent)]` so the
33/// type can cross the FFI boundary as a plain `int32_t` without conversion.
34#[repr(transparent)]
35#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Default, Debug)]
36pub struct Q16(pub i32);
37
38impl Q16 {
39 /// Q16.16 representation of zero. Used as an EWMA seed and as the
40 /// "no evidence" axis value before any detector has fired.
41 pub const ZERO: Q16 = Q16(0);
42
43 /// Q16.16 representation of one. The fractional half is zero, the
44 /// integer half is one. `1 << 16 == 65_536`.
45 pub const ONE: Q16 = Q16(1 << 16);
46
47 /// The smallest representable value. Saturating arithmetic clamps here
48 /// rather than wrapping.
49 pub const MIN: Q16 = Q16(i32::MIN);
50
51 /// The largest representable value. Saturating arithmetic clamps here
52 /// rather than wrapping.
53 pub const MAX: Q16 = Q16(i32::MAX);
54
55 /// Construct a Q16.16 value from an integer. Sign-extended into the
56 /// high 16 bits. Caller is responsible for keeping `|x| ≤ 32_767`.
57 #[must_use]
58 pub const fn from_int(x: i16) -> Q16 {
59 Q16((x as i32) << 16)
60 }
61
62 /// Construct a Q16.16 value from its raw `i32` bit pattern. Used by the
63 /// contract loader (`ewma_alpha_q16_raw`) and by tests that need to pin
64 /// a specific bit value.
65 #[must_use]
66 pub const fn from_raw(raw: i32) -> Q16 {
67 Q16(raw)
68 }
69
70 /// Return the raw `i32` representation. The hash-chain serializer
71 /// writes raw words as zero-padded big-endian hex so the canonical bytes
72 /// are stable regardless of host endianness.
73 #[must_use]
74 pub const fn raw(self) -> i32 {
75 self.0
76 }
77
78 /// Saturating addition. Both CPU and CUDA paths must use the same rule
79 /// because per-cell results would otherwise disagree at the boundaries.
80 #[must_use]
81 pub const fn sat_add(self, b: Q16) -> Q16 {
82 Q16(self.0.saturating_add(b.0))
83 }
84
85 /// Saturating subtraction. Mirrors `sat_add`.
86 #[must_use]
87 pub const fn sat_sub(self, b: Q16) -> Q16 {
88 Q16(self.0.saturating_sub(b.0))
89 }
90
91 /// Saturating multiplication with round-half-to-even.
92 ///
93 /// The product is widened to i64 to avoid overflow before the rounding
94 /// shift. Banker's rounding is applied at bit 15: ties round toward the
95 /// even result. After rounding the value is shifted right by 16 (the
96 /// fractional width) and saturated back to i32.
97 ///
98 /// This function is the load-bearing primitive for CPU↔GPU bit equality.
99 /// Any divergence in its definition between the two backends would break
100 /// stage-by-stage hash equivalence. The CUDA implementation in
101 /// `cuda/common.cuh` is intentionally a line-for-line transliteration.
102 #[must_use]
103 pub const fn sat_mul(self, b: Q16) -> Q16 {
104 let prod: i64 = (self.0 as i64) * (b.0 as i64);
105 let rounded: i64 = round_half_even_i64_shift(prod, 16);
106 Q16(saturate_i64_to_i32(rounded))
107 }
108
109 /// Saturating division. Returns `Q16::ZERO` when the divisor is zero.
110 /// The pipeline never divides on the hot path; this exists for the
111 /// occasional baseline computation and is included for completeness.
112 #[must_use]
113 pub const fn sat_div(self, b: Q16) -> Q16 {
114 if b.0 == 0 {
115 return Q16::ZERO;
116 }
117 // Shift numerator left by the fractional width first, then divide.
118 // Performed in i64 to keep the pre-divide shift safe.
119 let num: i64 = (self.0 as i64) << 16;
120 Q16(saturate_i64_to_i32(num / (b.0 as i64)))
121 }
122
123 /// Absolute value, saturating. `Q16::MIN.abs()` returns `Q16::MAX`
124 /// rather than panicking or wrapping.
125 #[must_use]
126 pub const fn abs(self) -> Q16 {
127 Q16(self.0.saturating_abs())
128 }
129
130 /// Test for the additive identity. Used by axis gates that need to know
131 /// "no evidence" without comparing two `Q16` values.
132 #[must_use]
133 pub const fn is_zero(self) -> bool {
134 self.0 == 0
135 }
136
137 /// Linear interpolation: `self + alpha * (other - self)`, computed in
138 /// saturating Q16.16. Convenience wrapper used by the EWMA recurrence.
139 /// `alpha` is expected to live in `[0, ONE]`; values outside that band
140 /// still produce a deterministic result but the EWMA interpretation
141 /// stops being meaningful.
142 #[must_use]
143 pub const fn lerp(self, other: Q16, alpha: Q16) -> Q16 {
144 let diff = other.sat_sub(self);
145 let step = alpha.sat_mul(diff);
146 self.sat_add(step)
147 }
148}
149
150/// Saturate an `i64` to `i32`. Standalone so both the CPU and CUDA paths use
151/// the same clamp rule.
152#[must_use]
153const fn saturate_i64_to_i32(x: i64) -> i32 {
154 if x > i32::MAX as i64 {
155 i32::MAX
156 } else if x < i32::MIN as i64 {
157 i32::MIN
158 } else {
159 x as i32
160 }
161}
162
163/// Right shift with round-half-to-even at the cut.
164///
165/// `bits` is expected to be in `1..=32`. For `bits == 0` the input is
166/// returned unchanged. The rule: take the bits about to be discarded; if
167/// the top discarded bit is 0 the result rounds down (truncation); if the
168/// top discarded bit is 1 and any of the remaining discarded bits is 1 the
169/// result rounds away from zero (round-up for positive, round-down for
170/// negative magnitudes); if the top discarded bit is 1 and all remaining
171/// discarded bits are 0 (exact midpoint), the result is rounded toward the
172/// even neighbor.
173#[must_use]
174const fn round_half_even_i64_shift(value: i64, bits: u32) -> i64 {
175 if bits == 0 {
176 return value;
177 }
178 let truncated: i64 = value >> bits;
179 let halfway_bit: i64 = 1i64 << (bits - 1);
180 let discarded: i64 = value & ((1i64 << bits) - 1);
181 let remainder_below_half: i64 = discarded & (halfway_bit - 1);
182 let at_half: bool = discarded == halfway_bit && remainder_below_half == 0;
183 if at_half {
184 // Exact midpoint: round toward even.
185 if truncated & 1 == 0 {
186 truncated
187 } else {
188 truncated.saturating_add(1)
189 }
190 } else if discarded > halfway_bit {
191 truncated.saturating_add(1)
192 } else {
193 truncated
194 }
195}
196
197#[cfg(test)]
198mod tests {
199 use super::*;
200
201 #[test]
202 fn constants_have_expected_raw_bits() {
203 assert_eq!(Q16::ZERO.raw(), 0);
204 assert_eq!(Q16::ONE.raw(), 1 << 16);
205 assert_eq!(Q16::MIN.raw(), i32::MIN);
206 assert_eq!(Q16::MAX.raw(), i32::MAX);
207 }
208
209 #[test]
210 fn from_int_round_trips_through_integer_part() {
211 for x in [-32_767_i16, -1, 0, 1, 32_767] {
212 let q = Q16::from_int(x);
213 assert_eq!(q.raw() >> 16, i32::from(x));
214 assert_eq!(q.raw() & 0xFFFF, 0);
215 }
216 }
217
218 #[test]
219 fn sat_add_clamps_at_max() {
220 assert_eq!(Q16::MAX.sat_add(Q16::ONE), Q16::MAX);
221 assert_eq!(Q16::MIN.sat_add(Q16::from_int(-1)), Q16::MIN);
222 assert_eq!(Q16::from_int(2).sat_add(Q16::from_int(3)), Q16::from_int(5));
223 }
224
225 #[test]
226 fn sat_sub_clamps_at_min() {
227 assert_eq!(Q16::MIN.sat_sub(Q16::ONE), Q16::MIN);
228 assert_eq!(Q16::MAX.sat_sub(Q16::from_int(-1)), Q16::MAX);
229 assert_eq!(Q16::from_int(5).sat_sub(Q16::from_int(3)), Q16::from_int(2));
230 }
231
232 #[test]
233 fn sat_mul_is_one_identity() {
234 for x in [-100, -1, 0, 1, 100, 32_767] {
235 let q = Q16::from_int(x);
236 assert_eq!(q.sat_mul(Q16::ONE), q);
237 assert_eq!(Q16::ONE.sat_mul(q), q);
238 }
239 }
240
241 #[test]
242 fn sat_mul_half_times_half_is_quarter() {
243 // 0.5 raw == 0x8000 == 32_768. 0.5 * 0.5 = 0.25 raw == 0x4000.
244 let half = Q16::from_raw(0x8000);
245 let quarter = Q16::from_raw(0x4000);
246 assert_eq!(half.sat_mul(half), quarter);
247 }
248
249 #[test]
250 fn sat_mul_handles_negative_signs() {
251 let a = Q16::from_int(-3);
252 let b = Q16::from_int(4);
253 assert_eq!(a.sat_mul(b), Q16::from_int(-12));
254 assert_eq!(b.sat_mul(a), Q16::from_int(-12));
255 assert_eq!(a.sat_mul(a), Q16::from_int(9));
256 }
257
258 #[test]
259 fn sat_mul_saturates_on_overflow() {
260 // 30_000 * 30_000 = 9e8 — far beyond i32::MAX / 65_536 ≈ 32_768.
261 let big = Q16::from_int(30_000);
262 assert_eq!(big.sat_mul(big), Q16::MAX);
263 assert_eq!(big.sat_mul(Q16::from_int(-30_000)), Q16::MIN);
264 }
265
266 #[test]
267 fn sat_div_one_is_identity() {
268 for x in [-100, -1, 0, 1, 100] {
269 let q = Q16::from_int(x);
270 assert_eq!(q.sat_div(Q16::ONE), q);
271 }
272 }
273
274 #[test]
275 fn sat_div_by_zero_returns_zero() {
276 assert_eq!(Q16::from_int(5).sat_div(Q16::ZERO), Q16::ZERO);
277 assert_eq!(Q16::from_int(-5).sat_div(Q16::ZERO), Q16::ZERO);
278 }
279
280 #[test]
281 fn abs_saturates_at_min() {
282 assert_eq!(Q16::MIN.abs(), Q16::MAX);
283 assert_eq!(Q16::from_int(-7).abs(), Q16::from_int(7));
284 assert_eq!(Q16::from_int(7).abs(), Q16::from_int(7));
285 }
286
287 #[test]
288 fn lerp_at_zero_returns_self() {
289 let a = Q16::from_int(10);
290 let b = Q16::from_int(20);
291 assert_eq!(a.lerp(b, Q16::ZERO), a);
292 }
293
294 #[test]
295 fn lerp_at_one_returns_other() {
296 let a = Q16::from_int(10);
297 let b = Q16::from_int(20);
298 assert_eq!(a.lerp(b, Q16::ONE), b);
299 }
300
301 #[test]
302 fn lerp_at_half_returns_midpoint() {
303 let a = Q16::from_int(10);
304 let b = Q16::from_int(20);
305 let half = Q16::from_raw(0x8000);
306 assert_eq!(a.lerp(b, half), Q16::from_int(15));
307 }
308
309 #[test]
310 fn ewma_recurrence_with_locked_alpha_is_stable() {
311 // alpha = 0.125 (0x2000 raw). Constant input x. The EWMA should
312 // monotonically approach x without ever overshooting it.
313 //
314 // With banker's rounding and alpha = 1/8, convergence stalls when
315 // the per-step delta drops to 4 raw Q16 units (4 * 8192 = 32_768,
316 // which is exactly halfway and rounds to the even neighbor 0). So
317 // the asymptotic gap is 4 raw units rather than 0 — a known
318 // property of the discrete recurrence we are intentionally pinning.
319 let alpha = Q16::from_raw(0x2000);
320 let x = Q16::from_int(100);
321 let mut ewma = Q16::ZERO;
322 let mut last = Q16::MIN;
323 for _ in 0..200 {
324 ewma = ewma.lerp(x, alpha);
325 // Each step is closer to x than the previous one (monotone non-decreasing).
326 assert!(ewma.raw() >= last.raw());
327 assert!(ewma.raw() <= x.raw());
328 last = ewma;
329 }
330 // After convergence, the EWMA sits within the rounding floor of x.
331 let final_gap = x.sat_sub(ewma).raw();
332 assert!(
333 final_gap <= 4,
334 "expected gap ≤ 4 raw units, got {final_gap}"
335 );
336 }
337
338 #[test]
339 fn round_half_even_breaks_ties_to_even() {
340 // 0.5 rounds to 0 (even); 1.5 rounds to 2 (even); 2.5 rounds to 2; 3.5 rounds to 4.
341 // Test by constructing values whose discarded bits are exactly the halfway pattern.
342 let cases: [(i64, i64); 4] = [
343 (0b0_1000, 0), // 0.5 ↦ 0
344 (0b1_1000, 2), // 1.5 ↦ 2
345 (0b10_1000, 2), // 2.5 ↦ 2
346 (0b11_1000, 4), // 3.5 ↦ 4
347 ];
348 for (input, expected) in cases {
349 assert_eq!(
350 round_half_even_i64_shift(input, 4),
351 expected,
352 "input={input:#b}"
353 );
354 }
355 }
356
357 #[test]
358 fn round_half_even_handles_negative_midpoints() {
359 // -0.5 ↦ 0 (even); -1.5 ↦ -2; -2.5 ↦ -2; -3.5 ↦ -4.
360 let cases: [(i64, i64); 4] = [
361 (-(0b0_1000_i64), 0),
362 (-(0b1_1000_i64), -2),
363 (-(0b10_1000_i64), -2),
364 (-(0b11_1000_i64), -4),
365 ];
366 for (input, expected) in cases {
367 assert_eq!(
368 round_half_even_i64_shift(input, 4),
369 expected,
370 "input={input}"
371 );
372 }
373 }
374}