gam_model_kernels/
cubic_cell_kernel.rs

1use gam_math::probability::normal_cdf;
2use gam_runtime::resource::{ByteLruCache, ResidentBytes};
3use smallvec::{SmallVec, smallvec};
4use std::hash::{Hash, Hasher};
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7
/// Typed errors raised by the de-nested cubic transport kernel.
///
/// Every variant carries a free-form `reason` string describing the specific
/// failure; the variant itself only identifies the failure category.
///
/// Sibling families (`bernoulli_marginal_slope`, `survival_marginal_slope`,
/// `marginal_slope_shared`) currently consume the kernel's public surface via
/// `Result<_, String>`. To stay source-compatible, the kernel converts errors
/// to `String` at the boundary via `From<CubicCellKernelError> for String` and
/// keeps the public function signatures returning `Result<_, String>`.
/// `Display` is exact-byte-equivalent to the previous `format!(...)` strings.
#[derive(Clone, Debug)]
pub enum CubicCellKernelError {
    /// Interval probe / cell-bounds preconditions (ordered bounds, supported
    /// infinity patterns, positive finite width).
    InvalidInterval { reason: String },
    /// Cell-shape / branch-classification failure: tail cells not affine,
    /// finite cells with non-positive width, non-finite affine coefficients,
    /// non-affine cell with infinite bounds, leading-coefficient degeneracy
    /// in the moment recurrence, etc.
    InvalidCellShape { reason: String },
    /// Reduced moment vector (or polynomial-convolution scratch) is shorter
    /// than the polynomial degree the leaf needs to evaluate.
    InsufficientMoments { reason: String },
    /// Bivariate-normal CDF domain validation (non-finite/non-infinite
    /// argument, non-finite correlation).
    BivariateNormalDomain { reason: String },
}
33
// Invokes `impl_reason_error_boilerplate!` with the error type and all four of
// its variants.
// NOTE(review): the macro is defined outside this file; it presumably supplies
// the `Display` and `From<CubicCellKernelError> for String` impls that the enum
// documentation refers to — confirm at the macro definition. Any variant added
// to the enum most likely needs to be listed here as well.
impl_reason_error_boilerplate! {
    CubicCellKernelError {
        InvalidInterval,
        InvalidCellShape,
        InsufficientMoments,
        BivariateNormalDomain,
    }
}
42
43impl CubicCellKernelError {
44    #[inline]
45    fn invalid_interval(reason: impl Into<String>) -> Self {
46        CubicCellKernelError::InvalidInterval {
47            reason: reason.into(),
48        }
49    }
50    #[inline]
51    fn invalid_cell_shape(reason: impl Into<String>) -> Self {
52        CubicCellKernelError::InvalidCellShape {
53            reason: reason.into(),
54        }
55    }
56    #[inline]
57    fn insufficient_moments(reason: impl Into<String>) -> Self {
58        CubicCellKernelError::InsufficientMoments {
59            reason: reason.into(),
60        }
61    }
62    #[inline]
63    fn bivariate_normal_domain(reason: impl Into<String>) -> Self {
64        CubicCellKernelError::BivariateNormalDomain {
65            reason: reason.into(),
66        }
67    }
68}
69
70// De-nested cubic transport kernel.
71//
72// This module implements the de-nested flexible-link/score-warp model
73//
74//   eta(z) = a + b*z + b*delta_h(z) + delta_w(a + b*z)
75//
76// where delta_h is the score warp and delta_w is the link deviation.
77// This is not the literal nested composition L(a + b*H(z)); it is an
78// additive-correction model around the affine core a + b*z.
79//
// On each partition cell, both deviations are cubic polynomials, so eta is
// at most cubic in z and q(z) = 0.5*(z^2 + eta^2) is at most sextic (degree 6).
82// The integral of exp(-q(z)) is evaluated by transporting from the affine
83// anchor (c2=c3=0, where q is Gaussian and the integral reduces to BVN)
84// to the target non-affine cell via the polynomial moment recurrence.
85//
86// The partition covers (-∞, +∞) with:
87//   • two semi-infinite affine TAIL cells (outside all deviation support),
88//   • finitely many interior cells (each a sextic microcell).
89// Because tail cells have constant deviations (c2=c3=0), their bounds
90// are parameter-independent, so no Leibniz boundary-motion corrections
91// appear in the derivatives.
92//
93// Shared by bernoulli_marginal_slope and survival_marginal_slope families.
94
/// Cubic polynomial attached to one span, written in the local offset
/// `t = x - left`:
///
/// `p(x) = c0 + c1*t + c2*t^2 + c3*t^3`.
///
/// The methods below never compare `x` against `left`/`right`; they evaluate
/// the polynomial wherever they are asked to.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct LocalSpanCubic {
    /// Left edge of the span and expansion point of the polynomial.
    pub left: f64,
    /// Right edge of the span (not read by the evaluation methods).
    pub right: f64,
    /// Constant coefficient: the value at `x == left`.
    pub c0: f64,
    /// Linear coefficient: the first derivative at `x == left`.
    pub c1: f64,
    /// Quadratic coefficient: half the second derivative at `x == left`.
    pub c2: f64,
    /// Cubic coefficient.
    pub c3: f64,
}

impl LocalSpanCubic {
    /// Value of the cubic at `x`.
    ///
    /// The terms are accumulated strictly left to right in monomial form, so
    /// the floating-point result does not depend on how this is spelled.
    #[inline]
    pub fn evaluate(self, x: f64) -> f64 {
        let Self {
            left,
            c0,
            c1,
            c2,
            c3,
            ..
        } = self;
        let t = x - left;
        let linear = c1 * t;
        let quadratic = c2 * t * t;
        let cubic = c3 * t * t * t;
        c0 + linear + quadratic + cubic
    }

    /// First derivative `c1 + 2*c2*t + 3*c3*t^2` at `x`, with `t = x - left`.
    #[inline]
    pub fn first_derivative(self, x: f64) -> f64 {
        let Self {
            left, c1, c2, c3, ..
        } = self;
        let t = x - left;
        let from_quadratic = 2.0 * c2 * t;
        let from_cubic = 3.0 * c3 * t * t;
        c1 + from_quadratic + from_cubic
    }

    /// Second derivative `2*c2 + 6*c3*t` at `x`, with `t = x - left`.
    #[inline]
    pub fn second_derivative(self, x: f64) -> f64 {
        let Self { left, c2, c3, .. } = self;
        let t = x - left;
        let curvature_at_left = 2.0 * c2;
        let curvature_growth = 6.0 * c3 * t;
        curvature_at_left + curvature_growth
    }
}
124
/// String tag naming this kernel implementation (`"DenestedCubicTransport"`).
///
/// NOTE(review): no use of this constant is visible in this part of the file;
/// it presumably labels the kernel in diagnostics or serialized metadata —
/// confirm against callers before changing the literal.
pub const ANCHORED_DEVIATION_KERNEL: &str = "DenestedCubicTransport";
/// Default normalized non-affine branch tolerance used by [`branch_cell`].
///
/// Keep this cutoff explicit and hill-climbable: the large-scale cycle-0
/// sweep evaluated `{1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3}` against the
/// legacy transport path.  The more aggressive candidates require an
/// end-to-end beta acceptance run before promotion; the default therefore
/// remains the legacy `1e-10` value to preserve bit-for-bit model behavior.
///
/// Within this file, `effective_branch_tol` multiplies this constant by a
/// per-cell anchor scale that is never below `1.0`, so this value is also the
/// smallest tolerance that function can return.
pub const NORMALIZED_CELL_BRANCH_TOL: f64 = 1e-10;
134
/// `1 / (2π)`, computed from `std::f64::consts::TAU` (which is `2π`).
///
/// NOTE(review): presumably the normalising constant of the standard
/// bivariate normal density used by the BVN evaluation — confirm at the use
/// sites, which are outside this part of the file.
const INV_TWO_PI: f64 = 1.0 / std::f64::consts::TAU;
136
/// 384-point Gauss–Legendre nodes, re-exported for the GPU cubic-cell kernel
/// (`src/gpu/cubic_cell/kernel_src.rs`) to embed as `__constant__` device
/// memory. Linux-only because the kernel emitter is Linux-only.
///
/// This is a reference to the private `GL_NODES` table, not a copy, so the
/// CPU and GPU paths read the same values.
#[cfg(target_os = "linux")]
pub const GL_NODES_FOR_GPU_KERNEL: &[f64; 384] = &GL_NODES;
/// Companion weights to [`GL_NODES_FOR_GPU_KERNEL`].
///
/// This is a reference to the private `GL_WEIGHTS` table; entry `i` is the
/// weight for node `i` of [`GL_NODES_FOR_GPU_KERNEL`].
#[cfg(target_os = "linux")]
pub const GL_WEIGHTS_FOR_GPU_KERNEL: &[f64; 384] = &GL_WEIGHTS;
145
146const GL_NODES: [f64; 384] = [
147    -9.999_804_411_726_474e-1,
148    -9.998_969_471_378_596e-1,
149    -9.997_467_408_113_523e-1,
150    -9.995_297_988_558_859e-1,
151    -9.992_461_316_671_845e-1,
152    -9.988_957_572_063_257e-1,
153    -9.984_786_985_384_589e-1,
154    -9.979_949_833_727_938e-1,
155    -9.974_446_439_389_107e-1,
156    -9.968_277_169_440_913e-1,
157    -9.961_442_435_551_087e-1,
158    -9.953_942_693_885_953e-1,
159    -9.945_778_445_047_068e-1,
160    -9.936_950_234_020_883e-1,
161    -9.927_458_650_133_153e-1,
162    -9.917_304_327_004_32e-1,
163    -9.906_487_942_504_061e-1,
164    -9.895_010_218_704_087e-1,
165    -9.882_871_921_828_699e-1,
166    -9.870_073_862_202_815e-1,
167    -9.856_616_894_197_333e-1,
168    -9.842_501_916_171_713e-1,
169    -9.827_729_870_413_743e-1,
170    -9.812_301_743_076_443e-1,
171    -9.796_218_564_112_101e-1,
172    -9.779_481_407_203_411e-1,
173    -9.762_091_389_691_724e-1,
174    -9.744_049_672_502_397e-1,
175    -9.725_357_460_067_257e-1,
176    -9.706_016_000_244_151e-1,
177    -9.686_026_584_233_628e-1,
178    -9.665_390_546_492_71e-1,
179    -9.644_109_264_645_802e-1,
180    -9.622_184_159_392_698e-1,
181    -9.599_616_694_413_742e-1,
182    -9.576_408_376_272_095e-1,
183    -9.552_560_754_313_16e-1,
184    -9.528_075_420_561_144e-1,
185    -9.502_954_009_612_771e-1,
186    -9.477_198_198_528_157e-1,
187    -9.450_809_706_718_851e-1,
188    -9.423_790_295_833_044e-1,
189    -9.396_141_769_637_963e-1,
190    -9.367_865_973_899_459e-1,
191    -9.338_964_796_258_775e-1,
192    -9.309_440_166_106_54e-1,
193    -9.279_294_054_453_956e-1,
194    -9.248_528_473_801_222e-1,
195    -9.217_145_478_003_181e-1,
196    -9.185_147_162_132_208e-1,
197    -9.152_535_662_338_34e-1,
198    -9.119_313_155_706_682e-1,
199    -9.085_481_860_112_055e-1,
200    -9.051_044_034_070_944e-1,
201    -9.016_001_976_590_722e-1,
202    -8.980_358_027_016_164e-1,
203    -8.944_114_564_873_288e-1,
204    -8.907_274_009_710_492e-1,
205    -8.869_838_820_937_034e-1,
206    -8.831_811_497_658_847e-1,
207    -8.793_194_578_511_7e-1,
208    -8.753_990_641_491_725e-1,
209    -8.714_202_303_783_312e-1,
210    -8.673_832_221_584_393e-1,
211    -8.632_883_089_929_12e-1,
212    -8.591_357_642_507_945e-1,
213    -8.549_258_651_485_127e-1,
214    -8.506_588_927_313_666e-1,
215    -8.463_351_318_547_683e-1,
216    -8.419_548_711_652_254e-1,
217    -8.375_184_030_810_715e-1,
218    -8.330_260_237_729_452e-1,
219    -8.284_780_331_440_178e-1,
220    -8.238_747_348_099_726e-1,
221    -8.192_164_360_787_36e-1,
222    -8.145_034_479_299_62e-1,
223    -8.097_360_849_942_72e-1,
224    -8.049_146_655_322_506e-1,
225    -8.000_395_114_131_988e-1,
226    -7.951_109_480_936_471e-1,
227    -7.901_293_045_956_28e-1,
228    -7.850_949_134_847_117e-1,
229    -7.800_081_108_478_04e-1,
230    -7.748_692_362_707_1e-1,
231    -7.696_786_328_154_644e-1,
232    -7.644_366_469_974_285e-1,
233    -7.591_436_287_621_58e-1,
234    -7.537_999_314_620_412e-1,
235    -7.484_059_118_327_094e-1,
236    -7.429_619_299_692_227e-1,
237    -7.374_683_493_020_299e-1,
238    -7.319_255_365_727_068e-1,
239    -7.263_338_618_094_733e-1,
240    -7.206_936_983_024_912e-1,
241    -7.150_054_225_789_432e-1,
242    -7.092_694_143_778_975e-1,
243    -7.034_860_566_249_567e-1,
244    -6.976_557_354_066_943e-1,
245    -6.917_788_399_448_808e-1,
246    -6.858_557_625_704_99e-1,
247    -6.798_868_986_975_534e-1,
248    -6.738_726_467_966_731e-1,
249    -6.678_134_083_685_102e-1,
250    -6.617_095_879_169_366e-1,
251    -6.555_615_929_220_4e-1,
252    -6.493_698_338_129_212e-1,
253    -6.431_347_239_402_948e-1,
254    -6.368_566_795_488_945e-1,
255    -6.305_361_197_496_849e-1,
256    -6.241_734_664_918_837e-1,
257    -6.177_691_445_347_913e-1,
258    -6.113_235_814_194_364e-1,
259    -6.048_372_074_400_329e-1,
260    -5.983_104_556_152_549e-1,
261    -5.917_437_616_593_286e-1,
262    -5.851_375_639_529_456e-1,
263    -5.784_923_035_139_965e-1,
264    -5.718_084_239_681_3e-1,
265    -5.650_863_715_191_369e-1,
266    -5.583_265_949_191_623e-1,
267    -5.515_295_454_387_482e-1,
268    -5.446_956_768_367_068e-1,
269    -5.378_254_453_298_289e-1,
270    -5.309_193_095_624_275e-1,
271    -5.239_777_305_757_194e-1,
272    -5.170_011_717_770_473e-1,
273    -5.099_900_989_089_429e-1,
274    -5.029_449_800_180_356e-1,
275    -4.958_662_854_238_058_4e-1,
276    -4.887_544_876_871_878e-1,
277    -4.816_100_615_790_221e-1,
278    -4.744_334_840_483_605_5e-1,
279    -4.672_252_341_906_264e-1,
280    -4.599_857_932_156_304e-1,
281    -4.527_156_444_154_463_7e-1,
282    -4.454_152_731_321_473_5e-1,
283    -4.380_851_667_254_05e-1,
284    -4.307_258_145_399_544_5e-1,
285    -4.233_377_078_729_265e-1,
286    -4.159_213_399_410_494e-1,
287    -4.084_772_058_477_228e-1,
288    -4.010_058_025_499_653e-1,
289    -3.935_076_288_252_386e-1,
290    -3.859_831_852_381_500_6e-1,
291    -3.784_329_741_070_358_6e-1,
292    -3.708_574_994_704_271e-1,
293    -3.632_572_670_534_011e-1,
294    -3.556_327_842_338_202e-1,
295    -3.479_845_600_084_600_6e-1,
296    -3.403_131_049_590_297e-1,
297    -3.326_189_312_180_866e-1,
298    -3.249_025_524_348_469_5e-1,
299    -3.171_644_837_408_958_4e-1,
300    -3.094_052_417_157_978e-1,
301    -3.016_253_443_526_109e-1,
302    -2.938_253_110_233_064_5e-1,
303    -2.860_056_624_440_967_5e-1,
304    -2.781_669_206_406_729e-1,
305    -2.703_096_089_133_553e-1,
306    -2.624_342_518_021_592_4e-1,
307    -2.545_413_750_517_773e-1,
308    -2.466_315_055_764_817_5e-1,
309    -2.387_051_714_249_486_3e-1,
310    -2.307_629_017_450_062e-1,
311    -2.228_052_267_483_099_4e-1,
312    -2.148_326_776_749_466_5e-1,
313    -2.068_457_867_579_697_5e-1,
314    -1.988_450_871_878_683_4e-1,
315    -1.908_311_130_769_724_5e-1,
316    -1.828_043_994_237_965_6e-1,
317    -1.747_654_820_773_241_2e-1,
318    -1.667_148_977_012_352_4e-1,
319    -1.586_531_837_380_799_3e-1,
320    -1.505_808_783_733_995e-1,
321    -1.424_985_204_997_981_4e-1,
322    -1.344_066_496_809_674_7e-1,
323    -1.263_058_061_156_663e-1,
324    -1.181_965_306_016_578_4e-1,
325    -1.100_793_644_996_070_4e-1,
326    -1.019_548_496_969_403_7e-1,
327    -9.382_352_857_167_028e-2,
328    -8.568_594_395_618_719e-2,
329    -7.754_263_910_102_077e-2,
330    -6.939_415_763_857_37e-2,
331    -6.124_104_354_682_962e-2,
332    -5.308_384_111_303_817_6e-2,
333    -4.492_309_489_737_94e-2,
334    -3.675_934_969_660_982e-2,
335    -2.859_315_050_769_284_7e-2,
336    -2.042_504_249_141_571e-2,
337    -1.225_557_093_599_553_8e-2,
338    -4.085_281_220_676_868e-3,
339    4.085_281_220_676_868e-3,
340    1.225_557_093_599_553_8e-2,
341    2.042_504_249_141_571e-2,
342    2.859_315_050_769_284_7e-2,
343    3.675_934_969_660_982e-2,
344    4.492_309_489_737_94e-2,
345    5.308_384_111_303_817_6e-2,
346    6.124_104_354_682_962e-2,
347    6.939_415_763_857_37e-2,
348    7.754_263_910_102_077e-2,
349    8.568_594_395_618_719e-2,
350    9.382_352_857_167_028e-2,
351    1.019_548_496_969_403_7e-1,
352    1.100_793_644_996_070_4e-1,
353    1.181_965_306_016_578_4e-1,
354    1.263_058_061_156_663e-1,
355    1.344_066_496_809_674_7e-1,
356    1.424_985_204_997_981_4e-1,
357    1.505_808_783_733_995e-1,
358    1.586_531_837_380_799_3e-1,
359    1.667_148_977_012_352_4e-1,
360    1.747_654_820_773_241_2e-1,
361    1.828_043_994_237_965_6e-1,
362    1.908_311_130_769_724_5e-1,
363    1.988_450_871_878_683_4e-1,
364    2.068_457_867_579_697_5e-1,
365    2.148_326_776_749_466_5e-1,
366    2.228_052_267_483_099_4e-1,
367    2.307_629_017_450_062e-1,
368    2.387_051_714_249_486_3e-1,
369    2.466_315_055_764_817_5e-1,
370    2.545_413_750_517_773e-1,
371    2.624_342_518_021_592_4e-1,
372    2.703_096_089_133_553e-1,
373    2.781_669_206_406_729e-1,
374    2.860_056_624_440_967_5e-1,
375    2.938_253_110_233_064_5e-1,
376    3.016_253_443_526_109e-1,
377    3.094_052_417_157_978e-1,
378    3.171_644_837_408_958_4e-1,
379    3.249_025_524_348_469_5e-1,
380    3.326_189_312_180_866e-1,
381    3.403_131_049_590_297e-1,
382    3.479_845_600_084_600_6e-1,
383    3.556_327_842_338_202e-1,
384    3.632_572_670_534_011e-1,
385    3.708_574_994_704_271e-1,
386    3.784_329_741_070_358_6e-1,
387    3.859_831_852_381_500_6e-1,
388    3.935_076_288_252_386e-1,
389    4.010_058_025_499_653e-1,
390    4.084_772_058_477_228e-1,
391    4.159_213_399_410_494e-1,
392    4.233_377_078_729_265e-1,
393    4.307_258_145_399_544_5e-1,
394    4.380_851_667_254_05e-1,
395    4.454_152_731_321_473_5e-1,
396    4.527_156_444_154_463_7e-1,
397    4.599_857_932_156_304e-1,
398    4.672_252_341_906_264e-1,
399    4.744_334_840_483_605_5e-1,
400    4.816_100_615_790_221e-1,
401    4.887_544_876_871_878e-1,
402    4.958_662_854_238_058_4e-1,
403    5.029_449_800_180_356e-1,
404    5.099_900_989_089_429e-1,
405    5.170_011_717_770_473e-1,
406    5.239_777_305_757_194e-1,
407    5.309_193_095_624_275e-1,
408    5.378_254_453_298_289e-1,
409    5.446_956_768_367_068e-1,
410    5.515_295_454_387_482e-1,
411    5.583_265_949_191_623e-1,
412    5.650_863_715_191_369e-1,
413    5.718_084_239_681_3e-1,
414    5.784_923_035_139_965e-1,
415    5.851_375_639_529_456e-1,
416    5.917_437_616_593_286e-1,
417    5.983_104_556_152_549e-1,
418    6.048_372_074_400_329e-1,
419    6.113_235_814_194_364e-1,
420    6.177_691_445_347_913e-1,
421    6.241_734_664_918_837e-1,
422    6.305_361_197_496_849e-1,
423    6.368_566_795_488_945e-1,
424    6.431_347_239_402_948e-1,
425    6.493_698_338_129_212e-1,
426    6.555_615_929_220_4e-1,
427    6.617_095_879_169_366e-1,
428    6.678_134_083_685_102e-1,
429    6.738_726_467_966_731e-1,
430    6.798_868_986_975_534e-1,
431    6.858_557_625_704_99e-1,
432    6.917_788_399_448_808e-1,
433    6.976_557_354_066_943e-1,
434    7.034_860_566_249_567e-1,
435    7.092_694_143_778_975e-1,
436    7.150_054_225_789_432e-1,
437    7.206_936_983_024_912e-1,
438    7.263_338_618_094_733e-1,
439    7.319_255_365_727_068e-1,
440    7.374_683_493_020_299e-1,
441    7.429_619_299_692_227e-1,
442    7.484_059_118_327_094e-1,
443    7.537_999_314_620_412e-1,
444    7.591_436_287_621_58e-1,
445    7.644_366_469_974_285e-1,
446    7.696_786_328_154_644e-1,
447    7.748_692_362_707_1e-1,
448    7.800_081_108_478_04e-1,
449    7.850_949_134_847_117e-1,
450    7.901_293_045_956_28e-1,
451    7.951_109_480_936_471e-1,
452    8.000_395_114_131_988e-1,
453    8.049_146_655_322_506e-1,
454    8.097_360_849_942_72e-1,
455    8.145_034_479_299_62e-1,
456    8.192_164_360_787_36e-1,
457    8.238_747_348_099_726e-1,
458    8.284_780_331_440_178e-1,
459    8.330_260_237_729_452e-1,
460    8.375_184_030_810_715e-1,
461    8.419_548_711_652_254e-1,
462    8.463_351_318_547_683e-1,
463    8.506_588_927_313_666e-1,
464    8.549_258_651_485_127e-1,
465    8.591_357_642_507_945e-1,
466    8.632_883_089_929_12e-1,
467    8.673_832_221_584_393e-1,
468    8.714_202_303_783_312e-1,
469    8.753_990_641_491_725e-1,
470    8.793_194_578_511_7e-1,
471    8.831_811_497_658_847e-1,
472    8.869_838_820_937_034e-1,
473    8.907_274_009_710_492e-1,
474    8.944_114_564_873_288e-1,
475    8.980_358_027_016_164e-1,
476    9.016_001_976_590_722e-1,
477    9.051_044_034_070_944e-1,
478    9.085_481_860_112_055e-1,
479    9.119_313_155_706_682e-1,
480    9.152_535_662_338_34e-1,
481    9.185_147_162_132_208e-1,
482    9.217_145_478_003_181e-1,
483    9.248_528_473_801_222e-1,
484    9.279_294_054_453_956e-1,
485    9.309_440_166_106_54e-1,
486    9.338_964_796_258_775e-1,
487    9.367_865_973_899_459e-1,
488    9.396_141_769_637_963e-1,
489    9.423_790_295_833_044e-1,
490    9.450_809_706_718_851e-1,
491    9.477_198_198_528_157e-1,
492    9.502_954_009_612_771e-1,
493    9.528_075_420_561_144e-1,
494    9.552_560_754_313_16e-1,
495    9.576_408_376_272_095e-1,
496    9.599_616_694_413_742e-1,
497    9.622_184_159_392_698e-1,
498    9.644_109_264_645_802e-1,
499    9.665_390_546_492_71e-1,
500    9.686_026_584_233_628e-1,
501    9.706_016_000_244_151e-1,
502    9.725_357_460_067_257e-1,
503    9.744_049_672_502_397e-1,
504    9.762_091_389_691_724e-1,
505    9.779_481_407_203_411e-1,
506    9.796_218_564_112_101e-1,
507    9.812_301_743_076_443e-1,
508    9.827_729_870_413_743e-1,
509    9.842_501_916_171_713e-1,
510    9.856_616_894_197_333e-1,
511    9.870_073_862_202_815e-1,
512    9.882_871_921_828_699e-1,
513    9.895_010_218_704_087e-1,
514    9.906_487_942_504_061e-1,
515    9.917_304_327_004_32e-1,
516    9.927_458_650_133_153e-1,
517    9.936_950_234_020_883e-1,
518    9.945_778_445_047_068e-1,
519    9.953_942_693_885_953e-1,
520    9.961_442_435_551_087e-1,
521    9.968_277_169_440_913e-1,
522    9.974_446_439_389_107e-1,
523    9.979_949_833_727_938e-1,
524    9.984_786_985_384_589e-1,
525    9.988_957_572_063_257e-1,
526    9.992_461_316_671_845e-1,
527    9.995_297_988_558_859e-1,
528    9.997_467_408_113_523e-1,
529    9.998_969_471_378_596e-1,
530    9.999_804_411_726_474e-1,
531];
532const GL_WEIGHTS: [f64; 384] = [
533    5.019_410_348_676_869_6e-5,
534    1.168_390_665_730_266_3e-4,
535    1.835_749_193_551_655_8e-4,
536    2.503_070_890_844_105e-4,
537    3.170_242_698_112_815e-4,
538    3.837_208_020_912_921_4e-4,
539    4.503_919_137_716_827e-4,
540    5.170_330_453_491_649e-4,
541    5.836_397_042_630_135e-4,
542    6.502_074_240_969_948e-4,
543    7.167_317_509_947_801e-4,
544    7.832_082_385_905_168e-4,
545    8.496_324_460_039_209e-4,
546    9.159_999_370_632_641e-4,
547    9.823_062_800_663_463e-4,
548    1.048_547_047_793_689_5e-3,
549    1.114_717_817_647_310_6e-3,
550    1.180_814_171_855_922e-3,
551    1.246_831_697_715_441_5e-3,
552    1.312_765_987_850_66e-3,
553    1.378_612_640_487_646_8e-3,
554    1.444_367_259_734_736e-3,
555    1.510_025_455_865_810_3e-3,
556    1.575_582_845_607_936_8e-3,
557    1.641_035_052_429_271_5e-3,
558    1.706_377_706_828_447_1e-3,
559    1.771_606_446_623_834_7e-3,
560    1.836_716_917_243_567_5e-3,
561    1.901_704_772_014_899_2e-3,
562    1.966_565_672_453_437e-3,
563    2.031_295_288_552_398_4e-3,
564    2.095_889_299_071_020_6e-3,
565    2.160_343_391_822_734_3e-3,
566    2.224_653_263_962_713e-3,
567    2.288_814_622_274_955e-3,
568    2.352_823_183_458_769e-3,
569    2.416_674_674_414_340_5e-3,
570    2.480_364_832_528_265_6e-3,
571    2.543_889_405_957_74e-3,
572    2.607_244_153_914_452e-3,
573    2.670_424_846_947_554e-3,
574    2.733_427_267_226_093_3e-3,
575    2.796_247_208_820_428e-3,
576    2.858_880_477_983_06e-3,
577    2.921_322_893_428_515_3e-3,
578    2.983_570_286_612_554_5e-3,
579    3.045_618_502_010_327_8e-3,
580    3.107_463_397_393_755_5e-3,
581    3.169_100_844_108_32e-3,
582    3.230_526_727_348_174e-3,
583    3.291_736_946_431_361e-3,
584    3.352_727_415_073_250_3e-3,
585    3.413_494_061_659_418_4e-3,
586    3.474_032_829_517_317e-3,
587    3.534_339_677_187_348_4e-3,
588    3.594_410_578_692_452e-3,
589    3.654_241_523_806_987e-3,
590    3.713_828_518_324_312_5e-3,
591    3.773_167_584_323_583_5e-3,
592    3.832_254_760_435_171e-3,
593    3.891_086_102_105_193_4e-3,
594    3.949_657_681_858_895e-3,
595    4.007_965_589_562_678e-3,
596    4.066_005_932_685_269e-3,
597    4.123_774_836_557_6e-3,
598    4.181_268_444_631_281e-3,
599    4.238_482_918_736_289e-3,
600    4.295_414_439_336_925e-3,
601    4.352_059_205_787_275e-3,
602    4.408_413_436_584_285e-3,
603    4.464_473_369_620_78e-3,
604    4.520_235_262_436_235e-3,
605    4.575_695_392_466_791e-3,
606    4.630_850_057_293_894e-3,
607    4.685_695_574_891_041e-3,
608    4.740_228_283_870_022e-3,
609    4.794_444_543_725_102e-3,
610    4.848_340_735_076_109e-3,
611    4.901_913_259_910_197e-3,
612    4.955_158_541_821_682_4e-3,
613    5.008_073_026_251_332e-3,
614    5.060_653_180_723_101_4e-3,
615    5.112_895_495_080_397e-3,
616    5.164_796_481_720_011e-3,
617    5.216_352_675_825_451e-3,
618    5.267_560_635_597_735e-3,
619    5.318_416_942_485_385e-3,
620    5.368_918_201_412_827e-3,
621    5.419_061_041_006_627e-3,
622    5.468_842_113_820_941e-3,
623    5.518_258_096_560_71e-3,
624    5.567_305_690_303_767e-3,
625    5.615_981_620_720_803e-3,
626    5.664_282_638_294_182e-3,
627    5.712_205_518_534_655e-3,
628    5.759_747_062_196_925_5e-3,
629    5.806_904_095_492_818e-3,
630    5.853_673_470_303_617_4e-3,
631    5.900_052_064_389_824e-3,
632    5.946_036_781_599_814e-3,
633    5.991_624_552_076_468e-3,
634    6.036_812_332_462_087e-3,
635    6.081_597_106_101_673e-3,
636    6.125_975_883_244_196e-3,
637    6.169_945_701_242_237e-3,
638    6.213_503_624_749_591e-3,
639    6.256_646_745_917_723e-3,
640    6.299_372_184_589_237e-3,
641    6.341_677_088_490_664e-3,
642    6.383_558_633_422_572e-3,
643    6.425_014_023_448_273e-3,
644    6.466_040_491_080_434e-3,
645    6.506_635_297_465_724e-3,
646    6.546_795_732_567_842_5e-3,
647    6.586_519_115_348_261e-3,
648    6.625_802_793_945_317e-3,
649    6.664_644_145_851_14e-3,
650    6.703_040_578_086_941e-3,
651    6.740_989_527_375_895e-3,
652    6.778_488_460_314_126e-3,
653    6.815_534_873_540_5e-3,
654    6.852_126_293_902_878e-3,
655    6.888_260_278_623_754e-3,
656    6.923_934_415_463_31e-3,
657    6.959_146_322_880_146_5e-3,
658    6.993_893_650_190_702e-3,
659    7.028_174_077_725_734e-3,
660    7.061_985_316_985_506e-3,
661    7.095_325_110_792_439e-3,
662    7.128_191_233_441_844e-3,
663    7.160_581_490_850_321e-3,
664    7.192_493_720_702_486e-3,
665    7.223_925_792_595_309e-3,
666    7.254_875_608_179_984e-3,
667    7.285_341_101_302_512e-3,
668    7.315_320_238_141_324_5e-3,
669    7.344_811_017_343_063e-3,
670    7.373_811_470_156_258e-3,
671    7.402_319_660_562_818e-3,
672    7.430_333_685_407_178e-3,
673    7.457_851_674_523_319e-3,
674    7.484_871_790_859_79e-3,
675    7.511_392_230_602_079e-3,
676    7.537_411_223_293_362e-3,
677    7.562_927_031_952_382e-3,
678    7.587_937_953_189_561_5e-3,
679    7.612_442_317_320_796e-3,
680    7.636_438_488_478_739e-3,
681    7.659_924_864_722_064e-3,
682    7.682_899_878_142_539e-3,
683    7.705_361_994_969_524e-3,
684    7.727_309_715_672_44e-3,
685    7.748_741_575_060_914e-3,
686    7.769_656_142_382_462e-3,
687    7.790_052_021_418_226e-3,
688    7.809_927_850_575_903e-3,
689    7.829_282_302_980_82e-3,
690    7.848_114_086_564_56e-3,
691    7.866_421_944_151_094e-3,
692    7.884_204_653_540_665e-3,
693    7.901_461_027_591_6e-3,
694    7.918_189_914_299_318e-3,
695    7.934_390_196_873_448e-3,
696    7.950_060_793_812_204e-3,
697    7.965_200_658_974_709e-3,
698    7.979_808_781_650_77e-3,
699    7.993_884_186_628_266e-3,
700    8.007_425_934_258_548e-3,
701    8.020_433_120_518_866e-3,
702    8.032_904_877_072_8e-3,
703    8.044_840_371_328_26e-3,
704    8.056_238_806_493_175e-3,
705    8.067_099_421_628_42e-3,
706    8.077_421_491_698_82e-3,
707    8.087_204_327_621_594e-3,
708    8.096_447_276_312_202e-3,
709    8.105_149_720_727_933e-3,
710    8.113_311_079_909_208e-3,
711    8.120_930_809_018_415e-3,
712    8.128_008_399_376_085e-3,
713    8.134_543_378_495_033e-3,
714    8.140_535_310_111_77e-3,
715    8.145_983_794_215_77e-3,
716    8.150_888_467_075_875e-3,
717    8.155_249_001_265_092e-3,
718    8.159_065_105_681_899e-3,
719    8.162_336_525_570_1e-3,
720    8.165_063_042_535_465e-3,
721    8.167_244_474_560_707e-3,
722    8.168_880_676_017_344e-3,
723    8.169_971_537_675_47e-3,
724    8.170_516_986_711_104e-3,
725    8.170_516_986_711_104e-3,
726    8.169_971_537_675_47e-3,
727    8.168_880_676_017_344e-3,
728    8.167_244_474_560_707e-3,
729    8.165_063_042_535_465e-3,
730    8.162_336_525_570_1e-3,
731    8.159_065_105_681_899e-3,
732    8.155_249_001_265_092e-3,
733    8.150_888_467_075_875e-3,
734    8.145_983_794_215_77e-3,
735    8.140_535_310_111_77e-3,
736    8.134_543_378_495_033e-3,
737    8.128_008_399_376_085e-3,
738    8.120_930_809_018_415e-3,
739    8.113_311_079_909_208e-3,
740    8.105_149_720_727_933e-3,
741    8.096_447_276_312_202e-3,
742    8.087_204_327_621_594e-3,
743    8.077_421_491_698_82e-3,
744    8.067_099_421_628_42e-3,
745    8.056_238_806_493_175e-3,
746    8.044_840_371_328_26e-3,
747    8.032_904_877_072_8e-3,
748    8.020_433_120_518_866e-3,
749    8.007_425_934_258_548e-3,
750    7.993_884_186_628_266e-3,
751    7.979_808_781_650_77e-3,
752    7.965_200_658_974_709e-3,
753    7.950_060_793_812_204e-3,
754    7.934_390_196_873_448e-3,
755    7.918_189_914_299_318e-3,
756    7.901_461_027_591_6e-3,
757    7.884_204_653_540_665e-3,
758    7.866_421_944_151_094e-3,
759    7.848_114_086_564_56e-3,
760    7.829_282_302_980_82e-3,
761    7.809_927_850_575_903e-3,
762    7.790_052_021_418_226e-3,
763    7.769_656_142_382_462e-3,
764    7.748_741_575_060_914e-3,
765    7.727_309_715_672_44e-3,
766    7.705_361_994_969_524e-3,
767    7.682_899_878_142_539e-3,
768    7.659_924_864_722_064e-3,
769    7.636_438_488_478_739e-3,
770    7.612_442_317_320_796e-3,
771    7.587_937_953_189_561_5e-3,
772    7.562_927_031_952_382e-3,
773    7.537_411_223_293_362e-3,
774    7.511_392_230_602_079e-3,
775    7.484_871_790_859_79e-3,
776    7.457_851_674_523_319e-3,
777    7.430_333_685_407_178e-3,
778    7.402_319_660_562_818e-3,
779    7.373_811_470_156_258e-3,
780    7.344_811_017_343_063e-3,
781    7.315_320_238_141_324_5e-3,
782    7.285_341_101_302_512e-3,
783    7.254_875_608_179_984e-3,
784    7.223_925_792_595_309e-3,
785    7.192_493_720_702_486e-3,
786    7.160_581_490_850_321e-3,
787    7.128_191_233_441_844e-3,
788    7.095_325_110_792_439e-3,
789    7.061_985_316_985_506e-3,
790    7.028_174_077_725_734e-3,
791    6.993_893_650_190_702e-3,
792    6.959_146_322_880_146_5e-3,
793    6.923_934_415_463_31e-3,
794    6.888_260_278_623_754e-3,
795    6.852_126_293_902_878e-3,
796    6.815_534_873_540_5e-3,
797    6.778_488_460_314_126e-3,
798    6.740_989_527_375_895e-3,
799    6.703_040_578_086_941e-3,
800    6.664_644_145_851_14e-3,
801    6.625_802_793_945_317e-3,
802    6.586_519_115_348_261e-3,
803    6.546_795_732_567_842_5e-3,
804    6.506_635_297_465_724e-3,
805    6.466_040_491_080_434e-3,
806    6.425_014_023_448_273e-3,
807    6.383_558_633_422_572e-3,
808    6.341_677_088_490_664e-3,
809    6.299_372_184_589_237e-3,
810    6.256_646_745_917_723e-3,
811    6.213_503_624_749_591e-3,
812    6.169_945_701_242_237e-3,
813    6.125_975_883_244_196e-3,
814    6.081_597_106_101_673e-3,
815    6.036_812_332_462_087e-3,
816    5.991_624_552_076_468e-3,
817    5.946_036_781_599_814e-3,
818    5.900_052_064_389_824e-3,
819    5.853_673_470_303_617_4e-3,
820    5.806_904_095_492_818e-3,
821    5.759_747_062_196_925_5e-3,
822    5.712_205_518_534_655e-3,
823    5.664_282_638_294_182e-3,
824    5.615_981_620_720_803e-3,
825    5.567_305_690_303_767e-3,
826    5.518_258_096_560_71e-3,
827    5.468_842_113_820_941e-3,
828    5.419_061_041_006_627e-3,
829    5.368_918_201_412_827e-3,
830    5.318_416_942_485_385e-3,
831    5.267_560_635_597_735e-3,
832    5.216_352_675_825_451e-3,
833    5.164_796_481_720_011e-3,
834    5.112_895_495_080_397e-3,
835    5.060_653_180_723_101_4e-3,
836    5.008_073_026_251_332e-3,
837    4.955_158_541_821_682_4e-3,
838    4.901_913_259_910_197e-3,
839    4.848_340_735_076_109e-3,
840    4.794_444_543_725_102e-3,
841    4.740_228_283_870_022e-3,
842    4.685_695_574_891_041e-3,
843    4.630_850_057_293_894e-3,
844    4.575_695_392_466_791e-3,
845    4.520_235_262_436_235e-3,
846    4.464_473_369_620_78e-3,
847    4.408_413_436_584_285e-3,
848    4.352_059_205_787_275e-3,
849    4.295_414_439_336_925e-3,
850    4.238_482_918_736_289e-3,
851    4.181_268_444_631_281e-3,
852    4.123_774_836_557_6e-3,
853    4.066_005_932_685_269e-3,
854    4.007_965_589_562_678e-3,
855    3.949_657_681_858_895e-3,
856    3.891_086_102_105_193_4e-3,
857    3.832_254_760_435_171e-3,
858    3.773_167_584_323_583_5e-3,
859    3.713_828_518_324_312_5e-3,
860    3.654_241_523_806_987e-3,
861    3.594_410_578_692_452e-3,
862    3.534_339_677_187_348_4e-3,
863    3.474_032_829_517_317e-3,
864    3.413_494_061_659_418_4e-3,
865    3.352_727_415_073_250_3e-3,
866    3.291_736_946_431_361e-3,
867    3.230_526_727_348_174e-3,
868    3.169_100_844_108_32e-3,
869    3.107_463_397_393_755_5e-3,
870    3.045_618_502_010_327_8e-3,
871    2.983_570_286_612_554_5e-3,
872    2.921_322_893_428_515_3e-3,
873    2.858_880_477_983_06e-3,
874    2.796_247_208_820_428e-3,
875    2.733_427_267_226_093_3e-3,
876    2.670_424_846_947_554e-3,
877    2.607_244_153_914_452e-3,
878    2.543_889_405_957_74e-3,
879    2.480_364_832_528_265_6e-3,
880    2.416_674_674_414_340_5e-3,
881    2.352_823_183_458_769e-3,
882    2.288_814_622_274_955e-3,
883    2.224_653_263_962_713e-3,
884    2.160_343_391_822_734_3e-3,
885    2.095_889_299_071_020_6e-3,
886    2.031_295_288_552_398_4e-3,
887    1.966_565_672_453_437e-3,
888    1.901_704_772_014_899_2e-3,
889    1.836_716_917_243_567_5e-3,
890    1.771_606_446_623_834_7e-3,
891    1.706_377_706_828_447_1e-3,
892    1.641_035_052_429_271_5e-3,
893    1.575_582_845_607_936_8e-3,
894    1.510_025_455_865_810_3e-3,
895    1.444_367_259_734_736e-3,
896    1.378_612_640_487_646_8e-3,
897    1.312_765_987_850_66e-3,
898    1.246_831_697_715_441_5e-3,
899    1.180_814_171_855_922e-3,
900    1.114_717_817_647_310_6e-3,
901    1.048_547_047_793_689_5e-3,
902    9.823_062_800_663_463e-4,
903    9.159_999_370_632_641e-4,
904    8.496_324_460_039_209e-4,
905    7.832_082_385_905_168e-4,
906    7.167_317_509_947_801e-4,
907    6.502_074_240_969_948e-4,
908    5.836_397_042_630_135e-4,
909    5.170_330_453_491_649e-4,
910    4.503_919_137_716_827e-4,
911    3.837_208_020_912_921_4e-4,
912    3.170_242_698_112_815e-4,
913    2.503_070_890_844_105e-4,
914    1.835_749_193_551_655_8e-4,
915    1.168_390_665_730_266_3e-4,
916    5.019_410_348_676_869_6e-5,
917];
918
/// Evaluation branch selected for a cell.
///
/// NOTE(review): the classifier (`branch_cell`) is outside this part of the
/// file. The variant names appear to follow the degree of
/// `q(z) = 0.5*(z^2 + eta(z)^2)` for a cubic `eta`; the per-variant notes
/// below are written on that assumption — confirm against `branch_cell`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExactCellBranch {
    /// Presumably `c2` and `c3` both negligible: `eta` affine, `q` quadratic.
    Affine,
    /// Presumably `c3` negligible but `c2` not: `eta` quadratic, `q` quartic.
    Quartic,
    /// Presumably `c3` non-negligible: `eta` cubic, `q` sextic.
    Sextic,
}
925
926/// Auto-tune the per-cell affine/non-affine branch tolerance from the cell's
927/// own coefficient magnitudes.
928///
929/// The legacy `branch_cell` compared the normalized cubic coefficients
930/// `(k2, k3)` against a single global constant.  That constant is calibrated
931/// for cells whose anchor coefficients `(c0, c1)` are O(1).  When the anchor
932/// dominates — e.g. a tail cell with `|c0|, |c1| >> 1` — a relative criterion
933/// against the anchor magnitude is more numerically meaningful than the bare
934/// global threshold, because the affine contribution to `eta` already absorbs
935/// any difference at the chosen scale.
936///
937/// The returned tolerance is always at least [`NORMALIZED_CELL_BRANCH_TOL`],
938/// so cells with O(1) anchors recover bit-identical classification with the
939/// legacy code path.  This preserves numerical equivalence for the
940/// established `cubic_cell_kernel` tests, including the
941/// `tuned_branch_tolerance_matches_legacy_non_affine_transport_grid` grid.
942#[inline]
943fn effective_branch_tol(cell: DenestedCubicCell) -> f64 {
944    let anchor_scale = cell.c0.abs().max(cell.c1.abs()).max(1.0);
945    NORMALIZED_CELL_BRANCH_TOL * anchor_scale
946}
947
/// One cell of the de-nested cubic: the `z` interval `[left, right]` together
/// with the coefficients of `η(z) = c0 + c1·z + c2·z² + c3·z³` on it.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedCubicCell {
    pub left: f64,
    pub right: f64,
    pub c0: f64,
    pub c1: f64,
    pub c2: f64,
    pub c3: f64,
}

impl DenestedCubicCell {
    /// Evaluates the cubic `η` at `z`.
    ///
    /// Terms are formed and summed in ascending-degree order; the
    /// multiplication order is part of the numerical contract, so do not
    /// refactor into Horner form without re-validating bitwise parity.
    #[inline]
    pub fn eta(self, z: f64) -> f64 {
        let Self { c0, c1, c2, c3, .. } = self;
        let linear = c1 * z;
        let quadratic = c2 * z * z;
        let cubic = c3 * z * z * z;
        c0 + linear + quadratic + cubic
    }

    /// Evaluates `q(z) = ½·(z² + η(z)²)`.
    #[inline]
    pub fn q(self, z: f64) -> f64 {
        let eta_at_z = self.eta(z);
        let sum_of_squares = z * z + eta_at_z * eta_at_z;
        0.5 * sum_of_squares
    }
}
970
/// Hashable identity of a de-nested cubic cell, as built by
/// [`cell_moment_fingerprint`].
///
/// Equality and hashing are derived over both fields, so two fingerprints
/// compare equal only when every per-coordinate word matches, not merely
/// when the digests collide.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentFingerprint {
    /// 64-bit digest of `bins`.
    pub hash: u64,
    /// Per-coordinate words in the order `(left, right, c0, c1, c2, c3)`.
    bins: [u64; 6],
}
976
/// Cache key pairing a cell fingerprint with the requested moment degree;
/// built by [`cell_moment_cache_key`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentCacheKey {
    /// Geometric fingerprint of the cell.
    pub fingerprint: CellMomentFingerprint,
    /// Highest moment degree the cached entry is requested for.
    pub max_degree: usize,
}
982
/// Plain counters describing cell-moment de-duplication lookups.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct CellMomentDedupStats {
    pub lookups: u64,
    pub hits: u64,
    pub misses: u64,
}

impl CellMomentDedupStats {
    /// Fraction `hits / lookups`, or `0.0` when no lookup has been recorded.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.lookups {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1000
/// Local shorthand for [`gam_linalg::utils::splitmix64_hash`]; used by
/// `mix_fingerprint_words` to scramble each word before it is folded in.
#[inline]
fn splitmix64(x: u64) -> u64 {
    gam_linalg::utils::splitmix64_hash(x)
}
1005
1006#[inline]
1007fn mix_fingerprint_words(words: &[u64]) -> u64 {
1008    let mut h = 0xcbf2_9ce4_8422_2325u64;
1009    for &word in words {
1010        h ^= splitmix64(word);
1011        h = h.wrapping_mul(0x100_0000_01b3);
1012    }
1013    h
1014}
1015
/// Maps one cell coordinate to the 64-bit word used in its fingerprint.
///
/// * **Exact mode** — when `epsilon` is zero, negative or non-finite, or when
///   `x` itself is non-finite, the raw IEEE-754 bit pattern of `x` is
///   returned unchanged (so `-0.0` and `+0.0` stay distinct).
/// * **Quantized mode** — otherwise `x` is binned to the nearest multiple of
///   `epsilon` and the bits of the bin index `round(x / epsilon)` are
///   returned.
///
/// Two details keep quantized keys well behaved:
///
/// * `round` preserves the sign of zero, so a tiny negative `x` would yield
///   `-0.0` while a tiny positive `x` yields `+0.0`.  Both are bin 0; the
///   index is canonicalised to `+0.0` so they share one word.
/// * If `x / epsilon` overflows to ±∞ the bin index is meaningless (every
///   overflowing value, and every genuinely infinite coordinate, would share
///   it), so the function falls back to the exact bit pattern of `x`.
#[inline]
fn quantized_cell_word(x: f64, epsilon: f64) -> u64 {
    let quantizing = epsilon.is_finite() && epsilon > 0.0 && x.is_finite();
    if !quantizing {
        return x.to_bits();
    }
    let bin = (x / epsilon).round();
    if !bin.is_finite() {
        // Quotient overflowed: do not merge with real infinities or with
        // other overflowing coordinates.
        return x.to_bits();
    }
    // `-0.0 + 0.0 == +0.0` under round-to-nearest; every other value is
    // unchanged by the addition.
    (bin + 0.0).to_bits()
}
1023
1024/// Returns a deterministic geometric fingerprint for a de-nested cubic cell.
1025///
1026/// With `epsilon == 0.0`, each coordinate is represented by its exact IEEE-754
1027/// bit pattern, so equal fingerprints imply bit-equal `(left, right, c0, c1,
1028/// c2, c3)` tuples.  With `epsilon > 0`, finite coordinates are binned to the
1029/// nearest multiple of `epsilon`; callers should treat this as an approximate
1030/// cache key and validate the resulting model error for their data.
1031pub fn cell_moment_fingerprint(cell: DenestedCubicCell, epsilon: f64) -> CellMomentFingerprint {
1032    let bins = [
1033        quantized_cell_word(cell.left, epsilon),
1034        quantized_cell_word(cell.right, epsilon),
1035        quantized_cell_word(cell.c0, epsilon),
1036        quantized_cell_word(cell.c1, epsilon),
1037        quantized_cell_word(cell.c2, epsilon),
1038        quantized_cell_word(cell.c3, epsilon),
1039    ];
1040    CellMomentFingerprint {
1041        hash: mix_fingerprint_words(&bins),
1042        bins,
1043    }
1044}
1045
/// Builds the cache key for `cell` at `max_degree`.
///
/// The fingerprint is computed with [`cell_moment_fingerprint`] using the
/// given `epsilon`; `max_degree` is stored verbatim, so requests for
/// different degrees never share a key.
#[inline]
pub fn cell_moment_cache_key(
    cell: DenestedCubicCell,
    max_degree: usize,
    epsilon: f64,
) -> CellMomentCacheKey {
    CellMomentCacheKey {
        fingerprint: cell_moment_fingerprint(cell, epsilon),
        max_degree,
    }
}
1057
/// A de-nested cubic cell together with the spans and edge provenance it was
/// built from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedPartitionCell {
    /// The cell's interval and cubic coefficients.
    pub cell: DenestedCubicCell,
    /// Local cubic span on the score side.
    /// NOTE(review): exact role inferred from the field name — confirm.
    pub score_span: LocalSpanCubic,
    /// Local cubic span on the link side.
    /// NOTE(review): exact role inferred from the field name — confirm.
    pub link_span: LocalSpanCubic,
    /// Provenance of the cell's boundaries: a fixed z location (score break
    /// or ±∞ tail) or a link-knot crossing `z = (τ - a)/b`. Together with
    /// `(score_span, link_span)` this identifies the cell's two-parameter
    /// family in `(a, b)` across rows (see
    /// [`crate::cell_moment_family`]).
    pub left_edge: PartitionEdge,
    /// Provenance of the right boundary; same conventions as `left_edge`.
    pub right_edge: PartitionEdge,
}

// NOTE(review): inherent impl is currently empty.
impl DenestedPartitionCell {}
1073
/// Where one boundary of a denested partition cell comes from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PartitionEdge {
    /// A z location that does not depend on the row scalars: either a
    /// score-spline break or ±∞ for a tail cell.
    Fixed(f64),
    /// A link-knot crossing; for row scalars `(a, b)` the boundary lies at
    /// `z = (τ - a)/b`.
    Crossing { tau: f64 },
}

impl PartitionEdge {
    /// Resolves the boundary's z location for the row scalars `(a, b)`.
    ///
    /// `Fixed` edges ignore `a` and `b`.  `Crossing` edges perform a plain
    /// floating-point division, so `b == 0` yields ±∞ or NaN.
    #[inline]
    pub fn z_at(self, a: f64, b: f64) -> f64 {
        match self {
            PartitionEdge::Crossing { tau } => {
                let shifted = tau - a;
                shifted / b
            }
            PartitionEdge::Fixed(location) => location,
        }
    }
}
1095
/// Exact-bit key for one affine tail cell in [`TailCellMomentCache`]; built
/// by `tail_cell_cache_key`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct TailCellMomentCacheKey {
    /// IEEE-754 bits of the cell's `c0`.
    c0_bits: u64,
    /// IEEE-754 bits of the cell's `c1`.
    c1_bits: u64,
    /// IEEE-754 bits of the cell's single finite bound.
    endpoint_bits: u64,
    /// `-1` when the left bound is not finite (the endpoint is `right`),
    /// `1` when the right bound is not finite (the endpoint is `left`).
    side: i8,
    /// Moment degree the entry was requested for.
    max_degree: usize,
}

/// Byte budget (64 MiB) handed to the tail-cell LRU on construction.
const TAIL_CELL_MOMENT_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Entry-count budget handed to the tail-cell LRU on construction.
const TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES: usize = 262_144;
1107
/// Point-in-time usage counters of a [`TailCellMomentCache`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct TailCellMomentCacheStats {
    pub hits: usize,
    pub misses: usize,
    pub entries: usize,
}

impl TailCellMomentCacheStats {
    /// Total counted requests: hits plus misses.
    #[inline]
    pub fn requests(self) -> usize {
        let Self { hits, misses, .. } = self;
        hits + misses
    }

    /// Fraction of counted requests that were hits; `0.0` when there were
    /// no requests.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.requests() {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1131
/// Affine-tail cell-moment memo.
///
/// Stand-alone instances (`TailCellMomentCache::new()`) are useful when a
/// caller needs deterministic hit/miss bookkeeping that is not polluted by
/// concurrent traffic on the global memo. The production path uses the
/// global instance behind [`evaluate_cell_moments`].
///
/// All methods take `&self`: the LRU is internally synchronized (sharded for
/// the concurrent global memo) and the counters are atomics, so the global
/// instance needs no outer `Mutex`. The previous `OnceLock<Mutex<…>>` wrapper
/// serialized every tail-cell evaluation across all rayon workers of the
/// marginal-slope exact-cache build — the same contention class the sharded
/// per-family cell-moment LRU fix removed.
#[derive(Debug)]
pub struct TailCellMomentCache {
    /// Resident entries, keyed by exact-bit tail-cell key.
    moments: ByteLruCache<TailCellMomentCacheKey, CellMomentState>,
    /// One slot per key whose value is currently being computed. `evaluate`
    /// registers a slot before computing a cold key and removes it after
    /// publishing the result, so same-key callers can wait on the slot
    /// instead of recomputing.
    in_flight: std::sync::Mutex<
        std::collections::HashMap<
            TailCellMomentCacheKey,
            Arc<std::sync::OnceLock<Result<CellMomentState, String>>>,
        >,
    >,
    /// Counter reported as `hits` by `stats`.
    hits: std::sync::atomic::AtomicUsize,
    /// Counter reported as `misses` by `stats`.
    misses: std::sync::atomic::AtomicUsize,
}
1157
1158impl Default for TailCellMomentCache {
1159    fn default() -> Self {
1160        // Tail-cell entries are small (a short moment vector), so sharding
1161        // the byte/entry budgets is harmless; size the shard count off the
1162        // worker pool exactly like the per-family cell-moment LRU.
1163        let shard_count = std::thread::available_parallelism()
1164            .map(|workers| workers.get().saturating_mul(8))
1165            .unwrap_or(32)
1166            .clamp(8, 256);
1167        Self {
1168            moments: ByteLruCache::with_max_entries_sharded(
1169                TAIL_CELL_MOMENT_CACHE_MAX_BYTES,
1170                TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES,
1171                shard_count,
1172            ),
1173            in_flight: std::sync::Mutex::new(std::collections::HashMap::new()),
1174            hits: std::sync::atomic::AtomicUsize::new(0),
1175            misses: std::sync::atomic::AtomicUsize::new(0),
1176        }
1177    }
1178}
1179
1180impl TailCellMomentCache {
1181    /// Construct an empty cache. Hits/misses start at zero.
1182    #[inline]
1183    pub fn new() -> Self {
1184        Self::default()
1185    }
1186
1187    /// Reset the cache to its empty state. Existing entries are dropped and
1188    /// the hit/miss counters are zeroed.
1189    #[inline]
1190    pub fn clear(&self) {
1191        self.moments.clear();
1192        self.in_flight
1193            .lock()
1194            .unwrap_or_else(|p| p.into_inner())
1195            .clear();
1196        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
1197        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
1198    }
1199
1200    /// Snapshot of the cache's current usage stats.
1201    #[inline]
1202    pub fn stats(&self) -> TailCellMomentCacheStats {
1203        TailCellMomentCacheStats {
1204            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
1205            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
1206            entries: self.moments.len(),
1207        }
1208    }
1209
1210    /// Look up `cell` at `max_degree`, computing and inserting the result on
1211    /// miss. Cells outside the affine-tail keyset bypass the cache and run
1212    /// the uncached evaluator directly without touching the counters.
1213    ///
1214    /// Stat semantics: every request served from an existing resident entry,
1215    /// or from a concurrently published entry for the same key, increments
1216    /// `hits`; a **miss** is counted only for the caller that actually
1217    /// computes a cold key. The compute happens outside the LRU shard lock,
1218    /// but an in-flight table coalesces same-key cold races so followers reuse
1219    /// the leader's published value instead of duplicating work.
1220    pub fn evaluate(
1221        &self,
1222        cell: DenestedCubicCell,
1223        max_degree: usize,
1224    ) -> Result<CellMomentState, String> {
1225        let Some(key) = tail_cell_cache_key(cell, max_degree) else {
1226            return evaluate_cell_moments_uncached(cell, max_degree);
1227        };
1228        if let Some(state) = self.moments.get(&key) {
1229            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1230            return Ok(state);
1231        }
1232
1233        let (slot, leader) = {
1234            let mut in_flight = self.in_flight.lock().unwrap_or_else(|p| p.into_inner());
1235            if let Some(slot) = in_flight.get(&key) {
1236                (Arc::clone(slot), false)
1237            } else {
1238                let slot = Arc::new(std::sync::OnceLock::new());
1239                in_flight.insert(key, Arc::clone(&slot));
1240                (slot, true)
1241            }
1242        };
1243
1244        if !leader {
1245            let state = slot.wait().clone()?;
1246            self.hits
1247                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1248            return Ok(state);
1249        }
1250
1251        let state = evaluate_cell_moments_uncached(cell, max_degree);
1252        if let Ok(state) = &state {
1253            self.moments.insert(key, state.clone());
1254            self.hits
1255                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1256        }
1257        self.misses
1258            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1259        if let Err(existing_state) = slot.set(state.clone()) {
1260            std::mem::drop(existing_state);
1261        }
1262        self.in_flight
1263            .lock()
1264            .unwrap_or_else(|p| p.into_inner())
1265            .remove(&key);
1266        state
1267    }
1268}
1269
/// Process-wide tail-cell memo, created on first access through
/// `tail_cell_moment_cache`.
static TAIL_CELL_MOMENT_CACHE: std::sync::OnceLock<TailCellMomentCache> =
    std::sync::OnceLock::new();
/// Process-wide on/off flag for the tail-cell memo; starts as `true` and is
/// written by `set_tail_cell_moment_cache_enabled`.
static TAIL_CELL_MOMENT_CACHE_ENABLED: std::sync::atomic::AtomicBool =
    std::sync::atomic::AtomicBool::new(true);

/// Returns the global memo, building it with `TailCellMomentCache::default`
/// the first time it is requested.
fn tail_cell_moment_cache() -> &'static TailCellMomentCache {
    TAIL_CELL_MOMENT_CACHE.get_or_init(TailCellMomentCache::default)
}
1278
1279#[inline]
1280fn tail_cell_cache_key(
1281    cell: DenestedCubicCell,
1282    max_degree: usize,
1283) -> Option<TailCellMomentCacheKey> {
1284    if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL {
1285        return None;
1286    }
1287    match (!cell.left.is_finite(), !cell.right.is_finite()) {
1288        (true, false) if cell.right.is_finite() => Some(TailCellMomentCacheKey {
1289            c0_bits: cell.c0.to_bits(),
1290            c1_bits: cell.c1.to_bits(),
1291            endpoint_bits: cell.right.to_bits(),
1292            side: -1,
1293            max_degree,
1294        }),
1295        (false, true) if cell.left.is_finite() => Some(TailCellMomentCacheKey {
1296            c0_bits: cell.c0.to_bits(),
1297            c1_bits: cell.c1.to_bits(),
1298            endpoint_bits: cell.left.to_bits(),
1299            side: 1,
1300            max_degree,
1301        }),
1302        _ => None,
1303    }
1304}
1305
/// Stores `enabled` into the process-wide `TAIL_CELL_MOMENT_CACHE_ENABLED`
/// flag (relaxed ordering).
///
/// NOTE(review): the flag is not read in this part of the file; presumably
/// the global evaluation path consults it — confirm at the call site.
pub fn set_tail_cell_moment_cache_enabled(enabled: bool) {
    TAIL_CELL_MOMENT_CACHE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed);
}
1309
/// Clears the global tail-cell memo: drops its entries and zeroes its
/// hit/miss counters. Initialises the global instance if it does not exist.
pub fn reset_tail_cell_moment_cache() {
    tail_cell_moment_cache().clear();
}
1313
/// Returns a usage snapshot (hits, misses, resident entries) of the global
/// tail-cell memo. Initialises the global instance if it does not exist.
pub fn tail_cell_moment_cache_stats() -> TailCellMomentCacheStats {
    tail_cell_moment_cache().stats()
}
1317
1318#[derive(Clone, Copy, Debug, Eq)]
1319pub struct CellFingerprint {
1320    c0: u64,
1321    c1: u64,
1322    c2: u64,
1323    c3: u64,
1324    left: u64,
1325    right: u64,
1326}
1327
1328impl CellFingerprint {
1329    #[inline]
1330    pub fn new(cell: DenestedCubicCell) -> Self {
1331        Self {
1332            c0: cell.c0.to_bits(),
1333            c1: cell.c1.to_bits(),
1334            c2: cell.c2.to_bits(),
1335            c3: cell.c3.to_bits(),
1336            left: cell.left.to_bits(),
1337            right: cell.right.to_bits(),
1338        }
1339    }
1340}
1341
1342impl PartialEq for CellFingerprint {
1343    #[inline]
1344    fn eq(&self, other: &Self) -> bool {
1345        self.c0 == other.c0
1346            && self.c1 == other.c1
1347            && self.c2 == other.c2
1348            && self.c3 == other.c3
1349            && self.left == other.left
1350            && self.right == other.right
1351    }
1352}
1353
1354impl Hash for CellFingerprint {
1355    #[inline]
1356    fn hash<H: Hasher>(&self, state: &mut H) {
1357        self.c0.hash(state);
1358        self.c1.hash(state);
1359        self.c2.hash(state);
1360        self.c3.hash(state);
1361        self.left.hash(state);
1362        self.right.hash(state);
1363    }
1364}
1365
/// LRU payload for one cell: optionally the value moments, optionally the
/// derivative moments, each shared behind an `Arc`.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CachedCellMoments {
    /// Regular (value) cell moments, populated by
    /// `evaluate_cell_moments_cached`. None when only derivative moments
    /// have been cached for this cell. Wrapped in `Arc` so `ByteLruCache`
    /// returns lookups through cheap refcount bumps instead of deep-cloning
    /// the inline `SmallVec<[f64; 10]>` (which spills on every degree-`>= 10`
    /// request) on every hot-path LRU hit.
    state: Option<Arc<CellMomentState>>,
    /// Derivative moments, populated by
    /// `evaluate_cell_derivative_moments_cached`. None when only value
    /// moments have been cached for this cell. Both variants share the
    /// same `CellFingerprint` key so derivative-only callers do not evict
    /// pre-cached value entries and vice versa. Same `Arc` wrapping rationale
    /// as `state` above.
    derivative_state: Option<Arc<CellDerivativeMomentState>>,
}
1383
1384impl CachedCellMoments {
1385    #[inline]
1386    pub fn new(state: Arc<CellMomentState>) -> Self {
1387        Self {
1388            state: Some(state),
1389            derivative_state: None,
1390        }
1391    }
1392
1393    #[inline]
1394    pub fn new_derivative(state: Arc<CellDerivativeMomentState>) -> Self {
1395        Self {
1396            state: None,
1397            derivative_state: Some(state),
1398        }
1399    }
1400
1401    #[inline]
1402    pub fn state_for_degree(&self, max_degree: usize) -> Option<CellMomentState> {
1403        let state = self.state.as_ref()?;
1404        if state.moments.len().saturating_sub(1) < max_degree {
1405            return None;
1406        }
1407        // Cached `Arc<CellMomentState>` is shared across LRU hits, so we
1408        // cannot reuse the inner vector in place. Clone the underlying state
1409        // and (rarely) truncate down to the requested degree to honour the
1410        // public moment-length contract.
1411        let mut state = (**state).clone();
1412        state.moments.truncate(max_degree + 1);
1413        Some(state)
1414    }
1415
1416    #[inline]
1417    pub fn derivative_state_for_degree(
1418        &self,
1419        max_degree: usize,
1420    ) -> Option<CellDerivativeMomentState> {
1421        let state = self.derivative_state.as_ref()?;
1422        if state.moments.len().saturating_sub(1) < max_degree {
1423            return None;
1424        }
1425        // See `state_for_degree`: shared `Arc` forces an inner clone here.
1426        let mut state = (**state).clone();
1427        state.moments.truncate(max_degree + 1);
1428        Some(state)
1429    }
1430
1431    #[inline]
1432    pub fn with_value(mut self, state: Arc<CellMomentState>) -> Self {
1433        self.state = Some(state);
1434        self
1435    }
1436
1437    #[inline]
1438    pub fn with_derivative(mut self, state: Arc<CellDerivativeMomentState>) -> Self {
1439        self.derivative_state = Some(state);
1440        self
1441    }
1442}
1443
1444impl ResidentBytes for CachedCellMoments {
1445    fn resident_bytes(&self) -> usize {
1446        let value_bytes = self
1447            .state
1448            .as_ref()
1449            .map_or(0, |state| state.resident_bytes());
1450        let derivative_bytes = self
1451            .derivative_state
1452            .as_ref()
1453            .map_or(0, |state| state.resident_bytes());
1454        std::mem::size_of::<Self>()
1455            .saturating_add(value_bytes)
1456            .saturating_add(derivative_bytes)
1457    }
1458}
1459
/// Shared hit/miss counters for the cell-moment cache.
#[derive(Debug, Default)]
pub struct CellMomentCacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CellMomentCacheStats {
    /// Current `(hits, misses)`, each read with relaxed ordering.
    #[inline]
    pub fn snapshot(&self) -> (u64, u64) {
        let hits = self.hits.load(Ordering::Relaxed);
        let misses = self.misses.load(Ordering::Relaxed);
        (hits, misses)
    }

    /// Change since an earlier `snapshot`: `(new_hits, new_misses, rate)`,
    /// where `rate` is the hit fraction over that window and `0.0` when the
    /// window is empty. Deltas saturate at zero if a counter went backwards.
    #[inline]
    pub fn hit_rate_delta(&self, before: (u64, u64)) -> (u64, u64, f64) {
        let (hits_before, misses_before) = before;
        let (hits_now, misses_now) = self.snapshot();
        let new_hits = hits_now.saturating_sub(hits_before);
        let new_misses = misses_now.saturating_sub(misses_before);
        let rate = match new_hits + new_misses {
            0 => 0.0,
            total => new_hits as f64 / total as f64,
        };
        (new_hits, new_misses, rate)
    }
}
1489
/// Byte-budgeted LRU mapping an exact-bit cell fingerprint to its cached
/// value and/or derivative moments.
pub type CellMomentLruCache = ByteLruCache<CellFingerprint, CachedCellMoments>;

/// Number of moments a [`CellMomentVec`] stores inline before it spills to
/// the heap.
pub const CELL_MOMENT_INLINE_CAPACITY: usize = 10;

/// Moment vector with inline storage for `CELL_MOMENT_INLINE_CAPACITY`
/// entries.
pub type CellMomentVec = SmallVec<[f64; CELL_MOMENT_INLINE_CAPACITY]>;
1495
1496#[derive(Clone, Debug, PartialEq)]
1497pub struct CellMomentState {
1498    pub branch: ExactCellBranch,
1499    pub value: f64,
1500    pub moments: CellMomentVec,
1501}
1502
1503impl ResidentBytes for CellMomentState {
1504    fn resident_bytes(&self) -> usize {
1505        let spilled_bytes = if self.moments.spilled() {
1506            self.moments
1507                .capacity()
1508                .saturating_mul(std::mem::size_of::<f64>())
1509        } else {
1510            0
1511        };
1512        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1513    }
1514}
1515
1516#[derive(Clone, Debug, PartialEq)]
1517pub struct CellDerivativeMomentState {
1518    pub branch: ExactCellBranch,
1519    pub moments: CellMomentVec,
1520}
1521
1522impl ResidentBytes for CellDerivativeMomentState {
1523    fn resident_bytes(&self) -> usize {
1524        let spilled_bytes = if self.moments.spilled() {
1525            self.moments
1526                .capacity()
1527                .saturating_mul(std::mem::size_of::<f64>())
1528        } else {
1529            0
1530        };
1531        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1532    }
1533}
1534
/// Borrowed counterpart of [`CellMomentState`]: same fields, but the moments
/// are a slice borrowed for `'a` instead of an owned vector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CellMomentStateRef<'a> {
    /// Branch the cell was evaluated under.
    pub branch: ExactCellBranch,
    /// Scalar result accompanying the moments.
    pub value: f64,
    /// Borrowed moment values.
    pub moments: &'a [f64],
}
1541
/// Reusable scratch buffer for moment evaluation, so repeated evaluations
/// can share one allocation.
#[derive(Clone, Debug)]
pub struct CellMomentScratch {
    /// Backing storage; `prepare_moments` hands out a zeroed prefix of it.
    moments: Vec<f64>,
}

impl Default for CellMomentScratch {
    /// Empty scratch with room for `MAX_AFFINE_ANCHOR_DEGREE + 1` moments.
    fn default() -> Self {
        // Pre-size to the codebase's max moment degree so steady-state
        // `prepare_moments` calls never reallocate. Calls with `len`
        // exceeding this still reserve lazily.
        Self {
            moments: Vec::with_capacity(MAX_AFFINE_ANCHOR_DEGREE + 1),
        }
    }
}
1557
1558impl CellMomentScratch {
1559    pub fn new() -> Self {
1560        Self::default()
1561    }
1562
1563    pub fn with_capacity(max_degree: usize) -> Self {
1564        Self {
1565            moments: Vec::with_capacity(max_degree + 1),
1566        }
1567    }
1568
1569    #[inline]
1570    fn prepare_moments(&mut self, len: usize) -> &mut [f64] {
1571        if self.moments.capacity() < len {
1572            CELL_MOMENT_REALLOCS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1573            self.moments.reserve(len - self.moments.capacity());
1574        }
1575        // Grow monotonically: shorter requests should not truncate the backing
1576        // storage and then zero the old tail when a later request grows again.
1577        // Only the active prefix is scratch for this evaluation.
1578        if self.moments.len() < len {
1579            self.moments.resize(len, 0.0);
1580        }
1581        let out = &mut self.moments[..len];
1582        out.fill(0.0);
1583        out
1584    }
1585}
1586
1587/// Counter for moment-buffer reallocations in `prepare_moments`. Production
1588/// code increments this on every buffer growth; the test mod inspects it to
1589/// assert the steady-state hot loop allocates exactly once per row buffer.
1590pub(crate) static CELL_MOMENT_REALLOCS: std::sync::atomic::AtomicUsize =
1591    std::sync::atomic::AtomicUsize::new(0);
1592
1593/// Canonical 20-point Gauss–Legendre nodes on [-1, 1] (Abramowitz & Stegun
1594/// 25.4), tabulated to f64 precision. Used here for the Drezner–Wesolowsky
1595/// bivariate normal CDF representation — 20 points give >30-digit accuracy for
1596/// the smooth arcsin-transformed integrand, ensuring the BVN value is exact to
1597/// f64 precision for all (h, k, ρ) — and shared with the cubic-cell B-spline
1598/// moment parity gate in [`crate::gpu_kernels::cubic_bspline_moments`].
1599pub const GL20_NODES: [f64; 20] = [
1600    -0.993_128_599_185_094_9,
1601    -0.963_971_927_277_913_8,
1602    -0.912_234_428_251_326,
1603    -0.839_116_971_822_218_8,
1604    -0.746_331_906_460_150_8,
1605    -0.636_053_680_726_515,
1606    -0.510_867_001_950_827_1,
1607    -0.373_706_088_715_419_6,
1608    -0.227_785_851_141_645_1,
1609    -0.076_526_521_133_497_33,
1610    0.076_526_521_133_497_33,
1611    0.227_785_851_141_645_1,
1612    0.373_706_088_715_419_6,
1613    0.510_867_001_950_827_1,
1614    0.636_053_680_726_515,
1615    0.746_331_906_460_150_8,
1616    0.839_116_971_822_218_8,
1617    0.912_234_428_251_326,
1618    0.963_971_927_277_913_8,
1619    0.993_128_599_185_094_9,
1620];
1621
1622/// Companion weights to [`GL20_NODES`]. Symmetric, summing to 2.
1623pub const GL20_WEIGHTS: [f64; 20] = [
1624    0.017_614_007_139_152_12,
1625    0.040_601_429_800_386_94,
1626    0.062_672_048_334_109_06,
1627    0.083_276_741_576_704_75,
1628    0.101_930_119_817_240_4,
1629    0.118_194_531_961_518_4,
1630    0.131_688_638_449_176_6,
1631    0.142_096_109_318_382_1,
1632    0.149_172_986_472_603_7,
1633    0.152_753_387_130_725_9,
1634    0.152_753_387_130_725_9,
1635    0.149_172_986_472_603_7,
1636    0.142_096_109_318_382_1,
1637    0.131_688_638_449_176_6,
1638    0.118_194_531_961_518_4,
1639    0.101_930_119_817_240_4,
1640    0.083_276_741_576_704_75,
1641    0.062_672_048_334_109_06,
1642    0.040_601_429_800_386_94,
1643    0.017_614_007_139_152_12,
1644];
1645
1646/// Provenance-tagged breakpoint dedup: sorts ascending and merges entries
1647/// coinciding within 1e-12, but when a fixed score break and a link-knot
1648/// crossing coincide (the kink configuration), the surviving entry keeps
1649/// the `Fixed` tag — a deterministic choice; the z location is identical
1650/// either way.
1651fn dedup_sorted_tagged_breakpoints(points: &mut Vec<(f64, PartitionEdge)>) {
1652    points.sort_by(|lhs, rhs| {
1653        lhs.0
1654            .partial_cmp(&rhs.0)
1655            .unwrap_or(std::cmp::Ordering::Equal)
1656    });
1657    points.dedup_by(|lhs, rhs| {
1658        let coincide = if lhs.0 == rhs.0 {
1659            true
1660        } else if lhs.0.is_finite() && rhs.0.is_finite() {
1661            (lhs.0 - rhs.0).abs() <= 1e-12
1662        } else {
1663            false
1664        };
1665        if coincide && matches!(lhs.1, PartitionEdge::Fixed(_)) {
1666            // `dedup_by` keeps `rhs` (the earlier element) — propagate the
1667            // Fixed tag onto the survivor.
1668            rhs.1 = lhs.1;
1669        }
1670        coincide
1671    });
1672}
1673
1674#[inline]
1675pub fn interval_probe_point(left: f64, right: f64) -> Result<f64, String> {
1676    if !(left < right) {
1677        return Err(CubicCellKernelError::invalid_interval(format!(
1678            "interval probe requires ordered bounds, got [{left}, {right}]"
1679        ))
1680        .into());
1681    }
1682    if left.is_finite() && right.is_finite() {
1683        Ok(0.5 * (left + right))
1684    } else if left == f64::NEG_INFINITY && right == f64::INFINITY {
1685        Ok(0.0)
1686    } else if left == f64::NEG_INFINITY && right.is_finite() {
1687        Ok(right - 1.0)
1688    } else if left.is_finite() && right == f64::INFINITY {
1689        Ok(left + 1.0)
1690    } else {
1691        Err(CubicCellKernelError::invalid_interval(format!(
1692            "interval probe requires finite bounds or full infinities, got [{left}, {right}]"
1693        ))
1694        .into())
1695    }
1696}
1697
/// Coefficients of `q'(z)` in ascending powers of `z` for a quadratic
/// `η(z) = c0 + c1·z + c2·z²`, where `q(z) = ½(z² + η(z)²)` and hence
/// `q'(z) = z + η(z)·η'(z)`.
#[inline]
pub fn quartic_qprime_coefficients(c0: f64, c1: f64, c2: f64) -> [f64; 4] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c1 * c2;
    let cubic = 2.0 * c2 * c2;
    [constant, linear, quadratic, cubic]
}
1707
/// Coefficients of `q'(z)` in ascending powers of `z` for a cubic
/// `η(z) = c0 + c1·z + c2·z² + c3·z³`, where `q(z) = ½(z² + η(z)²)` and
/// hence `q'(z) = z + η(z)·η'(z)`.
#[inline]
pub fn sextic_qprime_coefficients(c0: f64, c1: f64, c2: f64, c3: f64) -> [f64; 6] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c0 * c3 + 3.0 * c1 * c2;
    let cubic = 4.0 * c1 * c3 + 2.0 * c2 * c2;
    let quartic = 5.0 * c2 * c3;
    let quintic = 3.0 * c3 * c3;
    [constant, linear, quadratic, cubic, quartic, quintic]
}
1719
1720/// Boundary term `right^n · exp(−q(right)) − left^n · exp(−q(left))` used by
1721/// the moment recurrences. Takes precomputed `left^n` and `right^n` so callers
1722/// can roll the powers across a recurrence — each iteration becomes one
1723/// multiply instead of a fresh `powi(n)`.
1724#[inline]
1725fn moment_boundary_term_with_powers(
1726    cell: DenestedCubicCell,
1727    left_pow_n: f64,
1728    right_pow_n: f64,
1729) -> f64 {
1730    let left_term = if cell.left.is_infinite() {
1731        0.0
1732    } else {
1733        left_pow_n * (-cell.q(cell.left)).exp()
1734    };
1735    let right_term = if cell.right.is_infinite() {
1736        0.0
1737    } else {
1738        right_pow_n * (-cell.q(cell.right)).exp()
1739    };
1740    right_term - left_term
1741}
1742
/// Reports whether every paired entry of `base` and `direct` agrees to
/// within `1e-10 · (1 + max(|base|, |direct|))`.
///
/// Only the common prefix of the two slices is compared, so an empty slice
/// matches anything. Any pair involving NaN counts as a mismatch.
#[inline]
fn base_moments_match_direct(base: &[f64], direct: &[f64]) -> bool {
    for (&lhs, &rhs) in base.iter().zip(direct) {
        let scale = 1.0 + lhs.abs().max(rhs.abs());
        let gap = (lhs - rhs).abs();
        if !(gap <= 1e-10 * scale) {
            return false;
        }
    }
    true
}
1749
1750#[inline]
1751fn direct_non_affine_moments_if_base_matches(
1752    cell: DenestedCubicCell,
1753    base: &[f64],
1754    max_degree: usize,
1755) -> Option<Vec<f64>> {
1756    if !cell.left.is_finite() || !cell.right.is_finite() {
1757        return None;
1758    }
1759    // When the supplied base moments are the actual moments of this fixed
1760    // finite cell, prefer the same quadrature-backed evaluator used by the
1761    // public non-affine moment path.  The algebraic raising recurrence is kept
1762    // below for callers that intentionally pass symbolic or otherwise
1763    // non-cell-consistent bases, but repeatedly dividing by the quartic/sextic
1764    // leading coefficient can amplify harmless base-roundoff into high-order
1765    // moment error.
1766    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
1767    if base_moments_match_direct(base, &moments) {
1768        Some(moments.into_vec())
1769    } else {
1770        None
1771    }
1772}
1773
1774pub fn reduce_quartic_moments(
1775    cell: DenestedCubicCell,
1776    base_m0_m2: [f64; 3],
1777    max_degree: usize,
1778) -> Result<Vec<f64>, String> {
1779    if max_degree <= 2 {
1780        return Ok(base_m0_m2[..=max_degree].to_vec());
1781    }
1782    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m2, max_degree)
1783    {
1784        return Ok(moments);
1785    }
1786    let d = quartic_qprime_coefficients(cell.c0, cell.c1, cell.c2);
1787    let lead = d[3];
1788    if !lead.is_finite() || lead.abs() <= 1e-18 {
1789        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1790            "quartic moment reduction requires nonzero leading coefficient, got {lead:.3e}"
1791        ))
1792        .into());
1793    }
1794    let mut moments = vec![0.0; max_degree + 1];
1795    moments[0] = base_m0_m2[0];
1796    moments[1] = base_m0_m2[1];
1797    moments[2] = base_m0_m2[2];
1798    // Roll left^n / right^n across the recurrence rather than calling
1799    // `powi(n)` each iteration. Skip the multiply when an endpoint is
1800    // infinite — the boundary helper ignores the power in that case, and
1801    // ∞·0 would produce a NaN we'd then have to mask off anyway.
1802    let left_finite = cell.left.is_finite();
1803    let right_finite = cell.right.is_finite();
1804    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1805    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1806    for n in 0..=(max_degree - 3) {
1807        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1808        let mut numer = if n == 0 {
1809            0.0
1810        } else {
1811            (n as f64) * moments[n - 1]
1812        };
1813        for j in 0..=2 {
1814            numer -= d[j] * moments[n + j];
1815        }
1816        numer -= b_n;
1817        moments[n + 3] = numer / lead;
1818        if left_finite {
1819            left_pow_n *= cell.left;
1820        }
1821        if right_finite {
1822            right_pow_n *= cell.right;
1823        }
1824    }
1825    Ok(moments)
1826}
1827
1828pub fn reduce_sextic_moments(
1829    cell: DenestedCubicCell,
1830    base_m0_m4: [f64; 5],
1831    max_degree: usize,
1832) -> Result<Vec<f64>, String> {
1833    if max_degree <= 4 {
1834        return Ok(base_m0_m4[..=max_degree].to_vec());
1835    }
1836    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m4, max_degree)
1837    {
1838        return Ok(moments);
1839    }
1840    let d = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3);
1841    let lead = d[5];
1842    if !lead.is_finite() {
1843        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1844            "sextic moment reduction encountered non-finite leading coefficient: {lead:.3e}"
1845        ))
1846        .into());
1847    }
1848    if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
1849        if lower_branch == ExactCellBranch::Quartic {
1850            return evaluate_non_affine_cell_state(
1851                DenestedCubicCell { c3: 0.0, ..cell },
1852                ExactCellBranch::Quartic,
1853                max_degree,
1854            )
1855            .map(|state| state.moments.into_vec());
1856        }
1857        return evaluate_affine_cell_state(
1858            DenestedCubicCell {
1859                left: cell.left,
1860                right: cell.right,
1861                c0: cell.c0,
1862                c1: cell.c1,
1863                c2: 0.0,
1864                c3: 0.0,
1865            },
1866            max_degree,
1867        )
1868        .map(|state| state.moments.into_vec());
1869    }
1870    let mut moments = vec![0.0; max_degree + 1];
1871    for (idx, value) in base_m0_m4.into_iter().enumerate() {
1872        moments[idx] = value;
1873    }
1874    let left_finite = cell.left.is_finite();
1875    let right_finite = cell.right.is_finite();
1876    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1877    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1878    for n in 0..=(max_degree - 5) {
1879        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1880        let mut numer = if n == 0 {
1881            0.0
1882        } else {
1883            (n as f64) * moments[n - 1]
1884        };
1885        for j in 0..=4 {
1886            numer -= d[j] * moments[n + j];
1887        }
1888        numer -= b_n;
1889        moments[n + 5] = numer / lead;
1890        if left_finite {
1891            left_pow_n *= cell.left;
1892        }
1893        if right_finite {
1894            right_pow_n *= cell.right;
1895        }
1896    }
1897    Ok(moments)
1898}
1899
1900#[inline]
1901pub fn cell_first_derivative_from_moments(
1902    derivative_coefficients: &[f64],
1903    moments: &[f64],
1904) -> Result<f64, String> {
1905    let value = moment_dot_with_coefficients(derivative_coefficients, moments, "first derivative")?;
1906    Ok(value * INV_TWO_PI)
1907}
1908
/// Highest moment index that
/// `cell_first_derivative_from_moments(derivative_coefficients, moments)`
/// reads, i.e. the `max_degree` to hand to `evaluate_cell_moments`.
///
/// This is `derivative_coefficients.len() - 1`, or `0` for an empty slice.
/// Request at least this many degrees so the moment dot is well-defined;
/// #321 came from a call site that hardcoded a smaller value.
#[inline]
pub fn cell_first_derivative_required_max_degree(derivative_coefficients: &[f64]) -> usize {
    match derivative_coefficients.len() {
        0 => 0,
        len => len - 1,
    }
}
1920
/// Highest moment index that `cell_second_derivative_from_moments` reads.
///
/// Equals that kernel's internal `needed = max(second_deg, product_deg) + 1`
/// minus one, so the value can be passed directly as the `max_degree`
/// argument of `evaluate_cell_moments(cell, max_degree)`.
///
/// `product_deg` is the degree of `eta · c_r · c_s`: the two first-coefficient
/// degrees plus 3 for the cubic-cell `eta` polynomial. An empty slice counts
/// as degree 0.
#[inline]
pub fn cell_second_derivative_required_max_degree(
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    second_coefficients_rs: &[f64],
) -> usize {
    let degree_of = |coefficients: &[f64]| coefficients.len().saturating_sub(1);
    let product_degree = degree_of(first_coefficients_r) + degree_of(first_coefficients_s) + 3;
    degree_of(second_coefficients_rs).max(product_degree)
}
1941
1942#[inline]
1943pub fn cell_polynomial_integral_from_moments(
1944    polynomial_coefficients: &[f64],
1945    moments: &[f64],
1946    label: &str,
1947) -> Result<f64, String> {
1948    let value = moment_dot_with_coefficients(polynomial_coefficients, moments, label)?;
1949    Ok(value * INV_TWO_PI)
1950}
1951
1952#[inline]
1953pub fn cell_second_derivative_from_moments(
1954    cell: DenestedCubicCell,
1955    first_coefficients_r: &[f64],
1956    first_coefficients_s: &[f64],
1957    second_coefficients_rs: &[f64],
1958    moments: &[f64],
1959) -> Result<f64, String> {
1960    let second_degree = second_coefficients_rs.len().saturating_sub(1);
1961    let product_degree = first_coefficients_r.len().saturating_sub(1)
1962        + first_coefficients_s.len().saturating_sub(1)
1963        + 3;
1964    let needed = second_degree.max(product_degree) + 1;
1965    if needed > moments.len() {
1966        return Err(CubicCellKernelError::insufficient_moments(format!(
1967            "insufficient reduced moments for second derivative: need {}, have {}",
1968            needed,
1969            moments.len()
1970        ))
1971        .into());
1972    }
1973    let second_term = moment_dot_with_coefficients_unchecked(second_coefficients_rs, moments);
1974    // Fold `Σ_{e,i,j} eta[e]·r[i]·s[j]·moments[e+i+j]` into a single dot
1975    // against `moments`. Convolving `eta ⊗ r ⊗ s` first turns the original
1976    // `len(eta)·len(r)·len(s)` triple loop (typically 4·4·4 = 64 mul-adds
1977    // per call) into `len(eta)·len(r) + (len(eta)+len(r)-1)·len(s) +
1978    // len(out)` ≈ 16 + 28 + 10 = 54 mul-adds, with the inner loops now in
1979    // straight-line FMA-friendly form.
1980    let cubic = [cell.c0, cell.c1, cell.c2, cell.c3];
1981    // Capacity bound: cubic (4) + first_r (≤MAX) + first_s (≤MAX) - 2.
1982    // First-coefficient slices are passed in as `[f64; 4]` from every
1983    // production caller; sizing to 32 covers any realistic test input.
1984    const SCRATCH: usize = 32;
1985    let mut eta_r = [0.0_f64; SCRATCH];
1986    let mut eta_rs = [0.0_f64; SCRATCH];
1987    let er_len = poly_conv_into(&cubic, first_coefficients_r, &mut eta_r);
1988    let ers_len = poly_conv_into(&eta_r[..er_len], first_coefficients_s, &mut eta_rs);
1989    let mut eta_term = 0.0;
1990    for k in 0..ers_len {
1991        eta_term = eta_rs[k].mul_add(moments[k], eta_term);
1992    }
1993    Ok((second_term - eta_term) * INV_TWO_PI)
1994}
1995
1996/// Pointwise value of the cell second-derivative integrand
1997/// `(∂²/∂r∂s) exp(-q(z))/2π` at a single `z`, evaluated from the SAME
1998/// `(r, s, rs)` coefficient polynomials the moment reduction
1999/// [`cell_second_derivative_from_moments`] integrates:
2000///
2001/// ```text
2002///   F_rs(z) = ( c_rs(z) - η(z)·c_r(z)·c_s(z) ) · exp(-q(z)) · 1/2π ,
2003/// ```
2004///
2005/// with `c_•(z) = Σ_k coeff_•[k]·zᵏ`, `η(z)` the cell cubic, and
2006/// `q(z) = ½(z² + η(z)²)`. This is the integrand whose `[cell.left,
2007/// cell.right]` integral the from-moments form returns — needed for the
2008/// Leibniz boundary term when a cell edge (a link-knot crossing
2009/// `z=(τ-a)/b`) moves with a parameter (the slope `b`): the directional
2010/// derivative of `∫_{z_L}^{z_R} F_rs dz` picks up
2011/// `F_rs(z_R)·z_R'(dir) - F_rs(z_L)·z_L'(dir)` on top of the fixed-domain
2012/// part. Coefficient sign convention matches the simpson reference
2013/// (`numeric_ab`): pass the ACTUAL derivative-coefficient polynomials
2014/// `∂c/∂r` etc. (not the negated `neg_dc_d•` the moment path consumes).
2015#[inline]
2016pub fn cell_second_derivative_boundary_integrand(
2017    cell: DenestedCubicCell,
2018    first_coefficients_r: &[f64],
2019    first_coefficients_s: &[f64],
2020    second_coefficients_rs: &[f64],
2021    z: f64,
2022) -> f64 {
2023    let eta = cell.eta(z);
2024    let c_r = poly_eval_at(first_coefficients_r, z);
2025    let c_s = poly_eval_at(first_coefficients_s, z);
2026    let c_rs = poly_eval_at(second_coefficients_rs, z);
2027    (c_rs - eta * c_r * c_s) * (-cell.q(z)).exp() * INV_TWO_PI
2028}
2029
2030/// Pointwise value of the cell third-derivative integrand
2031/// `(∂³/∂r∂s∂t) exp(-q(z))/2π` at a single `z`, evaluated from the same
2032/// `(r, s, t, rs, rt, st, rst)` coefficient polynomials that
2033/// [`cell_third_derivative_from_moments`] integrates:
2034///
2035/// ```text
2036/// F_rst(z) = (
2037///     c_rst(z)
2038///   - η(z)·(c_rs(z)c_t(z) + c_rt(z)c_s(z) + c_st(z)c_r(z))
2039///   + (η(z)² - 1)·c_r(z)c_s(z)c_t(z)
2040/// ) · exp(-q(z)) · 1/2π .
2041/// ```
2042///
2043/// This is the boundary value for differentiating an already-third-order
2044/// fixed-domain integral with respect to a moving edge. The sign convention is
2045/// intentionally identical to [`cell_third_derivative_from_moments`]: callers
2046/// must pass the coefficient slices in the convention of the integral they are
2047/// differentiating. In particular, survival/probit paths that integrate the
2048/// jointly negated cell and coefficient slices must evaluate this boundary
2049/// integrand with the same joint negation; evaluating an un-negated boundary for
2050/// a negated fixed-domain integral flips the sign of this odd-order integrand.
2051#[inline]
2052pub fn cell_third_derivative_boundary_integrand(
2053    cell: DenestedCubicCell,
2054    first_coefficients_r: &[f64],
2055    first_coefficients_s: &[f64],
2056    first_coefficients_t: &[f64],
2057    second_coefficients_rs: &[f64],
2058    second_coefficients_rt: &[f64],
2059    second_coefficients_st: &[f64],
2060    third_coefficients_rst: &[f64],
2061    z: f64,
2062) -> f64 {
2063    let eta = cell.eta(z);
2064    let c_r = poly_eval_at(first_coefficients_r, z);
2065    let c_s = poly_eval_at(first_coefficients_s, z);
2066    let c_t = poly_eval_at(first_coefficients_t, z);
2067    let c_rs = poly_eval_at(second_coefficients_rs, z);
2068    let c_rt = poly_eval_at(second_coefficients_rt, z);
2069    let c_st = poly_eval_at(second_coefficients_st, z);
2070    let c_rst = poly_eval_at(third_coefficients_rst, z);
2071    let amplitude =
2072        c_rst - eta * (c_rs * c_t + c_rt * c_s + c_st * c_r) + (eta * eta - 1.0) * c_r * c_s * c_t;
2073    amplitude * (-cell.q(z)).exp() * INV_TWO_PI
2074}
2075
2076/// Pointwise value of the density-weighted integrand `g(z)·exp(-q(z))/2π` at a
2077/// single `z`, for an arbitrary integrand polynomial `g`.
2078///
2079/// This is the boundary value needed for the moving-domain (Leibniz) term of a
2080/// density-normalization integral `∫ g(z)·exp(-q(z))/2π dz` whose cell edge is a
2081/// link-knot crossing `z=(τ-a)/b` that moves with a parameter direction: the
2082/// directional derivative of the integral picks up
2083/// `g(z_R)·w(z_R)·z_R'(dir) - g(z_L)·w(z_L)·z_L'(dir)` on top of the
2084/// fixed-domain part, with `w(z)=exp(-q(z))/2π` the same weight the moment
2085/// reductions integrate. Unlike the Hessian-integral boundary term (which is
2086/// shared by adjacent cells and cancels across each interior knot), the
2087/// ln-density integrand `D_t`/`D_t,uv` carries a non-shared `g`, so this
2088/// Leibniz term does NOT cancel and must be added (gam#932/#979).
2089pub fn cell_density_boundary_integrand(cell: DenestedCubicCell, g: &[f64], z: f64) -> f64 {
2090    poly_eval_at(g, z) * (-cell.q(z)).exp() * INV_TWO_PI
2091}
2092
/// Evaluates `Σ_k coefficients[k]·zᵏ` by Horner's rule, highest power first,
/// using one fused multiply-add per coefficient. An empty slice yields `0.0`.
#[inline]
fn poly_eval_at(coefficients: &[f64], z: f64) -> f64 {
    coefficients
        .iter()
        .rev()
        .fold(0.0_f64, |acc, &coefficient| acc.mul_add(z, coefficient))
}
2102
2103#[inline]
2104fn moment_dot_with_coefficients(
2105    coefficients: &[f64],
2106    moments: &[f64],
2107    label: &str,
2108) -> Result<f64, String> {
2109    if coefficients.len() > moments.len() {
2110        return Err(CubicCellKernelError::insufficient_moments(format!(
2111            "insufficient reduced moments for {label}: need {}, have {}",
2112            coefficients.len(),
2113            moments.len()
2114        ))
2115        .into());
2116    }
2117    Ok(moment_dot_with_coefficients_unchecked(
2118        coefficients,
2119        moments,
2120    ))
2121}
2122
/// Dot product `Σ_k coefficients[k]·moments[k]`, accumulated in index order
/// with fused multiply-adds.
///
/// Performs no up-front length validation: `moments` is indexed directly, so
/// the caller must ensure it is at least as long as `coefficients`.
#[inline]
fn moment_dot_with_coefficients_unchecked(coefficients: &[f64], moments: &[f64]) -> f64 {
    coefficients
        .iter()
        .enumerate()
        .fold(0.0, |acc, (idx, &coeff)| coeff.mul_add(moments[idx], acc))
}
2131
/// Multiplies two polynomials given as coefficient slices, writing the product
/// coefficients into the front of a caller-supplied buffer.
///
/// Returns the number of coefficients written: `lhs.len() + rhs.len() - 1`
/// when both inputs are non-empty, `0` otherwise (in which case `out` is left
/// untouched). Only the returned prefix of `out` is meaningful; entries past
/// it keep whatever they held before.
///
/// The multi-derivative reductions use this to collapse `eta · r · s · …`
/// triple and quadruple sums into one moment dot, removing the
/// `O(deg^3)`/`O(deg^4)` inner-loop work that dominated the
/// `cell_*_derivative_from_moments` hot leaves on large-scale fits.
///
/// # Panics
/// Panics if `out` is shorter than the product length.
#[inline]
fn poly_conv_into(lhs: &[f64], rhs: &[f64], out: &mut [f64]) -> usize {
    let len = match (lhs.len(), rhs.len()) {
        (0, _) | (_, 0) => return 0,
        (lhs_len, rhs_len) => lhs_len + rhs_len - 1,
    };
    assert!(out.len() >= len);

    let product = &mut out[..len];
    product.fill(0.0);
    for (i, &lv) in lhs.iter().enumerate() {
        // Row `i` of the product lands on slots `i..i + rhs.len()`.
        for (slot, &rv) in product[i..].iter_mut().zip(rhs) {
            *slot = lv.mul_add(rv, *slot);
        }
    }
    len
}
2158
2159#[inline]
2160fn require_moments_degree(
2161    required_degree: usize,
2162    moments: &[f64],
2163    label: &str,
2164) -> Result<(), String> {
2165    if required_degree >= moments.len() {
2166        return Err(CubicCellKernelError::insufficient_moments(format!(
2167            "insufficient reduced moments for {label}: need {}, have {}",
2168            required_degree + 1,
2169            moments.len()
2170        ))
2171        .into());
2172    }
2173    Ok::<(), _>(())
2174}
2175
2176#[inline]
2177fn require_scratch_capacity(
2178    required_len: usize,
2179    capacity: usize,
2180    label: &str,
2181) -> Result<(), String> {
2182    if required_len > capacity {
2183        return Err(CubicCellKernelError::insufficient_moments(format!(
2184            "{label} polynomial convolution scratch too small: need {required_len}, have {capacity}"
2185        ))
2186        .into());
2187    }
2188    Ok::<(), _>(())
2189}
2190
/// Coefficient count of the product of polynomials whose coefficient counts
/// are given in `lengths`: the sum of the counts minus one per multiplication.
///
/// Returns `0` when `lengths` is empty or any factor has zero coefficients.
#[inline]
fn convolution_chain_len(lengths: &[usize]) -> usize {
    match lengths {
        [] => 0,
        _ if lengths.iter().any(|&len| len == 0) => 0,
        _ => lengths.iter().sum::<usize>() - (lengths.len() - 1),
    }
}
2199
/// Polynomial degree (`len - 1`) of a first-derivative coefficient slice.
///
/// `label` identifies the slice (e.g. `"r"`) in the error message.
///
/// # Errors
/// Returns an error string when `coefficients` is empty.
#[inline]
fn first_coefficients_degree(label: &str, coefficients: &[f64]) -> Result<usize, String> {
    match coefficients.len() {
        0 => Err(format!(
            "{label} first-derivative coefficients must be non-empty"
        )),
        len => Ok(len - 1),
    }
}
2207
2208#[inline]
2209pub fn cell_third_derivative_from_moments(
2210    cell: DenestedCubicCell,
2211    first_coefficients_r: &[f64],
2212    first_coefficients_s: &[f64],
2213    first_coefficients_t: &[f64],
2214    second_coefficients_rs: &[f64],
2215    second_coefficients_rt: &[f64],
2216    second_coefficients_st: &[f64],
2217    third_coefficients_rst: &[f64],
2218    moments: &[f64],
2219) -> Result<f64, String> {
2220    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2221    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2222    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2223    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2224    let second_sum_degree = [
2225        second_coefficients_rs.len() + first_coefficients_t.len(),
2226        second_coefficients_rt.len() + first_coefficients_s.len(),
2227        second_coefficients_st.len() + first_coefficients_r.len(),
2228    ]
2229    .into_iter()
2230    .max()
2231    .unwrap_or(0)
2232    .saturating_sub(1);
2233    let triple_product_degree = r_degree + s_degree + t_degree;
2234    let needed = (third_coefficients_rst.len().saturating_sub(1))
2235        .max(3 + second_sum_degree)
2236        .max(6 + triple_product_degree);
2237    require_moments_degree(needed, moments, "third derivative")?;
2238
2239    let third_term = moment_dot_with_coefficients_unchecked(third_coefficients_rst, moments);
2240
2241    // This is a deliberately serial leaf kernel: each call performs only a
2242    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2243    // at the surrounding row/cell batch level rather than inside this hot path.
2244    const SCRATCH: usize = 32;
2245    let max_linear_conv_len = [
2246        convolution_chain_len(&[
2247            eta.len(),
2248            second_coefficients_rs.len(),
2249            first_coefficients_t.len(),
2250        ]),
2251        convolution_chain_len(&[
2252            eta.len(),
2253            second_coefficients_rt.len(),
2254            first_coefficients_s.len(),
2255        ]),
2256        convolution_chain_len(&[
2257            eta.len(),
2258            second_coefficients_st.len(),
2259            first_coefficients_r.len(),
2260        ]),
2261    ]
2262    .into_iter()
2263    .max()
2264    .unwrap_or(0);
2265    let max_cubic_conv_len = convolution_chain_len(&[
2266        7,
2267        first_coefficients_r.len(),
2268        first_coefficients_s.len(),
2269        first_coefficients_t.len(),
2270    ]);
2271    require_scratch_capacity(
2272        max_linear_conv_len.max(max_cubic_conv_len),
2273        SCRATCH,
2274        "third derivative",
2275    )?;
2276    let mut buf_a = [0.0_f64; SCRATCH];
2277    let mut buf_b = [0.0_f64; SCRATCH];
2278
2279    // eta_second_term = Σ over (rs⊗t, rt⊗s, st⊗r) of eta⊗product · moments.
2280    // Fold each of the three triple sums into a single moment dot.
2281    let mut eta_second_term = 0.0;
2282    let conv_dot = |first: &[f64],
2283                    second: &[f64],
2284                    buf_a: &mut [f64; SCRATCH],
2285                    buf_b: &mut [f64; SCRATCH]|
2286     -> f64 {
2287        let m = poly_conv_into(first, second, buf_a);
2288        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2289        let mut acc = 0.0;
2290        for k in 0..n {
2291            acc = buf_b[k].mul_add(moments[k], acc);
2292        }
2293        acc
2294    };
2295    eta_second_term += conv_dot(
2296        second_coefficients_rs,
2297        first_coefficients_t,
2298        &mut buf_a,
2299        &mut buf_b,
2300    );
2301    eta_second_term += conv_dot(
2302        second_coefficients_rt,
2303        first_coefficients_s,
2304        &mut buf_a,
2305        &mut buf_b,
2306    );
2307    eta_second_term += conv_dot(
2308        second_coefficients_st,
2309        first_coefficients_r,
2310        &mut buf_a,
2311        &mut buf_b,
2312    );
2313
2314    // cubic_coeff_term = Σ_{e,i,j,k} (eta·eta − 1)[e] · r[i] · s[j] · t[k] · moments[e+i+j+k].
2315    // Convolve r⊗s, then ⊗t, then ⊗(eta·eta − 1), giving a single dot.
2316    let mut eta_sq_minus_one = [0.0_f64; 7];
2317    for (i, &eta_i) in eta.iter().enumerate() {
2318        for (j, &eta_j) in eta.iter().enumerate() {
2319            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2320        }
2321    }
2322    eta_sq_minus_one[0] -= 1.0;
2323
2324    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2325    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2326    // buf_a now reused for (eta_sq_minus_one ⊗ rst).
2327    let final_len = poly_conv_into(&eta_sq_minus_one, &buf_b[..rst_len], &mut buf_a);
2328    let mut cubic_coeff_term = 0.0;
2329    for k in 0..final_len {
2330        cubic_coeff_term = buf_a[k].mul_add(moments[k], cubic_coeff_term);
2331    }
2332
2333    Ok((third_term - eta_second_term + cubic_coeff_term) * INV_TWO_PI)
2334}
2335
2336#[inline]
2337pub fn cell_fourth_derivative_from_moments(
2338    cell: DenestedCubicCell,
2339    first_coefficients_r: &[f64],
2340    first_coefficients_s: &[f64],
2341    first_coefficients_t: &[f64],
2342    first_coefficients_u: &[f64],
2343    second_coefficients_rs: &[f64],
2344    second_coefficients_rt: &[f64],
2345    second_coefficients_ru: &[f64],
2346    second_coefficients_st: &[f64],
2347    second_coefficients_su: &[f64],
2348    second_coefficients_tu: &[f64],
2349    third_coefficients_rst: &[f64],
2350    third_coefficients_rsu: &[f64],
2351    third_coefficients_rtu: &[f64],
2352    third_coefficients_stu: &[f64],
2353    fourth_coefficients_rstu: &[f64],
2354    moments: &[f64],
2355) -> Result<f64, String> {
2356    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2357    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2358    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2359    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2360    let u_degree = first_coefficients_degree("u", first_coefficients_u)?;
2361    let linear_sum_degree = [
2362        third_coefficients_rst.len() + first_coefficients_u.len(),
2363        third_coefficients_rsu.len() + first_coefficients_t.len(),
2364        third_coefficients_rtu.len() + first_coefficients_s.len(),
2365        third_coefficients_stu.len() + first_coefficients_r.len(),
2366        second_coefficients_rs.len() + second_coefficients_tu.len(),
2367        second_coefficients_rt.len() + second_coefficients_su.len(),
2368        second_coefficients_ru.len() + second_coefficients_st.len(),
2369    ]
2370    .into_iter()
2371    .max()
2372    .unwrap_or(0)
2373    .saturating_sub(1);
2374    let quad_sum_degree = [
2375        second_coefficients_rs.len() + first_coefficients_t.len() + first_coefficients_u.len(),
2376        second_coefficients_rt.len() + first_coefficients_s.len() + first_coefficients_u.len(),
2377        second_coefficients_ru.len() + first_coefficients_s.len() + first_coefficients_t.len(),
2378        second_coefficients_st.len() + first_coefficients_r.len() + first_coefficients_u.len(),
2379        second_coefficients_su.len() + first_coefficients_r.len() + first_coefficients_t.len(),
2380        second_coefficients_tu.len() + first_coefficients_r.len() + first_coefficients_s.len(),
2381    ]
2382    .into_iter()
2383    .max()
2384    .unwrap_or(0)
2385    .saturating_sub(2);
2386    let quartic_product_degree = r_degree + s_degree + t_degree + u_degree;
2387    let needed = (fourth_coefficients_rstu.len().saturating_sub(1))
2388        .max(3 + linear_sum_degree)
2389        .max(6 + quad_sum_degree)
2390        .max(9 + quartic_product_degree);
2391    require_moments_degree(needed, moments, "fourth derivative")?;
2392
2393    let fourth_term = moment_dot_with_coefficients_unchecked(fourth_coefficients_rstu, moments);
2394
2395    // This is a deliberately serial leaf kernel: each call performs only a
2396    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2397    // at the surrounding row/cell batch level rather than inside this hot path.
2398    const SCRATCH: usize = 32;
2399    let max_linear_conv_len = [
2400        convolution_chain_len(&[
2401            eta.len(),
2402            third_coefficients_rst.len(),
2403            first_coefficients_u.len(),
2404        ]),
2405        convolution_chain_len(&[
2406            eta.len(),
2407            third_coefficients_rsu.len(),
2408            first_coefficients_t.len(),
2409        ]),
2410        convolution_chain_len(&[
2411            eta.len(),
2412            third_coefficients_rtu.len(),
2413            first_coefficients_s.len(),
2414        ]),
2415        convolution_chain_len(&[
2416            eta.len(),
2417            third_coefficients_stu.len(),
2418            first_coefficients_r.len(),
2419        ]),
2420        convolution_chain_len(&[
2421            eta.len(),
2422            second_coefficients_rs.len(),
2423            second_coefficients_tu.len(),
2424        ]),
2425        convolution_chain_len(&[
2426            eta.len(),
2427            second_coefficients_rt.len(),
2428            second_coefficients_su.len(),
2429        ]),
2430        convolution_chain_len(&[
2431            eta.len(),
2432            second_coefficients_ru.len(),
2433            second_coefficients_st.len(),
2434        ]),
2435    ]
2436    .into_iter()
2437    .max()
2438    .unwrap_or(0);
2439    let max_quad_conv_len = [
2440        convolution_chain_len(&[
2441            7,
2442            second_coefficients_rs.len(),
2443            first_coefficients_t.len(),
2444            first_coefficients_u.len(),
2445        ]),
2446        convolution_chain_len(&[
2447            7,
2448            second_coefficients_rt.len(),
2449            first_coefficients_s.len(),
2450            first_coefficients_u.len(),
2451        ]),
2452        convolution_chain_len(&[
2453            7,
2454            second_coefficients_ru.len(),
2455            first_coefficients_s.len(),
2456            first_coefficients_t.len(),
2457        ]),
2458        convolution_chain_len(&[
2459            7,
2460            second_coefficients_st.len(),
2461            first_coefficients_r.len(),
2462            first_coefficients_u.len(),
2463        ]),
2464        convolution_chain_len(&[
2465            7,
2466            second_coefficients_su.len(),
2467            first_coefficients_r.len(),
2468            first_coefficients_t.len(),
2469        ]),
2470        convolution_chain_len(&[
2471            7,
2472            second_coefficients_tu.len(),
2473            first_coefficients_r.len(),
2474            first_coefficients_s.len(),
2475        ]),
2476    ]
2477    .into_iter()
2478    .max()
2479    .unwrap_or(0);
2480    let max_quartic_conv_len = convolution_chain_len(&[
2481        10,
2482        first_coefficients_r.len(),
2483        first_coefficients_s.len(),
2484        first_coefficients_t.len(),
2485        first_coefficients_u.len(),
2486    ]);
2487    require_scratch_capacity(
2488        max_linear_conv_len
2489            .max(max_quad_conv_len)
2490            .max(max_quartic_conv_len),
2491        SCRATCH,
2492        "fourth derivative",
2493    )?;
2494    let mut buf_a = [0.0_f64; SCRATCH];
2495    let mut buf_b = [0.0_f64; SCRATCH];
2496
2497    // eta_linear_term = Σ over seven (rst⊗u, rsu⊗t, rtu⊗s, stu⊗r, rs⊗tu,
2498    // rt⊗su, ru⊗st) of eta⊗product · moments. Fold each triple sum into
2499    // a single moment dot.
2500    let conv_eta_dot = |first: &[f64],
2501                        second: &[f64],
2502                        buf_a: &mut [f64; SCRATCH],
2503                        buf_b: &mut [f64; SCRATCH]|
2504     -> f64 {
2505        let m = poly_conv_into(first, second, buf_a);
2506        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2507        let mut acc = 0.0;
2508        for k in 0..n {
2509            acc = buf_b[k].mul_add(moments[k], acc);
2510        }
2511        acc
2512    };
2513    let mut eta_linear_term = 0.0;
2514    eta_linear_term += conv_eta_dot(
2515        third_coefficients_rst,
2516        first_coefficients_u,
2517        &mut buf_a,
2518        &mut buf_b,
2519    );
2520    eta_linear_term += conv_eta_dot(
2521        third_coefficients_rsu,
2522        first_coefficients_t,
2523        &mut buf_a,
2524        &mut buf_b,
2525    );
2526    eta_linear_term += conv_eta_dot(
2527        third_coefficients_rtu,
2528        first_coefficients_s,
2529        &mut buf_a,
2530        &mut buf_b,
2531    );
2532    eta_linear_term += conv_eta_dot(
2533        third_coefficients_stu,
2534        first_coefficients_r,
2535        &mut buf_a,
2536        &mut buf_b,
2537    );
2538    eta_linear_term += conv_eta_dot(
2539        second_coefficients_rs,
2540        second_coefficients_tu,
2541        &mut buf_a,
2542        &mut buf_b,
2543    );
2544    eta_linear_term += conv_eta_dot(
2545        second_coefficients_rt,
2546        second_coefficients_su,
2547        &mut buf_a,
2548        &mut buf_b,
2549    );
2550    eta_linear_term += conv_eta_dot(
2551        second_coefficients_ru,
2552        second_coefficients_st,
2553        &mut buf_a,
2554        &mut buf_b,
2555    );
2556
2557    let mut eta_sq_minus_one = [0.0_f64; 7];
2558    for (i, &eta_i) in eta.iter().enumerate() {
2559        for (j, &eta_j) in eta.iter().enumerate() {
2560            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2561        }
2562    }
2563    eta_sq_minus_one[0] -= 1.0;
2564
2565    // quad_coeff_term: six (eta²−1)⊗A⊗B⊗C · moments sums, where the (A,B,C)
2566    // factors are: (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s).
2567    let mut buf_c = [0.0_f64; SCRATCH];
2568    let conv_weighted_triple_dot = |weight: &[f64],
2569                                    a: &[f64],
2570                                    b: &[f64],
2571                                    c: &[f64],
2572                                    buf_a: &mut [f64; SCRATCH],
2573                                    buf_b: &mut [f64; SCRATCH],
2574                                    buf_c: &mut [f64; SCRATCH]|
2575     -> f64 {
2576        let ab_len = poly_conv_into(a, b, buf_a);
2577        let abc_len = poly_conv_into(&buf_a[..ab_len], c, buf_b);
2578        let final_len = poly_conv_into(weight, &buf_b[..abc_len], buf_c);
2579        let mut acc = 0.0;
2580        for k in 0..final_len {
2581            acc = buf_c[k].mul_add(moments[k], acc);
2582        }
2583        acc
2584    };
2585    let mut quad_coeff_term = 0.0;
2586    quad_coeff_term += conv_weighted_triple_dot(
2587        &eta_sq_minus_one,
2588        second_coefficients_rs,
2589        first_coefficients_t,
2590        first_coefficients_u,
2591        &mut buf_a,
2592        &mut buf_b,
2593        &mut buf_c,
2594    );
2595    quad_coeff_term += conv_weighted_triple_dot(
2596        &eta_sq_minus_one,
2597        second_coefficients_rt,
2598        first_coefficients_s,
2599        first_coefficients_u,
2600        &mut buf_a,
2601        &mut buf_b,
2602        &mut buf_c,
2603    );
2604    quad_coeff_term += conv_weighted_triple_dot(
2605        &eta_sq_minus_one,
2606        second_coefficients_ru,
2607        first_coefficients_s,
2608        first_coefficients_t,
2609        &mut buf_a,
2610        &mut buf_b,
2611        &mut buf_c,
2612    );
2613    quad_coeff_term += conv_weighted_triple_dot(
2614        &eta_sq_minus_one,
2615        second_coefficients_st,
2616        first_coefficients_r,
2617        first_coefficients_u,
2618        &mut buf_a,
2619        &mut buf_b,
2620        &mut buf_c,
2621    );
2622    quad_coeff_term += conv_weighted_triple_dot(
2623        &eta_sq_minus_one,
2624        second_coefficients_su,
2625        first_coefficients_r,
2626        first_coefficients_t,
2627        &mut buf_a,
2628        &mut buf_b,
2629        &mut buf_c,
2630    );
2631    quad_coeff_term += conv_weighted_triple_dot(
2632        &eta_sq_minus_one,
2633        second_coefficients_tu,
2634        first_coefficients_r,
2635        first_coefficients_s,
2636        &mut buf_a,
2637        &mut buf_b,
2638        &mut buf_c,
2639    );
2640
2641    // cubic_weight = 3·eta − eta³ (same as the prior expansion: eta_sq*eta
2642    // negated, plus the 3·eta linear correction).
2643    let mut eta_sq = [0.0_f64; 7];
2644    for (i, &eta_i) in eta.iter().enumerate() {
2645        for (j, &eta_j) in eta.iter().enumerate() {
2646            eta_sq[i + j] = eta_i.mul_add(eta_j, eta_sq[i + j]);
2647        }
2648    }
2649    let mut cubic_weight = [0.0_f64; 10];
2650    for (i, &eta_sq_i) in eta_sq.iter().enumerate() {
2651        for (j, &eta_j) in eta.iter().enumerate() {
2652            cubic_weight[i + j] = (-eta_sq_i).mul_add(eta_j, cubic_weight[i + j]);
2653        }
2654    }
2655    for (idx, &eta_coeff) in eta.iter().enumerate() {
2656        cubic_weight[idx] += 3.0 * eta_coeff;
2657    }
2658
2659    // quartic_coeff_term: cubic_weight ⊗ r ⊗ s ⊗ t ⊗ u · moments. The
2660    // original quintuple loop did 10·4·4·4·4 = 2560 mul-adds per call;
2661    // four sequential convolutions plus one moment dot drop this to
2662    // ~16+28+40+52+16 ≈ 152 mul-adds.
2663    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2664    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2665    let rstu_len = poly_conv_into(&buf_b[..rst_len], first_coefficients_u, &mut buf_a);
2666    let final_len = poly_conv_into(&cubic_weight, &buf_a[..rstu_len], &mut buf_b);
2667    let mut quartic_coeff_term = 0.0;
2668    for k in 0..final_len {
2669        quartic_coeff_term = buf_b[k].mul_add(moments[k], quartic_coeff_term);
2670    }
2671
2672    Ok((fourth_term - eta_linear_term + quad_coeff_term + quartic_coeff_term) * INV_TWO_PI)
2673}
2674
2675#[inline]
2676pub fn global_cubic_from_local(span: LocalSpanCubic) -> (f64, f64, f64, f64) {
2677    let left = span.left;
2678    let q0 = span.c0 - span.c1 * left + span.c2 * left * left - span.c3 * left * left * left;
2679    let q1 = span.c1 - 2.0 * span.c2 * left + 3.0 * span.c3 * left * left;
2680    let q2 = span.c2 - 3.0 * span.c3 * left;
2681    let q3 = span.c3;
2682    (q0, q1, q2, q3)
2683}
2684
2685/// Return the cubic polynomial coefficients (in `z`) of
2686/// `f(z) = link_span.evaluate(a + b*z)`.
2687///
2688/// `link_span.evaluate` is a cubic in its argument, so `f(z)` is also a cubic
2689/// in `z` and can be written exactly as
2690///
2691/// ```text
2692///     f(z) = d0 + d1·z + d2·z² + d3·z³
2693/// ```
2694///
2695/// where `(d0, d1, d2, d3)` are the values returned by this function. These
2696/// are **polynomial coefficients**, *not* derivatives of `f` at `z = 0`. The
2697/// relationship to Taylor derivatives is
2698///
2699/// ```text
2700///     d_k = f^(k)(0) / k!
2701/// ```
2702///
2703/// so `d0 = f(0)`, `d1 = f'(0)`, `d2 = ½·f''(0)`, `d3 = ⅙·f'''(0)`. Callers
2704/// such as [`denested_cell_coefficients`] and [`link_basis_cell_coefficients`]
2705/// rely on the polynomial-coefficient convention, since they propagate the
2706/// values directly as the `(c0, c1, c2, c3)` slots of a downstream polynomial
2707/// in `z`.
2708#[inline]
2709pub fn transformed_link_cubic(link_span: LocalSpanCubic, a: f64, b: f64) -> (f64, f64, f64, f64) {
2710    let shift = a - link_span.left;
2711    let d0 = link_span.c0
2712        + link_span.c1 * shift
2713        + link_span.c2 * shift * shift
2714        + link_span.c3 * shift * shift * shift;
2715    let d1 = b * (link_span.c1 + 2.0 * link_span.c2 * shift + 3.0 * link_span.c3 * shift * shift);
2716    let d2 = b * b * (link_span.c2 + 3.0 * link_span.c3 * shift);
2717    let d3 = link_span.c3 * b * b * b;
2718    (d0, d1, d2, d3)
2719}
2720
2721#[inline]
2722pub fn denested_cell_coefficients(
2723    score_span: LocalSpanCubic,
2724    link_span: LocalSpanCubic,
2725    a: f64,
2726    b: f64,
2727) -> [f64; 4] {
2728    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2729    let (d0, d1, d2, d3) = transformed_link_cubic(link_span, a, b);
2730    [a + b * h0 + d0, b + b * h1 + d1, b * h2 + d2, b * h3 + d3]
2731}
2732
2733#[inline]
2734pub fn denested_cell_coefficient_partials(
2735    score_span: LocalSpanCubic,
2736    link_span: LocalSpanCubic,
2737    a: f64,
2738    b: f64,
2739) -> ([f64; 4], [f64; 4]) {
2740    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2741    let shift = a - link_span.left;
2742    let alpha1 = link_span.c1;
2743    let alpha2 = link_span.c2;
2744    let alpha3 = link_span.c3;
2745    let dc_da = [
2746        1.0 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2747        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2748        3.0 * alpha3 * b * b,
2749        0.0,
2750    ];
2751    let dc_db = [
2752        h0,
2753        1.0 + h1 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2754        h2 + 2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2755        h3 + 3.0 * alpha3 * b * b,
2756    ];
2757    (dc_da, dc_db)
2758}
2759
2760#[inline]
2761fn link_cubic_second_partials(
2762    link_span: LocalSpanCubic,
2763    a: f64,
2764    b: f64,
2765) -> ([f64; 4], [f64; 4], [f64; 4]) {
2766    let shift = a - link_span.left;
2767    let alpha2 = link_span.c2;
2768    let alpha3 = link_span.c3;
2769    let dc_daa = [
2770        2.0 * alpha2 + 6.0 * alpha3 * shift,
2771        6.0 * alpha3 * b,
2772        0.0,
2773        0.0,
2774    ];
2775    let dc_dab = [
2776        0.0,
2777        2.0 * alpha2 + 6.0 * alpha3 * shift,
2778        6.0 * alpha3 * b,
2779        0.0,
2780    ];
2781    let dc_dbb = [
2782        0.0,
2783        0.0,
2784        2.0 * (alpha2 + 3.0 * alpha3 * shift),
2785        6.0 * alpha3 * b,
2786    ];
2787    (dc_daa, dc_dab, dc_dbb)
2788}
2789
2790#[inline]
2791pub fn denested_cell_second_partials(
2792    score_span: LocalSpanCubic,
2793    link_span: LocalSpanCubic,
2794    a: f64,
2795    b: f64,
2796) -> ([f64; 4], [f64; 4], [f64; 4]) {
2797    let score_left = score_span.left;
2798    if !score_left.is_finite() {
2799        return ([f64::NAN; 4], [f64::NAN; 4], [f64::NAN; 4]);
2800    }
2801    link_cubic_second_partials(link_span, a, b)
2802}
2803
2804#[inline]
2805fn link_cubic_third_partials(
2806    link_span: LocalSpanCubic,
2807) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2808    let alpha3 = link_span.c3;
2809    (
2810        [6.0 * alpha3, 0.0, 0.0, 0.0],
2811        [0.0, 6.0 * alpha3, 0.0, 0.0],
2812        [0.0, 0.0, 6.0 * alpha3, 0.0],
2813        [0.0, 0.0, 0.0, 6.0 * alpha3],
2814    )
2815}
2816
/// Third partial derivatives of the de-nested cell coefficients with respect
/// to `(a, b)`.
///
/// Forwards to `link_cubic_third_partials`. No score span is taken: in
/// [`denested_cell_coefficients`] the score terms enter as `b·h_k`, which is
/// linear in `b` and independent of `a`, so only the link cubic contributes
/// at third order.
#[inline]
pub fn denested_cell_third_partials(
    link_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_span)
}
2823
2824#[inline]
2825pub fn score_basis_cell_coefficients(score_basis_span: LocalSpanCubic, b: f64) -> [f64; 4] {
2826    let (h0, h1, h2, h3) = global_cubic_from_local(score_basis_span);
2827    [b * h0, b * h1, b * h2, b * h3]
2828}
2829
2830#[inline]
2831pub fn link_basis_cell_coefficients(link_basis_span: LocalSpanCubic, a: f64, b: f64) -> [f64; 4] {
2832    let (d0, d1, d2, d3) = transformed_link_cubic(link_basis_span, a, b);
2833    [d0, d1, d2, d3]
2834}
2835
2836#[inline]
2837pub fn link_basis_cell_coefficient_partials(
2838    link_basis_span: LocalSpanCubic,
2839    a: f64,
2840    b: f64,
2841) -> ([f64; 4], [f64; 4]) {
2842    let shift = a - link_basis_span.left;
2843    let alpha1 = link_basis_span.c1;
2844    let alpha2 = link_basis_span.c2;
2845    let alpha3 = link_basis_span.c3;
2846    let dc_da = [
2847        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2848        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2849        3.0 * alpha3 * b * b,
2850        0.0,
2851    ];
2852    let dc_db = [
2853        0.0,
2854        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2855        2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2856        3.0 * alpha3 * b * b,
2857    ];
2858    (dc_da, dc_db)
2859}
2860
/// Second partial derivatives of the [`link_basis_cell_coefficients`] vector
/// with respect to `(a, b)`.
///
/// Forwards unchanged to `link_cubic_second_partials`, whose result is
/// ordered `(∂²c/∂a², ∂²c/∂a∂b, ∂²c/∂b²)`.
#[inline]
pub fn link_basis_cell_second_partials(
    link_basis_span: LocalSpanCubic,
    a: f64,
    b: f64,
) -> ([f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_second_partials(link_basis_span, a, b)
}
2869
/// Third partial derivatives of the [`link_basis_cell_coefficients`] vector
/// with respect to `(a, b)`.
///
/// Forwards unchanged to `link_cubic_third_partials`; the result depends only
/// on the span's cubic coefficient `c3`.
#[inline]
pub fn link_basis_cell_third_partials(
    link_basis_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_basis_span)
}
2876
/// Build the de-nested partition cells for the affine map `z ↦ a + b·z`.
///
/// This is a pure forwarder: every argument is passed, unchanged and in the
/// same order, to [`build_denested_partition_cells_with_tails`], and its
/// result (cells or error string) is returned as-is.
pub fn build_denested_partition_cells<FS, FL>(
    a: f64,
    b: f64,
    score_breaks: &[f64],
    link_breaks: &[f64],
    score_span_at: FS,
    link_span_at: FL,
) -> Result<Vec<DenestedPartitionCell>, String>
where
    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
{
    build_denested_partition_cells_with_tails(
        a,
        b,
        score_breaks,
        link_breaks,
        score_span_at,
        link_span_at,
    )
}
2898
/// Build a partition covering `(-∞, +∞)` with parameter-independent outer
/// bounds.  Interior cells use the same finite-cell polynomial algebra.
/// The two tail cells are guaranteed affine (c2=c3=0) because both
/// deviations saturate to constants outside their knot support.
///
/// The tail cells' score/link spans come from the same closures evaluated
/// at a representative point in the tail region — the closures must return
/// constant (c1=c2=c3=0) cubics for points outside support.
///
/// # Split points
///
/// Every entry of `score_breaks` is a split point. When `|b| > 1e-12`, each
/// link break `tau` additionally contributes `z = (tau − a) / b` if that
/// value is finite; for smaller `|b|` the link breaks are ignored entirely.
/// The tagged list is then passed through `dedup_sorted_tagged_breakpoints`
/// (defined elsewhere; presumably it sorts and removes near-duplicates —
/// confirm against its definition).
///
/// # Returns
///
/// Cells in left-to-right order: the left tail, one cell per adjacent pair
/// of split points that is finite and wider than `1e-12`, then the right
/// tail. With no split points at all, a single cell spanning the whole line
/// is returned.
///
/// # Errors
///
/// Propagates any error from `score_span_at`, `link_span_at` or
/// `interval_probe_point`, and returns an `InvalidCellShape` message when a
/// tail cell's `c2` or `c3` exceeds `NORMALIZED_CELL_BRANCH_TOL` in
/// magnitude.
pub fn build_denested_partition_cells_with_tails<FS, FL>(
    a: f64,
    b: f64,
    score_breaks: &[f64],
    link_breaks: &[f64],
    mut score_span_at: FS,
    mut link_span_at: FL,
) -> Result<Vec<DenestedPartitionCell>, String>
where
    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
{
    // Collect all INTERNAL split points (finite), each tagged with its
    // provenance: a fixed score break or a link-knot crossing. Provenance
    // identifies the cell's `(a, b)` family for the Chebyshev moment-family
    // layer; the z coordinates alone cannot distinguish the two kinds.
    let mut split_points: Vec<(f64, PartitionEdge)> = score_breaks
        .iter()
        .map(|&sigma| (sigma, PartitionEdge::Fixed(sigma)))
        .collect();
    // A link knot `tau` is crossed where `a + b·z = tau`; with a (near-)zero
    // slope there is no usable crossing, so link breaks are skipped.
    if b.abs() > 1e-12 {
        for &tau in link_breaks {
            let z = (tau - a) / b;
            if z.is_finite() {
                split_points.push((z, PartitionEdge::Crossing { tau }));
            }
        }
    }
    dedup_sorted_tagged_breakpoints(&mut split_points);

    let mut out = Vec::new();

    // No split points: one cell spans the whole real line. The score span is
    // probed at z = 0 and the link span at the matching argument `a`.
    // NOTE(review): unlike the tail cells below, c2/c3 are zeroed here
    // without checking that the computed coefficients are actually affine —
    // confirm this is intentional.
    if split_points.is_empty() {
        let score_span = score_span_at(0.0)?;
        let link_span = link_span_at(a)?;
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        return Ok(vec![DenestedPartitionCell {
            cell: DenestedCubicCell {
                left: f64::NEG_INFINITY,
                right: f64::INFINITY,
                c0: coeffs[0],
                c1: coeffs[1],
                c2: 0.0,
                c3: 0.0,
            },
            score_span,
            link_span,
            left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
            right_edge: PartitionEdge::Fixed(f64::INFINITY),
        }]);
    }

    // ── Left tail cell: (-∞, leftmost_split] ──
    let (leftmost, leftmost_edge) = split_points[0];
    // Evaluate spans at a point just left of the leftmost split.  The
    // closures return constant tail cubics for this region.
    let left_probe = interval_probe_point(f64::NEG_INFINITY, leftmost)?;
    let left_score_span = score_span_at(left_probe)?;
    let left_link_span = link_span_at(a + b * left_probe)?;
    let left_coeffs = denested_cell_coefficients(left_score_span, left_link_span, a, b);
    // Reject a tail whose quadratic/cubic coefficients are not negligible;
    // the stored cell below hard-codes c2 = c3 = 0.
    if left_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
        || left_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
    {
        return Err(CubicCellKernelError::invalid_cell_shape(format!(
            "left tail cell must be affine (deviations constant outside support), \
             got c2={:.3e}, c3={:.3e}",
            left_coeffs[2], left_coeffs[3]
        ))
        .into());
    }
    out.push(DenestedPartitionCell {
        cell: DenestedCubicCell {
            left: f64::NEG_INFINITY,
            right: leftmost,
            c0: left_coeffs[0],
            c1: left_coeffs[1],
            c2: 0.0,
            c3: 0.0,
        },
        score_span: left_score_span,
        link_span: left_link_span,
        left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
        right_edge: leftmost_edge,
    });

    // ── Interior cells (all finite) ──
    for window in split_points.windows(2) {
        let (left, left_edge) = window[0];
        let (right, right_edge) = window[1];
        // Skip non-finite or degenerate (width ≤ 1e-12) windows.
        if !left.is_finite() || !right.is_finite() || right - left <= 1e-12 {
            continue;
        }
        // Probe both spans at a representative point inside the window; the
        // link span is looked up at the image `a + b·mid` of that point.
        let mid = interval_probe_point(left, right)?;
        let score_span = score_span_at(mid)?;
        let link_span = link_span_at(a + b * mid)?;
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        out.push(DenestedPartitionCell {
            cell: DenestedCubicCell {
                left,
                right,
                c0: coeffs[0],
                c1: coeffs[1],
                c2: coeffs[2],
                c3: coeffs[3],
            },
            score_span,
            link_span,
            left_edge,
            right_edge,
        });
    }

    // ── Right tail cell: [rightmost_split, +∞) ──
    // `unwrap` cannot fail: the empty case returned early above.
    let (rightmost, rightmost_edge) = *split_points.last().unwrap();
    let right_probe = interval_probe_point(rightmost, f64::INFINITY)?;
    let right_score_span = score_span_at(right_probe)?;
    let right_link_span = link_span_at(a + b * right_probe)?;
    let right_coeffs = denested_cell_coefficients(right_score_span, right_link_span, a, b);
    // Same affinity requirement as the left tail.
    if right_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
        || right_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
    {
        return Err(CubicCellKernelError::invalid_cell_shape(format!(
            "right tail cell must be affine (deviations constant outside support), \
             got c2={:.3e}, c3={:.3e}",
            right_coeffs[2], right_coeffs[3]
        ))
        .into());
    }
    out.push(DenestedPartitionCell {
        cell: DenestedCubicCell {
            left: rightmost,
            right: f64::INFINITY,
            c0: right_coeffs[0],
            c1: right_coeffs[1],
            c2: 0.0,
            c3: 0.0,
        },
        score_span: right_score_span,
        link_span: right_link_span,
        left_edge: rightmost_edge,
        right_edge: PartitionEdge::Fixed(f64::INFINITY),
    });

    Ok(out)
}
3052
3053#[inline]
3054pub fn normalized_non_affine_coefficients(
3055    left: f64,
3056    right: f64,
3057    c0: f64,
3058    c1: f64,
3059    c2: f64,
3060    c3: f64,
3061) -> Result<(f64, f64), String> {
3062    let width = right - left;
3063    if !width.is_finite() || width <= 0.0 {
3064        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3065            "normalized cubic coefficients require a positive finite cell width, got left={left}, right={right}"
3066        ))
3067        .into());
3068    }
3069    let anchor_scale = c0.abs() + c1.abs();
3070    if !anchor_scale.is_finite() {
3071        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3072            "normalized cubic coefficients require finite affine coefficients, got c0={c0}, c1={c1}"
3073        ))
3074        .into());
3075    }
3076    let mid = 0.5 * (left + right);
3077    let half = 0.5 * width;
3078    let k2 = half * half * (c2 + 3.0 * c3 * mid);
3079    let k3 = c3 * half * half * half;
3080    Ok((k2, k3))
3081}
3082
3083#[inline]
3084pub fn branch_cell(cell: DenestedCubicCell) -> Result<ExactCellBranch, String> {
3085    let tol = effective_branch_tol(cell);
3086    if !cell.left.is_finite() || !cell.right.is_finite() {
3087        if cell.c2.abs() <= tol && cell.c3.abs() <= tol {
3088            return Ok(ExactCellBranch::Affine);
3089        }
3090        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3091            "non-affine cells require finite bounds, got [{}, {}] with c2={:.6e}, c3={:.6e}",
3092            cell.left, cell.right, cell.c2, cell.c3
3093        ))
3094        .into());
3095    }
3096    let (k2, k3) = normalized_non_affine_coefficients(
3097        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3098    )?;
3099    if k2.abs() <= tol && k3.abs() <= tol {
3100        Ok(ExactCellBranch::Affine)
3101    } else if k3.abs() <= tol {
3102        Ok(ExactCellBranch::Quartic)
3103    } else {
3104        Ok(ExactCellBranch::Sextic)
3105    }
3106}
3107
3108#[inline]
3109fn degenerate_sextic_branch(
3110    cell: DenestedCubicCell,
3111    lead: f64,
3112) -> Result<Option<ExactCellBranch>, String> {
3113    // The sextic recurrence divides by `lead = 3*c3^2`. When that division is
3114    // unstable, lower the polynomial degree without discarding a material
3115    // quadratic coefficient.
3116    let (normalized_k2, normalized_k3) = normalized_non_affine_coefficients(
3117        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3118    )?;
3119    if normalized_k3.abs() > NORMALIZED_CELL_BRANCH_TOL && lead.abs() > 1e-18 {
3120        return Ok(None);
3121    }
3122    if normalized_k2.abs() > NORMALIZED_CELL_BRANCH_TOL {
3123        Ok(Some(ExactCellBranch::Quartic))
3124    } else {
3125        Ok(Some(ExactCellBranch::Affine))
3126    }
3127}
3128
3129#[inline]
3130fn validate_bvn_args(h: f64, k: f64, rho: f64) -> Result<(), String> {
3131    if !h.is_finite() && !h.is_infinite() {
3132        return Err(CubicCellKernelError::bivariate_normal_domain(
3133            "bivariate normal cdf requires finite or infinite h",
3134        )
3135        .into());
3136    }
3137    if !k.is_finite() && !k.is_infinite() {
3138        return Err(CubicCellKernelError::bivariate_normal_domain(
3139            "bivariate normal cdf requires finite or infinite k",
3140        )
3141        .into());
3142    }
3143    if !rho.is_finite() {
3144        return Err(CubicCellKernelError::bivariate_normal_domain(format!(
3145            "bivariate normal cdf requires finite correlation, got {rho}"
3146        ))
3147        .into());
3148    }
3149    Ok::<(), _>(())
3150}
3151
3152#[inline]
3153fn bvn_gl_sum(h: f64, k: f64, rho_clamped: f64, asr: f64) -> f64 {
3154    // The Drezner-Wesolowsky arcsin representation is integrated with the
3155    // same 20-point Gauss-Legendre rule as before, but mirrored node pairs are
3156    // evaluated with one sin_cos for the half-angle offset rather than two
3157    // independent sin calls.  This preserves the quadrature rule (and hence
3158    // the accuracy envelope) while reducing the transcendental work in the
3159    // dominant finite-bound path from 20 sin calls to 11 sin/cos evaluations.
3160    if rho_clamped == 0.0 {
3161        return 0.0;
3162    }
3163    let hs = 0.5 * (h * h + k * k);
3164    let hk = h * k;
3165    let half_asr = 0.5 * asr;
3166    let (sin_mid, cos_mid) = half_asr.sin_cos();
3167    let mut sum = 0.0;
3168    for i in 0..10 {
3169        let node = GL20_NODES[i].abs();
3170        let weight = GL20_WEIGHTS[i];
3171        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3172
3173        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3174        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3175        let expo_lo = ((sn_lo * hk) - hs) / one_minus_lo;
3176
3177        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3178        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3179        let expo_hi = ((sn_hi * hk) - hs) / one_minus_hi;
3180
3181        sum += weight * (expo_lo.exp() + expo_hi.exp());
3182    }
3183    sum
3184}
3185
3186pub fn bivariate_normal_cdf(h: f64, k: f64, rho: f64) -> Result<f64, String> {
3187    validate_bvn_args(h, k, rho)?;
3188    if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
3189        return Ok(0.0);
3190    }
3191    if h == f64::INFINITY {
3192        return Ok(normal_cdf(k));
3193    }
3194    if k == f64::INFINITY {
3195        return Ok(normal_cdf(h));
3196    }
3197
3198    let rho_clamped = rho.clamp(-1.0, 1.0);
3199    if rho_clamped >= 1.0 - 1e-12 {
3200        return Ok(normal_cdf(h.min(k)));
3201    }
3202    if rho_clamped <= -1.0 + 1e-12 {
3203        return Ok((normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0));
3204    }
3205    if rho_clamped == 0.0 {
3206        return Ok((normal_cdf(h) * normal_cdf(k)).clamp(0.0, 1.0));
3207    }
3208    if h == 0.0 && k == 0.0 {
3209        return Ok((0.25 + rho_clamped.asin() / std::f64::consts::TAU).clamp(0.0, 1.0));
3210    }
3211
3212    let asr = rho_clamped.asin();
3213    let sum = bvn_gl_sum(h, k, rho_clamped, asr);
3214    Ok((normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3215}
3216
3217#[inline]
3218fn bvn_gl_sum_interval(h: f64, left: f64, right: f64, rho_clamped: f64, asr: f64) -> f64 {
3219    if rho_clamped == 0.0 {
3220        return 0.0;
3221    }
3222    let h2 = h * h;
3223    let right_hs = 0.5 * (h2 + right * right);
3224    let left_hs = 0.5 * (h2 + left * left);
3225    let half_asr = 0.5 * asr;
3226    let (sin_mid, cos_mid) = half_asr.sin_cos();
3227    let mut sum = 0.0;
3228    for i in 0..10 {
3229        let node = GL20_NODES[i].abs();
3230        let weight = GL20_WEIGHTS[i];
3231        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3232
3233        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3234        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3235        let lo_right = (((sn_lo * h * right) - right_hs) / one_minus_lo).exp();
3236        let lo_left = (((sn_lo * h * left) - left_hs) / one_minus_lo).exp();
3237
3238        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3239        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3240        let hi_right = (((sn_hi * h * right) - right_hs) / one_minus_hi).exp();
3241        let hi_left = (((sn_hi * h * left) - left_hs) / one_minus_hi).exp();
3242
3243        sum += weight * ((lo_right - lo_left) + (hi_right - hi_left));
3244    }
3245    sum
3246}
3247
3248fn bivariate_normal_cdf_interval(h: f64, left: f64, right: f64, rho: f64) -> Result<f64, String> {
3249    if right <= left {
3250        return Ok(0.0);
3251    }
3252    if left == f64::NEG_INFINITY && right == f64::INFINITY {
3253        return Ok(normal_cdf(h));
3254    }
3255    if !left.is_finite() || !right.is_finite() {
3256        let upper = bivariate_normal_cdf(h, right, rho)?;
3257        let lower = bivariate_normal_cdf(h, left, rho)?;
3258        return Ok((upper - lower).clamp(0.0, 1.0));
3259    }
3260    validate_bvn_args(h, left, rho)?;
3261    validate_bvn_args(h, right, rho)?;
3262    if h == f64::NEG_INFINITY {
3263        return Ok(0.0);
3264    }
3265    if h == f64::INFINITY {
3266        return Ok((normal_cdf(right) - normal_cdf(left)).clamp(0.0, 1.0));
3267    }
3268
3269    let rho_clamped = rho.clamp(-1.0, 1.0);
3270    if rho_clamped >= 1.0 - 1e-12 || rho_clamped <= -1.0 + 1e-12 {
3271        let upper = bivariate_normal_cdf(h, right, rho_clamped)?;
3272        let lower = bivariate_normal_cdf(h, left, rho_clamped)?;
3273        return Ok((upper - lower).clamp(0.0, 1.0));
3274    }
3275
3276    let cdf_h = normal_cdf(h);
3277    let normal_part = cdf_h * (normal_cdf(right) - normal_cdf(left));
3278    if rho_clamped == 0.0 {
3279        return Ok(normal_part.clamp(0.0, 1.0));
3280    }
3281    let asr = rho_clamped.asin();
3282    let sum = bvn_gl_sum_interval(h, left, right, rho_clamped, asr);
3283    Ok((normal_part + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3284}
3285
/// Unnormalized Gaussian kernel `exp(−x²/2)`; returns `0.0` when `x` is
/// `±∞`.
fn exp_neg_half_square(x: f64) -> f64 {
    if x.is_infinite() {
        return 0.0;
    }
    let exponent = -0.5 * x * x;
    exponent.exp()
}
3293
3294/// Zeroth truncated standard-normal moment `T_0(a, b) = ∫_a^b e^(−z²/2) dz
3295/// = √(2π)·(Φ(b) − Φ(a))`, evaluated without catastrophic cancellation in
3296/// either tail.
3297///
3298/// Writing `T_0 = √(π/2)·[erf(b/√2) − erf(a/√2)]`, the naive form collapses
3299/// to `0.0` whenever both endpoints lie in the *same* far tail: `erf`
3300/// saturates at the IEEE-754 values `±1.0` for `|x| ≳ 8.3·√2`, so the
3301/// difference of two saturated values is exactly zero even though the
3302/// integral is a strictly positive number well inside the f64 normal range
3303/// (e.g. `∫_{-12}^{-10} ≈ 1.9e-23`). The fix is to reduce the erf difference
3304/// to complementary tail probabilities — `erfc` is evaluated with a dedicated
3305/// tail series, *not* as `1 − erf` — and to pick, by the sign of the
3306/// endpoints, the algebraically-equivalent form whose terms do not cancel
3307/// against one another:
3308///
3309/// ```text
3310/// both ≥ 0 (upper tail):  erf(b/√2) − erf(a/√2) = erfc(a/√2) − erfc(b/√2)
3311/// both ≤ 0 (lower tail):  erf(b/√2) − erf(a/√2) = erfc(−b/√2) − erfc(−a/√2)
3312/// straddling zero:        erf(b/√2) − erf(a/√2)
3313///                        = erf(b/√2) + erf(−a/√2)       near the anchor
3314///                        = 2 − erfc(b/√2) − erfc(−a/√2) otherwise
3315/// ```
3316///
3317/// In each branch every `erfc` argument is `≥ 0`, so the terms are small
3318/// positive tail values, while narrow straddling intervals add two
3319/// non-negative `erf` masses measured outward from the anchor. That avoids
3320/// the `2 − erfc(b/√2) − erfc(−a/√2)` cancellation when both erfc terms round
3321/// to `1.0`, but keeps the erfc-tail form for ordinary/full-line straddling
3322/// intervals. No large quantities cancel and full f64 precision survives down
3323/// to the underflow boundary in either tail and around the affine anchor.
3324///
3325/// Uses `libm::erfc` (msun double-precision implementation, ≤ 1 ulp) rather
3326/// than `statrs::function::erf::erfc` (a 6-term rational approximation that
3327/// carries ~3·10⁻¹¹ relative error around `|x| ≈ 1/√2` — see the existing
3328/// `libm::erfc` consumer at `inference::polya_gamma_core::normal_cdf`). That
3329/// statrs error propagates directly into `T_0`, then through every higher
3330/// moment `T_n` (the recurrence `T_n = a^{n-1}e^{-a²/2} − b^{n-1}e^{-b²/2}
3331/// + (n-1)·T_{n-2}` walks `T_0` up two steps at a time), then through every
3332/// affine-cell moment via `affine_anchor_moment_vector` (whose `out[n]` is a
3333/// linear combination of `T_0..=T_n`), and is the dominant source of error
3334/// in the affine-cell branch of the cubic-cell substrate (CPU/GPU parity
3335/// reference for transformation-normal, bernoulli-marginal-slope, and the
3336/// BMS flex-row higher-derivative reuse path).
3337fn truncated_gaussian_zeroth_moment(a: f64, b: f64) -> f64 {
3338    let inv_sqrt2 = 1.0 / std::f64::consts::SQRT_2;
3339    let za = a * inv_sqrt2;
3340    let zb = b * inv_sqrt2;
3341    let erf_diff = if za >= 0.0 {
3342        libm::erfc(za) - libm::erfc(zb)
3343    } else if zb <= 0.0 {
3344        libm::erfc(-zb) - libm::erfc(-za)
3345    } else if zb <= 0.5 && -za <= 0.5 {
3346        // Near the affine anchor, erfc(zb) and erfc(-za) are both close to
3347        // one; subtracting them from 2.0 can round a tiny but representable
3348        // cell mass to zero. The equivalent erf sum adds small positive
3349        // quantities directly.
3350        libm::erf(zb) + libm::erf(-za)
3351    } else {
3352        2.0 - libm::erfc(zb) - libm::erfc(-za)
3353    };
3354    // √(2π)·½ = √(π/2).
3355    (std::f64::consts::PI / 2.0).sqrt() * erf_diff
3356}
3357
3358/// Fill `out[0..=max_degree]` with the raw truncated standard-normal moments
3359///
3360/// ```text
3361/// T_n(a, b) = ∫_a^b z^n exp(-z²/2) dz
3362/// ```
3363///
3364/// using the integration-by-parts recurrence
3365///
3366/// ```text
3367/// T_0(a, b) = √(2π) (Φ(b) − Φ(a))
3368/// T_1(a, b) = exp(−a²/2) − exp(−b²/2)
3369/// T_n(a, b) = a^(n−1) e^{−a²/2} − b^(n−1) e^{−b²/2} + (n−1) T_{n−2}(a, b)
3370/// ```
3371///
3372/// Computed in one forward sweep so each call evaluates `erf` and
3373/// `exp(−x²/2)` exactly twice (once at `a`, once at `b`) regardless of the
3374/// requested degree. The naive form — calling `T_n` recursively for each
3375/// `n = 0..=max_degree` — re-evaluated `erf`/`exp` about `max_degree²/4`
3376/// times per affine cell, which dominated the wall time of the
3377/// transformation-normal and bernoulli-marginal-slope inner solves with
3378/// `max_degree = 64` (the transport order's required degree budget).
3379fn fill_truncated_gaussian_moments(a: f64, b: f64, out: &mut [f64]) {
3380    if out.is_empty() {
3381        return;
3382    }
3383    out[0] = truncated_gaussian_zeroth_moment(a, b);
3384    if out.len() == 1 {
3385        return;
3386    }
3387    let ea = exp_neg_half_square(a);
3388    let eb = exp_neg_half_square(b);
3389    out[1] = ea - eb;
3390    if out.len() == 2 {
3391        return;
3392    }
3393    let a_finite = a.is_finite();
3394    let b_finite = b.is_finite();
3395    // For n in 2..=max_degree we need a^{n-1} e^{-a²/2} (resp. b). Carry the
3396    // running powers a^{n-1}, b^{n-1} forward by a single multiply per step.
3397    // Infinite endpoints contribute 0 (the integrand decays at the rate of
3398    // exp(−x²/2)), matching the prior `is_infinite` branch in the recursive
3399    // implementation; we still update the running power so the iteration
3400    // stays branchless when both endpoints are finite.
3401    let mut a_pow_n_minus_1 = a; // a^1, used at n = 2
3402    let mut b_pow_n_minus_1 = b;
3403    for n in 2..out.len() {
3404        let left = if a_finite { a_pow_n_minus_1 * ea } else { 0.0 };
3405        let right = if b_finite { b_pow_n_minus_1 * eb } else { 0.0 };
3406        out[n] = left - right + (n as f64 - 1.0) * out[n - 2];
3407        a_pow_n_minus_1 *= a;
3408        b_pow_n_minus_1 *= b;
3409    }
3410}
3411
/// Stack-array bound for `affine_anchor_moment_vector_into`. Public callers
/// use up to ~24 (largest is the bernoulli-margslope outer-step degree-21
/// reduction); 64 leaves comfortable headroom without growing the per-call
/// stack footprint meaningfully.
///
/// `affine_anchor_moment_vector_into` sizes its scratch tables as
/// `[f64; MAX_AFFINE_ANCHOR_DEGREE + 1]` and asserts
/// `max_degree <= MAX_AFFINE_ANCHOR_DEGREE`, so a request above this bound
/// panics instead of indexing past the tables.
const MAX_AFFINE_ANCHOR_DEGREE: usize = 64;
3417
3418pub fn affine_anchor_moment_vector(
3419    alpha: f64,
3420    beta: f64,
3421    left: f64,
3422    right: f64,
3423    max_degree: usize,
3424) -> Vec<f64> {
3425    let mut out = vec![0.0; max_degree + 1];
3426    affine_anchor_moment_vector_into(alpha, beta, left, right, max_degree, &mut out);
3427    out
3428}
3429
3430fn affine_anchor_moment_vector_into(
3431    alpha: f64,
3432    beta: f64,
3433    left: f64,
3434    right: f64,
3435    max_degree: usize,
3436    out: &mut [f64],
3437) {
3438    assert_eq!(out.len(), max_degree + 1);
3439    let s = (1.0 + beta * beta).sqrt();
3440    let mu = -alpha * beta / (1.0 + beta * beta);
3441    let y_left = if left.is_infinite() {
3442        if left.is_sign_positive() {
3443            f64::INFINITY
3444        } else {
3445            f64::NEG_INFINITY
3446        }
3447    } else {
3448        s * (left - mu)
3449    };
3450    let y_right = if right.is_infinite() {
3451        if right.is_sign_positive() {
3452            f64::INFINITY
3453        } else {
3454            f64::NEG_INFINITY
3455        }
3456    } else {
3457        s * (right - mu)
3458    };
3459    let anchor = (-alpha * alpha / (2.0 * s * s)).exp() / s;
3460    assert!(
3461        max_degree <= MAX_AFFINE_ANCHOR_DEGREE,
3462        "affine_anchor_moment_vector max_degree {} exceeds compile-time bound {}",
3463        max_degree,
3464        MAX_AFFINE_ANCHOR_DEGREE
3465    );
3466    let mut t = [0.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3467    fill_truncated_gaussian_moments(y_left, y_right, &mut t[..=max_degree]);
3468    // Build mu^k and s^{-k} tables once. The inner sum is the binomial
3469    // expansion of the affine change-of-variables, and computing the
3470    // binomial coefficient via Pascal's row recurrence + carrying mu/s
3471    // powers eliminates the per-(n, k) `powi` and binomial calls that
3472    // otherwise dominated the inner loop at large `max_degree`.
3473    let mut mu_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3474    for k in 1..=max_degree {
3475        mu_pow[k] = mu_pow[k - 1] * mu;
3476    }
3477    let inv_s = 1.0 / s;
3478    let mut inv_s_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3479    for k in 1..=max_degree {
3480        inv_s_pow[k] = inv_s_pow[k - 1] * inv_s;
3481    }
3482    out.fill(0.0);
3483    for n in 0..=max_degree {
3484        let mut acc = 0.0;
3485        // C(n, k+1) = C(n, k) · (n − k) / (k + 1).
3486        let mut binom = 1.0;
3487        for k in 0..=n {
3488            let term = binom * mu_pow[n - k] * inv_s_pow[k];
3489            acc = term.mul_add(t[k], acc);
3490            if k < n {
3491                binom = binom * (n - k) as f64 / (k + 1) as f64;
3492            }
3493        }
3494        out[n] = anchor * acc;
3495    }
3496}
3497
3498fn affine_value_from_moment_primitive(alpha: f64, beta: f64, left: f64, right: f64) -> f64 {
3499    // Exact formula via bivariate normal CDF.
3500    //
3501    // V(α,β,l,r) = ∫_l^r Φ(α+βz)φ(z)dz
3502    //            = P(U ≤ α+βZ, l ≤ Z ≤ r)    where U,Z iid N(0,1)
3503    //            = Φ₂(h, r; ρ) − Φ₂(h, l; ρ)
3504    //
3505    // with h = α/√(1+β²) and ρ = −β/√(1+β²).
3506    //
3507    // This is exact to floating-point precision via the high-accuracy
3508    // Drezner-Wesolowsky BVN routine, replacing the previous fixed 20-point
3509    // Gauss-Legendre numerical integration of the derivative primitive.
3510    let s = (1.0 + beta * beta).sqrt();
3511    let h = alpha / s;
3512    let rho = -beta / s;
3513    bivariate_normal_cdf_interval(h, left, right, rho).unwrap_or(0.0)
3514}
3515
3516/// Evaluate an affine cell (c2=c3=0) with a value/moment-consistent primitive.
3517///
3518/// Value and moments are now generated from the same affine moment primitive.
3519/// The zero-moment derivative is exact, and `value` is reconstructed by
3520/// integrating `d value / d alpha = INV_TWO_PI * moments[0]` over `alpha`
3521/// on a transformed semi-infinite domain.
3522pub fn evaluate_affine_cell_state(
3523    cell: DenestedCubicCell,
3524    max_degree: usize,
3525) -> Result<CellMomentState, String> {
3526    let alpha = cell.c0;
3527    let beta = cell.c1;
3528    let value = affine_value_from_moment_primitive(alpha, beta, cell.left, cell.right);
3529    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3530    Ok(CellMomentState {
3531        branch: ExactCellBranch::Affine,
3532        value,
3533        moments: moments.into(),
3534    })
3535}
3536
3537fn evaluate_affine_cell_derivative_state(
3538    cell: DenestedCubicCell,
3539    max_degree: usize,
3540) -> Result<CellDerivativeMomentState, String> {
3541    let alpha = cell.c0;
3542    let beta = cell.c1;
3543    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3544    Ok(CellDerivativeMomentState {
3545        branch: ExactCellBranch::Affine,
3546        moments: moments.into(),
3547    })
3548}
3549
/// Add `mw * z^k` to `moments[k]` for every slot `k` of `moments`, using one
/// fused multiply-add per slot and a running power of `z` (1, z, z·z, ...).
///
/// The "unrolled4" suffix is historical: this is a plain scalar accumulator,
/// called once per SIMD lane by the quadrature sweep. Moment counts are small
/// (max_degree + 1 <= ~10), so explicit 4-way unrolling did not measurably
/// beat the iterator form; the wide::f64x4::exp savings in the SIMD outer
/// loop dominate the kernel's runtime.
#[inline]
fn accumulate_moments_unrolled4(moments: &mut [f64], mw: f64, z: f64) {
    // Lazily yields 1, z, z*z, ... by repeated multiplication with `z`.
    let powers_of_z = std::iter::successors(Some(1.0_f64), |&power| Some(power * z));
    for (slot, power) in moments.iter_mut().zip(powers_of_z) {
        *slot = mw.mul_add(power, *slot);
    }
}
3564
// Shared SIMD Gauss-Legendre core for non-affine cells: evaluates one rule
// (`gl_nodes`/`gl_weights` on [-1, 1], mapped to the cell through
// `z = center + half_width·node`) and returns `(moments, value)`.
//
// * `moments[k] = half_width · Σ w_i · z_i^k · exp(-½(z_i² + η(z_i)²))` for
//   `k = 0..=max_degree`, with `η(z) = c0 + c1·z + c2·z² + c3·z³`.
// * `value = half_width · Σ w_i · exp(-½z_i²) · Φ(η(z_i))` when
//   `COMPUTE_VALUE` is true, and `0.0` otherwise. No `1/√(2π)` normalization
//   is applied inside this function.
//
// The const generic `COMPUTE_VALUE` selects whether the cell value integral
// `∫ Φ(η(z)) · exp(-½z²) dz` is accumulated alongside the moments.
// Monomorphization collapses the const-generic branches at compile time, so
// `COMPUTE_VALUE = false` emits the moment-only path verbatim.
//
// Panics if `gl_nodes` and `gl_weights` differ in length.
//
// Single source of truth for the moment SIMD lane ordering, the Horner-with-FMA
// pattern for η(z), the `0.5 * (z² + η²)` quadratic-form evaluation order, the
// unscaled per-node GL moment weights, the post-loop half-width fold, and the
// per-lane `accumulate_moments_unrolled4` call. The previous duplicated code paths
// drifted by 1 ULP whenever any of these details diverged; here both paths
// share the same instructions, eliminating an entire class of regressions
// where a tweak to the quadrature order or the FMA pattern would silently
// re-introduce divergence between the value- and derivative-only callers.
//
// Gauss-Legendre on [left, right] converges geometrically for the analytic
// integrand exp(-q(z)) with quartic/sextic q on a bounded cell; the prior
// adaptive transport path expanded basis_moments via the forward 3-/5-step
// recurrences in reduce_quartic/sextic_moments, which amplify roundoff by
// (1/lead)^n with lead = 2c2²/3c3² and overflow to NaN for small c2/c3 cells
// that arise naturally in production.
//
// The fixed 384-node rule that replaced the transport path is accurate but
// pays ~384 exp evaluations per cell unconditionally. Production cells are
// narrow spline-knot subdivisions where a 12- or 24-node rule is already
// converged to machine precision, and the flex marginal-slope row calculus
// evaluates O(100) such cells per row across n=10⁵–10⁶ rows per criterion
// evaluation — the fixed rule was the dominant cost of the whole fit (#979).
// `evaluate_non_affine_cell_simd` therefore walks a progressive ladder of
// rules (12, 24, 48, 96, 192, 384 nodes) and returns as soon as two
// consecutive rules agree to `NON_AFFINE_LADDER_RTOL` relative to the moment
// vector's own scale. Unlike the old fixed rule — whose error was real but
// uncertified — every accepted ladder result carries an embedded two-rule
// agreement certificate; a cell that never certifies falls through to the
// same 384-node answer the fixed rule produced.
//
// SIMD path: process 4 GL nodes per outer iteration, batching the four
// per-node `exp(-q)` evaluations of the moment weights into a single 4-wide
// `wide::f64x4::exp` invocation. All ladder rule sizes are divisible by 4, so
// the scalar tail loop after the SIMD sweep only runs for a rule whose node
// count is not a multiple of 4. The inner moment accumulation is run scalar
// per-lane through `accumulate_moments_unrolled4` (a plain scalar
// accumulator despite its historical name).
#[inline(always)]
fn evaluate_non_affine_cell_with_rule<const COMPUTE_VALUE: bool>(
    cell: DenestedCubicCell,
    max_degree: usize,
    gl_nodes: &[f64],
    gl_weights: &[f64],
) -> (CellMomentVec, f64) {
    let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
    let mut value_integral = 0.0_f64;
    // Affine map from the reference interval [-1, 1] onto [left, right].
    let center = 0.5 * (cell.left + cell.right);
    let half_width = 0.5 * (cell.right - cell.left);
    let c0 = cell.c0;
    let c1 = cell.c1;
    let c2 = cell.c2;
    let c3 = cell.c3;
    let moments_slice: &mut [f64] = &mut moments;
    assert_eq!(gl_nodes.len(), gl_weights.len());
    use wide::f64x4;
    let center_v = f64x4::splat(center);
    let half_width_v = f64x4::splat(half_width);
    let c0_v = f64x4::splat(c0);
    let c1_v = f64x4::splat(c1);
    let c2_v = f64x4::splat(c2);
    let c3_v = f64x4::splat(c3);
    let neg_half_v = f64x4::splat(-0.5);
    let n_total = gl_nodes.len();
    // Largest multiple of 4 not exceeding the node count; the remainder is
    // handled by the scalar tail loop.
    let n_simd = n_total - (n_total % 4);
    let mut i = 0;
    while i < n_simd {
        let node_v = f64x4::from([
            gl_nodes[i],
            gl_nodes[i + 1],
            gl_nodes[i + 2],
            gl_nodes[i + 3],
        ]);
        let weight_v = f64x4::from([
            gl_weights[i],
            gl_weights[i + 1],
            gl_weights[i + 2],
            gl_weights[i + 3],
        ]);
        // Fused node map; feeds the moments only (the value path below
        // recomputes a non-fused `z_ref`).
        let z_v = half_width_v.mul_add(node_v, center_v);
        // Horner: ((c3*z + c2)*z + c1)*z + c0
        let eta_v = c3_v
            .mul_add(z_v, c2_v)
            .mul_add(z_v, c1_v)
            .mul_add(z_v, c0_v);
        let z2_v = z_v * z_v;
        // -q(z) = -½(z² + η²), exponentiated 4 lanes at a time.
        let neg_q_v = neg_half_v * (z2_v + eta_v * eta_v);
        let exp_negq_v = neg_q_v.exp();
        let moment_weight_v = weight_v * exp_negq_v;
        let z_arr = z_v.to_array();
        let mw_arr = moment_weight_v.to_array();
        if COMPUTE_VALUE {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
                // The value integrand carries Φ(η)'s erfc, whose systematic
                // per-z error is ~1e-13. To honor the cell-value accuracy
                // contract the value term must be assembled bit-for-bit like
                // the scalar reference: a non-fused node map
                // `z_ref = center + half_width·node`, the expanded
                // `η = c0 + c1·z + c2·z² + c3·z³` (NOT the SIMD Horner-FMA used
                // for the moments), the unscaled GL weight, a scalar `exp(-½z²)`,
                // and a plain `+=`. The SIMD `z_v`/`eta_v` above (fused) feed
                // ONLY the moments and are left untouched. Any single ULP slip
                // here (FMA node map, Horner η, per-term half_width, SIMD exp,
                // FMA accumulation) drifts the 384-node sum by ~1.4e-13 and
                // breaks the contract.
                let node = gl_nodes[i + lane];
                let weight = gl_weights[i + lane];
                let z_ref = center + half_width * node;
                let eta_ref = c0 + c1 * z_ref + c2 * z_ref * z_ref + c3 * z_ref * z_ref * z_ref;
                value_integral += weight * (-0.5 * z_ref * z_ref).exp() * normal_cdf(eta_ref);
            }
        } else {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
            }
        }
        i += 4;
    }
    // Scalar tail for node counts that are not a multiple of 4.
    while i < n_total {
        let node = gl_nodes[i];
        let weight = gl_weights[i];
        let z = center + half_width * node;
        let eta = c3.mul_add(z, c2).mul_add(z, c1).mul_add(z, c0);
        let q = 0.5 * (z * z + eta * eta);
        let moment_weight = weight * (-q).exp();
        accumulate_moments_unrolled4(moments_slice, moment_weight, z);
        if COMPUTE_VALUE {
            // Bit-for-bit the reference value structure (see SIMD branch): the
            // node map `z = center + half_width·node` here already matches the
            // reference (non-fused), but η must use the expanded reference form
            // rather than the moment path's Horner-FMA.
            let eta_ref = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
            value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta_ref);
        }
        i += 1;
    }
    // Apply the cell half-width to both moment and value integrals ONCE at the
    // end, mirroring the prefold reference. Folding half_width per-term changes
    // f64 rounding enough to show up at the 1e-13 contract.
    for moment in moments_slice.iter_mut() {
        *moment *= half_width;
    }
    // Without COMPUTE_VALUE the accumulator was never touched and stays 0.0.
    let value = if COMPUTE_VALUE {
        value_integral * half_width
    } else {
        value_integral
    };
    (moments, value)
}
3723
/// Relative agreement threshold for the progressive non-affine quadrature
/// ladder: two consecutive Gauss-Legendre rules must agree on every moment
/// slot to this tolerance, relative to the moment vector's own max
/// magnitude, before the finer rule's result is accepted. Gauss-Legendre
/// error decays geometrically in the node count for the analytic integrand
/// `exp(-q(z))`, so agreement between an n-node and a 2n-node rule certifies
/// that both are converged: the coarse rule's true error is bounded by the
/// observed difference plus the (much smaller) fine-rule error.
///
/// History (#979): a roundoff-floor relaxation of this test (accept when
/// successive rungs agree to `≈ n·ε·scale` rather than the bare `3e-15`) was
/// tried to let smooth cells certify below the terminal 384-node rung. It was
/// reverted: the value-bearing path carries `∫ φ(z)·Φ(η(z)) dz`, and `Φ`'s
/// `erfc` implementation has a *systematic per-z* error of order `1e-13` that
/// each rung's node set samples differently. Only the exact 384-node rule
/// reproduces the reference's erfc-noise realization, so any sub-384 rung
/// drifts from the 384 value by `≈ 1e-13` — a drift that is NOT truncation,
/// does NOT shrink with rung, and is NOT bounded by rung-to-rung agreement.
/// The moment ladder remains independent of the value integral so value- and
/// derivative-only evaluators keep returning bit-identical moments. The scalar
/// value now evaluates on the terminal 384-node rule directly, preserving the
/// `non_affine_cell_state_matches_prefold_reference_to_1e_minus_13` value
/// contract without forcing every derivative-moment caller to use the terminal
/// rung.
///
/// NOTE(review): the history paragraph quotes a bare threshold of `3e-15`,
/// while the constant below is `1e-15`. Confirm which figure is current and
/// align the text with it.
const NON_AFFINE_LADDER_RTOL: f64 = 1e-15;
3750
/// Node counts of the progressive ladder below the 384-node terminal rung,
/// in the ascending order in which `evaluate_non_affine_cell_simd` walks them.
/// All divisible by 4 so the SIMD sweep needs no scalar tail. The position of
/// a rung in this array is also its slot in `NON_AFFINE_LADDER_CERT_COUNTS`.
const NON_AFFINE_LADDER_RUNGS: [usize; 5] = [12, 24, 48, 96, 192];
3754
3755/// Runtime-generated Gauss-Legendre rules for the ladder rungs, computed
3756/// once per process by Newton iteration on the Legendre polynomial roots
3757/// (standard `gauleg`: cosine initial guess, 3-4 Newton steps to machine
3758/// precision). The terminal 384-node rung reuses the compile-time
3759/// `GL_NODES`/`GL_WEIGHTS` tables, which also remain the single source for
3760/// the GPU kernel.
3761fn non_affine_ladder_rules() -> &'static [(Vec<f64>, Vec<f64>)] {
3762    static RULES: std::sync::OnceLock<Vec<(Vec<f64>, Vec<f64>)>> = std::sync::OnceLock::new();
3763    RULES.get_or_init(|| {
3764        NON_AFFINE_LADDER_RUNGS
3765            .iter()
3766            .map(|&n| gauss_legendre_rule(n))
3767            .collect()
3768    })
3769}
3770
/// Nodes and weights of the `n`-point Gauss-Legendre rule on `[-1, 1]`.
///
/// Each of the first `ceil(n / 2)` roots is found by Newton iteration on
/// `P_n`, starting from the cosine guess `cos(π(i + 0.75)/(n + 0.5))` and
/// stopping once a step moves the iterate by at most `f64::EPSILON` (or after
/// 100 steps). The weight is `w_i = 2 / ((1 - x_i²) P_n'(x_i)²)`. Every root
/// `x` is stored as `-x` at index `i` and `+x` at the mirrored index
/// `n - 1 - i`, so nodes come out in ascending order and the rule is exactly
/// antisymmetric about 0. For odd `n` the two indices coincide on the middle
/// node and the `+x` write wins.
///
/// `n = 0` yields two empty vectors.
fn gauss_legendre_rule(n: usize) -> (Vec<f64>, Vec<f64>) {
    // Three-term Legendre recurrence; returns (P_n(x), P_{n-1}(x)).
    let legendre_pair = |x: f64| -> (f64, f64) {
        let mut current = 1.0_f64;
        let mut previous = 0.0_f64;
        for j in 1..=n {
            let older = previous;
            previous = current;
            current = ((2 * j - 1) as f64 * x * previous - (j - 1) as f64 * older) / j as f64;
        }
        (current, previous)
    };

    let mut nodes = vec![0.0_f64; n];
    let mut weights = vec![0.0_f64; n];
    for i in 0..n.div_ceil(2) {
        let mirror = n - 1 - i;
        let mut root = (std::f64::consts::PI * (i as f64 + 0.75) / (n as f64 + 0.5)).cos();
        // P_n'(root) from the most recent Newton step; reused for the weight.
        let mut slope = 0.0_f64;
        for _ in 0..100 {
            let (p_n, p_n_minus_1) = legendre_pair(root);
            slope = n as f64 * (root * p_n - p_n_minus_1) / (root * root - 1.0);
            let before = root;
            root = before - p_n / slope;
            if (root - before).abs() <= f64::EPSILON {
                break;
            }
        }
        let weight = 2.0 / ((1.0 - root * root) * slope * slope);
        nodes[i] = -root;
        nodes[mirror] = root;
        weights[i] = weight;
        weights[mirror] = weight;
    }
    (nodes, weights)
}
3807
3808/// Two-rule agreement certificate for the progressive ladder. `true` when
3809/// every MOMENT slot agrees to `NON_AFFINE_LADDER_RTOL` relative to the fine
3810/// result's max magnitude. Non-finite results never certify, so they fall
3811/// through to the terminal 384-node rung and reproduce the fixed rule's
3812/// behavior exactly.
3813///
3814/// The decision is deliberately moment-only and independent of whether the
3815/// caller also computed the cell value: the value- and derivative-only
3816/// evaluators MUST select the same ladder rung so they accumulate the moment
3817/// vector over the same nodes and return bit-identical moments (the
3818/// `derivative_moment_evaluator_matches_value_evaluator_moments` invariant).
3819/// Value-bearing callers evaluate the scalar cell probability separately on
3820/// the terminal 384-node rule; this certificate governs only the reusable
3821/// derivative moment vector.
3822fn non_affine_ladder_converged(coarse: &CellMomentVec, fine: &CellMomentVec) -> bool {
3823    let mut scale = 0.0_f64;
3824    let mut err = 0.0_f64;
3825    for (&c, &f) in coarse.iter().zip(fine.iter()) {
3826        scale = scale.max(f.abs());
3827        err = err.max((c - f).abs());
3828    }
3829    if !(scale.is_finite() && err.is_finite()) {
3830        return false;
3831    }
3832    err <= NON_AFFINE_LADDER_RTOL * scale
3833}
3834
3835/// Per-rung certification histogram for the non-affine ladder, indexed by the
3836/// rung that certified (`NON_AFFINE_LADDER_RUNGS[i]` at index `i`), with the
3837/// final slot counting cells that fell through to the terminal 384-node rule.
3838/// Incremented once per non-affine cell evaluation; the BMS exact-cache build
3839/// logs the distribution so the ladder's real cost (early-certify win vs.
3840/// terminal-fallthrough cost) is observable on every large-scale fit rather
3841/// than assumed. `+1` length for the terminal bucket.
3842pub(crate) static NON_AFFINE_LADDER_CERT_COUNTS: [AtomicU64; NON_AFFINE_LADDER_RUNGS.len() + 1] = [
3843    AtomicU64::new(0),
3844    AtomicU64::new(0),
3845    AtomicU64::new(0),
3846    AtomicU64::new(0),
3847    AtomicU64::new(0),
3848    AtomicU64::new(0),
3849];
3850
3851/// Snapshot the ladder certification histogram as `(rung_node_count, count)`
3852/// pairs plus the terminal-fallthrough count, for logging/inspection.
3853pub fn non_affine_ladder_cert_histogram() -> (Vec<(usize, u64)>, u64) {
3854    let per_rung = NON_AFFINE_LADDER_RUNGS
3855        .iter()
3856        .enumerate()
3857        .map(|(i, &n)| (n, NON_AFFINE_LADDER_CERT_COUNTS[i].load(Ordering::Relaxed)))
3858        .collect();
3859    let terminal =
3860        NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].load(Ordering::Relaxed);
3861    (per_rung, terminal)
3862}
3863
3864/// Progressive-ladder evaluation of a non-affine cell: walk the rule ladder
3865/// from 12 nodes upward and return the first result certified by two-rule
3866/// agreement; a cell that never certifies returns the terminal 384-node
3867/// result, byte-identical to the previous fixed-rule implementation.
3868#[inline]
3869fn evaluate_non_affine_cell_simd<const COMPUTE_VALUE: bool>(
3870    cell: DenestedCubicCell,
3871    max_degree: usize,
3872) -> (CellMomentVec, f64) {
3873    let mut prev: Option<(CellMomentVec, f64)> = None;
3874    for (i, (nodes, weights)) in non_affine_ladder_rules().iter().enumerate() {
3875        let cur =
3876            evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, nodes, weights);
3877        if let Some(prev) = prev.as_ref()
3878            && non_affine_ladder_converged(&prev.0, &cur.0)
3879        {
3880            NON_AFFINE_LADDER_CERT_COUNTS[i].fetch_add(1, Ordering::Relaxed);
3881            return cur;
3882        }
3883        prev = Some(cur);
3884    }
3885    NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].fetch_add(1, Ordering::Relaxed);
3886    evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, &GL_NODES, &GL_WEIGHTS)
3887}
3888
3889/// Value-only evaluation of a non-affine cell on the terminal 384-node rule.
3890///
3891/// Returns the cell probability integral `∫ exp(-½z²)·Φ(η(z)) dz` (pre the
3892/// `1/√τ` normalization) computed bit-for-bit like the value branch of
3893/// [`evaluate_non_affine_cell_with_rule`]: the non-fused node map
3894/// `z = center + half_width·node`, the expanded (non-Horner)
3895/// `η = c0 + c1·z + c2·z² + c3·z³`, the unscaled GL weight, a scalar
3896/// `exp(-½z²)`, a plain `+=` in ascending node order, and a single trailing
3897/// `·half_width`. The terminal rule has 384 nodes (divisible by 4), so the
3898/// general kernel's value path never takes its scalar tail — this loop walks
3899/// the same nodes in the same order and therefore reproduces the reference
3900/// erfc-noise realization the `1e-13` value contract pins down.
3901///
3902/// Computing this through `evaluate_non_affine_cell_with_rule::<true>` at
3903/// `max_degree = 0` would additionally run the 4-wide SIMD `exp(-q)` moment
3904/// sweep and a moment accumulation on every node only to discard the moment
3905/// vector. The survival marginal-slope fit evaluates a value per non-affine
3906/// partition cell, so that discarded moment work is the dominant waste in the
3907/// per-cell pass; this evaluator does only the work the value needs.
3908fn evaluate_non_affine_cell_value_terminal(cell: DenestedCubicCell) -> f64 {
3909    let center = 0.5 * (cell.left + cell.right);
3910    let half_width = 0.5 * (cell.right - cell.left);
3911    let c0 = cell.c0;
3912    let c1 = cell.c1;
3913    let c2 = cell.c2;
3914    let c3 = cell.c3;
3915    let mut value_integral = 0.0_f64;
3916    for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
3917        let z = center + half_width * node;
3918        let eta = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
3919        value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
3920    }
3921    value_integral * half_width
3922}
3923
3924fn evaluate_non_affine_cell_state(
3925    cell: DenestedCubicCell,
3926    branch: ExactCellBranch,
3927    max_degree: usize,
3928) -> Result<CellMomentState, String> {
3929    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3930    let value_integral = evaluate_non_affine_cell_value_terminal(cell);
3931    // Reference structure: `value_integral * half_width / sqrt(TAU)`. The
3932    // half_width factor is already applied inside the rule evaluator, so divide
3933    // by sqrt(TAU) here (a true division, NOT multiply-by-reciprocal) to
3934    // reproduce the reference's final rounding bit-for-bit.
3935    Ok(CellMomentState {
3936        branch,
3937        value: value_integral / (std::f64::consts::TAU).sqrt(),
3938        moments,
3939    })
3940}
3941
3942fn evaluate_non_affine_cell_derivative_state(
3943    cell: DenestedCubicCell,
3944    branch: ExactCellBranch,
3945    max_degree: usize,
3946) -> Result<CellDerivativeMomentState, String> {
3947    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3948    Ok(CellDerivativeMomentState { branch, moments })
3949}
3950
3951/// De-nested cubic cell evaluator.
3952///
3953/// Affine cells use the closed-form affine anchor; non-affine cells (Quartic
3954/// and Sextic branches) are evaluated in a single pass over a fixed
3955/// high-order Gauss-Legendre rule on `[left, right]`.
3956pub fn evaluate_cell_moments(
3957    cell: DenestedCubicCell,
3958    max_degree: usize,
3959) -> Result<CellMomentState, String> {
3960    if !TAIL_CELL_MOMENT_CACHE_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
3961        return evaluate_cell_moments_uncached(cell, max_degree);
3962    }
3963    tail_cell_moment_cache().evaluate(cell, max_degree)
3964}
3965
3966/// Evaluate cell moments without consulting the global affine-tail memo.
3967///
3968/// This is retained for regression tests and before/after microbenchmarks;
3969/// production callers should use [`evaluate_cell_moments`].
3970pub fn evaluate_cell_moments_uncached(
3971    cell: DenestedCubicCell,
3972    max_degree: usize,
3973) -> Result<CellMomentState, String> {
3974    evaluate_cell_state_dispatched(
3975        cell,
3976        max_degree,
3977        evaluate_affine_cell_state,
3978        evaluate_non_affine_cell_state,
3979    )
3980}
3981
3982/// Evaluate only the moment vector needed by derivative contractions.
3983///
3984/// This deliberately does not compute the cell probability value
3985/// `∫ φ(z) Φ(η(z)) dz`. Derivative contractions consume
3986/// `∫ z^k exp(-q(z)) dz` moments only, so keeping the value out of the return
3987/// type prevents this cheaper evaluator from satisfying value-bearing calls.
3988pub fn evaluate_cell_derivative_moments_uncached(
3989    cell: DenestedCubicCell,
3990    max_degree: usize,
3991) -> Result<CellDerivativeMomentState, String> {
3992    evaluate_cell_state_dispatched(
3993        cell,
3994        max_degree,
3995        evaluate_affine_cell_derivative_state,
3996        evaluate_non_affine_cell_derivative_state,
3997    )
3998}
3999
4000/// Shared branch dispatch for the value-bearing and derivative-only cell
4001/// evaluators. Both walk the same decision tree (semi-infinite tail → must
4002/// be affine; finite cell → branch-by-coefficients with the sextic
4003/// degenerate-lowering path), differing only in which pair of
4004/// `(affine, non_affine)` evaluator helpers to delegate to.  The two helpers
4005/// are passed as `fn` pointers so the dispatch monomorphizes per `S` and
4006/// keeps the existing pre-condition errors / unreachable branch handling
4007/// in lockstep across both evaluators.
4008fn evaluate_cell_state_dispatched<S>(
4009    cell: DenestedCubicCell,
4010    max_degree: usize,
4011    affine: fn(DenestedCubicCell, usize) -> Result<S, String>,
4012    non_affine: fn(DenestedCubicCell, ExactCellBranch, usize) -> Result<S, String>,
4013) -> Result<S, String> {
4014    let left_inf = !cell.left.is_finite();
4015    let right_inf = !cell.right.is_finite();
4016    if left_inf || right_inf {
4017        // Semi-infinite tail cells must be affine: the deviation saturates
4018        // to a constant outside support, so c2=c3=0.  Both the BVN CDF
4019        // and the truncated-Gaussian moment vector handle infinite bounds.
4020        if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL
4021        {
4022            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4023                "semi-infinite cell [{}, {}] must be affine (c2=c3=0), got c2={:.3e}, c3={:.3e}",
4024                cell.left, cell.right, cell.c2, cell.c3
4025            ))
4026            .into());
4027        }
4028        return affine(cell, max_degree);
4029    }
4030    if cell.right <= cell.left {
4031        return Err(CubicCellKernelError::invalid_cell_shape(format!(
4032            "finite cell must have left < right, got [{}, {}]",
4033            cell.left, cell.right
4034        ))
4035        .into());
4036    }
4037    let branch = branch_cell(cell)?;
4038    if branch == ExactCellBranch::Affine {
4039        return affine(cell, max_degree);
4040    }
4041    if branch == ExactCellBranch::Sextic {
4042        let lead = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3)[5];
4043        if !lead.is_finite() {
4044            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4045                "sextic cell evaluation encountered non-finite leading coefficient: {lead:.3e}"
4046            ))
4047            .into());
4048        }
4049        if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
4050            return match lower_branch {
4051                ExactCellBranch::Quartic => non_affine(
4052                    DenestedCubicCell { c3: 0.0, ..cell },
4053                    ExactCellBranch::Quartic,
4054                    max_degree,
4055                ),
4056                ExactCellBranch::Affine => affine(
4057                    DenestedCubicCell {
4058                        c2: 0.0,
4059                        c3: 0.0,
4060                        ..cell
4061                    },
4062                    max_degree,
4063                ),
4064                ExactCellBranch::Sextic => Err(CubicCellKernelError::invalid_cell_shape(
4065                    "internal: degenerate_sextic_branch returned Sextic as a lowered branch",
4066                )
4067                .into()),
4068            };
4069        }
4070    }
4071    non_affine(cell, branch, max_degree)
4072}
4073
4074/// Evaluate a de-nested cubic cell through a fit-lifetime byte-limited LRU cache.
4075///
4076/// The fingerprint is an exact bit-cast of `(c0, c1, c2, c3, left, right)`, so
4077/// eviction and reuse cannot alias nearby-but-different cells.  A cached entry
4078/// computed to a higher degree may satisfy a lower-degree request by truncating
4079/// the moment vector, preserving the public [`evaluate_cell_moments`] contract.
4080pub fn evaluate_cell_moments_cached(
4081    cell: DenestedCubicCell,
4082    max_degree: usize,
4083    cache: &CellMomentLruCache,
4084    stats: Option<&CellMomentCacheStats>,
4085) -> Result<CellMomentState, String> {
4086    // Affine cells (every rigid-path cell and every tail cell) evaluate
4087    // through the closed-form anchor — cheaper than a single LRU probe. The
4088    // LRU exists only to amortize the EXPENSIVE non-affine transport across
4089    // recurring cells; at large n the row scalars `(a, b)` are unique per
4090    // row, so affine cells never recur and routing them through the sharded
4091    // mutex was pure cost (320k lock+insert+evict ops per gradient eval, ~0%
4092    // hit — the dominant cost of the rigid n=320k fit, #979). Bypass the
4093    // cache entirely for them.
4094    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4095        if let Some(stats) = stats {
4096            stats.misses.fetch_add(1, Ordering::Relaxed);
4097        }
4098        return evaluate_cell_moments_uncached(cell, max_degree);
4099    }
4100    let key = CellFingerprint::new(cell);
4101    let existing_derivative = match cache.get(&key) {
4102        Some(cached) => {
4103            if let Some(state) = cached.state_for_degree(max_degree) {
4104                if let Some(stats) = stats {
4105                    stats.hits.fetch_add(1, Ordering::Relaxed);
4106                }
4107                return Ok(state);
4108            }
4109            // `cached.derivative_state` is `Option<Arc<_>>`; `.clone()` here
4110            // is the cheap refcount bump the audit-39 fix targets, not a
4111            // full moment-vector deep clone.
4112            cached.derivative_state.clone()
4113        }
4114        None => None,
4115    };
4116    if let Some(stats) = stats {
4117        stats.misses.fetch_add(1, Ordering::Relaxed);
4118    }
4119    let state = evaluate_cell_moments(cell, max_degree)?;
4120    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4121    // through `Arc::clone`, and return the underlying value by unwrapping the
4122    // unique-reference (caller-side) `Arc`. This replaces the prior
4123    // `state.clone()` deep copy at the insert site.
4124    let shared = Arc::new(state);
4125    let mut entry = CachedCellMoments::new(Arc::clone(&shared));
4126    if let Some(derivative) = existing_derivative {
4127        entry = entry.with_derivative(derivative);
4128    }
4129    cache.insert(key, entry);
4130    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4131}
4132
4133/// Derivative-moment counterpart to [`evaluate_cell_moments_cached`]. Shares
4134/// the value-moment LRU by storing both moment kinds in a single
4135/// [`CachedCellMoments`] entry keyed on the cell fingerprint — derivative
4136/// insertions preserve any pre-existing value state and vice versa, so the
4137/// two callers never evict each other's work.
4138pub fn evaluate_cell_derivative_moments_cached(
4139    cell: DenestedCubicCell,
4140    max_degree: usize,
4141    cache: &CellMomentLruCache,
4142    stats: Option<&CellMomentCacheStats>,
4143) -> Result<CellDerivativeMomentState, String> {
4144    // Affine cells bypass the LRU — see `evaluate_cell_moments_cached` for
4145    // why the sharded-mutex memo is pure overhead on the closed-form affine
4146    // path at large n (#979).
4147    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4148        if let Some(stats) = stats {
4149            stats.misses.fetch_add(1, Ordering::Relaxed);
4150        }
4151        return evaluate_cell_derivative_moments_uncached(cell, max_degree);
4152    }
4153    let key = CellFingerprint::new(cell);
4154    let existing_value = match cache.get(&key) {
4155        Some(cached) => {
4156            if let Some(state) = cached.derivative_state_for_degree(max_degree) {
4157                if let Some(stats) = stats {
4158                    stats.hits.fetch_add(1, Ordering::Relaxed);
4159                }
4160                return Ok(state);
4161            }
4162            // `cached.state` is `Option<Arc<_>>`; `.clone()` here is the cheap
4163            // refcount bump the audit-39 fix targets, not a full moment-vector
4164            // deep clone.
4165            cached.state.clone()
4166        }
4167        None => None,
4168    };
4169    if let Some(stats) = stats {
4170        stats.misses.fetch_add(1, Ordering::Relaxed);
4171    }
4172    let state = evaluate_cell_derivative_moments_uncached(cell, max_degree)?;
4173    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4174    // through `Arc::clone`, and return the underlying value by unwrapping the
4175    // unique-reference (caller-side) `Arc`. This replaces the prior
4176    // `state.clone()` deep copy at the insert site.
4177    let shared = Arc::new(state);
4178    let mut entry = CachedCellMoments::new_derivative(Arc::clone(&shared));
4179    if let Some(value) = existing_value {
4180        entry = entry.with_value(value);
4181    }
4182    cache.insert(key, entry);
4183    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4184}
4185
4186/// Scratch-backed variant of [`evaluate_cell_moments`].
4187///
4188/// Reuses the supplied [`CellMomentScratch`] for the returned moments slice,
4189/// so repeated calls with the same scratch (and a sufficient initial capacity)
4190/// avoid per-call `Vec` allocations on the hot inner-PIRLS row-intercept
4191/// solver path. Internal transport allocations are unchanged.
4192pub fn evaluate_cell_moments_with_scratch<'a>(
4193    cell: DenestedCubicCell,
4194    max_degree: usize,
4195    scratch: &'a mut CellMomentScratch,
4196) -> Result<CellMomentStateRef<'a>, String> {
4197    let state = evaluate_cell_moments(cell, max_degree)?;
4198    let out = scratch.prepare_moments(max_degree + 1);
4199    out.copy_from_slice(&state.moments);
4200    Ok(CellMomentStateRef {
4201        branch: state.branch,
4202        value: state.value,
4203        moments: out,
4204    })
4205}
4206
4207#[cfg(test)]
4208mod tests {
4209    use super::*;
4210    use gam_math::probability::normal_pdf;
4211
4212    #[inline]
4213    pub(super) fn polynomial_value(coefficients: &[f64], z: f64) -> f64 {
4214        coefficients
4215            .iter()
4216            .rev()
4217            .fold(0.0, |acc, &coeff| acc * z + coeff)
4218    }
4219
4220    fn reset_cell_moment_test_reallocs() {
4221        super::CELL_MOMENT_REALLOCS.store(0, std::sync::atomic::Ordering::Relaxed);
4222    }
4223
4224    fn cell_moment_test_reallocs() -> usize {
4225        super::CELL_MOMENT_REALLOCS.load(std::sync::atomic::Ordering::Relaxed)
4226    }
4227
4228    fn assert_close_rel(label: &str, actual: f64, expected: f64, tol: f64) {
4229        let denom = expected.abs().max(1.0);
4230        let rel = (actual - expected).abs() / denom;
4231        assert!(
4232            rel <= tol,
4233            "{label}: actual={actual:.17e} expected={expected:.17e} rel={rel:.3e} tol={tol:.3e}"
4234        );
4235    }
4236
4237    // The link-basis cell coefficient `transformed_link_cubic(span, a, b)` is, in
4238    // each of its four output components, a polynomial of TOTAL degree exactly 3 in
4239    // (a, b):
4240    //   d0 = c0 + c1·s + c2·s² + c3·s³            (s = a − left; deg 3 in a)
4241    //   d1 = b·(c1 + 2c2·s + 3c3·s²)              (a²·b → total deg 3)
4242    //   d2 = b²·(c2 + 3c3·s)                       (a·b² → total deg 3)
4243    //   d3 = c3·b³                                 (b³  → total deg 3)
4244    // Therefore EVERY 4th-order total (a,b)-partial (∂⁴/∂aⁱ∂b^{4−i}) is identically
4245    // zero, while the 3rd-order partials (∂³/∂aⁱ∂b^{3−i}) are the highest nonzero
4246    // ones. This is the exact algebraic fact the bidirectional flex jet relies on:
4247    // a "second mixed derivative of a third-a-partial" slot, etc., demands a 4th
4248    // total (a,b)-partial and must be hard-zero — substituting a (nonzero) 3rd
4249    // partial there is a bug. This test certifies BOTH facts by central FD so the
4250    // hard-coded `0.0` fixes are provably correct and provably necessary.
4251    #[test]
4252    fn link_basis_cell_fourth_ab_partials_vanish_third_are_nonzero() {
4253        let span = LocalSpanCubic {
4254            left: -0.4,
4255            right: 1.6,
4256            c0: 0.37,
4257            c1: -0.81,
4258            c2: 0.53,
4259            c3: -0.29,
4260        };
4261        let a0 = 0.23_f64;
4262        let b0 = 0.61_f64;
4263        let h = 1e-2_f64;
4264
4265        // Generic central-difference stencils per derivative order.
4266        let stencil = |order: usize| -> &'static [(i64, f64)] {
4267            match order {
4268                0 => &[(0, 1.0)],
4269                1 => &[(-1, -0.5), (1, 0.5)],
4270                2 => &[(-1, 1.0), (0, -2.0), (1, 1.0)],
4271                3 => &[(-2, -0.5), (-1, 1.0), (1, -1.0), (2, 0.5)],
4272                4 => &[(-2, 1.0), (-1, -4.0), (0, 6.0), (1, -4.0), (2, 1.0)],
4273                _ => &[(0, 1.0)],
4274            }
4275        };
4276        // FD of component `k` of the cell coefficient: ∂^{na+nb}/∂a^{na}∂b^{nb}.
4277        let fd = |k: usize, na: usize, nb: usize| -> f64 {
4278            let mut acc = 0.0;
4279            for &(ia, wa) in stencil(na) {
4280                for &(ib, wb) in stencil(nb) {
4281                    let a = a0 + (ia as f64) * h;
4282                    let b = b0 + (ib as f64) * h;
4283                    acc += wa * wb * link_basis_cell_coefficients(span, a, b)[k];
4284                }
4285            }
4286            acc / h.powi((na + nb) as i32)
4287        };
4288
4289        let (p3_aaa, p3_aab, p3_abb, p3_bbb) = link_basis_cell_third_partials(span);
4290
4291        // (1) The analytic 3rd partials match FD (within FD truncation) — and at
4292        // least one is appreciably nonzero, so these are real signal that a wrong
4293        // slot would inject.
4294        let mut max_third = 0.0_f64;
4295        for k in 0..4 {
4296            for (label, (na, nb), analytic) in [
4297                ("aaa", (3usize, 0usize), p3_aaa[k]),
4298                ("aab", (2, 1), p3_aab[k]),
4299                ("abb", (1, 2), p3_abb[k]),
4300                ("bbb", (0, 3), p3_bbb[k]),
4301            ] {
4302                let got = fd(k, na, nb);
4303                assert!(
4304                    (got - analytic).abs() <= 1e-4 + 1e-3 * analytic.abs(),
4305                    "3rd partial {label}[{k}] analytic {analytic:+.6e} vs FD {got:+.6e}"
4306                );
4307                max_third = max_third.max(analytic.abs());
4308            }
4309        }
4310        assert!(
4311            max_third > 1e-1,
4312            "expected an appreciable nonzero 3rd (a,b)-partial; max |analytic| = {max_third:.3e}"
4313        );
4314
4315        // (2) EVERY 4th-order total (a,b)-partial vanishes (degree-3 polynomial),
4316        // certifying that the hard-coded `0.0` in the bidirectional d12 slots is the
4317        // mathematically required value, not an approximation.
4318        for k in 0..4 {
4319            for (na, nb) in [(4usize, 0usize), (3, 1), (2, 2), (1, 3), (0, 4)] {
4320                let got = fd(k, na, nb);
4321                assert!(
4322                    got.abs() <= 1e-2,
4323                    "4th (a,b)-partial ∂^{na}_a∂^{nb}_b of cell coeff[{k}] must vanish, FD = {got:+.6e}"
4324                );
4325            }
4326        }
4327    }
4328
4329    #[test]
4330    fn non_affine_cell_state_grid_matches_public_cell_moments_reference() {
4331        let cells = [
4332            DenestedCubicCell {
4333                left: -1.25,
4334                right: -0.2,
4335                c0: -0.35,
4336                c1: 0.85,
4337                c2: 0.04,
4338                c3: -0.015,
4339            },
4340            DenestedCubicCell {
4341                left: -0.2,
4342                right: 0.55,
4343                c0: 0.12,
4344                c1: -0.65,
4345                c2: -0.025,
4346                c3: 0.02,
4347            },
4348            DenestedCubicCell {
4349                left: 0.55,
4350                right: 1.6,
4351                c0: 0.42,
4352                c1: 0.35,
4353                c2: 0.018,
4354                c3: 0.012,
4355            },
4356        ];
4357        for cell in cells {
4358            let branch = branch_cell(cell).expect("branch");
4359            assert_ne!(branch, ExactCellBranch::Affine);
4360            for max_degree in [0usize, 2, 4, 9, 16] {
4361                let direct = evaluate_non_affine_cell_state(cell, branch, max_degree)
4362                    .expect("direct non-affine transport");
4363                let public = evaluate_cell_moments(cell, max_degree).expect("public evaluator");
4364                assert_eq!(direct.branch, public.branch);
4365                assert_eq!(direct.moments.len(), public.moments.len());
4366                let value_scale = direct.value.abs().max(public.value.abs()).max(1.0);
4367                assert!(
4368                    (direct.value - public.value).abs() <= 1e-10 * value_scale,
4369                    "value mismatch for {cell:?} degree {max_degree}: direct={} public={}",
4370                    direct.value,
4371                    public.value
4372                );
4373                for (degree, (lhs, rhs)) in
4374                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4375                {
4376                    let scale = lhs.abs().max(rhs.abs()).max(1.0);
4377                    assert!(
4378                        (lhs - rhs).abs() <= 1e-10 * scale,
4379                        "moment {degree} mismatch for {cell:?} degree {max_degree}: {lhs} vs {rhs}"
4380                    );
4381                }
4382            }
4383        }
4384    }
4385
4386    #[test]
4387    fn affine_tail_cell_memo_matches_uncached_grid_and_records_hits() {
4388        // Use a dedicated local cache so the test's hit/miss/entry counters
4389        // are not perturbed by concurrent tests that drive the shared
4390        // global memo through `evaluate_cell_moments`. Asserting on the
4391        // global counters made this test race-flaky when the suite ran in
4392        // parallel.
4393        let cache = TailCellMomentCache::new();
4394        let c0s = [-2.0, -0.25, 0.0, 1.5];
4395        let c1s = [-1.2, -0.05, 0.0, 0.8];
4396        let endpoints = [-4.0, -1.0, 0.0, 2.5, 6.0];
4397        let degrees = [0_usize, 4, 9, 16, 24];
4398
4399        for &c0 in &c0s {
4400            for &c1 in &c1s {
4401                for &endpoint in &endpoints {
4402                    for &max_degree in &degrees {
4403                        for &(left, right) in
4404                            &[(f64::NEG_INFINITY, endpoint), (endpoint, f64::INFINITY)]
4405                        {
4406                            let cell = DenestedCubicCell {
4407                                left,
4408                                right,
4409                                c0,
4410                                c1,
4411                                c2: 0.0,
4412                                c3: 0.0,
4413                            };
4414                            let expected = evaluate_cell_moments_uncached(cell, max_degree)
4415                                .expect("uncached affine tail moments");
4416                            let actual = cache
4417                                .evaluate(cell, max_degree)
4418                                .expect("cached affine tail moments miss");
4419                            let repeat = cache
4420                                .evaluate(cell, max_degree)
4421                                .expect("cached affine tail moments hit");
4422                            assert_eq!(actual.branch, expected.branch);
4423                            assert_eq!(repeat.branch, expected.branch);
4424                            assert_close_rel(
4425                                "tail value miss",
4426                                actual.value,
4427                                expected.value,
4428                                1e-14,
4429                            );
4430                            assert_close_rel("tail value hit", repeat.value, expected.value, 1e-14);
4431                            assert_eq!(actual.moments.len(), expected.moments.len());
4432                            assert_eq!(repeat.moments.len(), expected.moments.len());
4433                            for (idx, ((a, r), e)) in actual
4434                                .moments
4435                                .iter()
4436                                .zip(repeat.moments.iter())
4437                                .zip(expected.moments.iter())
4438                                .enumerate()
4439                            {
4440                                assert_close_rel(
4441                                    &format!("tail moment miss[{idx}]"),
4442                                    *a,
4443                                    *e,
4444                                    1e-14,
4445                                );
4446                                assert_close_rel(&format!("tail moment hit[{idx}]"), *r, *e, 1e-14);
4447                            }
4448                        }
4449                    }
4450                }
4451            }
4452        }
4453
4454        let stats = cache.stats();
4455        assert_eq!(stats.misses, stats.entries);
4456        assert!(
4457            stats.hits >= stats.misses,
4458            "expected repeat hits: {stats:?}"
4459        );
4460        assert!(
4461            stats.hit_rate() >= 0.5,
4462            "unexpected low hit rate: {stats:?}"
4463        );
4464    }
4465
4466    fn reference_bivariate_normal_cdf_20(h: f64, k: f64, rho: f64) -> f64 {
4467        if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
4468            return 0.0;
4469        }
4470        if h == f64::INFINITY {
4471            return normal_cdf(k);
4472        }
4473        if k == f64::INFINITY {
4474            return normal_cdf(h);
4475        }
4476        let rho_clamped = rho.clamp(-1.0, 1.0);
4477        if rho_clamped >= 1.0 - 1e-12 {
4478            return normal_cdf(h.min(k));
4479        }
4480        if rho_clamped <= -1.0 + 1e-12 {
4481            return (normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0);
4482        }
4483
4484        let hs = 0.5 * (h * h + k * k);
4485        let asr = rho_clamped.asin();
4486        let mut sum = 0.0;
4487        for (&node, &weight) in GL20_NODES.iter().zip(GL20_WEIGHTS.iter()) {
4488            let sn = (0.5 * asr * (node + 1.0)).sin();
4489            let one_minus = 1.0 - sn * sn;
4490            let expo = ((sn * h * k) - hs) / one_minus;
4491            sum += weight * expo.exp();
4492        }
4493        (normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0)
4494    }
4495
4496    #[test]
4497    fn non_affine_cell_state_reference_grid_matches_public_moments() {
4498        let c0s = [-0.4, 0.0, 0.35];
4499        let c1s = [-0.8, 0.25, 1.1];
4500        let c2s = [-0.12, 0.08];
4501        let c3s = [-0.04, 0.03];
4502        let intervals = [(-1.25, -0.2), (-0.5, 0.75), (0.1, 1.4)];
4503        let degrees = [3usize, 6, 9, 12];
4504
4505        for &c0 in &c0s {
4506            for &c1 in &c1s {
4507                for &c2 in &c2s {
4508                    for &c3 in &c3s {
4509                        for &(left, right) in &intervals {
4510                            let cell = DenestedCubicCell {
4511                                left,
4512                                right,
4513                                c0,
4514                                c1,
4515                                c2,
4516                                c3,
4517                            };
4518                            let branch = branch_cell(cell).expect("branch");
4519                            assert_ne!(branch, ExactCellBranch::Affine);
4520                            for &degree in &degrees {
4521                                let direct = evaluate_non_affine_cell_state(cell, branch, degree)
4522                                    .expect("direct non-affine state");
4523                                let public = evaluate_cell_moments(cell, degree)
4524                                    .expect("public non-affine state");
4525                                assert_eq!(direct.branch, public.branch);
4526                                let value_scale =
4527                                    direct.value.abs().max(public.value.abs()).max(1.0);
4528                                assert!(
4529                                    (direct.value - public.value).abs() / value_scale <= 1.0e-15,
4530                                    "value mismatch for {cell:?}, degree {degree}: direct={:.17e}, public={:.17e}",
4531                                    direct.value,
4532                                    public.value
4533                                );
4534                                assert_eq!(direct.moments.len(), public.moments.len());
4535                                for (idx, (&a, &b)) in
4536                                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4537                                {
4538                                    let scale = a.abs().max(b.abs()).max(1.0);
4539                                    assert!(
4540                                        (a - b).abs() / scale <= 1.0e-15,
4541                                        "moment {idx} mismatch for {cell:?}, degree {degree}: direct={a:.17e}, public={b:.17e}"
4542                                    );
4543                                }
4544                            }
4545                        }
4546                    }
4547                }
4548            }
4549        }
4550    }
4551
4552    #[test]
4553    fn bivariate_normal_cdf_matches_reference_grid_to_1e_minus_10() {
4554        let hs = [-8.0, -5.0, -3.0, -1.5, -0.5, 0.0, 0.25, 1.0, 2.5, 5.0, 8.0];
4555        let ks = [-8.0, -4.0, -2.0, -0.75, 0.0, 0.4, 1.25, 3.0, 6.0, 8.0];
4556        let rhos = [
4557            -0.999_999_999_999,
4558            -0.999,
4559            -0.95,
4560            -0.7,
4561            -0.3,
4562            -1.0e-12,
4563            0.0,
4564            1.0e-12,
4565            0.3,
4566            0.7,
4567            0.95,
4568            0.999,
4569            0.999_999_999_999,
4570        ];
4571        for &h in &hs {
4572            for &k in &ks {
4573                for &rho in &rhos {
4574                    let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4575                    let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4576                    let scale = expected.abs().max(1.0e-300);
4577                    let rel = (actual - expected).abs() / scale;
4578                    assert!(
4579                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4580                        "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4581                    );
4582                }
4583            }
4584        }
4585    }
4586
4587    #[test]
4588    fn bivariate_normal_cdf_matches_reference_lcg_property_samples() {
4589        let mut seed = 0x5eed_cafe_f00d_u64;
4590        let mut next_unit = || {
4591            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
4592            ((seed >> 11) as f64) * (1.0 / ((1_u64 << 53) as f64))
4593        };
4594        for _ in 0..4096 {
4595            let h = -8.0 + 16.0 * next_unit();
4596            let k = -8.0 + 16.0 * next_unit();
4597            let rho = -0.999 + 1.998 * next_unit();
4598            let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4599            let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4600            let scale = expected.abs().max(1.0e-300);
4601            let rel = (actual - expected).abs() / scale;
4602            assert!(
4603                rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4604                "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4605            );
4606        }
4607    }
4608
4609    #[test]
4610    fn affine_bvn_interval_primitive_matches_two_cdf_difference() {
4611        let hs = [-6.0, -2.0, -0.25, 0.0, 0.8, 3.0, 6.0];
4612        let bounds = [
4613            (-5.0, -2.0),
4614            (-3.0, -0.1),
4615            (-1.0, 0.0),
4616            (-0.25, 0.75),
4617            (0.2, 3.5),
4618            (2.0, 7.0),
4619        ];
4620        let rhos = [-0.98, -0.8, -0.25, 0.0, 0.25, 0.8, 0.98];
4621        for &h in &hs {
4622            for &(left, right) in &bounds {
4623                for &rho in &rhos {
4624                    let actual =
4625                        bivariate_normal_cdf_interval(h, left, right, rho).expect("interval");
4626                    let expected = (reference_bivariate_normal_cdf_20(h, right, rho)
4627                        - reference_bivariate_normal_cdf_20(h, left, rho))
4628                    .clamp(0.0, 1.0);
4629                    let scale = expected.abs().max(1.0e-300);
4630                    let rel = (actual - expected).abs() / scale;
4631                    assert!(
4632                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-12,
4633                        "h={h} left={left} right={right} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4634                    );
4635                }
4636            }
4637        }
4638    }
4639
4640    fn simpson_integral<F>(left: f64, right: f64, steps: usize, f: F) -> f64
4641    where
4642        F: Fn(f64) -> f64,
4643    {
4644        let n = if steps.is_multiple_of(2) {
4645            steps
4646        } else {
4647            steps + 1
4648        };
4649        let h = (right - left) / n as f64;
4650        let mut acc = f(left) + f(right);
4651        for k in 1..n {
4652            let x = left + h * k as f64;
4653            let w = if k % 2 == 0 { 2.0 } else { 4.0 };
4654            acc += w * f(x);
4655        }
4656        acc * h / 3.0
4657    }
4658
4659    #[test]
4660    fn global_transform_preserves_local_span_polynomial() {
4661        let span = LocalSpanCubic {
4662            left: -1.2,
4663            right: 0.8,
4664            c0: 0.3,
4665            c1: -0.25,
4666            c2: 0.11,
4667            c3: -0.04,
4668        };
4669        let (g0, g1, g2, g3) = global_cubic_from_local(span);
4670        for &x in &[-1.2, -0.7, -0.1, 0.4, 0.8] {
4671            let local = span.evaluate(x);
4672            let global = g0 + g1 * x + g2 * x * x + g3 * x * x * x;
4673            assert!((local - global).abs() < 1e-12);
4674        }
4675    }
4676
4677    #[test]
4678    fn bivariate_normal_cdf_independent_factorizes() {
4679        let h = -0.35;
4680        let k = 0.8;
4681        let out = bivariate_normal_cdf(h, k, 0.0).expect("bvn");
4682        let target = normal_cdf(h) * normal_cdf(k);
4683        assert!((out - target).abs() < 1e-12);
4684    }
4685
4686    #[test]
4687    fn evaluate_affine_cell_state_matches_numeric_integrals() {
4688        let cell = DenestedCubicCell {
4689            left: -0.9,
4690            right: 0.8,
4691            c0: 0.15,
4692            c1: -0.35,
4693            c2: 0.0,
4694            c3: 0.0,
4695        };
4696        let state = evaluate_affine_cell_state(cell, 6).expect("affine cell");
4697        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
4698            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
4699        });
4700        assert_eq!(state.branch, ExactCellBranch::Affine);
4701        assert!((state.value - value_numeric).abs() < 1e-9);
4702        for degree in 0..=6 {
4703            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
4704                z.powi(degree as i32) * (-cell.q(z)).exp()
4705            });
4706            assert!((state.moments[degree] - target).abs() < 1e-9);
4707        }
4708    }
4709
4710    #[test]
4711    fn affine_cell_value_matches_zero_moment_derivative() {
4712        let cell = DenestedCubicCell {
4713            left: -1.1,
4714            right: 0.7,
4715            c0: 0.23,
4716            c1: -0.41,
4717            c2: 0.0,
4718            c3: 0.0,
4719        };
4720        let h = 1e-6;
4721        let plus = evaluate_affine_cell_state(
4722            DenestedCubicCell {
4723                c0: cell.c0 + h,
4724                ..cell
4725            },
4726            0,
4727        )
4728        .expect("affine plus");
4729        let minus = evaluate_affine_cell_state(
4730            DenestedCubicCell {
4731                c0: cell.c0 - h,
4732                ..cell
4733            },
4734            0,
4735        )
4736        .expect("affine minus");
4737        let center = evaluate_affine_cell_state(cell, 0).expect("affine center");
4738        let d_value = (plus.value - minus.value) / (2.0 * h);
4739        let target = INV_TWO_PI * center.moments[0];
4740        assert!((d_value - target).abs() < 1e-8);
4741    }
4742
4743    #[test]
4744    fn coefficient_partials_match_exact_span_derivatives() {
4745        let score_span = LocalSpanCubic {
4746            left: -0.75,
4747            right: 0.25,
4748            c0: 0.08,
4749            c1: -0.03,
4750            c2: 0.02,
4751            c3: -0.01,
4752        };
4753        let link_span = LocalSpanCubic {
4754            left: -0.6,
4755            right: 0.9,
4756            c0: -0.05,
4757            c1: 0.04,
4758            c2: -0.02,
4759            c3: 0.015,
4760        };
4761        let a = 0.3;
4762        let b = -0.7;
4763        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
4764        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4765            let u = a + b * z;
4766            let eta_a = 1.0 + link_span.first_derivative(u);
4767            let eta_b = z + score_span.evaluate(z) + z * link_span.first_derivative(u);
4768            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4769            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4770        }
4771    }
4772
4773    #[test]
4774    fn second_coefficient_partials_match_exact_span_derivatives() {
4775        let score_span = LocalSpanCubic {
4776            left: -0.75,
4777            right: 0.25,
4778            c0: 0.08,
4779            c1: -0.03,
4780            c2: 0.02,
4781            c3: -0.01,
4782        };
4783        let link_span = LocalSpanCubic {
4784            left: -0.6,
4785            right: 0.9,
4786            c0: -0.05,
4787            c1: 0.04,
4788            c2: -0.02,
4789            c3: 0.015,
4790        };
4791        let a = 0.3;
4792        let b = -0.7;
4793        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
4794        let dc_daa = second_partials.0;
4795        let dc_dab = second_partials.1;
4796        let dc_dbb = second_partials.2;
4797        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4798            let u = a + b * z;
4799            let eta_aa = link_span.second_derivative(u);
4800            let eta_ab = z * link_span.second_derivative(u);
4801            let eta_bb = z * z * link_span.second_derivative(u);
4802            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4803            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4804            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4805        }
4806    }
4807
4808    #[test]
4809    fn higher_derivative_moment_helpers_reject_empty_first_coefficients() {
4810        let cell = DenestedCubicCell {
4811            left: -1.0,
4812            right: 1.0,
4813            c0: 0.0,
4814            c1: 1.0,
4815            c2: 0.0,
4816            c3: 0.0,
4817        };
4818        let moments = [1.0; 16];
4819
4820        let third_err = cell_third_derivative_from_moments(
4821            cell,
4822            &[],
4823            &[1.0],
4824            &[1.0],
4825            &[],
4826            &[],
4827            &[],
4828            &[],
4829            &moments,
4830        )
4831        .expect_err("empty first coefficients should be rejected");
4832        assert!(third_err.contains("r first-derivative coefficients must be non-empty"));
4833
4834        let fourth_err = cell_fourth_derivative_from_moments(
4835            cell,
4836            &[1.0],
4837            &[],
4838            &[1.0],
4839            &[1.0],
4840            &[],
4841            &[],
4842            &[],
4843            &[],
4844            &[],
4845            &[],
4846            &[],
4847            &[],
4848            &[],
4849            &[],
4850            &[],
4851            &moments,
4852        )
4853        .expect_err("empty first coefficients should be rejected");
4854        assert!(fourth_err.contains("s first-derivative coefficients must be non-empty"));
4855    }
4856
4857    #[test]
4858    fn fourth_derivative_rejects_overlong_scratch_convolutions() {
4859        let cell = DenestedCubicCell {
4860            left: -1.0,
4861            right: 1.0,
4862            c0: 0.0,
4863            c1: 1.0,
4864            c2: 0.0,
4865            c3: 0.0,
4866        };
4867        let long_first = [1.0; 10];
4868        let zero = [0.0; 1];
4869        let moments = [1.0; 64];
4870
4871        let err = cell_fourth_derivative_from_moments(
4872            cell,
4873            &long_first,
4874            &long_first,
4875            &long_first,
4876            &long_first,
4877            &zero,
4878            &zero,
4879            &zero,
4880            &zero,
4881            &zero,
4882            &zero,
4883            &zero,
4884            &zero,
4885            &zero,
4886            &zero,
4887            &zero,
4888            &moments,
4889        )
4890        .expect_err("oversized convolution should be rejected before writing scratch");
4891        assert!(err.contains("fourth derivative polynomial convolution scratch too small"));
4892    }
4893
4894    #[test]
4895    fn score_and_link_basis_cell_coefficients_match_direct_construction() {
4896        let score_basis_span = LocalSpanCubic {
4897            left: -0.7,
4898            right: 0.4,
4899            c0: 0.2,
4900            c1: -0.04,
4901            c2: 0.03,
4902            c3: -0.01,
4903        };
4904        let link_basis_span = LocalSpanCubic {
4905            left: -0.5,
4906            right: 1.1,
4907            c0: -0.03,
4908            c1: 0.05,
4909            c2: -0.02,
4910            c3: 0.01,
4911        };
4912        let a = 0.25;
4913        let b = -0.8;
4914        let score_coeffs = score_basis_cell_coefficients(score_basis_span, b);
4915        let link_coeffs = link_basis_cell_coefficients(link_basis_span, a, b);
4916        for &z in &[-0.7, -0.1, 0.2, 0.4] {
4917            let score_poly = polynomial_value(&score_coeffs, z);
4918            let link_poly = polynomial_value(&link_coeffs, z);
4919            assert!((score_poly - b * score_basis_span.evaluate(z)).abs() < 1e-12);
4920            assert!((link_poly - link_basis_span.evaluate(a + b * z)).abs() < 1e-12);
4921        }
4922    }
4923
4924    #[test]
4925    fn link_basis_partials_match_exact_span_derivatives() {
4926        let link_basis_span = LocalSpanCubic {
4927            left: -0.5,
4928            right: 1.1,
4929            c0: -0.03,
4930            c1: 0.05,
4931            c2: -0.02,
4932            c3: 0.01,
4933        };
4934        let a = 0.25;
4935        let b = -0.8;
4936        let (dc_da, dc_db) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
4937        let (dc_daa, dc_dab, dc_dbb) = link_basis_cell_second_partials(link_basis_span, a, b);
4938        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4939            let u = a + b * z;
4940            let eta_a = link_basis_span.first_derivative(u);
4941            let eta_b = z * link_basis_span.first_derivative(u);
4942            let eta_aa = link_basis_span.second_derivative(u);
4943            let eta_ab = z * link_basis_span.second_derivative(u);
4944            let eta_bb = z * z * link_basis_span.second_derivative(u);
4945            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4946            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4947            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4948            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4949            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4950        }
4951    }
4952
4953    #[test]
4954    fn denested_third_partials_match_exact_span_derivatives() {
4955        let link_span = LocalSpanCubic {
4956            left: -0.6,
4957            right: 0.9,
4958            c0: -0.05,
4959            c1: 0.04,
4960            c2: -0.02,
4961            c3: 0.015,
4962        };
4963        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
4964        let link_third = 6.0 * link_span.c3;
4965        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4966            let eta_aaa = link_third;
4967            let eta_aab = z * link_third;
4968            let eta_abb = z * z * link_third;
4969            let eta_bbb = z * z * z * link_third;
4970            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4971            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4972            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4973            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4974        }
4975    }
4976
4977    #[test]
4978    fn link_basis_third_partials_match_exact_span_derivatives() {
4979        let link_basis_span = LocalSpanCubic {
4980            left: -0.5,
4981            right: 1.1,
4982            c0: -0.03,
4983            c1: 0.05,
4984            c2: -0.02,
4985            c3: 0.01,
4986        };
4987        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = link_basis_cell_third_partials(link_basis_span);
4988        let link_third = 6.0 * link_basis_span.c3;
4989        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4990            let eta_aaa = link_third;
4991            let eta_aab = z * link_third;
4992            let eta_abb = z * z * link_third;
4993            let eta_bbb = z * z * z * link_third;
4994            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4995            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4996            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4997            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4998        }
4999    }
5000
5001    #[test]
5002    fn branch_selection_uses_normalized_non_affine_coefficients() {
5003        let affine = DenestedCubicCell {
5004            left: -1.0,
5005            right: 1.0,
5006            c0: 0.1,
5007            c1: -0.4,
5008            c2: 1e-13,
5009            c3: -1e-13,
5010        };
5011        let quartic = DenestedCubicCell {
5012            c2: 2e-4,
5013            c3: 1e-13,
5014            ..affine
5015        };
5016        let sextic = DenestedCubicCell {
5017            c2: 2e-4,
5018            c3: 5e-3,
5019            ..affine
5020        };
5021        assert_eq!(branch_cell(affine).unwrap(), ExactCellBranch::Affine);
5022        assert_eq!(branch_cell(quartic).unwrap(), ExactCellBranch::Quartic);
5023        assert_eq!(branch_cell(sextic).unwrap(), ExactCellBranch::Sextic);
5024    }
5025
5026    #[test]
5027    fn affine_anchor_moments_match_whole_line_closed_forms() {
5028        let out = affine_anchor_moment_vector(0.0, 0.0, f64::NEG_INFINITY, f64::INFINITY, 4);
5029        // `affine_anchor_moment_vector` returns the RAW substrate moments
5030        // `T_n = ∫ z^n exp(-½z²) dz` (the cubic-cell `∫ z^n exp(-q) dz`
5031        // convention that every production consumer and the GPU parity path
5032        // share; the `1/√(2π)` is folded in downstream via `INV_TWO_PI`). At
5033        // the affine identity the anchor is the *unnormalized* standard normal,
5034        // so M0 = M2 = √(2π) and M1 = 0 — the normalized {1, 0, 1} moments
5035        // scaled by the whole-line mass √(2π).
5036        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5037        assert!((out[0] - sqrt_2pi).abs() < 1e-12);
5038        assert!(out[1].abs() < 1e-12);
5039        assert!((out[2] - sqrt_2pi).abs() < 1e-12);
5040    }
5041
5042    #[test]
5043    fn affine_anchor_moments_match_shifted_gaussian_whole_line() {
5044        let alpha = 0.7;
5045        let beta = -0.4;
5046        let out = affine_anchor_moment_vector(alpha, beta, f64::NEG_INFINITY, f64::INFINITY, 4);
5047        let s = (1.0 + beta * beta).sqrt();
5048        let mu = -alpha * beta / (1.0 + beta * beta);
5049        // RAW (unnormalized) whole-line moments of the affine anchor
5050        // `exp(-½(alpha + beta·z)²)·exp(-½z²)`, an unnormalized Gaussian with
5051        // mean `mu` and variance `1/s²`. Its raw moments carry the `√(2π)` mass
5052        // factor: M0 = √(2π)·scale, M1 = √(2π)·scale·mu,
5053        // M2 = √(2π)·scale·(mu² + 1/s²), where the anchor amplitude
5054        // `scale = exp(-alpha² / 2s²) / s`.
5055        let scale = (-alpha * alpha / (2.0 * s * s)).exp() / s;
5056        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5057        assert!((out[0] - scale * sqrt_2pi).abs() < 1e-12);
5058        assert!((out[1] - scale * sqrt_2pi * mu).abs() < 1e-12);
5059        assert!((out[2] - scale * sqrt_2pi * (mu * mu + 1.0 / (s * s))).abs() < 1e-10);
5060    }
5061
5062    #[test]
5063    fn quartic_recurrence_reduces_higher_moments() {
5064        let cell = DenestedCubicCell {
5065            left: -1.0,
5066            right: 0.9,
5067            c0: 0.2,
5068            c1: -0.3,
5069            c2: 0.18,
5070            c3: 0.0,
5071        };
5072        let exact = |k: usize| {
5073            simpson_integral(cell.left, cell.right, 2000, |z| {
5074                z.powi(k as i32) * (-cell.q(z)).exp()
5075            })
5076        };
5077        let reduced = reduce_quartic_moments(cell, [exact(0), exact(1), exact(2)], 6)
5078            .expect("quartic reduction");
5079        for k in 0..=6 {
5080            let target = exact(k);
5081            assert!(
5082                (reduced[k] - target).abs() < 1e-7,
5083                "quartic reduced moment M{k} mismatch: {} vs {}",
5084                reduced[k],
5085                target
5086            );
5087        }
5088    }
5089
5090    #[test]
5091    fn sextic_recurrence_reduces_higher_moments() {
5092        let cell = DenestedCubicCell {
5093            left: -0.8,
5094            right: 0.7,
5095            c0: -0.1,
5096            c1: 0.25,
5097            c2: -0.14,
5098            c3: 0.22,
5099        };
5100        let exact = |k: usize| {
5101            simpson_integral(cell.left, cell.right, 3000, |z| {
5102                z.powi(k as i32) * (-cell.q(z)).exp()
5103            })
5104        };
5105        let reduced =
5106            reduce_sextic_moments(cell, [exact(0), exact(1), exact(2), exact(3), exact(4)], 9)
5107                .expect("sextic reduction");
5108        for k in 0..=9 {
5109            let target = exact(k);
5110            assert!(
5111                (reduced[k] - target).abs() < 1e-7,
5112                "sextic reduced moment M{k} mismatch: {} vs {}",
5113                reduced[k],
5114                target
5115            );
5116        }
5117    }
5118
5119    #[test]
5120    fn degenerate_sextic_branch_preserves_quadratic_coefficient() {
5121        let cell = DenestedCubicCell {
5122            left: -1.0,
5123            right: 1.0,
5124            c0: 0.0,
5125            c1: 0.0,
5126            c2: 0.1,
5127            c3: 2.0e-10,
5128        };
5129        assert_eq!(branch_cell(cell).unwrap(), ExactCellBranch::Sextic);
5130
5131        let state = evaluate_cell_moments(cell, 9).expect("degenerate sextic cell");
5132        let quartic_cell = DenestedCubicCell { c3: 0.0, ..cell };
5133        let quartic = evaluate_cell_moments(quartic_cell, 9).expect("quartic cell");
5134        let affine = evaluate_affine_cell_state(
5135            DenestedCubicCell {
5136                c2: 0.0,
5137                c3: 0.0,
5138                ..cell
5139            },
5140            9,
5141        )
5142        .expect("affine cell");
5143
5144        assert_eq!(state.branch, ExactCellBranch::Quartic);
5145        for k in 0..=9 {
5146            assert!(
5147                (state.moments[k] - quartic.moments[k]).abs() < 1e-12,
5148                "lowered moment M{k} should match the quartic cell: {} vs {}",
5149                state.moments[k],
5150                quartic.moments[k]
5151            );
5152        }
5153        assert!(
5154            (state.moments[0] - affine.moments[0]).abs() > 1e-4,
5155            "degenerate sextic handling must not drop the nonzero c2 term"
5156        );
5157    }
5158
5159    #[test]
5160    fn moment_reduced_first_and_second_derivatives_match_numeric_integrals() {
5161        let cell = DenestedCubicCell {
5162            left: -0.9,
5163            right: 0.6,
5164            c0: 0.15,
5165            c1: -0.2,
5166            c2: 0.08,
5167            c3: 0.17,
5168        };
5169        let moments = reduce_sextic_moments(
5170            cell,
5171            [
5172                simpson_integral(cell.left, cell.right, 3000, |z| (-cell.q(z)).exp()),
5173                simpson_integral(cell.left, cell.right, 3000, |z| z * (-cell.q(z)).exp()),
5174                simpson_integral(cell.left, cell.right, 3000, |z| z * z * (-cell.q(z)).exp()),
5175                simpson_integral(cell.left, cell.right, 3000, |z| {
5176                    z.powi(3) * (-cell.q(z)).exp()
5177                }),
5178                simpson_integral(cell.left, cell.right, 3000, |z| {
5179                    z.powi(4) * (-cell.q(z)).exp()
5180                }),
5181            ],
5182            9,
5183        )
5184        .expect("reduced moments");
5185
5186        let r = [0.7, -0.1, 0.3];
5187        let s = [0.2, 0.5];
5188        let second = [0.4, -0.2, 0.1];
5189        let exact_first = cell_first_derivative_from_moments(&r, &moments).expect("first");
5190        let exact_second =
5191            cell_second_derivative_from_moments(cell, &r, &s, &second, &moments).expect("second");
5192
5193        let numeric_first = simpson_integral(cell.left, cell.right, 3000, |z| {
5194            polynomial_value(&r, z) * (-cell.q(z)).exp() / (2.0 * std::f64::consts::PI)
5195        });
5196        let numeric_second = simpson_integral(cell.left, cell.right, 3000, |z| {
5197            let eta = cell.eta(z);
5198            (polynomial_value(&second, z) - eta * polynomial_value(&r, z) * polynomial_value(&s, z))
5199                * (-cell.q(z)).exp()
5200                / (2.0 * std::f64::consts::PI)
5201        });
5202
5203        assert!((exact_first - numeric_first).abs() < 1e-7);
5204        assert!((exact_second - numeric_second).abs() < 1e-7);
5205    }
5206
5207    #[test]
5208    fn moment_reduced_third_derivative_matches_numeric_integral() {
5209        let cell = DenestedCubicCell {
5210            left: -0.85,
5211            right: 0.7,
5212            c0: -0.12,
5213            c1: 0.18,
5214            c2: 0.09,
5215            c3: -0.11,
5216        };
5217        let moments = evaluate_cell_moments(cell, 12).expect("cell moments");
5218        let r = [0.35, -0.12, 0.08];
5219        let s = [0.17, 0.09];
5220        let t = [-0.21, 0.14, -0.04];
5221        let rs = [0.11, -0.07, 0.05];
5222        let rt = [-0.06, 0.03];
5223        let st = [0.08, -0.02, 0.01];
5224        let rst = [0.04, -0.05, 0.02];
5225
5226        let exact_third = cell_third_derivative_from_moments(
5227            cell,
5228            &r,
5229            &s,
5230            &t,
5231            &rs,
5232            &rt,
5233            &st,
5234            &rst,
5235            &moments.moments,
5236        )
5237        .expect("third derivative");
5238        let numeric_third = simpson_integral(cell.left, cell.right, 4000, |z| {
5239            let eta = cell.eta(z);
5240            let rz = polynomial_value(&r, z);
5241            let sz = polynomial_value(&s, z);
5242            let tz = polynomial_value(&t, z);
5243            let rsz = polynomial_value(&rs, z);
5244            let rtz = polynomial_value(&rt, z);
5245            let stz = polynomial_value(&st, z);
5246            let rstz = polynomial_value(&rst, z);
5247            (rstz - eta * (rsz * tz + rtz * sz + stz * rz) + (eta * eta - 1.0) * rz * sz * tz)
5248                * (-cell.q(z)).exp()
5249                / (2.0 * std::f64::consts::PI)
5250        });
5251
5252        assert!((exact_third - numeric_third).abs() < 1e-7);
5253    }
5254
5255    #[test]
5256    fn moment_reduced_fourth_derivative_matches_numeric_integral() {
5257        let cell = DenestedCubicCell {
5258            left: -0.8,
5259            right: 0.65,
5260            c0: 0.11,
5261            c1: -0.22,
5262            c2: 0.07,
5263            c3: 0.13,
5264        };
5265        let moments = evaluate_cell_moments(cell, 16).expect("cell moments");
5266        let r = [0.21, -0.13, 0.06];
5267        let s = [-0.18, 0.04];
5268        let t = [0.09, 0.07, -0.03];
5269        let u = [-0.14, 0.05];
5270        let rs = [0.08, -0.03, 0.02];
5271        let rt = [-0.05, 0.01];
5272        let ru = [0.04, -0.02, 0.01];
5273        let st = [0.03, 0.02];
5274        let su = [-0.02, 0.05, -0.01];
5275        let tu = [0.07, -0.04];
5276        let rst = [0.03, -0.01, 0.02];
5277        let rsu = [-0.02, 0.04];
5278        let rtu = [0.01, 0.02, -0.01];
5279        let stu = [-0.03, 0.02];
5280        let rstu = [0.02, -0.01, 0.01];
5281
5282        let exact_fourth = cell_fourth_derivative_from_moments(
5283            cell,
5284            &r,
5285            &s,
5286            &t,
5287            &u,
5288            &rs,
5289            &rt,
5290            &ru,
5291            &st,
5292            &su,
5293            &tu,
5294            &rst,
5295            &rsu,
5296            &rtu,
5297            &stu,
5298            &rstu,
5299            &moments.moments,
5300        )
5301        .expect("fourth derivative");
5302        let numeric_fourth = simpson_integral(cell.left, cell.right, 5000, |z| {
5303            let eta = cell.eta(z);
5304            let rz = polynomial_value(&r, z);
5305            let sz = polynomial_value(&s, z);
5306            let tz = polynomial_value(&t, z);
5307            let uz = polynomial_value(&u, z);
5308            let rsz = polynomial_value(&rs, z);
5309            let rtz = polynomial_value(&rt, z);
5310            let ruz = polynomial_value(&ru, z);
5311            let stz = polynomial_value(&st, z);
5312            let suz = polynomial_value(&su, z);
5313            let tuz = polynomial_value(&tu, z);
5314            let rstz = polynomial_value(&rst, z);
5315            let rsuz = polynomial_value(&rsu, z);
5316            let rtuz = polynomial_value(&rtu, z);
5317            let stuz = polynomial_value(&stu, z);
5318            let rstuz = polynomial_value(&rstu, z);
5319            let linear =
5320                rstz * uz + rsuz * tz + rtuz * sz + stuz * rz + rsz * tuz + rtz * suz + ruz * stz;
5321            let quadratic = rsz * tz * uz
5322                + rtz * sz * uz
5323                + ruz * sz * tz
5324                + stz * rz * uz
5325                + suz * rz * tz
5326                + tuz * rz * sz;
5327            let quartic = rz * sz * tz * uz;
5328            (rstuz - eta * linear
5329                + (eta * eta - 1.0) * quadratic
5330                + (-eta * eta * eta + 3.0 * eta) * quartic)
5331                * (-cell.q(z)).exp()
5332                / (2.0 * std::f64::consts::PI)
5333        });
5334
5335        assert!((exact_fourth - numeric_fourth).abs() < 2e-7);
5336    }
5337
5338    #[test]
5339    fn denested_cell_parameter_derivatives_match_exact_integrands() {
5340        let score_span = LocalSpanCubic {
5341            left: -0.75,
5342            right: 0.25,
5343            c0: 0.08,
5344            c1: -0.03,
5345            c2: 0.02,
5346            c3: -0.01,
5347        };
5348        let link_span = LocalSpanCubic {
5349            left: -0.6,
5350            right: 0.9,
5351            c0: -0.05,
5352            c1: 0.04,
5353            c2: -0.02,
5354            c3: 0.015,
5355        };
5356        let a = 0.3;
5357        let b = -0.7;
5358        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5359        let cell = DenestedCubicCell {
5360            left: score_span.left,
5361            right: score_span.right,
5362            c0: coeffs[0],
5363            c1: coeffs[1],
5364            c2: coeffs[2],
5365            c3: coeffs[3],
5366        };
5367        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5368        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5369        let (dc_daa, dc_dab, dc_dbb) = denested_cell_second_partials(score_span, link_span, a, b);
5370        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
5371        let zero = [0.0; 4];
5372        let link_third = 6.0 * link_span.c3;
5373
5374        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5375        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5376        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
5377        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5378        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5379        let eta_aaa = |z: f64| link_third + 0.0 * z;
5380        let eta_aab = |z: f64| z * link_third;
5381        let eta_abb = |z: f64| z * z * link_third;
5382        let eta_bbb = |z: f64| z * z * z * link_third;
5383
5384        let exact_a = cell_first_derivative_from_moments(&dc_da, &state.moments).expect("a");
5385        let exact_b = cell_first_derivative_from_moments(&dc_db, &state.moments).expect("b");
5386        let exact_aa =
5387            cell_second_derivative_from_moments(cell, &dc_da, &dc_da, &dc_daa, &state.moments)
5388                .expect("aa");
5389        let exact_ab =
5390            cell_second_derivative_from_moments(cell, &dc_da, &dc_db, &dc_dab, &state.moments)
5391                .expect("ab");
5392        let exact_bb =
5393            cell_second_derivative_from_moments(cell, &dc_db, &dc_db, &dc_dbb, &state.moments)
5394                .expect("bb");
5395        let exact_aaa = cell_third_derivative_from_moments(
5396            cell,
5397            &dc_da,
5398            &dc_da,
5399            &dc_da,
5400            &dc_daa,
5401            &dc_daa,
5402            &dc_daa,
5403            &dc_daaa,
5404            &state.moments,
5405        )
5406        .expect("aaa");
5407        let exact_aab = cell_third_derivative_from_moments(
5408            cell,
5409            &dc_da,
5410            &dc_da,
5411            &dc_db,
5412            &dc_daa,
5413            &dc_dab,
5414            &dc_dab,
5415            &dc_daab,
5416            &state.moments,
5417        )
5418        .expect("aab");
5419        let exact_abb = cell_third_derivative_from_moments(
5420            cell,
5421            &dc_da,
5422            &dc_db,
5423            &dc_db,
5424            &dc_dab,
5425            &dc_dab,
5426            &dc_dbb,
5427            &dc_dabb,
5428            &state.moments,
5429        )
5430        .expect("abb");
5431        let exact_bbb = cell_third_derivative_from_moments(
5432            cell,
5433            &dc_db,
5434            &dc_db,
5435            &dc_db,
5436            &dc_dbb,
5437            &dc_dbb,
5438            &dc_dbb,
5439            &dc_dbbb,
5440            &state.moments,
5441        )
5442        .expect("bbb");
5443        let exact_aaaa = cell_fourth_derivative_from_moments(
5444            cell,
5445            &dc_da,
5446            &dc_da,
5447            &dc_da,
5448            &dc_da,
5449            &dc_daa,
5450            &dc_daa,
5451            &dc_daa,
5452            &dc_daa,
5453            &dc_daa,
5454            &dc_daa,
5455            &dc_daaa,
5456            &dc_daaa,
5457            &dc_daaa,
5458            &dc_daaa,
5459            &zero,
5460            &state.moments,
5461        )
5462        .expect("aaaa");
5463        let exact_aaab = cell_fourth_derivative_from_moments(
5464            cell,
5465            &dc_da,
5466            &dc_da,
5467            &dc_da,
5468            &dc_db,
5469            &dc_daa,
5470            &dc_daa,
5471            &dc_dab,
5472            &dc_daa,
5473            &dc_dab,
5474            &dc_dab,
5475            &dc_daaa,
5476            &dc_daab,
5477            &dc_daab,
5478            &dc_daab,
5479            &zero,
5480            &state.moments,
5481        )
5482        .expect("aaab");
5483        let exact_aabb = cell_fourth_derivative_from_moments(
5484            cell,
5485            &dc_da,
5486            &dc_da,
5487            &dc_db,
5488            &dc_db,
5489            &dc_daa,
5490            &dc_dab,
5491            &dc_dab,
5492            &dc_dab,
5493            &dc_dab,
5494            &dc_dbb,
5495            &dc_daab,
5496            &dc_daab,
5497            &dc_dabb,
5498            &dc_dabb,
5499            &zero,
5500            &state.moments,
5501        )
5502        .expect("aabb");
5503        let exact_abbb = cell_fourth_derivative_from_moments(
5504            cell,
5505            &dc_da,
5506            &dc_db,
5507            &dc_db,
5508            &dc_db,
5509            &dc_dab,
5510            &dc_dab,
5511            &dc_dab,
5512            &dc_dbb,
5513            &dc_dbb,
5514            &dc_dbb,
5515            &dc_dabb,
5516            &dc_dabb,
5517            &dc_dabb,
5518            &dc_dbbb,
5519            &zero,
5520            &state.moments,
5521        )
5522        .expect("abbb");
5523        let exact_bbbb = cell_fourth_derivative_from_moments(
5524            cell,
5525            &dc_db,
5526            &dc_db,
5527            &dc_db,
5528            &dc_db,
5529            &dc_dbb,
5530            &dc_dbb,
5531            &dc_dbb,
5532            &dc_dbb,
5533            &dc_dbb,
5534            &dc_dbb,
5535            &dc_dbbb,
5536            &dc_dbbb,
5537            &dc_dbbb,
5538            &dc_dbbb,
5539            &zero,
5540            &state.moments,
5541        )
5542        .expect("bbbb");
5543
5544        let numeric_a = simpson_integral(cell.left, cell.right, 5000, |z| {
5545            eta_a(z) * (-cell.q(z)).exp() * INV_TWO_PI
5546        });
5547        let numeric_b = simpson_integral(cell.left, cell.right, 5000, |z| {
5548            eta_b(z) * (-cell.q(z)).exp() * INV_TWO_PI
5549        });
5550        let numeric_aa = simpson_integral(cell.left, cell.right, 5000, |z| {
5551            (eta_aa(z) - cell.eta(z) * eta_a(z) * eta_a(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5552        });
5553        let numeric_ab = simpson_integral(cell.left, cell.right, 5000, |z| {
5554            (eta_ab(z) - cell.eta(z) * eta_a(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5555        });
5556        let numeric_bb = simpson_integral(cell.left, cell.right, 5000, |z| {
5557            (eta_bb(z) - cell.eta(z) * eta_b(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5558        });
5559        let numeric_aaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5560            let eta = cell.eta(z);
5561            (eta_aaa(z) - 3.0 * eta * eta_aa(z) * eta_a(z) + (eta * eta - 1.0) * eta_a(z).powi(3))
5562                * (-cell.q(z)).exp()
5563                * INV_TWO_PI
5564        });
5565        let numeric_aab = simpson_integral(cell.left, cell.right, 5000, |z| {
5566            let eta = cell.eta(z);
5567            let a_z = eta_a(z);
5568            let b_z = eta_b(z);
5569            (eta_aab(z) - eta * (eta_aa(z) * b_z + 2.0 * eta_ab(z) * a_z)
5570                + (eta * eta - 1.0) * a_z * a_z * b_z)
5571                * (-cell.q(z)).exp()
5572                * INV_TWO_PI
5573        });
5574        let numeric_abb = simpson_integral(cell.left, cell.right, 5000, |z| {
5575            let eta = cell.eta(z);
5576            let a_z = eta_a(z);
5577            let b_z = eta_b(z);
5578            (eta_abb(z) - eta * (2.0 * eta_ab(z) * b_z + eta_bb(z) * a_z)
5579                + (eta * eta - 1.0) * a_z * b_z * b_z)
5580                * (-cell.q(z)).exp()
5581                * INV_TWO_PI
5582        });
5583        let numeric_bbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5584            let eta = cell.eta(z);
5585            (eta_bbb(z) - 3.0 * eta * eta_bb(z) * eta_b(z) + (eta * eta - 1.0) * eta_b(z).powi(3))
5586                * (-cell.q(z)).exp()
5587                * INV_TWO_PI
5588        });
5589        let numeric_aaaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5590            let eta = cell.eta(z);
5591            let eta_a_z = eta_a(z);
5592            let eta_aa_z = eta_aa(z);
5593            let eta_aaa_z = eta_aaa(z);
5594            (-eta * (4.0 * eta_aaa_z * eta_a_z + 3.0 * eta_aa_z * eta_aa_z)
5595                + (eta * eta - 1.0) * (6.0 * eta_aa_z * eta_a_z * eta_a_z)
5596                + (-eta * eta * eta + 3.0 * eta) * eta_a_z.powi(4))
5597                * (-cell.q(z)).exp()
5598                * INV_TWO_PI
5599        });
5600        let numeric_aaab = simpson_integral(cell.left, cell.right, 5000, |z| {
5601            let eta = cell.eta(z);
5602            let a_z = eta_a(z);
5603            let b_z = eta_b(z);
5604            let aa_z = eta_aa(z);
5605            let ab_z = eta_ab(z);
5606            let aaa_z = eta_aaa(z);
5607            let aab_z = eta_aab(z);
5608            (-eta * (aaa_z * b_z + 3.0 * aab_z * a_z + 3.0 * aa_z * ab_z)
5609                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * b_z + 3.0 * ab_z * a_z * a_z)
5610                + (-eta * eta * eta + 3.0 * eta) * a_z.powi(3) * b_z)
5611                * (-cell.q(z)).exp()
5612                * INV_TWO_PI
5613        });
5614        let numeric_aabb = simpson_integral(cell.left, cell.right, 5000, |z| {
5615            let eta = cell.eta(z);
5616            let a_z = eta_a(z);
5617            let b_z = eta_b(z);
5618            let aa_z = eta_aa(z);
5619            let ab_z = eta_ab(z);
5620            let bb_z = eta_bb(z);
5621            let aab_z = eta_aab(z);
5622            let abb_z = eta_abb(z);
5623            (-eta * (2.0 * aab_z * b_z + 2.0 * abb_z * a_z + aa_z * bb_z + 2.0 * ab_z * ab_z)
5624                + (eta * eta - 1.0)
5625                    * (aa_z * b_z * b_z + 4.0 * ab_z * a_z * b_z + bb_z * a_z * a_z)
5626                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * b_z * b_z)
5627                * (-cell.q(z)).exp()
5628                * INV_TWO_PI
5629        });
5630        let numeric_abbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5631            let eta = cell.eta(z);
5632            let a_z = eta_a(z);
5633            let b_z = eta_b(z);
5634            let ab_z = eta_ab(z);
5635            let bb_z = eta_bb(z);
5636            let abb_z = eta_abb(z);
5637            let bbb_z = eta_bbb(z);
5638            (-eta * (3.0 * abb_z * b_z + bbb_z * a_z + 3.0 * ab_z * bb_z)
5639                + (eta * eta - 1.0) * (3.0 * ab_z * b_z * b_z + 3.0 * bb_z * a_z * b_z)
5640                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z.powi(3))
5641                * (-cell.q(z)).exp()
5642                * INV_TWO_PI
5643        });
5644        let numeric_bbbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5645            let eta = cell.eta(z);
5646            let eta_b_z = eta_b(z);
5647            let eta_bb_z = eta_bb(z);
5648            let eta_bbb_z = eta_bbb(z);
5649            (-eta * (4.0 * eta_bbb_z * eta_b_z + 3.0 * eta_bb_z * eta_bb_z)
5650                + (eta * eta - 1.0) * (6.0 * eta_bb_z * eta_b_z * eta_b_z)
5651                + (-eta * eta * eta + 3.0 * eta) * eta_b_z.powi(4))
5652                * (-cell.q(z)).exp()
5653                * INV_TWO_PI
5654        });
5655
5656        assert!((exact_a - numeric_a).abs() < 1e-8);
5657        assert!((exact_b - numeric_b).abs() < 1e-8);
5658        assert!((exact_aa - numeric_aa).abs() < 1e-8);
5659        assert!((exact_ab - numeric_ab).abs() < 1e-8);
5660        assert!((exact_bb - numeric_bb).abs() < 1e-8);
5661        assert!((exact_aaa - numeric_aaa).abs() < 2e-7);
5662        assert!((exact_aab - numeric_aab).abs() < 2e-7);
5663        assert!((exact_abb - numeric_abb).abs() < 2e-7);
5664        assert!((exact_bbb - numeric_bbb).abs() < 2e-7);
5665        assert!((exact_aaaa - numeric_aaaa).abs() < 2e-6);
5666        assert!((exact_aaab - numeric_aaab).abs() < 2e-6);
5667        assert!((exact_aabb - numeric_aabb).abs() < 2e-6);
5668        assert!((exact_abbb - numeric_abbb).abs() < 2e-6);
5669        assert!((exact_bbbb - numeric_bbbb).abs() < 2e-6);
5670    }
5671
    /// Cross-checks the moment-based derivative kernels against direct quadrature for
    /// derivatives that involve a link-basis direction `w`, alone and mixed with `a` and `b`.
    ///
    /// For every derivative order 1..=4 the test computes
    /// * an "exact" value from `cell_*_derivative_from_moments`, fed with the coefficient
    ///   arrays of the relevant partials of eta, and
    /// * a "numeric" value from `simpson_integral` over `[cell.left, cell.right]` of a
    ///   hand-written integrand of the form `P(eta, partials) * exp(-q(z)) * INV_TWO_PI`,
    /// and asserts they agree to an absolute tolerance that loosens with derivative order.
    ///
    /// The integrand groups use the multipliers `1`, `-eta`, `(eta^2 - 1)` and
    /// `(-eta^3 + 3 eta)`, i.e. the ratios phi^(k)(eta) / phi(eta) of the standard normal
    /// density (NOTE(review): this presumes `q(z)` carries an `eta(z)^2 / 2` term — confirm
    /// against `DenestedCubicCell::q`).
    #[test]
    fn link_basis_cell_derivatives_match_exact_integrands() {
        // Fixture: a score-warp span, a link span and a link-basis span with small, generic
        // cubic coefficients. The two link spans share the same [left, right].
        let score_span = LocalSpanCubic {
            left: -0.75,
            right: 0.25,
            c0: 0.08,
            c1: -0.03,
            c2: 0.02,
            c3: -0.01,
        };
        let link_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: -0.05,
            c1: 0.04,
            c2: -0.02,
            c3: 0.015,
        };
        let link_basis_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: 0.02,
            c1: -0.01,
            c2: 0.03,
            c3: -0.02,
        };
        let a = 0.3;
        let b = -0.7;
        // The de-nested cell lives on the score span's interval and takes its four cubic
        // coefficients from `denested_cell_coefficients`.
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        let cell = DenestedCubicCell {
            left: score_span.left,
            right: score_span.right,
            c0: coeffs[0],
            c1: coeffs[1],
            c2: coeffs[2],
            c3: coeffs[3],
        };
        // NOTE(review): 24 is presumably the highest moment order requested — confirm against
        // `evaluate_cell_moments`.
        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
        // Coefficient arrays of the partials of eta with respect to (a, b), orders 1..=3.
        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
        let dc_daa = second_partials.0;
        let dc_dab = second_partials.1;
        let dc_dbb = second_partials.2;
        let denested_third = denested_cell_third_partials(link_span);
        // Only the pure third partials (tuple slots 0 and 3) are needed by this test.
        let dc_daaa = denested_third.0;
        let dc_dbbb = denested_third.3;

        // Coefficient arrays for the link-basis direction `w` and its mixed partials with a, b.
        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
        let (coeff_aaw, coeff_abw, coeff_bbw) =
            link_basis_cell_second_partials(link_basis_span, a, b);
        let link_basis_third = link_basis_cell_third_partials(link_basis_span);
        let coeff_aaaw = link_basis_third.0;
        let coeff_bbbw = link_basis_third.3;
        // Passed for every partial this test treats as identically zero (e.g. `ww`).
        let zero = [0.0; 4];
        // Third derivative of the link-basis cubic: the constant 6 * c3.
        let basis_third = 6.0 * link_basis_span.c3;

        // Hand-written pointwise partials of eta(z), used only by the quadrature integrands.
        // The link and link-basis spans are evaluated at the argument `a + b * z`.
        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
        let eta_aaw = |z: f64| link_basis_span.second_derivative(a + b * z);
        let eta_abw = |z: f64| z * link_basis_span.second_derivative(a + b * z);
        let eta_bbw = |z: f64| z * z * link_basis_span.second_derivative(a + b * z);
        // Constant in z; the `0.0 * z` only keeps the closure shaped like its siblings.
        let eta_aaaw = |z: f64| basis_third + 0.0 * z;
        let eta_bbbw = |z: f64| z * z * z * basis_third;

        // ---- "Exact" values from the moment recurrences -------------------------------------
        // Call-site convention (NOTE(review): inferred from the calls below — confirm against
        // the kernel signatures):
        //   second: (cell, u, v, uv, moments)
        //   third:  (cell, u, v, w, uv, uw, vw, uvw, moments)
        //   fourth: (cell, u, v, w, x, uv, uw, ux, vw, vx, wx, uvw, uvx, uwx, vwx, uvwx, moments)
        let exact_w = cell_first_derivative_from_moments(&coeff_w, &state.moments).expect("w");
        let exact_aw =
            cell_second_derivative_from_moments(cell, &dc_da, &coeff_w, &coeff_aw, &state.moments)
                .expect("aw");
        let exact_bw =
            cell_second_derivative_from_moments(cell, &dc_db, &coeff_w, &coeff_bw, &state.moments)
                .expect("bw");
        let exact_ww =
            cell_second_derivative_from_moments(cell, &coeff_w, &coeff_w, &zero, &state.moments)
                .expect("ww");
        let exact_aaw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aaw,
            &state.moments,
        )
        .expect("aaw");
        let exact_abw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_bw,
            &coeff_abw,
            &state.moments,
        )
        .expect("abw");
        let exact_bbw = cell_third_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bbw,
            &state.moments,
        )
        .expect("bbw");
        let exact_www = cell_third_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("www");
        // Directions (a, a, a, w).
        let exact_aaaw = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &dc_daa,
            &coeff_aw,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &dc_daaa,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaaw,
            &state.moments,
        )
        .expect("aaaw");
        // Directions (a, a, w, w): every partial containing `ww` is passed as zero.
        let exact_aaww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &zero,
            &coeff_aaw,
            &coeff_aaw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("aaww");
        // Directions (a, b, w, w).
        let exact_abww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_aw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_abw,
            &coeff_abw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("abww");
        // Directions (b, b, w, w).
        let exact_bbww = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_bbw,
            &coeff_bbw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("bbww");
        // Directions (b, b, b, w).
        let exact_bbbw = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &dc_dbb,
            &coeff_bw,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &dc_dbbb,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbbw,
            &state.moments,
        )
        .expect("bbbw");
        // Directions (w, w, w, w): only the product of four first partials survives.
        let exact_wwww = cell_fourth_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("wwww");

        // ---- Reference values by Simpson quadrature (5000 passed as the resolution) ---------
        // Each integrand is written out term by term from the eta partials above.
        let numeric_w = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_w(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_aw(z) - cell.eta(z) * eta_a(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_bw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_bw(z) - cell.eta(z) * eta_b(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_ww = simpson_integral(cell.left, cell.right, 5000, |z| {
            (-cell.eta(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            (eta_aaw(z) - eta * (eta_aa(z) * w_z + 2.0 * eta_aw(z) * a_z)
                + (eta * eta - 1.0) * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_abw(z) - eta * (eta_ab(z) * w_z + eta_aw(z) * b_z + eta_bw(z) * a_z)
                + (eta * eta - 1.0) * a_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let b_z = eta_b(z);
            (eta_bbw(z) - eta * (eta_bb(z) * w_z + 2.0 * eta_bw(z) * b_z)
                + (eta * eta - 1.0) * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_www = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((eta * eta - 1.0) * w_z * w_z * w_z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aaaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aa_z = eta_aa(z);
            let aw_z = eta_aw(z);
            // `dc_daaa[0]` stands in for eta_aaa(z); the `0.0 * z` marks it as constant in z.
            (eta_aaaw(z)
                - eta * ((dc_daaa[0] + 0.0 * z) * w_z + 3.0 * eta_aaw(z) * a_z + 3.0 * aa_z * aw_z)
                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * w_z + 3.0 * aw_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aaww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            (-(2.0 * eta * (eta_aaw(z) * w_z + aw_z * aw_z))
                + (eta * eta - 1.0) * (eta_aa(z) * w_z * w_z + 4.0 * aw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_abw(z) * w_z + aw_z * bw_z))
                + (eta * eta - 1.0)
                    * (eta_ab(z) * w_z * w_z + 2.0 * aw_z * b_z * w_z + 2.0 * bw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_bbw(z) * w_z + bw_z * bw_z))
                + (eta * eta - 1.0) * (eta_bb(z) * w_z * w_z + 4.0 * bw_z * b_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bb_z = eta_bb(z);
            let bw_z = eta_bw(z);
            // `dc_dbbb[3] * z^3` stands in for eta_bbb(z): only the cubic coefficient is used.
            (eta_bbbw(z)
                - eta
                    * ((dc_dbbb[3] * z * z * z) * w_z + 3.0 * eta_bbw(z) * b_z + 3.0 * bb_z * bw_z)
                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * w_z + 3.0 * bw_z * b_z * b_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_wwww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((-eta * eta * eta + 3.0 * eta) * w_z * w_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });

        // Absolute tolerances widen with derivative order: 1e-8, 1e-7, 2e-6, 3e-6.
        assert!((exact_w - numeric_w).abs() < 1e-8);
        assert!((exact_aw - numeric_aw).abs() < 1e-7);
        assert!((exact_bw - numeric_bw).abs() < 1e-7);
        assert!((exact_ww - numeric_ww).abs() < 1e-7);
        assert!((exact_aaw - numeric_aaw).abs() < 2e-6);
        assert!((exact_abw - numeric_abw).abs() < 2e-6);
        assert!((exact_bbw - numeric_bbw).abs() < 2e-6);
        assert!((exact_www - numeric_www).abs() < 2e-6);
        assert!((exact_aaaw - numeric_aaaw).abs() < 3e-6);
        assert!((exact_aaww - numeric_aaww).abs() < 3e-6);
        assert!((exact_abww - numeric_abww).abs() < 3e-6);
        assert!((exact_bbww - numeric_bbww).abs() < 3e-6);
        assert!((exact_bbbw - numeric_bbbw).abs() < 3e-6);
        assert!((exact_wwww - numeric_wwww).abs() < 3e-6);
    }
6053
6054    #[test]
6055    fn score_basis_cell_derivatives_match_exact_integrands() {
6056        let score_span = LocalSpanCubic {
6057            left: -0.75,
6058            right: 0.25,
6059            c0: 0.08,
6060            c1: -0.03,
6061            c2: 0.02,
6062            c3: -0.01,
6063        };
6064        let score_basis_span = LocalSpanCubic {
6065            left: -0.75,
6066            right: 0.25,
6067            c0: -0.04,
6068            c1: 0.06,
6069            c2: -0.01,
6070            c3: 0.02,
6071        };
6072        let link_span = LocalSpanCubic {
6073            left: -0.6,
6074            right: 0.9,
6075            c0: -0.05,
6076            c1: 0.04,
6077            c2: -0.02,
6078            c3: 0.015,
6079        };
6080        let a = 0.3;
6081        let b = -0.7;
6082        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6083        let cell = DenestedCubicCell {
6084            left: score_span.left,
6085            right: score_span.right,
6086            c0: coeffs[0],
6087            c1: coeffs[1],
6088            c2: coeffs[2],
6089            c3: coeffs[3],
6090        };
6091        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6092        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6093        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
6094        let dc_daa = second_partials.0;
6095        let dc_dab = second_partials.1;
6096        let dc_dbb = second_partials.2;
6097        let denested_third = denested_cell_third_partials(link_span);
6098        let dc_dbbb = denested_third.3;
6099
6100        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6101        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6102        let zero = [0.0; 4];
6103
6104        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6105        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6106        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6107        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
6108        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6109        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6110
6111        let exact_h = cell_first_derivative_from_moments(&coeff_h, &state.moments).expect("h");
6112        let exact_ah =
6113            cell_second_derivative_from_moments(cell, &dc_da, &coeff_h, &zero, &state.moments)
6114                .expect("ah");
6115        let exact_bh =
6116            cell_second_derivative_from_moments(cell, &dc_db, &coeff_h, &coeff_bh, &state.moments)
6117                .expect("bh");
6118        let exact_hh =
6119            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_h, &zero, &state.moments)
6120                .expect("hh");
6121        let exact_abh = cell_third_derivative_from_moments(
6122            cell,
6123            &dc_da,
6124            &dc_db,
6125            &coeff_h,
6126            &dc_dab,
6127            &zero,
6128            &coeff_bh,
6129            &zero,
6130            &state.moments,
6131        )
6132        .expect("abh");
6133        let exact_bbh = cell_third_derivative_from_moments(
6134            cell,
6135            &dc_db,
6136            &dc_db,
6137            &coeff_h,
6138            &dc_dbb,
6139            &coeff_bh,
6140            &coeff_bh,
6141            &zero,
6142            &state.moments,
6143        )
6144        .expect("bbh");
6145        let exact_bhh = cell_third_derivative_from_moments(
6146            cell,
6147            &dc_db,
6148            &coeff_h,
6149            &coeff_h,
6150            &coeff_bh,
6151            &coeff_bh,
6152            &zero,
6153            &zero,
6154            &state.moments,
6155        )
6156        .expect("bhh");
6157        let exact_hhh = cell_third_derivative_from_moments(
6158            cell,
6159            &coeff_h,
6160            &coeff_h,
6161            &coeff_h,
6162            &zero,
6163            &zero,
6164            &zero,
6165            &zero,
6166            &state.moments,
6167        )
6168        .expect("hhh");
6169        let exact_bbbh = cell_fourth_derivative_from_moments(
6170            cell,
6171            &dc_db,
6172            &dc_db,
6173            &dc_db,
6174            &coeff_h,
6175            &dc_dbb,
6176            &dc_dbb,
6177            &coeff_bh,
6178            &dc_dbb,
6179            &coeff_bh,
6180            &coeff_bh,
6181            &dc_dbbb,
6182            &zero,
6183            &zero,
6184            &zero,
6185            &zero,
6186            &state.moments,
6187        )
6188        .expect("bbbh");
6189        let exact_aahh = cell_fourth_derivative_from_moments(
6190            cell,
6191            &dc_da,
6192            &dc_da,
6193            &coeff_h,
6194            &coeff_h,
6195            &dc_daa,
6196            &zero,
6197            &zero,
6198            &zero,
6199            &zero,
6200            &zero,
6201            &zero,
6202            &zero,
6203            &zero,
6204            &zero,
6205            &zero,
6206            &state.moments,
6207        )
6208        .expect("aahh");
6209        let exact_abhh = cell_fourth_derivative_from_moments(
6210            cell,
6211            &dc_da,
6212            &dc_db,
6213            &coeff_h,
6214            &coeff_h,
6215            &dc_dab,
6216            &zero,
6217            &zero,
6218            &coeff_bh,
6219            &coeff_bh,
6220            &zero,
6221            &zero,
6222            &zero,
6223            &zero,
6224            &zero,
6225            &zero,
6226            &state.moments,
6227        )
6228        .expect("abhh");
6229        let exact_bbhh = cell_fourth_derivative_from_moments(
6230            cell,
6231            &dc_db,
6232            &dc_db,
6233            &coeff_h,
6234            &coeff_h,
6235            &dc_dbb,
6236            &coeff_bh,
6237            &coeff_bh,
6238            &coeff_bh,
6239            &coeff_bh,
6240            &zero,
6241            &zero,
6242            &zero,
6243            &zero,
6244            &zero,
6245            &zero,
6246            &state.moments,
6247        )
6248        .expect("bbhh");
6249        let exact_bhhh = cell_fourth_derivative_from_moments(
6250            cell,
6251            &dc_db,
6252            &coeff_h,
6253            &coeff_h,
6254            &coeff_h,
6255            &coeff_bh,
6256            &coeff_bh,
6257            &coeff_bh,
6258            &zero,
6259            &zero,
6260            &zero,
6261            &zero,
6262            &zero,
6263            &zero,
6264            &zero,
6265            &zero,
6266            &state.moments,
6267        )
6268        .expect("bhhh");
6269        let exact_hhhh = cell_fourth_derivative_from_moments(
6270            cell,
6271            &coeff_h,
6272            &coeff_h,
6273            &coeff_h,
6274            &coeff_h,
6275            &zero,
6276            &zero,
6277            &zero,
6278            &zero,
6279            &zero,
6280            &zero,
6281            &zero,
6282            &zero,
6283            &zero,
6284            &zero,
6285            &zero,
6286            &state.moments,
6287        )
6288        .expect("hhhh");
6289
6290        let numeric_h = simpson_integral(cell.left, cell.right, 5000, |z| {
6291            eta_h(z) * (-cell.q(z)).exp() * INV_TWO_PI
6292        });
6293        let numeric_ah = simpson_integral(cell.left, cell.right, 5000, |z| {
6294            (-cell.eta(z) * eta_a(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6295        });
6296        let numeric_bh = simpson_integral(cell.left, cell.right, 5000, |z| {
6297            (eta_bh(z) - cell.eta(z) * eta_b(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6298        });
6299        let numeric_hh = simpson_integral(cell.left, cell.right, 5000, |z| {
6300            (-cell.eta(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6301        });
6302        let numeric_abh = simpson_integral(cell.left, cell.right, 5000, |z| {
6303            let eta = cell.eta(z);
6304            (-(eta * (eta_ab(z) * eta_h(z) + eta_bh(z) * eta_a(z)))
6305                + (eta * eta - 1.0) * eta_a(z) * eta_b(z) * eta_h(z))
6306                * (-cell.q(z)).exp()
6307                * INV_TWO_PI
6308        });
6309        let numeric_bbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6310            let eta = cell.eta(z);
6311            (-(eta * (eta_bb(z) * eta_h(z) + 2.0 * eta_bh(z) * eta_b(z)))
6312                + (eta * eta - 1.0) * eta_b(z) * eta_b(z) * eta_h(z))
6313                * (-cell.q(z)).exp()
6314                * INV_TWO_PI
6315        });
6316        let numeric_bhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6317            let eta = cell.eta(z);
6318            (-(2.0 * eta * eta_bh(z) * eta_h(z))
6319                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_h(z))
6320                * (-cell.q(z)).exp()
6321                * INV_TWO_PI
6322        });
6323        let numeric_hhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6324            let eta = cell.eta(z);
6325            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6326        });
6327        let numeric_bbbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6328            let eta = cell.eta(z);
6329            let b_z = eta_b(z);
6330            let h_z = eta_h(z);
6331            let bb_z = eta_bb(z);
6332            let bh_z = eta_bh(z);
6333            (-(eta * ((dc_dbbb[3] * z * z * z) * h_z + 3.0 * bb_z * bh_z))
6334                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * h_z + 3.0 * bh_z * b_z * b_z)
6335                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * h_z)
6336                * (-cell.q(z)).exp()
6337                * INV_TWO_PI
6338        });
6339        let numeric_aahh = simpson_integral(cell.left, cell.right, 5000, |z| {
6340            let eta = cell.eta(z);
6341            let a_z = eta_a(z);
6342            let h_z = eta_h(z);
6343            ((eta * eta - 1.0) * polynomial_value(&dc_daa, z) * h_z * h_z
6344                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * h_z * h_z)
6345                * (-cell.q(z)).exp()
6346                * INV_TWO_PI
6347        });
6348        let numeric_abhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6349            let eta = cell.eta(z);
6350            let a_z = eta_a(z);
6351            let b_z = eta_b(z);
6352            let h_z = eta_h(z);
6353            ((eta * eta - 1.0) * (eta_ab(z) * h_z * h_z + 2.0 * eta_bh(z) * a_z * h_z)
6354                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * h_z * h_z)
6355                * (-cell.q(z)).exp()
6356                * INV_TWO_PI
6357        });
6358        let numeric_bbhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6359            let eta = cell.eta(z);
6360            let b_z = eta_b(z);
6361            let h_z = eta_h(z);
6362            let bh_z = eta_bh(z);
6363            (-(2.0 * eta * bh_z * bh_z)
6364                + (eta * eta - 1.0) * (eta_bb(z) * h_z * h_z + 4.0 * bh_z * b_z * h_z)
6365                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * h_z * h_z)
6366                * (-cell.q(z)).exp()
6367                * INV_TWO_PI
6368        });
6369        let numeric_bhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6370            let eta = cell.eta(z);
6371            let h_z = eta_h(z);
6372            (-(eta * (3.0 * eta_bh(z) * h_z * h_z))
6373                + (eta * eta - 1.0) * (3.0 * eta_bh(z) * h_z * h_z)
6374                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * h_z * h_z)
6375                * (-cell.q(z)).exp()
6376                * INV_TWO_PI
6377        });
6378        let numeric_hhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6379            let eta = cell.eta(z);
6380            let h_z = eta_h(z);
6381            ((-eta * eta * eta + 3.0 * eta) * h_z * h_z * h_z * h_z)
6382                * (-cell.q(z)).exp()
6383                * INV_TWO_PI
6384        });
6385
6386        assert!((exact_h - numeric_h).abs() < 1e-8);
6387        assert!((exact_ah - numeric_ah).abs() < 1e-7);
6388        assert!((exact_bh - numeric_bh).abs() < 1e-7);
6389        assert!((exact_hh - numeric_hh).abs() < 1e-7);
6390        assert!((exact_abh - numeric_abh).abs() < 2e-6);
6391        assert!((exact_bbh - numeric_bbh).abs() < 2e-6);
6392        assert!((exact_bhh - numeric_bhh).abs() < 2e-6);
6393        assert!((exact_hhh - numeric_hhh).abs() < 2e-6);
6394        assert!((exact_bbbh - numeric_bbbh).abs() < 3e-6);
6395        assert!((exact_aahh - numeric_aahh).abs() < 3e-6);
6396        assert!((exact_abhh - numeric_abhh).abs() < 3e-6);
6397        assert!((exact_bbhh - numeric_bbhh).abs() < 3e-6);
6398        assert!((exact_bhhh - numeric_bhhh).abs() < 3e-6);
6399        assert!((exact_hhhh - numeric_hhhh).abs() < 3e-6);
6400    }
6401
6402    #[test]
6403    fn cross_basis_cell_derivatives_match_exact_integrands() {
6404        let score_span = LocalSpanCubic {
6405            left: -0.75,
6406            right: 0.25,
6407            c0: 0.08,
6408            c1: -0.03,
6409            c2: 0.02,
6410            c3: -0.01,
6411        };
6412        let score_basis_span = LocalSpanCubic {
6413            left: -0.75,
6414            right: 0.25,
6415            c0: -0.04,
6416            c1: 0.06,
6417            c2: -0.01,
6418            c3: 0.02,
6419        };
6420        let link_span = LocalSpanCubic {
6421            left: -0.6,
6422            right: 0.9,
6423            c0: -0.05,
6424            c1: 0.04,
6425            c2: -0.02,
6426            c3: 0.015,
6427        };
6428        let link_basis_span = LocalSpanCubic {
6429            left: -0.6,
6430            right: 0.9,
6431            c0: 0.02,
6432            c1: -0.01,
6433            c2: 0.03,
6434            c3: -0.02,
6435        };
6436        let a = 0.3;
6437        let b = -0.7;
6438        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6439        let cell = DenestedCubicCell {
6440            left: score_span.left,
6441            right: score_span.right,
6442            c0: coeffs[0],
6443            c1: coeffs[1],
6444            c2: coeffs[2],
6445            c3: coeffs[3],
6446        };
6447        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6448        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6449        let (dc_daa, dc_dab, _) = denested_cell_second_partials(score_span, link_span, a, b);
6450
6451        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6452        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6453        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
6454        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
6455        let (coeff_aaw, coeff_abw, _) = link_basis_cell_second_partials(link_basis_span, a, b);
6456        let zero = [0.0; 4];
6457
6458        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6459        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6460        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6461        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6462        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
6463        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6464        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
6465        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
6466
6467        let exact_hw =
6468            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_w, &zero, &state.moments)
6469                .expect("hw");
6470        let exact_ahw = cell_third_derivative_from_moments(
6471            cell,
6472            &dc_da,
6473            &coeff_h,
6474            &coeff_w,
6475            &zero,
6476            &coeff_aw,
6477            &zero,
6478            &zero,
6479            &state.moments,
6480        )
6481        .expect("ahw");
6482        let exact_bhw = cell_third_derivative_from_moments(
6483            cell,
6484            &dc_db,
6485            &coeff_h,
6486            &coeff_w,
6487            &coeff_bh,
6488            &coeff_bw,
6489            &zero,
6490            &zero,
6491            &state.moments,
6492        )
6493        .expect("bhw");
6494        let exact_hhw = cell_third_derivative_from_moments(
6495            cell,
6496            &coeff_h,
6497            &coeff_h,
6498            &coeff_w,
6499            &zero,
6500            &zero,
6501            &zero,
6502            &zero,
6503            &state.moments,
6504        )
6505        .expect("hhw");
6506        let exact_hww = cell_third_derivative_from_moments(
6507            cell,
6508            &coeff_h,
6509            &coeff_w,
6510            &coeff_w,
6511            &zero,
6512            &zero,
6513            &zero,
6514            &zero,
6515            &state.moments,
6516        )
6517        .expect("hww");
6518        let exact_aahw = cell_fourth_derivative_from_moments(
6519            cell,
6520            &dc_da,
6521            &dc_da,
6522            &coeff_h,
6523            &coeff_w,
6524            &dc_daa,
6525            &zero,
6526            &coeff_aw,
6527            &zero,
6528            &coeff_aw,
6529            &zero,
6530            &zero,
6531            &coeff_aaw,
6532            &zero,
6533            &zero,
6534            &zero,
6535            &state.moments,
6536        )
6537        .expect("aahw");
6538        let exact_hhww = cell_fourth_derivative_from_moments(
6539            cell,
6540            &coeff_h,
6541            &coeff_h,
6542            &coeff_w,
6543            &coeff_w,
6544            &zero,
6545            &zero,
6546            &zero,
6547            &zero,
6548            &zero,
6549            &zero,
6550            &zero,
6551            &zero,
6552            &zero,
6553            &zero,
6554            &zero,
6555            &state.moments,
6556        )
6557        .expect("hhww");
6558        let exact_hhhw = cell_fourth_derivative_from_moments(
6559            cell,
6560            &coeff_h,
6561            &coeff_h,
6562            &coeff_h,
6563            &coeff_w,
6564            &zero,
6565            &zero,
6566            &zero,
6567            &zero,
6568            &zero,
6569            &zero,
6570            &zero,
6571            &zero,
6572            &zero,
6573            &zero,
6574            &zero,
6575            &state.moments,
6576        )
6577        .expect("hhhw");
6578        let exact_abhw = cell_fourth_derivative_from_moments(
6579            cell,
6580            &dc_da,
6581            &dc_db,
6582            &coeff_h,
6583            &coeff_w,
6584            &dc_dab,
6585            &zero,
6586            &coeff_aw,
6587            &coeff_bh,
6588            &coeff_bw,
6589            &zero,
6590            &zero,
6591            &coeff_abw,
6592            &zero,
6593            &zero,
6594            &zero,
6595            &state.moments,
6596        )
6597        .expect("abhw");
6598        let exact_ahww = cell_fourth_derivative_from_moments(
6599            cell,
6600            &dc_da,
6601            &coeff_h,
6602            &coeff_w,
6603            &coeff_w,
6604            &zero,
6605            &coeff_aw,
6606            &coeff_aw,
6607            &zero,
6608            &zero,
6609            &zero,
6610            &zero,
6611            &zero,
6612            &zero,
6613            &zero,
6614            &zero,
6615            &state.moments,
6616        )
6617        .expect("ahww");
6618        let exact_bhww = cell_fourth_derivative_from_moments(
6619            cell,
6620            &dc_db,
6621            &coeff_h,
6622            &coeff_w,
6623            &coeff_w,
6624            &coeff_bh,
6625            &coeff_bw,
6626            &coeff_bw,
6627            &zero,
6628            &zero,
6629            &zero,
6630            &zero,
6631            &zero,
6632            &zero,
6633            &zero,
6634            &zero,
6635            &state.moments,
6636        )
6637        .expect("bhww");
6638        let exact_hwww = cell_fourth_derivative_from_moments(
6639            cell,
6640            &coeff_h,
6641            &coeff_w,
6642            &coeff_w,
6643            &coeff_w,
6644            &zero,
6645            &zero,
6646            &zero,
6647            &zero,
6648            &zero,
6649            &zero,
6650            &zero,
6651            &zero,
6652            &zero,
6653            &zero,
6654            &zero,
6655            &state.moments,
6656        )
6657        .expect("hwww");
6658
6659        let numeric_hw = simpson_integral(cell.left, cell.right, 5000, |z| {
6660            (-cell.eta(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6661        });
6662        let numeric_ahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6663            let eta = cell.eta(z);
6664            (-(eta * eta_aw(z) * eta_h(z)) + (eta * eta - 1.0) * eta_a(z) * eta_h(z) * eta_w(z))
6665                * (-cell.q(z)).exp()
6666                * INV_TWO_PI
6667        });
6668        let numeric_bhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6669            let eta = cell.eta(z);
6670            (-(eta * (eta_bh(z) * eta_w(z) + eta_bw(z) * eta_h(z)))
6671                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_w(z))
6672                * (-cell.q(z)).exp()
6673                * INV_TWO_PI
6674        });
6675        let numeric_hhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6676            let eta = cell.eta(z);
6677            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6678        });
6679        let numeric_hww = simpson_integral(cell.left, cell.right, 5000, |z| {
6680            let eta = cell.eta(z);
6681            ((eta * eta - 1.0) * eta_h(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6682        });
6683        let numeric_aahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6684            let eta = cell.eta(z);
6685            (-(eta * polynomial_value(&coeff_aaw, z) * eta_h(z))
6686                + (eta * eta - 1.0)
6687                    * (polynomial_value(&dc_daa, z) * eta_h(z) * eta_w(z)
6688                        + 2.0 * eta_aw(z) * eta_a(z) * eta_h(z))
6689                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_a(z) * eta_h(z) * eta_w(z))
6690                * (-cell.q(z)).exp()
6691                * INV_TWO_PI
6692        });
6693        let numeric_hhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6694            let eta = cell.eta(z);
6695            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_w(z) * eta_w(z))
6696                * (-cell.q(z)).exp()
6697                * INV_TWO_PI
6698        });
6699        let numeric_hhhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6700            let eta = cell.eta(z);
6701            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_h(z) * eta_w(z))
6702                * (-cell.q(z)).exp()
6703                * INV_TWO_PI
6704        });
6705        let numeric_abhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6706            let eta = cell.eta(z);
6707            (-(eta * polynomial_value(&coeff_abw, z) * eta_h(z) + eta * eta_aw(z) * eta_bh(z))
6708                + (eta * eta - 1.0)
6709                    * (eta_ab(z) * eta_h(z) * eta_w(z)
6710                        + eta_aw(z) * eta_b(z) * eta_h(z)
6711                        + eta_bh(z) * eta_a(z) * eta_w(z)
6712                        + eta_bw(z) * eta_a(z) * eta_h(z))
6713                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_b(z) * eta_h(z) * eta_w(z))
6714                * (-cell.q(z)).exp()
6715                * INV_TWO_PI
6716        });
6717        let numeric_ahww = simpson_integral(cell.left, cell.right, 5000, |z| {
6718            let eta = cell.eta(z);
6719            (2.0 * (eta * eta - 1.0) * eta_aw(z) * eta_h(z) * eta_w(z)
6720                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_h(z) * eta_w(z) * eta_w(z))
6721                * (-cell.q(z)).exp()
6722                * INV_TWO_PI
6723        });
6724        let numeric_bhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6725            let eta = cell.eta(z);
6726            let h_z = eta_h(z);
6727            let w_z = eta_w(z);
6728            ((eta * eta - 1.0) * (eta_bh(z) * w_z * w_z + 2.0 * eta_bw(z) * h_z * w_z)
6729                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * w_z * w_z)
6730                * (-cell.q(z)).exp()
6731                * INV_TWO_PI
6732        });
6733        let numeric_hwww = simpson_integral(cell.left, cell.right, 5000, |z| {
6734            let eta = cell.eta(z);
6735            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_w(z) * eta_w(z) * eta_w(z))
6736                * (-cell.q(z)).exp()
6737                * INV_TWO_PI
6738        });
6739
6740        assert!((exact_hw - numeric_hw).abs() < 1e-7);
6741        assert!((exact_ahw - numeric_ahw).abs() < 2e-6);
6742        assert!((exact_bhw - numeric_bhw).abs() < 2e-6);
6743        assert!((exact_hhw - numeric_hhw).abs() < 2e-6);
6744        assert!((exact_hww - numeric_hww).abs() < 2e-6);
6745        assert!((exact_aahw - numeric_aahw).abs() < 3e-6);
6746        assert!((exact_hhww - numeric_hhww).abs() < 3e-6);
6747        assert!((exact_hhhw - numeric_hhhw).abs() < 3e-6);
6748        assert!((exact_abhw - numeric_abhw).abs() < 3e-6);
6749        assert!((exact_ahww - numeric_ahww).abs() < 3e-6);
6750        assert!((exact_bhww - numeric_bhww).abs() < 3e-6);
6751        assert!((exact_hwww - numeric_hwww).abs() < 3e-6);
6752    }
6753
/// Compares the scratch-backed cell-moment evaluator with the plain one on
/// three adjoining cubic cells, then replays 5,000 scratch-backed calls and
/// requires the test reallocation counter to still read zero afterwards.
#[test]
fn cell_moment_scratch_reuses_buffers_under_margslope_like_pressure() {
    // Each row is (left, right, c0, c1, c2, c3); consecutive rows share an endpoint.
    let cells = [
        (-1.2, -0.35, 0.18, 0.72, -0.045, 0.018),
        (-0.35, 0.48, -0.08, 0.91, 0.038, -0.014),
        (0.48, 1.4, 0.11, 0.83, 0.022, 0.012),
    ]
    .map(|(left, right, c0, c1, c2, c3)| DenestedCubicCell {
        left,
        right,
        c0,
        c1,
        c2,
        c3,
    });
    let mut scratch = CellMomentScratch::with_capacity(MAX_AFFINE_ANCHOR_DEGREE);

    // Phase 1: both evaluators must agree on branch, value, and every moment.
    for &cell in &cells {
        let reference = evaluate_cell_moments(cell, 9).expect("baseline moments");
        let reused =
            evaluate_cell_moments_with_scratch(cell, 9, &mut scratch).expect("scratch moments");
        assert_eq!(reference.branch, reused.branch);
        assert!((reference.value - reused.value).abs() <= 1e-10);
        assert_eq!(reference.moments.len(), reused.moments.len());
        for (lhs, rhs) in reference.moments.iter().zip(reused.moments.iter()) {
            assert!((lhs - rhs).abs() <= 1e-10, "{lhs} vs {rhs}");
        }
    }

    // Phase 2: round-robin over the cells with the already-used scratch. The
    // checksum only exists so that every result is actually consumed.
    reset_cell_moment_test_reallocs();
    let checksum = cells
        .iter()
        .cycle()
        .take(5_000)
        .fold(0.0_f64, |running, &cell| {
            let state = evaluate_cell_moments_with_scratch(cell, 9, &mut scratch)
                .expect("scratch moments under repeated pressure");
            running + (state.value + state.moments[0] * 1e-12)
        });
    assert!(checksum.is_finite());
    assert_eq!(
        cell_moment_test_reallocs(),
        0,
        "scratch-backed inner cell-moment calls should not grow Vec buffers"
    );
}
6810
/// Cross-checks `evaluate_cell_moments` on a single cubic cell against
/// `simpson_integral`: the `value` field against the integral of
/// `normal_cdf(eta(z)) * normal_pdf(z)`, and moments 0 through 6 against the
/// integrals of `z^k * exp(-q(z))`, each to an absolute tolerance of 1e-9.
#[test]
fn evaluate_cell_moments_matches_numeric_integrals() {
    let cell = DenestedCubicCell {
        c3: -0.07,
        c2: 0.11,
        c1: -0.35,
        c0: 0.15,
        right: 0.8,
        left: -0.9,
    };
    let state = evaluate_cell_moments(cell, 6).expect("cell moments");

    // Cell value versus quadrature of the CDF-weighted normal density.
    let cdf_weighted_density = |z: f64| super::normal_cdf(cell.eta(z)) * normal_pdf(z);
    let value_numeric = simpson_integral(cell.left, cell.right, 4000, cdf_weighted_density);
    assert!((state.value - value_numeric).abs() < 1e-9);

    // Slicing with `..=6` panics if fewer than seven moments were produced,
    // which keeps a short moment vector a hard failure.
    for (degree, &moment) in state.moments[..=6].iter().enumerate() {
        let monomial_weighted = |z: f64| z.powi(degree as i32) * (-cell.q(z)).exp();
        let target = simpson_integral(cell.left, cell.right, 4000, monomial_weighted);
        assert!((moment - target).abs() < 1e-9);
    }
}
6833
/// Builds the de-nested partition twice with the same second argument (0.9)
/// and break lists but different first arguments (0.25 and 0.55), then checks:
/// at least one cell per score interval, contiguous neighbouring cells, at
/// least one left edge that differs between the two builds, and infinite
/// outermost edges.
#[test]
fn partition_builder_moves_link_preimages_with_intercept() {
    let score_breaks = [-2.0, -1.0, 0.0, 1.0, 2.0];
    let link_breaks = [-1.5, -0.5, 0.5, 1.5];

    // Unit-width affine span containing `z`; probes at or beyond 1.0 (and NaN)
    // land in the last span.
    let score_span = |z: f64| {
        let left = match z {
            z if z < -1.0 => -2.0,
            z if z < 0.0 => -1.0,
            z if z < 1.0 => 0.0,
            _ => 1.0,
        };
        Ok(LocalSpanCubic {
            left,
            right: left + 1.0,
            c0: 0.1,
            c1: 0.2,
            c2: 0.0,
            c3: 0.0,
        })
    };
    // Same construction for the link side, with three unit-width spans.
    let link_span = |u: f64| {
        let left = match u {
            u if u < -0.5 => -1.5,
            u if u < 0.5 => -0.5,
            _ => 0.5,
        };
        Ok(LocalSpanCubic {
            left,
            right: left + 1.0,
            c0: -0.05,
            c1: 0.1,
            c2: 0.0,
            c3: 0.0,
        })
    };

    // Only the first argument varies between the two builds.
    let build_cells = |first_arg| {
        build_denested_partition_cells(
            first_arg,
            0.9,
            &score_breaks,
            &link_breaks,
            score_span,
            link_span,
        )
    };
    let cells_a0 = build_cells(0.25).expect("cells a0");
    let cells_a1 = build_cells(0.55).expect("cells a1");

    assert!(cells_a0.len() >= score_breaks.len() - 1);

    let contiguous = cells_a0
        .windows(2)
        .all(|pair| (pair[0].cell.right - pair[1].cell.left).abs() <= 1e-12);
    assert!(contiguous);

    let some_edge_moved = cells_a0
        .iter()
        .zip(cells_a1.iter())
        .any(|(before, after)| (before.cell.left - after.cell.left).abs() > 1e-10);
    assert!(some_edge_moved);

    let leftmost = cells_a0.first().unwrap();
    assert!(leftmost.cell.left.is_infinite());
    let rightmost = cells_a0.last().unwrap();
    assert!(rightmost.cell.right.is_infinite());
}
6907
/// With empty score and link break lists,
/// `build_denested_partition_cells_with_tails` must return exactly one cell
/// running from negative to positive infinity whose `c2` and `c3` are zero to
/// within 1e-12.
#[test]
fn partition_builder_without_breaks_returns_single_global_cell() {
    /// Span provider shared by both probes: a NaN probe is rejected with the
    /// supplied message, anything else gets the all-zero cubic on [0, 1].
    fn zero_span_unless_nan(probe: f64, nan_message: &str) -> Result<LocalSpanCubic, String> {
        if probe.is_nan() {
            return Err(nan_message.to_string());
        }
        Ok(LocalSpanCubic {
            left: 0.0,
            right: 1.0,
            c0: 0.0,
            c1: 0.0,
            c2: 0.0,
            c3: 0.0,
        })
    }

    let cells = build_denested_partition_cells_with_tails(
        0.3,
        -0.4,
        &[],
        &[],
        |z| zero_span_unless_nan(z, "probe z is NaN"),
        |u| zero_span_unless_nan(u, "probe u is NaN"),
    )
    .expect("global cell");

    assert_eq!(cells.len(), 1);
    let global = &cells[0].cell;
    assert_eq!(global.left, f64::NEG_INFINITY);
    assert_eq!(global.right, f64::INFINITY);
    assert!(global.c2.abs() < 1e-12);
    assert!(global.c3.abs() < 1e-12);
}
6949
/// `cell_polynomial_integral_from_moments` must equal `INV_TWO_PI` times the
/// coefficient-weighted sum of the leading cell moments, to within 1e-14.
#[test]
fn polynomial_integral_helper_matches_moment_sum() {
    let cell = DenestedCubicCell {
        c3: 0.03,
        c2: 0.15,
        c1: -0.4,
        c0: 0.2,
        right: 1.25,
        left: -1.5,
    };
    let coeffs = [1.5, -0.25, 0.75, 0.1];
    let state = evaluate_cell_moments(cell, 8).expect("cell moments");

    // Hand-rolled reference: accumulate coeff[k] * moment[k] in ascending k.
    let mut weighted_sum = 0.0_f64;
    for (power, &coeff) in coeffs.iter().enumerate() {
        weighted_sum += coeff * state.moments[power];
    }
    let expected = INV_TWO_PI * weighted_sum;

    let got = cell_polynomial_integral_from_moments(&coeffs, &state.moments, "test poly")
        .expect("poly integral");
    assert!((got - expected).abs() < 1e-14);
}
6972
/// For every listed cell that `branch_cell` does not classify as affine, the
/// state evaluated once at max degree 21 must agree (relative error below
/// 1e-10, denominator floored at 1) with states evaluated directly at degrees
/// 9, 15 and 21, both in `value` and in moments `0..=degree`.
#[test]
fn batched_cell_moment_max_degree_matches_direct_non_affine_grid() {
    /// Relative difference with the denominator clamped to at least 1.
    fn rel_err(got: f64, want: f64) -> f64 {
        (got - want).abs() / want.abs().max(1.0)
    }

    // Each row is (left, right, c0, c1, c2, c3).
    let cells = [
        (-2.0, -0.25, -0.7, 0.8, 0.015, -0.004),
        (-0.5, 0.75, 0.2, -0.35, -0.025, 0.0),
        (0.1, 1.6, 0.4, 0.25, 0.01, 0.006),
        (-1.25, 2.25, -0.1, 0.55, -0.012, 0.003),
    ]
    .map(|(left, right, c0, c1, c2, c3)| DenestedCubicCell {
        left,
        right,
        c0,
        c1,
        c2,
        c3,
    });

    for &cell in &cells {
        let branch = branch_cell(cell).expect("branch");
        if matches!(branch, ExactCellBranch::Affine) {
            continue;
        }
        let batched = evaluate_non_affine_cell_state(cell, branch, 21).expect("degree-21 state");
        for degree in [9usize, 15, 21] {
            let direct =
                evaluate_non_affine_cell_state(cell, branch, degree).expect("direct state");
            assert_eq!(batched.branch, direct.branch);
            assert!(rel_err(batched.value, direct.value) < 1e-10);

            // Range-slicing both sides keeps a too-short moment vector a panic.
            let paired = batched.moments[..=degree]
                .iter()
                .zip(direct.moments[..=degree].iter());
            for (k, (&from_batch, &from_direct)) in paired.enumerate() {
                let rel = rel_err(from_batch, from_direct);
                assert!(
                    rel < 1e-10,
                    "cell={cell:?} degree={degree} moment={k} rel={rel:e}"
                );
            }
        }
    }
}
7033
/// `evaluate_cell_derivative_moments_uncached` must report the same branch and
/// bit-identical moments as `evaluate_cell_moments_uncached` for three cells
/// (one with `c2 = c3 = 0`, one with only `c3 = 0`, one with all coefficients
/// non-zero) at max degrees 4, 9, 15 and 21.
#[test]
fn derivative_moment_evaluator_matches_value_evaluator_moments() {
    // Each row is (left, right, c0, c1, c2, c3).
    let cells = [
        (-2.0, -0.4, 0.15, -0.8, 0.0, 0.0),
        (-0.75, 1.4, -0.25, 0.6, 0.12, 0.0),
        (-1.1, 0.9, 0.35, -0.3, 0.05, -0.015),
    ]
    .map(|(left, right, c0, c1, c2, c3)| DenestedCubicCell {
        left,
        right,
        c0,
        c1,
        c2,
        c3,
    });

    for &cell in &cells {
        for degree in [4usize, 9, 15, 21] {
            let full = evaluate_cell_moments_uncached(cell, degree).expect("full moments");
            let derivative = evaluate_cell_derivative_moments_uncached(cell, degree)
                .expect("derivative moments");
            assert_eq!(full.branch, derivative.branch);
            // Lengths are pinned first, so the zip below covers every moment.
            assert_eq!(full.moments.len(), derivative.moments.len());
            let paired = full.moments.iter().zip(derivative.moments.iter());
            for (value_moment, derivative_moment) in paired {
                assert_eq!(value_moment.to_bits(), derivative_moment.to_bits());
            }
        }
    }
}
7075
/// Sweeps a grid of coefficients, bounds and moment degrees, skipping cells
/// that `branch_cell` classifies as affine, and checks that
/// `evaluate_cell_moments_cached` agrees with `evaluate_non_affine_cell_state`
/// to a relative 1e-10 (denominator floored at 1). A repeated lookup must
/// compare equal to the first, and the stats must end with at least one hit
/// and at least one miss.
#[test]
fn cell_moment_lru_matches_uncached_non_affine_grid() {
    /// Relative difference with the denominator clamped to at least 1.
    fn rel_err(got: f64, want: f64) -> f64 {
        (got - want).abs() / want.abs().max(1.0)
    }

    let cache = CellMomentLruCache::new(16 * 1024 * 1024);
    let stats = CellMomentCacheStats::default();

    // Materialise the coefficient/bounds grid up front, keeping the nesting
    // order c0 -> c1 -> c2 -> c3 -> bounds so cache traffic is unchanged.
    let mut grid = Vec::new();
    for c0 in [-0.75, 0.0, 0.5] {
        for c1 in [-1.2, 0.25, 1.1] {
            for c2 in [-0.18, 0.07] {
                for c3 in [0.0, 0.025] {
                    for (left, right) in [(-2.0, -0.5), (-0.25, 1.5)] {
                        grid.push(DenestedCubicCell {
                            left,
                            right,
                            c0,
                            c1,
                            c2,
                            c3,
                        });
                    }
                }
            }
        }
    }

    for cell in grid {
        for max_degree in [4usize, 9, 15, 21] {
            let branch = branch_cell(cell).expect("branch");
            if matches!(branch, ExactCellBranch::Affine) {
                continue;
            }
            let expected = evaluate_non_affine_cell_state(cell, branch, max_degree)
                .expect("uncached non-affine moments");

            // First lookup of this (cell, degree) pair.
            let got = evaluate_cell_moments_cached(cell, max_degree, &cache, Some(&stats))
                .expect("cached moments");
            assert_eq!(got.branch, expected.branch);
            assert_eq!(got.moments.len(), max_degree + 1);
            assert!(
                rel_err(got.value, expected.value) < 1e-10,
                "value mismatch for {cell:?} degree {max_degree}: got {} expected {}",
                got.value,
                expected.value
            );
            let paired = got.moments.iter().zip(expected.moments.iter());
            for (idx, (&lhs, &rhs)) in paired.enumerate() {
                assert!(
                    rel_err(lhs, rhs) < 1e-10,
                    "moment {idx} mismatch for {cell:?} degree {max_degree}: got {lhs} expected {rhs}"
                );
            }

            // Second lookup with the same arguments must reproduce the first.
            let warm = evaluate_cell_moments_cached(cell, max_degree, &cache, Some(&stats))
                .expect("warm cached moments");
            assert_eq!(warm, got);
        }
    }

    let (hits, misses) = stats.snapshot();
    assert!(hits > 0, "expected warm LRU hits");
    assert!(misses > 0, "expected cold LRU misses");
}
7150
/// Memoises `evaluate_cell_moments` results in a `HashMap` keyed by
/// `cell_moment_cache_key(cell, max_degree, 0.0)` and checks that each stored
/// entry matches a fresh evaluation: same branch, bit-identical value, and
/// bit-identical moments.
#[test]
fn cell_moment_fingerprint_exact_cache_matches_current_evaluator() {
    let make_cell = |left, right, c0, c1, c2, c3| DenestedCubicCell {
        left,
        right,
        c0,
        c1,
        c2,
        c3,
    };
    let cells = [
        make_cell(-1.75, -0.25, 0.15, -0.35, 0.08, -0.015),
        make_cell(-0.5, 0.8, -0.2, 0.45, -0.12, 0.025),
        make_cell(0.1, 1.6, 0.05, 0.2, 0.03, 0.004),
    ];

    let mut memo = std::collections::HashMap::new();
    for max_degree in [0usize, 3, 4, 9, 16] {
        for &cell in &cells {
            let baseline = evaluate_cell_moments(cell, max_degree).expect("baseline moments");
            // The closure only runs when the key is not stored yet.
            let cached = memo
                .entry(cell_moment_cache_key(cell, max_degree, 0.0))
                .or_insert_with(|| {
                    evaluate_cell_moments(cell, max_degree).expect("cached moments")
                });

            assert_eq!(baseline.branch, cached.branch);
            assert_eq!(baseline.value.to_bits(), cached.value.to_bits());
            assert_eq!(baseline.moments.len(), cached.moments.len());
            let bitwise_equal = baseline
                .moments
                .iter()
                .zip(cached.moments.iter())
                .all(|(fresh, stored)| fresh.to_bits() == stored.to_bits());
            assert!(bitwise_equal);
        }
    }
}
7196
/// For each epsilon in {1e-8, 1e-6}: a cell whose six fields are each nudged
/// by `0.001 * epsilon` must map to the same `cell_moment_cache_key` as the
/// unperturbed cell, and the largest relative moment difference between the
/// two (denominator floored at 1) must not exceed `10 * epsilon`.
#[test]
fn fuzzy_cell_moment_fingerprint_error_scales_with_epsilon() {
    // The reference cell does not depend on epsilon, so build it once.
    let base = DenestedCubicCell {
        c3: -0.006,
        c2: 0.04,
        c1: -0.25,
        c0: 0.1,
        right: 1.1,
        left: -1.25,
    };

    for epsilon in [1e-8, 1e-6] {
        // Same magnitude for every field; the sign alternates field by field.
        let nudge = 0.001 * epsilon;
        let perturbed = DenestedCubicCell {
            left: base.left + nudge,
            right: base.right - nudge,
            c0: base.c0 + nudge,
            c1: base.c1 - nudge,
            c2: base.c2 + nudge,
            c3: base.c3 - nudge,
        };
        assert_eq!(
            cell_moment_cache_key(base, 9, epsilon),
            cell_moment_cache_key(perturbed, 9, epsilon)
        );

        let lhs = evaluate_cell_moments(base, 9).expect("base moments");
        let rhs = evaluate_cell_moments(perturbed, 9).expect("perturbed moments");
        let mut max_rel = 0.0_f64;
        for (a, b) in lhs.moments.iter().zip(rhs.moments.iter()) {
            let scale = a.abs().max(b.abs()).max(1.0);
            max_rel = max_rel.max((a - b).abs() / scale);
        }
        assert!(
            max_rel <= 10.0 * epsilon,
            "epsilon={epsilon:.1e} max_rel={max_rel:.3e}"
        );
    }
}
7234
7235    /// Locks in numerical equivalence of the optimized
7236    /// `evaluate_non_affine_cell_state` against an inline reference
7237    /// implementation that mirrors the prior pre-fold structure
7238    /// (separate `cell.eta(z)` / `cell.q(z)` calls; post-loop
7239    /// `* half_width`; trailing `value_integral * half_width / sqrt(TAU)`).
7240    /// Any drift larger than 1e-13 relative would indicate the hot-path
7241    /// rewrite changed the math.
7242    #[test]
7243    fn non_affine_cell_state_matches_prefold_reference_to_1e_minus_13() {
7244        // Reference: byte-for-byte the structure of the previous
7245        // implementation. Kept local to this test to avoid leaking a second
7246        // public surface.
7247        fn reference(
7248            cell: DenestedCubicCell,
7249            branch: ExactCellBranch,
7250            max_degree: usize,
7251        ) -> CellMomentState {
7252            let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
7253            let mut value_integral = 0.0_f64;
7254            let center = 0.5 * (cell.left + cell.right);
7255            let half_width = 0.5 * (cell.right - cell.left);
7256            for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
7257                let z = center + half_width * node;
7258                let eta = cell.eta(z);
7259                let moment_weight = weight * (-cell.q(z)).exp();
7260                let mut z_pow = 1.0_f64;
7261                for moment in &mut moments {
7262                    *moment = moment_weight.mul_add(z_pow, *moment);
7263                    z_pow *= z;
7264                }
7265                value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
7266            }
7267            for moment in &mut moments {
7268                *moment *= half_width;
7269            }
7270            CellMomentState {
7271                branch,
7272                value: value_integral * half_width / (std::f64::consts::TAU).sqrt(),
7273                moments,
7274            }
7275        }
7276
7277        // Hand-rolled inputs that cross both Quartic and Sextic branches and
7278        // exercise positive/negative coefficients, asymmetric intervals, and
7279        // a wide degree range (matches survival_marginal_slope's degree=9
7280        // production call as well as the bernoulli outer-step degree=24).
7281        let cells = [
7282            DenestedCubicCell {
7283                left: -1.25,
7284                right: -0.2,
7285                c0: -0.35,
7286                c1: 0.85,
7287                c2: 0.04,
7288                c3: -0.015,
7289            },
7290            DenestedCubicCell {
7291                left: -0.2,
7292                right: 0.55,
7293                c0: 0.12,
7294                c1: -0.65,
7295                c2: -0.025,
7296                c3: 0.02,
7297            },
7298            DenestedCubicCell {
7299                left: 0.55,
7300                right: 1.6,
7301                c0: 0.42,
7302                c1: 0.35,
7303                c2: 0.018,
7304                c3: 0.012,
7305            },
7306            DenestedCubicCell {
7307                left: -3.0,
7308                right: -1.0,
7309                c0: 1.7,
7310                c1: -0.4,
7311                c2: 0.11,
7312                c3: -0.07,
7313            },
7314        ];
7315        let degrees = [0_usize, 4, 9, 16, 24];
7316        for cell in cells {
7317            let branch = branch_cell(cell).expect("branch");
7318            assert_ne!(branch, ExactCellBranch::Affine);
7319            for max_degree in degrees {
7320                let actual = evaluate_non_affine_cell_state(cell, branch, max_degree)
7321                    .expect("optimized non-affine");
7322                let expected = reference(cell, branch, max_degree);
7323                assert_eq!(actual.branch, expected.branch);
7324                assert_eq!(actual.moments.len(), expected.moments.len());
7325                let denom_v = expected.value.abs().max(1.0);
7326                let rel_v = (actual.value - expected.value).abs() / denom_v;
7327                let actual_v = actual.value;
7328                let expected_v = expected.value;
7329                assert!(
7330                    rel_v <= 1e-13,
7331                    "value rel mismatch for {cell:?} degree {max_degree}: \
7332                     actual={actual_v:.17e} expected={expected_v:.17e} rel={rel_v:.3e}"
7333                );
7334                for (k, (lhs, rhs)) in actual
7335                    .moments
7336                    .iter()
7337                    .zip(expected.moments.iter())
7338                    .enumerate()
7339                {
7340                    let denom = rhs.abs().max(1.0);
7341                    let rel = (lhs - rhs).abs() / denom;
7342                    assert!(
7343                        rel <= 1e-13,
7344                        "moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7345                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7346                    );
7347                }
7348
7349                // Also lock in the derivative-state path on the same
7350                // inputs so the (parallel) edit there can't drift.
7351                let actual_deriv =
7352                    evaluate_non_affine_cell_derivative_state(cell, branch, max_degree)
7353                        .expect("optimized derivative");
7354                for (k, (lhs, rhs)) in actual_deriv
7355                    .moments
7356                    .iter()
7357                    .zip(expected.moments.iter())
7358                    .enumerate()
7359                {
7360                    let denom = rhs.abs().max(1.0);
7361                    let rel = (lhs - rhs).abs() / denom;
7362                    assert!(
7363                        rel <= 1e-13,
7364                        "deriv moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7365                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7366                    );
7367                }
7368            }
7369        }
7370    }
7371
7372    /// DECISIVE: the third-derivative kernel must equal the FD of the
7373    /// second-derivative kernel w.r.t. a parameter that perturbs `eta`,
7374    /// RE-EVALUATING the moments at each step (the moments depend on `eta`
7375    /// via the `exp(-q)` weight). This isolates the kernel from all survival
7376    /// partition/cross machinery (gam#979 f_uv_dir localization).
7377    #[test]
7378    fn third_derivative_kernel_matches_fd_of_second_with_eta_perturbation() {
7379        // A finite, non-affine cell.
7380        let base = DenestedCubicCell {
7381            left: -0.6,
7382            right: 0.9,
7383            c0: 0.30,
7384            c1: 0.45,
7385            c2: -0.20,
7386            c3: 0.12,
7387        };
7388        // Synthetic parameter directions as cubic-in-z perturbations of eta:
7389        //   eta_u = ∂eta/∂u, eta_v = ∂eta/∂v, eta_t = ∂eta/∂t (the dir).
7390        let eta_u = [0.11_f64, -0.07, 0.05, 0.02];
7391        let eta_v = [-0.09_f64, 0.13, -0.04, 0.03];
7392        let eta_t = [0.17_f64, 0.06, -0.10, 0.04]; // the "b-like" direction
7393        // Second crosses ∂²eta/∂{·}{·} (pick small non-zero cubics).
7394        let eta_uv = [0.02_f64, 0.01, -0.015, 0.005];
7395        let eta_ut = [-0.01_f64, 0.02, 0.007, -0.003];
7396        let eta_vt = [0.015_f64, -0.008, 0.01, 0.004];
7397        // Third cross ∂³eta/∂u∂v∂t.
7398        let eta_uvt = [0.003_f64, -0.002, 0.001, 0.0005];
7399
7400        let neg = |a: &[f64; 4]| a.map(|v| -v);
7401        let max_degree = 15usize;
7402
7403        // f_uv(s) where param s shifts eta by s·(eta_t + ½ s²... ) — here we
7404        // build the cell at eta + s·eta_t + s²·eta_vt-style is NOT needed; we
7405        // only need the t-direction to first order for ∂/∂t. To FD ∂(f_uv)/∂t
7406        // we perturb eta along eta_t AND carry the s-dependence of the u,v
7407        // crosses: eta_u(s)=eta_u + s·eta_ut, eta_v(s)=eta_v + s·eta_vt,
7408        // eta_uv(s)=eta_uv + s·eta_uvt. The cell cubic shifts by s·eta_t.
7409        let f_uv_at = |s: f64| -> f64 {
7410            let cell_s = DenestedCubicCell {
7411                c0: base.c0 + s * eta_t[0],
7412                c1: base.c1 + s * eta_t[1],
7413                c2: base.c2 + s * eta_t[2],
7414                c3: base.c3 + s * eta_t[3],
7415                ..base
7416            };
7417            // Moments MUST be recomputed at the perturbed eta.
7418            let st = evaluate_cell_moments(cell_s, max_degree).unwrap();
7419            let neg_cell = DenestedCubicCell {
7420                c0: -cell_s.c0,
7421                c1: -cell_s.c1,
7422                c2: -cell_s.c2,
7423                c3: -cell_s.c3,
7424                ..cell_s
7425            };
7426            let u_s = [
7427                eta_u[0] + s * eta_ut[0],
7428                eta_u[1] + s * eta_ut[1],
7429                eta_u[2] + s * eta_ut[2],
7430                eta_u[3] + s * eta_ut[3],
7431            ];
7432            let v_s = [
7433                eta_v[0] + s * eta_vt[0],
7434                eta_v[1] + s * eta_vt[1],
7435                eta_v[2] + s * eta_vt[2],
7436                eta_v[3] + s * eta_vt[3],
7437            ];
7438            let uv_s = [
7439                eta_uv[0] + s * eta_uvt[0],
7440                eta_uv[1] + s * eta_uvt[1],
7441                eta_uv[2] + s * eta_uvt[2],
7442                eta_uv[3] + s * eta_uvt[3],
7443            ];
7444            cell_second_derivative_from_moments(
7445                neg_cell,
7446                &neg(&u_s),
7447                &neg(&v_s),
7448                &neg(&uv_s),
7449                &st.moments,
7450            )
7451            .unwrap()
7452        };
7453
7454        let h = 1e-5;
7455        let fd = (f_uv_at(h) - f_uv_at(-h)) / (2.0 * h);
7456
7457        // Analytic third via the kernel (negated cell + negated crosses, as the
7458        // survival path does).
7459        let st0 = evaluate_cell_moments(base, max_degree).unwrap();
7460        let neg_cell0 = DenestedCubicCell {
7461            c0: -base.c0,
7462            c1: -base.c1,
7463            c2: -base.c2,
7464            c3: -base.c3,
7465            ..base
7466        };
7467        let analytic = cell_third_derivative_from_moments(
7468            neg_cell0,
7469            &neg(&eta_u),
7470            &neg(&eta_v),
7471            &neg(&eta_t),
7472            &neg(&eta_uv),
7473            &neg(&eta_ut),
7474            &neg(&eta_vt),
7475            &neg(&eta_uvt),
7476            &st0.moments,
7477        )
7478        .unwrap();
7479
7480        let denom = fd.abs().max(1e-3);
7481        let rel = (analytic - fd).abs() / denom;
7482        assert!(
7483            rel <= 1e-5,
7484            "third kernel vs FD-of-second mismatch: analytic={analytic:.12e} fd={fd:.12e} rel={rel:.3e}"
7485        );
7486    }
7487
7488    #[test]
7489    fn moving_shared_edge_second_integral_derivative_has_leibniz_jump_sign() {
7490        let edge0 = 0.2_f64;
7491        let edge_velocity = -0.37_f64;
7492
7493        let left_eta = [0.22_f64, -0.18, 0.09, 0.03];
7494        let right_eta = [-0.11_f64, 0.26, -0.04, 0.02];
7495        let left_r = [0.08_f64, -0.05, 0.03, 0.01];
7496        let left_s = [-0.06_f64, 0.04, 0.02, -0.015];
7497        let left_rs = [0.025_f64, -0.012, 0.006, 0.004];
7498        let right_r = [-0.03_f64, 0.07, -0.02, 0.012];
7499        let right_s = [0.05_f64, -0.025, 0.018, 0.007];
7500        let right_rs = [-0.018_f64, 0.014, -0.005, 0.003];
7501
7502        let integral_at = |shift: f64| -> f64 {
7503            let edge = edge0 + edge_velocity * shift;
7504            let left = DenestedCubicCell {
7505                left: -0.7,
7506                right: edge,
7507                c0: left_eta[0],
7508                c1: left_eta[1],
7509                c2: left_eta[2],
7510                c3: left_eta[3],
7511            };
7512            let right = DenestedCubicCell {
7513                left: edge,
7514                right: 1.1,
7515                c0: right_eta[0],
7516                c1: right_eta[1],
7517                c2: right_eta[2],
7518                c3: right_eta[3],
7519            };
7520            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7521            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7522            cell_second_derivative_from_moments(
7523                left,
7524                &left_r,
7525                &left_s,
7526                &left_rs,
7527                &left_state.moments,
7528            )
7529            .expect("left second")
7530                + cell_second_derivative_from_moments(
7531                    right,
7532                    &right_r,
7533                    &right_s,
7534                    &right_rs,
7535                    &right_state.moments,
7536                )
7537                .expect("right second")
7538        };
7539
7540        let h = 1e-5;
7541        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7542
7543        let left = DenestedCubicCell {
7544            left: -0.7,
7545            right: edge0,
7546            c0: left_eta[0],
7547            c1: left_eta[1],
7548            c2: left_eta[2],
7549            c3: left_eta[3],
7550        };
7551        let right = DenestedCubicCell {
7552            left: edge0,
7553            right: 1.1,
7554            c0: right_eta[0],
7555            c1: right_eta[1],
7556            c2: right_eta[2],
7557            c3: right_eta[3],
7558        };
7559        let f_left =
7560            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7561        let f_right =
7562            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7563        let analytic = edge_velocity * (f_left - f_right);
7564
7565        let denom = analytic.abs().max(1e-8);
7566        let rel = (fd - analytic).abs() / denom;
7567        assert!(
7568            rel <= 5e-8,
7569            "moving edge sign mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7570        );
7571    }
7572
7573    #[test]
7574    fn moving_shared_edge_second_integral_mixed_derivative_has_full_leibniz_terms() {
7575        let edge0 = -0.15_f64;
7576        let edge_d1 = 0.31_f64;
7577        let edge_d2 = -0.27_f64;
7578        let edge_d12 = 0.19_f64;
7579
7580        let left_eta = [0.16_f64, -0.21, 0.07, -0.025];
7581        let right_eta = [-0.09_f64, 0.18, -0.055, 0.018];
7582        let left_r = [0.075_f64, -0.045, 0.018, 0.009];
7583        let left_s = [-0.052_f64, 0.033, 0.014, -0.011];
7584        let left_rs = [0.021_f64, -0.009, 0.005, 0.0025];
7585        let right_r = [-0.028_f64, 0.063, -0.017, 0.010];
7586        let right_s = [0.047_f64, -0.023, 0.016, 0.006];
7587        let right_rs = [-0.015_f64, 0.012, -0.004, 0.002];
7588
7589        let integral_at = |s1: f64, s2: f64| -> f64 {
7590            let edge = edge0 + edge_d1 * s1 + edge_d2 * s2 + edge_d12 * s1 * s2;
7591            let left = DenestedCubicCell {
7592                left: -0.8,
7593                right: edge,
7594                c0: left_eta[0],
7595                c1: left_eta[1],
7596                c2: left_eta[2],
7597                c3: left_eta[3],
7598            };
7599            let right = DenestedCubicCell {
7600                left: edge,
7601                right: 0.9,
7602                c0: right_eta[0],
7603                c1: right_eta[1],
7604                c2: right_eta[2],
7605                c3: right_eta[3],
7606            };
7607            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7608            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7609            cell_second_derivative_from_moments(
7610                left,
7611                &left_r,
7612                &left_s,
7613                &left_rs,
7614                &left_state.moments,
7615            )
7616            .expect("left second")
7617                + cell_second_derivative_from_moments(
7618                    right,
7619                    &right_r,
7620                    &right_s,
7621                    &right_rs,
7622                    &right_state.moments,
7623                )
7624                .expect("right second")
7625        };
7626
7627        let h = 2e-4;
7628        let fd = (integral_at(h, h) - integral_at(h, -h) - integral_at(-h, h)
7629            + integral_at(-h, -h))
7630            / (4.0 * h * h);
7631
7632        let left = DenestedCubicCell {
7633            left: -0.8,
7634            right: edge0,
7635            c0: left_eta[0],
7636            c1: left_eta[1],
7637            c2: left_eta[2],
7638            c3: left_eta[3],
7639        };
7640        let right = DenestedCubicCell {
7641            left: edge0,
7642            right: 0.9,
7643            c0: right_eta[0],
7644            c1: right_eta[1],
7645            c2: right_eta[2],
7646            c3: right_eta[3],
7647        };
7648
7649        let boundary_z_derivative =
7650            |cell: DenestedCubicCell, r: &[f64], s: &[f64], rs: &[f64]| -> f64 {
7651                let eta = cell.eta(edge0);
7652                let eta_z = cell.c1 + 2.0 * cell.c2 * edge0 + 3.0 * cell.c3 * edge0 * edge0;
7653                let cr = poly_eval_at(r, edge0);
7654                let cs = poly_eval_at(s, edge0);
7655                let crs = poly_eval_at(rs, edge0);
7656                let cr_z = r.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7657                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7658                });
7659                let cs_z = s.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7660                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7661                });
7662                let crs_z = rs.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7663                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7664                });
7665                let amp = crs - eta * cr * cs;
7666                let amp_z = crs_z - eta_z * cr * cs - eta * cr_z * cs - eta * cr * cs_z;
7667                let q_z = edge0 + eta * eta_z;
7668                (amp_z - amp * q_z) * (-cell.q(edge0)).exp() * INV_TWO_PI
7669            };
7670
7671        let f_left =
7672            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7673        let f_right =
7674            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7675        let fz_left = boundary_z_derivative(left, &left_r, &left_s, &left_rs);
7676        let fz_right = boundary_z_derivative(right, &right_r, &right_s, &right_rs);
7677        let analytic = edge_d12 * (f_left - f_right) + edge_d1 * edge_d2 * (fz_left - fz_right);
7678
7679        let denom = analytic.abs().max(1e-8);
7680        let rel = (fd - analytic).abs() / denom;
7681        assert!(
7682            rel <= 2e-7,
7683            "moving edge mixed term mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7684        );
7685    }
7686
7687    // gam#1454 resolution. The reported defect ("survival flex directional
7688    // third[g,w0] wrong: candidate f_au_dir/f_aa_dir missing self-flux") posited
7689    // a MISSING third-order Leibniz self-flux at the moving link-knot crossings.
7690    // This regression establishes the two facts that, together, prove the
7691    // implicit-intercept third-order tower
7692    // (`row_primary_third_contracted_recompute*`) is CORRECT to add no such flux:
7693    //
7694    //   (1) The third-derivative integrand `F_rst` genuinely DOES jump across a
7695    //       C²-link knot — its third coefficient slice carries `c_rst ∝ 6·α₃`,
7696    //       and `α₃` (the spline's third `z`-derivative) is the one piece a C²
7697    //       cubic spline leaves discontinuous. So the jump is real and the
7698    //       `cell_third_derivative_boundary_integrand` flux formula is exact
7699    //       (verified by FD of a direct ∂/∂edge of the third-integral sum —
7700    //       a FOURTH-order scenario that pins the integrand, not the tower).
7701    //
7702    //   (2) Every boundary term in the Leibniz expansion of a THIRD derivative,
7703    //       however, evaluates an integrand of order ≤ 2 at the moving edge
7704    //       (one of the three differentiations is spent moving the boundary).
7705    //       The second-derivative integrand `F_rs` is CONTINUOUS across the same
7706    //       C² knot (its slices reach at most `α₂ + 3α₃·shift`, i.e. ½·η''(u*),
7707    //       which a C² spline keeps continuous). Hence the shared-edge flux
7708    //       `velocity·(F_rs^L − F_rs^R)` telescopes to ZERO, and the tower's
7709    //       third-order self-flux is a genuine no-op. The real residual lives in
7710    //       the interior implicit-intercept assembly, not at the boundary.
7711    #[test]
7712    fn third_order_self_flux_telescopes_but_third_integrand_jumps_at_c2_knot_1454() {
7713        let edge0 = 0.13_f64;
7714        let edge_velocity = -0.41_f64;
7715
7716        // Build η continuous to C² at edge0 but with a jump in the cubic (3rd
7717        // derivative) coefficient. Pick the left cubic freely; choose the right
7718        // cubic to match value+1st+2nd derivative at edge0, then perturb its c3.
7719        let left_eta = [0.18_f64, -0.12, 0.07, 0.04];
7720        let right_c3 = 0.04_f64 + 0.09; // α₃ jump across the knot.
7721        // Match η, η', η'' at edge0 for the right piece given its c3:
7722        //   η(z)  = c0 + c1 z + c2 z² + c3 z³
7723        //   η'(z) = c1 + 2 c2 z + 3 c3 z²
7724        //   η''(z)= 2 c2 + 6 c3 z
7725        // Solve right (c0,c1,c2) so the three values equal the left ones at edge0.
7726        let l0 = left_eta[0];
7727        let l1 = left_eta[1];
7728        let l2 = left_eta[2];
7729        let l3 = left_eta[3];
7730        let e = edge0;
7731        let eta_val = l0 + l1 * e + l2 * e * e + l3 * e * e * e;
7732        let eta_d1 = l1 + 2.0 * l2 * e + 3.0 * l3 * e * e;
7733        let eta_d2 = 2.0 * l2 + 6.0 * l3 * e;
7734        let rc2 = (eta_d2 - 6.0 * right_c3 * e) / 2.0;
7735        let rc1 = eta_d1 - 2.0 * rc2 * e - 3.0 * right_c3 * e * e;
7736        let rc0 = eta_val - rc1 * e - rc2 * e * e - right_c3 * e * e * e;
7737        let right_eta = [rc0, rc1, rc2, right_c3];
7738
7739        // Coefficient slices. The first/second slices we keep continuous at the
7740        // edge (mimicking c_r=1+η', c_rs∝η'' which a C² spline matches), so the
7741        // 2nd-order flux would cancel. The third-order slice `rst` carries the
7742        // jumping α₃ and is DIFFERENT across the edge — this is the term that
7743        // breaks cancellation.
7744        let common_r = [0.06_f64, -0.04, 0.02, 0.0];
7745        let common_s = [-0.05_f64, 0.03, 0.015, 0.0];
7746        let common_t = [0.08_f64, 0.05, -0.03, 0.0];
7747        let common_rs = [0.02_f64, -0.01, 0.005, 0.0];
7748        let common_rt = [-0.012_f64, 0.008, 0.004, 0.0];
7749        let common_st = [0.015_f64, -0.006, 0.003, 0.0];
7750        // rst ∝ 6·α₃ in the real path: left and right differ by the α₃ jump.
7751        let left_rst = [6.0 * l3, 0.0, 0.0, 0.0];
7752        let right_rst = [6.0 * right_c3, 0.0, 0.0, 0.0];
7753
7754        let max_degree = 15usize;
7755        let neg = |a: &[f64; 4]| a.map(|v| -v);
7756
7757        // The integral sum over the two cells sharing the moving edge, computed
7758        // via the fixed-domain moment reduction with the SURVIVAL/probit sign
7759        // convention (negated cell + negated coefficient slices), exactly as the
7760        // production `row_primary_third_contracted_recompute` path does.
7761        let integral_at = |shift: f64| -> f64 {
7762            let edge = edge0 + edge_velocity * shift;
7763            let left = DenestedCubicCell {
7764                left: -0.7,
7765                right: edge,
7766                c0: left_eta[0],
7767                c1: left_eta[1],
7768                c2: left_eta[2],
7769                c3: left_eta[3],
7770            };
7771            let right = DenestedCubicCell {
7772                left: edge,
7773                right: 1.0,
7774                c0: right_eta[0],
7775                c1: right_eta[1],
7776                c2: right_eta[2],
7777                c3: right_eta[3],
7778            };
7779            let lst = evaluate_cell_moments(left, max_degree).unwrap();
7780            let rst_m = evaluate_cell_moments(right, max_degree).unwrap();
7781            let neg_left = DenestedCubicCell {
7782                c0: -left.c0,
7783                c1: -left.c1,
7784                c2: -left.c2,
7785                c3: -left.c3,
7786                ..left
7787            };
7788            let neg_right = DenestedCubicCell {
7789                c0: -right.c0,
7790                c1: -right.c1,
7791                c2: -right.c2,
7792                c3: -right.c3,
7793                ..right
7794            };
7795            let li = cell_third_derivative_from_moments(
7796                neg_left,
7797                &neg(&common_r),
7798                &neg(&common_s),
7799                &neg(&common_t),
7800                &neg(&common_rs),
7801                &neg(&common_rt),
7802                &neg(&common_st),
7803                &neg(&left_rst),
7804                &lst.moments,
7805            )
7806            .unwrap();
7807            let ri = cell_third_derivative_from_moments(
7808                neg_right,
7809                &neg(&common_r),
7810                &neg(&common_s),
7811                &neg(&common_t),
7812                &neg(&common_rs),
7813                &neg(&common_rt),
7814                &neg(&common_st),
7815                &neg(&right_rst),
7816                &rst_m.moments,
7817            )
7818            .unwrap();
7819            li + ri
7820        };
7821
7822        let h = 1e-5;
7823        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7824
7825        // Fixed-domain part: differentiate ONLY the integrands (domain frozen at
7826        // edge0). Its directional derivative is the analytic Leibniz flux alone,
7827        // since the integrand coefficients here are edge-independent:
7828        //   flux = velocity · ( F_rst^L(edge0) − F_rst^R(edge0) ).
7829        //
7830        // CONVENTION: the finite-difference `integral_at` above integrates the
7831        // SURVIVAL/probit sign convention — negated cell (η→−η) AND negated
7832        // coefficient slices — exactly as the production
7833        // `row_primary_third_contracted_recompute` path does. The Leibniz
7834        // boundary integrand must therefore be evaluated in that SAME negated
7835        // convention: the third-derivative integrand is ODD under the joint
7836        // (η→−η, coeff→−coeff) negation (its `rst`, `η·rs·t`, and `(η²−1)·r·s·t`
7837        // terms each flip sign an odd number of times), so evaluating the flux
7838        // with un-negated cells/coeffs yields exactly the opposite sign and the
7839        // Leibniz identity `fd = flux` fails as `fd = −flux`. (The
7840        // second-derivative sibling test `moving_shared_edge_second_integral_
7841        // derivative_has_leibniz_jump_sign` keeps BOTH sides un-negated and so
7842        // stays self-consistent; this test keeps BOTH sides negated.)
7843        let neg_eta = |eta: &[f64; 4]| [-eta[0], -eta[1], -eta[2], -eta[3]];
7844        let left_eta_neg = neg_eta(&left_eta);
7845        let right_eta_neg = neg_eta(&right_eta);
7846        let left0 = DenestedCubicCell {
7847            left: -0.7,
7848            right: edge0,
7849            c0: left_eta_neg[0],
7850            c1: left_eta_neg[1],
7851            c2: left_eta_neg[2],
7852            c3: left_eta_neg[3],
7853        };
7854        let right0 = DenestedCubicCell {
7855            left: edge0,
7856            right: 1.0,
7857            c0: right_eta_neg[0],
7858            c1: right_eta_neg[1],
7859            c2: right_eta_neg[2],
7860            c3: right_eta_neg[3],
7861        };
7862        let f_left = cell_third_derivative_boundary_integrand(
7863            left0,
7864            &neg(&common_r),
7865            &neg(&common_s),
7866            &neg(&common_t),
7867            &neg(&common_rs),
7868            &neg(&common_rt),
7869            &neg(&common_st),
7870            &neg(&left_rst),
7871            edge0,
7872        );
7873        let f_right = cell_third_derivative_boundary_integrand(
7874            right0,
7875            &neg(&common_r),
7876            &neg(&common_s),
7877            &neg(&common_t),
7878            &neg(&common_rs),
7879            &neg(&common_rt),
7880            &neg(&common_st),
7881            &neg(&right_rst),
7882            edge0,
7883        );
7884
7885        // The integrand DOES jump across this C² knot (the α₃ third-coefficient
7886        // term is the only discontinuous piece). Confirm the jump is genuine —
7887        // if it were zero the flux would be a no-op and #1454 would not exist.
7888        let jump = f_left - f_right;
7889        assert!(
7890            jump.abs() > 1e-4,
7891            "third-derivative integrand must jump across the C² knot (α₃ discontinuity); \
7892             got jump={jump:.3e}"
7893        );
7894
7895        let analytic_flux = edge_velocity * jump;
7896        let denom = fd.abs().max(1e-6);
7897        let rel = (fd - analytic_flux).abs() / denom;
7898        assert!(
7899            rel <= 1e-5,
7900            "moving-edge third-derivative flux mismatch (#1454): fd={fd:.12e} \
7901             analytic_flux={analytic_flux:.12e} rel={rel:.3e}"
7902        );
7903
7904        // ---- Fact (2): the SECOND-derivative integrand telescopes to zero. ----
7905        // A 3rd-derivative Leibniz boundary term spends one differentiation on
7906        // the moving edge and evaluates a ≤2nd-order integrand there. The
7907        // hardest such term is the slope-slope Hessian integrand `F_bb`, whose
7908        // coefficient slice is the link cubic's b-b partial
7909        //   dc_dbb(z) = [0, 0, 2(α₂ + 3 α₃·shift), 6 α₃·b]·(z⁰..z³)
7910        //             = z²·η''(u),  with u = a + b·z, shift = a − knot.
7911        // Across a C² knot α₂, α₃, and `shift` all jump, yet η''(u*) is
7912        // continuous — so the EVALUATED slice `c_bb(z*) = z*²·η''(u*)` matches on
7913        // both sides and `F_bb` is continuous. Build the two pieces' raw dc_dbb
7914        // decompositions from `link_cubic_second_partials` and confirm the
7915        // second-derivative integrand carries no jump (flux telescopes to 0).
7916        let a_row = 0.21_f64;
7917        let b_row = 1.37_f64;
7918        let knot = a_row + b_row * edge0; // u-location of the crossing.
7919        // Left/right link pieces: choose α₂,α₃ freely on the left; pick the
7920        // right piece's α₂ so η''(knot) is continuous given a jumped α₃.
7921        let left_link = LocalSpanCubic {
7922            left: knot - 0.6,
7923            right: knot + 0.6,
7924            c0: 0.0,
7925            c1: 0.0,
7926            c2: 0.08,
7927            c3: -0.05,
7928        };
7929        let right_alpha3 = -0.05_f64 + 0.11; // α₃ jump.
7930        // η''(knot) continuity:  2α₂ᴸ + 6α₃ᴸ·(knot−leftᴸ) = 2α₂ᴿ + 6α₃ᴿ·(knot−leftᴿ).
7931        let right_left_coord = knot - 0.4;
7932        let lhs = 2.0 * left_link.c2 + 6.0 * left_link.c3 * (knot - left_link.left);
7933        let right_alpha2 = (lhs - 6.0 * right_alpha3 * (knot - right_left_coord)) / 2.0;
7934        let right_link = LocalSpanCubic {
7935            left: right_left_coord,
7936            right: right_left_coord + 0.8,
7937            c0: 0.0,
7938            c1: 0.0,
7939            c2: right_alpha2,
7940            c3: right_alpha3,
7941        };
7942        let (_, _, dc_dbb_left) = link_cubic_second_partials(left_link, a_row, b_row);
7943        let (_, _, dc_dbb_right) = link_cubic_second_partials(right_link, a_row, b_row);
7944        // The per-coefficient arrays differ (α₃ jumped)...
7945        assert!(
7946            (dc_dbb_left[3] - dc_dbb_right[3]).abs() > 1e-3,
7947            "α₃ jump must make the raw dc_dbb coefficient arrays differ"
7948        );
7949        // ...but the EVALUATED second-order slice at the crossing matches, so the
7950        // F_bb boundary integrand carries no jump and the flux telescopes to 0.
7951        let c_bb_left = poly_eval_at(&dc_dbb_left, edge0);
7952        let c_bb_right = poly_eval_at(&dc_dbb_right, edge0);
7953        assert!(
7954            (c_bb_left - c_bb_right).abs() <= 1e-12,
7955            "second-derivative slope-slope integrand must be CONTINUOUS across the \
7956             C² knot (telescoping self-flux): left={c_bb_left:.15e} right={c_bb_right:.15e}"
7957        );
7958    }
7959}
gam_model_kernels/cubic_cell_kernel.rs

gam_model_kernels/
cubic_cell_kernel.rs