gam_model_kernels/
cubic_cell_kernel.rs

1use gam_math::probability::normal_cdf;
2use gam_runtime::resource::{ByteLruCache, ResidentBytes};
3use smallvec::{SmallVec, smallvec};
4use std::hash::{Hash, Hasher};
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7
/// Typed errors raised by the de-nested cubic transport kernel.
///
/// Sibling families (`bernoulli_marginal_slope`, `survival_marginal_slope`,
/// `marginal_slope_shared`) currently consume the kernel's public surface via
/// `Result<_, String>`. To stay source-compatible, the kernel converts errors
/// to `String` at the boundary via `From<CubicCellKernelError> for String` and
/// keeps the public function signatures returning `Result<_, String>`.
/// `Display` is exact-byte-equivalent to the previous `format!(...)` strings.
///
/// Every variant has the same shape: a single owned `reason: String` payload.
/// The variant only classifies the failure; the text lives in `reason`.
#[derive(Clone, Debug)]
pub enum CubicCellKernelError {
    /// Interval probe / cell-bounds preconditions (ordered bounds, supported
    /// infinity patterns, positive finite width).
    InvalidInterval { reason: String },
    /// Cell-shape / branch-classification failure: tail cells not affine,
    /// finite cells with non-positive width, non-finite affine coefficients,
    /// non-affine cell with infinite bounds, leading-coefficient degeneracy
    /// in the moment recurrence, etc.
    InvalidCellShape { reason: String },
    /// Reduced moment vector (or polynomial-convolution scratch) is shorter
    /// than the polynomial degree the leaf needs to evaluate.
    InsufficientMoments { reason: String },
    /// Bivariate-normal CDF domain validation (non-finite/non-infinite
    /// argument, non-finite correlation).
    BivariateNormalDomain { reason: String },
}
33
// Expands the project-defined `impl_reason_error_boilerplate!` macro for the
// four `reason`-carrying variants of `CubicCellKernelError`.
// NOTE(review): the macro body is not visible from this file; presumably it
// generates the `Display` / `From<CubicCellKernelError> for String` impls that
// the enum documentation refers to — confirm at the macro definition. The
// variant list here must be kept in sync with the enum by hand.
impl_reason_error_boilerplate! {
    CubicCellKernelError {
        InvalidInterval,
        InvalidCellShape,
        InsufficientMoments,
        BivariateNormalDomain,
    }
}
42
43impl CubicCellKernelError {
44    #[inline]
45    fn invalid_interval(reason: impl Into<String>) -> Self {
46        CubicCellKernelError::InvalidInterval {
47            reason: reason.into(),
48        }
49    }
50    #[inline]
51    fn invalid_cell_shape(reason: impl Into<String>) -> Self {
52        CubicCellKernelError::InvalidCellShape {
53            reason: reason.into(),
54        }
55    }
56    #[inline]
57    fn insufficient_moments(reason: impl Into<String>) -> Self {
58        CubicCellKernelError::InsufficientMoments {
59            reason: reason.into(),
60        }
61    }
62    #[inline]
63    fn bivariate_normal_domain(reason: impl Into<String>) -> Self {
64        CubicCellKernelError::BivariateNormalDomain {
65            reason: reason.into(),
66        }
67    }
68}
69
70// De-nested cubic transport kernel.
71//
72// This module implements the de-nested flexible-link/score-warp model
73//
74//   eta(z) = a + b*z + b*delta_h(z) + delta_w(a + b*z)
75//
76// where delta_h is the score warp and delta_w is the link deviation.
77// This is not the literal nested composition L(a + b*H(z)); it is an
78// additive-correction model around the affine core a + b*z.
79//
// On each partition cell, both deviations are cubic polynomials, so eta is
// at most cubic in z and q(z) = 0.5*(z^2 + eta^2) is at most sextic (degree 6).
// The integral of exp(-q(z)) is evaluated by transporting from the affine
// anchor (c2=c3=0, where q is quadratic and the integral reduces to BVN)
// to the target non-affine cell via the polynomial moment recurrence.
85//
86// The partition covers (-∞, +∞) with:
87//   • two semi-infinite affine TAIL cells (outside all deviation support),
88//   • finitely many interior cells (each a sextic microcell).
89// Because tail cells have constant deviations (c2=c3=0), their bounds
90// are parameter-independent, so no Leibniz boundary-motion corrections
91// appear in the derivatives.
92//
93// Shared by bernoulli_marginal_slope and survival_marginal_slope families.
94
/// A cubic polynomial attached to the span `[left, right]`, expressed in the
/// local offset `t = x - left`:
///
/// `p(x) = c0 + c1*t + c2*t^2 + c3*t^3`.
///
/// `right` is stored alongside the coefficients but is not consulted by any
/// of the evaluators below; they do not clamp or range-check `x`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct LocalSpanCubic {
    pub left: f64,
    pub right: f64,
    pub c0: f64,
    pub c1: f64,
    pub c2: f64,
    pub c3: f64,
}

impl LocalSpanCubic {
    /// Value of the cubic at `x`.
    #[inline]
    pub fn evaluate(self, x: f64) -> f64 {
        let Self {
            left,
            c0,
            c1,
            c2,
            c3,
            ..
        } = self;
        let dx = x - left;
        // Monomial terms are formed and summed left-to-right, lowest degree
        // first, so rounding matches the plain power-sum expression exactly.
        let linear = c1 * dx;
        let quadratic = c2 * dx * dx;
        let cubic = c3 * dx * dx * dx;
        c0 + linear + quadratic + cubic
    }

    /// First derivative `dp/dx` at `x`: `c1 + 2*c2*t + 3*c3*t^2`.
    #[inline]
    pub fn first_derivative(self, x: f64) -> f64 {
        let Self {
            left, c1, c2, c3, ..
        } = self;
        let dx = x - left;
        let from_quadratic = 2.0 * c2 * dx;
        let from_cubic = 3.0 * c3 * dx * dx;
        c1 + from_quadratic + from_cubic
    }

    /// Second derivative `d^2p/dx^2` at `x`: `2*c2 + 6*c3*t`.
    #[inline]
    pub fn second_derivative(self, x: f64) -> f64 {
        let Self { left, c2, c3, .. } = self;
        let dx = x - left;
        let constant_part = 2.0 * c2;
        let linear_part = 6.0 * c3 * dx;
        constant_part + linear_part
    }
}
124
/// String name of this kernel (`"DenestedCubicTransport"`).
///
/// NOTE(review): presumably used as an identifying label by the consuming
/// families — confirm at the use sites; nothing in this part of the file
/// reads it.
pub const ANCHORED_DEVIATION_KERNEL: &str = "DenestedCubicTransport";
/// Default normalized non-affine branch tolerance used by [`branch_cell`].
///
/// Keep this cutoff explicit and hill-climbable: the large-scale cycle-0
/// sweep evaluated `{1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3}` against the
/// legacy transport path.  The more aggressive candidates require an
/// end-to-end beta acceptance run before promotion; the default therefore
/// remains the legacy `1e-10` value to preserve bit-for-bit model behavior.
pub const NORMALIZED_CELL_BRANCH_TOL: f64 = 1e-10;

/// Reciprocal of 2π, computed at compile time as `1 / TAU`.
///
/// NOTE(review): presumably the normalization constant of the bivariate
/// normal density — confirm at the use sites.
const INV_TWO_PI: f64 = 1.0 / std::f64::consts::TAU;
136
/// 384-point Gauss–Legendre nodes, re-exported for the GPU cubic-cell kernel
/// (`src/gpu/cubic_cell/kernel_src.rs`) to embed as `__constant__` device
/// memory. Linux-only because the kernel emitter is Linux-only.
///
/// This is a reference to the module-private `GL_NODES` table, so the CPU and
/// GPU paths read the same literals and the table itself stays private.
#[cfg(target_os = "linux")]
pub const GL_NODES_FOR_GPU_KERNEL: &[f64; 384] = &GL_NODES;
/// Companion weights to [`GL_NODES_FOR_GPU_KERNEL`].
///
/// References the module-private `GL_WEIGHTS` table; same length (384) and
/// same `target_os = "linux"` gate as the nodes.
#[cfg(target_os = "linux")]
pub const GL_WEIGHTS_FOR_GPU_KERNEL: &[f64; 384] = &GL_WEIGHTS;
145
146const GL_NODES: [f64; 384] = [
147    -9.999_804_411_726_474e-1,
148    -9.998_969_471_378_596e-1,
149    -9.997_467_408_113_523e-1,
150    -9.995_297_988_558_859e-1,
151    -9.992_461_316_671_845e-1,
152    -9.988_957_572_063_257e-1,
153    -9.984_786_985_384_589e-1,
154    -9.979_949_833_727_938e-1,
155    -9.974_446_439_389_107e-1,
156    -9.968_277_169_440_913e-1,
157    -9.961_442_435_551_087e-1,
158    -9.953_942_693_885_953e-1,
159    -9.945_778_445_047_068e-1,
160    -9.936_950_234_020_883e-1,
161    -9.927_458_650_133_153e-1,
162    -9.917_304_327_004_32e-1,
163    -9.906_487_942_504_061e-1,
164    -9.895_010_218_704_087e-1,
165    -9.882_871_921_828_699e-1,
166    -9.870_073_862_202_815e-1,
167    -9.856_616_894_197_333e-1,
168    -9.842_501_916_171_713e-1,
169    -9.827_729_870_413_743e-1,
170    -9.812_301_743_076_443e-1,
171    -9.796_218_564_112_101e-1,
172    -9.779_481_407_203_411e-1,
173    -9.762_091_389_691_724e-1,
174    -9.744_049_672_502_397e-1,
175    -9.725_357_460_067_257e-1,
176    -9.706_016_000_244_151e-1,
177    -9.686_026_584_233_628e-1,
178    -9.665_390_546_492_71e-1,
179    -9.644_109_264_645_802e-1,
180    -9.622_184_159_392_698e-1,
181    -9.599_616_694_413_742e-1,
182    -9.576_408_376_272_095e-1,
183    -9.552_560_754_313_16e-1,
184    -9.528_075_420_561_144e-1,
185    -9.502_954_009_612_771e-1,
186    -9.477_198_198_528_157e-1,
187    -9.450_809_706_718_851e-1,
188    -9.423_790_295_833_044e-1,
189    -9.396_141_769_637_963e-1,
190    -9.367_865_973_899_459e-1,
191    -9.338_964_796_258_775e-1,
192    -9.309_440_166_106_54e-1,
193    -9.279_294_054_453_956e-1,
194    -9.248_528_473_801_222e-1,
195    -9.217_145_478_003_181e-1,
196    -9.185_147_162_132_208e-1,
197    -9.152_535_662_338_34e-1,
198    -9.119_313_155_706_682e-1,
199    -9.085_481_860_112_055e-1,
200    -9.051_044_034_070_944e-1,
201    -9.016_001_976_590_722e-1,
202    -8.980_358_027_016_164e-1,
203    -8.944_114_564_873_288e-1,
204    -8.907_274_009_710_492e-1,
205    -8.869_838_820_937_034e-1,
206    -8.831_811_497_658_847e-1,
207    -8.793_194_578_511_7e-1,
208    -8.753_990_641_491_725e-1,
209    -8.714_202_303_783_312e-1,
210    -8.673_832_221_584_393e-1,
211    -8.632_883_089_929_12e-1,
212    -8.591_357_642_507_945e-1,
213    -8.549_258_651_485_127e-1,
214    -8.506_588_927_313_666e-1,
215    -8.463_351_318_547_683e-1,
216    -8.419_548_711_652_254e-1,
217    -8.375_184_030_810_715e-1,
218    -8.330_260_237_729_452e-1,
219    -8.284_780_331_440_178e-1,
220    -8.238_747_348_099_726e-1,
221    -8.192_164_360_787_36e-1,
222    -8.145_034_479_299_62e-1,
223    -8.097_360_849_942_72e-1,
224    -8.049_146_655_322_506e-1,
225    -8.000_395_114_131_988e-1,
226    -7.951_109_480_936_471e-1,
227    -7.901_293_045_956_28e-1,
228    -7.850_949_134_847_117e-1,
229    -7.800_081_108_478_04e-1,
230    -7.748_692_362_707_1e-1,
231    -7.696_786_328_154_644e-1,
232    -7.644_366_469_974_285e-1,
233    -7.591_436_287_621_58e-1,
234    -7.537_999_314_620_412e-1,
235    -7.484_059_118_327_094e-1,
236    -7.429_619_299_692_227e-1,
237    -7.374_683_493_020_299e-1,
238    -7.319_255_365_727_068e-1,
239    -7.263_338_618_094_733e-1,
240    -7.206_936_983_024_912e-1,
241    -7.150_054_225_789_432e-1,
242    -7.092_694_143_778_975e-1,
243    -7.034_860_566_249_567e-1,
244    -6.976_557_354_066_943e-1,
245    -6.917_788_399_448_808e-1,
246    -6.858_557_625_704_99e-1,
247    -6.798_868_986_975_534e-1,
248    -6.738_726_467_966_731e-1,
249    -6.678_134_083_685_102e-1,
250    -6.617_095_879_169_366e-1,
251    -6.555_615_929_220_4e-1,
252    -6.493_698_338_129_212e-1,
253    -6.431_347_239_402_948e-1,
254    -6.368_566_795_488_945e-1,
255    -6.305_361_197_496_849e-1,
256    -6.241_734_664_918_837e-1,
257    -6.177_691_445_347_913e-1,
258    -6.113_235_814_194_364e-1,
259    -6.048_372_074_400_329e-1,
260    -5.983_104_556_152_549e-1,
261    -5.917_437_616_593_286e-1,
262    -5.851_375_639_529_456e-1,
263    -5.784_923_035_139_965e-1,
264    -5.718_084_239_681_3e-1,
265    -5.650_863_715_191_369e-1,
266    -5.583_265_949_191_623e-1,
267    -5.515_295_454_387_482e-1,
268    -5.446_956_768_367_068e-1,
269    -5.378_254_453_298_289e-1,
270    -5.309_193_095_624_275e-1,
271    -5.239_777_305_757_194e-1,
272    -5.170_011_717_770_473e-1,
273    -5.099_900_989_089_429e-1,
274    -5.029_449_800_180_356e-1,
275    -4.958_662_854_238_058_4e-1,
276    -4.887_544_876_871_878e-1,
277    -4.816_100_615_790_221e-1,
278    -4.744_334_840_483_605_5e-1,
279    -4.672_252_341_906_264e-1,
280    -4.599_857_932_156_304e-1,
281    -4.527_156_444_154_463_7e-1,
282    -4.454_152_731_321_473_5e-1,
283    -4.380_851_667_254_05e-1,
284    -4.307_258_145_399_544_5e-1,
285    -4.233_377_078_729_265e-1,
286    -4.159_213_399_410_494e-1,
287    -4.084_772_058_477_228e-1,
288    -4.010_058_025_499_653e-1,
289    -3.935_076_288_252_386e-1,
290    -3.859_831_852_381_500_6e-1,
291    -3.784_329_741_070_358_6e-1,
292    -3.708_574_994_704_271e-1,
293    -3.632_572_670_534_011e-1,
294    -3.556_327_842_338_202e-1,
295    -3.479_845_600_084_600_6e-1,
296    -3.403_131_049_590_297e-1,
297    -3.326_189_312_180_866e-1,
298    -3.249_025_524_348_469_5e-1,
299    -3.171_644_837_408_958_4e-1,
300    -3.094_052_417_157_978e-1,
301    -3.016_253_443_526_109e-1,
302    -2.938_253_110_233_064_5e-1,
303    -2.860_056_624_440_967_5e-1,
304    -2.781_669_206_406_729e-1,
305    -2.703_096_089_133_553e-1,
306    -2.624_342_518_021_592_4e-1,
307    -2.545_413_750_517_773e-1,
308    -2.466_315_055_764_817_5e-1,
309    -2.387_051_714_249_486_3e-1,
310    -2.307_629_017_450_062e-1,
311    -2.228_052_267_483_099_4e-1,
312    -2.148_326_776_749_466_5e-1,
313    -2.068_457_867_579_697_5e-1,
314    -1.988_450_871_878_683_4e-1,
315    -1.908_311_130_769_724_5e-1,
316    -1.828_043_994_237_965_6e-1,
317    -1.747_654_820_773_241_2e-1,
318    -1.667_148_977_012_352_4e-1,
319    -1.586_531_837_380_799_3e-1,
320    -1.505_808_783_733_995e-1,
321    -1.424_985_204_997_981_4e-1,
322    -1.344_066_496_809_674_7e-1,
323    -1.263_058_061_156_663e-1,
324    -1.181_965_306_016_578_4e-1,
325    -1.100_793_644_996_070_4e-1,
326    -1.019_548_496_969_403_7e-1,
327    -9.382_352_857_167_028e-2,
328    -8.568_594_395_618_719e-2,
329    -7.754_263_910_102_077e-2,
330    -6.939_415_763_857_37e-2,
331    -6.124_104_354_682_962e-2,
332    -5.308_384_111_303_817_6e-2,
333    -4.492_309_489_737_94e-2,
334    -3.675_934_969_660_982e-2,
335    -2.859_315_050_769_284_7e-2,
336    -2.042_504_249_141_571e-2,
337    -1.225_557_093_599_553_8e-2,
338    -4.085_281_220_676_868e-3,
339    4.085_281_220_676_868e-3,
340    1.225_557_093_599_553_8e-2,
341    2.042_504_249_141_571e-2,
342    2.859_315_050_769_284_7e-2,
343    3.675_934_969_660_982e-2,
344    4.492_309_489_737_94e-2,
345    5.308_384_111_303_817_6e-2,
346    6.124_104_354_682_962e-2,
347    6.939_415_763_857_37e-2,
348    7.754_263_910_102_077e-2,
349    8.568_594_395_618_719e-2,
350    9.382_352_857_167_028e-2,
351    1.019_548_496_969_403_7e-1,
352    1.100_793_644_996_070_4e-1,
353    1.181_965_306_016_578_4e-1,
354    1.263_058_061_156_663e-1,
355    1.344_066_496_809_674_7e-1,
356    1.424_985_204_997_981_4e-1,
357    1.505_808_783_733_995e-1,
358    1.586_531_837_380_799_3e-1,
359    1.667_148_977_012_352_4e-1,
360    1.747_654_820_773_241_2e-1,
361    1.828_043_994_237_965_6e-1,
362    1.908_311_130_769_724_5e-1,
363    1.988_450_871_878_683_4e-1,
364    2.068_457_867_579_697_5e-1,
365    2.148_326_776_749_466_5e-1,
366    2.228_052_267_483_099_4e-1,
367    2.307_629_017_450_062e-1,
368    2.387_051_714_249_486_3e-1,
369    2.466_315_055_764_817_5e-1,
370    2.545_413_750_517_773e-1,
371    2.624_342_518_021_592_4e-1,
372    2.703_096_089_133_553e-1,
373    2.781_669_206_406_729e-1,
374    2.860_056_624_440_967_5e-1,
375    2.938_253_110_233_064_5e-1,
376    3.016_253_443_526_109e-1,
377    3.094_052_417_157_978e-1,
378    3.171_644_837_408_958_4e-1,
379    3.249_025_524_348_469_5e-1,
380    3.326_189_312_180_866e-1,
381    3.403_131_049_590_297e-1,
382    3.479_845_600_084_600_6e-1,
383    3.556_327_842_338_202e-1,
384    3.632_572_670_534_011e-1,
385    3.708_574_994_704_271e-1,
386    3.784_329_741_070_358_6e-1,
387    3.859_831_852_381_500_6e-1,
388    3.935_076_288_252_386e-1,
389    4.010_058_025_499_653e-1,
390    4.084_772_058_477_228e-1,
391    4.159_213_399_410_494e-1,
392    4.233_377_078_729_265e-1,
393    4.307_258_145_399_544_5e-1,
394    4.380_851_667_254_05e-1,
395    4.454_152_731_321_473_5e-1,
396    4.527_156_444_154_463_7e-1,
397    4.599_857_932_156_304e-1,
398    4.672_252_341_906_264e-1,
399    4.744_334_840_483_605_5e-1,
400    4.816_100_615_790_221e-1,
401    4.887_544_876_871_878e-1,
402    4.958_662_854_238_058_4e-1,
403    5.029_449_800_180_356e-1,
404    5.099_900_989_089_429e-1,
405    5.170_011_717_770_473e-1,
406    5.239_777_305_757_194e-1,
407    5.309_193_095_624_275e-1,
408    5.378_254_453_298_289e-1,
409    5.446_956_768_367_068e-1,
410    5.515_295_454_387_482e-1,
411    5.583_265_949_191_623e-1,
412    5.650_863_715_191_369e-1,
413    5.718_084_239_681_3e-1,
414    5.784_923_035_139_965e-1,
415    5.851_375_639_529_456e-1,
416    5.917_437_616_593_286e-1,
417    5.983_104_556_152_549e-1,
418    6.048_372_074_400_329e-1,
419    6.113_235_814_194_364e-1,
420    6.177_691_445_347_913e-1,
421    6.241_734_664_918_837e-1,
422    6.305_361_197_496_849e-1,
423    6.368_566_795_488_945e-1,
424    6.431_347_239_402_948e-1,
425    6.493_698_338_129_212e-1,
426    6.555_615_929_220_4e-1,
427    6.617_095_879_169_366e-1,
428    6.678_134_083_685_102e-1,
429    6.738_726_467_966_731e-1,
430    6.798_868_986_975_534e-1,
431    6.858_557_625_704_99e-1,
432    6.917_788_399_448_808e-1,
433    6.976_557_354_066_943e-1,
434    7.034_860_566_249_567e-1,
435    7.092_694_143_778_975e-1,
436    7.150_054_225_789_432e-1,
437    7.206_936_983_024_912e-1,
438    7.263_338_618_094_733e-1,
439    7.319_255_365_727_068e-1,
440    7.374_683_493_020_299e-1,
441    7.429_619_299_692_227e-1,
442    7.484_059_118_327_094e-1,
443    7.537_999_314_620_412e-1,
444    7.591_436_287_621_58e-1,
445    7.644_366_469_974_285e-1,
446    7.696_786_328_154_644e-1,
447    7.748_692_362_707_1e-1,
448    7.800_081_108_478_04e-1,
449    7.850_949_134_847_117e-1,
450    7.901_293_045_956_28e-1,
451    7.951_109_480_936_471e-1,
452    8.000_395_114_131_988e-1,
453    8.049_146_655_322_506e-1,
454    8.097_360_849_942_72e-1,
455    8.145_034_479_299_62e-1,
456    8.192_164_360_787_36e-1,
457    8.238_747_348_099_726e-1,
458    8.284_780_331_440_178e-1,
459    8.330_260_237_729_452e-1,
460    8.375_184_030_810_715e-1,
461    8.419_548_711_652_254e-1,
462    8.463_351_318_547_683e-1,
463    8.506_588_927_313_666e-1,
464    8.549_258_651_485_127e-1,
465    8.591_357_642_507_945e-1,
466    8.632_883_089_929_12e-1,
467    8.673_832_221_584_393e-1,
468    8.714_202_303_783_312e-1,
469    8.753_990_641_491_725e-1,
470    8.793_194_578_511_7e-1,
471    8.831_811_497_658_847e-1,
472    8.869_838_820_937_034e-1,
473    8.907_274_009_710_492e-1,
474    8.944_114_564_873_288e-1,
475    8.980_358_027_016_164e-1,
476    9.016_001_976_590_722e-1,
477    9.051_044_034_070_944e-1,
478    9.085_481_860_112_055e-1,
479    9.119_313_155_706_682e-1,
480    9.152_535_662_338_34e-1,
481    9.185_147_162_132_208e-1,
482    9.217_145_478_003_181e-1,
483    9.248_528_473_801_222e-1,
484    9.279_294_054_453_956e-1,
485    9.309_440_166_106_54e-1,
486    9.338_964_796_258_775e-1,
487    9.367_865_973_899_459e-1,
488    9.396_141_769_637_963e-1,
489    9.423_790_295_833_044e-1,
490    9.450_809_706_718_851e-1,
491    9.477_198_198_528_157e-1,
492    9.502_954_009_612_771e-1,
493    9.528_075_420_561_144e-1,
494    9.552_560_754_313_16e-1,
495    9.576_408_376_272_095e-1,
496    9.599_616_694_413_742e-1,
497    9.622_184_159_392_698e-1,
498    9.644_109_264_645_802e-1,
499    9.665_390_546_492_71e-1,
500    9.686_026_584_233_628e-1,
501    9.706_016_000_244_151e-1,
502    9.725_357_460_067_257e-1,
503    9.744_049_672_502_397e-1,
504    9.762_091_389_691_724e-1,
505    9.779_481_407_203_411e-1,
506    9.796_218_564_112_101e-1,
507    9.812_301_743_076_443e-1,
508    9.827_729_870_413_743e-1,
509    9.842_501_916_171_713e-1,
510    9.856_616_894_197_333e-1,
511    9.870_073_862_202_815e-1,
512    9.882_871_921_828_699e-1,
513    9.895_010_218_704_087e-1,
514    9.906_487_942_504_061e-1,
515    9.917_304_327_004_32e-1,
516    9.927_458_650_133_153e-1,
517    9.936_950_234_020_883e-1,
518    9.945_778_445_047_068e-1,
519    9.953_942_693_885_953e-1,
520    9.961_442_435_551_087e-1,
521    9.968_277_169_440_913e-1,
522    9.974_446_439_389_107e-1,
523    9.979_949_833_727_938e-1,
524    9.984_786_985_384_589e-1,
525    9.988_957_572_063_257e-1,
526    9.992_461_316_671_845e-1,
527    9.995_297_988_558_859e-1,
528    9.997_467_408_113_523e-1,
529    9.998_969_471_378_596e-1,
530    9.999_804_411_726_474e-1,
531];
532const GL_WEIGHTS: [f64; 384] = [
533    5.019_410_348_676_869_6e-5,
534    1.168_390_665_730_266_3e-4,
535    1.835_749_193_551_655_8e-4,
536    2.503_070_890_844_105e-4,
537    3.170_242_698_112_815e-4,
538    3.837_208_020_912_921_4e-4,
539    4.503_919_137_716_827e-4,
540    5.170_330_453_491_649e-4,
541    5.836_397_042_630_135e-4,
542    6.502_074_240_969_948e-4,
543    7.167_317_509_947_801e-4,
544    7.832_082_385_905_168e-4,
545    8.496_324_460_039_209e-4,
546    9.159_999_370_632_641e-4,
547    9.823_062_800_663_463e-4,
548    1.048_547_047_793_689_5e-3,
549    1.114_717_817_647_310_6e-3,
550    1.180_814_171_855_922e-3,
551    1.246_831_697_715_441_5e-3,
552    1.312_765_987_850_66e-3,
553    1.378_612_640_487_646_8e-3,
554    1.444_367_259_734_736e-3,
555    1.510_025_455_865_810_3e-3,
556    1.575_582_845_607_936_8e-3,
557    1.641_035_052_429_271_5e-3,
558    1.706_377_706_828_447_1e-3,
559    1.771_606_446_623_834_7e-3,
560    1.836_716_917_243_567_5e-3,
561    1.901_704_772_014_899_2e-3,
562    1.966_565_672_453_437e-3,
563    2.031_295_288_552_398_4e-3,
564    2.095_889_299_071_020_6e-3,
565    2.160_343_391_822_734_3e-3,
566    2.224_653_263_962_713e-3,
567    2.288_814_622_274_955e-3,
568    2.352_823_183_458_769e-3,
569    2.416_674_674_414_340_5e-3,
570    2.480_364_832_528_265_6e-3,
571    2.543_889_405_957_74e-3,
572    2.607_244_153_914_452e-3,
573    2.670_424_846_947_554e-3,
574    2.733_427_267_226_093_3e-3,
575    2.796_247_208_820_428e-3,
576    2.858_880_477_983_06e-3,
577    2.921_322_893_428_515_3e-3,
578    2.983_570_286_612_554_5e-3,
579    3.045_618_502_010_327_8e-3,
580    3.107_463_397_393_755_5e-3,
581    3.169_100_844_108_32e-3,
582    3.230_526_727_348_174e-3,
583    3.291_736_946_431_361e-3,
584    3.352_727_415_073_250_3e-3,
585    3.413_494_061_659_418_4e-3,
586    3.474_032_829_517_317e-3,
587    3.534_339_677_187_348_4e-3,
588    3.594_410_578_692_452e-3,
589    3.654_241_523_806_987e-3,
590    3.713_828_518_324_312_5e-3,
591    3.773_167_584_323_583_5e-3,
592    3.832_254_760_435_171e-3,
593    3.891_086_102_105_193_4e-3,
594    3.949_657_681_858_895e-3,
595    4.007_965_589_562_678e-3,
596    4.066_005_932_685_269e-3,
597    4.123_774_836_557_6e-3,
598    4.181_268_444_631_281e-3,
599    4.238_482_918_736_289e-3,
600    4.295_414_439_336_925e-3,
601    4.352_059_205_787_275e-3,
602    4.408_413_436_584_285e-3,
603    4.464_473_369_620_78e-3,
604    4.520_235_262_436_235e-3,
605    4.575_695_392_466_791e-3,
606    4.630_850_057_293_894e-3,
607    4.685_695_574_891_041e-3,
608    4.740_228_283_870_022e-3,
609    4.794_444_543_725_102e-3,
610    4.848_340_735_076_109e-3,
611    4.901_913_259_910_197e-3,
612    4.955_158_541_821_682_4e-3,
613    5.008_073_026_251_332e-3,
614    5.060_653_180_723_101_4e-3,
615    5.112_895_495_080_397e-3,
616    5.164_796_481_720_011e-3,
617    5.216_352_675_825_451e-3,
618    5.267_560_635_597_735e-3,
619    5.318_416_942_485_385e-3,
620    5.368_918_201_412_827e-3,
621    5.419_061_041_006_627e-3,
622    5.468_842_113_820_941e-3,
623    5.518_258_096_560_71e-3,
624    5.567_305_690_303_767e-3,
625    5.615_981_620_720_803e-3,
626    5.664_282_638_294_182e-3,
627    5.712_205_518_534_655e-3,
628    5.759_747_062_196_925_5e-3,
629    5.806_904_095_492_818e-3,
630    5.853_673_470_303_617_4e-3,
631    5.900_052_064_389_824e-3,
632    5.946_036_781_599_814e-3,
633    5.991_624_552_076_468e-3,
634    6.036_812_332_462_087e-3,
635    6.081_597_106_101_673e-3,
636    6.125_975_883_244_196e-3,
637    6.169_945_701_242_237e-3,
638    6.213_503_624_749_591e-3,
639    6.256_646_745_917_723e-3,
640    6.299_372_184_589_237e-3,
641    6.341_677_088_490_664e-3,
642    6.383_558_633_422_572e-3,
643    6.425_014_023_448_273e-3,
644    6.466_040_491_080_434e-3,
645    6.506_635_297_465_724e-3,
646    6.546_795_732_567_842_5e-3,
647    6.586_519_115_348_261e-3,
648    6.625_802_793_945_317e-3,
649    6.664_644_145_851_14e-3,
650    6.703_040_578_086_941e-3,
651    6.740_989_527_375_895e-3,
652    6.778_488_460_314_126e-3,
653    6.815_534_873_540_5e-3,
654    6.852_126_293_902_878e-3,
655    6.888_260_278_623_754e-3,
656    6.923_934_415_463_31e-3,
657    6.959_146_322_880_146_5e-3,
658    6.993_893_650_190_702e-3,
659    7.028_174_077_725_734e-3,
660    7.061_985_316_985_506e-3,
661    7.095_325_110_792_439e-3,
662    7.128_191_233_441_844e-3,
663    7.160_581_490_850_321e-3,
664    7.192_493_720_702_486e-3,
665    7.223_925_792_595_309e-3,
666    7.254_875_608_179_984e-3,
667    7.285_341_101_302_512e-3,
668    7.315_320_238_141_324_5e-3,
669    7.344_811_017_343_063e-3,
670    7.373_811_470_156_258e-3,
671    7.402_319_660_562_818e-3,
672    7.430_333_685_407_178e-3,
673    7.457_851_674_523_319e-3,
674    7.484_871_790_859_79e-3,
675    7.511_392_230_602_079e-3,
676    7.537_411_223_293_362e-3,
677    7.562_927_031_952_382e-3,
678    7.587_937_953_189_561_5e-3,
679    7.612_442_317_320_796e-3,
680    7.636_438_488_478_739e-3,
681    7.659_924_864_722_064e-3,
682    7.682_899_878_142_539e-3,
683    7.705_361_994_969_524e-3,
684    7.727_309_715_672_44e-3,
685    7.748_741_575_060_914e-3,
686    7.769_656_142_382_462e-3,
687    7.790_052_021_418_226e-3,
688    7.809_927_850_575_903e-3,
689    7.829_282_302_980_82e-3,
690    7.848_114_086_564_56e-3,
691    7.866_421_944_151_094e-3,
692    7.884_204_653_540_665e-3,
693    7.901_461_027_591_6e-3,
694    7.918_189_914_299_318e-3,
695    7.934_390_196_873_448e-3,
696    7.950_060_793_812_204e-3,
697    7.965_200_658_974_709e-3,
698    7.979_808_781_650_77e-3,
699    7.993_884_186_628_266e-3,
700    8.007_425_934_258_548e-3,
701    8.020_433_120_518_866e-3,
702    8.032_904_877_072_8e-3,
703    8.044_840_371_328_26e-3,
704    8.056_238_806_493_175e-3,
705    8.067_099_421_628_42e-3,
706    8.077_421_491_698_82e-3,
707    8.087_204_327_621_594e-3,
708    8.096_447_276_312_202e-3,
709    8.105_149_720_727_933e-3,
710    8.113_311_079_909_208e-3,
711    8.120_930_809_018_415e-3,
712    8.128_008_399_376_085e-3,
713    8.134_543_378_495_033e-3,
714    8.140_535_310_111_77e-3,
715    8.145_983_794_215_77e-3,
716    8.150_888_467_075_875e-3,
717    8.155_249_001_265_092e-3,
718    8.159_065_105_681_899e-3,
719    8.162_336_525_570_1e-3,
720    8.165_063_042_535_465e-3,
721    8.167_244_474_560_707e-3,
722    8.168_880_676_017_344e-3,
723    8.169_971_537_675_47e-3,
724    8.170_516_986_711_104e-3,
725    8.170_516_986_711_104e-3,
726    8.169_971_537_675_47e-3,
727    8.168_880_676_017_344e-3,
728    8.167_244_474_560_707e-3,
729    8.165_063_042_535_465e-3,
730    8.162_336_525_570_1e-3,
731    8.159_065_105_681_899e-3,
732    8.155_249_001_265_092e-3,
733    8.150_888_467_075_875e-3,
734    8.145_983_794_215_77e-3,
735    8.140_535_310_111_77e-3,
736    8.134_543_378_495_033e-3,
737    8.128_008_399_376_085e-3,
738    8.120_930_809_018_415e-3,
739    8.113_311_079_909_208e-3,
740    8.105_149_720_727_933e-3,
741    8.096_447_276_312_202e-3,
742    8.087_204_327_621_594e-3,
743    8.077_421_491_698_82e-3,
744    8.067_099_421_628_42e-3,
745    8.056_238_806_493_175e-3,
746    8.044_840_371_328_26e-3,
747    8.032_904_877_072_8e-3,
748    8.020_433_120_518_866e-3,
749    8.007_425_934_258_548e-3,
750    7.993_884_186_628_266e-3,
751    7.979_808_781_650_77e-3,
752    7.965_200_658_974_709e-3,
753    7.950_060_793_812_204e-3,
754    7.934_390_196_873_448e-3,
755    7.918_189_914_299_318e-3,
756    7.901_461_027_591_6e-3,
757    7.884_204_653_540_665e-3,
758    7.866_421_944_151_094e-3,
759    7.848_114_086_564_56e-3,
760    7.829_282_302_980_82e-3,
761    7.809_927_850_575_903e-3,
762    7.790_052_021_418_226e-3,
763    7.769_656_142_382_462e-3,
764    7.748_741_575_060_914e-3,
765    7.727_309_715_672_44e-3,
766    7.705_361_994_969_524e-3,
767    7.682_899_878_142_539e-3,
768    7.659_924_864_722_064e-3,
769    7.636_438_488_478_739e-3,
770    7.612_442_317_320_796e-3,
771    7.587_937_953_189_561_5e-3,
772    7.562_927_031_952_382e-3,
773    7.537_411_223_293_362e-3,
774    7.511_392_230_602_079e-3,
775    7.484_871_790_859_79e-3,
776    7.457_851_674_523_319e-3,
777    7.430_333_685_407_178e-3,
778    7.402_319_660_562_818e-3,
779    7.373_811_470_156_258e-3,
780    7.344_811_017_343_063e-3,
781    7.315_320_238_141_324_5e-3,
782    7.285_341_101_302_512e-3,
783    7.254_875_608_179_984e-3,
784    7.223_925_792_595_309e-3,
785    7.192_493_720_702_486e-3,
786    7.160_581_490_850_321e-3,
787    7.128_191_233_441_844e-3,
788    7.095_325_110_792_439e-3,
789    7.061_985_316_985_506e-3,
790    7.028_174_077_725_734e-3,
791    6.993_893_650_190_702e-3,
792    6.959_146_322_880_146_5e-3,
793    6.923_934_415_463_31e-3,
794    6.888_260_278_623_754e-3,
795    6.852_126_293_902_878e-3,
796    6.815_534_873_540_5e-3,
797    6.778_488_460_314_126e-3,
798    6.740_989_527_375_895e-3,
799    6.703_040_578_086_941e-3,
800    6.664_644_145_851_14e-3,
801    6.625_802_793_945_317e-3,
802    6.586_519_115_348_261e-3,
803    6.546_795_732_567_842_5e-3,
804    6.506_635_297_465_724e-3,
805    6.466_040_491_080_434e-3,
806    6.425_014_023_448_273e-3,
807    6.383_558_633_422_572e-3,
808    6.341_677_088_490_664e-3,
809    6.299_372_184_589_237e-3,
810    6.256_646_745_917_723e-3,
811    6.213_503_624_749_591e-3,
812    6.169_945_701_242_237e-3,
813    6.125_975_883_244_196e-3,
814    6.081_597_106_101_673e-3,
815    6.036_812_332_462_087e-3,
816    5.991_624_552_076_468e-3,
817    5.946_036_781_599_814e-3,
818    5.900_052_064_389_824e-3,
819    5.853_673_470_303_617_4e-3,
820    5.806_904_095_492_818e-3,
821    5.759_747_062_196_925_5e-3,
822    5.712_205_518_534_655e-3,
823    5.664_282_638_294_182e-3,
824    5.615_981_620_720_803e-3,
825    5.567_305_690_303_767e-3,
826    5.518_258_096_560_71e-3,
827    5.468_842_113_820_941e-3,
828    5.419_061_041_006_627e-3,
829    5.368_918_201_412_827e-3,
830    5.318_416_942_485_385e-3,
831    5.267_560_635_597_735e-3,
832    5.216_352_675_825_451e-3,
833    5.164_796_481_720_011e-3,
834    5.112_895_495_080_397e-3,
835    5.060_653_180_723_101_4e-3,
836    5.008_073_026_251_332e-3,
837    4.955_158_541_821_682_4e-3,
838    4.901_913_259_910_197e-3,
839    4.848_340_735_076_109e-3,
840    4.794_444_543_725_102e-3,
841    4.740_228_283_870_022e-3,
842    4.685_695_574_891_041e-3,
843    4.630_850_057_293_894e-3,
844    4.575_695_392_466_791e-3,
845    4.520_235_262_436_235e-3,
846    4.464_473_369_620_78e-3,
847    4.408_413_436_584_285e-3,
848    4.352_059_205_787_275e-3,
849    4.295_414_439_336_925e-3,
850    4.238_482_918_736_289e-3,
851    4.181_268_444_631_281e-3,
852    4.123_774_836_557_6e-3,
853    4.066_005_932_685_269e-3,
854    4.007_965_589_562_678e-3,
855    3.949_657_681_858_895e-3,
856    3.891_086_102_105_193_4e-3,
857    3.832_254_760_435_171e-3,
858    3.773_167_584_323_583_5e-3,
859    3.713_828_518_324_312_5e-3,
860    3.654_241_523_806_987e-3,
861    3.594_410_578_692_452e-3,
862    3.534_339_677_187_348_4e-3,
863    3.474_032_829_517_317e-3,
864    3.413_494_061_659_418_4e-3,
865    3.352_727_415_073_250_3e-3,
866    3.291_736_946_431_361e-3,
867    3.230_526_727_348_174e-3,
868    3.169_100_844_108_32e-3,
869    3.107_463_397_393_755_5e-3,
870    3.045_618_502_010_327_8e-3,
871    2.983_570_286_612_554_5e-3,
872    2.921_322_893_428_515_3e-3,
873    2.858_880_477_983_06e-3,
874    2.796_247_208_820_428e-3,
875    2.733_427_267_226_093_3e-3,
876    2.670_424_846_947_554e-3,
877    2.607_244_153_914_452e-3,
878    2.543_889_405_957_74e-3,
879    2.480_364_832_528_265_6e-3,
880    2.416_674_674_414_340_5e-3,
881    2.352_823_183_458_769e-3,
882    2.288_814_622_274_955e-3,
883    2.224_653_263_962_713e-3,
884    2.160_343_391_822_734_3e-3,
885    2.095_889_299_071_020_6e-3,
886    2.031_295_288_552_398_4e-3,
887    1.966_565_672_453_437e-3,
888    1.901_704_772_014_899_2e-3,
889    1.836_716_917_243_567_5e-3,
890    1.771_606_446_623_834_7e-3,
891    1.706_377_706_828_447_1e-3,
892    1.641_035_052_429_271_5e-3,
893    1.575_582_845_607_936_8e-3,
894    1.510_025_455_865_810_3e-3,
895    1.444_367_259_734_736e-3,
896    1.378_612_640_487_646_8e-3,
897    1.312_765_987_850_66e-3,
898    1.246_831_697_715_441_5e-3,
899    1.180_814_171_855_922e-3,
900    1.114_717_817_647_310_6e-3,
901    1.048_547_047_793_689_5e-3,
902    9.823_062_800_663_463e-4,
903    9.159_999_370_632_641e-4,
904    8.496_324_460_039_209e-4,
905    7.832_082_385_905_168e-4,
906    7.167_317_509_947_801e-4,
907    6.502_074_240_969_948e-4,
908    5.836_397_042_630_135e-4,
909    5.170_330_453_491_649e-4,
910    4.503_919_137_716_827e-4,
911    3.837_208_020_912_921_4e-4,
912    3.170_242_698_112_815e-4,
913    2.503_070_890_844_105e-4,
914    1.835_749_193_551_655_8e-4,
915    1.168_390_665_730_266_3e-4,
916    5.019_410_348_676_869_6e-5,
917];
918
/// Classification of a cell for the exact transport evaluation.
///
/// NOTE(review): the classifier itself is not in this part of the file. The
/// variant names presumably refer to the degree of
/// `q(z) = 0.5*(z^2 + eta(z)^2)` on the cell — confirm against `branch_cell`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExactCellBranch {
    /// Presumably `c2 = c3 = 0` (within tolerance): `eta` is affine.
    Affine,
    /// Presumably `c3 = 0` with `c2 != 0`: `eta` quadratic, `q` quartic.
    Quartic,
    /// Presumably `c3 != 0`: `eta` cubic, `q` sextic.
    Sextic,
}
925
926/// Auto-tune the per-cell affine/non-affine branch tolerance from the cell's
927/// own coefficient magnitudes.
928///
929/// The legacy `branch_cell` compared the normalized cubic coefficients
930/// `(k2, k3)` against a single global constant.  That constant is calibrated
931/// for cells whose anchor coefficients `(c0, c1)` are O(1).  When the anchor
932/// dominates — e.g. a tail cell with `|c0|, |c1| >> 1` — a relative criterion
933/// against the anchor magnitude is more numerically meaningful than the bare
934/// global threshold, because the affine contribution to `eta` already absorbs
935/// any difference at the chosen scale.
936///
937/// The returned tolerance is always at least [`NORMALIZED_CELL_BRANCH_TOL`],
938/// so cells with O(1) anchors recover bit-identical classification with the
939/// legacy code path.  This preserves numerical equivalence for the
940/// established `cubic_cell_kernel` tests, including the
941/// `tuned_branch_tolerance_matches_legacy_non_affine_transport_grid` grid.
942#[inline]
943fn effective_branch_tol(cell: DenestedCubicCell) -> f64 {
944    let anchor_scale = cell.c0.abs().max(cell.c1.abs()).max(1.0);
945    NORMALIZED_CELL_BRANCH_TOL * anchor_scale
946}
947
/// One cell of the de-nested transport: an interval `[left, right]` in `z`
/// together with the cubic `η(z) = c0 + c1·z + c2·z² + c3·z³` on it.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedCubicCell {
    pub left: f64,
    pub right: f64,
    pub c0: f64,
    pub c1: f64,
    pub c2: f64,
    pub c3: f64,
}

impl DenestedCubicCell {
    /// Evaluates the cubic `η` at `z`.
    ///
    /// The terms are formed and summed left to right in plain power form so
    /// the result is reproducible bit for bit.
    #[inline]
    pub fn eta(self, z: f64) -> f64 {
        let Self { c0, c1, c2, c3, .. } = self;
        let linear = c1 * z;
        let quadratic = c2 * z * z;
        let cubic = c3 * z * z * z;
        c0 + linear + quadratic + cubic
    }

    /// Evaluates `q(z) = ½·(z² + η(z)²)`.
    #[inline]
    pub fn q(self, z: f64) -> f64 {
        let eta = self.eta(z);
        let z_squared = z * z;
        let eta_squared = eta * eta;
        0.5 * (z_squared + eta_squared)
    }
}
970
/// Geometric fingerprint of a de-nested cubic cell, as produced by
/// [`cell_moment_fingerprint`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentFingerprint {
    /// Mixed 64-bit digest of `bins`.
    pub hash: u64,
    // Per-coordinate words in the order `left, right, c0, c1, c2, c3`. They
    // take part in the derived equality, so two fingerprints with the same
    // `hash` but different words still compare unequal.
    bins: [u64; 6],
}
976
/// Cache key pairing a cell fingerprint with the requested moment degree, as
/// built by [`cell_moment_cache_key`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentCacheKey {
    /// Fingerprint of the cell geometry.
    pub fingerprint: CellMomentFingerprint,
    /// Highest moment degree requested for the cell.
    pub max_degree: usize,
}
982
/// Plain counters describing cell-moment dedup lookups.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct CellMomentDedupStats {
    pub lookups: u64,
    pub hits: u64,
    pub misses: u64,
}

impl CellMomentDedupStats {
    /// Fraction of lookups that were hits, or `0.0` when there were no
    /// lookups at all.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.lookups {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1000
/// Local shorthand for `gam_linalg::utils::splitmix64_hash`, used to scramble
/// each fingerprint word before it is folded into the digest.
#[inline]
fn splitmix64(x: u64) -> u64 {
    gam_linalg::utils::splitmix64_hash(x)
}
1005
1006#[inline]
1007fn mix_fingerprint_words(words: &[u64]) -> u64 {
1008    let mut h = 0xcbf2_9ce4_8422_2325u64;
1009    for &word in words {
1010        h ^= splitmix64(word);
1011        h = h.wrapping_mul(0x100_0000_01b3);
1012    }
1013    h
1014}
1015
/// Maps one cell coordinate to the 64-bit word used in its fingerprint.
///
/// * If `epsilon` is not a positive finite number, or `x` is not finite, the
///   exact IEEE-754 bit pattern of `x` is returned (no binning).
/// * Otherwise `x` is binned to the nearest multiple of `epsilon` and the bit
///   pattern of the (integer-valued) bin index is returned.
///
/// Two details keep the binning well defined:
///
/// * the bin index is normalised so that `-0.0` and `+0.0` yield the same
///   word — without this, values just below zero and values just above zero
///   would land in different words even though both round to bin zero;
/// * if `x / epsilon` overflows to an infinity (pathologically small
///   `epsilon`), the exact bits of `x` are used instead, so unrelated values
///   are never collapsed into a single "infinite" bin.
#[inline]
fn quantized_cell_word(x: f64, epsilon: f64) -> u64 {
    let binning_enabled = epsilon.is_finite() && epsilon > 0.0 && x.is_finite();
    if !binning_enabled {
        return x.to_bits();
    }
    let bin = (x / epsilon).round();
    if !bin.is_finite() {
        return x.to_bits();
    }
    // `-0.0 + 0.0 == +0.0` under round-to-nearest; every other value is
    // unchanged by the addition.
    (bin + 0.0).to_bits()
}
1023
1024/// Returns a deterministic geometric fingerprint for a de-nested cubic cell.
1025///
1026/// With `epsilon == 0.0`, each coordinate is represented by its exact IEEE-754
1027/// bit pattern, so equal fingerprints imply bit-equal `(left, right, c0, c1,
1028/// c2, c3)` tuples.  With `epsilon > 0`, finite coordinates are binned to the
1029/// nearest multiple of `epsilon`; callers should treat this as an approximate
1030/// cache key and validate the resulting model error for their data.
1031pub fn cell_moment_fingerprint(cell: DenestedCubicCell, epsilon: f64) -> CellMomentFingerprint {
1032    let bins = [
1033        quantized_cell_word(cell.left, epsilon),
1034        quantized_cell_word(cell.right, epsilon),
1035        quantized_cell_word(cell.c0, epsilon),
1036        quantized_cell_word(cell.c1, epsilon),
1037        quantized_cell_word(cell.c2, epsilon),
1038        quantized_cell_word(cell.c3, epsilon),
1039    ];
1040    CellMomentFingerprint {
1041        hash: mix_fingerprint_words(&bins),
1042        bins,
1043    }
1044}
1045
1046#[inline]
1047pub fn cell_moment_cache_key(
1048    cell: DenestedCubicCell,
1049    max_degree: usize,
1050    epsilon: f64,
1051) -> CellMomentCacheKey {
1052    CellMomentCacheKey {
1053        fingerprint: cell_moment_fingerprint(cell, epsilon),
1054        max_degree,
1055    }
1056}
1057
/// A de-nested cubic cell together with the spans and boundary provenance it
/// was built from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedPartitionCell {
    /// The cell itself: interval bounds and cubic coefficients.
    pub cell: DenestedCubicCell,
    /// Score-side local span associated with this cell.
    pub score_span: LocalSpanCubic,
    /// Link-side local span associated with this cell.
    pub link_span: LocalSpanCubic,
    /// Provenance of the cell's boundaries: a fixed z location (score break
    /// or ±∞ tail) or a link-knot crossing `z = (τ - a)/b`. Together with
    /// `(score_span, link_span)` this identifies the cell's two-parameter
    /// family in `(a, b)` across rows (see
    /// [`crate::cell_moment_family`]).
    pub left_edge: PartitionEdge,
    /// Provenance of the right boundary; same conventions as `left_edge`.
    pub right_edge: PartitionEdge,
}

// Intentionally empty: no inherent methods are defined here.
impl DenestedPartitionCell {}
1073
/// Where one boundary of a denested partition cell comes from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PartitionEdge {
    /// A z location that does not depend on the row scalars: a score-spline
    /// break, or ±∞ for tail cells.
    Fixed(f64),
    /// A link-knot crossing: for the row's `(a, b)` the boundary sits at
    /// `z = (τ - a)/b`.
    Crossing { tau: f64 },
}

impl PartitionEdge {
    /// Resolves the boundary to a z location for the row scalars `(a, b)`.
    ///
    /// `Fixed` edges ignore `(a, b)`; `Crossing` edges evaluate
    /// `(tau - a) / b` with ordinary floating-point division (no special
    /// handling of `b == 0`).
    #[inline]
    pub fn z_at(self, a: f64, b: f64) -> f64 {
        let tau = match self {
            Self::Fixed(z) => return z,
            Self::Crossing { tau } => tau,
        };
        (tau - a) / b
    }
}
1095
/// Exact-bit key of an affine tail cell in the tail-cell moment memo, as
/// built by `tail_cell_cache_key`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct TailCellMomentCacheKey {
    // IEEE-754 bits of the affine coefficients `c0` and `c1`.
    c0_bits: u64,
    c1_bits: u64,
    // IEEE-754 bits of the cell's single finite endpoint.
    endpoint_bits: u64,
    // `-1` when the finite endpoint is the right bound (left tail), `1` when
    // it is the left bound (right tail).
    side: i8,
    // Requested highest moment degree.
    max_degree: usize,
}
1104
/// Byte budget handed to the tail-cell moment LRU (64 MiB).
const TAIL_CELL_MOMENT_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Entry-count budget handed to the tail-cell moment LRU.
const TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES: usize = 262_144;
1107
/// Point-in-time usage counters of a tail-cell moment cache.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct TailCellMomentCacheStats {
    pub hits: usize,
    pub misses: usize,
    pub entries: usize,
}

impl TailCellMomentCacheStats {
    /// Total number of counted requests: hits plus misses.
    #[inline]
    pub fn requests(self) -> usize {
        let Self { hits, misses, .. } = self;
        hits + misses
    }

    /// Fraction of counted requests that were hits; `0.0` when nothing has
    /// been counted yet.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.requests() {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1131
/// Affine-tail cell-moment memo.
///
/// Stand-alone instances (`TailCellMomentCache::new()`) are useful when a
/// caller needs deterministic hit/miss bookkeeping that is not polluted by
/// concurrent traffic on the global memo. The production path uses the
/// global instance behind [`evaluate_cell_moments`].
///
/// All methods take `&self`: the LRU is internally synchronized (sharded for
/// the concurrent global memo) and the counters are atomics, so the global
/// instance needs no outer `Mutex`. The previous `OnceLock<Mutex<…>>` wrapper
/// serialized every tail-cell evaluation across all rayon workers of the
/// marginal-slope exact-cache build — the same contention class the sharded
/// per-family cell-moment LRU fix removed.
#[derive(Debug)]
pub struct TailCellMomentCache {
    // Resident results, keyed by the exact-bit tail-cell key.
    moments: ByteLruCache<TailCellMomentCacheKey, CellMomentState>,
    // Keys currently being computed. Each slot is filled once with the
    // computing caller's result so concurrent requests for the same cold key
    // can wait on it instead of recomputing.
    in_flight: std::sync::Mutex<
        std::collections::HashMap<
            TailCellMomentCacheKey,
            Arc<std::sync::OnceLock<Result<CellMomentState, String>>>,
        >,
    >,
    // Request counters reported through `stats()`.
    hits: std::sync::atomic::AtomicUsize,
    misses: std::sync::atomic::AtomicUsize,
}
1157
1158impl Default for TailCellMomentCache {
1159    fn default() -> Self {
1160        // Tail-cell entries are small (a short moment vector), so sharding
1161        // the byte/entry budgets is harmless; size the shard count off the
1162        // worker pool exactly like the per-family cell-moment LRU.
1163        let shard_count = std::thread::available_parallelism()
1164            .map(|workers| workers.get().saturating_mul(8))
1165            .unwrap_or(32)
1166            .clamp(8, 256);
1167        Self {
1168            moments: ByteLruCache::with_max_entries_sharded(
1169                TAIL_CELL_MOMENT_CACHE_MAX_BYTES,
1170                TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES,
1171                shard_count,
1172            ),
1173            in_flight: std::sync::Mutex::new(std::collections::HashMap::new()),
1174            hits: std::sync::atomic::AtomicUsize::new(0),
1175            misses: std::sync::atomic::AtomicUsize::new(0),
1176        }
1177    }
1178}
1179
1180impl TailCellMomentCache {
1181    /// Construct an empty cache. Hits/misses start at zero.
1182    #[inline]
1183    pub fn new() -> Self {
1184        Self::default()
1185    }
1186
1187    /// Reset the cache to its empty state. Existing entries are dropped and
1188    /// the hit/miss counters are zeroed.
1189    #[inline]
1190    pub fn clear(&self) {
1191        self.moments.clear();
1192        self.in_flight
1193            .lock()
1194            .unwrap_or_else(|p| p.into_inner())
1195            .clear();
1196        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
1197        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
1198    }
1199
1200    /// Snapshot of the cache's current usage stats.
1201    #[inline]
1202    pub fn stats(&self) -> TailCellMomentCacheStats {
1203        TailCellMomentCacheStats {
1204            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
1205            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
1206            entries: self.moments.len(),
1207        }
1208    }
1209
1210    /// Look up `cell` at `max_degree`, computing and inserting the result on
1211    /// miss. Cells outside the affine-tail keyset bypass the cache and run
1212    /// the uncached evaluator directly without touching the counters.
1213    ///
1214    /// Stat semantics: every request served from an existing resident entry,
1215    /// or from a concurrently published entry for the same key, increments
1216    /// `hits`; a **miss** is counted only for the caller that actually
1217    /// computes a cold key. The compute happens outside the LRU shard lock,
1218    /// but an in-flight table coalesces same-key cold races so followers reuse
1219    /// the leader's published value instead of duplicating work.
1220    pub fn evaluate(
1221        &self,
1222        cell: DenestedCubicCell,
1223        max_degree: usize,
1224    ) -> Result<CellMomentState, String> {
1225        let Some(key) = tail_cell_cache_key(cell, max_degree) else {
1226            return evaluate_cell_moments_uncached(cell, max_degree);
1227        };
1228        if let Some(state) = self.moments.get(&key) {
1229            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1230            return Ok(state);
1231        }
1232
1233        let (slot, leader) = {
1234            let mut in_flight = self.in_flight.lock().unwrap_or_else(|p| p.into_inner());
1235            if let Some(slot) = in_flight.get(&key) {
1236                (Arc::clone(slot), false)
1237            } else {
1238                let slot = Arc::new(std::sync::OnceLock::new());
1239                in_flight.insert(key, Arc::clone(&slot));
1240                (slot, true)
1241            }
1242        };
1243
1244        if !leader {
1245            let state = slot.wait().clone()?;
1246            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1247            return Ok(state);
1248        }
1249
1250        let state = evaluate_cell_moments_uncached(cell, max_degree);
1251        if let Ok(state) = &state {
1252            self.moments.insert(key, state.clone());
1253            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1254        }
1255        self.misses
1256            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1257        if let Err(existing_state) = slot.set(state.clone()) {
1258            std::mem::drop(existing_state);
1259        }
1260        self.in_flight
1261            .lock()
1262            .unwrap_or_else(|p| p.into_inner())
1263            .remove(&key);
1264        state
1265    }
1266}
1267
/// Process-wide tail-cell moment memo, created lazily on first access.
static TAIL_CELL_MOMENT_CACHE: std::sync::OnceLock<TailCellMomentCache> =
    std::sync::OnceLock::new();
/// Toggle written by `set_tail_cell_moment_cache_enabled`; starts as `true`.
// NOTE(review): the reader of this flag is outside this part of the file —
// presumably the cached evaluation entry point; confirm.
static TAIL_CELL_MOMENT_CACHE_ENABLED: std::sync::atomic::AtomicBool =
    std::sync::atomic::AtomicBool::new(true);
1272
1273fn tail_cell_moment_cache() -> &'static TailCellMomentCache {
1274    TAIL_CELL_MOMENT_CACHE.get_or_init(TailCellMomentCache::default)
1275}
1276
1277#[inline]
1278fn tail_cell_cache_key(
1279    cell: DenestedCubicCell,
1280    max_degree: usize,
1281) -> Option<TailCellMomentCacheKey> {
1282    if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL {
1283        return None;
1284    }
1285    match (!cell.left.is_finite(), !cell.right.is_finite()) {
1286        (true, false) if cell.right.is_finite() => Some(TailCellMomentCacheKey {
1287            c0_bits: cell.c0.to_bits(),
1288            c1_bits: cell.c1.to_bits(),
1289            endpoint_bits: cell.right.to_bits(),
1290            side: -1,
1291            max_degree,
1292        }),
1293        (false, true) if cell.left.is_finite() => Some(TailCellMomentCacheKey {
1294            c0_bits: cell.c0.to_bits(),
1295            c1_bits: cell.c1.to_bits(),
1296            endpoint_bits: cell.left.to_bits(),
1297            side: 1,
1298            max_degree,
1299        }),
1300        _ => None,
1301    }
1302}
1303
/// Sets the global tail-cell moment cache toggle (relaxed atomic store).
///
/// This only records the flag; it does not clear the cache or its counters.
// NOTE(review): the code that reads the flag is not in this part of the
// file — confirm which evaluation paths honour it.
pub fn set_tail_cell_moment_cache_enabled(enabled: bool) {
    TAIL_CELL_MOMENT_CACHE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed);
}
1307
/// Clears the global tail-cell moment cache: entries are dropped and the
/// hit/miss counters are zeroed. The global instance is created first if it
/// does not exist yet.
pub fn reset_tail_cell_moment_cache() {
    tail_cell_moment_cache().clear();
}
1311
/// Returns a snapshot of the global tail-cell moment cache's hit, miss and
/// entry counts. The global instance is created first if it does not exist
/// yet.
pub fn tail_cell_moment_cache_stats() -> TailCellMomentCacheStats {
    tail_cell_moment_cache().stats()
}
1315
1316#[derive(Clone, Copy, Debug, Eq)]
1317pub struct CellFingerprint {
1318    c0: u64,
1319    c1: u64,
1320    c2: u64,
1321    c3: u64,
1322    left: u64,
1323    right: u64,
1324}
1325
1326impl CellFingerprint {
1327    #[inline]
1328    pub fn new(cell: DenestedCubicCell) -> Self {
1329        Self {
1330            c0: cell.c0.to_bits(),
1331            c1: cell.c1.to_bits(),
1332            c2: cell.c2.to_bits(),
1333            c3: cell.c3.to_bits(),
1334            left: cell.left.to_bits(),
1335            right: cell.right.to_bits(),
1336        }
1337    }
1338}
1339
1340impl PartialEq for CellFingerprint {
1341    #[inline]
1342    fn eq(&self, other: &Self) -> bool {
1343        self.c0 == other.c0
1344            && self.c1 == other.c1
1345            && self.c2 == other.c2
1346            && self.c3 == other.c3
1347            && self.left == other.left
1348            && self.right == other.right
1349    }
1350}
1351
1352impl Hash for CellFingerprint {
1353    #[inline]
1354    fn hash<H: Hasher>(&self, state: &mut H) {
1355        self.c0.hash(state);
1356        self.c1.hash(state);
1357        self.c2.hash(state);
1358        self.c3.hash(state);
1359        self.left.hash(state);
1360        self.right.hash(state);
1361    }
1362}
1363
/// Cache entry for one cell: optionally its value moments, optionally its
/// derivative moments, or both.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CachedCellMoments {
    /// Regular (value) cell moments, populated by
    /// `evaluate_cell_moments_cached`. None when only derivative moments
    /// have been cached for this cell. Wrapped in `Arc` so `ByteLruCache`
    /// returns lookups through cheap refcount bumps instead of deep-cloning
    /// the inline `SmallVec<[f64; 10]>` (which spills on every degree-`>= 10`
    /// request) on every hot-path LRU hit.
    state: Option<Arc<CellMomentState>>,
    /// Derivative moments, populated by
    /// `evaluate_cell_derivative_moments_cached`. None when only value
    /// moments have been cached for this cell. Both variants share the
    /// same `CellFingerprint` key so derivative-only callers do not evict
    /// pre-cached value entries and vice versa. Same `Arc` wrapping rationale
    /// as `state` above.
    derivative_state: Option<Arc<CellDerivativeMomentState>>,
}
1381
1382impl CachedCellMoments {
1383    #[inline]
1384    pub fn new(state: Arc<CellMomentState>) -> Self {
1385        Self {
1386            state: Some(state),
1387            derivative_state: None,
1388        }
1389    }
1390
1391    #[inline]
1392    pub fn new_derivative(state: Arc<CellDerivativeMomentState>) -> Self {
1393        Self {
1394            state: None,
1395            derivative_state: Some(state),
1396        }
1397    }
1398
1399    #[inline]
1400    pub fn state_for_degree(&self, max_degree: usize) -> Option<CellMomentState> {
1401        let state = self.state.as_ref()?;
1402        if state.moments.len().saturating_sub(1) < max_degree {
1403            return None;
1404        }
1405        // Cached `Arc<CellMomentState>` is shared across LRU hits, so we
1406        // cannot reuse the inner vector in place. Clone the underlying state
1407        // and (rarely) truncate down to the requested degree to honour the
1408        // public moment-length contract.
1409        let mut state = (**state).clone();
1410        state.moments.truncate(max_degree + 1);
1411        Some(state)
1412    }
1413
1414    #[inline]
1415    pub fn derivative_state_for_degree(
1416        &self,
1417        max_degree: usize,
1418    ) -> Option<CellDerivativeMomentState> {
1419        let state = self.derivative_state.as_ref()?;
1420        if state.moments.len().saturating_sub(1) < max_degree {
1421            return None;
1422        }
1423        // See `state_for_degree`: shared `Arc` forces an inner clone here.
1424        let mut state = (**state).clone();
1425        state.moments.truncate(max_degree + 1);
1426        Some(state)
1427    }
1428
1429    #[inline]
1430    pub fn with_value(mut self, state: Arc<CellMomentState>) -> Self {
1431        self.state = Some(state);
1432        self
1433    }
1434
1435    #[inline]
1436    pub fn with_derivative(mut self, state: Arc<CellDerivativeMomentState>) -> Self {
1437        self.derivative_state = Some(state);
1438        self
1439    }
1440}
1441
1442impl ResidentBytes for CachedCellMoments {
1443    fn resident_bytes(&self) -> usize {
1444        let value_bytes = self
1445            .state
1446            .as_ref()
1447            .map_or(0, |state| state.resident_bytes());
1448        let derivative_bytes = self
1449            .derivative_state
1450            .as_ref()
1451            .map_or(0, |state| state.resident_bytes());
1452        std::mem::size_of::<Self>()
1453            .saturating_add(value_bytes)
1454            .saturating_add(derivative_bytes)
1455    }
1456}
1457
/// Shared hit/miss counters for a cell-moment cache.
#[derive(Debug, Default)]
pub struct CellMomentCacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CellMomentCacheStats {
    /// Current `(hits, misses)`, each read with relaxed ordering.
    #[inline]
    pub fn snapshot(&self) -> (u64, u64) {
        let hits = self.hits.load(Ordering::Relaxed);
        let misses = self.misses.load(Ordering::Relaxed);
        (hits, misses)
    }

    /// Change since an earlier [`Self::snapshot`]: returns
    /// `(hit delta, miss delta, hit rate over the delta)`.
    ///
    /// Deltas saturate at zero if a counter is below its `before` value, and
    /// the rate is `0.0` when both deltas are zero.
    #[inline]
    pub fn hit_rate_delta(&self, before: (u64, u64)) -> (u64, u64, f64) {
        let (hits_before, misses_before) = before;
        let (hits_now, misses_now) = self.snapshot();
        let dh = hits_now.saturating_sub(hits_before);
        let dm = misses_now.saturating_sub(misses_before);
        let rate = match dh + dm {
            0 => 0.0,
            total => dh as f64 / total as f64,
        };
        (dh, dm, rate)
    }
}
1487
/// LRU cache from exact-bit cell fingerprints to cached cell moments.
pub type CellMomentLruCache = ByteLruCache<CellFingerprint, CachedCellMoments>;

/// Number of moments a [`CellMomentVec`] stores inline before spilling to the
/// heap.
pub const CELL_MOMENT_INLINE_CAPACITY: usize = 10;

/// Moment vector with inline storage for [`CELL_MOMENT_INLINE_CAPACITY`]
/// values.
pub type CellMomentVec = SmallVec<[f64; CELL_MOMENT_INLINE_CAPACITY]>;
1493
1494#[derive(Clone, Debug, PartialEq)]
1495pub struct CellMomentState {
1496    pub branch: ExactCellBranch,
1497    pub value: f64,
1498    pub moments: CellMomentVec,
1499}
1500
1501impl ResidentBytes for CellMomentState {
1502    fn resident_bytes(&self) -> usize {
1503        let spilled_bytes = if self.moments.spilled() {
1504            self.moments
1505                .capacity()
1506                .saturating_mul(std::mem::size_of::<f64>())
1507        } else {
1508            0
1509        };
1510        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1511    }
1512}
1513
1514#[derive(Clone, Debug, PartialEq)]
1515pub struct CellDerivativeMomentState {
1516    pub branch: ExactCellBranch,
1517    pub moments: CellMomentVec,
1518}
1519
1520impl ResidentBytes for CellDerivativeMomentState {
1521    fn resident_bytes(&self) -> usize {
1522        let spilled_bytes = if self.moments.spilled() {
1523            self.moments
1524                .capacity()
1525                .saturating_mul(std::mem::size_of::<f64>())
1526        } else {
1527            0
1528        };
1529        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1530    }
1531}
1532
/// Borrowed counterpart of [`CellMomentState`]: same `branch` and `value`,
/// with the moments held as a slice instead of an owned vector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CellMomentStateRef<'a> {
    pub branch: ExactCellBranch,
    pub value: f64,
    pub moments: &'a [f64],
}
1539
/// Reusable moment buffer, so repeated evaluations can share one allocation.
#[derive(Clone, Debug)]
pub struct CellMomentScratch {
    // Backing storage; only grows (see `prepare_moments`).
    moments: Vec<f64>,
}

impl Default for CellMomentScratch {
    /// Creates an empty scratch with capacity for
    /// `MAX_AFFINE_ANCHOR_DEGREE + 1` moments.
    fn default() -> Self {
        // Pre-size to the codebase's max moment degree so steady-state
        // `prepare_moments` calls never reallocate. Calls with `len`
        // exceeding this still reserve lazily.
        Self {
            moments: Vec::with_capacity(MAX_AFFINE_ANCHOR_DEGREE + 1),
        }
    }
}
1555
1556impl CellMomentScratch {
1557    pub fn new() -> Self {
1558        Self::default()
1559    }
1560
1561    pub fn with_capacity(max_degree: usize) -> Self {
1562        Self {
1563            moments: Vec::with_capacity(max_degree + 1),
1564        }
1565    }
1566
1567    #[inline]
1568    fn prepare_moments(&mut self, len: usize) -> &mut [f64] {
1569        if self.moments.capacity() < len {
1570            CELL_MOMENT_REALLOCS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1571            self.moments.reserve(len - self.moments.capacity());
1572        }
1573        // Grow monotonically: shorter requests should not truncate the backing
1574        // storage and then zero the old tail when a later request grows again.
1575        // Only the active prefix is scratch for this evaluation.
1576        if self.moments.len() < len {
1577            self.moments.resize(len, 0.0);
1578        }
1579        let out = &mut self.moments[..len];
1580        out.fill(0.0);
1581        out
1582    }
1583}
1584
/// Counter for moment-buffer reallocations in `prepare_moments`. Production
/// code increments this on every buffer growth; the test mod inspects it to
/// assert the steady-state hot loop allocates exactly once per row buffer.
///
/// The counter is process-wide and is only ever incremented in
/// `prepare_moments`, with relaxed ordering.
pub(crate) static CELL_MOMENT_REALLOCS: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);
1590
/// Canonical 20-point Gauss–Legendre nodes on [-1, 1] (Abramowitz & Stegun
/// 25.4), tabulated to f64 precision. Used here for the Drezner–Wesolowsky
/// bivariate normal CDF representation — 20 points give >30-digit accuracy for
/// the smooth arcsin-transformed integrand, ensuring the BVN value is exact to
/// f64 precision for all (h, k, ρ) — and shared with the cubic-cell B-spline
/// moment parity gate in [`crate::gpu_kernels::cubic_bspline_moments`].
///
/// The nodes are listed in ascending order and are symmetric about zero.
pub const GL20_NODES: [f64; 20] = [
    -0.993_128_599_185_094_9,
    -0.963_971_927_277_913_8,
    -0.912_234_428_251_326,
    -0.839_116_971_822_218_8,
    -0.746_331_906_460_150_8,
    -0.636_053_680_726_515,
    -0.510_867_001_950_827_1,
    -0.373_706_088_715_419_6,
    -0.227_785_851_141_645_1,
    -0.076_526_521_133_497_33,
    0.076_526_521_133_497_33,
    0.227_785_851_141_645_1,
    0.373_706_088_715_419_6,
    0.510_867_001_950_827_1,
    0.636_053_680_726_515,
    0.746_331_906_460_150_8,
    0.839_116_971_822_218_8,
    0.912_234_428_251_326,
    0.963_971_927_277_913_8,
    0.993_128_599_185_094_9,
];
1619
/// Companion weights to [`GL20_NODES`]. Symmetric, summing to 2.
///
/// Entry `i` is the weight for node `i` of [`GL20_NODES`].
pub const GL20_WEIGHTS: [f64; 20] = [
    0.017_614_007_139_152_12,
    0.040_601_429_800_386_94,
    0.062_672_048_334_109_06,
    0.083_276_741_576_704_75,
    0.101_930_119_817_240_4,
    0.118_194_531_961_518_4,
    0.131_688_638_449_176_6,
    0.142_096_109_318_382_1,
    0.149_172_986_472_603_7,
    0.152_753_387_130_725_9,
    0.152_753_387_130_725_9,
    0.149_172_986_472_603_7,
    0.142_096_109_318_382_1,
    0.131_688_638_449_176_6,
    0.118_194_531_961_518_4,
    0.101_930_119_817_240_4,
    0.083_276_741_576_704_75,
    0.062_672_048_334_109_06,
    0.040_601_429_800_386_94,
    0.017_614_007_139_152_12,
];
1643
1644/// Provenance-tagged breakpoint dedup: sorts ascending and merges entries
1645/// coinciding within 1e-12, but when a fixed score break and a link-knot
1646/// crossing coincide (the kink configuration), the surviving entry keeps
1647/// the `Fixed` tag — a deterministic choice; the z location is identical
1648/// either way.
1649fn dedup_sorted_tagged_breakpoints(points: &mut Vec<(f64, PartitionEdge)>) {
1650    points.sort_by(|lhs, rhs| {
1651        lhs.0
1652            .partial_cmp(&rhs.0)
1653            .unwrap_or(std::cmp::Ordering::Equal)
1654    });
1655    points.dedup_by(|lhs, rhs| {
1656        let coincide = if lhs.0 == rhs.0 {
1657            true
1658        } else if lhs.0.is_finite() && rhs.0.is_finite() {
1659            (lhs.0 - rhs.0).abs() <= 1e-12
1660        } else {
1661            false
1662        };
1663        if coincide && matches!(lhs.1, PartitionEdge::Fixed(_)) {
1664            // `dedup_by` keeps `rhs` (the earlier element) — propagate the
1665            // Fixed tag onto the survivor.
1666            rhs.1 = lhs.1;
1667        }
1668        coincide
1669    });
1670}
1671
1672#[inline]
1673pub fn interval_probe_point(left: f64, right: f64) -> Result<f64, String> {
1674    if !(left < right) {
1675        return Err(CubicCellKernelError::invalid_interval(format!(
1676            "interval probe requires ordered bounds, got [{left}, {right}]"
1677        ))
1678        .into());
1679    }
1680    if left.is_finite() && right.is_finite() {
1681        Ok(0.5 * (left + right))
1682    } else if left == f64::NEG_INFINITY && right == f64::INFINITY {
1683        Ok(0.0)
1684    } else if left == f64::NEG_INFINITY && right.is_finite() {
1685        Ok(right - 1.0)
1686    } else if left.is_finite() && right == f64::INFINITY {
1687        Ok(left + 1.0)
1688    } else {
1689        Err(CubicCellKernelError::invalid_interval(format!(
1690            "interval probe requires finite bounds or full infinities, got [{left}, {right}]"
1691        ))
1692        .into())
1693    }
1694}
1695
/// Coefficients of `q'(z)`, lowest power first, for a quadratic
/// `η(z) = c0 + c1·z + c2·z²` and `q(z) = ½·(z² + η(z)²)`:
///
/// `q'(z) = c0·c1 + (1 + c1² + 2·c0·c2)·z + 3·c1·c2·z² + 2·c2²·z³`.
#[inline]
pub fn quartic_qprime_coefficients(c0: f64, c1: f64, c2: f64) -> [f64; 4] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c1 * c2;
    let cubic = 2.0 * c2 * c2;
    [constant, linear, quadratic, cubic]
}
1705
/// Coefficients of `q'(z)`, lowest power first, for a cubic
/// `η(z) = c0 + c1·z + c2·z² + c3·z³` and `q(z) = ½·(z² + η(z)²)`:
///
/// `q'(z) = c0·c1 + (1 + c1² + 2·c0·c2)·z + (3·c0·c3 + 3·c1·c2)·z²
///          + (4·c1·c3 + 2·c2²)·z³ + 5·c2·c3·z⁴ + 3·c3²·z⁵`.
#[inline]
pub fn sextic_qprime_coefficients(c0: f64, c1: f64, c2: f64, c3: f64) -> [f64; 6] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c0 * c3 + 3.0 * c1 * c2;
    let cubic = 4.0 * c1 * c3 + 2.0 * c2 * c2;
    let quartic = 5.0 * c2 * c3;
    let quintic = 3.0 * c3 * c3;
    [constant, linear, quadratic, cubic, quartic, quintic]
}
1717
1718/// Boundary term `right^n · exp(−q(right)) − left^n · exp(−q(left))` used by
1719/// the moment recurrences. Takes precomputed `left^n` and `right^n` so callers
1720/// can roll the powers across a recurrence — each iteration becomes one
1721/// multiply instead of a fresh `powi(n)`.
1722#[inline]
1723fn moment_boundary_term_with_powers(
1724    cell: DenestedCubicCell,
1725    left_pow_n: f64,
1726    right_pow_n: f64,
1727) -> f64 {
1728    let left_term = if cell.left.is_infinite() {
1729        0.0
1730    } else {
1731        left_pow_n * (-cell.q(cell.left)).exp()
1732    };
1733    let right_term = if cell.right.is_infinite() {
1734        0.0
1735    } else {
1736        right_pow_n * (-cell.q(cell.right)).exp()
1737    };
1738    right_term - left_term
1739}
1740
1741#[inline]
1742fn base_moments_match_direct(base: &[f64], direct: &[f64]) -> bool {
1743    base.iter()
1744        .zip(direct.iter())
1745        .all(|(&lhs, &rhs)| (lhs - rhs).abs() <= 1e-10 * (1.0 + lhs.abs().max(rhs.abs())))
1746}
1747
1748#[inline]
1749fn direct_non_affine_moments_if_base_matches(
1750    cell: DenestedCubicCell,
1751    base: &[f64],
1752    max_degree: usize,
1753) -> Option<Vec<f64>> {
1754    if !cell.left.is_finite() || !cell.right.is_finite() {
1755        return None;
1756    }
1757    // When the supplied base moments are the actual moments of this fixed
1758    // finite cell, prefer the same quadrature-backed evaluator used by the
1759    // public non-affine moment path.  The algebraic raising recurrence is kept
1760    // below for callers that intentionally pass symbolic or otherwise
1761    // non-cell-consistent bases, but repeatedly dividing by the quartic/sextic
1762    // leading coefficient can amplify harmless base-roundoff into high-order
1763    // moment error.
1764    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
1765    if base_moments_match_direct(base, &moments) {
1766        Some(moments.into_vec())
1767    } else {
1768        None
1769    }
1770}
1771
1772pub fn reduce_quartic_moments(
1773    cell: DenestedCubicCell,
1774    base_m0_m2: [f64; 3],
1775    max_degree: usize,
1776) -> Result<Vec<f64>, String> {
1777    if max_degree <= 2 {
1778        return Ok(base_m0_m2[..=max_degree].to_vec());
1779    }
1780    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m2, max_degree)
1781    {
1782        return Ok(moments);
1783    }
1784    let d = quartic_qprime_coefficients(cell.c0, cell.c1, cell.c2);
1785    let lead = d[3];
1786    if !lead.is_finite() || lead.abs() <= 1e-18 {
1787        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1788            "quartic moment reduction requires nonzero leading coefficient, got {lead:.3e}"
1789        ))
1790        .into());
1791    }
1792    let mut moments = vec![0.0; max_degree + 1];
1793    moments[0] = base_m0_m2[0];
1794    moments[1] = base_m0_m2[1];
1795    moments[2] = base_m0_m2[2];
1796    // Roll left^n / right^n across the recurrence rather than calling
1797    // `powi(n)` each iteration. Skip the multiply when an endpoint is
1798    // infinite — the boundary helper ignores the power in that case, and
1799    // ∞·0 would produce a NaN we'd then have to mask off anyway.
1800    let left_finite = cell.left.is_finite();
1801    let right_finite = cell.right.is_finite();
1802    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1803    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1804    for n in 0..=(max_degree - 3) {
1805        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1806        let mut numer = if n == 0 {
1807            0.0
1808        } else {
1809            (n as f64) * moments[n - 1]
1810        };
1811        for j in 0..=2 {
1812            numer -= d[j] * moments[n + j];
1813        }
1814        numer -= b_n;
1815        moments[n + 3] = numer / lead;
1816        if left_finite {
1817            left_pow_n *= cell.left;
1818        }
1819        if right_finite {
1820            right_pow_n *= cell.right;
1821        }
1822    }
1823    Ok(moments)
1824}
1825
1826pub fn reduce_sextic_moments(
1827    cell: DenestedCubicCell,
1828    base_m0_m4: [f64; 5],
1829    max_degree: usize,
1830) -> Result<Vec<f64>, String> {
1831    if max_degree <= 4 {
1832        return Ok(base_m0_m4[..=max_degree].to_vec());
1833    }
1834    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m4, max_degree)
1835    {
1836        return Ok(moments);
1837    }
1838    let d = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3);
1839    let lead = d[5];
1840    if !lead.is_finite() {
1841        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1842            "sextic moment reduction encountered non-finite leading coefficient: {lead:.3e}"
1843        ))
1844        .into());
1845    }
1846    if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
1847        if lower_branch == ExactCellBranch::Quartic {
1848            return evaluate_non_affine_cell_state(
1849                DenestedCubicCell { c3: 0.0, ..cell },
1850                ExactCellBranch::Quartic,
1851                max_degree,
1852            )
1853            .map(|state| state.moments.into_vec());
1854        }
1855        return evaluate_affine_cell_state(
1856            DenestedCubicCell {
1857                left: cell.left,
1858                right: cell.right,
1859                c0: cell.c0,
1860                c1: cell.c1,
1861                c2: 0.0,
1862                c3: 0.0,
1863            },
1864            max_degree,
1865        )
1866        .map(|state| state.moments.into_vec());
1867    }
1868    let mut moments = vec![0.0; max_degree + 1];
1869    for (idx, value) in base_m0_m4.into_iter().enumerate() {
1870        moments[idx] = value;
1871    }
1872    let left_finite = cell.left.is_finite();
1873    let right_finite = cell.right.is_finite();
1874    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1875    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1876    for n in 0..=(max_degree - 5) {
1877        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1878        let mut numer = if n == 0 {
1879            0.0
1880        } else {
1881            (n as f64) * moments[n - 1]
1882        };
1883        for j in 0..=4 {
1884            numer -= d[j] * moments[n + j];
1885        }
1886        numer -= b_n;
1887        moments[n + 5] = numer / lead;
1888        if left_finite {
1889            left_pow_n *= cell.left;
1890        }
1891        if right_finite {
1892            right_pow_n *= cell.right;
1893        }
1894    }
1895    Ok(moments)
1896}
1897
1898#[inline]
1899pub fn cell_first_derivative_from_moments(
1900    derivative_coefficients: &[f64],
1901    moments: &[f64],
1902) -> Result<f64, String> {
1903    let value = moment_dot_with_coefficients(derivative_coefficients, moments, "first derivative")?;
1904    Ok(value * INV_TWO_PI)
1905}
1906
/// Highest moment index read by
/// `cell_first_derivative_from_moments(derivative_coefficients, moments)`:
/// one less than the coefficient count, or `0` for an empty slice.
///
/// Pass at least this value as the `max_degree` of `evaluate_cell_moments`
/// so the moment dot is well-defined. Issue #321 came from a call site that
/// hardcoded a smaller degree instead of asking this function.
#[inline]
pub fn cell_first_derivative_required_max_degree(derivative_coefficients: &[f64]) -> usize {
    match derivative_coefficients.len() {
        0 => 0,
        count => count - 1,
    }
}
1918
/// Highest moment index read by `cell_second_derivative_from_moments`.
///
/// This is the kernel's own `needed = max(second_deg, product_deg) + 1`
/// bound expressed as a `max_degree` (that is, `needed - 1`), matching the
/// argument convention of `evaluate_cell_moments(cell, max_degree)`.
///
/// The product degree is the sum of the two first-coefficient degrees plus
/// 3, the degree of the cubic-cell `eta` polynomial the contraction folds
/// in. An empty slice counts as degree 0.
#[inline]
pub fn cell_second_derivative_required_max_degree(
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    second_coefficients_rs: &[f64],
) -> usize {
    let degree_of = |coefficients: &[f64]| coefficients.len().saturating_sub(1);
    let product_degree = degree_of(first_coefficients_r) + degree_of(first_coefficients_s) + 3;
    degree_of(second_coefficients_rs).max(product_degree)
}
1939
1940#[inline]
1941pub fn cell_polynomial_integral_from_moments(
1942    polynomial_coefficients: &[f64],
1943    moments: &[f64],
1944    label: &str,
1945) -> Result<f64, String> {
1946    let value = moment_dot_with_coefficients(polynomial_coefficients, moments, label)?;
1947    Ok(value * INV_TWO_PI)
1948}
1949
1950#[inline]
1951pub fn cell_second_derivative_from_moments(
1952    cell: DenestedCubicCell,
1953    first_coefficients_r: &[f64],
1954    first_coefficients_s: &[f64],
1955    second_coefficients_rs: &[f64],
1956    moments: &[f64],
1957) -> Result<f64, String> {
1958    let second_degree = second_coefficients_rs.len().saturating_sub(1);
1959    let product_degree = first_coefficients_r.len().saturating_sub(1)
1960        + first_coefficients_s.len().saturating_sub(1)
1961        + 3;
1962    let needed = second_degree.max(product_degree) + 1;
1963    if needed > moments.len() {
1964        return Err(CubicCellKernelError::insufficient_moments(format!(
1965            "insufficient reduced moments for second derivative: need {}, have {}",
1966            needed,
1967            moments.len()
1968        ))
1969        .into());
1970    }
1971    let second_term = moment_dot_with_coefficients_unchecked(second_coefficients_rs, moments);
1972    // Fold `Σ_{e,i,j} eta[e]·r[i]·s[j]·moments[e+i+j]` into a single dot
1973    // against `moments`. Convolving `eta ⊗ r ⊗ s` first turns the original
1974    // `len(eta)·len(r)·len(s)` triple loop (typically 4·4·4 = 64 mul-adds
1975    // per call) into `len(eta)·len(r) + (len(eta)+len(r)-1)·len(s) +
1976    // len(out)` ≈ 16 + 28 + 10 = 54 mul-adds, with the inner loops now in
1977    // straight-line FMA-friendly form.
1978    let cubic = [cell.c0, cell.c1, cell.c2, cell.c3];
1979    // Capacity bound: cubic (4) + first_r (≤MAX) + first_s (≤MAX) - 2.
1980    // First-coefficient slices are passed in as `[f64; 4]` from every
1981    // production caller; sizing to 32 covers any realistic test input.
1982    const SCRATCH: usize = 32;
1983    let mut eta_r = [0.0_f64; SCRATCH];
1984    let mut eta_rs = [0.0_f64; SCRATCH];
1985    let er_len = poly_conv_into(&cubic, first_coefficients_r, &mut eta_r);
1986    let ers_len = poly_conv_into(&eta_r[..er_len], first_coefficients_s, &mut eta_rs);
1987    let mut eta_term = 0.0;
1988    for k in 0..ers_len {
1989        eta_term = eta_rs[k].mul_add(moments[k], eta_term);
1990    }
1991    Ok((second_term - eta_term) * INV_TWO_PI)
1992}
1993
1994/// Pointwise value of the cell second-derivative integrand
1995/// `(∂²/∂r∂s) exp(-q(z))/2π` at a single `z`, evaluated from the SAME
1996/// `(r, s, rs)` coefficient polynomials the moment reduction
1997/// [`cell_second_derivative_from_moments`] integrates:
1998///
1999/// ```text
2000///   F_rs(z) = ( c_rs(z) - η(z)·c_r(z)·c_s(z) ) · exp(-q(z)) · 1/2π ,
2001/// ```
2002///
2003/// with `c_•(z) = Σ_k coeff_•[k]·zᵏ`, `η(z)` the cell cubic, and
2004/// `q(z) = ½(z² + η(z)²)`. This is the integrand whose `[cell.left,
2005/// cell.right]` integral the from-moments form returns — needed for the
2006/// Leibniz boundary term when a cell edge (a link-knot crossing
2007/// `z=(τ-a)/b`) moves with a parameter (the slope `b`): the directional
2008/// derivative of `∫_{z_L}^{z_R} F_rs dz` picks up
2009/// `F_rs(z_R)·z_R'(dir) - F_rs(z_L)·z_L'(dir)` on top of the fixed-domain
2010/// part. Coefficient sign convention matches the simpson reference
2011/// (`numeric_ab`): pass the ACTUAL derivative-coefficient polynomials
2012/// `∂c/∂r` etc. (not the negated `neg_dc_d•` the moment path consumes).
2013#[inline]
2014pub fn cell_second_derivative_boundary_integrand(
2015    cell: DenestedCubicCell,
2016    first_coefficients_r: &[f64],
2017    first_coefficients_s: &[f64],
2018    second_coefficients_rs: &[f64],
2019    z: f64,
2020) -> f64 {
2021    let eta = cell.eta(z);
2022    let c_r = poly_eval_at(first_coefficients_r, z);
2023    let c_s = poly_eval_at(first_coefficients_s, z);
2024    let c_rs = poly_eval_at(second_coefficients_rs, z);
2025    (c_rs - eta * c_r * c_s) * (-cell.q(z)).exp() * INV_TWO_PI
2026}
2027
2028/// Pointwise value of the cell third-derivative integrand
2029/// `(∂³/∂r∂s∂t) exp(-q(z))/2π` at a single `z`, evaluated from the same
2030/// `(r, s, t, rs, rt, st, rst)` coefficient polynomials that
2031/// [`cell_third_derivative_from_moments`] integrates:
2032///
2033/// ```text
2034/// F_rst(z) = (
2035///     c_rst(z)
2036///   - η(z)·(c_rs(z)c_t(z) + c_rt(z)c_s(z) + c_st(z)c_r(z))
2037///   + (η(z)² - 1)·c_r(z)c_s(z)c_t(z)
2038/// ) · exp(-q(z)) · 1/2π .
2039/// ```
2040///
2041/// This is the boundary value for differentiating an already-third-order
2042/// fixed-domain integral with respect to a moving edge. The sign convention is
2043/// intentionally identical to [`cell_third_derivative_from_moments`]: callers
2044/// must pass the coefficient slices in the convention of the integral they are
2045/// differentiating. In particular, survival/probit paths that integrate the
2046/// jointly negated cell and coefficient slices must evaluate this boundary
2047/// integrand with the same joint negation; evaluating an un-negated boundary for
2048/// a negated fixed-domain integral flips the sign of this odd-order integrand.
2049#[inline]
2050pub fn cell_third_derivative_boundary_integrand(
2051    cell: DenestedCubicCell,
2052    first_coefficients_r: &[f64],
2053    first_coefficients_s: &[f64],
2054    first_coefficients_t: &[f64],
2055    second_coefficients_rs: &[f64],
2056    second_coefficients_rt: &[f64],
2057    second_coefficients_st: &[f64],
2058    third_coefficients_rst: &[f64],
2059    z: f64,
2060) -> f64 {
2061    let eta = cell.eta(z);
2062    let c_r = poly_eval_at(first_coefficients_r, z);
2063    let c_s = poly_eval_at(first_coefficients_s, z);
2064    let c_t = poly_eval_at(first_coefficients_t, z);
2065    let c_rs = poly_eval_at(second_coefficients_rs, z);
2066    let c_rt = poly_eval_at(second_coefficients_rt, z);
2067    let c_st = poly_eval_at(second_coefficients_st, z);
2068    let c_rst = poly_eval_at(third_coefficients_rst, z);
2069    let amplitude =
2070        c_rst - eta * (c_rs * c_t + c_rt * c_s + c_st * c_r) + (eta * eta - 1.0) * c_r * c_s * c_t;
2071    amplitude * (-cell.q(z)).exp() * INV_TWO_PI
2072}
2073
2074/// Pointwise value of the density-weighted integrand `g(z)·exp(-q(z))/2π` at a
2075/// single `z`, for an arbitrary integrand polynomial `g`.
2076///
2077/// This is the boundary value needed for the moving-domain (Leibniz) term of a
2078/// density-normalization integral `∫ g(z)·exp(-q(z))/2π dz` whose cell edge is a
2079/// link-knot crossing `z=(τ-a)/b` that moves with a parameter direction: the
2080/// directional derivative of the integral picks up
2081/// `g(z_R)·w(z_R)·z_R'(dir) - g(z_L)·w(z_L)·z_L'(dir)` on top of the
2082/// fixed-domain part, with `w(z)=exp(-q(z))/2π` the same weight the moment
2083/// reductions integrate. Unlike the Hessian-integral boundary term (which is
2084/// shared by adjacent cells and cancels across each interior knot), the
2085/// ln-density integrand `D_t`/`D_t,uv` carries a non-shared `g`, so this
2086/// Leibniz term does NOT cancel and must be added (gam#932/#979).
2087pub fn cell_density_boundary_integrand(cell: DenestedCubicCell, g: &[f64], z: f64) -> f64 {
2088    poly_eval_at(g, z) * (-cell.q(z)).exp() * INV_TWO_PI
2089}
2090
/// Evaluates `Σ_k coefficients[k]·zᵏ` with Horner's scheme, one fused
/// multiply-add per coefficient, starting from the highest degree.
///
/// An empty coefficient slice evaluates to `0.0`.
#[inline]
fn poly_eval_at(coefficients: &[f64], z: f64) -> f64 {
    coefficients
        .iter()
        .rev()
        .fold(0.0_f64, |partial, &coefficient| partial.mul_add(z, coefficient))
}
2100
2101#[inline]
2102fn moment_dot_with_coefficients(
2103    coefficients: &[f64],
2104    moments: &[f64],
2105    label: &str,
2106) -> Result<f64, String> {
2107    if coefficients.len() > moments.len() {
2108        return Err(CubicCellKernelError::insufficient_moments(format!(
2109            "insufficient reduced moments for {label}: need {}, have {}",
2110            coefficients.len(),
2111            moments.len()
2112        ))
2113        .into());
2114    }
2115    Ok(moment_dot_with_coefficients_unchecked(
2116        coefficients,
2117        moments,
2118    ))
2119}
2120
/// Accumulates `Σ_k coefficients[k]·moments[k]` with one fused multiply-add
/// per term, in increasing `k`.
///
/// "Unchecked" refers to the missing length validation: the caller must
/// guarantee `moments.len() >= coefficients.len()`, otherwise the indexing
/// panics.
#[inline]
fn moment_dot_with_coefficients_unchecked(coefficients: &[f64], moments: &[f64]) -> f64 {
    coefficients
        .iter()
        .enumerate()
        .fold(0.0_f64, |sum, (k, &coefficient)| {
            coefficient.mul_add(moments[k], sum)
        })
}
2129
/// Writes the coefficient convolution (polynomial product) of `lhs` and
/// `rhs` into the front of `out` and returns how many slots were written:
/// `lhs.len() + rhs.len() - 1`, or `0` when either input is empty.
///
/// Slots of `out` past the returned length are left untouched, so callers
/// must read only the returned prefix.
///
/// The multi-derivative reductions use this to collapse `eta · r · s · …`
/// triple and quadruple sums into one moment dot, removing the
/// `O(deg^3)`/`O(deg^4)` inner loops that dominated the
/// `cell_*_derivative_from_moments` hot leaves on large-scale fits.
///
/// # Panics
///
/// Panics when `out` is shorter than the product length.
#[inline]
fn poly_conv_into(lhs: &[f64], rhs: &[f64], out: &mut [f64]) -> usize {
    if lhs.is_empty() || rhs.is_empty() {
        return 0;
    }
    let len = lhs.len() + rhs.len() - 1;
    assert!(out.len() >= len);
    let product = &mut out[..len];
    product.fill(0.0);
    // Row `shift` of the product adds `lhs[shift]·rhs` starting at slot `shift`.
    for (shift, &scale) in lhs.iter().enumerate() {
        let row = &mut product[shift..shift + rhs.len()];
        for (slot, &factor) in row.iter_mut().zip(rhs) {
            *slot = scale.mul_add(factor, *slot);
        }
    }
    len
}
2156
2157#[inline]
2158fn require_moments_degree(
2159    required_degree: usize,
2160    moments: &[f64],
2161    label: &str,
2162) -> Result<(), String> {
2163    if required_degree >= moments.len() {
2164        return Err(CubicCellKernelError::insufficient_moments(format!(
2165            "insufficient reduced moments for {label}: need {}, have {}",
2166            required_degree + 1,
2167            moments.len()
2168        ))
2169        .into());
2170    }
2171    Ok::<(), _>(())
2172}
2173
2174#[inline]
2175fn require_scratch_capacity(
2176    required_len: usize,
2177    capacity: usize,
2178    label: &str,
2179) -> Result<(), String> {
2180    if required_len > capacity {
2181        return Err(CubicCellKernelError::insufficient_moments(format!(
2182            "{label} polynomial convolution scratch too small: need {required_len}, have {capacity}"
2183        ))
2184        .into());
2185    }
2186    Ok::<(), _>(())
2187}
2188
/// Coefficient count of the product of several polynomials whose coefficient
/// counts are `lengths`: the sum of the counts minus one per multiplication.
///
/// Returns `0` when `lengths` is empty or any factor has no coefficients.
#[inline]
fn convolution_chain_len(lengths: &[usize]) -> usize {
    let has_empty_factor = lengths.iter().any(|&count| count == 0);
    match lengths.len() {
        0 => 0,
        _ if has_empty_factor => 0,
        factors => lengths.iter().sum::<usize>() - (factors - 1),
    }
}
2197
/// Polynomial degree (`len - 1`) of a first-derivative coefficient slice.
///
/// `label` names the direction (for example `"r"`) in the error message.
///
/// # Errors
///
/// Returns a message when `coefficients` is empty.
#[inline]
fn first_coefficients_degree(label: &str, coefficients: &[f64]) -> Result<usize, String> {
    match coefficients.len() {
        0 => Err(format!("{label} first-derivative coefficients must be non-empty")),
        count => Ok(count - 1),
    }
}
2205
2206#[inline]
2207pub fn cell_third_derivative_from_moments(
2208    cell: DenestedCubicCell,
2209    first_coefficients_r: &[f64],
2210    first_coefficients_s: &[f64],
2211    first_coefficients_t: &[f64],
2212    second_coefficients_rs: &[f64],
2213    second_coefficients_rt: &[f64],
2214    second_coefficients_st: &[f64],
2215    third_coefficients_rst: &[f64],
2216    moments: &[f64],
2217) -> Result<f64, String> {
2218    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2219    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2220    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2221    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2222    let second_sum_degree = [
2223        second_coefficients_rs.len() + first_coefficients_t.len(),
2224        second_coefficients_rt.len() + first_coefficients_s.len(),
2225        second_coefficients_st.len() + first_coefficients_r.len(),
2226    ]
2227    .into_iter()
2228    .max()
2229    .unwrap_or(0)
2230    .saturating_sub(1);
2231    let triple_product_degree = r_degree + s_degree + t_degree;
2232    let needed = (third_coefficients_rst.len().saturating_sub(1))
2233        .max(3 + second_sum_degree)
2234        .max(6 + triple_product_degree);
2235    require_moments_degree(needed, moments, "third derivative")?;
2236
2237    let third_term = moment_dot_with_coefficients_unchecked(third_coefficients_rst, moments);
2238
2239    // This is a deliberately serial leaf kernel: each call performs only a
2240    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2241    // at the surrounding row/cell batch level rather than inside this hot path.
2242    const SCRATCH: usize = 32;
2243    let max_linear_conv_len = [
2244        convolution_chain_len(&[
2245            eta.len(),
2246            second_coefficients_rs.len(),
2247            first_coefficients_t.len(),
2248        ]),
2249        convolution_chain_len(&[
2250            eta.len(),
2251            second_coefficients_rt.len(),
2252            first_coefficients_s.len(),
2253        ]),
2254        convolution_chain_len(&[
2255            eta.len(),
2256            second_coefficients_st.len(),
2257            first_coefficients_r.len(),
2258        ]),
2259    ]
2260    .into_iter()
2261    .max()
2262    .unwrap_or(0);
2263    let max_cubic_conv_len = convolution_chain_len(&[
2264        7,
2265        first_coefficients_r.len(),
2266        first_coefficients_s.len(),
2267        first_coefficients_t.len(),
2268    ]);
2269    require_scratch_capacity(
2270        max_linear_conv_len.max(max_cubic_conv_len),
2271        SCRATCH,
2272        "third derivative",
2273    )?;
2274    let mut buf_a = [0.0_f64; SCRATCH];
2275    let mut buf_b = [0.0_f64; SCRATCH];
2276
2277    // eta_second_term = Σ over (rs⊗t, rt⊗s, st⊗r) of eta⊗product · moments.
2278    // Fold each of the three triple sums into a single moment dot.
2279    let mut eta_second_term = 0.0;
2280    let conv_dot = |first: &[f64],
2281                    second: &[f64],
2282                    buf_a: &mut [f64; SCRATCH],
2283                    buf_b: &mut [f64; SCRATCH]|
2284     -> f64 {
2285        let m = poly_conv_into(first, second, buf_a);
2286        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2287        let mut acc = 0.0;
2288        for k in 0..n {
2289            acc = buf_b[k].mul_add(moments[k], acc);
2290        }
2291        acc
2292    };
2293    eta_second_term += conv_dot(
2294        second_coefficients_rs,
2295        first_coefficients_t,
2296        &mut buf_a,
2297        &mut buf_b,
2298    );
2299    eta_second_term += conv_dot(
2300        second_coefficients_rt,
2301        first_coefficients_s,
2302        &mut buf_a,
2303        &mut buf_b,
2304    );
2305    eta_second_term += conv_dot(
2306        second_coefficients_st,
2307        first_coefficients_r,
2308        &mut buf_a,
2309        &mut buf_b,
2310    );
2311
2312    // cubic_coeff_term = Σ_{e,i,j,k} (eta·eta − 1)[e] · r[i] · s[j] · t[k] · moments[e+i+j+k].
2313    // Convolve r⊗s, then ⊗t, then ⊗(eta·eta − 1), giving a single dot.
2314    let mut eta_sq_minus_one = [0.0_f64; 7];
2315    for (i, &eta_i) in eta.iter().enumerate() {
2316        for (j, &eta_j) in eta.iter().enumerate() {
2317            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2318        }
2319    }
2320    eta_sq_minus_one[0] -= 1.0;
2321
2322    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2323    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2324    // buf_a now reused for (eta_sq_minus_one ⊗ rst).
2325    let final_len = poly_conv_into(&eta_sq_minus_one, &buf_b[..rst_len], &mut buf_a);
2326    let mut cubic_coeff_term = 0.0;
2327    for k in 0..final_len {
2328        cubic_coeff_term = buf_a[k].mul_add(moments[k], cubic_coeff_term);
2329    }
2330
2331    Ok((third_term - eta_second_term + cubic_coeff_term) * INV_TWO_PI)
2332}
2333
2334#[inline]
2335pub fn cell_fourth_derivative_from_moments(
2336    cell: DenestedCubicCell,
2337    first_coefficients_r: &[f64],
2338    first_coefficients_s: &[f64],
2339    first_coefficients_t: &[f64],
2340    first_coefficients_u: &[f64],
2341    second_coefficients_rs: &[f64],
2342    second_coefficients_rt: &[f64],
2343    second_coefficients_ru: &[f64],
2344    second_coefficients_st: &[f64],
2345    second_coefficients_su: &[f64],
2346    second_coefficients_tu: &[f64],
2347    third_coefficients_rst: &[f64],
2348    third_coefficients_rsu: &[f64],
2349    third_coefficients_rtu: &[f64],
2350    third_coefficients_stu: &[f64],
2351    fourth_coefficients_rstu: &[f64],
2352    moments: &[f64],
2353) -> Result<f64, String> {
2354    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2355    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2356    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2357    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2358    let u_degree = first_coefficients_degree("u", first_coefficients_u)?;
2359    let linear_sum_degree = [
2360        third_coefficients_rst.len() + first_coefficients_u.len(),
2361        third_coefficients_rsu.len() + first_coefficients_t.len(),
2362        third_coefficients_rtu.len() + first_coefficients_s.len(),
2363        third_coefficients_stu.len() + first_coefficients_r.len(),
2364        second_coefficients_rs.len() + second_coefficients_tu.len(),
2365        second_coefficients_rt.len() + second_coefficients_su.len(),
2366        second_coefficients_ru.len() + second_coefficients_st.len(),
2367    ]
2368    .into_iter()
2369    .max()
2370    .unwrap_or(0)
2371    .saturating_sub(1);
2372    let quad_sum_degree = [
2373        second_coefficients_rs.len() + first_coefficients_t.len() + first_coefficients_u.len(),
2374        second_coefficients_rt.len() + first_coefficients_s.len() + first_coefficients_u.len(),
2375        second_coefficients_ru.len() + first_coefficients_s.len() + first_coefficients_t.len(),
2376        second_coefficients_st.len() + first_coefficients_r.len() + first_coefficients_u.len(),
2377        second_coefficients_su.len() + first_coefficients_r.len() + first_coefficients_t.len(),
2378        second_coefficients_tu.len() + first_coefficients_r.len() + first_coefficients_s.len(),
2379    ]
2380    .into_iter()
2381    .max()
2382    .unwrap_or(0)
2383    .saturating_sub(2);
2384    let quartic_product_degree = r_degree + s_degree + t_degree + u_degree;
2385    let needed = (fourth_coefficients_rstu.len().saturating_sub(1))
2386        .max(3 + linear_sum_degree)
2387        .max(6 + quad_sum_degree)
2388        .max(9 + quartic_product_degree);
2389    require_moments_degree(needed, moments, "fourth derivative")?;
2390
2391    let fourth_term = moment_dot_with_coefficients_unchecked(fourth_coefficients_rstu, moments);
2392
2393    // This is a deliberately serial leaf kernel: each call performs only a
2394    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2395    // at the surrounding row/cell batch level rather than inside this hot path.
2396    const SCRATCH: usize = 32;
2397    let max_linear_conv_len = [
2398        convolution_chain_len(&[
2399            eta.len(),
2400            third_coefficients_rst.len(),
2401            first_coefficients_u.len(),
2402        ]),
2403        convolution_chain_len(&[
2404            eta.len(),
2405            third_coefficients_rsu.len(),
2406            first_coefficients_t.len(),
2407        ]),
2408        convolution_chain_len(&[
2409            eta.len(),
2410            third_coefficients_rtu.len(),
2411            first_coefficients_s.len(),
2412        ]),
2413        convolution_chain_len(&[
2414            eta.len(),
2415            third_coefficients_stu.len(),
2416            first_coefficients_r.len(),
2417        ]),
2418        convolution_chain_len(&[
2419            eta.len(),
2420            second_coefficients_rs.len(),
2421            second_coefficients_tu.len(),
2422        ]),
2423        convolution_chain_len(&[
2424            eta.len(),
2425            second_coefficients_rt.len(),
2426            second_coefficients_su.len(),
2427        ]),
2428        convolution_chain_len(&[
2429            eta.len(),
2430            second_coefficients_ru.len(),
2431            second_coefficients_st.len(),
2432        ]),
2433    ]
2434    .into_iter()
2435    .max()
2436    .unwrap_or(0);
2437    let max_quad_conv_len = [
2438        convolution_chain_len(&[
2439            7,
2440            second_coefficients_rs.len(),
2441            first_coefficients_t.len(),
2442            first_coefficients_u.len(),
2443        ]),
2444        convolution_chain_len(&[
2445            7,
2446            second_coefficients_rt.len(),
2447            first_coefficients_s.len(),
2448            first_coefficients_u.len(),
2449        ]),
2450        convolution_chain_len(&[
2451            7,
2452            second_coefficients_ru.len(),
2453            first_coefficients_s.len(),
2454            first_coefficients_t.len(),
2455        ]),
2456        convolution_chain_len(&[
2457            7,
2458            second_coefficients_st.len(),
2459            first_coefficients_r.len(),
2460            first_coefficients_u.len(),
2461        ]),
2462        convolution_chain_len(&[
2463            7,
2464            second_coefficients_su.len(),
2465            first_coefficients_r.len(),
2466            first_coefficients_t.len(),
2467        ]),
2468        convolution_chain_len(&[
2469            7,
2470            second_coefficients_tu.len(),
2471            first_coefficients_r.len(),
2472            first_coefficients_s.len(),
2473        ]),
2474    ]
2475    .into_iter()
2476    .max()
2477    .unwrap_or(0);
2478    let max_quartic_conv_len = convolution_chain_len(&[
2479        10,
2480        first_coefficients_r.len(),
2481        first_coefficients_s.len(),
2482        first_coefficients_t.len(),
2483        first_coefficients_u.len(),
2484    ]);
2485    require_scratch_capacity(
2486        max_linear_conv_len
2487            .max(max_quad_conv_len)
2488            .max(max_quartic_conv_len),
2489        SCRATCH,
2490        "fourth derivative",
2491    )?;
2492    let mut buf_a = [0.0_f64; SCRATCH];
2493    let mut buf_b = [0.0_f64; SCRATCH];
2494
2495    // eta_linear_term = Σ over seven (rst⊗u, rsu⊗t, rtu⊗s, stu⊗r, rs⊗tu,
2496    // rt⊗su, ru⊗st) of eta⊗product · moments. Fold each triple sum into
2497    // a single moment dot.
2498    let conv_eta_dot = |first: &[f64],
2499                        second: &[f64],
2500                        buf_a: &mut [f64; SCRATCH],
2501                        buf_b: &mut [f64; SCRATCH]|
2502     -> f64 {
2503        let m = poly_conv_into(first, second, buf_a);
2504        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2505        let mut acc = 0.0;
2506        for k in 0..n {
2507            acc = buf_b[k].mul_add(moments[k], acc);
2508        }
2509        acc
2510    };
2511    let mut eta_linear_term = 0.0;
2512    eta_linear_term += conv_eta_dot(
2513        third_coefficients_rst,
2514        first_coefficients_u,
2515        &mut buf_a,
2516        &mut buf_b,
2517    );
2518    eta_linear_term += conv_eta_dot(
2519        third_coefficients_rsu,
2520        first_coefficients_t,
2521        &mut buf_a,
2522        &mut buf_b,
2523    );
2524    eta_linear_term += conv_eta_dot(
2525        third_coefficients_rtu,
2526        first_coefficients_s,
2527        &mut buf_a,
2528        &mut buf_b,
2529    );
2530    eta_linear_term += conv_eta_dot(
2531        third_coefficients_stu,
2532        first_coefficients_r,
2533        &mut buf_a,
2534        &mut buf_b,
2535    );
2536    eta_linear_term += conv_eta_dot(
2537        second_coefficients_rs,
2538        second_coefficients_tu,
2539        &mut buf_a,
2540        &mut buf_b,
2541    );
2542    eta_linear_term += conv_eta_dot(
2543        second_coefficients_rt,
2544        second_coefficients_su,
2545        &mut buf_a,
2546        &mut buf_b,
2547    );
2548    eta_linear_term += conv_eta_dot(
2549        second_coefficients_ru,
2550        second_coefficients_st,
2551        &mut buf_a,
2552        &mut buf_b,
2553    );
2554
2555    let mut eta_sq_minus_one = [0.0_f64; 7];
2556    for (i, &eta_i) in eta.iter().enumerate() {
2557        for (j, &eta_j) in eta.iter().enumerate() {
2558            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2559        }
2560    }
2561    eta_sq_minus_one[0] -= 1.0;
2562
2563    // quad_coeff_term: six (eta²−1)⊗A⊗B⊗C · moments sums, where the (A,B,C)
2564    // factors are: (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s).
2565    let mut buf_c = [0.0_f64; SCRATCH];
2566    let conv_weighted_triple_dot = |weight: &[f64],
2567                                    a: &[f64],
2568                                    b: &[f64],
2569                                    c: &[f64],
2570                                    buf_a: &mut [f64; SCRATCH],
2571                                    buf_b: &mut [f64; SCRATCH],
2572                                    buf_c: &mut [f64; SCRATCH]|
2573     -> f64 {
2574        let ab_len = poly_conv_into(a, b, buf_a);
2575        let abc_len = poly_conv_into(&buf_a[..ab_len], c, buf_b);
2576        let final_len = poly_conv_into(weight, &buf_b[..abc_len], buf_c);
2577        let mut acc = 0.0;
2578        for k in 0..final_len {
2579            acc = buf_c[k].mul_add(moments[k], acc);
2580        }
2581        acc
2582    };
2583    let mut quad_coeff_term = 0.0;
2584    quad_coeff_term += conv_weighted_triple_dot(
2585        &eta_sq_minus_one,
2586        second_coefficients_rs,
2587        first_coefficients_t,
2588        first_coefficients_u,
2589        &mut buf_a,
2590        &mut buf_b,
2591        &mut buf_c,
2592    );
2593    quad_coeff_term += conv_weighted_triple_dot(
2594        &eta_sq_minus_one,
2595        second_coefficients_rt,
2596        first_coefficients_s,
2597        first_coefficients_u,
2598        &mut buf_a,
2599        &mut buf_b,
2600        &mut buf_c,
2601    );
2602    quad_coeff_term += conv_weighted_triple_dot(
2603        &eta_sq_minus_one,
2604        second_coefficients_ru,
2605        first_coefficients_s,
2606        first_coefficients_t,
2607        &mut buf_a,
2608        &mut buf_b,
2609        &mut buf_c,
2610    );
2611    quad_coeff_term += conv_weighted_triple_dot(
2612        &eta_sq_minus_one,
2613        second_coefficients_st,
2614        first_coefficients_r,
2615        first_coefficients_u,
2616        &mut buf_a,
2617        &mut buf_b,
2618        &mut buf_c,
2619    );
2620    quad_coeff_term += conv_weighted_triple_dot(
2621        &eta_sq_minus_one,
2622        second_coefficients_su,
2623        first_coefficients_r,
2624        first_coefficients_t,
2625        &mut buf_a,
2626        &mut buf_b,
2627        &mut buf_c,
2628    );
2629    quad_coeff_term += conv_weighted_triple_dot(
2630        &eta_sq_minus_one,
2631        second_coefficients_tu,
2632        first_coefficients_r,
2633        first_coefficients_s,
2634        &mut buf_a,
2635        &mut buf_b,
2636        &mut buf_c,
2637    );
2638
2639    // cubic_weight = 3·eta − eta³ (same as the prior expansion: eta_sq*eta
2640    // negated, plus the 3·eta linear correction).
2641    let mut eta_sq = [0.0_f64; 7];
2642    for (i, &eta_i) in eta.iter().enumerate() {
2643        for (j, &eta_j) in eta.iter().enumerate() {
2644            eta_sq[i + j] = eta_i.mul_add(eta_j, eta_sq[i + j]);
2645        }
2646    }
2647    let mut cubic_weight = [0.0_f64; 10];
2648    for (i, &eta_sq_i) in eta_sq.iter().enumerate() {
2649        for (j, &eta_j) in eta.iter().enumerate() {
2650            cubic_weight[i + j] = (-eta_sq_i).mul_add(eta_j, cubic_weight[i + j]);
2651        }
2652    }
2653    for (idx, &eta_coeff) in eta.iter().enumerate() {
2654        cubic_weight[idx] += 3.0 * eta_coeff;
2655    }
2656
2657    // quartic_coeff_term: cubic_weight ⊗ r ⊗ s ⊗ t ⊗ u · moments. The
2658    // original quintuple loop did 10·4·4·4·4 = 2560 mul-adds per call;
2659    // four sequential convolutions plus one moment dot drop this to
2660    // ~16+28+40+52+16 ≈ 152 mul-adds.
2661    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2662    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2663    let rstu_len = poly_conv_into(&buf_b[..rst_len], first_coefficients_u, &mut buf_a);
2664    let final_len = poly_conv_into(&cubic_weight, &buf_a[..rstu_len], &mut buf_b);
2665    let mut quartic_coeff_term = 0.0;
2666    for k in 0..final_len {
2667        quartic_coeff_term = buf_b[k].mul_add(moments[k], quartic_coeff_term);
2668    }
2669
2670    Ok((fourth_term - eta_linear_term + quad_coeff_term + quartic_coeff_term) * INV_TWO_PI)
2671}
2672
2673#[inline]
2674pub fn global_cubic_from_local(span: LocalSpanCubic) -> (f64, f64, f64, f64) {
2675    let left = span.left;
2676    let q0 = span.c0 - span.c1 * left + span.c2 * left * left - span.c3 * left * left * left;
2677    let q1 = span.c1 - 2.0 * span.c2 * left + 3.0 * span.c3 * left * left;
2678    let q2 = span.c2 - 3.0 * span.c3 * left;
2679    let q3 = span.c3;
2680    (q0, q1, q2, q3)
2681}
2682
2683/// Return the cubic polynomial coefficients (in `z`) of
2684/// `f(z) = link_span.evaluate(a + b*z)`.
2685///
2686/// `link_span.evaluate` is a cubic in its argument, so `f(z)` is also a cubic
2687/// in `z` and can be written exactly as
2688///
2689/// ```text
2690///     f(z) = d0 + d1·z + d2·z² + d3·z³
2691/// ```
2692///
2693/// where `(d0, d1, d2, d3)` are the values returned by this function. These
2694/// are **polynomial coefficients**, *not* derivatives of `f` at `z = 0`. The
2695/// relationship to Taylor derivatives is
2696///
2697/// ```text
2698///     d_k = f^(k)(0) / k!
2699/// ```
2700///
2701/// so `d0 = f(0)`, `d1 = f'(0)`, `d2 = ½·f''(0)`, `d3 = ⅙·f'''(0)`. Callers
2702/// such as [`denested_cell_coefficients`] and [`link_basis_cell_coefficients`]
2703/// rely on the polynomial-coefficient convention, since they propagate the
2704/// values directly as the `(c0, c1, c2, c3)` slots of a downstream polynomial
2705/// in `z`.
2706#[inline]
2707pub fn transformed_link_cubic(link_span: LocalSpanCubic, a: f64, b: f64) -> (f64, f64, f64, f64) {
2708    let shift = a - link_span.left;
2709    let d0 = link_span.c0
2710        + link_span.c1 * shift
2711        + link_span.c2 * shift * shift
2712        + link_span.c3 * shift * shift * shift;
2713    let d1 = b * (link_span.c1 + 2.0 * link_span.c2 * shift + 3.0 * link_span.c3 * shift * shift);
2714    let d2 = b * b * (link_span.c2 + 3.0 * link_span.c3 * shift);
2715    let d3 = link_span.c3 * b * b * b;
2716    (d0, d1, d2, d3)
2717}
2718
2719#[inline]
2720pub fn denested_cell_coefficients(
2721    score_span: LocalSpanCubic,
2722    link_span: LocalSpanCubic,
2723    a: f64,
2724    b: f64,
2725) -> [f64; 4] {
2726    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2727    let (d0, d1, d2, d3) = transformed_link_cubic(link_span, a, b);
2728    [a + b * h0 + d0, b + b * h1 + d1, b * h2 + d2, b * h3 + d3]
2729}
2730
2731#[inline]
2732pub fn denested_cell_coefficient_partials(
2733    score_span: LocalSpanCubic,
2734    link_span: LocalSpanCubic,
2735    a: f64,
2736    b: f64,
2737) -> ([f64; 4], [f64; 4]) {
2738    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2739    let shift = a - link_span.left;
2740    let alpha1 = link_span.c1;
2741    let alpha2 = link_span.c2;
2742    let alpha3 = link_span.c3;
2743    let dc_da = [
2744        1.0 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2745        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2746        3.0 * alpha3 * b * b,
2747        0.0,
2748    ];
2749    let dc_db = [
2750        h0,
2751        1.0 + h1 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2752        h2 + 2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2753        h3 + 3.0 * alpha3 * b * b,
2754    ];
2755    (dc_da, dc_db)
2756}
2757
2758#[inline]
2759fn link_cubic_second_partials(
2760    link_span: LocalSpanCubic,
2761    a: f64,
2762    b: f64,
2763) -> ([f64; 4], [f64; 4], [f64; 4]) {
2764    let shift = a - link_span.left;
2765    let alpha2 = link_span.c2;
2766    let alpha3 = link_span.c3;
2767    let dc_daa = [
2768        2.0 * alpha2 + 6.0 * alpha3 * shift,
2769        6.0 * alpha3 * b,
2770        0.0,
2771        0.0,
2772    ];
2773    let dc_dab = [
2774        0.0,
2775        2.0 * alpha2 + 6.0 * alpha3 * shift,
2776        6.0 * alpha3 * b,
2777        0.0,
2778    ];
2779    let dc_dbb = [
2780        0.0,
2781        0.0,
2782        2.0 * (alpha2 + 3.0 * alpha3 * shift),
2783        6.0 * alpha3 * b,
2784    ];
2785    (dc_daa, dc_dab, dc_dbb)
2786}
2787
2788#[inline]
2789pub fn denested_cell_second_partials(
2790    score_span: LocalSpanCubic,
2791    link_span: LocalSpanCubic,
2792    a: f64,
2793    b: f64,
2794) -> ([f64; 4], [f64; 4], [f64; 4]) {
2795    let score_left = score_span.left;
2796    if !score_left.is_finite() {
2797        return ([f64::NAN; 4], [f64::NAN; 4], [f64::NAN; 4]);
2798    }
2799    link_cubic_second_partials(link_span, a, b)
2800}
2801
2802#[inline]
2803fn link_cubic_third_partials(
2804    link_span: LocalSpanCubic,
2805) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2806    let alpha3 = link_span.c3;
2807    (
2808        [6.0 * alpha3, 0.0, 0.0, 0.0],
2809        [0.0, 6.0 * alpha3, 0.0, 0.0],
2810        [0.0, 0.0, 6.0 * alpha3, 0.0],
2811        [0.0, 0.0, 0.0, 6.0 * alpha3],
2812    )
2813}
2814
/// Third-order partials of the denested cell coefficients.
///
/// Forwards to `link_cubic_third_partials`; only the link span is needed, so
/// no score span or `(a, b)` argument is taken.
#[inline]
pub fn denested_cell_third_partials(
    link_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_span)
}
2821
2822#[inline]
2823pub fn score_basis_cell_coefficients(score_basis_span: LocalSpanCubic, b: f64) -> [f64; 4] {
2824    let (h0, h1, h2, h3) = global_cubic_from_local(score_basis_span);
2825    [b * h0, b * h1, b * h2, b * h3]
2826}
2827
2828#[inline]
2829pub fn link_basis_cell_coefficients(link_basis_span: LocalSpanCubic, a: f64, b: f64) -> [f64; 4] {
2830    let (d0, d1, d2, d3) = transformed_link_cubic(link_basis_span, a, b);
2831    [d0, d1, d2, d3]
2832}
2833
2834#[inline]
2835pub fn link_basis_cell_coefficient_partials(
2836    link_basis_span: LocalSpanCubic,
2837    a: f64,
2838    b: f64,
2839) -> ([f64; 4], [f64; 4]) {
2840    let shift = a - link_basis_span.left;
2841    let alpha1 = link_basis_span.c1;
2842    let alpha2 = link_basis_span.c2;
2843    let alpha3 = link_basis_span.c3;
2844    let dc_da = [
2845        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2846        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2847        3.0 * alpha3 * b * b,
2848        0.0,
2849    ];
2850    let dc_db = [
2851        0.0,
2852        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2853        2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2854        3.0 * alpha3 * b * b,
2855    ];
2856    (dc_da, dc_db)
2857}
2858
/// Second-order partials of the link-basis cell coefficients.
///
/// Forwards unchanged to `link_cubic_second_partials`, returning its three
/// arrays as-is.
#[inline]
pub fn link_basis_cell_second_partials(
    link_basis_span: LocalSpanCubic,
    a: f64,
    b: f64,
) -> ([f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_second_partials(link_basis_span, a, b)
}
2867
/// Third-order partials of the link-basis cell coefficients.
///
/// Forwards unchanged to `link_cubic_third_partials`, returning its four
/// arrays as-is.
#[inline]
pub fn link_basis_cell_third_partials(
    link_basis_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_basis_span)
}
2874
/// Build the denested partition cells for the affine map `z ↦ a + b·z`.
///
/// Thin forwarding wrapper: every argument is passed unchanged to
/// [`build_denested_partition_cells_with_tails`], and its result (the cell
/// list or the error string) is returned as-is.
pub fn build_denested_partition_cells<FS, FL>(
    a: f64,
    b: f64,
    score_breaks: &[f64],
    link_breaks: &[f64],
    score_span_at: FS,
    link_span_at: FL,
) -> Result<Vec<DenestedPartitionCell>, String>
where
    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
{
    build_denested_partition_cells_with_tails(
        a,
        b,
        score_breaks,
        link_breaks,
        score_span_at,
        link_span_at,
    )
}
2896
2897/// Build a partition covering `(-∞, +∞)` with parameter-independent outer
2898/// bounds.  Interior cells use the same finite-cell polynomial algebra.
2899/// The two tail cells are guaranteed affine (c2=c3=0) because both
2900/// deviations saturate to constants outside their knot support.
2901///
2902/// The tail cells' score/link spans come from the same closures evaluated
2903/// at a representative point in the tail region — the closures must return
2904/// constant (c1=c2=c3=0) cubics for points outside support.
2905pub fn build_denested_partition_cells_with_tails<FS, FL>(
2906    a: f64,
2907    b: f64,
2908    score_breaks: &[f64],
2909    link_breaks: &[f64],
2910    mut score_span_at: FS,
2911    mut link_span_at: FL,
2912) -> Result<Vec<DenestedPartitionCell>, String>
2913where
2914    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
2915    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
2916{
2917    // Collect all INTERNAL split points (finite), each tagged with its
2918    // provenance: a fixed score break or a link-knot crossing. Provenance
2919    // identifies the cell's `(a, b)` family for the Chebyshev moment-family
2920    // layer; the z coordinates alone cannot distinguish the two kinds.
2921    let mut split_points: Vec<(f64, PartitionEdge)> = score_breaks
2922        .iter()
2923        .map(|&sigma| (sigma, PartitionEdge::Fixed(sigma)))
2924        .collect();
2925    if b.abs() > 1e-12 {
2926        for &tau in link_breaks {
2927            let z = (tau - a) / b;
2928            if z.is_finite() {
2929                split_points.push((z, PartitionEdge::Crossing { tau }));
2930            }
2931        }
2932    }
2933    dedup_sorted_tagged_breakpoints(&mut split_points);
2934
2935    let mut out = Vec::new();
2936
2937    if split_points.is_empty() {
2938        let score_span = score_span_at(0.0)?;
2939        let link_span = link_span_at(a)?;
2940        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2941        return Ok(vec![DenestedPartitionCell {
2942            cell: DenestedCubicCell {
2943                left: f64::NEG_INFINITY,
2944                right: f64::INFINITY,
2945                c0: coeffs[0],
2946                c1: coeffs[1],
2947                c2: 0.0,
2948                c3: 0.0,
2949            },
2950            score_span,
2951            link_span,
2952            left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2953            right_edge: PartitionEdge::Fixed(f64::INFINITY),
2954        }]);
2955    }
2956
2957    // ── Left tail cell: (-∞, leftmost_split] ──
2958    let (leftmost, leftmost_edge) = split_points[0];
2959    // Evaluate spans at a point just left of the leftmost split.  The
2960    // closures return constant tail cubics for this region.
2961    let left_probe = interval_probe_point(f64::NEG_INFINITY, leftmost)?;
2962    let left_score_span = score_span_at(left_probe)?;
2963    let left_link_span = link_span_at(a + b * left_probe)?;
2964    let left_coeffs = denested_cell_coefficients(left_score_span, left_link_span, a, b);
2965    if left_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2966        || left_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2967    {
2968        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2969            "left tail cell must be affine (deviations constant outside support), \
2970             got c2={:.3e}, c3={:.3e}",
2971            left_coeffs[2], left_coeffs[3]
2972        ))
2973        .into());
2974    }
2975    out.push(DenestedPartitionCell {
2976        cell: DenestedCubicCell {
2977            left: f64::NEG_INFINITY,
2978            right: leftmost,
2979            c0: left_coeffs[0],
2980            c1: left_coeffs[1],
2981            c2: 0.0,
2982            c3: 0.0,
2983        },
2984        score_span: left_score_span,
2985        link_span: left_link_span,
2986        left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2987        right_edge: leftmost_edge,
2988    });
2989
2990    // ── Interior cells (all finite) ──
2991    for window in split_points.windows(2) {
2992        let (left, left_edge) = window[0];
2993        let (right, right_edge) = window[1];
2994        if !left.is_finite() || !right.is_finite() || right - left <= 1e-12 {
2995            continue;
2996        }
2997        let mid = interval_probe_point(left, right)?;
2998        let score_span = score_span_at(mid)?;
2999        let link_span = link_span_at(a + b * mid)?;
3000        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
3001        out.push(DenestedPartitionCell {
3002            cell: DenestedCubicCell {
3003                left,
3004                right,
3005                c0: coeffs[0],
3006                c1: coeffs[1],
3007                c2: coeffs[2],
3008                c3: coeffs[3],
3009            },
3010            score_span,
3011            link_span,
3012            left_edge,
3013            right_edge,
3014        });
3015    }
3016
3017    // ── Right tail cell: [rightmost_split, +∞) ──
3018    let (rightmost, rightmost_edge) = *split_points.last().unwrap();
3019    let right_probe = interval_probe_point(rightmost, f64::INFINITY)?;
3020    let right_score_span = score_span_at(right_probe)?;
3021    let right_link_span = link_span_at(a + b * right_probe)?;
3022    let right_coeffs = denested_cell_coefficients(right_score_span, right_link_span, a, b);
3023    if right_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
3024        || right_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
3025    {
3026        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3027            "right tail cell must be affine (deviations constant outside support), \
3028             got c2={:.3e}, c3={:.3e}",
3029            right_coeffs[2], right_coeffs[3]
3030        ))
3031        .into());
3032    }
3033    out.push(DenestedPartitionCell {
3034        cell: DenestedCubicCell {
3035            left: rightmost,
3036            right: f64::INFINITY,
3037            c0: right_coeffs[0],
3038            c1: right_coeffs[1],
3039            c2: 0.0,
3040            c3: 0.0,
3041        },
3042        score_span: right_score_span,
3043        link_span: right_link_span,
3044        left_edge: rightmost_edge,
3045        right_edge: PartitionEdge::Fixed(f64::INFINITY),
3046    });
3047
3048    Ok(out)
3049}
3050
3051#[inline]
3052pub fn normalized_non_affine_coefficients(
3053    left: f64,
3054    right: f64,
3055    c0: f64,
3056    c1: f64,
3057    c2: f64,
3058    c3: f64,
3059) -> Result<(f64, f64), String> {
3060    let width = right - left;
3061    if !width.is_finite() || width <= 0.0 {
3062        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3063            "normalized cubic coefficients require a positive finite cell width, got left={left}, right={right}"
3064        ))
3065        .into());
3066    }
3067    let anchor_scale = c0.abs() + c1.abs();
3068    if !anchor_scale.is_finite() {
3069        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3070            "normalized cubic coefficients require finite affine coefficients, got c0={c0}, c1={c1}"
3071        ))
3072        .into());
3073    }
3074    let mid = 0.5 * (left + right);
3075    let half = 0.5 * width;
3076    let k2 = half * half * (c2 + 3.0 * c3 * mid);
3077    let k3 = c3 * half * half * half;
3078    Ok((k2, k3))
3079}
3080
3081#[inline]
3082pub fn branch_cell(cell: DenestedCubicCell) -> Result<ExactCellBranch, String> {
3083    let tol = effective_branch_tol(cell);
3084    if !cell.left.is_finite() || !cell.right.is_finite() {
3085        if cell.c2.abs() <= tol && cell.c3.abs() <= tol {
3086            return Ok(ExactCellBranch::Affine);
3087        }
3088        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3089            "non-affine cells require finite bounds, got [{}, {}] with c2={:.6e}, c3={:.6e}",
3090            cell.left, cell.right, cell.c2, cell.c3
3091        ))
3092        .into());
3093    }
3094    let (k2, k3) = normalized_non_affine_coefficients(
3095        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3096    )?;
3097    if k2.abs() <= tol && k3.abs() <= tol {
3098        Ok(ExactCellBranch::Affine)
3099    } else if k3.abs() <= tol {
3100        Ok(ExactCellBranch::Quartic)
3101    } else {
3102        Ok(ExactCellBranch::Sextic)
3103    }
3104}
3105
3106#[inline]
3107fn degenerate_sextic_branch(
3108    cell: DenestedCubicCell,
3109    lead: f64,
3110) -> Result<Option<ExactCellBranch>, String> {
3111    // The sextic recurrence divides by `lead = 3*c3^2`. When that division is
3112    // unstable, lower the polynomial degree without discarding a material
3113    // quadratic coefficient.
3114    let (normalized_k2, normalized_k3) = normalized_non_affine_coefficients(
3115        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3116    )?;
3117    if normalized_k3.abs() > NORMALIZED_CELL_BRANCH_TOL && lead.abs() > 1e-18 {
3118        return Ok(None);
3119    }
3120    if normalized_k2.abs() > NORMALIZED_CELL_BRANCH_TOL {
3121        Ok(Some(ExactCellBranch::Quartic))
3122    } else {
3123        Ok(Some(ExactCellBranch::Affine))
3124    }
3125}
3126
3127#[inline]
3128fn validate_bvn_args(h: f64, k: f64, rho: f64) -> Result<(), String> {
3129    if !h.is_finite() && !h.is_infinite() {
3130        return Err(CubicCellKernelError::bivariate_normal_domain(
3131            "bivariate normal cdf requires finite or infinite h",
3132        )
3133        .into());
3134    }
3135    if !k.is_finite() && !k.is_infinite() {
3136        return Err(CubicCellKernelError::bivariate_normal_domain(
3137            "bivariate normal cdf requires finite or infinite k",
3138        )
3139        .into());
3140    }
3141    if !rho.is_finite() {
3142        return Err(CubicCellKernelError::bivariate_normal_domain(format!(
3143            "bivariate normal cdf requires finite correlation, got {rho}"
3144        ))
3145        .into());
3146    }
3147    Ok::<(), _>(())
3148}
3149
3150#[inline]
3151fn bvn_gl_sum(h: f64, k: f64, rho_clamped: f64, asr: f64) -> f64 {
3152    // The Drezner-Wesolowsky arcsin representation is integrated with the
3153    // same 20-point Gauss-Legendre rule as before, but mirrored node pairs are
3154    // evaluated with one sin_cos for the half-angle offset rather than two
3155    // independent sin calls.  This preserves the quadrature rule (and hence
3156    // the accuracy envelope) while reducing the transcendental work in the
3157    // dominant finite-bound path from 20 sin calls to 11 sin/cos evaluations.
3158    if rho_clamped == 0.0 {
3159        return 0.0;
3160    }
3161    let hs = 0.5 * (h * h + k * k);
3162    let hk = h * k;
3163    let half_asr = 0.5 * asr;
3164    let (sin_mid, cos_mid) = half_asr.sin_cos();
3165    let mut sum = 0.0;
3166    for i in 0..10 {
3167        let node = GL20_NODES[i].abs();
3168        let weight = GL20_WEIGHTS[i];
3169        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3170
3171        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3172        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3173        let expo_lo = ((sn_lo * hk) - hs) / one_minus_lo;
3174
3175        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3176        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3177        let expo_hi = ((sn_hi * hk) - hs) / one_minus_hi;
3178
3179        sum += weight * (expo_lo.exp() + expo_hi.exp());
3180    }
3181    sum
3182}
3183
3184pub fn bivariate_normal_cdf(h: f64, k: f64, rho: f64) -> Result<f64, String> {
3185    validate_bvn_args(h, k, rho)?;
3186    if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
3187        return Ok(0.0);
3188    }
3189    if h == f64::INFINITY {
3190        return Ok(normal_cdf(k));
3191    }
3192    if k == f64::INFINITY {
3193        return Ok(normal_cdf(h));
3194    }
3195
3196    let rho_clamped = rho.clamp(-1.0, 1.0);
3197    if rho_clamped >= 1.0 - 1e-12 {
3198        return Ok(normal_cdf(h.min(k)));
3199    }
3200    if rho_clamped <= -1.0 + 1e-12 {
3201        return Ok((normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0));
3202    }
3203    if rho_clamped == 0.0 {
3204        return Ok((normal_cdf(h) * normal_cdf(k)).clamp(0.0, 1.0));
3205    }
3206    if h == 0.0 && k == 0.0 {
3207        return Ok((0.25 + rho_clamped.asin() / std::f64::consts::TAU).clamp(0.0, 1.0));
3208    }
3209
3210    let asr = rho_clamped.asin();
3211    let sum = bvn_gl_sum(h, k, rho_clamped, asr);
3212    Ok((normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3213}
3214
3215#[inline]
3216fn bvn_gl_sum_interval(h: f64, left: f64, right: f64, rho_clamped: f64, asr: f64) -> f64 {
3217    if rho_clamped == 0.0 {
3218        return 0.0;
3219    }
3220    let h2 = h * h;
3221    let right_hs = 0.5 * (h2 + right * right);
3222    let left_hs = 0.5 * (h2 + left * left);
3223    let half_asr = 0.5 * asr;
3224    let (sin_mid, cos_mid) = half_asr.sin_cos();
3225    let mut sum = 0.0;
3226    for i in 0..10 {
3227        let node = GL20_NODES[i].abs();
3228        let weight = GL20_WEIGHTS[i];
3229        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3230
3231        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3232        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3233        let lo_right = (((sn_lo * h * right) - right_hs) / one_minus_lo).exp();
3234        let lo_left = (((sn_lo * h * left) - left_hs) / one_minus_lo).exp();
3235
3236        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3237        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3238        let hi_right = (((sn_hi * h * right) - right_hs) / one_minus_hi).exp();
3239        let hi_left = (((sn_hi * h * left) - left_hs) / one_minus_hi).exp();
3240
3241        sum += weight * ((lo_right - lo_left) + (hi_right - hi_left));
3242    }
3243    sum
3244}
3245
3246fn bivariate_normal_cdf_interval(h: f64, left: f64, right: f64, rho: f64) -> Result<f64, String> {
3247    if right <= left {
3248        return Ok(0.0);
3249    }
3250    if left == f64::NEG_INFINITY && right == f64::INFINITY {
3251        return Ok(normal_cdf(h));
3252    }
3253    if !left.is_finite() || !right.is_finite() {
3254        let upper = bivariate_normal_cdf(h, right, rho)?;
3255        let lower = bivariate_normal_cdf(h, left, rho)?;
3256        return Ok((upper - lower).clamp(0.0, 1.0));
3257    }
3258    validate_bvn_args(h, left, rho)?;
3259    validate_bvn_args(h, right, rho)?;
3260    if h == f64::NEG_INFINITY {
3261        return Ok(0.0);
3262    }
3263    if h == f64::INFINITY {
3264        return Ok((normal_cdf(right) - normal_cdf(left)).clamp(0.0, 1.0));
3265    }
3266
3267    let rho_clamped = rho.clamp(-1.0, 1.0);
3268    if rho_clamped >= 1.0 - 1e-12 || rho_clamped <= -1.0 + 1e-12 {
3269        let upper = bivariate_normal_cdf(h, right, rho_clamped)?;
3270        let lower = bivariate_normal_cdf(h, left, rho_clamped)?;
3271        return Ok((upper - lower).clamp(0.0, 1.0));
3272    }
3273
3274    let cdf_h = normal_cdf(h);
3275    let normal_part = cdf_h * (normal_cdf(right) - normal_cdf(left));
3276    if rho_clamped == 0.0 {
3277        return Ok(normal_part.clamp(0.0, 1.0));
3278    }
3279    let asr = rho_clamped.asin();
3280    let sum = bvn_gl_sum_interval(h, left, right, rho_clamped, asr);
3281    Ok((normal_part + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3282}
3283
/// Evaluate `exp(−x²/2)`, returning exactly `0.0` for `x = ±∞`.
fn exp_neg_half_square(x: f64) -> f64 {
    if x.is_infinite() {
        return 0.0;
    }
    let exponent = -0.5 * x * x;
    exponent.exp()
}
3291
3292/// Zeroth truncated standard-normal moment `T_0(a, b) = ∫_a^b e^(−z²/2) dz
3293/// = √(2π)·(Φ(b) − Φ(a))`, evaluated without catastrophic cancellation in
3294/// either tail.
3295///
3296/// Writing `T_0 = √(π/2)·[erf(b/√2) − erf(a/√2)]`, the naive form collapses
3297/// to `0.0` whenever both endpoints lie in the *same* far tail: `erf`
3298/// saturates at the IEEE-754 values `±1.0` for `|x| ≳ 8.3·√2`, so the
3299/// difference of two saturated values is exactly zero even though the
3300/// integral is a strictly positive number well inside the f64 normal range
3301/// (e.g. `∫_{-12}^{-10} ≈ 1.9e-23`). The fix is to reduce the erf difference
3302/// to complementary tail probabilities — `erfc` is evaluated with a dedicated
3303/// tail series, *not* as `1 − erf` — and to pick, by the sign of the
3304/// endpoints, the algebraically-equivalent form whose terms do not cancel
3305/// against one another:
3306///
3307/// ```text
3308/// both ≥ 0 (upper tail):  erf(b/√2) − erf(a/√2) = erfc(a/√2) − erfc(b/√2)
3309/// both ≤ 0 (lower tail):  erf(b/√2) − erf(a/√2) = erfc(−b/√2) − erfc(−a/√2)
3310/// straddling zero:        erf(b/√2) − erf(a/√2)
3311///                        = erf(b/√2) + erf(−a/√2)       near the anchor
3312///                        = 2 − erfc(b/√2) − erfc(−a/√2) otherwise
3313/// ```
3314///
3315/// In each branch every `erfc` argument is `≥ 0`, so the terms are small
3316/// positive tail values, while narrow straddling intervals add two
3317/// non-negative `erf` masses measured outward from the anchor. That avoids
3318/// the `2 − erfc(b/√2) − erfc(−a/√2)` cancellation when both erfc terms round
3319/// to `1.0`, but keeps the erfc-tail form for ordinary/full-line straddling
3320/// intervals. No large quantities cancel and full f64 precision survives down
3321/// to the underflow boundary in either tail and around the affine anchor.
3322///
3323/// Uses `libm::erfc` (msun double-precision implementation, ≤ 1 ulp) rather
3324/// than `statrs::function::erf::erfc` (a 6-term rational approximation that
3325/// carries ~3·10⁻¹¹ relative error around `|x| ≈ 1/√2` — see the existing
3326/// `libm::erfc` consumer at `inference::polya_gamma_core::normal_cdf`). That
3327/// statrs error propagates directly into `T_0`, then through every higher
3328/// moment `T_n` (the recurrence `T_n = a^{n-1}e^{-a²/2} − b^{n-1}e^{-b²/2}
3329/// + (n-1)·T_{n-2}` walks `T_0` up two steps at a time), then through every
3330/// affine-cell moment via `affine_anchor_moment_vector` (whose `out[n]` is a
3331/// linear combination of `T_0..=T_n`), and is the dominant source of error
3332/// in the affine-cell branch of the cubic-cell substrate (CPU/GPU parity
3333/// reference for transformation-normal, bernoulli-marginal-slope, and the
3334/// BMS flex-row higher-derivative reuse path).
3335fn truncated_gaussian_zeroth_moment(a: f64, b: f64) -> f64 {
3336    let inv_sqrt2 = 1.0 / std::f64::consts::SQRT_2;
3337    let za = a * inv_sqrt2;
3338    let zb = b * inv_sqrt2;
3339    let erf_diff = if za >= 0.0 {
3340        libm::erfc(za) - libm::erfc(zb)
3341    } else if zb <= 0.0 {
3342        libm::erfc(-zb) - libm::erfc(-za)
3343    } else if zb <= 0.5 && -za <= 0.5 {
3344        // Near the affine anchor, erfc(zb) and erfc(-za) are both close to
3345        // one; subtracting them from 2.0 can round a tiny but representable
3346        // cell mass to zero. The equivalent erf sum adds small positive
3347        // quantities directly.
3348        libm::erf(zb) + libm::erf(-za)
3349    } else {
3350        2.0 - libm::erfc(zb) - libm::erfc(-za)
3351    };
3352    // √(2π)·½ = √(π/2).
3353    (std::f64::consts::PI / 2.0).sqrt() * erf_diff
3354}
3355
3356/// Fill `out[0..=max_degree]` with the raw truncated standard-normal moments
3357///
3358/// ```text
3359/// T_n(a, b) = ∫_a^b z^n exp(-z²/2) dz
3360/// ```
3361///
3362/// using the integration-by-parts recurrence
3363///
3364/// ```text
3365/// T_0(a, b) = √(2π) (Φ(b) − Φ(a))
3366/// T_1(a, b) = exp(−a²/2) − exp(−b²/2)
3367/// T_n(a, b) = a^(n−1) e^{−a²/2} − b^(n−1) e^{−b²/2} + (n−1) T_{n−2}(a, b)
3368/// ```
3369///
3370/// Computed in one forward sweep so each call evaluates `erf` and
3371/// `exp(−x²/2)` exactly twice (once at `a`, once at `b`) regardless of the
3372/// requested degree. The naive form — calling `T_n` recursively for each
3373/// `n = 0..=max_degree` — re-evaluated `erf`/`exp` about `max_degree²/4`
3374/// times per affine cell, which dominated the wall time of the
3375/// transformation-normal and bernoulli-marginal-slope inner solves with
3376/// `max_degree = 64` (the transport order's required degree budget).
3377fn fill_truncated_gaussian_moments(a: f64, b: f64, out: &mut [f64]) {
3378    if out.is_empty() {
3379        return;
3380    }
3381    out[0] = truncated_gaussian_zeroth_moment(a, b);
3382    if out.len() == 1 {
3383        return;
3384    }
3385    let ea = exp_neg_half_square(a);
3386    let eb = exp_neg_half_square(b);
3387    out[1] = ea - eb;
3388    if out.len() == 2 {
3389        return;
3390    }
3391    let a_finite = a.is_finite();
3392    let b_finite = b.is_finite();
3393    // For n in 2..=max_degree we need a^{n-1} e^{-a²/2} (resp. b). Carry the
3394    // running powers a^{n-1}, b^{n-1} forward by a single multiply per step.
3395    // Infinite endpoints contribute 0 (the integrand decays at the rate of
3396    // exp(−x²/2)), matching the prior `is_infinite` branch in the recursive
3397    // implementation; we still update the running power so the iteration
3398    // stays branchless when both endpoints are finite.
3399    let mut a_pow_n_minus_1 = a; // a^1, used at n = 2
3400    let mut b_pow_n_minus_1 = b;
3401    for n in 2..out.len() {
3402        let left = if a_finite { a_pow_n_minus_1 * ea } else { 0.0 };
3403        let right = if b_finite { b_pow_n_minus_1 * eb } else { 0.0 };
3404        out[n] = left - right + (n as f64 - 1.0) * out[n - 2];
3405        a_pow_n_minus_1 *= a;
3406        b_pow_n_minus_1 *= b;
3407    }
3408}
3409
/// Stack-array bound for `affine_anchor_moment_vector_into`. Public callers
/// use up to ~24 (largest is the bernoulli-margslope outer-step degree-21
/// reduction); 64 leaves comfortable headroom without growing the per-call
/// stack footprint meaningfully.
///
/// `affine_anchor_moment_vector_into` sizes its scratch arrays as
/// `MAX_AFFINE_ANCHOR_DEGREE + 1` and asserts
/// `max_degree <= MAX_AFFINE_ANCHOR_DEGREE`, so a larger request panics.
const MAX_AFFINE_ANCHOR_DEGREE: usize = 64;
3415
/// Allocating wrapper around `affine_anchor_moment_vector_into`.
///
/// Allocates a zero-filled vector of `max_degree + 1` entries, passes it to
/// the `_into` implementation together with all other arguments, and
/// returns it.
///
/// # Panics
/// Panics when `max_degree` exceeds `MAX_AFFINE_ANCHOR_DEGREE`; the `_into`
/// implementation asserts that bound.
pub fn affine_anchor_moment_vector(
    alpha: f64,
    beta: f64,
    left: f64,
    right: f64,
    max_degree: usize,
) -> Vec<f64> {
    // One slot per degree 0..=max_degree.
    let mut out = vec![0.0; max_degree + 1];
    affine_anchor_moment_vector_into(alpha, beta, left, right, max_degree, &mut out);
    out
}
3427
3428fn affine_anchor_moment_vector_into(
3429    alpha: f64,
3430    beta: f64,
3431    left: f64,
3432    right: f64,
3433    max_degree: usize,
3434    out: &mut [f64],
3435) {
3436    assert_eq!(out.len(), max_degree + 1);
3437    let s = (1.0 + beta * beta).sqrt();
3438    let mu = -alpha * beta / (1.0 + beta * beta);
3439    let y_left = if left.is_infinite() {
3440        if left.is_sign_positive() {
3441            f64::INFINITY
3442        } else {
3443            f64::NEG_INFINITY
3444        }
3445    } else {
3446        s * (left - mu)
3447    };
3448    let y_right = if right.is_infinite() {
3449        if right.is_sign_positive() {
3450            f64::INFINITY
3451        } else {
3452            f64::NEG_INFINITY
3453        }
3454    } else {
3455        s * (right - mu)
3456    };
3457    let anchor = (-alpha * alpha / (2.0 * s * s)).exp() / s;
3458    assert!(
3459        max_degree <= MAX_AFFINE_ANCHOR_DEGREE,
3460        "affine_anchor_moment_vector max_degree {} exceeds compile-time bound {}",
3461        max_degree,
3462        MAX_AFFINE_ANCHOR_DEGREE
3463    );
3464    let mut t = [0.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3465    fill_truncated_gaussian_moments(y_left, y_right, &mut t[..=max_degree]);
3466    // Build mu^k and s^{-k} tables once. The inner sum is the binomial
3467    // expansion of the affine change-of-variables, and computing the
3468    // binomial coefficient via Pascal's row recurrence + carrying mu/s
3469    // powers eliminates the per-(n, k) `powi` and binomial calls that
3470    // otherwise dominated the inner loop at large `max_degree`.
3471    let mut mu_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3472    for k in 1..=max_degree {
3473        mu_pow[k] = mu_pow[k - 1] * mu;
3474    }
3475    let inv_s = 1.0 / s;
3476    let mut inv_s_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3477    for k in 1..=max_degree {
3478        inv_s_pow[k] = inv_s_pow[k - 1] * inv_s;
3479    }
3480    out.fill(0.0);
3481    for n in 0..=max_degree {
3482        let mut acc = 0.0;
3483        // C(n, k+1) = C(n, k) · (n − k) / (k + 1).
3484        let mut binom = 1.0;
3485        for k in 0..=n {
3486            let term = binom * mu_pow[n - k] * inv_s_pow[k];
3487            acc = term.mul_add(t[k], acc);
3488            if k < n {
3489                binom = binom * (n - k) as f64 / (k + 1) as f64;
3490            }
3491        }
3492        out[n] = anchor * acc;
3493    }
3494}
3495
3496fn affine_value_from_moment_primitive(alpha: f64, beta: f64, left: f64, right: f64) -> f64 {
3497    // Exact formula via bivariate normal CDF.
3498    //
3499    // V(α,β,l,r) = ∫_l^r Φ(α+βz)φ(z)dz
3500    //            = P(U ≤ α+βZ, l ≤ Z ≤ r)    where U,Z iid N(0,1)
3501    //            = Φ₂(h, r; ρ) − Φ₂(h, l; ρ)
3502    //
3503    // with h = α/√(1+β²) and ρ = −β/√(1+β²).
3504    //
3505    // This is exact to floating-point precision via the high-accuracy
3506    // Drezner-Wesolowsky BVN routine, replacing the previous fixed 20-point
3507    // Gauss-Legendre numerical integration of the derivative primitive.
3508    let s = (1.0 + beta * beta).sqrt();
3509    let h = alpha / s;
3510    let rho = -beta / s;
3511    bivariate_normal_cdf_interval(h, left, right, rho).unwrap_or(0.0)
3512}
3513
3514/// Evaluate an affine cell (c2=c3=0) with a value/moment-consistent primitive.
3515///
3516/// Value and moments are now generated from the same affine moment primitive.
3517/// The zero-moment derivative is exact, and `value` is reconstructed by
3518/// integrating `d value / d alpha = INV_TWO_PI * moments[0]` over `alpha`
3519/// on a transformed semi-infinite domain.
3520pub fn evaluate_affine_cell_state(
3521    cell: DenestedCubicCell,
3522    max_degree: usize,
3523) -> Result<CellMomentState, String> {
3524    let alpha = cell.c0;
3525    let beta = cell.c1;
3526    let value = affine_value_from_moment_primitive(alpha, beta, cell.left, cell.right);
3527    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3528    Ok(CellMomentState {
3529        branch: ExactCellBranch::Affine,
3530        value,
3531        moments: moments.into(),
3532    })
3533}
3534
3535fn evaluate_affine_cell_derivative_state(
3536    cell: DenestedCubicCell,
3537    max_degree: usize,
3538) -> Result<CellDerivativeMomentState, String> {
3539    let alpha = cell.c0;
3540    let beta = cell.c1;
3541    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3542    Ok(CellDerivativeMomentState {
3543        branch: ExactCellBranch::Affine,
3544        moments: moments.into(),
3545    })
3546}
3547
/// Add `mw * z^k` to `moments[k]` for every slot `k` of `moments`.
///
/// Each update is a fused multiply-add, `mw.mul_add(z^k, moments[k])`, with
/// the power `z^k` carried forward by one multiply per slot. An empty slice
/// is a no-op.
///
/// The "unrolled4" in the name is historical: this is a plain scalar loop
/// that the SIMD quadrature sweep calls once per lane.
#[inline]
fn accumulate_moments_unrolled4(moments: &mut [f64], mw: f64, z: f64) {
    let mut power = 1.0_f64;
    moments.iter_mut().for_each(|slot| {
        *slot = mw.mul_add(power, *slot);
        power *= z;
    });
}
3562
// Shared SIMD Gauss-Legendre core for non-affine cells. The const generic
// `COMPUTE_VALUE` selects whether the cell value integral
// `∫ Φ(η(z)) · exp(-½z²) dz` (Φ is the normal CDF; the `1/√(2π)`
// normalization is NOT applied inside this function) is accumulated
// alongside the moments.
// Monomorphization collapses the const-generic branches at compile time, so
// `COMPUTE_VALUE = false` emits the moment-only path verbatim.
3568//
3569// Single source of truth for the moment SIMD lane ordering, the Horner-with-FMA
3570// pattern for η(z), the `0.5 * (z² + η²)` quadratic-form evaluation order, the
3571// unscaled per-node GL moment weights, the post-loop half-width fold, and the
3572// per-lane `accumulate_moments_unrolled4` call. The previous duplicated code paths
3573// drifted by 1 ULP whenever any of these details diverged; here both paths
3574// share the same instructions, eliminating an entire class of regressions
3575// where a tweak to the quadrature order or the FMA pattern would silently
3576// re-introduce divergence between the value- and derivative-only callers.
3577//
3578// Gauss-Legendre on [left, right] converges geometrically for the analytic
3579// integrand exp(-q(z)) with quartic/sextic q on a bounded cell; the prior
3580// adaptive transport path expanded basis_moments via the forward 3-/5-step
3581// recurrences in reduce_quartic/sextic_moments, which amplify roundoff by
3582// (1/lead)^n with lead = 2c2²/3c3² and overflow to NaN for small c2/c3 cells
3583// that arise naturally in production.
3584//
3585// The fixed 384-node rule that replaced the transport path is accurate but
3586// pays ~384 exp evaluations per cell unconditionally. Production cells are
3587// narrow spline-knot subdivisions where a 12- or 24-node rule is already
3588// converged to machine precision, and the flex marginal-slope row calculus
3589// evaluates O(100) such cells per row across n=10⁵–10⁶ rows per criterion
3590// evaluation — the fixed rule was the dominant cost of the whole fit (#979).
3591// `evaluate_non_affine_cell_simd` therefore walks a progressive ladder of
3592// rules (12, 24, 48, 96, 192, 384 nodes) and returns as soon as two
3593// consecutive rules agree to `NON_AFFINE_LADDER_RTOL` relative to the moment
3594// vector's own scale. Unlike the old fixed rule — whose error was real but
3595// uncertified — every accepted ladder result carries an embedded two-rule
3596// agreement certificate; a cell that never certifies falls through to the
3597// same 384-node answer the fixed rule produced.
3598//
// SIMD path: process 4 GL nodes per outer iteration, evaluating `exp(-q)`
// for all four nodes with a single 4-wide `wide::f64x4::exp` invocation. All
// ladder rule sizes are divisible by 4, so the scalar tail loop is never
// reached by the ladder rules. The inner moment accumulation is then run
// scalar per-lane through `accumulate_moments_unrolled4`, which, despite its
// historical name, is a plain scalar loop over the moment slots.
/// Evaluate one non-affine cell with a single Gauss-Legendre rule.
///
/// `gl_nodes` / `gl_weights` are the rule's nodes and weights. Each node is
/// mapped into the cell by `z = center + half_width·node`, where `center` and
/// `half_width` are the midpoint and half the width of
/// `[cell.left, cell.right]`.
///
/// Returns `(moments, value)`:
/// - `moments[k]`, for `k = 0..=max_degree`, is
///   `half_width · Σ_i w_i · z_i^k · exp(-½(z_i² + η(z_i)²))` with
///   `η(z) = c0 + c1·z + c2·z² + c3·z³`.
/// - `value` is `half_width · Σ_i w_i · exp(-½z_i²) · normal_cdf(η(z_i))`
///   when `COMPUTE_VALUE` is `true`, and `0.0` otherwise.
///
/// # Panics
///
/// Panics if `gl_nodes` and `gl_weights` differ in length.
#[inline(always)]
fn evaluate_non_affine_cell_with_rule<const COMPUTE_VALUE: bool>(
    cell: DenestedCubicCell,
    max_degree: usize,
    gl_nodes: &[f64],
    gl_weights: &[f64],
) -> (CellMomentVec, f64) {
    let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
    let mut value_integral = 0.0_f64;
    let center = 0.5 * (cell.left + cell.right);
    let half_width = 0.5 * (cell.right - cell.left);
    let c0 = cell.c0;
    let c1 = cell.c1;
    let c2 = cell.c2;
    let c3 = cell.c3;
    let moments_slice: &mut [f64] = &mut moments;
    assert_eq!(gl_nodes.len(), gl_weights.len());
    use wide::f64x4;
    // Broadcast the per-cell scalars once; they are loop-invariant.
    let center_v = f64x4::splat(center);
    let half_width_v = f64x4::splat(half_width);
    let c0_v = f64x4::splat(c0);
    let c1_v = f64x4::splat(c1);
    let c2_v = f64x4::splat(c2);
    let c3_v = f64x4::splat(c3);
    let neg_half_v = f64x4::splat(-0.5);
    let n_total = gl_nodes.len();
    // Largest multiple of 4 not exceeding the node count; the remainder is
    // handled by the scalar tail loop further down.
    let n_simd = n_total - (n_total % 4);
    let mut i = 0;
    while i < n_simd {
        let node_v = f64x4::from([
            gl_nodes[i],
            gl_nodes[i + 1],
            gl_nodes[i + 2],
            gl_nodes[i + 3],
        ]);
        let weight_v = f64x4::from([
            gl_weights[i],
            gl_weights[i + 1],
            gl_weights[i + 2],
            gl_weights[i + 3],
        ]);
        // Fused node map for the moment path: z = half_width·node + center.
        let z_v = half_width_v.mul_add(node_v, center_v);
        // Horner: ((c3*z + c2)*z + c1)*z + c0
        let eta_v = c3_v
            .mul_add(z_v, c2_v)
            .mul_add(z_v, c1_v)
            .mul_add(z_v, c0_v);
        let z2_v = z_v * z_v;
        // -q(z) = -½(z² + η²); one 4-wide exp covers all four nodes.
        let neg_q_v = neg_half_v * (z2_v + eta_v * eta_v);
        let exp_negq_v = neg_q_v.exp();
        let moment_weight_v = weight_v * exp_negq_v;
        let z_arr = z_v.to_array();
        let mw_arr = moment_weight_v.to_array();
        if COMPUTE_VALUE {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
                // The value integrand carries Φ(η)'s erfc, whose systematic
                // per-z error is ~1e-13. To honor the cell-value accuracy
                // contract the value term must be assembled bit-for-bit like
                // the scalar reference: a non-fused node map
                // `z_ref = center + half_width·node`, the expanded
                // `η = c0 + c1·z + c2·z² + c3·z³` (NOT the SIMD Horner-FMA used
                // for the moments), the unscaled GL weight, a scalar `exp(-½z²)`,
                // and a plain `+=`. The SIMD `z_v`/`eta_v` above (fused) feed
                // ONLY the moments and are left untouched. Any single ULP slip
                // here (FMA node map, Horner η, per-term half_width, SIMD exp,
                // FMA accumulation) drifts the 384-node sum by ~1.4e-13 and
                // breaks the contract.
                let node = gl_nodes[i + lane];
                let weight = gl_weights[i + lane];
                let z_ref = center + half_width * node;
                let eta_ref = c0 + c1 * z_ref + c2 * z_ref * z_ref + c3 * z_ref * z_ref * z_ref;
                value_integral += weight * (-0.5 * z_ref * z_ref).exp() * normal_cdf(eta_ref);
            }
        } else {
            // Moment-only instantiation: same per-lane accumulation, no value.
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
            }
        }
        i += 4;
    }
    // Scalar tail for rules whose node count is not a multiple of 4.
    while i < n_total {
        let node = gl_nodes[i];
        let weight = gl_weights[i];
        let z = center + half_width * node;
        let eta = c3.mul_add(z, c2).mul_add(z, c1).mul_add(z, c0);
        let q = 0.5 * (z * z + eta * eta);
        let moment_weight = weight * (-q).exp();
        accumulate_moments_unrolled4(moments_slice, moment_weight, z);
        if COMPUTE_VALUE {
            // Bit-for-bit the reference value structure (see SIMD branch): the
            // node map `z = center + half_width·node` here already matches the
            // reference (non-fused), but η must use the expanded reference form
            // rather than the moment path's Horner-FMA.
            let eta_ref = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
            value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta_ref);
        }
        i += 1;
    }
    // Apply the cell half-width to both moment and value integrals ONCE at the
    // end, mirroring the prefold reference. Folding half_width per-term changes
    // f64 rounding enough to show up at the 1e-13 contract.
    for moment in moments_slice.iter_mut() {
        *moment *= half_width;
    }
    // With `COMPUTE_VALUE = false` the accumulator was never touched, so the
    // returned value is its initial 0.0.
    let value = if COMPUTE_VALUE {
        value_integral * half_width
    } else {
        value_integral
    };
    (moments, value)
}
3721
/// Relative agreement threshold for the progressive non-affine quadrature
/// ladder: two consecutive Gauss-Legendre rules must agree on every moment
/// slot to this tolerance, relative to the moment vector's own max
/// magnitude, before the finer rule's result is accepted. Gauss-Legendre
/// error decays geometrically in the node count for the analytic integrand
/// `exp(-q(z))`, so agreement between an n-node and a 2n-node rule certifies
/// that both are converged: the coarse rule's true error is bounded by the
/// observed difference plus the (much smaller) fine-rule error.
///
/// History (#979): a roundoff-floor relaxation of this test (accept when
/// successive rungs agree to `≈ n·ε·scale` rather than the bare `3e-15`) was
/// tried to let smooth cells certify below the terminal 384-node rung. It was
/// reverted: the value-bearing path carries `∫ φ(z)·Φ(η(z)) dz`, and `Φ`'s
/// `erfc` implementation has a *systematic per-z* error of order `1e-13` that
/// each rung's node set samples differently. Only the exact 384-node rule
/// reproduces the reference's erfc-noise realization, so any sub-384 rung
/// drifts from the 384 value by `≈ 1e-13` — a drift that is NOT truncation,
/// does NOT shrink with rung, and is NOT bounded by rung-to-rung agreement.
/// The moment ladder remains independent of the value integral so value- and
/// derivative-only evaluators keep returning bit-identical moments. The scalar
/// value now evaluates on the terminal 384-node rule directly, preserving the
/// `non_affine_cell_state_matches_prefold_reference_to_1e_minus_13` value
/// contract without forcing every derivative-moment caller to use the terminal
/// rung.
///
/// NOTE(review): the history paragraph quotes a bare threshold of `3e-15`,
/// but the constant below is `1e-15` — confirm which value is intended and
/// bring the text and the constant into agreement.
const NON_AFFINE_LADDER_RTOL: f64 = 1e-15;
3748
/// Node counts of the progressive ladder below the 384-node terminal rung,
/// in the order the ladder walks them.
///
/// All are divisible by 4, so the 4-wide SIMD sweep covers every node and
/// needs no scalar tail. Index `i` here pairs with slot `i` of
/// `NON_AFFINE_LADDER_CERT_COUNTS`.
const NON_AFFINE_LADDER_RUNGS: [usize; 5] = [12, 24, 48, 96, 192];
3752
3753/// Runtime-generated Gauss-Legendre rules for the ladder rungs, computed
3754/// once per process by Newton iteration on the Legendre polynomial roots
3755/// (standard `gauleg`: cosine initial guess, 3-4 Newton steps to machine
3756/// precision). The terminal 384-node rung reuses the compile-time
3757/// `GL_NODES`/`GL_WEIGHTS` tables, which also remain the single source for
3758/// the GPU kernel.
3759fn non_affine_ladder_rules() -> &'static [(Vec<f64>, Vec<f64>)] {
3760    static RULES: std::sync::OnceLock<Vec<(Vec<f64>, Vec<f64>)>> = std::sync::OnceLock::new();
3761    RULES.get_or_init(|| {
3762        NON_AFFINE_LADDER_RUNGS
3763            .iter()
3764            .map(|&n| gauss_legendre_rule(n))
3765            .collect()
3766    })
3767}
3768
/// Nodes and weights of the `n`-point Gauss-Legendre rule on `[-1, 1]`.
///
/// For each root in the upper half, Newton iteration on `P_n` starts from
/// the cosine guess `cos(π(i + 0.75)/(n + 0.5))` and stops once a step moves
/// the root by at most `f64::EPSILON` (or after 100 steps). The weight is
/// `w = 2 / ((1 − x²) · P_n'(x)²)`, using the derivative from the last Newton
/// step. Every root is stored as the pair `(-x, x)` at mirrored indices, so
/// the nodes come out in ascending order and exactly antisymmetric about 0.
/// `n = 0` yields two empty vectors.
fn gauss_legendre_rule(n: usize) -> (Vec<f64>, Vec<f64>) {
    // Three-term Legendre recurrence; returns (P_n(x), P_{n-1}(x)).
    let legendre_pair = |x: f64| -> (f64, f64) {
        let mut current = 1.0_f64;
        let mut previous = 0.0_f64;
        for j in 1..=n {
            let next = ((2 * j - 1) as f64 * x * current - (j - 1) as f64 * previous) / j as f64;
            previous = current;
            current = next;
        }
        (current, previous)
    };

    let order = n as f64;
    let mut nodes = vec![0.0_f64; n];
    let mut weights = vec![0.0_f64; n];
    for i in 0..n.div_ceil(2) {
        let mut root = (std::f64::consts::PI * (i as f64 + 0.75) / (order + 0.5)).cos();
        let mut slope = 0.0_f64;
        for _ in 0..100 {
            let (p_n, p_n_minus_1) = legendre_pair(root);
            // P_n'(x) = n · (x·P_n(x) − P_{n−1}(x)) / (x² − 1).
            slope = order * (root * p_n - p_n_minus_1) / (root * root - 1.0);
            let updated = root - p_n / slope;
            let moved = (updated - root).abs();
            root = updated;
            if moved <= f64::EPSILON {
                break;
            }
        }
        let mirror = n - 1 - i;
        // Negative node first: for odd `n` the middle index is written twice
        // and must end up holding `root`, not `-root`.
        nodes[i] = -root;
        nodes[mirror] = root;
        let weight = 2.0 / ((1.0 - root * root) * slope * slope);
        weights[i] = weight;
        weights[mirror] = weight;
    }
    (nodes, weights)
}
3805
3806/// Two-rule agreement certificate for the progressive ladder. `true` when
3807/// every MOMENT slot agrees to `NON_AFFINE_LADDER_RTOL` relative to the fine
3808/// result's max magnitude. Non-finite results never certify, so they fall
3809/// through to the terminal 384-node rung and reproduce the fixed rule's
3810/// behavior exactly.
3811///
3812/// The decision is deliberately moment-only and independent of whether the
3813/// caller also computed the cell value: the value- and derivative-only
3814/// evaluators MUST select the same ladder rung so they accumulate the moment
3815/// vector over the same nodes and return bit-identical moments (the
3816/// `derivative_moment_evaluator_matches_value_evaluator_moments` invariant).
3817/// Value-bearing callers evaluate the scalar cell probability separately on
3818/// the terminal 384-node rule; this certificate governs only the reusable
3819/// derivative moment vector.
3820fn non_affine_ladder_converged(coarse: &CellMomentVec, fine: &CellMomentVec) -> bool {
3821    let mut scale = 0.0_f64;
3822    let mut err = 0.0_f64;
3823    for (&c, &f) in coarse.iter().zip(fine.iter()) {
3824        scale = scale.max(f.abs());
3825        err = err.max((c - f).abs());
3826    }
3827    if !(scale.is_finite() && err.is_finite()) {
3828        return false;
3829    }
3830    err <= NON_AFFINE_LADDER_RTOL * scale
3831}
3832
3833/// Per-rung certification histogram for the non-affine ladder, indexed by the
3834/// rung that certified (`NON_AFFINE_LADDER_RUNGS[i]` at index `i`), with the
3835/// final slot counting cells that fell through to the terminal 384-node rule.
3836/// Incremented once per non-affine cell evaluation; the BMS exact-cache build
3837/// logs the distribution so the ladder's real cost (early-certify win vs.
3838/// terminal-fallthrough cost) is observable on every large-scale fit rather
3839/// than assumed. `+1` length for the terminal bucket.
3840pub(crate) static NON_AFFINE_LADDER_CERT_COUNTS: [AtomicU64; NON_AFFINE_LADDER_RUNGS.len() + 1] = [
3841    AtomicU64::new(0),
3842    AtomicU64::new(0),
3843    AtomicU64::new(0),
3844    AtomicU64::new(0),
3845    AtomicU64::new(0),
3846    AtomicU64::new(0),
3847];
3848
3849/// Snapshot the ladder certification histogram as `(rung_node_count, count)`
3850/// pairs plus the terminal-fallthrough count, for logging/inspection.
3851pub fn non_affine_ladder_cert_histogram() -> (Vec<(usize, u64)>, u64) {
3852    let per_rung = NON_AFFINE_LADDER_RUNGS
3853        .iter()
3854        .enumerate()
3855        .map(|(i, &n)| (n, NON_AFFINE_LADDER_CERT_COUNTS[i].load(Ordering::Relaxed)))
3856        .collect();
3857    let terminal =
3858        NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].load(Ordering::Relaxed);
3859    (per_rung, terminal)
3860}
3861
3862/// Progressive-ladder evaluation of a non-affine cell: walk the rule ladder
3863/// from 12 nodes upward and return the first result certified by two-rule
3864/// agreement; a cell that never certifies returns the terminal 384-node
3865/// result, byte-identical to the previous fixed-rule implementation.
3866#[inline]
3867fn evaluate_non_affine_cell_simd<const COMPUTE_VALUE: bool>(
3868    cell: DenestedCubicCell,
3869    max_degree: usize,
3870) -> (CellMomentVec, f64) {
3871    let mut prev: Option<(CellMomentVec, f64)> = None;
3872    for (i, (nodes, weights)) in non_affine_ladder_rules().iter().enumerate() {
3873        let cur =
3874            evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, nodes, weights);
3875        if let Some(prev) = prev.as_ref()
3876            && non_affine_ladder_converged(&prev.0, &cur.0)
3877        {
3878            NON_AFFINE_LADDER_CERT_COUNTS[i].fetch_add(1, Ordering::Relaxed);
3879            return cur;
3880        }
3881        prev = Some(cur);
3882    }
3883    NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].fetch_add(1, Ordering::Relaxed);
3884    evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, &GL_NODES, &GL_WEIGHTS)
3885}
3886
3887/// Value-only evaluation of a non-affine cell on the terminal 384-node rule.
3888///
3889/// Returns the cell probability integral `∫ exp(-½z²)·Φ(η(z)) dz` (pre the
3890/// `1/√τ` normalization) computed bit-for-bit like the value branch of
3891/// [`evaluate_non_affine_cell_with_rule`]: the non-fused node map
3892/// `z = center + half_width·node`, the expanded (non-Horner)
3893/// `η = c0 + c1·z + c2·z² + c3·z³`, the unscaled GL weight, a scalar
3894/// `exp(-½z²)`, a plain `+=` in ascending node order, and a single trailing
3895/// `·half_width`. The terminal rule has 384 nodes (divisible by 4), so the
3896/// general kernel's value path never takes its scalar tail — this loop walks
3897/// the same nodes in the same order and therefore reproduces the reference
3898/// erfc-noise realization the `1e-13` value contract pins down.
3899///
3900/// Computing this through `evaluate_non_affine_cell_with_rule::<true>` at
3901/// `max_degree = 0` would additionally run the 4-wide SIMD `exp(-q)` moment
3902/// sweep and a moment accumulation on every node only to discard the moment
3903/// vector. The survival marginal-slope fit evaluates a value per non-affine
3904/// partition cell, so that discarded moment work is the dominant waste in the
3905/// per-cell pass; this evaluator does only the work the value needs.
3906fn evaluate_non_affine_cell_value_terminal(cell: DenestedCubicCell) -> f64 {
3907    let center = 0.5 * (cell.left + cell.right);
3908    let half_width = 0.5 * (cell.right - cell.left);
3909    let c0 = cell.c0;
3910    let c1 = cell.c1;
3911    let c2 = cell.c2;
3912    let c3 = cell.c3;
3913    let mut value_integral = 0.0_f64;
3914    for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
3915        let z = center + half_width * node;
3916        let eta = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
3917        value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
3918    }
3919    value_integral * half_width
3920}
3921
3922fn evaluate_non_affine_cell_state(
3923    cell: DenestedCubicCell,
3924    branch: ExactCellBranch,
3925    max_degree: usize,
3926) -> Result<CellMomentState, String> {
3927    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3928    let value_integral = evaluate_non_affine_cell_value_terminal(cell);
3929    // Reference structure: `value_integral * half_width / sqrt(TAU)`. The
3930    // half_width factor is already applied inside the rule evaluator, so divide
3931    // by sqrt(TAU) here (a true division, NOT multiply-by-reciprocal) to
3932    // reproduce the reference's final rounding bit-for-bit.
3933    Ok(CellMomentState {
3934        branch,
3935        value: value_integral / (std::f64::consts::TAU).sqrt(),
3936        moments,
3937    })
3938}
3939
3940fn evaluate_non_affine_cell_derivative_state(
3941    cell: DenestedCubicCell,
3942    branch: ExactCellBranch,
3943    max_degree: usize,
3944) -> Result<CellDerivativeMomentState, String> {
3945    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3946    Ok(CellDerivativeMomentState { branch, moments })
3947}
3948
3949/// De-nested cubic cell evaluator.
3950///
3951/// Affine cells use the closed-form affine anchor; non-affine cells (Quartic
3952/// and Sextic branches) are evaluated in a single pass over a fixed
3953/// high-order Gauss-Legendre rule on `[left, right]`.
3954pub fn evaluate_cell_moments(
3955    cell: DenestedCubicCell,
3956    max_degree: usize,
3957) -> Result<CellMomentState, String> {
3958    if !TAIL_CELL_MOMENT_CACHE_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
3959        return evaluate_cell_moments_uncached(cell, max_degree);
3960    }
3961    tail_cell_moment_cache().evaluate(cell, max_degree)
3962}
3963
3964/// Evaluate cell moments without consulting the global affine-tail memo.
3965///
3966/// This is retained for regression tests and before/after microbenchmarks;
3967/// production callers should use [`evaluate_cell_moments`].
3968pub fn evaluate_cell_moments_uncached(
3969    cell: DenestedCubicCell,
3970    max_degree: usize,
3971) -> Result<CellMomentState, String> {
3972    evaluate_cell_state_dispatched(
3973        cell,
3974        max_degree,
3975        evaluate_affine_cell_state,
3976        evaluate_non_affine_cell_state,
3977    )
3978}
3979
3980/// Evaluate only the moment vector needed by derivative contractions.
3981///
3982/// This deliberately does not compute the cell probability value
3983/// `∫ φ(z) Φ(η(z)) dz`. Derivative contractions consume
3984/// `∫ z^k exp(-q(z)) dz` moments only, so keeping the value out of the return
3985/// type prevents this cheaper evaluator from satisfying value-bearing calls.
3986pub fn evaluate_cell_derivative_moments_uncached(
3987    cell: DenestedCubicCell,
3988    max_degree: usize,
3989) -> Result<CellDerivativeMomentState, String> {
3990    evaluate_cell_state_dispatched(
3991        cell,
3992        max_degree,
3993        evaluate_affine_cell_derivative_state,
3994        evaluate_non_affine_cell_derivative_state,
3995    )
3996}
3997
3998/// Shared branch dispatch for the value-bearing and derivative-only cell
3999/// evaluators. Both walk the same decision tree (semi-infinite tail → must
4000/// be affine; finite cell → branch-by-coefficients with the sextic
4001/// degenerate-lowering path), differing only in which pair of
4002/// `(affine, non_affine)` evaluator helpers to delegate to.  The two helpers
4003/// are passed as `fn` pointers so the dispatch monomorphizes per `S` and
4004/// keeps the existing pre-condition errors / unreachable branch handling
4005/// in lockstep across both evaluators.
4006fn evaluate_cell_state_dispatched<S>(
4007    cell: DenestedCubicCell,
4008    max_degree: usize,
4009    affine: fn(DenestedCubicCell, usize) -> Result<S, String>,
4010    non_affine: fn(DenestedCubicCell, ExactCellBranch, usize) -> Result<S, String>,
4011) -> Result<S, String> {
4012    let left_inf = !cell.left.is_finite();
4013    let right_inf = !cell.right.is_finite();
4014    if left_inf || right_inf {
4015        // Semi-infinite tail cells must be affine: the deviation saturates
4016        // to a constant outside support, so c2=c3=0.  Both the BVN CDF
4017        // and the truncated-Gaussian moment vector handle infinite bounds.
4018        if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL
4019        {
4020            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4021                "semi-infinite cell [{}, {}] must be affine (c2=c3=0), got c2={:.3e}, c3={:.3e}",
4022                cell.left, cell.right, cell.c2, cell.c3
4023            ))
4024            .into());
4025        }
4026        return affine(cell, max_degree);
4027    }
4028    if cell.right <= cell.left {
4029        return Err(CubicCellKernelError::invalid_cell_shape(format!(
4030            "finite cell must have left < right, got [{}, {}]",
4031            cell.left, cell.right
4032        ))
4033        .into());
4034    }
4035    let branch = branch_cell(cell)?;
4036    if branch == ExactCellBranch::Affine {
4037        return affine(cell, max_degree);
4038    }
4039    if branch == ExactCellBranch::Sextic {
4040        let lead = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3)[5];
4041        if !lead.is_finite() {
4042            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4043                "sextic cell evaluation encountered non-finite leading coefficient: {lead:.3e}"
4044            ))
4045            .into());
4046        }
4047        if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
4048            return match lower_branch {
4049                ExactCellBranch::Quartic => non_affine(
4050                    DenestedCubicCell { c3: 0.0, ..cell },
4051                    ExactCellBranch::Quartic,
4052                    max_degree,
4053                ),
4054                ExactCellBranch::Affine => affine(
4055                    DenestedCubicCell {
4056                        c2: 0.0,
4057                        c3: 0.0,
4058                        ..cell
4059                    },
4060                    max_degree,
4061                ),
4062                ExactCellBranch::Sextic => Err(CubicCellKernelError::invalid_cell_shape(
4063                    "internal: degenerate_sextic_branch returned Sextic as a lowered branch",
4064                )
4065                .into()),
4066            };
4067        }
4068    }
4069    non_affine(cell, branch, max_degree)
4070}
4071
4072/// Evaluate a de-nested cubic cell through a fit-lifetime byte-limited LRU cache.
4073///
4074/// The fingerprint is an exact bit-cast of `(c0, c1, c2, c3, left, right)`, so
4075/// eviction and reuse cannot alias nearby-but-different cells.  A cached entry
4076/// computed to a higher degree may satisfy a lower-degree request by truncating
4077/// the moment vector, preserving the public [`evaluate_cell_moments`] contract.
4078pub fn evaluate_cell_moments_cached(
4079    cell: DenestedCubicCell,
4080    max_degree: usize,
4081    cache: &CellMomentLruCache,
4082    stats: Option<&CellMomentCacheStats>,
4083) -> Result<CellMomentState, String> {
4084    // Affine cells (every rigid-path cell and every tail cell) evaluate
4085    // through the closed-form anchor — cheaper than a single LRU probe. The
4086    // LRU exists only to amortize the EXPENSIVE non-affine transport across
4087    // recurring cells; at large n the row scalars `(a, b)` are unique per
4088    // row, so affine cells never recur and routing them through the sharded
4089    // mutex was pure cost (320k lock+insert+evict ops per gradient eval, ~0%
4090    // hit — the dominant cost of the rigid n=320k fit, #979). Bypass the
4091    // cache entirely for them.
4092    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4093        if let Some(stats) = stats {
4094            stats.misses.fetch_add(1, Ordering::Relaxed);
4095        }
4096        return evaluate_cell_moments_uncached(cell, max_degree);
4097    }
4098    let key = CellFingerprint::new(cell);
4099    let existing_derivative = match cache.get(&key) {
4100        Some(cached) => {
4101            if let Some(state) = cached.state_for_degree(max_degree) {
4102                if let Some(stats) = stats {
4103                    stats.hits.fetch_add(1, Ordering::Relaxed);
4104                }
4105                return Ok(state);
4106            }
4107            // `cached.derivative_state` is `Option<Arc<_>>`; `.clone()` here
4108            // is the cheap refcount bump the audit-39 fix targets, not a
4109            // full moment-vector deep clone.
4110            cached.derivative_state.clone()
4111        }
4112        None => None,
4113    };
4114    if let Some(stats) = stats {
4115        stats.misses.fetch_add(1, Ordering::Relaxed);
4116    }
4117    let state = evaluate_cell_moments(cell, max_degree)?;
4118    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4119    // through `Arc::clone`, and return the underlying value by unwrapping the
4120    // unique-reference (caller-side) `Arc`. This replaces the prior
4121    // `state.clone()` deep copy at the insert site.
4122    let shared = Arc::new(state);
4123    let mut entry = CachedCellMoments::new(Arc::clone(&shared));
4124    if let Some(derivative) = existing_derivative {
4125        entry = entry.with_derivative(derivative);
4126    }
4127    cache.insert(key, entry);
4128    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4129}
4130
4131/// Derivative-moment counterpart to [`evaluate_cell_moments_cached`]. Shares
4132/// the value-moment LRU by storing both moment kinds in a single
4133/// [`CachedCellMoments`] entry keyed on the cell fingerprint — derivative
4134/// insertions preserve any pre-existing value state and vice versa, so the
4135/// two callers never evict each other's work.
4136pub fn evaluate_cell_derivative_moments_cached(
4137    cell: DenestedCubicCell,
4138    max_degree: usize,
4139    cache: &CellMomentLruCache,
4140    stats: Option<&CellMomentCacheStats>,
4141) -> Result<CellDerivativeMomentState, String> {
4142    // Affine cells bypass the LRU — see `evaluate_cell_moments_cached` for
4143    // why the sharded-mutex memo is pure overhead on the closed-form affine
4144    // path at large n (#979).
4145    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4146        if let Some(stats) = stats {
4147            stats.misses.fetch_add(1, Ordering::Relaxed);
4148        }
4149        return evaluate_cell_derivative_moments_uncached(cell, max_degree);
4150    }
4151    let key = CellFingerprint::new(cell);
4152    let existing_value = match cache.get(&key) {
4153        Some(cached) => {
4154            if let Some(state) = cached.derivative_state_for_degree(max_degree) {
4155                if let Some(stats) = stats {
4156                    stats.hits.fetch_add(1, Ordering::Relaxed);
4157                }
4158                return Ok(state);
4159            }
4160            // `cached.state` is `Option<Arc<_>>`; `.clone()` here is the cheap
4161            // refcount bump the audit-39 fix targets, not a full moment-vector
4162            // deep clone.
4163            cached.state.clone()
4164        }
4165        None => None,
4166    };
4167    if let Some(stats) = stats {
4168        stats.misses.fetch_add(1, Ordering::Relaxed);
4169    }
4170    let state = evaluate_cell_derivative_moments_uncached(cell, max_degree)?;
4171    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4172    // through `Arc::clone`, and return the underlying value by unwrapping the
4173    // unique-reference (caller-side) `Arc`. This replaces the prior
4174    // `state.clone()` deep copy at the insert site.
4175    let shared = Arc::new(state);
4176    let mut entry = CachedCellMoments::new_derivative(Arc::clone(&shared));
4177    if let Some(value) = existing_value {
4178        entry = entry.with_value(value);
4179    }
4180    cache.insert(key, entry);
4181    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4182}
4183
4184/// Scratch-backed variant of [`evaluate_cell_moments`].
4185///
4186/// Reuses the supplied [`CellMomentScratch`] for the returned moments slice,
4187/// so repeated calls with the same scratch (and a sufficient initial capacity)
4188/// avoid per-call `Vec` allocations on the hot inner-PIRLS row-intercept
4189/// solver path. Internal transport allocations are unchanged.
4190pub fn evaluate_cell_moments_with_scratch<'a>(
4191    cell: DenestedCubicCell,
4192    max_degree: usize,
4193    scratch: &'a mut CellMomentScratch,
4194) -> Result<CellMomentStateRef<'a>, String> {
4195    let state = evaluate_cell_moments(cell, max_degree)?;
4196    let out = scratch.prepare_moments(max_degree + 1);
4197    out.copy_from_slice(&state.moments);
4198    Ok(CellMomentStateRef {
4199        branch: state.branch,
4200        value: state.value,
4201        moments: out,
4202    })
4203}
4204
4205#[cfg(test)]
4206mod tests {
4207    use super::*;
4208    use gam_math::probability::normal_pdf;
4209
4210    #[inline]
4211    pub(super) fn polynomial_value(coefficients: &[f64], z: f64) -> f64 {
4212        coefficients
4213            .iter()
4214            .rev()
4215            .fold(0.0, |acc, &coeff| acc * z + coeff)
4216    }
4217
4218    fn reset_cell_moment_test_reallocs() {
4219        super::CELL_MOMENT_REALLOCS.store(0, std::sync::atomic::Ordering::Relaxed);
4220    }
4221
4222    fn cell_moment_test_reallocs() -> usize {
4223        super::CELL_MOMENT_REALLOCS.load(std::sync::atomic::Ordering::Relaxed)
4224    }
4225
4226    fn assert_close_rel(label: &str, actual: f64, expected: f64, tol: f64) {
4227        let denom = expected.abs().max(1.0);
4228        let rel = (actual - expected).abs() / denom;
4229        assert!(
4230            rel <= tol,
4231            "{label}: actual={actual:.17e} expected={expected:.17e} rel={rel:.3e} tol={tol:.3e}"
4232        );
4233    }
4234
4235    // The link-basis cell coefficient `transformed_link_cubic(span, a, b)` is, in
4236    // each of its four output components, a polynomial of TOTAL degree exactly 3 in
4237    // (a, b):
4238    //   d0 = c0 + c1·s + c2·s² + c3·s³            (s = a − left; deg 3 in a)
4239    //   d1 = b·(c1 + 2c2·s + 3c3·s²)              (a²·b → total deg 3)
4240    //   d2 = b²·(c2 + 3c3·s)                       (a·b² → total deg 3)
4241    //   d3 = c3·b³                                 (b³  → total deg 3)
4242    // Therefore EVERY 4th-order total (a,b)-partial (∂⁴/∂aⁱ∂b^{4−i}) is identically
4243    // zero, while the 3rd-order partials (∂³/∂aⁱ∂b^{3−i}) are the highest nonzero
4244    // ones. This is the exact algebraic fact the bidirectional flex jet relies on:
4245    // a "second mixed derivative of a third-a-partial" slot, etc., demands a 4th
4246    // total (a,b)-partial and must be hard-zero — substituting a (nonzero) 3rd
4247    // partial there is a bug. This test certifies BOTH facts by central FD so the
4248    // hard-coded `0.0` fixes are provably correct and provably necessary.
4249    #[test]
4250    fn link_basis_cell_fourth_ab_partials_vanish_third_are_nonzero() {
4251        let span = LocalSpanCubic {
4252            left: -0.4,
4253            right: 1.6,
4254            c0: 0.37,
4255            c1: -0.81,
4256            c2: 0.53,
4257            c3: -0.29,
4258        };
4259        let a0 = 0.23_f64;
4260        let b0 = 0.61_f64;
4261        let h = 1e-2_f64;
4262
4263        // Generic central-difference stencils per derivative order.
4264        let stencil = |order: usize| -> &'static [(i64, f64)] {
4265            match order {
4266                0 => &[(0, 1.0)],
4267                1 => &[(-1, -0.5), (1, 0.5)],
4268                2 => &[(-1, 1.0), (0, -2.0), (1, 1.0)],
4269                3 => &[(-2, -0.5), (-1, 1.0), (1, -1.0), (2, 0.5)],
4270                4 => &[(-2, 1.0), (-1, -4.0), (0, 6.0), (1, -4.0), (2, 1.0)],
4271                _ => &[(0, 1.0)],
4272            }
4273        };
4274        // FD of component `k` of the cell coefficient: ∂^{na+nb}/∂a^{na}∂b^{nb}.
4275        let fd = |k: usize, na: usize, nb: usize| -> f64 {
4276            let mut acc = 0.0;
4277            for &(ia, wa) in stencil(na) {
4278                for &(ib, wb) in stencil(nb) {
4279                    let a = a0 + (ia as f64) * h;
4280                    let b = b0 + (ib as f64) * h;
4281                    acc += wa * wb * link_basis_cell_coefficients(span, a, b)[k];
4282                }
4283            }
4284            acc / h.powi((na + nb) as i32)
4285        };
4286
4287        let (p3_aaa, p3_aab, p3_abb, p3_bbb) = link_basis_cell_third_partials(span);
4288
4289        // (1) The analytic 3rd partials match FD (within FD truncation) — and at
4290        // least one is appreciably nonzero, so these are real signal that a wrong
4291        // slot would inject.
4292        let mut max_third = 0.0_f64;
4293        for k in 0..4 {
4294            for (label, (na, nb), analytic) in [
4295                ("aaa", (3usize, 0usize), p3_aaa[k]),
4296                ("aab", (2, 1), p3_aab[k]),
4297                ("abb", (1, 2), p3_abb[k]),
4298                ("bbb", (0, 3), p3_bbb[k]),
4299            ] {
4300                let got = fd(k, na, nb);
4301                assert!(
4302                    (got - analytic).abs() <= 1e-4 + 1e-3 * analytic.abs(),
4303                    "3rd partial {label}[{k}] analytic {analytic:+.6e} vs FD {got:+.6e}"
4304                );
4305                max_third = max_third.max(analytic.abs());
4306            }
4307        }
4308        assert!(
4309            max_third > 1e-1,
4310            "expected an appreciable nonzero 3rd (a,b)-partial; max |analytic| = {max_third:.3e}"
4311        );
4312
4313        // (2) EVERY 4th-order total (a,b)-partial vanishes (degree-3 polynomial),
4314        // certifying that the hard-coded `0.0` in the bidirectional d12 slots is the
4315        // mathematically required value, not an approximation.
4316        for k in 0..4 {
4317            for (na, nb) in [(4usize, 0usize), (3, 1), (2, 2), (1, 3), (0, 4)] {
4318                let got = fd(k, na, nb);
4319                assert!(
4320                    got.abs() <= 1e-2,
4321                    "4th (a,b)-partial ∂^{na}_a∂^{nb}_b of cell coeff[{k}] must vanish, FD = {got:+.6e}"
4322                );
4323            }
4324        }
4325    }
4326
4327    #[test]
4328    fn non_affine_cell_state_grid_matches_public_cell_moments_reference() {
4329        let cells = [
4330            DenestedCubicCell {
4331                left: -1.25,
4332                right: -0.2,
4333                c0: -0.35,
4334                c1: 0.85,
4335                c2: 0.04,
4336                c3: -0.015,
4337            },
4338            DenestedCubicCell {
4339                left: -0.2,
4340                right: 0.55,
4341                c0: 0.12,
4342                c1: -0.65,
4343                c2: -0.025,
4344                c3: 0.02,
4345            },
4346            DenestedCubicCell {
4347                left: 0.55,
4348                right: 1.6,
4349                c0: 0.42,
4350                c1: 0.35,
4351                c2: 0.018,
4352                c3: 0.012,
4353            },
4354        ];
4355        for cell in cells {
4356            let branch = branch_cell(cell).expect("branch");
4357            assert_ne!(branch, ExactCellBranch::Affine);
4358            for max_degree in [0usize, 2, 4, 9, 16] {
4359                let direct = evaluate_non_affine_cell_state(cell, branch, max_degree)
4360                    .expect("direct non-affine transport");
4361                let public = evaluate_cell_moments(cell, max_degree).expect("public evaluator");
4362                assert_eq!(direct.branch, public.branch);
4363                assert_eq!(direct.moments.len(), public.moments.len());
4364                let value_scale = direct.value.abs().max(public.value.abs()).max(1.0);
4365                assert!(
4366                    (direct.value - public.value).abs() <= 1e-10 * value_scale,
4367                    "value mismatch for {cell:?} degree {max_degree}: direct={} public={}",
4368                    direct.value,
4369                    public.value
4370                );
4371                for (degree, (lhs, rhs)) in
4372                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4373                {
4374                    let scale = lhs.abs().max(rhs.abs()).max(1.0);
4375                    assert!(
4376                        (lhs - rhs).abs() <= 1e-10 * scale,
4377                        "moment {degree} mismatch for {cell:?} degree {max_degree}: {lhs} vs {rhs}"
4378                    );
4379                }
4380            }
4381        }
4382    }
4383
4384    #[test]
4385    fn affine_tail_cell_memo_matches_uncached_grid_and_records_hits() {
4386        // Use a dedicated local cache so the test's hit/miss/entry counters
4387        // are not perturbed by concurrent tests that drive the shared
4388        // global memo through `evaluate_cell_moments`. Asserting on the
4389        // global counters made this test race-flaky when the suite ran in
4390        // parallel.
4391        let cache = TailCellMomentCache::new();
4392        let c0s = [-2.0, -0.25, 0.0, 1.5];
4393        let c1s = [-1.2, -0.05, 0.0, 0.8];
4394        let endpoints = [-4.0, -1.0, 0.0, 2.5, 6.0];
4395        let degrees = [0_usize, 4, 9, 16, 24];
4396
4397        for &c0 in &c0s {
4398            for &c1 in &c1s {
4399                for &endpoint in &endpoints {
4400                    for &max_degree in &degrees {
4401                        for &(left, right) in
4402                            &[(f64::NEG_INFINITY, endpoint), (endpoint, f64::INFINITY)]
4403                        {
4404                            let cell = DenestedCubicCell {
4405                                left,
4406                                right,
4407                                c0,
4408                                c1,
4409                                c2: 0.0,
4410                                c3: 0.0,
4411                            };
4412                            let expected = evaluate_cell_moments_uncached(cell, max_degree)
4413                                .expect("uncached affine tail moments");
4414                            let actual = cache
4415                                .evaluate(cell, max_degree)
4416                                .expect("cached affine tail moments miss");
4417                            let repeat = cache
4418                                .evaluate(cell, max_degree)
4419                                .expect("cached affine tail moments hit");
4420                            assert_eq!(actual.branch, expected.branch);
4421                            assert_eq!(repeat.branch, expected.branch);
4422                            assert_close_rel(
4423                                "tail value miss",
4424                                actual.value,
4425                                expected.value,
4426                                1e-14,
4427                            );
4428                            assert_close_rel("tail value hit", repeat.value, expected.value, 1e-14);
4429                            assert_eq!(actual.moments.len(), expected.moments.len());
4430                            assert_eq!(repeat.moments.len(), expected.moments.len());
4431                            for (idx, ((a, r), e)) in actual
4432                                .moments
4433                                .iter()
4434                                .zip(repeat.moments.iter())
4435                                .zip(expected.moments.iter())
4436                                .enumerate()
4437                            {
4438                                assert_close_rel(
4439                                    &format!("tail moment miss[{idx}]"),
4440                                    *a,
4441                                    *e,
4442                                    1e-14,
4443                                );
4444                                assert_close_rel(&format!("tail moment hit[{idx}]"), *r, *e, 1e-14);
4445                            }
4446                        }
4447                    }
4448                }
4449            }
4450        }
4451
4452        let stats = cache.stats();
4453        assert_eq!(stats.misses, stats.entries);
4454        assert!(
4455            stats.hits >= stats.misses,
4456            "expected repeat hits: {stats:?}"
4457        );
4458        assert!(
4459            stats.hit_rate() >= 0.5,
4460            "unexpected low hit rate: {stats:?}"
4461        );
4462    }
4463
4464    fn reference_bivariate_normal_cdf_20(h: f64, k: f64, rho: f64) -> f64 {
4465        if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
4466            return 0.0;
4467        }
4468        if h == f64::INFINITY {
4469            return normal_cdf(k);
4470        }
4471        if k == f64::INFINITY {
4472            return normal_cdf(h);
4473        }
4474        let rho_clamped = rho.clamp(-1.0, 1.0);
4475        if rho_clamped >= 1.0 - 1e-12 {
4476            return normal_cdf(h.min(k));
4477        }
4478        if rho_clamped <= -1.0 + 1e-12 {
4479            return (normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0);
4480        }
4481
4482        let hs = 0.5 * (h * h + k * k);
4483        let asr = rho_clamped.asin();
4484        let mut sum = 0.0;
4485        for (&node, &weight) in GL20_NODES.iter().zip(GL20_WEIGHTS.iter()) {
4486            let sn = (0.5 * asr * (node + 1.0)).sin();
4487            let one_minus = 1.0 - sn * sn;
4488            let expo = ((sn * h * k) - hs) / one_minus;
4489            sum += weight * expo.exp();
4490        }
4491        (normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0)
4492    }
4493
4494    #[test]
4495    fn non_affine_cell_state_reference_grid_matches_public_moments() {
4496        let c0s = [-0.4, 0.0, 0.35];
4497        let c1s = [-0.8, 0.25, 1.1];
4498        let c2s = [-0.12, 0.08];
4499        let c3s = [-0.04, 0.03];
4500        let intervals = [(-1.25, -0.2), (-0.5, 0.75), (0.1, 1.4)];
4501        let degrees = [3usize, 6, 9, 12];
4502
4503        for &c0 in &c0s {
4504            for &c1 in &c1s {
4505                for &c2 in &c2s {
4506                    for &c3 in &c3s {
4507                        for &(left, right) in &intervals {
4508                            let cell = DenestedCubicCell {
4509                                left,
4510                                right,
4511                                c0,
4512                                c1,
4513                                c2,
4514                                c3,
4515                            };
4516                            let branch = branch_cell(cell).expect("branch");
4517                            assert_ne!(branch, ExactCellBranch::Affine);
4518                            for &degree in &degrees {
4519                                let direct = evaluate_non_affine_cell_state(cell, branch, degree)
4520                                    .expect("direct non-affine state");
4521                                let public = evaluate_cell_moments(cell, degree)
4522                                    .expect("public non-affine state");
4523                                assert_eq!(direct.branch, public.branch);
4524                                let value_scale =
4525                                    direct.value.abs().max(public.value.abs()).max(1.0);
4526                                assert!(
4527                                    (direct.value - public.value).abs() / value_scale <= 1.0e-15,
4528                                    "value mismatch for {cell:?}, degree {degree}: direct={:.17e}, public={:.17e}",
4529                                    direct.value,
4530                                    public.value
4531                                );
4532                                assert_eq!(direct.moments.len(), public.moments.len());
4533                                for (idx, (&a, &b)) in
4534                                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4535                                {
4536                                    let scale = a.abs().max(b.abs()).max(1.0);
4537                                    assert!(
4538                                        (a - b).abs() / scale <= 1.0e-15,
4539                                        "moment {idx} mismatch for {cell:?}, degree {degree}: direct={a:.17e}, public={b:.17e}"
4540                                    );
4541                                }
4542                            }
4543                        }
4544                    }
4545                }
4546            }
4547        }
4548    }
4549
4550    #[test]
4551    fn bivariate_normal_cdf_matches_reference_grid_to_1e_minus_10() {
4552        let hs = [-8.0, -5.0, -3.0, -1.5, -0.5, 0.0, 0.25, 1.0, 2.5, 5.0, 8.0];
4553        let ks = [-8.0, -4.0, -2.0, -0.75, 0.0, 0.4, 1.25, 3.0, 6.0, 8.0];
4554        let rhos = [
4555            -0.999_999_999_999,
4556            -0.999,
4557            -0.95,
4558            -0.7,
4559            -0.3,
4560            -1.0e-12,
4561            0.0,
4562            1.0e-12,
4563            0.3,
4564            0.7,
4565            0.95,
4566            0.999,
4567            0.999_999_999_999,
4568        ];
4569        for &h in &hs {
4570            for &k in &ks {
4571                for &rho in &rhos {
4572                    let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4573                    let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4574                    let scale = expected.abs().max(1.0e-300);
4575                    let rel = (actual - expected).abs() / scale;
4576                    assert!(
4577                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4578                        "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4579                    );
4580                }
4581            }
4582        }
4583    }
4584
4585    #[test]
4586    fn bivariate_normal_cdf_matches_reference_lcg_property_samples() {
4587        let mut seed = 0x5eed_cafe_f00d_u64;
4588        let mut next_unit = || {
4589            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
4590            ((seed >> 11) as f64) * (1.0 / ((1_u64 << 53) as f64))
4591        };
4592        for _ in 0..4096 {
4593            let h = -8.0 + 16.0 * next_unit();
4594            let k = -8.0 + 16.0 * next_unit();
4595            let rho = -0.999 + 1.998 * next_unit();
4596            let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4597            let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4598            let scale = expected.abs().max(1.0e-300);
4599            let rel = (actual - expected).abs() / scale;
4600            assert!(
4601                rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4602                "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4603            );
4604        }
4605    }
4606
4607    #[test]
4608    fn affine_bvn_interval_primitive_matches_two_cdf_difference() {
4609        let hs = [-6.0, -2.0, -0.25, 0.0, 0.8, 3.0, 6.0];
4610        let bounds = [
4611            (-5.0, -2.0),
4612            (-3.0, -0.1),
4613            (-1.0, 0.0),
4614            (-0.25, 0.75),
4615            (0.2, 3.5),
4616            (2.0, 7.0),
4617        ];
4618        let rhos = [-0.98, -0.8, -0.25, 0.0, 0.25, 0.8, 0.98];
4619        for &h in &hs {
4620            for &(left, right) in &bounds {
4621                for &rho in &rhos {
4622                    let actual =
4623                        bivariate_normal_cdf_interval(h, left, right, rho).expect("interval");
4624                    let expected = (reference_bivariate_normal_cdf_20(h, right, rho)
4625                        - reference_bivariate_normal_cdf_20(h, left, rho))
4626                    .clamp(0.0, 1.0);
4627                    let scale = expected.abs().max(1.0e-300);
4628                    let rel = (actual - expected).abs() / scale;
4629                    assert!(
4630                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-12,
4631                        "h={h} left={left} right={right} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4632                    );
4633                }
4634            }
4635        }
4636    }
4637
4638    fn simpson_integral<F>(left: f64, right: f64, steps: usize, f: F) -> f64
4639    where
4640        F: Fn(f64) -> f64,
4641    {
4642        let n = if steps.is_multiple_of(2) {
4643            steps
4644        } else {
4645            steps + 1
4646        };
4647        let h = (right - left) / n as f64;
4648        let mut acc = f(left) + f(right);
4649        for k in 1..n {
4650            let x = left + h * k as f64;
4651            let w = if k % 2 == 0 { 2.0 } else { 4.0 };
4652            acc += w * f(x);
4653        }
4654        acc * h / 3.0
4655    }
4656
4657    #[test]
4658    fn global_transform_preserves_local_span_polynomial() {
4659        let span = LocalSpanCubic {
4660            left: -1.2,
4661            right: 0.8,
4662            c0: 0.3,
4663            c1: -0.25,
4664            c2: 0.11,
4665            c3: -0.04,
4666        };
4667        let (g0, g1, g2, g3) = global_cubic_from_local(span);
4668        for &x in &[-1.2, -0.7, -0.1, 0.4, 0.8] {
4669            let local = span.evaluate(x);
4670            let global = g0 + g1 * x + g2 * x * x + g3 * x * x * x;
4671            assert!((local - global).abs() < 1e-12);
4672        }
4673    }
4674
4675    #[test]
4676    fn bivariate_normal_cdf_independent_factorizes() {
4677        let h = -0.35;
4678        let k = 0.8;
4679        let out = bivariate_normal_cdf(h, k, 0.0).expect("bvn");
4680        let target = normal_cdf(h) * normal_cdf(k);
4681        assert!((out - target).abs() < 1e-12);
4682    }
4683
4684    #[test]
4685    fn evaluate_affine_cell_state_matches_numeric_integrals() {
4686        let cell = DenestedCubicCell {
4687            left: -0.9,
4688            right: 0.8,
4689            c0: 0.15,
4690            c1: -0.35,
4691            c2: 0.0,
4692            c3: 0.0,
4693        };
4694        let state = evaluate_affine_cell_state(cell, 6).expect("affine cell");
4695        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
4696            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
4697        });
4698        assert_eq!(state.branch, ExactCellBranch::Affine);
4699        assert!((state.value - value_numeric).abs() < 1e-9);
4700        for degree in 0..=6 {
4701            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
4702                z.powi(degree as i32) * (-cell.q(z)).exp()
4703            });
4704            assert!((state.moments[degree] - target).abs() < 1e-9);
4705        }
4706    }
4707
4708    #[test]
4709    fn affine_cell_value_matches_zero_moment_derivative() {
4710        let cell = DenestedCubicCell {
4711            left: -1.1,
4712            right: 0.7,
4713            c0: 0.23,
4714            c1: -0.41,
4715            c2: 0.0,
4716            c3: 0.0,
4717        };
4718        let h = 1e-6;
4719        let plus = evaluate_affine_cell_state(
4720            DenestedCubicCell {
4721                c0: cell.c0 + h,
4722                ..cell
4723            },
4724            0,
4725        )
4726        .expect("affine plus");
4727        let minus = evaluate_affine_cell_state(
4728            DenestedCubicCell {
4729                c0: cell.c0 - h,
4730                ..cell
4731            },
4732            0,
4733        )
4734        .expect("affine minus");
4735        let center = evaluate_affine_cell_state(cell, 0).expect("affine center");
4736        let d_value = (plus.value - minus.value) / (2.0 * h);
4737        let target = INV_TWO_PI * center.moments[0];
4738        assert!((d_value - target).abs() < 1e-8);
4739    }
4740
4741    #[test]
4742    fn coefficient_partials_match_exact_span_derivatives() {
4743        let score_span = LocalSpanCubic {
4744            left: -0.75,
4745            right: 0.25,
4746            c0: 0.08,
4747            c1: -0.03,
4748            c2: 0.02,
4749            c3: -0.01,
4750        };
4751        let link_span = LocalSpanCubic {
4752            left: -0.6,
4753            right: 0.9,
4754            c0: -0.05,
4755            c1: 0.04,
4756            c2: -0.02,
4757            c3: 0.015,
4758        };
4759        let a = 0.3;
4760        let b = -0.7;
4761        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
4762        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4763            let u = a + b * z;
4764            let eta_a = 1.0 + link_span.first_derivative(u);
4765            let eta_b = z + score_span.evaluate(z) + z * link_span.first_derivative(u);
4766            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4767            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4768        }
4769    }
4770
4771    #[test]
4772    fn second_coefficient_partials_match_exact_span_derivatives() {
4773        let score_span = LocalSpanCubic {
4774            left: -0.75,
4775            right: 0.25,
4776            c0: 0.08,
4777            c1: -0.03,
4778            c2: 0.02,
4779            c3: -0.01,
4780        };
4781        let link_span = LocalSpanCubic {
4782            left: -0.6,
4783            right: 0.9,
4784            c0: -0.05,
4785            c1: 0.04,
4786            c2: -0.02,
4787            c3: 0.015,
4788        };
4789        let a = 0.3;
4790        let b = -0.7;
4791        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
4792        let dc_daa = second_partials.0;
4793        let dc_dab = second_partials.1;
4794        let dc_dbb = second_partials.2;
4795        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4796            let u = a + b * z;
4797            let eta_aa = link_span.second_derivative(u);
4798            let eta_ab = z * link_span.second_derivative(u);
4799            let eta_bb = z * z * link_span.second_derivative(u);
4800            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4801            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4802            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4803        }
4804    }
4805
4806    #[test]
4807    fn higher_derivative_moment_helpers_reject_empty_first_coefficients() {
4808        let cell = DenestedCubicCell {
4809            left: -1.0,
4810            right: 1.0,
4811            c0: 0.0,
4812            c1: 1.0,
4813            c2: 0.0,
4814            c3: 0.0,
4815        };
4816        let moments = [1.0; 16];
4817
4818        let third_err = cell_third_derivative_from_moments(
4819            cell,
4820            &[],
4821            &[1.0],
4822            &[1.0],
4823            &[],
4824            &[],
4825            &[],
4826            &[],
4827            &moments,
4828        )
4829        .expect_err("empty first coefficients should be rejected");
4830        assert!(third_err.contains("r first-derivative coefficients must be non-empty"));
4831
4832        let fourth_err = cell_fourth_derivative_from_moments(
4833            cell,
4834            &[1.0],
4835            &[],
4836            &[1.0],
4837            &[1.0],
4838            &[],
4839            &[],
4840            &[],
4841            &[],
4842            &[],
4843            &[],
4844            &[],
4845            &[],
4846            &[],
4847            &[],
4848            &[],
4849            &moments,
4850        )
4851        .expect_err("empty first coefficients should be rejected");
4852        assert!(fourth_err.contains("s first-derivative coefficients must be non-empty"));
4853    }
4854
4855    #[test]
4856    fn fourth_derivative_rejects_overlong_scratch_convolutions() {
4857        let cell = DenestedCubicCell {
4858            left: -1.0,
4859            right: 1.0,
4860            c0: 0.0,
4861            c1: 1.0,
4862            c2: 0.0,
4863            c3: 0.0,
4864        };
4865        let long_first = [1.0; 10];
4866        let zero = [0.0; 1];
4867        let moments = [1.0; 64];
4868
4869        let err = cell_fourth_derivative_from_moments(
4870            cell,
4871            &long_first,
4872            &long_first,
4873            &long_first,
4874            &long_first,
4875            &zero,
4876            &zero,
4877            &zero,
4878            &zero,
4879            &zero,
4880            &zero,
4881            &zero,
4882            &zero,
4883            &zero,
4884            &zero,
4885            &zero,
4886            &moments,
4887        )
4888        .expect_err("oversized convolution should be rejected before writing scratch");
4889        assert!(err.contains("fourth derivative polynomial convolution scratch too small"));
4890    }
4891
4892    #[test]
4893    fn score_and_link_basis_cell_coefficients_match_direct_construction() {
4894        let score_basis_span = LocalSpanCubic {
4895            left: -0.7,
4896            right: 0.4,
4897            c0: 0.2,
4898            c1: -0.04,
4899            c2: 0.03,
4900            c3: -0.01,
4901        };
4902        let link_basis_span = LocalSpanCubic {
4903            left: -0.5,
4904            right: 1.1,
4905            c0: -0.03,
4906            c1: 0.05,
4907            c2: -0.02,
4908            c3: 0.01,
4909        };
4910        let a = 0.25;
4911        let b = -0.8;
4912        let score_coeffs = score_basis_cell_coefficients(score_basis_span, b);
4913        let link_coeffs = link_basis_cell_coefficients(link_basis_span, a, b);
4914        for &z in &[-0.7, -0.1, 0.2, 0.4] {
4915            let score_poly = polynomial_value(&score_coeffs, z);
4916            let link_poly = polynomial_value(&link_coeffs, z);
4917            assert!((score_poly - b * score_basis_span.evaluate(z)).abs() < 1e-12);
4918            assert!((link_poly - link_basis_span.evaluate(a + b * z)).abs() < 1e-12);
4919        }
4920    }
4921
4922    #[test]
4923    fn link_basis_partials_match_exact_span_derivatives() {
4924        let link_basis_span = LocalSpanCubic {
4925            left: -0.5,
4926            right: 1.1,
4927            c0: -0.03,
4928            c1: 0.05,
4929            c2: -0.02,
4930            c3: 0.01,
4931        };
4932        let a = 0.25;
4933        let b = -0.8;
4934        let (dc_da, dc_db) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
4935        let (dc_daa, dc_dab, dc_dbb) = link_basis_cell_second_partials(link_basis_span, a, b);
4936        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4937            let u = a + b * z;
4938            let eta_a = link_basis_span.first_derivative(u);
4939            let eta_b = z * link_basis_span.first_derivative(u);
4940            let eta_aa = link_basis_span.second_derivative(u);
4941            let eta_ab = z * link_basis_span.second_derivative(u);
4942            let eta_bb = z * z * link_basis_span.second_derivative(u);
4943            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4944            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4945            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4946            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4947            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4948        }
4949    }
4950
4951    #[test]
4952    fn denested_third_partials_match_exact_span_derivatives() {
4953        let link_span = LocalSpanCubic {
4954            left: -0.6,
4955            right: 0.9,
4956            c0: -0.05,
4957            c1: 0.04,
4958            c2: -0.02,
4959            c3: 0.015,
4960        };
4961        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
4962        let link_third = 6.0 * link_span.c3;
4963        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4964            let eta_aaa = link_third;
4965            let eta_aab = z * link_third;
4966            let eta_abb = z * z * link_third;
4967            let eta_bbb = z * z * z * link_third;
4968            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4969            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4970            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4971            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4972        }
4973    }
4974
4975    #[test]
4976    fn link_basis_third_partials_match_exact_span_derivatives() {
4977        let link_basis_span = LocalSpanCubic {
4978            left: -0.5,
4979            right: 1.1,
4980            c0: -0.03,
4981            c1: 0.05,
4982            c2: -0.02,
4983            c3: 0.01,
4984        };
4985        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = link_basis_cell_third_partials(link_basis_span);
4986        let link_third = 6.0 * link_basis_span.c3;
4987        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4988            let eta_aaa = link_third;
4989            let eta_aab = z * link_third;
4990            let eta_abb = z * z * link_third;
4991            let eta_bbb = z * z * z * link_third;
4992            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4993            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4994            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4995            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4996        }
4997    }
4998
4999    #[test]
5000    fn branch_selection_uses_normalized_non_affine_coefficients() {
5001        let affine = DenestedCubicCell {
5002            left: -1.0,
5003            right: 1.0,
5004            c0: 0.1,
5005            c1: -0.4,
5006            c2: 1e-13,
5007            c3: -1e-13,
5008        };
5009        let quartic = DenestedCubicCell {
5010            c2: 2e-4,
5011            c3: 1e-13,
5012            ..affine
5013        };
5014        let sextic = DenestedCubicCell {
5015            c2: 2e-4,
5016            c3: 5e-3,
5017            ..affine
5018        };
5019        assert_eq!(branch_cell(affine).unwrap(), ExactCellBranch::Affine);
5020        assert_eq!(branch_cell(quartic).unwrap(), ExactCellBranch::Quartic);
5021        assert_eq!(branch_cell(sextic).unwrap(), ExactCellBranch::Sextic);
5022    }
5023
5024    #[test]
5025    fn affine_anchor_moments_match_whole_line_closed_forms() {
5026        let out = affine_anchor_moment_vector(0.0, 0.0, f64::NEG_INFINITY, f64::INFINITY, 4);
5027        // `affine_anchor_moment_vector` returns the RAW substrate moments
5028        // `T_n = ∫ z^n exp(-½z²) dz` (the cubic-cell `∫ z^n exp(-q) dz`
5029        // convention that every production consumer and the GPU parity path
5030        // share; the `1/√(2π)` is folded in downstream via `INV_TWO_PI`). At
5031        // the affine identity the anchor is the *unnormalized* standard normal,
5032        // so M0 = M2 = √(2π) and M1 = 0 — the normalized {1, 0, 1} moments
5033        // scaled by the whole-line mass √(2π).
5034        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5035        assert!((out[0] - sqrt_2pi).abs() < 1e-12);
5036        assert!(out[1].abs() < 1e-12);
5037        assert!((out[2] - sqrt_2pi).abs() < 1e-12);
5038    }
5039
5040    #[test]
5041    fn affine_anchor_moments_match_shifted_gaussian_whole_line() {
5042        let alpha = 0.7;
5043        let beta = -0.4;
5044        let out = affine_anchor_moment_vector(alpha, beta, f64::NEG_INFINITY, f64::INFINITY, 4);
5045        let s = (1.0 + beta * beta).sqrt();
5046        let mu = -alpha * beta / (1.0 + beta * beta);
5047        // RAW (unnormalized) whole-line moments of the affine anchor
5048        // `exp(-½(alpha + beta·z)²)·exp(-½z²)`, an unnormalized Gaussian with
5049        // mean `mu` and variance `1/s²`. Its raw moments carry the `√(2π)` mass
5050        // factor: M0 = √(2π)·scale, M1 = √(2π)·scale·mu,
5051        // M2 = √(2π)·scale·(mu² + 1/s²), where the anchor amplitude
5052        // `scale = exp(-alpha² / 2s²) / s`.
5053        let scale = (-alpha * alpha / (2.0 * s * s)).exp() / s;
5054        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5055        assert!((out[0] - scale * sqrt_2pi).abs() < 1e-12);
5056        assert!((out[1] - scale * sqrt_2pi * mu).abs() < 1e-12);
5057        assert!((out[2] - scale * sqrt_2pi * (mu * mu + 1.0 / (s * s))).abs() < 1e-10);
5058    }
5059
5060    #[test]
5061    fn quartic_recurrence_reduces_higher_moments() {
5062        let cell = DenestedCubicCell {
5063            left: -1.0,
5064            right: 0.9,
5065            c0: 0.2,
5066            c1: -0.3,
5067            c2: 0.18,
5068            c3: 0.0,
5069        };
5070        let exact = |k: usize| {
5071            simpson_integral(cell.left, cell.right, 2000, |z| {
5072                z.powi(k as i32) * (-cell.q(z)).exp()
5073            })
5074        };
5075        let reduced = reduce_quartic_moments(cell, [exact(0), exact(1), exact(2)], 6)
5076            .expect("quartic reduction");
5077        for k in 0..=6 {
5078            let target = exact(k);
5079            assert!(
5080                (reduced[k] - target).abs() < 1e-7,
5081                "quartic reduced moment M{k} mismatch: {} vs {}",
5082                reduced[k],
5083                target
5084            );
5085        }
5086    }
5087
5088    #[test]
5089    fn sextic_recurrence_reduces_higher_moments() {
5090        let cell = DenestedCubicCell {
5091            left: -0.8,
5092            right: 0.7,
5093            c0: -0.1,
5094            c1: 0.25,
5095            c2: -0.14,
5096            c3: 0.22,
5097        };
5098        let exact = |k: usize| {
5099            simpson_integral(cell.left, cell.right, 3000, |z| {
5100                z.powi(k as i32) * (-cell.q(z)).exp()
5101            })
5102        };
5103        let reduced =
5104            reduce_sextic_moments(cell, [exact(0), exact(1), exact(2), exact(3), exact(4)], 9)
5105                .expect("sextic reduction");
5106        for k in 0..=9 {
5107            let target = exact(k);
5108            assert!(
5109                (reduced[k] - target).abs() < 1e-7,
5110                "sextic reduced moment M{k} mismatch: {} vs {}",
5111                reduced[k],
5112                target
5113            );
5114        }
5115    }
5116
5117    #[test]
5118    fn degenerate_sextic_branch_preserves_quadratic_coefficient() {
5119        let cell = DenestedCubicCell {
5120            left: -1.0,
5121            right: 1.0,
5122            c0: 0.0,
5123            c1: 0.0,
5124            c2: 0.1,
5125            c3: 2.0e-10,
5126        };
5127        assert_eq!(branch_cell(cell).unwrap(), ExactCellBranch::Sextic);
5128
5129        let state = evaluate_cell_moments(cell, 9).expect("degenerate sextic cell");
5130        let quartic_cell = DenestedCubicCell { c3: 0.0, ..cell };
5131        let quartic = evaluate_cell_moments(quartic_cell, 9).expect("quartic cell");
5132        let affine = evaluate_affine_cell_state(
5133            DenestedCubicCell {
5134                c2: 0.0,
5135                c3: 0.0,
5136                ..cell
5137            },
5138            9,
5139        )
5140        .expect("affine cell");
5141
5142        assert_eq!(state.branch, ExactCellBranch::Quartic);
5143        for k in 0..=9 {
5144            assert!(
5145                (state.moments[k] - quartic.moments[k]).abs() < 1e-12,
5146                "lowered moment M{k} should match the quartic cell: {} vs {}",
5147                state.moments[k],
5148                quartic.moments[k]
5149            );
5150        }
5151        assert!(
5152            (state.moments[0] - affine.moments[0]).abs() > 1e-4,
5153            "degenerate sextic handling must not drop the nonzero c2 term"
5154        );
5155    }
5156
5157    #[test]
5158    fn moment_reduced_first_and_second_derivatives_match_numeric_integrals() {
5159        let cell = DenestedCubicCell {
5160            left: -0.9,
5161            right: 0.6,
5162            c0: 0.15,
5163            c1: -0.2,
5164            c2: 0.08,
5165            c3: 0.17,
5166        };
5167        let moments = reduce_sextic_moments(
5168            cell,
5169            [
5170                simpson_integral(cell.left, cell.right, 3000, |z| (-cell.q(z)).exp()),
5171                simpson_integral(cell.left, cell.right, 3000, |z| z * (-cell.q(z)).exp()),
5172                simpson_integral(cell.left, cell.right, 3000, |z| z * z * (-cell.q(z)).exp()),
5173                simpson_integral(cell.left, cell.right, 3000, |z| {
5174                    z.powi(3) * (-cell.q(z)).exp()
5175                }),
5176                simpson_integral(cell.left, cell.right, 3000, |z| {
5177                    z.powi(4) * (-cell.q(z)).exp()
5178                }),
5179            ],
5180            9,
5181        )
5182        .expect("reduced moments");
5183
5184        let r = [0.7, -0.1, 0.3];
5185        let s = [0.2, 0.5];
5186        let second = [0.4, -0.2, 0.1];
5187        let exact_first = cell_first_derivative_from_moments(&r, &moments).expect("first");
5188        let exact_second =
5189            cell_second_derivative_from_moments(cell, &r, &s, &second, &moments).expect("second");
5190
5191        let numeric_first = simpson_integral(cell.left, cell.right, 3000, |z| {
5192            polynomial_value(&r, z) * (-cell.q(z)).exp() / (2.0 * std::f64::consts::PI)
5193        });
5194        let numeric_second = simpson_integral(cell.left, cell.right, 3000, |z| {
5195            let eta = cell.eta(z);
5196            (polynomial_value(&second, z) - eta * polynomial_value(&r, z) * polynomial_value(&s, z))
5197                * (-cell.q(z)).exp()
5198                / (2.0 * std::f64::consts::PI)
5199        });
5200
5201        assert!((exact_first - numeric_first).abs() < 1e-7);
5202        assert!((exact_second - numeric_second).abs() < 1e-7);
5203    }
5204
5205    #[test]
5206    fn moment_reduced_third_derivative_matches_numeric_integral() {
5207        let cell = DenestedCubicCell {
5208            left: -0.85,
5209            right: 0.7,
5210            c0: -0.12,
5211            c1: 0.18,
5212            c2: 0.09,
5213            c3: -0.11,
5214        };
5215        let moments = evaluate_cell_moments(cell, 12).expect("cell moments");
5216        let r = [0.35, -0.12, 0.08];
5217        let s = [0.17, 0.09];
5218        let t = [-0.21, 0.14, -0.04];
5219        let rs = [0.11, -0.07, 0.05];
5220        let rt = [-0.06, 0.03];
5221        let st = [0.08, -0.02, 0.01];
5222        let rst = [0.04, -0.05, 0.02];
5223
5224        let exact_third = cell_third_derivative_from_moments(
5225            cell,
5226            &r,
5227            &s,
5228            &t,
5229            &rs,
5230            &rt,
5231            &st,
5232            &rst,
5233            &moments.moments,
5234        )
5235        .expect("third derivative");
5236        let numeric_third = simpson_integral(cell.left, cell.right, 4000, |z| {
5237            let eta = cell.eta(z);
5238            let rz = polynomial_value(&r, z);
5239            let sz = polynomial_value(&s, z);
5240            let tz = polynomial_value(&t, z);
5241            let rsz = polynomial_value(&rs, z);
5242            let rtz = polynomial_value(&rt, z);
5243            let stz = polynomial_value(&st, z);
5244            let rstz = polynomial_value(&rst, z);
5245            (rstz - eta * (rsz * tz + rtz * sz + stz * rz) + (eta * eta - 1.0) * rz * sz * tz)
5246                * (-cell.q(z)).exp()
5247                / (2.0 * std::f64::consts::PI)
5248        });
5249
5250        assert!((exact_third - numeric_third).abs() < 1e-7);
5251    }
5252
5253    #[test]
5254    fn moment_reduced_fourth_derivative_matches_numeric_integral() {
5255        let cell = DenestedCubicCell {
5256            left: -0.8,
5257            right: 0.65,
5258            c0: 0.11,
5259            c1: -0.22,
5260            c2: 0.07,
5261            c3: 0.13,
5262        };
5263        let moments = evaluate_cell_moments(cell, 16).expect("cell moments");
5264        let r = [0.21, -0.13, 0.06];
5265        let s = [-0.18, 0.04];
5266        let t = [0.09, 0.07, -0.03];
5267        let u = [-0.14, 0.05];
5268        let rs = [0.08, -0.03, 0.02];
5269        let rt = [-0.05, 0.01];
5270        let ru = [0.04, -0.02, 0.01];
5271        let st = [0.03, 0.02];
5272        let su = [-0.02, 0.05, -0.01];
5273        let tu = [0.07, -0.04];
5274        let rst = [0.03, -0.01, 0.02];
5275        let rsu = [-0.02, 0.04];
5276        let rtu = [0.01, 0.02, -0.01];
5277        let stu = [-0.03, 0.02];
5278        let rstu = [0.02, -0.01, 0.01];
5279
5280        let exact_fourth = cell_fourth_derivative_from_moments(
5281            cell,
5282            &r,
5283            &s,
5284            &t,
5285            &u,
5286            &rs,
5287            &rt,
5288            &ru,
5289            &st,
5290            &su,
5291            &tu,
5292            &rst,
5293            &rsu,
5294            &rtu,
5295            &stu,
5296            &rstu,
5297            &moments.moments,
5298        )
5299        .expect("fourth derivative");
5300        let numeric_fourth = simpson_integral(cell.left, cell.right, 5000, |z| {
5301            let eta = cell.eta(z);
5302            let rz = polynomial_value(&r, z);
5303            let sz = polynomial_value(&s, z);
5304            let tz = polynomial_value(&t, z);
5305            let uz = polynomial_value(&u, z);
5306            let rsz = polynomial_value(&rs, z);
5307            let rtz = polynomial_value(&rt, z);
5308            let ruz = polynomial_value(&ru, z);
5309            let stz = polynomial_value(&st, z);
5310            let suz = polynomial_value(&su, z);
5311            let tuz = polynomial_value(&tu, z);
5312            let rstz = polynomial_value(&rst, z);
5313            let rsuz = polynomial_value(&rsu, z);
5314            let rtuz = polynomial_value(&rtu, z);
5315            let stuz = polynomial_value(&stu, z);
5316            let rstuz = polynomial_value(&rstu, z);
5317            let linear =
5318                rstz * uz + rsuz * tz + rtuz * sz + stuz * rz + rsz * tuz + rtz * suz + ruz * stz;
5319            let quadratic = rsz * tz * uz
5320                + rtz * sz * uz
5321                + ruz * sz * tz
5322                + stz * rz * uz
5323                + suz * rz * tz
5324                + tuz * rz * sz;
5325            let quartic = rz * sz * tz * uz;
5326            (rstuz - eta * linear
5327                + (eta * eta - 1.0) * quadratic
5328                + (-eta * eta * eta + 3.0 * eta) * quartic)
5329                * (-cell.q(z)).exp()
5330                / (2.0 * std::f64::consts::PI)
5331        });
5332
5333        assert!((exact_fourth - numeric_fourth).abs() < 2e-7);
5334    }
5335
5336    #[test]
5337    fn denested_cell_parameter_derivatives_match_exact_integrands() {
5338        let score_span = LocalSpanCubic {
5339            left: -0.75,
5340            right: 0.25,
5341            c0: 0.08,
5342            c1: -0.03,
5343            c2: 0.02,
5344            c3: -0.01,
5345        };
5346        let link_span = LocalSpanCubic {
5347            left: -0.6,
5348            right: 0.9,
5349            c0: -0.05,
5350            c1: 0.04,
5351            c2: -0.02,
5352            c3: 0.015,
5353        };
5354        let a = 0.3;
5355        let b = -0.7;
5356        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5357        let cell = DenestedCubicCell {
5358            left: score_span.left,
5359            right: score_span.right,
5360            c0: coeffs[0],
5361            c1: coeffs[1],
5362            c2: coeffs[2],
5363            c3: coeffs[3],
5364        };
5365        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5366        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5367        let (dc_daa, dc_dab, dc_dbb) = denested_cell_second_partials(score_span, link_span, a, b);
5368        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
5369        let zero = [0.0; 4];
5370        let link_third = 6.0 * link_span.c3;
5371
5372        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5373        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5374        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
5375        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5376        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5377        let eta_aaa = |z: f64| link_third + 0.0 * z;
5378        let eta_aab = |z: f64| z * link_third;
5379        let eta_abb = |z: f64| z * z * link_third;
5380        let eta_bbb = |z: f64| z * z * z * link_third;
5381
5382        let exact_a = cell_first_derivative_from_moments(&dc_da, &state.moments).expect("a");
5383        let exact_b = cell_first_derivative_from_moments(&dc_db, &state.moments).expect("b");
5384        let exact_aa =
5385            cell_second_derivative_from_moments(cell, &dc_da, &dc_da, &dc_daa, &state.moments)
5386                .expect("aa");
5387        let exact_ab =
5388            cell_second_derivative_from_moments(cell, &dc_da, &dc_db, &dc_dab, &state.moments)
5389                .expect("ab");
5390        let exact_bb =
5391            cell_second_derivative_from_moments(cell, &dc_db, &dc_db, &dc_dbb, &state.moments)
5392                .expect("bb");
5393        let exact_aaa = cell_third_derivative_from_moments(
5394            cell,
5395            &dc_da,
5396            &dc_da,
5397            &dc_da,
5398            &dc_daa,
5399            &dc_daa,
5400            &dc_daa,
5401            &dc_daaa,
5402            &state.moments,
5403        )
5404        .expect("aaa");
5405        let exact_aab = cell_third_derivative_from_moments(
5406            cell,
5407            &dc_da,
5408            &dc_da,
5409            &dc_db,
5410            &dc_daa,
5411            &dc_dab,
5412            &dc_dab,
5413            &dc_daab,
5414            &state.moments,
5415        )
5416        .expect("aab");
5417        let exact_abb = cell_third_derivative_from_moments(
5418            cell,
5419            &dc_da,
5420            &dc_db,
5421            &dc_db,
5422            &dc_dab,
5423            &dc_dab,
5424            &dc_dbb,
5425            &dc_dabb,
5426            &state.moments,
5427        )
5428        .expect("abb");
5429        let exact_bbb = cell_third_derivative_from_moments(
5430            cell,
5431            &dc_db,
5432            &dc_db,
5433            &dc_db,
5434            &dc_dbb,
5435            &dc_dbb,
5436            &dc_dbb,
5437            &dc_dbbb,
5438            &state.moments,
5439        )
5440        .expect("bbb");
5441        let exact_aaaa = cell_fourth_derivative_from_moments(
5442            cell,
5443            &dc_da,
5444            &dc_da,
5445            &dc_da,
5446            &dc_da,
5447            &dc_daa,
5448            &dc_daa,
5449            &dc_daa,
5450            &dc_daa,
5451            &dc_daa,
5452            &dc_daa,
5453            &dc_daaa,
5454            &dc_daaa,
5455            &dc_daaa,
5456            &dc_daaa,
5457            &zero,
5458            &state.moments,
5459        )
5460        .expect("aaaa");
5461        let exact_aaab = cell_fourth_derivative_from_moments(
5462            cell,
5463            &dc_da,
5464            &dc_da,
5465            &dc_da,
5466            &dc_db,
5467            &dc_daa,
5468            &dc_daa,
5469            &dc_dab,
5470            &dc_daa,
5471            &dc_dab,
5472            &dc_dab,
5473            &dc_daaa,
5474            &dc_daab,
5475            &dc_daab,
5476            &dc_daab,
5477            &zero,
5478            &state.moments,
5479        )
5480        .expect("aaab");
5481        let exact_aabb = cell_fourth_derivative_from_moments(
5482            cell,
5483            &dc_da,
5484            &dc_da,
5485            &dc_db,
5486            &dc_db,
5487            &dc_daa,
5488            &dc_dab,
5489            &dc_dab,
5490            &dc_dab,
5491            &dc_dab,
5492            &dc_dbb,
5493            &dc_daab,
5494            &dc_daab,
5495            &dc_dabb,
5496            &dc_dabb,
5497            &zero,
5498            &state.moments,
5499        )
5500        .expect("aabb");
5501        let exact_abbb = cell_fourth_derivative_from_moments(
5502            cell,
5503            &dc_da,
5504            &dc_db,
5505            &dc_db,
5506            &dc_db,
5507            &dc_dab,
5508            &dc_dab,
5509            &dc_dab,
5510            &dc_dbb,
5511            &dc_dbb,
5512            &dc_dbb,
5513            &dc_dabb,
5514            &dc_dabb,
5515            &dc_dabb,
5516            &dc_dbbb,
5517            &zero,
5518            &state.moments,
5519        )
5520        .expect("abbb");
5521        let exact_bbbb = cell_fourth_derivative_from_moments(
5522            cell,
5523            &dc_db,
5524            &dc_db,
5525            &dc_db,
5526            &dc_db,
5527            &dc_dbb,
5528            &dc_dbb,
5529            &dc_dbb,
5530            &dc_dbb,
5531            &dc_dbb,
5532            &dc_dbb,
5533            &dc_dbbb,
5534            &dc_dbbb,
5535            &dc_dbbb,
5536            &dc_dbbb,
5537            &zero,
5538            &state.moments,
5539        )
5540        .expect("bbbb");
5541
5542        let numeric_a = simpson_integral(cell.left, cell.right, 5000, |z| {
5543            eta_a(z) * (-cell.q(z)).exp() * INV_TWO_PI
5544        });
5545        let numeric_b = simpson_integral(cell.left, cell.right, 5000, |z| {
5546            eta_b(z) * (-cell.q(z)).exp() * INV_TWO_PI
5547        });
5548        let numeric_aa = simpson_integral(cell.left, cell.right, 5000, |z| {
5549            (eta_aa(z) - cell.eta(z) * eta_a(z) * eta_a(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5550        });
5551        let numeric_ab = simpson_integral(cell.left, cell.right, 5000, |z| {
5552            (eta_ab(z) - cell.eta(z) * eta_a(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5553        });
5554        let numeric_bb = simpson_integral(cell.left, cell.right, 5000, |z| {
5555            (eta_bb(z) - cell.eta(z) * eta_b(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5556        });
5557        let numeric_aaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5558            let eta = cell.eta(z);
5559            (eta_aaa(z) - 3.0 * eta * eta_aa(z) * eta_a(z) + (eta * eta - 1.0) * eta_a(z).powi(3))
5560                * (-cell.q(z)).exp()
5561                * INV_TWO_PI
5562        });
5563        let numeric_aab = simpson_integral(cell.left, cell.right, 5000, |z| {
5564            let eta = cell.eta(z);
5565            let a_z = eta_a(z);
5566            let b_z = eta_b(z);
5567            (eta_aab(z) - eta * (eta_aa(z) * b_z + 2.0 * eta_ab(z) * a_z)
5568                + (eta * eta - 1.0) * a_z * a_z * b_z)
5569                * (-cell.q(z)).exp()
5570                * INV_TWO_PI
5571        });
5572        let numeric_abb = simpson_integral(cell.left, cell.right, 5000, |z| {
5573            let eta = cell.eta(z);
5574            let a_z = eta_a(z);
5575            let b_z = eta_b(z);
5576            (eta_abb(z) - eta * (2.0 * eta_ab(z) * b_z + eta_bb(z) * a_z)
5577                + (eta * eta - 1.0) * a_z * b_z * b_z)
5578                * (-cell.q(z)).exp()
5579                * INV_TWO_PI
5580        });
5581        let numeric_bbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5582            let eta = cell.eta(z);
5583            (eta_bbb(z) - 3.0 * eta * eta_bb(z) * eta_b(z) + (eta * eta - 1.0) * eta_b(z).powi(3))
5584                * (-cell.q(z)).exp()
5585                * INV_TWO_PI
5586        });
5587        let numeric_aaaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5588            let eta = cell.eta(z);
5589            let eta_a_z = eta_a(z);
5590            let eta_aa_z = eta_aa(z);
5591            let eta_aaa_z = eta_aaa(z);
5592            (-eta * (4.0 * eta_aaa_z * eta_a_z + 3.0 * eta_aa_z * eta_aa_z)
5593                + (eta * eta - 1.0) * (6.0 * eta_aa_z * eta_a_z * eta_a_z)
5594                + (-eta * eta * eta + 3.0 * eta) * eta_a_z.powi(4))
5595                * (-cell.q(z)).exp()
5596                * INV_TWO_PI
5597        });
5598        let numeric_aaab = simpson_integral(cell.left, cell.right, 5000, |z| {
5599            let eta = cell.eta(z);
5600            let a_z = eta_a(z);
5601            let b_z = eta_b(z);
5602            let aa_z = eta_aa(z);
5603            let ab_z = eta_ab(z);
5604            let aaa_z = eta_aaa(z);
5605            let aab_z = eta_aab(z);
5606            (-eta * (aaa_z * b_z + 3.0 * aab_z * a_z + 3.0 * aa_z * ab_z)
5607                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * b_z + 3.0 * ab_z * a_z * a_z)
5608                + (-eta * eta * eta + 3.0 * eta) * a_z.powi(3) * b_z)
5609                * (-cell.q(z)).exp()
5610                * INV_TWO_PI
5611        });
5612        let numeric_aabb = simpson_integral(cell.left, cell.right, 5000, |z| {
5613            let eta = cell.eta(z);
5614            let a_z = eta_a(z);
5615            let b_z = eta_b(z);
5616            let aa_z = eta_aa(z);
5617            let ab_z = eta_ab(z);
5618            let bb_z = eta_bb(z);
5619            let aab_z = eta_aab(z);
5620            let abb_z = eta_abb(z);
5621            (-eta * (2.0 * aab_z * b_z + 2.0 * abb_z * a_z + aa_z * bb_z + 2.0 * ab_z * ab_z)
5622                + (eta * eta - 1.0)
5623                    * (aa_z * b_z * b_z + 4.0 * ab_z * a_z * b_z + bb_z * a_z * a_z)
5624                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * b_z * b_z)
5625                * (-cell.q(z)).exp()
5626                * INV_TWO_PI
5627        });
5628        let numeric_abbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5629            let eta = cell.eta(z);
5630            let a_z = eta_a(z);
5631            let b_z = eta_b(z);
5632            let ab_z = eta_ab(z);
5633            let bb_z = eta_bb(z);
5634            let abb_z = eta_abb(z);
5635            let bbb_z = eta_bbb(z);
5636            (-eta * (3.0 * abb_z * b_z + bbb_z * a_z + 3.0 * ab_z * bb_z)
5637                + (eta * eta - 1.0) * (3.0 * ab_z * b_z * b_z + 3.0 * bb_z * a_z * b_z)
5638                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z.powi(3))
5639                * (-cell.q(z)).exp()
5640                * INV_TWO_PI
5641        });
5642        let numeric_bbbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5643            let eta = cell.eta(z);
5644            let eta_b_z = eta_b(z);
5645            let eta_bb_z = eta_bb(z);
5646            let eta_bbb_z = eta_bbb(z);
5647            (-eta * (4.0 * eta_bbb_z * eta_b_z + 3.0 * eta_bb_z * eta_bb_z)
5648                + (eta * eta - 1.0) * (6.0 * eta_bb_z * eta_b_z * eta_b_z)
5649                + (-eta * eta * eta + 3.0 * eta) * eta_b_z.powi(4))
5650                * (-cell.q(z)).exp()
5651                * INV_TWO_PI
5652        });
5653
5654        assert!((exact_a - numeric_a).abs() < 1e-8);
5655        assert!((exact_b - numeric_b).abs() < 1e-8);
5656        assert!((exact_aa - numeric_aa).abs() < 1e-8);
5657        assert!((exact_ab - numeric_ab).abs() < 1e-8);
5658        assert!((exact_bb - numeric_bb).abs() < 1e-8);
5659        assert!((exact_aaa - numeric_aaa).abs() < 2e-7);
5660        assert!((exact_aab - numeric_aab).abs() < 2e-7);
5661        assert!((exact_abb - numeric_abb).abs() < 2e-7);
5662        assert!((exact_bbb - numeric_bbb).abs() < 2e-7);
5663        assert!((exact_aaaa - numeric_aaaa).abs() < 2e-6);
5664        assert!((exact_aaab - numeric_aaab).abs() < 2e-6);
5665        assert!((exact_aabb - numeric_aabb).abs() < 2e-6);
5666        assert!((exact_abbb - numeric_abbb).abs() < 2e-6);
5667        assert!((exact_bbbb - numeric_bbbb).abs() < 2e-6);
5668    }
5669
    /// Cross-checks the moment-based derivatives that involve a link-basis
    /// direction `w` against Simpson quadrature of explicit integrands.
    ///
    /// A de-nested cell is built from fixed score/link spans at `a = 0.3`,
    /// `b = -0.7`. Each first- through fourth-order combination of `a`, `b`
    /// and `w` exercised below is evaluated twice: once through the
    /// `cell_*_derivative_from_moments` helpers (`exact_*`) and once by
    /// integrating a pointwise expression times
    /// `exp(-cell.q(z)) * INV_TWO_PI` over `[cell.left, cell.right]`
    /// (`numeric_*`). The final assertions bound the absolute difference.
    #[test]
    fn link_basis_cell_derivatives_match_exact_integrands() {
        // Fixture: the score span supplies the cell interval; the link span and
        // the link-basis span share one interval.
        let score_span = LocalSpanCubic {
            left: -0.75,
            right: 0.25,
            c0: 0.08,
            c1: -0.03,
            c2: 0.02,
            c3: -0.01,
        };
        let link_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: -0.05,
            c1: 0.04,
            c2: -0.02,
            c3: 0.015,
        };
        let link_basis_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: 0.02,
            c1: -0.01,
            c2: 0.03,
            c3: -0.02,
        };
        let a = 0.3;
        let b = -0.7;
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        let cell = DenestedCubicCell {
            left: score_span.left,
            right: score_span.right,
            c0: coeffs[0],
            c1: coeffs[1],
            c2: coeffs[2],
            c3: coeffs[3],
        };
        // NOTE(review): `24` is presumably the highest moment order requested —
        // confirm against `evaluate_cell_moments`.
        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
        // Partials of the cell coefficients in `a` and `b`. Of the third-order
        // tuple only slots 0 and 3 are used, bound here as the `aaa` and `bbb`
        // entries.
        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
        let dc_daa = second_partials.0;
        let dc_dab = second_partials.1;
        let dc_dbb = second_partials.2;
        let denested_third = denested_cell_third_partials(link_span);
        let dc_daaa = denested_third.0;
        let dc_dbbb = denested_third.3;

        // Link-basis coefficient arrays and their partials in `a` and `b`.
        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
        let (coeff_aaw, coeff_abw, coeff_bbw) =
            link_basis_cell_second_partials(link_basis_span, a, b);
        let link_basis_third = link_basis_cell_third_partials(link_basis_span);
        let coeff_aaaw = link_basis_third.0;
        let coeff_bbbw = link_basis_third.3;
        // Passed wherever a coefficient partial is taken to be identically zero
        // (for example every slot that differentiates twice in `w`).
        let zero = [0.0; 4];
        // 6 * c3: the constant third derivative of a cubic with leading
        // coefficient `c3`.
        let basis_third = 6.0 * link_basis_span.c3;

        // Pointwise partials of eta used by the reference integrands.
        // NOTE(review): these match eta(z) = a + b * (z + score(z)) + link(a + b * z)
        // with the `w` direction entering through basis(a + b * z) — confirm
        // against `denested_cell_coefficients` / `link_basis_cell_coefficients`.
        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
        let eta_aaw = |z: f64| link_basis_span.second_derivative(a + b * z);
        let eta_abw = |z: f64| z * link_basis_span.second_derivative(a + b * z);
        let eta_bbw = |z: f64| z * z * link_basis_span.second_derivative(a + b * z);
        // Constant in z; the `0.0 * z` term only keeps the closure argument used.
        let eta_aaaw = |z: f64| basis_third + 0.0 * z;
        let eta_bbbw = |z: f64| z * z * z * basis_third;

        // Moment-based values. As used at these call sites the helpers take the
        // first-order coefficient arrays, then the second-order arrays (one per
        // pair of directions), then the third-order arrays, then the top-order
        // array, and finally the moments.
        // NOTE(review): the pair order appears to be (1,2), (1,3), (1,4), (2,3),
        // (2,4), (3,4) and the triple order (1,2,3), (1,2,4), (1,3,4), (2,3,4) —
        // inferred from the calls below; confirm against the helper signatures.
        let exact_w = cell_first_derivative_from_moments(&coeff_w, &state.moments).expect("w");
        let exact_aw =
            cell_second_derivative_from_moments(cell, &dc_da, &coeff_w, &coeff_aw, &state.moments)
                .expect("aw");
        let exact_bw =
            cell_second_derivative_from_moments(cell, &dc_db, &coeff_w, &coeff_bw, &state.moments)
                .expect("bw");
        let exact_ww =
            cell_second_derivative_from_moments(cell, &coeff_w, &coeff_w, &zero, &state.moments)
                .expect("ww");
        let exact_aaw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aaw,
            &state.moments,
        )
        .expect("aaw");
        let exact_abw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_bw,
            &coeff_abw,
            &state.moments,
        )
        .expect("abw");
        let exact_bbw = cell_third_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bbw,
            &state.moments,
        )
        .expect("bbw");
        let exact_www = cell_third_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("www");
        let exact_aaaw = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &dc_daa,
            &coeff_aw,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &dc_daaa,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaaw,
            &state.moments,
        )
        .expect("aaaw");
        let exact_aaww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &zero,
            &coeff_aaw,
            &coeff_aaw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("aaww");
        let exact_abww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_aw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_abw,
            &coeff_abw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("abww");
        let exact_bbww = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_bbw,
            &coeff_bbw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("bbww");
        let exact_bbbw = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &dc_dbb,
            &coeff_bw,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &dc_dbbb,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbbw,
            &state.moments,
        )
        .expect("bbbw");
        let exact_wwww = cell_fourth_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("wwww");

        // Quadrature references. Each bracket groups the eta-partials by the
        // polynomial factors `-eta`, `eta^2 - 1` and `-eta^3 + 3 * eta`, and is
        // then weighted by `exp(-cell.q(z)) * INV_TWO_PI`.
        let numeric_w = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_w(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_aw(z) - cell.eta(z) * eta_a(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_bw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_bw(z) - cell.eta(z) * eta_b(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_ww = simpson_integral(cell.left, cell.right, 5000, |z| {
            (-cell.eta(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            (eta_aaw(z) - eta * (eta_aa(z) * w_z + 2.0 * eta_aw(z) * a_z)
                + (eta * eta - 1.0) * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_abw(z) - eta * (eta_ab(z) * w_z + eta_aw(z) * b_z + eta_bw(z) * a_z)
                + (eta * eta - 1.0) * a_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let b_z = eta_b(z);
            (eta_bbw(z) - eta * (eta_bb(z) * w_z + 2.0 * eta_bw(z) * b_z)
                + (eta * eta - 1.0) * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_www = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((eta * eta - 1.0) * w_z * w_z * w_z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aaaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aa_z = eta_aa(z);
            let aw_z = eta_aw(z);
            // `dc_daaa[0]` is used as the pointwise `aaa` partial of eta; it is
            // constant in z.
            (eta_aaaw(z)
                - eta * ((dc_daaa[0] + 0.0 * z) * w_z + 3.0 * eta_aaw(z) * a_z + 3.0 * aa_z * aw_z)
                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * w_z + 3.0 * aw_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aaww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            (-(2.0 * eta * (eta_aaw(z) * w_z + aw_z * aw_z))
                + (eta * eta - 1.0) * (eta_aa(z) * w_z * w_z + 4.0 * aw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_abw(z) * w_z + aw_z * bw_z))
                + (eta * eta - 1.0)
                    * (eta_ab(z) * w_z * w_z + 2.0 * aw_z * b_z * w_z + 2.0 * bw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_bbw(z) * w_z + bw_z * bw_z))
                + (eta * eta - 1.0) * (eta_bb(z) * w_z * w_z + 4.0 * bw_z * b_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bb_z = eta_bb(z);
            let bw_z = eta_bw(z);
            // `dc_dbbb[3] * z^3` is used as the pointwise `bbb` partial of eta.
            (eta_bbbw(z)
                - eta
                    * ((dc_dbbb[3] * z * z * z) * w_z + 3.0 * eta_bbw(z) * b_z + 3.0 * bb_z * bw_z)
                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * w_z + 3.0 * bw_z * b_z * b_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_wwww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((-eta * eta * eta + 3.0 * eta) * w_z * w_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });

        // Absolute tolerances loosen with derivative order.
        assert!((exact_w - numeric_w).abs() < 1e-8);
        assert!((exact_aw - numeric_aw).abs() < 1e-7);
        assert!((exact_bw - numeric_bw).abs() < 1e-7);
        assert!((exact_ww - numeric_ww).abs() < 1e-7);
        assert!((exact_aaw - numeric_aaw).abs() < 2e-6);
        assert!((exact_abw - numeric_abw).abs() < 2e-6);
        assert!((exact_bbw - numeric_bbw).abs() < 2e-6);
        assert!((exact_www - numeric_www).abs() < 2e-6);
        assert!((exact_aaaw - numeric_aaaw).abs() < 3e-6);
        assert!((exact_aaww - numeric_aaww).abs() < 3e-6);
        assert!((exact_abww - numeric_abww).abs() < 3e-6);
        assert!((exact_bbww - numeric_bbww).abs() < 3e-6);
        assert!((exact_bbbw - numeric_bbbw).abs() < 3e-6);
        assert!((exact_wwww - numeric_wwww).abs() < 3e-6);
    }
6051
6052    #[test]
6053    fn score_basis_cell_derivatives_match_exact_integrands() {
6054        let score_span = LocalSpanCubic {
6055            left: -0.75,
6056            right: 0.25,
6057            c0: 0.08,
6058            c1: -0.03,
6059            c2: 0.02,
6060            c3: -0.01,
6061        };
6062        let score_basis_span = LocalSpanCubic {
6063            left: -0.75,
6064            right: 0.25,
6065            c0: -0.04,
6066            c1: 0.06,
6067            c2: -0.01,
6068            c3: 0.02,
6069        };
6070        let link_span = LocalSpanCubic {
6071            left: -0.6,
6072            right: 0.9,
6073            c0: -0.05,
6074            c1: 0.04,
6075            c2: -0.02,
6076            c3: 0.015,
6077        };
6078        let a = 0.3;
6079        let b = -0.7;
6080        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6081        let cell = DenestedCubicCell {
6082            left: score_span.left,
6083            right: score_span.right,
6084            c0: coeffs[0],
6085            c1: coeffs[1],
6086            c2: coeffs[2],
6087            c3: coeffs[3],
6088        };
6089        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6090        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6091        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
6092        let dc_daa = second_partials.0;
6093        let dc_dab = second_partials.1;
6094        let dc_dbb = second_partials.2;
6095        let denested_third = denested_cell_third_partials(link_span);
6096        let dc_dbbb = denested_third.3;
6097
6098        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6099        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6100        let zero = [0.0; 4];
6101
6102        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6103        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6104        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6105        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
6106        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6107        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6108
6109        let exact_h = cell_first_derivative_from_moments(&coeff_h, &state.moments).expect("h");
6110        let exact_ah =
6111            cell_second_derivative_from_moments(cell, &dc_da, &coeff_h, &zero, &state.moments)
6112                .expect("ah");
6113        let exact_bh =
6114            cell_second_derivative_from_moments(cell, &dc_db, &coeff_h, &coeff_bh, &state.moments)
6115                .expect("bh");
6116        let exact_hh =
6117            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_h, &zero, &state.moments)
6118                .expect("hh");
6119        let exact_abh = cell_third_derivative_from_moments(
6120            cell,
6121            &dc_da,
6122            &dc_db,
6123            &coeff_h,
6124            &dc_dab,
6125            &zero,
6126            &coeff_bh,
6127            &zero,
6128            &state.moments,
6129        )
6130        .expect("abh");
6131        let exact_bbh = cell_third_derivative_from_moments(
6132            cell,
6133            &dc_db,
6134            &dc_db,
6135            &coeff_h,
6136            &dc_dbb,
6137            &coeff_bh,
6138            &coeff_bh,
6139            &zero,
6140            &state.moments,
6141        )
6142        .expect("bbh");
6143        let exact_bhh = cell_third_derivative_from_moments(
6144            cell,
6145            &dc_db,
6146            &coeff_h,
6147            &coeff_h,
6148            &coeff_bh,
6149            &coeff_bh,
6150            &zero,
6151            &zero,
6152            &state.moments,
6153        )
6154        .expect("bhh");
6155        let exact_hhh = cell_third_derivative_from_moments(
6156            cell,
6157            &coeff_h,
6158            &coeff_h,
6159            &coeff_h,
6160            &zero,
6161            &zero,
6162            &zero,
6163            &zero,
6164            &state.moments,
6165        )
6166        .expect("hhh");
6167        let exact_bbbh = cell_fourth_derivative_from_moments(
6168            cell,
6169            &dc_db,
6170            &dc_db,
6171            &dc_db,
6172            &coeff_h,
6173            &dc_dbb,
6174            &dc_dbb,
6175            &coeff_bh,
6176            &dc_dbb,
6177            &coeff_bh,
6178            &coeff_bh,
6179            &dc_dbbb,
6180            &zero,
6181            &zero,
6182            &zero,
6183            &zero,
6184            &state.moments,
6185        )
6186        .expect("bbbh");
6187        let exact_aahh = cell_fourth_derivative_from_moments(
6188            cell,
6189            &dc_da,
6190            &dc_da,
6191            &coeff_h,
6192            &coeff_h,
6193            &dc_daa,
6194            &zero,
6195            &zero,
6196            &zero,
6197            &zero,
6198            &zero,
6199            &zero,
6200            &zero,
6201            &zero,
6202            &zero,
6203            &zero,
6204            &state.moments,
6205        )
6206        .expect("aahh");
6207        let exact_abhh = cell_fourth_derivative_from_moments(
6208            cell,
6209            &dc_da,
6210            &dc_db,
6211            &coeff_h,
6212            &coeff_h,
6213            &dc_dab,
6214            &zero,
6215            &zero,
6216            &coeff_bh,
6217            &coeff_bh,
6218            &zero,
6219            &zero,
6220            &zero,
6221            &zero,
6222            &zero,
6223            &zero,
6224            &state.moments,
6225        )
6226        .expect("abhh");
6227        let exact_bbhh = cell_fourth_derivative_from_moments(
6228            cell,
6229            &dc_db,
6230            &dc_db,
6231            &coeff_h,
6232            &coeff_h,
6233            &dc_dbb,
6234            &coeff_bh,
6235            &coeff_bh,
6236            &coeff_bh,
6237            &coeff_bh,
6238            &zero,
6239            &zero,
6240            &zero,
6241            &zero,
6242            &zero,
6243            &zero,
6244            &state.moments,
6245        )
6246        .expect("bbhh");
6247        let exact_bhhh = cell_fourth_derivative_from_moments(
6248            cell,
6249            &dc_db,
6250            &coeff_h,
6251            &coeff_h,
6252            &coeff_h,
6253            &coeff_bh,
6254            &coeff_bh,
6255            &coeff_bh,
6256            &zero,
6257            &zero,
6258            &zero,
6259            &zero,
6260            &zero,
6261            &zero,
6262            &zero,
6263            &zero,
6264            &state.moments,
6265        )
6266        .expect("bhhh");
6267        let exact_hhhh = cell_fourth_derivative_from_moments(
6268            cell,
6269            &coeff_h,
6270            &coeff_h,
6271            &coeff_h,
6272            &coeff_h,
6273            &zero,
6274            &zero,
6275            &zero,
6276            &zero,
6277            &zero,
6278            &zero,
6279            &zero,
6280            &zero,
6281            &zero,
6282            &zero,
6283            &zero,
6284            &state.moments,
6285        )
6286        .expect("hhhh");
6287
6288        let numeric_h = simpson_integral(cell.left, cell.right, 5000, |z| {
6289            eta_h(z) * (-cell.q(z)).exp() * INV_TWO_PI
6290        });
6291        let numeric_ah = simpson_integral(cell.left, cell.right, 5000, |z| {
6292            (-cell.eta(z) * eta_a(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6293        });
6294        let numeric_bh = simpson_integral(cell.left, cell.right, 5000, |z| {
6295            (eta_bh(z) - cell.eta(z) * eta_b(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6296        });
6297        let numeric_hh = simpson_integral(cell.left, cell.right, 5000, |z| {
6298            (-cell.eta(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6299        });
6300        let numeric_abh = simpson_integral(cell.left, cell.right, 5000, |z| {
6301            let eta = cell.eta(z);
6302            (-(eta * (eta_ab(z) * eta_h(z) + eta_bh(z) * eta_a(z)))
6303                + (eta * eta - 1.0) * eta_a(z) * eta_b(z) * eta_h(z))
6304                * (-cell.q(z)).exp()
6305                * INV_TWO_PI
6306        });
6307        let numeric_bbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6308            let eta = cell.eta(z);
6309            (-(eta * (eta_bb(z) * eta_h(z) + 2.0 * eta_bh(z) * eta_b(z)))
6310                + (eta * eta - 1.0) * eta_b(z) * eta_b(z) * eta_h(z))
6311                * (-cell.q(z)).exp()
6312                * INV_TWO_PI
6313        });
6314        let numeric_bhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6315            let eta = cell.eta(z);
6316            (-(2.0 * eta * eta_bh(z) * eta_h(z))
6317                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_h(z))
6318                * (-cell.q(z)).exp()
6319                * INV_TWO_PI
6320        });
6321        let numeric_hhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6322            let eta = cell.eta(z);
6323            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6324        });
6325        let numeric_bbbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6326            let eta = cell.eta(z);
6327            let b_z = eta_b(z);
6328            let h_z = eta_h(z);
6329            let bb_z = eta_bb(z);
6330            let bh_z = eta_bh(z);
6331            (-(eta * ((dc_dbbb[3] * z * z * z) * h_z + 3.0 * bb_z * bh_z))
6332                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * h_z + 3.0 * bh_z * b_z * b_z)
6333                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * h_z)
6334                * (-cell.q(z)).exp()
6335                * INV_TWO_PI
6336        });
6337        let numeric_aahh = simpson_integral(cell.left, cell.right, 5000, |z| {
6338            let eta = cell.eta(z);
6339            let a_z = eta_a(z);
6340            let h_z = eta_h(z);
6341            ((eta * eta - 1.0) * polynomial_value(&dc_daa, z) * h_z * h_z
6342                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * h_z * h_z)
6343                * (-cell.q(z)).exp()
6344                * INV_TWO_PI
6345        });
6346        let numeric_abhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6347            let eta = cell.eta(z);
6348            let a_z = eta_a(z);
6349            let b_z = eta_b(z);
6350            let h_z = eta_h(z);
6351            ((eta * eta - 1.0) * (eta_ab(z) * h_z * h_z + 2.0 * eta_bh(z) * a_z * h_z)
6352                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * h_z * h_z)
6353                * (-cell.q(z)).exp()
6354                * INV_TWO_PI
6355        });
6356        let numeric_bbhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6357            let eta = cell.eta(z);
6358            let b_z = eta_b(z);
6359            let h_z = eta_h(z);
6360            let bh_z = eta_bh(z);
6361            (-(2.0 * eta * bh_z * bh_z)
6362                + (eta * eta - 1.0) * (eta_bb(z) * h_z * h_z + 4.0 * bh_z * b_z * h_z)
6363                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * h_z * h_z)
6364                * (-cell.q(z)).exp()
6365                * INV_TWO_PI
6366        });
6367        let numeric_bhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6368            let eta = cell.eta(z);
6369            let h_z = eta_h(z);
6370            (-(eta * (3.0 * eta_bh(z) * h_z * h_z))
6371                + (eta * eta - 1.0) * (3.0 * eta_bh(z) * h_z * h_z)
6372                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * h_z * h_z)
6373                * (-cell.q(z)).exp()
6374                * INV_TWO_PI
6375        });
6376        let numeric_hhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6377            let eta = cell.eta(z);
6378            let h_z = eta_h(z);
6379            ((-eta * eta * eta + 3.0 * eta) * h_z * h_z * h_z * h_z)
6380                * (-cell.q(z)).exp()
6381                * INV_TWO_PI
6382        });
6383
6384        assert!((exact_h - numeric_h).abs() < 1e-8);
6385        assert!((exact_ah - numeric_ah).abs() < 1e-7);
6386        assert!((exact_bh - numeric_bh).abs() < 1e-7);
6387        assert!((exact_hh - numeric_hh).abs() < 1e-7);
6388        assert!((exact_abh - numeric_abh).abs() < 2e-6);
6389        assert!((exact_bbh - numeric_bbh).abs() < 2e-6);
6390        assert!((exact_bhh - numeric_bhh).abs() < 2e-6);
6391        assert!((exact_hhh - numeric_hhh).abs() < 2e-6);
6392        assert!((exact_bbbh - numeric_bbbh).abs() < 3e-6);
6393        assert!((exact_aahh - numeric_aahh).abs() < 3e-6);
6394        assert!((exact_abhh - numeric_abhh).abs() < 3e-6);
6395        assert!((exact_bbhh - numeric_bbhh).abs() < 3e-6);
6396        assert!((exact_bhhh - numeric_bhhh).abs() < 3e-6);
6397        assert!((exact_hhhh - numeric_hhhh).abs() < 3e-6);
6398    }
6399
6400    #[test]
6401    fn cross_basis_cell_derivatives_match_exact_integrands() {
6402        let score_span = LocalSpanCubic {
6403            left: -0.75,
6404            right: 0.25,
6405            c0: 0.08,
6406            c1: -0.03,
6407            c2: 0.02,
6408            c3: -0.01,
6409        };
6410        let score_basis_span = LocalSpanCubic {
6411            left: -0.75,
6412            right: 0.25,
6413            c0: -0.04,
6414            c1: 0.06,
6415            c2: -0.01,
6416            c3: 0.02,
6417        };
6418        let link_span = LocalSpanCubic {
6419            left: -0.6,
6420            right: 0.9,
6421            c0: -0.05,
6422            c1: 0.04,
6423            c2: -0.02,
6424            c3: 0.015,
6425        };
6426        let link_basis_span = LocalSpanCubic {
6427            left: -0.6,
6428            right: 0.9,
6429            c0: 0.02,
6430            c1: -0.01,
6431            c2: 0.03,
6432            c3: -0.02,
6433        };
6434        let a = 0.3;
6435        let b = -0.7;
6436        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6437        let cell = DenestedCubicCell {
6438            left: score_span.left,
6439            right: score_span.right,
6440            c0: coeffs[0],
6441            c1: coeffs[1],
6442            c2: coeffs[2],
6443            c3: coeffs[3],
6444        };
6445        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6446        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6447        let (dc_daa, dc_dab, _) = denested_cell_second_partials(score_span, link_span, a, b);
6448
6449        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6450        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6451        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
6452        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
6453        let (coeff_aaw, coeff_abw, _) = link_basis_cell_second_partials(link_basis_span, a, b);
6454        let zero = [0.0; 4];
6455
6456        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6457        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6458        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6459        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6460        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
6461        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6462        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
6463        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
6464
6465        let exact_hw =
6466            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_w, &zero, &state.moments)
6467                .expect("hw");
6468        let exact_ahw = cell_third_derivative_from_moments(
6469            cell,
6470            &dc_da,
6471            &coeff_h,
6472            &coeff_w,
6473            &zero,
6474            &coeff_aw,
6475            &zero,
6476            &zero,
6477            &state.moments,
6478        )
6479        .expect("ahw");
6480        let exact_bhw = cell_third_derivative_from_moments(
6481            cell,
6482            &dc_db,
6483            &coeff_h,
6484            &coeff_w,
6485            &coeff_bh,
6486            &coeff_bw,
6487            &zero,
6488            &zero,
6489            &state.moments,
6490        )
6491        .expect("bhw");
6492        let exact_hhw = cell_third_derivative_from_moments(
6493            cell,
6494            &coeff_h,
6495            &coeff_h,
6496            &coeff_w,
6497            &zero,
6498            &zero,
6499            &zero,
6500            &zero,
6501            &state.moments,
6502        )
6503        .expect("hhw");
6504        let exact_hww = cell_third_derivative_from_moments(
6505            cell,
6506            &coeff_h,
6507            &coeff_w,
6508            &coeff_w,
6509            &zero,
6510            &zero,
6511            &zero,
6512            &zero,
6513            &state.moments,
6514        )
6515        .expect("hww");
6516        let exact_aahw = cell_fourth_derivative_from_moments(
6517            cell,
6518            &dc_da,
6519            &dc_da,
6520            &coeff_h,
6521            &coeff_w,
6522            &dc_daa,
6523            &zero,
6524            &coeff_aw,
6525            &zero,
6526            &coeff_aw,
6527            &zero,
6528            &zero,
6529            &coeff_aaw,
6530            &zero,
6531            &zero,
6532            &zero,
6533            &state.moments,
6534        )
6535        .expect("aahw");
6536        let exact_hhww = cell_fourth_derivative_from_moments(
6537            cell,
6538            &coeff_h,
6539            &coeff_h,
6540            &coeff_w,
6541            &coeff_w,
6542            &zero,
6543            &zero,
6544            &zero,
6545            &zero,
6546            &zero,
6547            &zero,
6548            &zero,
6549            &zero,
6550            &zero,
6551            &zero,
6552            &zero,
6553            &state.moments,
6554        )
6555        .expect("hhww");
6556        let exact_hhhw = cell_fourth_derivative_from_moments(
6557            cell,
6558            &coeff_h,
6559            &coeff_h,
6560            &coeff_h,
6561            &coeff_w,
6562            &zero,
6563            &zero,
6564            &zero,
6565            &zero,
6566            &zero,
6567            &zero,
6568            &zero,
6569            &zero,
6570            &zero,
6571            &zero,
6572            &zero,
6573            &state.moments,
6574        )
6575        .expect("hhhw");
6576        let exact_abhw = cell_fourth_derivative_from_moments(
6577            cell,
6578            &dc_da,
6579            &dc_db,
6580            &coeff_h,
6581            &coeff_w,
6582            &dc_dab,
6583            &zero,
6584            &coeff_aw,
6585            &coeff_bh,
6586            &coeff_bw,
6587            &zero,
6588            &zero,
6589            &coeff_abw,
6590            &zero,
6591            &zero,
6592            &zero,
6593            &state.moments,
6594        )
6595        .expect("abhw");
6596        let exact_ahww = cell_fourth_derivative_from_moments(
6597            cell,
6598            &dc_da,
6599            &coeff_h,
6600            &coeff_w,
6601            &coeff_w,
6602            &zero,
6603            &coeff_aw,
6604            &coeff_aw,
6605            &zero,
6606            &zero,
6607            &zero,
6608            &zero,
6609            &zero,
6610            &zero,
6611            &zero,
6612            &zero,
6613            &state.moments,
6614        )
6615        .expect("ahww");
6616        let exact_bhww = cell_fourth_derivative_from_moments(
6617            cell,
6618            &dc_db,
6619            &coeff_h,
6620            &coeff_w,
6621            &coeff_w,
6622            &coeff_bh,
6623            &coeff_bw,
6624            &coeff_bw,
6625            &zero,
6626            &zero,
6627            &zero,
6628            &zero,
6629            &zero,
6630            &zero,
6631            &zero,
6632            &zero,
6633            &state.moments,
6634        )
6635        .expect("bhww");
6636        let exact_hwww = cell_fourth_derivative_from_moments(
6637            cell,
6638            &coeff_h,
6639            &coeff_w,
6640            &coeff_w,
6641            &coeff_w,
6642            &zero,
6643            &zero,
6644            &zero,
6645            &zero,
6646            &zero,
6647            &zero,
6648            &zero,
6649            &zero,
6650            &zero,
6651            &zero,
6652            &zero,
6653            &state.moments,
6654        )
6655        .expect("hwww");
6656
6657        let numeric_hw = simpson_integral(cell.left, cell.right, 5000, |z| {
6658            (-cell.eta(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6659        });
6660        let numeric_ahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6661            let eta = cell.eta(z);
6662            (-(eta * eta_aw(z) * eta_h(z)) + (eta * eta - 1.0) * eta_a(z) * eta_h(z) * eta_w(z))
6663                * (-cell.q(z)).exp()
6664                * INV_TWO_PI
6665        });
6666        let numeric_bhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6667            let eta = cell.eta(z);
6668            (-(eta * (eta_bh(z) * eta_w(z) + eta_bw(z) * eta_h(z)))
6669                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_w(z))
6670                * (-cell.q(z)).exp()
6671                * INV_TWO_PI
6672        });
6673        let numeric_hhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6674            let eta = cell.eta(z);
6675            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6676        });
6677        let numeric_hww = simpson_integral(cell.left, cell.right, 5000, |z| {
6678            let eta = cell.eta(z);
6679            ((eta * eta - 1.0) * eta_h(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6680        });
6681        let numeric_aahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6682            let eta = cell.eta(z);
6683            (-(eta * polynomial_value(&coeff_aaw, z) * eta_h(z))
6684                + (eta * eta - 1.0)
6685                    * (polynomial_value(&dc_daa, z) * eta_h(z) * eta_w(z)
6686                        + 2.0 * eta_aw(z) * eta_a(z) * eta_h(z))
6687                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_a(z) * eta_h(z) * eta_w(z))
6688                * (-cell.q(z)).exp()
6689                * INV_TWO_PI
6690        });
6691        let numeric_hhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6692            let eta = cell.eta(z);
6693            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_w(z) * eta_w(z))
6694                * (-cell.q(z)).exp()
6695                * INV_TWO_PI
6696        });
6697        let numeric_hhhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6698            let eta = cell.eta(z);
6699            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_h(z) * eta_w(z))
6700                * (-cell.q(z)).exp()
6701                * INV_TWO_PI
6702        });
6703        let numeric_abhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6704            let eta = cell.eta(z);
6705            (-(eta * polynomial_value(&coeff_abw, z) * eta_h(z) + eta * eta_aw(z) * eta_bh(z))
6706                + (eta * eta - 1.0)
6707                    * (eta_ab(z) * eta_h(z) * eta_w(z)
6708                        + eta_aw(z) * eta_b(z) * eta_h(z)
6709                        + eta_bh(z) * eta_a(z) * eta_w(z)
6710                        + eta_bw(z) * eta_a(z) * eta_h(z))
6711                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_b(z) * eta_h(z) * eta_w(z))
6712                * (-cell.q(z)).exp()
6713                * INV_TWO_PI
6714        });
6715        let numeric_ahww = simpson_integral(cell.left, cell.right, 5000, |z| {
6716            let eta = cell.eta(z);
6717            (2.0 * (eta * eta - 1.0) * eta_aw(z) * eta_h(z) * eta_w(z)
6718                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_h(z) * eta_w(z) * eta_w(z))
6719                * (-cell.q(z)).exp()
6720                * INV_TWO_PI
6721        });
6722        let numeric_bhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6723            let eta = cell.eta(z);
6724            let h_z = eta_h(z);
6725            let w_z = eta_w(z);
6726            ((eta * eta - 1.0) * (eta_bh(z) * w_z * w_z + 2.0 * eta_bw(z) * h_z * w_z)
6727                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * w_z * w_z)
6728                * (-cell.q(z)).exp()
6729                * INV_TWO_PI
6730        });
6731        let numeric_hwww = simpson_integral(cell.left, cell.right, 5000, |z| {
6732            let eta = cell.eta(z);
6733            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_w(z) * eta_w(z) * eta_w(z))
6734                * (-cell.q(z)).exp()
6735                * INV_TWO_PI
6736        });
6737
6738        assert!((exact_hw - numeric_hw).abs() < 1e-7);
6739        assert!((exact_ahw - numeric_ahw).abs() < 2e-6);
6740        assert!((exact_bhw - numeric_bhw).abs() < 2e-6);
6741        assert!((exact_hhw - numeric_hhw).abs() < 2e-6);
6742        assert!((exact_hww - numeric_hww).abs() < 2e-6);
6743        assert!((exact_aahw - numeric_aahw).abs() < 3e-6);
6744        assert!((exact_hhww - numeric_hhww).abs() < 3e-6);
6745        assert!((exact_hhhw - numeric_hhhw).abs() < 3e-6);
6746        assert!((exact_abhw - numeric_abhw).abs() < 3e-6);
6747        assert!((exact_ahww - numeric_ahww).abs() < 3e-6);
6748        assert!((exact_bhww - numeric_bhww).abs() < 3e-6);
6749        assert!((exact_hwww - numeric_hwww).abs() < 3e-6);
6750    }
6751
6752    #[test]
6753    fn cell_moment_scratch_reuses_buffers_under_margslope_like_pressure() {
6754        let cells = [
6755            DenestedCubicCell {
6756                left: -1.2,
6757                right: -0.35,
6758                c0: 0.18,
6759                c1: 0.72,
6760                c2: -0.045,
6761                c3: 0.018,
6762            },
6763            DenestedCubicCell {
6764                left: -0.35,
6765                right: 0.48,
6766                c0: -0.08,
6767                c1: 0.91,
6768                c2: 0.038,
6769                c3: -0.014,
6770            },
6771            DenestedCubicCell {
6772                left: 0.48,
6773                right: 1.4,
6774                c0: 0.11,
6775                c1: 0.83,
6776                c2: 0.022,
6777                c3: 0.012,
6778            },
6779        ];
6780        let mut scratch = CellMomentScratch::with_capacity(MAX_AFFINE_ANCHOR_DEGREE);
6781        for cell in cells {
6782            let baseline = evaluate_cell_moments(cell, 9).expect("baseline moments");
6783            let scratch_state =
6784                evaluate_cell_moments_with_scratch(cell, 9, &mut scratch).expect("scratch moments");
6785            assert_eq!(baseline.branch, scratch_state.branch);
6786            assert!((baseline.value - scratch_state.value).abs() <= 1e-10);
6787            assert_eq!(baseline.moments.len(), scratch_state.moments.len());
6788            for (lhs, rhs) in baseline.moments.iter().zip(scratch_state.moments.iter()) {
6789                assert!((lhs - rhs).abs() <= 1e-10, "{lhs} vs {rhs}");
6790            }
6791        }
6792
6793        reset_cell_moment_test_reallocs();
6794        let mut checksum = 0.0;
6795        for i in 0..5_000 {
6796            let cell = cells[i % cells.len()];
6797            let state = evaluate_cell_moments_with_scratch(cell, 9, &mut scratch)
6798                .expect("scratch moments under repeated pressure");
6799            checksum += state.value + state.moments[0] * 1e-12;
6800        }
6801        assert!(checksum.is_finite());
6802        assert_eq!(
6803            cell_moment_test_reallocs(),
6804            0,
6805            "scratch-backed inner cell-moment calls should not grow Vec buffers"
6806        );
6807    }
6808
6809    #[test]
6810    fn evaluate_cell_moments_matches_numeric_integrals() {
6811        let cell = DenestedCubicCell {
6812            left: -0.9,
6813            right: 0.8,
6814            c0: 0.15,
6815            c1: -0.35,
6816            c2: 0.11,
6817            c3: -0.07,
6818        };
6819        let state = evaluate_cell_moments(cell, 6).expect("cell moments");
6820        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
6821            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
6822        });
6823        assert!((state.value - value_numeric).abs() < 1e-9);
6824        for degree in 0..=6 {
6825            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
6826                z.powi(degree as i32) * (-cell.q(z)).exp()
6827            });
6828            assert!((state.moments[degree] - target).abs() < 1e-9);
6829        }
6830    }
6831
6832    #[test]
6833    fn partition_builder_moves_link_preimages_with_intercept() {
6834        let score_breaks = [-2.0, -1.0, 0.0, 1.0, 2.0];
6835        let link_breaks = [-1.5, -0.5, 0.5, 1.5];
6836        let score_span = |z: f64| {
6837            let left = if z < -1.0 {
6838                -2.0
6839            } else if z < 0.0 {
6840                -1.0
6841            } else if z < 1.0 {
6842                0.0
6843            } else {
6844                1.0
6845            };
6846            Ok(LocalSpanCubic {
6847                left,
6848                right: left + 1.0,
6849                c0: 0.1,
6850                c1: 0.2,
6851                c2: 0.0,
6852                c3: 0.0,
6853            })
6854        };
6855        let link_span = |u: f64| {
6856            let left = if u < -0.5 {
6857                -1.5
6858            } else if u < 0.5 {
6859                -0.5
6860            } else {
6861                0.5
6862            };
6863            Ok(LocalSpanCubic {
6864                left,
6865                right: left + 1.0,
6866                c0: -0.05,
6867                c1: 0.1,
6868                c2: 0.0,
6869                c3: 0.0,
6870            })
6871        };
6872        let cells_a0 = build_denested_partition_cells(
6873            0.25,
6874            0.9,
6875            &score_breaks,
6876            &link_breaks,
6877            score_span,
6878            link_span,
6879        )
6880        .expect("cells a0");
6881        let cells_a1 = build_denested_partition_cells(
6882            0.55,
6883            0.9,
6884            &score_breaks,
6885            &link_breaks,
6886            score_span,
6887            link_span,
6888        )
6889        .expect("cells a1");
6890        assert!(cells_a0.len() >= score_breaks.len() - 1);
6891        assert!(
6892            cells_a0
6893                .windows(2)
6894                .all(|w| (w[0].cell.right - w[1].cell.left).abs() <= 1e-12)
6895        );
6896        assert!(
6897            cells_a0
6898                .iter()
6899                .zip(cells_a1.iter())
6900                .any(|(lhs, rhs)| (lhs.cell.left - rhs.cell.left).abs() > 1e-10)
6901        );
6902        assert!(cells_a0.first().unwrap().cell.left.is_infinite());
6903        assert!(cells_a0.last().unwrap().cell.right.is_infinite());
6904    }
6905
6906    #[test]
6907    fn partition_builder_without_breaks_returns_single_global_cell() {
6908        let cells = build_denested_partition_cells_with_tails(
6909            0.3,
6910            -0.4,
6911            &[],
6912            &[],
6913            |z| {
6914                if z.is_nan() {
6915                    return Err("probe z is NaN".to_string());
6916                }
6917                Ok(LocalSpanCubic {
6918                    left: 0.0,
6919                    right: 1.0,
6920                    c0: 0.0,
6921                    c1: 0.0,
6922                    c2: 0.0,
6923                    c3: 0.0,
6924                })
6925            },
6926            |u| {
6927                if u.is_nan() {
6928                    return Err("probe u is NaN".to_string());
6929                }
6930                Ok(LocalSpanCubic {
6931                    left: 0.0,
6932                    right: 1.0,
6933                    c0: 0.0,
6934                    c1: 0.0,
6935                    c2: 0.0,
6936                    c3: 0.0,
6937                })
6938            },
6939        )
6940        .expect("global cell");
6941        assert_eq!(cells.len(), 1);
6942        assert_eq!(cells[0].cell.left, f64::NEG_INFINITY);
6943        assert_eq!(cells[0].cell.right, f64::INFINITY);
6944        assert!(cells[0].cell.c2.abs() < 1e-12);
6945        assert!(cells[0].cell.c3.abs() < 1e-12);
6946    }
6947
6948    #[test]
6949    fn polynomial_integral_helper_matches_moment_sum() {
6950        let cell = DenestedCubicCell {
6951            left: -1.5,
6952            right: 1.25,
6953            c0: 0.2,
6954            c1: -0.4,
6955            c2: 0.15,
6956            c3: 0.03,
6957        };
6958        let state = evaluate_cell_moments(cell, 8).expect("cell moments");
6959        let coeffs = [1.5, -0.25, 0.75, 0.1];
6960        let expected = INV_TWO_PI
6961            * coeffs
6962                .iter()
6963                .enumerate()
6964                .map(|(idx, coeff)| coeff * state.moments[idx])
6965                .sum::<f64>();
6966        let got = cell_polynomial_integral_from_moments(&coeffs, &state.moments, "test poly")
6967            .expect("poly integral");
6968        assert!((got - expected).abs() < 1e-14);
6969    }
6970
6971    #[test]
6972    fn batched_cell_moment_max_degree_matches_direct_non_affine_grid() {
6973        let cells = [
6974            DenestedCubicCell {
6975                left: -2.0,
6976                right: -0.25,
6977                c0: -0.7,
6978                c1: 0.8,
6979                c2: 0.015,
6980                c3: -0.004,
6981            },
6982            DenestedCubicCell {
6983                left: -0.5,
6984                right: 0.75,
6985                c0: 0.2,
6986                c1: -0.35,
6987                c2: -0.025,
6988                c3: 0.0,
6989            },
6990            DenestedCubicCell {
6991                left: 0.1,
6992                right: 1.6,
6993                c0: 0.4,
6994                c1: 0.25,
6995                c2: 0.01,
6996                c3: 0.006,
6997            },
6998            DenestedCubicCell {
6999                left: -1.25,
7000                right: 2.25,
7001                c0: -0.1,
7002                c1: 0.55,
7003                c2: -0.012,
7004                c3: 0.003,
7005            },
7006        ];
7007        for cell in cells {
7008            let branch = branch_cell(cell).expect("branch");
7009            if branch == ExactCellBranch::Affine {
7010                continue;
7011            }
7012            let batched =
7013                evaluate_non_affine_cell_state(cell, branch, 21).expect("degree-21 state");
7014            for degree in [9usize, 15, 21] {
7015                let direct =
7016                    evaluate_non_affine_cell_state(cell, branch, degree).expect("direct state");
7017                assert_eq!(batched.branch, direct.branch);
7018                let denom = direct.value.abs().max(1.0);
7019                assert!(((batched.value - direct.value).abs() / denom) < 1e-10);
7020                for k in 0..=degree {
7021                    let denom = direct.moments[k].abs().max(1.0);
7022                    let rel = (batched.moments[k] - direct.moments[k]).abs() / denom;
7023                    assert!(
7024                        rel < 1e-10,
7025                        "cell={cell:?} degree={degree} moment={k} rel={rel:e}"
7026                    );
7027                }
7028            }
7029        }
7030    }
7031
7032    #[test]
7033    fn derivative_moment_evaluator_matches_value_evaluator_moments() {
7034        let cells = [
7035            DenestedCubicCell {
7036                left: -2.0,
7037                right: -0.4,
7038                c0: 0.15,
7039                c1: -0.8,
7040                c2: 0.0,
7041                c3: 0.0,
7042            },
7043            DenestedCubicCell {
7044                left: -0.75,
7045                right: 1.4,
7046                c0: -0.25,
7047                c1: 0.6,
7048                c2: 0.12,
7049                c3: 0.0,
7050            },
7051            DenestedCubicCell {
7052                left: -1.1,
7053                right: 0.9,
7054                c0: 0.35,
7055                c1: -0.3,
7056                c2: 0.05,
7057                c3: -0.015,
7058            },
7059        ];
7060        for cell in cells {
7061            for degree in [4usize, 9, 15, 21] {
7062                let full = evaluate_cell_moments_uncached(cell, degree).expect("full moments");
7063                let derivative = evaluate_cell_derivative_moments_uncached(cell, degree)
7064                    .expect("derivative moments");
7065                assert_eq!(full.branch, derivative.branch);
7066                assert_eq!(full.moments.len(), derivative.moments.len());
7067                for k in 0..full.moments.len() {
7068                    assert_eq!(full.moments[k].to_bits(), derivative.moments[k].to_bits());
7069                }
7070            }
7071        }
7072    }
7073
7074    #[test]
7075    fn cell_moment_lru_matches_uncached_non_affine_grid() {
7076        let cache = CellMomentLruCache::new(16 * 1024 * 1024);
7077        let stats = CellMomentCacheStats::default();
7078        let c0s = [-0.75, 0.0, 0.5];
7079        let c1s = [-1.2, 0.25, 1.1];
7080        let c2s = [-0.18, 0.07];
7081        let c3s = [0.0, 0.025];
7082        let bounds = [(-2.0, -0.5), (-0.25, 1.5)];
7083        let degrees = [4usize, 9, 15, 21];
7084        for &c0 in &c0s {
7085            for &c1 in &c1s {
7086                for &c2 in &c2s {
7087                    for &c3 in &c3s {
7088                        for &(left, right) in &bounds {
7089                            for &max_degree in &degrees {
7090                                let cell = DenestedCubicCell {
7091                                    left,
7092                                    right,
7093                                    c0,
7094                                    c1,
7095                                    c2,
7096                                    c3,
7097                                };
7098                                let branch = branch_cell(cell).expect("branch");
7099                                if branch == ExactCellBranch::Affine {
7100                                    continue;
7101                                }
7102                                let expected =
7103                                    evaluate_non_affine_cell_state(cell, branch, max_degree)
7104                                        .expect("uncached non-affine moments");
7105                                let got = evaluate_cell_moments_cached(
7106                                    cell,
7107                                    max_degree,
7108                                    &cache,
7109                                    Some(&stats),
7110                                )
7111                                .expect("cached moments");
7112                                assert_eq!(got.branch, expected.branch);
7113                                assert_eq!(got.moments.len(), max_degree + 1);
7114                                let denom = expected.value.abs().max(1.0);
7115                                assert!(
7116                                    ((got.value - expected.value).abs() / denom) < 1e-10,
7117                                    "value mismatch for {cell:?} degree {max_degree}: got {} expected {}",
7118                                    got.value,
7119                                    expected.value
7120                                );
7121                                for (idx, (&lhs, &rhs)) in
7122                                    got.moments.iter().zip(expected.moments.iter()).enumerate()
7123                                {
7124                                    let denom = rhs.abs().max(1.0);
7125                                    assert!(
7126                                        ((lhs - rhs).abs() / denom) < 1e-10,
7127                                        "moment {idx} mismatch for {cell:?} degree {max_degree}: got {lhs} expected {rhs}"
7128                                    );
7129                                }
7130                                let warm = evaluate_cell_moments_cached(
7131                                    cell,
7132                                    max_degree,
7133                                    &cache,
7134                                    Some(&stats),
7135                                )
7136                                .expect("warm cached moments");
7137                                assert_eq!(warm, got);
7138                            }
7139                        }
7140                    }
7141                }
7142            }
7143        }
7144        let (hits, misses) = stats.snapshot();
7145        assert!(hits > 0, "expected warm LRU hits");
7146        assert!(misses > 0, "expected cold LRU misses");
7147    }
7148
7149    #[test]
7150    fn cell_moment_fingerprint_exact_cache_matches_current_evaluator() {
7151        let cells = [
7152            DenestedCubicCell {
7153                left: -1.75,
7154                right: -0.25,
7155                c0: 0.15,
7156                c1: -0.35,
7157                c2: 0.08,
7158                c3: -0.015,
7159            },
7160            DenestedCubicCell {
7161                left: -0.5,
7162                right: 0.8,
7163                c0: -0.2,
7164                c1: 0.45,
7165                c2: -0.12,
7166                c3: 0.025,
7167            },
7168            DenestedCubicCell {
7169                left: 0.1,
7170                right: 1.6,
7171                c0: 0.05,
7172                c1: 0.2,
7173                c2: 0.03,
7174                c3: 0.004,
7175            },
7176        ];
7177        let mut cache = std::collections::HashMap::new();
7178        for max_degree in [0usize, 3, 4, 9, 16] {
7179            for cell in cells {
7180                let baseline = evaluate_cell_moments(cell, max_degree).expect("baseline moments");
7181                let key = cell_moment_cache_key(cell, max_degree, 0.0);
7182                let cached = cache.entry(key).or_insert_with(|| {
7183                    evaluate_cell_moments(cell, max_degree).expect("cached moments")
7184                });
7185                assert_eq!(baseline.branch, cached.branch);
7186                assert_eq!(baseline.value.to_bits(), cached.value.to_bits());
7187                assert_eq!(baseline.moments.len(), cached.moments.len());
7188                for (lhs, rhs) in baseline.moments.iter().zip(cached.moments.iter()) {
7189                    assert_eq!(lhs.to_bits(), rhs.to_bits());
7190                }
7191            }
7192        }
7193    }
7194
7195    #[test]
7196    fn fuzzy_cell_moment_fingerprint_error_scales_with_epsilon() {
7197        for epsilon in [1e-8, 1e-6] {
7198            let base = DenestedCubicCell {
7199                left: -1.25,
7200                right: 1.1,
7201                c0: 0.1,
7202                c1: -0.25,
7203                c2: 0.04,
7204                c3: -0.006,
7205            };
7206            let perturbed = DenestedCubicCell {
7207                left: base.left + 0.001 * epsilon,
7208                right: base.right - 0.001 * epsilon,
7209                c0: base.c0 + 0.001 * epsilon,
7210                c1: base.c1 - 0.001 * epsilon,
7211                c2: base.c2 + 0.001 * epsilon,
7212                c3: base.c3 - 0.001 * epsilon,
7213            };
7214            assert_eq!(
7215                cell_moment_cache_key(base, 9, epsilon),
7216                cell_moment_cache_key(perturbed, 9, epsilon)
7217            );
7218            let lhs = evaluate_cell_moments(base, 9).expect("base moments");
7219            let rhs = evaluate_cell_moments(perturbed, 9).expect("perturbed moments");
7220            let max_rel = lhs
7221                .moments
7222                .iter()
7223                .zip(rhs.moments.iter())
7224                .map(|(a, b)| (a - b).abs() / a.abs().max(b.abs()).max(1.0))
7225                .fold(0.0_f64, f64::max);
7226            assert!(
7227                max_rel <= 10.0 * epsilon,
7228                "epsilon={epsilon:.1e} max_rel={max_rel:.3e}"
7229            );
7230        }
7231    }
7232
7233    /// Locks in numerical equivalence of the optimized
7234    /// `evaluate_non_affine_cell_state` against an inline reference
7235    /// implementation that mirrors the prior pre-fold structure
7236    /// (separate `cell.eta(z)` / `cell.q(z)` calls; post-loop
7237    /// `* half_width`; trailing `value_integral * half_width / sqrt(TAU)`).
7238    /// Any drift larger than 1e-13 relative would indicate the hot-path
7239    /// rewrite changed the math.
7240    #[test]
7241    fn non_affine_cell_state_matches_prefold_reference_to_1e_minus_13() {
7242        // Reference: byte-for-byte the structure of the previous
7243        // implementation. Kept local to this test to avoid leaking a second
7244        // public surface.
7245        fn reference(
7246            cell: DenestedCubicCell,
7247            branch: ExactCellBranch,
7248            max_degree: usize,
7249        ) -> CellMomentState {
7250            let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
7251            let mut value_integral = 0.0_f64;
7252            let center = 0.5 * (cell.left + cell.right);
7253            let half_width = 0.5 * (cell.right - cell.left);
7254            for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
7255                let z = center + half_width * node;
7256                let eta = cell.eta(z);
7257                let moment_weight = weight * (-cell.q(z)).exp();
7258                let mut z_pow = 1.0_f64;
7259                for moment in &mut moments {
7260                    *moment = moment_weight.mul_add(z_pow, *moment);
7261                    z_pow *= z;
7262                }
7263                value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
7264            }
7265            for moment in &mut moments {
7266                *moment *= half_width;
7267            }
7268            CellMomentState {
7269                branch,
7270                value: value_integral * half_width / (std::f64::consts::TAU).sqrt(),
7271                moments,
7272            }
7273        }
7274
7275        // Hand-rolled inputs that cross both Quartic and Sextic branches and
7276        // exercise positive/negative coefficients, asymmetric intervals, and
7277        // a wide degree range (matches survival_marginal_slope's degree=9
7278        // production call as well as the bernoulli outer-step degree=24).
7279        let cells = [
7280            DenestedCubicCell {
7281                left: -1.25,
7282                right: -0.2,
7283                c0: -0.35,
7284                c1: 0.85,
7285                c2: 0.04,
7286                c3: -0.015,
7287            },
7288            DenestedCubicCell {
7289                left: -0.2,
7290                right: 0.55,
7291                c0: 0.12,
7292                c1: -0.65,
7293                c2: -0.025,
7294                c3: 0.02,
7295            },
7296            DenestedCubicCell {
7297                left: 0.55,
7298                right: 1.6,
7299                c0: 0.42,
7300                c1: 0.35,
7301                c2: 0.018,
7302                c3: 0.012,
7303            },
7304            DenestedCubicCell {
7305                left: -3.0,
7306                right: -1.0,
7307                c0: 1.7,
7308                c1: -0.4,
7309                c2: 0.11,
7310                c3: -0.07,
7311            },
7312        ];
7313        let degrees = [0_usize, 4, 9, 16, 24];
7314        for cell in cells {
7315            let branch = branch_cell(cell).expect("branch");
7316            assert_ne!(branch, ExactCellBranch::Affine);
7317            for max_degree in degrees {
7318                let actual = evaluate_non_affine_cell_state(cell, branch, max_degree)
7319                    .expect("optimized non-affine");
7320                let expected = reference(cell, branch, max_degree);
7321                assert_eq!(actual.branch, expected.branch);
7322                assert_eq!(actual.moments.len(), expected.moments.len());
7323                let denom_v = expected.value.abs().max(1.0);
7324                let rel_v = (actual.value - expected.value).abs() / denom_v;
7325                let actual_v = actual.value;
7326                let expected_v = expected.value;
7327                assert!(
7328                    rel_v <= 1e-13,
7329                    "value rel mismatch for {cell:?} degree {max_degree}: \
7330                     actual={actual_v:.17e} expected={expected_v:.17e} rel={rel_v:.3e}"
7331                );
7332                for (k, (lhs, rhs)) in actual
7333                    .moments
7334                    .iter()
7335                    .zip(expected.moments.iter())
7336                    .enumerate()
7337                {
7338                    let denom = rhs.abs().max(1.0);
7339                    let rel = (lhs - rhs).abs() / denom;
7340                    assert!(
7341                        rel <= 1e-13,
7342                        "moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7343                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7344                    );
7345                }
7346
7347                // Also lock in the derivative-state path on the same
7348                // inputs so the (parallel) edit there can't drift.
7349                let actual_deriv =
7350                    evaluate_non_affine_cell_derivative_state(cell, branch, max_degree)
7351                        .expect("optimized derivative");
7352                for (k, (lhs, rhs)) in actual_deriv
7353                    .moments
7354                    .iter()
7355                    .zip(expected.moments.iter())
7356                    .enumerate()
7357                {
7358                    let denom = rhs.abs().max(1.0);
7359                    let rel = (lhs - rhs).abs() / denom;
7360                    assert!(
7361                        rel <= 1e-13,
7362                        "deriv moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7363                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7364                    );
7365                }
7366            }
7367        }
7368    }
7369
7370    /// DECISIVE: the third-derivative kernel must equal the FD of the
7371    /// second-derivative kernel w.r.t. a parameter that perturbs `eta`,
7372    /// RE-EVALUATING the moments at each step (the moments depend on `eta`
7373    /// via the `exp(-q)` weight). This isolates the kernel from all survival
7374    /// partition/cross machinery (gam#979 f_uv_dir localization).
7375    #[test]
7376    fn third_derivative_kernel_matches_fd_of_second_with_eta_perturbation() {
7377        // A finite, non-affine cell.
7378        let base = DenestedCubicCell {
7379            left: -0.6,
7380            right: 0.9,
7381            c0: 0.30,
7382            c1: 0.45,
7383            c2: -0.20,
7384            c3: 0.12,
7385        };
7386        // Synthetic parameter directions as cubic-in-z perturbations of eta:
7387        //   eta_u = ∂eta/∂u, eta_v = ∂eta/∂v, eta_t = ∂eta/∂t (the dir).
7388        let eta_u = [0.11_f64, -0.07, 0.05, 0.02];
7389        let eta_v = [-0.09_f64, 0.13, -0.04, 0.03];
7390        let eta_t = [0.17_f64, 0.06, -0.10, 0.04]; // the "b-like" direction
7391        // Second crosses ∂²eta/∂{·}{·} (pick small non-zero cubics).
7392        let eta_uv = [0.02_f64, 0.01, -0.015, 0.005];
7393        let eta_ut = [-0.01_f64, 0.02, 0.007, -0.003];
7394        let eta_vt = [0.015_f64, -0.008, 0.01, 0.004];
7395        // Third cross ∂³eta/∂u∂v∂t.
7396        let eta_uvt = [0.003_f64, -0.002, 0.001, 0.0005];
7397
7398        let neg = |a: &[f64; 4]| a.map(|v| -v);
7399        let max_degree = 15usize;
7400
7401        // f_uv(s) where param s shifts eta by s·(eta_t + ½ s²... ) — here we
7402        // build the cell at eta + s·eta_t + s²·eta_vt-style is NOT needed; we
7403        // only need the t-direction to first order for ∂/∂t. To FD ∂(f_uv)/∂t
7404        // we perturb eta along eta_t AND carry the s-dependence of the u,v
7405        // crosses: eta_u(s)=eta_u + s·eta_ut, eta_v(s)=eta_v + s·eta_vt,
7406        // eta_uv(s)=eta_uv + s·eta_uvt. The cell cubic shifts by s·eta_t.
7407        let f_uv_at = |s: f64| -> f64 {
7408            let cell_s = DenestedCubicCell {
7409                c0: base.c0 + s * eta_t[0],
7410                c1: base.c1 + s * eta_t[1],
7411                c2: base.c2 + s * eta_t[2],
7412                c3: base.c3 + s * eta_t[3],
7413                ..base
7414            };
7415            // Moments MUST be recomputed at the perturbed eta.
7416            let st = evaluate_cell_moments(cell_s, max_degree).unwrap();
7417            let neg_cell = DenestedCubicCell {
7418                c0: -cell_s.c0,
7419                c1: -cell_s.c1,
7420                c2: -cell_s.c2,
7421                c3: -cell_s.c3,
7422                ..cell_s
7423            };
7424            let u_s = [
7425                eta_u[0] + s * eta_ut[0],
7426                eta_u[1] + s * eta_ut[1],
7427                eta_u[2] + s * eta_ut[2],
7428                eta_u[3] + s * eta_ut[3],
7429            ];
7430            let v_s = [
7431                eta_v[0] + s * eta_vt[0],
7432                eta_v[1] + s * eta_vt[1],
7433                eta_v[2] + s * eta_vt[2],
7434                eta_v[3] + s * eta_vt[3],
7435            ];
7436            let uv_s = [
7437                eta_uv[0] + s * eta_uvt[0],
7438                eta_uv[1] + s * eta_uvt[1],
7439                eta_uv[2] + s * eta_uvt[2],
7440                eta_uv[3] + s * eta_uvt[3],
7441            ];
7442            cell_second_derivative_from_moments(
7443                neg_cell,
7444                &neg(&u_s),
7445                &neg(&v_s),
7446                &neg(&uv_s),
7447                &st.moments,
7448            )
7449            .unwrap()
7450        };
7451
7452        let h = 1e-5;
7453        let fd = (f_uv_at(h) - f_uv_at(-h)) / (2.0 * h);
7454
7455        // Analytic third via the kernel (negated cell + negated crosses, as the
7456        // survival path does).
7457        let st0 = evaluate_cell_moments(base, max_degree).unwrap();
7458        let neg_cell0 = DenestedCubicCell {
7459            c0: -base.c0,
7460            c1: -base.c1,
7461            c2: -base.c2,
7462            c3: -base.c3,
7463            ..base
7464        };
7465        let analytic = cell_third_derivative_from_moments(
7466            neg_cell0,
7467            &neg(&eta_u),
7468            &neg(&eta_v),
7469            &neg(&eta_t),
7470            &neg(&eta_uv),
7471            &neg(&eta_ut),
7472            &neg(&eta_vt),
7473            &neg(&eta_uvt),
7474            &st0.moments,
7475        )
7476        .unwrap();
7477
7478        let denom = fd.abs().max(1e-3);
7479        let rel = (analytic - fd).abs() / denom;
7480        assert!(
7481            rel <= 1e-5,
7482            "third kernel vs FD-of-second mismatch: analytic={analytic:.12e} fd={fd:.12e} rel={rel:.3e}"
7483        );
7484    }
7485
7486    #[test]
7487    fn moving_shared_edge_second_integral_derivative_has_leibniz_jump_sign() {
7488        let edge0 = 0.2_f64;
7489        let edge_velocity = -0.37_f64;
7490
7491        let left_eta = [0.22_f64, -0.18, 0.09, 0.03];
7492        let right_eta = [-0.11_f64, 0.26, -0.04, 0.02];
7493        let left_r = [0.08_f64, -0.05, 0.03, 0.01];
7494        let left_s = [-0.06_f64, 0.04, 0.02, -0.015];
7495        let left_rs = [0.025_f64, -0.012, 0.006, 0.004];
7496        let right_r = [-0.03_f64, 0.07, -0.02, 0.012];
7497        let right_s = [0.05_f64, -0.025, 0.018, 0.007];
7498        let right_rs = [-0.018_f64, 0.014, -0.005, 0.003];
7499
7500        let integral_at = |shift: f64| -> f64 {
7501            let edge = edge0 + edge_velocity * shift;
7502            let left = DenestedCubicCell {
7503                left: -0.7,
7504                right: edge,
7505                c0: left_eta[0],
7506                c1: left_eta[1],
7507                c2: left_eta[2],
7508                c3: left_eta[3],
7509            };
7510            let right = DenestedCubicCell {
7511                left: edge,
7512                right: 1.1,
7513                c0: right_eta[0],
7514                c1: right_eta[1],
7515                c2: right_eta[2],
7516                c3: right_eta[3],
7517            };
7518            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7519            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7520            cell_second_derivative_from_moments(
7521                left,
7522                &left_r,
7523                &left_s,
7524                &left_rs,
7525                &left_state.moments,
7526            )
7527            .expect("left second")
7528                + cell_second_derivative_from_moments(
7529                    right,
7530                    &right_r,
7531                    &right_s,
7532                    &right_rs,
7533                    &right_state.moments,
7534                )
7535                .expect("right second")
7536        };
7537
7538        let h = 1e-5;
7539        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7540
7541        let left = DenestedCubicCell {
7542            left: -0.7,
7543            right: edge0,
7544            c0: left_eta[0],
7545            c1: left_eta[1],
7546            c2: left_eta[2],
7547            c3: left_eta[3],
7548        };
7549        let right = DenestedCubicCell {
7550            left: edge0,
7551            right: 1.1,
7552            c0: right_eta[0],
7553            c1: right_eta[1],
7554            c2: right_eta[2],
7555            c3: right_eta[3],
7556        };
7557        let f_left =
7558            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7559        let f_right =
7560            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7561        let analytic = edge_velocity * (f_left - f_right);
7562
7563        let denom = analytic.abs().max(1e-8);
7564        let rel = (fd - analytic).abs() / denom;
7565        assert!(
7566            rel <= 5e-8,
7567            "moving edge sign mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7568        );
7569    }
7570
7571    #[test]
7572    fn moving_shared_edge_second_integral_mixed_derivative_has_full_leibniz_terms() {
7573        let edge0 = -0.15_f64;
7574        let edge_d1 = 0.31_f64;
7575        let edge_d2 = -0.27_f64;
7576        let edge_d12 = 0.19_f64;
7577
7578        let left_eta = [0.16_f64, -0.21, 0.07, -0.025];
7579        let right_eta = [-0.09_f64, 0.18, -0.055, 0.018];
7580        let left_r = [0.075_f64, -0.045, 0.018, 0.009];
7581        let left_s = [-0.052_f64, 0.033, 0.014, -0.011];
7582        let left_rs = [0.021_f64, -0.009, 0.005, 0.0025];
7583        let right_r = [-0.028_f64, 0.063, -0.017, 0.010];
7584        let right_s = [0.047_f64, -0.023, 0.016, 0.006];
7585        let right_rs = [-0.015_f64, 0.012, -0.004, 0.002];
7586
7587        let integral_at = |s1: f64, s2: f64| -> f64 {
7588            let edge = edge0 + edge_d1 * s1 + edge_d2 * s2 + edge_d12 * s1 * s2;
7589            let left = DenestedCubicCell {
7590                left: -0.8,
7591                right: edge,
7592                c0: left_eta[0],
7593                c1: left_eta[1],
7594                c2: left_eta[2],
7595                c3: left_eta[3],
7596            };
7597            let right = DenestedCubicCell {
7598                left: edge,
7599                right: 0.9,
7600                c0: right_eta[0],
7601                c1: right_eta[1],
7602                c2: right_eta[2],
7603                c3: right_eta[3],
7604            };
7605            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7606            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7607            cell_second_derivative_from_moments(
7608                left,
7609                &left_r,
7610                &left_s,
7611                &left_rs,
7612                &left_state.moments,
7613            )
7614            .expect("left second")
7615                + cell_second_derivative_from_moments(
7616                    right,
7617                    &right_r,
7618                    &right_s,
7619                    &right_rs,
7620                    &right_state.moments,
7621                )
7622                .expect("right second")
7623        };
7624
7625        let h = 2e-4;
7626        let fd = (integral_at(h, h) - integral_at(h, -h) - integral_at(-h, h)
7627            + integral_at(-h, -h))
7628            / (4.0 * h * h);
7629
7630        let left = DenestedCubicCell {
7631            left: -0.8,
7632            right: edge0,
7633            c0: left_eta[0],
7634            c1: left_eta[1],
7635            c2: left_eta[2],
7636            c3: left_eta[3],
7637        };
7638        let right = DenestedCubicCell {
7639            left: edge0,
7640            right: 0.9,
7641            c0: right_eta[0],
7642            c1: right_eta[1],
7643            c2: right_eta[2],
7644            c3: right_eta[3],
7645        };
7646
7647        let boundary_z_derivative =
7648            |cell: DenestedCubicCell, r: &[f64], s: &[f64], rs: &[f64]| -> f64 {
7649                let eta = cell.eta(edge0);
7650                let eta_z = cell.c1 + 2.0 * cell.c2 * edge0 + 3.0 * cell.c3 * edge0 * edge0;
7651                let cr = poly_eval_at(r, edge0);
7652                let cs = poly_eval_at(s, edge0);
7653                let crs = poly_eval_at(rs, edge0);
7654                let cr_z = r.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7655                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7656                });
7657                let cs_z = s.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7658                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7659                });
7660                let crs_z = rs.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7661                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7662                });
7663                let amp = crs - eta * cr * cs;
7664                let amp_z = crs_z - eta_z * cr * cs - eta * cr_z * cs - eta * cr * cs_z;
7665                let q_z = edge0 + eta * eta_z;
7666                (amp_z - amp * q_z) * (-cell.q(edge0)).exp() * INV_TWO_PI
7667            };
7668
7669        let f_left =
7670            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7671        let f_right =
7672            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7673        let fz_left = boundary_z_derivative(left, &left_r, &left_s, &left_rs);
7674        let fz_right = boundary_z_derivative(right, &right_r, &right_s, &right_rs);
7675        let analytic = edge_d12 * (f_left - f_right) + edge_d1 * edge_d2 * (fz_left - fz_right);
7676
7677        let denom = analytic.abs().max(1e-8);
7678        let rel = (fd - analytic).abs() / denom;
7679        assert!(
7680            rel <= 2e-7,
7681            "moving edge mixed term mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7682        );
7683    }
7684
7685    // gam#1454 resolution. The reported defect ("survival flex directional
7686    // third[g,w0] wrong: candidate f_au_dir/f_aa_dir missing self-flux") posited
7687    // a MISSING third-order Leibniz self-flux at the moving link-knot crossings.
7688    // This regression establishes the two facts that, together, prove the
7689    // implicit-intercept third-order tower
7690    // (`row_primary_third_contracted_recompute*`) is CORRECT to add no such flux:
7691    //
7692    //   (1) The third-derivative integrand `F_rst` genuinely DOES jump across a
7693    //       C²-link knot — its third coefficient slice carries `c_rst ∝ 6·α₃`,
7694    //       and `α₃` (the spline's third `z`-derivative) is the one piece a C²
7695    //       cubic spline leaves discontinuous. So the jump is real and the
7696    //       `cell_third_derivative_boundary_integrand` flux formula is exact
7697    //       (verified by FD of a direct ∂/∂edge of the third-integral sum —
7698    //       a FOURTH-order scenario that pins the integrand, not the tower).
7699    //
7700    //   (2) Every boundary term in the Leibniz expansion of a THIRD derivative,
7701    //       however, evaluates an integrand of order ≤ 2 at the moving edge
7702    //       (one of the three differentiations is spent moving the boundary).
7703    //       The second-derivative integrand `F_rs` is CONTINUOUS across the same
7704    //       C² knot (its slices reach at most `α₂ + 3α₃·shift`, i.e. ½·η''(u*),
7705    //       which a C² spline keeps continuous). Hence the shared-edge flux
7706    //       `velocity·(F_rs^L − F_rs^R)` telescopes to ZERO, and the tower's
7707    //       third-order self-flux is a genuine no-op. The real residual lives in
7708    //       the interior implicit-intercept assembly, not at the boundary.
7709    #[test]
7710    fn third_order_self_flux_telescopes_but_third_integrand_jumps_at_c2_knot_1454() {
7711        let edge0 = 0.13_f64;
7712        let edge_velocity = -0.41_f64;
7713
7714        // Build η continuous to C² at edge0 but with a jump in the cubic (3rd
7715        // derivative) coefficient. Pick the left cubic freely; choose the right
7716        // cubic to match value+1st+2nd derivative at edge0, then perturb its c3.
7717        let left_eta = [0.18_f64, -0.12, 0.07, 0.04];
7718        let right_c3 = 0.04_f64 + 0.09; // α₃ jump across the knot.
7719        // Match η, η', η'' at edge0 for the right piece given its c3:
7720        //   η(z)  = c0 + c1 z + c2 z² + c3 z³
7721        //   η'(z) = c1 + 2 c2 z + 3 c3 z²
7722        //   η''(z)= 2 c2 + 6 c3 z
7723        // Solve right (c0,c1,c2) so the three values equal the left ones at edge0.
7724        let l0 = left_eta[0];
7725        let l1 = left_eta[1];
7726        let l2 = left_eta[2];
7727        let l3 = left_eta[3];
7728        let e = edge0;
7729        let eta_val = l0 + l1 * e + l2 * e * e + l3 * e * e * e;
7730        let eta_d1 = l1 + 2.0 * l2 * e + 3.0 * l3 * e * e;
7731        let eta_d2 = 2.0 * l2 + 6.0 * l3 * e;
7732        let rc2 = (eta_d2 - 6.0 * right_c3 * e) / 2.0;
7733        let rc1 = eta_d1 - 2.0 * rc2 * e - 3.0 * right_c3 * e * e;
7734        let rc0 = eta_val - rc1 * e - rc2 * e * e - right_c3 * e * e * e;
7735        let right_eta = [rc0, rc1, rc2, right_c3];
7736
7737        // Coefficient slices. The first/second slices we keep continuous at the
7738        // edge (mimicking c_r=1+η', c_rs∝η'' which a C² spline matches), so the
7739        // 2nd-order flux would cancel. The third-order slice `rst` carries the
7740        // jumping α₃ and is DIFFERENT across the edge — this is the term that
7741        // breaks cancellation.
7742        let common_r = [0.06_f64, -0.04, 0.02, 0.0];
7743        let common_s = [-0.05_f64, 0.03, 0.015, 0.0];
7744        let common_t = [0.08_f64, 0.05, -0.03, 0.0];
7745        let common_rs = [0.02_f64, -0.01, 0.005, 0.0];
7746        let common_rt = [-0.012_f64, 0.008, 0.004, 0.0];
7747        let common_st = [0.015_f64, -0.006, 0.003, 0.0];
7748        // rst ∝ 6·α₃ in the real path: left and right differ by the α₃ jump.
7749        let left_rst = [6.0 * l3, 0.0, 0.0, 0.0];
7750        let right_rst = [6.0 * right_c3, 0.0, 0.0, 0.0];
7751
7752        let max_degree = 15usize;
7753        let neg = |a: &[f64; 4]| a.map(|v| -v);
7754
7755        // The integral sum over the two cells sharing the moving edge, computed
7756        // via the fixed-domain moment reduction with the SURVIVAL/probit sign
7757        // convention (negated cell + negated coefficient slices), exactly as the
7758        // production `row_primary_third_contracted_recompute` path does.
7759        let integral_at = |shift: f64| -> f64 {
7760            let edge = edge0 + edge_velocity * shift;
7761            let left = DenestedCubicCell {
7762                left: -0.7,
7763                right: edge,
7764                c0: left_eta[0],
7765                c1: left_eta[1],
7766                c2: left_eta[2],
7767                c3: left_eta[3],
7768            };
7769            let right = DenestedCubicCell {
7770                left: edge,
7771                right: 1.0,
7772                c0: right_eta[0],
7773                c1: right_eta[1],
7774                c2: right_eta[2],
7775                c3: right_eta[3],
7776            };
7777            let lst = evaluate_cell_moments(left, max_degree).unwrap();
7778            let rst_m = evaluate_cell_moments(right, max_degree).unwrap();
7779            let neg_left = DenestedCubicCell {
7780                c0: -left.c0,
7781                c1: -left.c1,
7782                c2: -left.c2,
7783                c3: -left.c3,
7784                ..left
7785            };
7786            let neg_right = DenestedCubicCell {
7787                c0: -right.c0,
7788                c1: -right.c1,
7789                c2: -right.c2,
7790                c3: -right.c3,
7791                ..right
7792            };
7793            let li = cell_third_derivative_from_moments(
7794                neg_left,
7795                &neg(&common_r),
7796                &neg(&common_s),
7797                &neg(&common_t),
7798                &neg(&common_rs),
7799                &neg(&common_rt),
7800                &neg(&common_st),
7801                &neg(&left_rst),
7802                &lst.moments,
7803            )
7804            .unwrap();
7805            let ri = cell_third_derivative_from_moments(
7806                neg_right,
7807                &neg(&common_r),
7808                &neg(&common_s),
7809                &neg(&common_t),
7810                &neg(&common_rs),
7811                &neg(&common_rt),
7812                &neg(&common_st),
7813                &neg(&right_rst),
7814                &rst_m.moments,
7815            )
7816            .unwrap();
7817            li + ri
7818        };
7819
7820        let h = 1e-5;
7821        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7822
7823        // Fixed-domain part: differentiate ONLY the integrands (domain frozen at
7824        // edge0). Its directional derivative is the analytic Leibniz flux alone,
7825        // since the integrand coefficients here are edge-independent:
7826        //   flux = velocity · ( F_rst^L(edge0) − F_rst^R(edge0) ).
7827        //
7828        // CONVENTION: the finite-difference `integral_at` above integrates the
7829        // SURVIVAL/probit sign convention — negated cell (η→−η) AND negated
7830        // coefficient slices — exactly as the production
7831        // `row_primary_third_contracted_recompute` path does. The Leibniz
7832        // boundary integrand must therefore be evaluated in that SAME negated
7833        // convention: the third-derivative integrand is ODD under the joint
7834        // (η→−η, coeff→−coeff) negation (its `rst`, `η·rs·t`, and `(η²−1)·r·s·t`
7835        // terms each flip sign an odd number of times), so evaluating the flux
7836        // with un-negated cells/coeffs yields exactly the opposite sign and the
7837        // Leibniz identity `fd = flux` fails as `fd = −flux`. (The
7838        // second-derivative sibling test `moving_shared_edge_second_integral_
7839        // derivative_has_leibniz_jump_sign` keeps BOTH sides un-negated and so
7840        // stays self-consistent; this test keeps BOTH sides negated.)
7841        let neg_eta = |eta: &[f64; 4]| [-eta[0], -eta[1], -eta[2], -eta[3]];
7842        let left_eta_neg = neg_eta(&left_eta);
7843        let right_eta_neg = neg_eta(&right_eta);
7844        let left0 = DenestedCubicCell {
7845            left: -0.7,
7846            right: edge0,
7847            c0: left_eta_neg[0],
7848            c1: left_eta_neg[1],
7849            c2: left_eta_neg[2],
7850            c3: left_eta_neg[3],
7851        };
7852        let right0 = DenestedCubicCell {
7853            left: edge0,
7854            right: 1.0,
7855            c0: right_eta_neg[0],
7856            c1: right_eta_neg[1],
7857            c2: right_eta_neg[2],
7858            c3: right_eta_neg[3],
7859        };
7860        let f_left = cell_third_derivative_boundary_integrand(
7861            left0,
7862            &neg(&common_r),
7863            &neg(&common_s),
7864            &neg(&common_t),
7865            &neg(&common_rs),
7866            &neg(&common_rt),
7867            &neg(&common_st),
7868            &neg(&left_rst),
7869            edge0,
7870        );
7871        let f_right = cell_third_derivative_boundary_integrand(
7872            right0,
7873            &neg(&common_r),
7874            &neg(&common_s),
7875            &neg(&common_t),
7876            &neg(&common_rs),
7877            &neg(&common_rt),
7878            &neg(&common_st),
7879            &neg(&right_rst),
7880            edge0,
7881        );
7882
7883        // The integrand DOES jump across this C² knot (the α₃ third-coefficient
7884        // term is the only discontinuous piece). Confirm the jump is genuine —
7885        // if it were zero the flux would be a no-op and #1454 would not exist.
7886        let jump = f_left - f_right;
7887        assert!(
7888            jump.abs() > 1e-4,
7889            "third-derivative integrand must jump across the C² knot (α₃ discontinuity); \
7890             got jump={jump:.3e}"
7891        );
7892
7893        let analytic_flux = edge_velocity * jump;
7894        let denom = fd.abs().max(1e-6);
7895        let rel = (fd - analytic_flux).abs() / denom;
7896        assert!(
7897            rel <= 1e-5,
7898            "moving-edge third-derivative flux mismatch (#1454): fd={fd:.12e} \
7899             analytic_flux={analytic_flux:.12e} rel={rel:.3e}"
7900        );
7901
7902        // ---- Fact (2): the SECOND-derivative integrand telescopes to zero. ----
7903        // A 3rd-derivative Leibniz boundary term spends one differentiation on
7904        // the moving edge and evaluates a ≤2nd-order integrand there. The
7905        // hardest such term is the slope-slope Hessian integrand `F_bb`, whose
7906        // coefficient slice is the link cubic's b-b partial
7907        //   dc_dbb(z) = [0, 0, 2(α₂ + 3 α₃·shift), 6 α₃·b]·(z⁰..z³)
7908        //             = z²·η''(u),  with u = a + b·z, shift = a − knot.
7909        // Across a C² knot α₂, α₃, and `shift` all jump, yet η''(u*) is
7910        // continuous — so the EVALUATED slice `c_bb(z*) = z*²·η''(u*)` matches on
7911        // both sides and `F_bb` is continuous. Build the two pieces' raw dc_dbb
7912        // decompositions from `link_cubic_second_partials` and confirm the
7913        // second-derivative integrand carries no jump (flux telescopes to 0).
7914        let a_row = 0.21_f64;
7915        let b_row = 1.37_f64;
7916        let knot = a_row + b_row * edge0; // u-location of the crossing.
7917        // Left/right link pieces: choose α₂,α₃ freely on the left; pick the
7918        // right piece's α₂ so η''(knot) is continuous given a jumped α₃.
7919        let left_link = LocalSpanCubic {
7920            left: knot - 0.6,
7921            right: knot + 0.6,
7922            c0: 0.0,
7923            c1: 0.0,
7924            c2: 0.08,
7925            c3: -0.05,
7926        };
7927        let right_alpha3 = -0.05_f64 + 0.11; // α₃ jump.
7928        // η''(knot) continuity:  2α₂ᴸ + 6α₃ᴸ·(knot−leftᴸ) = 2α₂ᴿ + 6α₃ᴿ·(knot−leftᴿ).
7929        let right_left_coord = knot - 0.4;
7930        let lhs = 2.0 * left_link.c2 + 6.0 * left_link.c3 * (knot - left_link.left);
7931        let right_alpha2 = (lhs - 6.0 * right_alpha3 * (knot - right_left_coord)) / 2.0;
7932        let right_link = LocalSpanCubic {
7933            left: right_left_coord,
7934            right: right_left_coord + 0.8,
7935            c0: 0.0,
7936            c1: 0.0,
7937            c2: right_alpha2,
7938            c3: right_alpha3,
7939        };
7940        let (_, _, dc_dbb_left) = link_cubic_second_partials(left_link, a_row, b_row);
7941        let (_, _, dc_dbb_right) = link_cubic_second_partials(right_link, a_row, b_row);
7942        // The per-coefficient arrays differ (α₃ jumped)...
7943        assert!(
7944            (dc_dbb_left[3] - dc_dbb_right[3]).abs() > 1e-3,
7945            "α₃ jump must make the raw dc_dbb coefficient arrays differ"
7946        );
7947        // ...but the EVALUATED second-order slice at the crossing matches, so the
7948        // F_bb boundary integrand carries no jump and the flux telescopes to 0.
7949        let c_bb_left = poly_eval_at(&dc_dbb_left, edge0);
7950        let c_bb_right = poly_eval_at(&dc_dbb_right, edge0);
7951        assert!(
7952            (c_bb_left - c_bb_right).abs() <= 1e-12,
7953            "second-derivative slope-slope integrand must be CONTINUOUS across the \
7954             C² knot (telescoping self-flux): left={c_bb_left:.15e} right={c_bb_right:.15e}"
7955        );
7956    }
7957}
gam_model_kernels/cubic_cell_kernel.rs

gam_model_kernels/
cubic_cell_kernel.rs