gam_model_kernels/
cubic_cell_kernel.rs

1use gam_math::probability::normal_cdf;
2use gam_runtime::resource::{ByteLruCache, ResidentBytes};
3use smallvec::{SmallVec, smallvec};
4use std::hash::{Hash, Hasher};
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7
/// Error type for the de-nested cubic transport kernel.
///
/// The sibling families (`bernoulli_marginal_slope`, `survival_marginal_slope`,
/// `marginal_slope_shared`) still call the kernel's public surface through
/// `Result<_, String>`. For source compatibility the public functions keep
/// that signature, and the typed error is turned into a `String` at the
/// boundary through `From<CubicCellKernelError> for String`. The `Display`
/// output is byte-for-byte the same as the earlier `format!(...)` strings.
#[derive(Debug, Clone)]
pub enum CubicCellKernelError {
    /// An interval-probe or cell-bounds precondition was violated: the bounds
    /// must be ordered, use a supported pattern of infinities, and (for finite
    /// cells) span a positive finite width.
    InvalidInterval { reason: String },
    /// The cell shape or its branch classification is unusable: a tail cell
    /// that is not affine, a finite cell with non-positive width, non-finite
    /// affine coefficients, a non-affine cell with infinite bounds, a
    /// degenerate leading coefficient in the moment recurrence, and similar.
    InvalidCellShape { reason: String },
    /// The reduced moment vector (or the polynomial-convolution scratch
    /// buffer) holds fewer entries than the polynomial degree the leaf has to
    /// evaluate.
    InsufficientMoments { reason: String },
    /// Domain validation for the bivariate-normal CDF failed: an argument that
    /// is neither finite nor infinite, or a non-finite correlation.
    BivariateNormalDomain { reason: String },
}
33
// Expands the shared boilerplate for the four reason-carrying variants of
// `CubicCellKernelError` listed below.
// NOTE(review): `impl_reason_error_boilerplate!` is defined outside this
// chunk; judging by the enum's documentation it presumably supplies the
// `Display` impl and the `From<CubicCellKernelError> for String` conversion —
// confirm at the macro definition. Every variant of the enum must be listed
// here for those impls to cover it (TODO confirm against the macro).
impl_reason_error_boilerplate! {
    CubicCellKernelError {
        InvalidInterval,
        InvalidCellShape,
        InsufficientMoments,
        BivariateNormalDomain,
    }
}
42
43impl CubicCellKernelError {
44    #[inline]
45    fn invalid_interval(reason: impl Into<String>) -> Self {
46        CubicCellKernelError::InvalidInterval {
47            reason: reason.into(),
48        }
49    }
50    #[inline]
51    fn invalid_cell_shape(reason: impl Into<String>) -> Self {
52        CubicCellKernelError::InvalidCellShape {
53            reason: reason.into(),
54        }
55    }
56    #[inline]
57    fn insufficient_moments(reason: impl Into<String>) -> Self {
58        CubicCellKernelError::InsufficientMoments {
59            reason: reason.into(),
60        }
61    }
62    #[inline]
63    fn bivariate_normal_domain(reason: impl Into<String>) -> Self {
64        CubicCellKernelError::BivariateNormalDomain {
65            reason: reason.into(),
66        }
67    }
68}
69
70// De-nested cubic transport kernel.
71//
72// This module implements the de-nested flexible-link/score-warp model
73//
74//   eta(z) = a + b*z + b*delta_h(z) + delta_w(a + b*z)
75//
76// where delta_h is the score warp and delta_w is the link deviation.
77// This is not the literal nested composition L(a + b*H(z)); it is an
78// additive-correction model around the affine core a + b*z.
79//
// On each partition cell, both deviations are cubic polynomials, so eta is
// at most cubic in z and q(z) = 0.5*(z^2 + eta^2) is at most sextic (degree 6).
82// The integral of exp(-q(z)) is evaluated by transporting from the affine
83// anchor (c2=c3=0, where q is Gaussian and the integral reduces to BVN)
84// to the target non-affine cell via the polynomial moment recurrence.
85//
86// The partition covers (-∞, +∞) with:
87//   • two semi-infinite affine TAIL cells (outside all deviation support),
88//   • finitely many interior cells (each a sextic microcell).
89// Because tail cells have constant deviations (c2=c3=0), their bounds
90// are parameter-independent, so no Leibniz boundary-motion corrections
91// appear in the derivatives.
92//
93// Shared by bernoulli_marginal_slope and survival_marginal_slope families.
94
/// A cubic polynomial attached to the span `[left, right]`, expressed in the
/// local coordinate `t = x - left`:
///
/// `p(x) = c0 + c1*t + c2*t^2 + c3*t^3`.
///
/// The evaluation methods do not clamp `x` to the span and never read
/// `right`; callers are responsible for choosing the right span.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LocalSpanCubic {
    /// Left end of the span; also the expansion point of the polynomial.
    pub left: f64,
    /// Right end of the span.
    pub right: f64,
    /// Constant coefficient (value at `x == left`).
    pub c0: f64,
    /// Coefficient of `t`.
    pub c1: f64,
    /// Coefficient of `t^2`.
    pub c2: f64,
    /// Coefficient of `t^3`.
    pub c3: f64,
}

impl LocalSpanCubic {
    /// Value of the cubic at `x`.
    #[inline]
    pub fn evaluate(self, x: f64) -> f64 {
        let dx = x - self.left;
        // Terms are formed and summed in ascending-power order so the
        // floating-point rounding matches the plain expanded expression.
        let linear = self.c1 * dx;
        let quadratic = self.c2 * dx * dx;
        let cubic = self.c3 * dx * dx * dx;
        self.c0 + linear + quadratic + cubic
    }

    /// First derivative of the cubic with respect to `x`, evaluated at `x`.
    #[inline]
    pub fn first_derivative(self, x: f64) -> f64 {
        let dx = x - self.left;
        let from_quadratic = 2.0 * self.c2 * dx;
        let from_cubic = 3.0 * self.c3 * dx * dx;
        self.c1 + from_quadratic + from_cubic
    }

    /// Second derivative of the cubic with respect to `x`, evaluated at `x`.
    #[inline]
    pub fn second_derivative(self, x: f64) -> f64 {
        let dx = x - self.left;
        let constant_part = 2.0 * self.c2;
        let linear_part = 6.0 * self.c3 * dx;
        constant_part + linear_part
    }
}
124
/// String tag naming this kernel ("DenestedCubicTransport").
///
/// NOTE(review): the consumers of this tag are outside this chunk — confirm
/// who compares against it before changing the value.
pub const ANCHORED_DEVIATION_KERNEL: &str = "DenestedCubicTransport";
/// Default normalized non-affine branch tolerance used by [`branch_cell`].
///
/// Keep this cutoff explicit and hill-climbable: the large-scale cycle-0
/// sweep evaluated `{1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3}` against the
/// legacy transport path.  The more aggressive candidates require an
/// end-to-end beta acceptance run before promotion; the default therefore
/// remains the legacy `1e-10` value to preserve bit-for-bit model behavior.
pub const NORMALIZED_CELL_BRANCH_TOL: f64 = 1e-10;

/// Reciprocal of 2π, computed as `1 / TAU`.
const INV_TWO_PI: f64 = 1.0 / std::f64::consts::TAU;
136
/// 384-point Gauss–Legendre nodes, re-exported for the GPU cubic-cell kernel
/// (`src/gpu/cubic_cell/kernel_src.rs`) to embed as `__constant__` device
/// memory. Linux-only because the kernel emitter is Linux-only.
///
/// This is a reference to the module-private `GL_NODES` table, so both names
/// denote the same values.
#[cfg(target_os = "linux")]
pub const GL_NODES_FOR_GPU_KERNEL: &[f64; 384] = &GL_NODES;
/// Companion weights to [`GL_NODES_FOR_GPU_KERNEL`].
///
/// This is a reference to the module-private `GL_WEIGHTS` table; entry `i`
/// pairs with node `i`.
#[cfg(target_os = "linux")]
pub const GL_WEIGHTS_FOR_GPU_KERNEL: &[f64; 384] = &GL_WEIGHTS;
145
146const GL_NODES: [f64; 384] = [
147    -9.999_804_411_726_474e-1,
148    -9.998_969_471_378_596e-1,
149    -9.997_467_408_113_523e-1,
150    -9.995_297_988_558_859e-1,
151    -9.992_461_316_671_845e-1,
152    -9.988_957_572_063_257e-1,
153    -9.984_786_985_384_589e-1,
154    -9.979_949_833_727_938e-1,
155    -9.974_446_439_389_107e-1,
156    -9.968_277_169_440_913e-1,
157    -9.961_442_435_551_087e-1,
158    -9.953_942_693_885_953e-1,
159    -9.945_778_445_047_068e-1,
160    -9.936_950_234_020_883e-1,
161    -9.927_458_650_133_153e-1,
162    -9.917_304_327_004_32e-1,
163    -9.906_487_942_504_061e-1,
164    -9.895_010_218_704_087e-1,
165    -9.882_871_921_828_699e-1,
166    -9.870_073_862_202_815e-1,
167    -9.856_616_894_197_333e-1,
168    -9.842_501_916_171_713e-1,
169    -9.827_729_870_413_743e-1,
170    -9.812_301_743_076_443e-1,
171    -9.796_218_564_112_101e-1,
172    -9.779_481_407_203_411e-1,
173    -9.762_091_389_691_724e-1,
174    -9.744_049_672_502_397e-1,
175    -9.725_357_460_067_257e-1,
176    -9.706_016_000_244_151e-1,
177    -9.686_026_584_233_628e-1,
178    -9.665_390_546_492_71e-1,
179    -9.644_109_264_645_802e-1,
180    -9.622_184_159_392_698e-1,
181    -9.599_616_694_413_742e-1,
182    -9.576_408_376_272_095e-1,
183    -9.552_560_754_313_16e-1,
184    -9.528_075_420_561_144e-1,
185    -9.502_954_009_612_771e-1,
186    -9.477_198_198_528_157e-1,
187    -9.450_809_706_718_851e-1,
188    -9.423_790_295_833_044e-1,
189    -9.396_141_769_637_963e-1,
190    -9.367_865_973_899_459e-1,
191    -9.338_964_796_258_775e-1,
192    -9.309_440_166_106_54e-1,
193    -9.279_294_054_453_956e-1,
194    -9.248_528_473_801_222e-1,
195    -9.217_145_478_003_181e-1,
196    -9.185_147_162_132_208e-1,
197    -9.152_535_662_338_34e-1,
198    -9.119_313_155_706_682e-1,
199    -9.085_481_860_112_055e-1,
200    -9.051_044_034_070_944e-1,
201    -9.016_001_976_590_722e-1,
202    -8.980_358_027_016_164e-1,
203    -8.944_114_564_873_288e-1,
204    -8.907_274_009_710_492e-1,
205    -8.869_838_820_937_034e-1,
206    -8.831_811_497_658_847e-1,
207    -8.793_194_578_511_7e-1,
208    -8.753_990_641_491_725e-1,
209    -8.714_202_303_783_312e-1,
210    -8.673_832_221_584_393e-1,
211    -8.632_883_089_929_12e-1,
212    -8.591_357_642_507_945e-1,
213    -8.549_258_651_485_127e-1,
214    -8.506_588_927_313_666e-1,
215    -8.463_351_318_547_683e-1,
216    -8.419_548_711_652_254e-1,
217    -8.375_184_030_810_715e-1,
218    -8.330_260_237_729_452e-1,
219    -8.284_780_331_440_178e-1,
220    -8.238_747_348_099_726e-1,
221    -8.192_164_360_787_36e-1,
222    -8.145_034_479_299_62e-1,
223    -8.097_360_849_942_72e-1,
224    -8.049_146_655_322_506e-1,
225    -8.000_395_114_131_988e-1,
226    -7.951_109_480_936_471e-1,
227    -7.901_293_045_956_28e-1,
228    -7.850_949_134_847_117e-1,
229    -7.800_081_108_478_04e-1,
230    -7.748_692_362_707_1e-1,
231    -7.696_786_328_154_644e-1,
232    -7.644_366_469_974_285e-1,
233    -7.591_436_287_621_58e-1,
234    -7.537_999_314_620_412e-1,
235    -7.484_059_118_327_094e-1,
236    -7.429_619_299_692_227e-1,
237    -7.374_683_493_020_299e-1,
238    -7.319_255_365_727_068e-1,
239    -7.263_338_618_094_733e-1,
240    -7.206_936_983_024_912e-1,
241    -7.150_054_225_789_432e-1,
242    -7.092_694_143_778_975e-1,
243    -7.034_860_566_249_567e-1,
244    -6.976_557_354_066_943e-1,
245    -6.917_788_399_448_808e-1,
246    -6.858_557_625_704_99e-1,
247    -6.798_868_986_975_534e-1,
248    -6.738_726_467_966_731e-1,
249    -6.678_134_083_685_102e-1,
250    -6.617_095_879_169_366e-1,
251    -6.555_615_929_220_4e-1,
252    -6.493_698_338_129_212e-1,
253    -6.431_347_239_402_948e-1,
254    -6.368_566_795_488_945e-1,
255    -6.305_361_197_496_849e-1,
256    -6.241_734_664_918_837e-1,
257    -6.177_691_445_347_913e-1,
258    -6.113_235_814_194_364e-1,
259    -6.048_372_074_400_329e-1,
260    -5.983_104_556_152_549e-1,
261    -5.917_437_616_593_286e-1,
262    -5.851_375_639_529_456e-1,
263    -5.784_923_035_139_965e-1,
264    -5.718_084_239_681_3e-1,
265    -5.650_863_715_191_369e-1,
266    -5.583_265_949_191_623e-1,
267    -5.515_295_454_387_482e-1,
268    -5.446_956_768_367_068e-1,
269    -5.378_254_453_298_289e-1,
270    -5.309_193_095_624_275e-1,
271    -5.239_777_305_757_194e-1,
272    -5.170_011_717_770_473e-1,
273    -5.099_900_989_089_429e-1,
274    -5.029_449_800_180_356e-1,
275    -4.958_662_854_238_058_4e-1,
276    -4.887_544_876_871_878e-1,
277    -4.816_100_615_790_221e-1,
278    -4.744_334_840_483_605_5e-1,
279    -4.672_252_341_906_264e-1,
280    -4.599_857_932_156_304e-1,
281    -4.527_156_444_154_463_7e-1,
282    -4.454_152_731_321_473_5e-1,
283    -4.380_851_667_254_05e-1,
284    -4.307_258_145_399_544_5e-1,
285    -4.233_377_078_729_265e-1,
286    -4.159_213_399_410_494e-1,
287    -4.084_772_058_477_228e-1,
288    -4.010_058_025_499_653e-1,
289    -3.935_076_288_252_386e-1,
290    -3.859_831_852_381_500_6e-1,
291    -3.784_329_741_070_358_6e-1,
292    -3.708_574_994_704_271e-1,
293    -3.632_572_670_534_011e-1,
294    -3.556_327_842_338_202e-1,
295    -3.479_845_600_084_600_6e-1,
296    -3.403_131_049_590_297e-1,
297    -3.326_189_312_180_866e-1,
298    -3.249_025_524_348_469_5e-1,
299    -3.171_644_837_408_958_4e-1,
300    -3.094_052_417_157_978e-1,
301    -3.016_253_443_526_109e-1,
302    -2.938_253_110_233_064_5e-1,
303    -2.860_056_624_440_967_5e-1,
304    -2.781_669_206_406_729e-1,
305    -2.703_096_089_133_553e-1,
306    -2.624_342_518_021_592_4e-1,
307    -2.545_413_750_517_773e-1,
308    -2.466_315_055_764_817_5e-1,
309    -2.387_051_714_249_486_3e-1,
310    -2.307_629_017_450_062e-1,
311    -2.228_052_267_483_099_4e-1,
312    -2.148_326_776_749_466_5e-1,
313    -2.068_457_867_579_697_5e-1,
314    -1.988_450_871_878_683_4e-1,
315    -1.908_311_130_769_724_5e-1,
316    -1.828_043_994_237_965_6e-1,
317    -1.747_654_820_773_241_2e-1,
318    -1.667_148_977_012_352_4e-1,
319    -1.586_531_837_380_799_3e-1,
320    -1.505_808_783_733_995e-1,
321    -1.424_985_204_997_981_4e-1,
322    -1.344_066_496_809_674_7e-1,
323    -1.263_058_061_156_663e-1,
324    -1.181_965_306_016_578_4e-1,
325    -1.100_793_644_996_070_4e-1,
326    -1.019_548_496_969_403_7e-1,
327    -9.382_352_857_167_028e-2,
328    -8.568_594_395_618_719e-2,
329    -7.754_263_910_102_077e-2,
330    -6.939_415_763_857_37e-2,
331    -6.124_104_354_682_962e-2,
332    -5.308_384_111_303_817_6e-2,
333    -4.492_309_489_737_94e-2,
334    -3.675_934_969_660_982e-2,
335    -2.859_315_050_769_284_7e-2,
336    -2.042_504_249_141_571e-2,
337    -1.225_557_093_599_553_8e-2,
338    -4.085_281_220_676_868e-3,
339    4.085_281_220_676_868e-3,
340    1.225_557_093_599_553_8e-2,
341    2.042_504_249_141_571e-2,
342    2.859_315_050_769_284_7e-2,
343    3.675_934_969_660_982e-2,
344    4.492_309_489_737_94e-2,
345    5.308_384_111_303_817_6e-2,
346    6.124_104_354_682_962e-2,
347    6.939_415_763_857_37e-2,
348    7.754_263_910_102_077e-2,
349    8.568_594_395_618_719e-2,
350    9.382_352_857_167_028e-2,
351    1.019_548_496_969_403_7e-1,
352    1.100_793_644_996_070_4e-1,
353    1.181_965_306_016_578_4e-1,
354    1.263_058_061_156_663e-1,
355    1.344_066_496_809_674_7e-1,
356    1.424_985_204_997_981_4e-1,
357    1.505_808_783_733_995e-1,
358    1.586_531_837_380_799_3e-1,
359    1.667_148_977_012_352_4e-1,
360    1.747_654_820_773_241_2e-1,
361    1.828_043_994_237_965_6e-1,
362    1.908_311_130_769_724_5e-1,
363    1.988_450_871_878_683_4e-1,
364    2.068_457_867_579_697_5e-1,
365    2.148_326_776_749_466_5e-1,
366    2.228_052_267_483_099_4e-1,
367    2.307_629_017_450_062e-1,
368    2.387_051_714_249_486_3e-1,
369    2.466_315_055_764_817_5e-1,
370    2.545_413_750_517_773e-1,
371    2.624_342_518_021_592_4e-1,
372    2.703_096_089_133_553e-1,
373    2.781_669_206_406_729e-1,
374    2.860_056_624_440_967_5e-1,
375    2.938_253_110_233_064_5e-1,
376    3.016_253_443_526_109e-1,
377    3.094_052_417_157_978e-1,
378    3.171_644_837_408_958_4e-1,
379    3.249_025_524_348_469_5e-1,
380    3.326_189_312_180_866e-1,
381    3.403_131_049_590_297e-1,
382    3.479_845_600_084_600_6e-1,
383    3.556_327_842_338_202e-1,
384    3.632_572_670_534_011e-1,
385    3.708_574_994_704_271e-1,
386    3.784_329_741_070_358_6e-1,
387    3.859_831_852_381_500_6e-1,
388    3.935_076_288_252_386e-1,
389    4.010_058_025_499_653e-1,
390    4.084_772_058_477_228e-1,
391    4.159_213_399_410_494e-1,
392    4.233_377_078_729_265e-1,
393    4.307_258_145_399_544_5e-1,
394    4.380_851_667_254_05e-1,
395    4.454_152_731_321_473_5e-1,
396    4.527_156_444_154_463_7e-1,
397    4.599_857_932_156_304e-1,
398    4.672_252_341_906_264e-1,
399    4.744_334_840_483_605_5e-1,
400    4.816_100_615_790_221e-1,
401    4.887_544_876_871_878e-1,
402    4.958_662_854_238_058_4e-1,
403    5.029_449_800_180_356e-1,
404    5.099_900_989_089_429e-1,
405    5.170_011_717_770_473e-1,
406    5.239_777_305_757_194e-1,
407    5.309_193_095_624_275e-1,
408    5.378_254_453_298_289e-1,
409    5.446_956_768_367_068e-1,
410    5.515_295_454_387_482e-1,
411    5.583_265_949_191_623e-1,
412    5.650_863_715_191_369e-1,
413    5.718_084_239_681_3e-1,
414    5.784_923_035_139_965e-1,
415    5.851_375_639_529_456e-1,
416    5.917_437_616_593_286e-1,
417    5.983_104_556_152_549e-1,
418    6.048_372_074_400_329e-1,
419    6.113_235_814_194_364e-1,
420    6.177_691_445_347_913e-1,
421    6.241_734_664_918_837e-1,
422    6.305_361_197_496_849e-1,
423    6.368_566_795_488_945e-1,
424    6.431_347_239_402_948e-1,
425    6.493_698_338_129_212e-1,
426    6.555_615_929_220_4e-1,
427    6.617_095_879_169_366e-1,
428    6.678_134_083_685_102e-1,
429    6.738_726_467_966_731e-1,
430    6.798_868_986_975_534e-1,
431    6.858_557_625_704_99e-1,
432    6.917_788_399_448_808e-1,
433    6.976_557_354_066_943e-1,
434    7.034_860_566_249_567e-1,
435    7.092_694_143_778_975e-1,
436    7.150_054_225_789_432e-1,
437    7.206_936_983_024_912e-1,
438    7.263_338_618_094_733e-1,
439    7.319_255_365_727_068e-1,
440    7.374_683_493_020_299e-1,
441    7.429_619_299_692_227e-1,
442    7.484_059_118_327_094e-1,
443    7.537_999_314_620_412e-1,
444    7.591_436_287_621_58e-1,
445    7.644_366_469_974_285e-1,
446    7.696_786_328_154_644e-1,
447    7.748_692_362_707_1e-1,
448    7.800_081_108_478_04e-1,
449    7.850_949_134_847_117e-1,
450    7.901_293_045_956_28e-1,
451    7.951_109_480_936_471e-1,
452    8.000_395_114_131_988e-1,
453    8.049_146_655_322_506e-1,
454    8.097_360_849_942_72e-1,
455    8.145_034_479_299_62e-1,
456    8.192_164_360_787_36e-1,
457    8.238_747_348_099_726e-1,
458    8.284_780_331_440_178e-1,
459    8.330_260_237_729_452e-1,
460    8.375_184_030_810_715e-1,
461    8.419_548_711_652_254e-1,
462    8.463_351_318_547_683e-1,
463    8.506_588_927_313_666e-1,
464    8.549_258_651_485_127e-1,
465    8.591_357_642_507_945e-1,
466    8.632_883_089_929_12e-1,
467    8.673_832_221_584_393e-1,
468    8.714_202_303_783_312e-1,
469    8.753_990_641_491_725e-1,
470    8.793_194_578_511_7e-1,
471    8.831_811_497_658_847e-1,
472    8.869_838_820_937_034e-1,
473    8.907_274_009_710_492e-1,
474    8.944_114_564_873_288e-1,
475    8.980_358_027_016_164e-1,
476    9.016_001_976_590_722e-1,
477    9.051_044_034_070_944e-1,
478    9.085_481_860_112_055e-1,
479    9.119_313_155_706_682e-1,
480    9.152_535_662_338_34e-1,
481    9.185_147_162_132_208e-1,
482    9.217_145_478_003_181e-1,
483    9.248_528_473_801_222e-1,
484    9.279_294_054_453_956e-1,
485    9.309_440_166_106_54e-1,
486    9.338_964_796_258_775e-1,
487    9.367_865_973_899_459e-1,
488    9.396_141_769_637_963e-1,
489    9.423_790_295_833_044e-1,
490    9.450_809_706_718_851e-1,
491    9.477_198_198_528_157e-1,
492    9.502_954_009_612_771e-1,
493    9.528_075_420_561_144e-1,
494    9.552_560_754_313_16e-1,
495    9.576_408_376_272_095e-1,
496    9.599_616_694_413_742e-1,
497    9.622_184_159_392_698e-1,
498    9.644_109_264_645_802e-1,
499    9.665_390_546_492_71e-1,
500    9.686_026_584_233_628e-1,
501    9.706_016_000_244_151e-1,
502    9.725_357_460_067_257e-1,
503    9.744_049_672_502_397e-1,
504    9.762_091_389_691_724e-1,
505    9.779_481_407_203_411e-1,
506    9.796_218_564_112_101e-1,
507    9.812_301_743_076_443e-1,
508    9.827_729_870_413_743e-1,
509    9.842_501_916_171_713e-1,
510    9.856_616_894_197_333e-1,
511    9.870_073_862_202_815e-1,
512    9.882_871_921_828_699e-1,
513    9.895_010_218_704_087e-1,
514    9.906_487_942_504_061e-1,
515    9.917_304_327_004_32e-1,
516    9.927_458_650_133_153e-1,
517    9.936_950_234_020_883e-1,
518    9.945_778_445_047_068e-1,
519    9.953_942_693_885_953e-1,
520    9.961_442_435_551_087e-1,
521    9.968_277_169_440_913e-1,
522    9.974_446_439_389_107e-1,
523    9.979_949_833_727_938e-1,
524    9.984_786_985_384_589e-1,
525    9.988_957_572_063_257e-1,
526    9.992_461_316_671_845e-1,
527    9.995_297_988_558_859e-1,
528    9.997_467_408_113_523e-1,
529    9.998_969_471_378_596e-1,
530    9.999_804_411_726_474e-1,
531];
532const GL_WEIGHTS: [f64; 384] = [
533    5.019_410_348_676_869_6e-5,
534    1.168_390_665_730_266_3e-4,
535    1.835_749_193_551_655_8e-4,
536    2.503_070_890_844_105e-4,
537    3.170_242_698_112_815e-4,
538    3.837_208_020_912_921_4e-4,
539    4.503_919_137_716_827e-4,
540    5.170_330_453_491_649e-4,
541    5.836_397_042_630_135e-4,
542    6.502_074_240_969_948e-4,
543    7.167_317_509_947_801e-4,
544    7.832_082_385_905_168e-4,
545    8.496_324_460_039_209e-4,
546    9.159_999_370_632_641e-4,
547    9.823_062_800_663_463e-4,
548    1.048_547_047_793_689_5e-3,
549    1.114_717_817_647_310_6e-3,
550    1.180_814_171_855_922e-3,
551    1.246_831_697_715_441_5e-3,
552    1.312_765_987_850_66e-3,
553    1.378_612_640_487_646_8e-3,
554    1.444_367_259_734_736e-3,
555    1.510_025_455_865_810_3e-3,
556    1.575_582_845_607_936_8e-3,
557    1.641_035_052_429_271_5e-3,
558    1.706_377_706_828_447_1e-3,
559    1.771_606_446_623_834_7e-3,
560    1.836_716_917_243_567_5e-3,
561    1.901_704_772_014_899_2e-3,
562    1.966_565_672_453_437e-3,
563    2.031_295_288_552_398_4e-3,
564    2.095_889_299_071_020_6e-3,
565    2.160_343_391_822_734_3e-3,
566    2.224_653_263_962_713e-3,
567    2.288_814_622_274_955e-3,
568    2.352_823_183_458_769e-3,
569    2.416_674_674_414_340_5e-3,
570    2.480_364_832_528_265_6e-3,
571    2.543_889_405_957_74e-3,
572    2.607_244_153_914_452e-3,
573    2.670_424_846_947_554e-3,
574    2.733_427_267_226_093_3e-3,
575    2.796_247_208_820_428e-3,
576    2.858_880_477_983_06e-3,
577    2.921_322_893_428_515_3e-3,
578    2.983_570_286_612_554_5e-3,
579    3.045_618_502_010_327_8e-3,
580    3.107_463_397_393_755_5e-3,
581    3.169_100_844_108_32e-3,
582    3.230_526_727_348_174e-3,
583    3.291_736_946_431_361e-3,
584    3.352_727_415_073_250_3e-3,
585    3.413_494_061_659_418_4e-3,
586    3.474_032_829_517_317e-3,
587    3.534_339_677_187_348_4e-3,
588    3.594_410_578_692_452e-3,
589    3.654_241_523_806_987e-3,
590    3.713_828_518_324_312_5e-3,
591    3.773_167_584_323_583_5e-3,
592    3.832_254_760_435_171e-3,
593    3.891_086_102_105_193_4e-3,
594    3.949_657_681_858_895e-3,
595    4.007_965_589_562_678e-3,
596    4.066_005_932_685_269e-3,
597    4.123_774_836_557_6e-3,
598    4.181_268_444_631_281e-3,
599    4.238_482_918_736_289e-3,
600    4.295_414_439_336_925e-3,
601    4.352_059_205_787_275e-3,
602    4.408_413_436_584_285e-3,
603    4.464_473_369_620_78e-3,
604    4.520_235_262_436_235e-3,
605    4.575_695_392_466_791e-3,
606    4.630_850_057_293_894e-3,
607    4.685_695_574_891_041e-3,
608    4.740_228_283_870_022e-3,
609    4.794_444_543_725_102e-3,
610    4.848_340_735_076_109e-3,
611    4.901_913_259_910_197e-3,
612    4.955_158_541_821_682_4e-3,
613    5.008_073_026_251_332e-3,
614    5.060_653_180_723_101_4e-3,
615    5.112_895_495_080_397e-3,
616    5.164_796_481_720_011e-3,
617    5.216_352_675_825_451e-3,
618    5.267_560_635_597_735e-3,
619    5.318_416_942_485_385e-3,
620    5.368_918_201_412_827e-3,
621    5.419_061_041_006_627e-3,
622    5.468_842_113_820_941e-3,
623    5.518_258_096_560_71e-3,
624    5.567_305_690_303_767e-3,
625    5.615_981_620_720_803e-3,
626    5.664_282_638_294_182e-3,
627    5.712_205_518_534_655e-3,
628    5.759_747_062_196_925_5e-3,
629    5.806_904_095_492_818e-3,
630    5.853_673_470_303_617_4e-3,
631    5.900_052_064_389_824e-3,
632    5.946_036_781_599_814e-3,
633    5.991_624_552_076_468e-3,
634    6.036_812_332_462_087e-3,
635    6.081_597_106_101_673e-3,
636    6.125_975_883_244_196e-3,
637    6.169_945_701_242_237e-3,
638    6.213_503_624_749_591e-3,
639    6.256_646_745_917_723e-3,
640    6.299_372_184_589_237e-3,
641    6.341_677_088_490_664e-3,
642    6.383_558_633_422_572e-3,
643    6.425_014_023_448_273e-3,
644    6.466_040_491_080_434e-3,
645    6.506_635_297_465_724e-3,
646    6.546_795_732_567_842_5e-3,
647    6.586_519_115_348_261e-3,
648    6.625_802_793_945_317e-3,
649    6.664_644_145_851_14e-3,
650    6.703_040_578_086_941e-3,
651    6.740_989_527_375_895e-3,
652    6.778_488_460_314_126e-3,
653    6.815_534_873_540_5e-3,
654    6.852_126_293_902_878e-3,
655    6.888_260_278_623_754e-3,
656    6.923_934_415_463_31e-3,
657    6.959_146_322_880_146_5e-3,
658    6.993_893_650_190_702e-3,
659    7.028_174_077_725_734e-3,
660    7.061_985_316_985_506e-3,
661    7.095_325_110_792_439e-3,
662    7.128_191_233_441_844e-3,
663    7.160_581_490_850_321e-3,
664    7.192_493_720_702_486e-3,
665    7.223_925_792_595_309e-3,
666    7.254_875_608_179_984e-3,
667    7.285_341_101_302_512e-3,
668    7.315_320_238_141_324_5e-3,
669    7.344_811_017_343_063e-3,
670    7.373_811_470_156_258e-3,
671    7.402_319_660_562_818e-3,
672    7.430_333_685_407_178e-3,
673    7.457_851_674_523_319e-3,
674    7.484_871_790_859_79e-3,
675    7.511_392_230_602_079e-3,
676    7.537_411_223_293_362e-3,
677    7.562_927_031_952_382e-3,
678    7.587_937_953_189_561_5e-3,
679    7.612_442_317_320_796e-3,
680    7.636_438_488_478_739e-3,
681    7.659_924_864_722_064e-3,
682    7.682_899_878_142_539e-3,
683    7.705_361_994_969_524e-3,
684    7.727_309_715_672_44e-3,
685    7.748_741_575_060_914e-3,
686    7.769_656_142_382_462e-3,
687    7.790_052_021_418_226e-3,
688    7.809_927_850_575_903e-3,
689    7.829_282_302_980_82e-3,
690    7.848_114_086_564_56e-3,
691    7.866_421_944_151_094e-3,
692    7.884_204_653_540_665e-3,
693    7.901_461_027_591_6e-3,
694    7.918_189_914_299_318e-3,
695    7.934_390_196_873_448e-3,
696    7.950_060_793_812_204e-3,
697    7.965_200_658_974_709e-3,
698    7.979_808_781_650_77e-3,
699    7.993_884_186_628_266e-3,
700    8.007_425_934_258_548e-3,
701    8.020_433_120_518_866e-3,
702    8.032_904_877_072_8e-3,
703    8.044_840_371_328_26e-3,
704    8.056_238_806_493_175e-3,
705    8.067_099_421_628_42e-3,
706    8.077_421_491_698_82e-3,
707    8.087_204_327_621_594e-3,
708    8.096_447_276_312_202e-3,
709    8.105_149_720_727_933e-3,
710    8.113_311_079_909_208e-3,
711    8.120_930_809_018_415e-3,
712    8.128_008_399_376_085e-3,
713    8.134_543_378_495_033e-3,
714    8.140_535_310_111_77e-3,
715    8.145_983_794_215_77e-3,
716    8.150_888_467_075_875e-3,
717    8.155_249_001_265_092e-3,
718    8.159_065_105_681_899e-3,
719    8.162_336_525_570_1e-3,
720    8.165_063_042_535_465e-3,
721    8.167_244_474_560_707e-3,
722    8.168_880_676_017_344e-3,
723    8.169_971_537_675_47e-3,
724    8.170_516_986_711_104e-3,
725    8.170_516_986_711_104e-3,
726    8.169_971_537_675_47e-3,
727    8.168_880_676_017_344e-3,
728    8.167_244_474_560_707e-3,
729    8.165_063_042_535_465e-3,
730    8.162_336_525_570_1e-3,
731    8.159_065_105_681_899e-3,
732    8.155_249_001_265_092e-3,
733    8.150_888_467_075_875e-3,
734    8.145_983_794_215_77e-3,
735    8.140_535_310_111_77e-3,
736    8.134_543_378_495_033e-3,
737    8.128_008_399_376_085e-3,
738    8.120_930_809_018_415e-3,
739    8.113_311_079_909_208e-3,
740    8.105_149_720_727_933e-3,
741    8.096_447_276_312_202e-3,
742    8.087_204_327_621_594e-3,
743    8.077_421_491_698_82e-3,
744    8.067_099_421_628_42e-3,
745    8.056_238_806_493_175e-3,
746    8.044_840_371_328_26e-3,
747    8.032_904_877_072_8e-3,
748    8.020_433_120_518_866e-3,
749    8.007_425_934_258_548e-3,
750    7.993_884_186_628_266e-3,
751    7.979_808_781_650_77e-3,
752    7.965_200_658_974_709e-3,
753    7.950_060_793_812_204e-3,
754    7.934_390_196_873_448e-3,
755    7.918_189_914_299_318e-3,
756    7.901_461_027_591_6e-3,
757    7.884_204_653_540_665e-3,
758    7.866_421_944_151_094e-3,
759    7.848_114_086_564_56e-3,
760    7.829_282_302_980_82e-3,
761    7.809_927_850_575_903e-3,
762    7.790_052_021_418_226e-3,
763    7.769_656_142_382_462e-3,
764    7.748_741_575_060_914e-3,
765    7.727_309_715_672_44e-3,
766    7.705_361_994_969_524e-3,
767    7.682_899_878_142_539e-3,
768    7.659_924_864_722_064e-3,
769    7.636_438_488_478_739e-3,
770    7.612_442_317_320_796e-3,
771    7.587_937_953_189_561_5e-3,
772    7.562_927_031_952_382e-3,
773    7.537_411_223_293_362e-3,
774    7.511_392_230_602_079e-3,
775    7.484_871_790_859_79e-3,
776    7.457_851_674_523_319e-3,
777    7.430_333_685_407_178e-3,
778    7.402_319_660_562_818e-3,
779    7.373_811_470_156_258e-3,
780    7.344_811_017_343_063e-3,
781    7.315_320_238_141_324_5e-3,
782    7.285_341_101_302_512e-3,
783    7.254_875_608_179_984e-3,
784    7.223_925_792_595_309e-3,
785    7.192_493_720_702_486e-3,
786    7.160_581_490_850_321e-3,
787    7.128_191_233_441_844e-3,
788    7.095_325_110_792_439e-3,
789    7.061_985_316_985_506e-3,
790    7.028_174_077_725_734e-3,
791    6.993_893_650_190_702e-3,
792    6.959_146_322_880_146_5e-3,
793    6.923_934_415_463_31e-3,
794    6.888_260_278_623_754e-3,
795    6.852_126_293_902_878e-3,
796    6.815_534_873_540_5e-3,
797    6.778_488_460_314_126e-3,
798    6.740_989_527_375_895e-3,
799    6.703_040_578_086_941e-3,
800    6.664_644_145_851_14e-3,
801    6.625_802_793_945_317e-3,
802    6.586_519_115_348_261e-3,
803    6.546_795_732_567_842_5e-3,
804    6.506_635_297_465_724e-3,
805    6.466_040_491_080_434e-3,
806    6.425_014_023_448_273e-3,
807    6.383_558_633_422_572e-3,
808    6.341_677_088_490_664e-3,
809    6.299_372_184_589_237e-3,
810    6.256_646_745_917_723e-3,
811    6.213_503_624_749_591e-3,
812    6.169_945_701_242_237e-3,
813    6.125_975_883_244_196e-3,
814    6.081_597_106_101_673e-3,
815    6.036_812_332_462_087e-3,
816    5.991_624_552_076_468e-3,
817    5.946_036_781_599_814e-3,
818    5.900_052_064_389_824e-3,
819    5.853_673_470_303_617_4e-3,
820    5.806_904_095_492_818e-3,
821    5.759_747_062_196_925_5e-3,
822    5.712_205_518_534_655e-3,
823    5.664_282_638_294_182e-3,
824    5.615_981_620_720_803e-3,
825    5.567_305_690_303_767e-3,
826    5.518_258_096_560_71e-3,
827    5.468_842_113_820_941e-3,
828    5.419_061_041_006_627e-3,
829    5.368_918_201_412_827e-3,
830    5.318_416_942_485_385e-3,
831    5.267_560_635_597_735e-3,
832    5.216_352_675_825_451e-3,
833    5.164_796_481_720_011e-3,
834    5.112_895_495_080_397e-3,
835    5.060_653_180_723_101_4e-3,
836    5.008_073_026_251_332e-3,
837    4.955_158_541_821_682_4e-3,
838    4.901_913_259_910_197e-3,
839    4.848_340_735_076_109e-3,
840    4.794_444_543_725_102e-3,
841    4.740_228_283_870_022e-3,
842    4.685_695_574_891_041e-3,
843    4.630_850_057_293_894e-3,
844    4.575_695_392_466_791e-3,
845    4.520_235_262_436_235e-3,
846    4.464_473_369_620_78e-3,
847    4.408_413_436_584_285e-3,
848    4.352_059_205_787_275e-3,
849    4.295_414_439_336_925e-3,
850    4.238_482_918_736_289e-3,
851    4.181_268_444_631_281e-3,
852    4.123_774_836_557_6e-3,
853    4.066_005_932_685_269e-3,
854    4.007_965_589_562_678e-3,
855    3.949_657_681_858_895e-3,
856    3.891_086_102_105_193_4e-3,
857    3.832_254_760_435_171e-3,
858    3.773_167_584_323_583_5e-3,
859    3.713_828_518_324_312_5e-3,
860    3.654_241_523_806_987e-3,
861    3.594_410_578_692_452e-3,
862    3.534_339_677_187_348_4e-3,
863    3.474_032_829_517_317e-3,
864    3.413_494_061_659_418_4e-3,
865    3.352_727_415_073_250_3e-3,
866    3.291_736_946_431_361e-3,
867    3.230_526_727_348_174e-3,
868    3.169_100_844_108_32e-3,
869    3.107_463_397_393_755_5e-3,
870    3.045_618_502_010_327_8e-3,
871    2.983_570_286_612_554_5e-3,
872    2.921_322_893_428_515_3e-3,
873    2.858_880_477_983_06e-3,
874    2.796_247_208_820_428e-3,
875    2.733_427_267_226_093_3e-3,
876    2.670_424_846_947_554e-3,
877    2.607_244_153_914_452e-3,
878    2.543_889_405_957_74e-3,
879    2.480_364_832_528_265_6e-3,
880    2.416_674_674_414_340_5e-3,
881    2.352_823_183_458_769e-3,
882    2.288_814_622_274_955e-3,
883    2.224_653_263_962_713e-3,
884    2.160_343_391_822_734_3e-3,
885    2.095_889_299_071_020_6e-3,
886    2.031_295_288_552_398_4e-3,
887    1.966_565_672_453_437e-3,
888    1.901_704_772_014_899_2e-3,
889    1.836_716_917_243_567_5e-3,
890    1.771_606_446_623_834_7e-3,
891    1.706_377_706_828_447_1e-3,
892    1.641_035_052_429_271_5e-3,
893    1.575_582_845_607_936_8e-3,
894    1.510_025_455_865_810_3e-3,
895    1.444_367_259_734_736e-3,
896    1.378_612_640_487_646_8e-3,
897    1.312_765_987_850_66e-3,
898    1.246_831_697_715_441_5e-3,
899    1.180_814_171_855_922e-3,
900    1.114_717_817_647_310_6e-3,
901    1.048_547_047_793_689_5e-3,
902    9.823_062_800_663_463e-4,
903    9.159_999_370_632_641e-4,
904    8.496_324_460_039_209e-4,
905    7.832_082_385_905_168e-4,
906    7.167_317_509_947_801e-4,
907    6.502_074_240_969_948e-4,
908    5.836_397_042_630_135e-4,
909    5.170_330_453_491_649e-4,
910    4.503_919_137_716_827e-4,
911    3.837_208_020_912_921_4e-4,
912    3.170_242_698_112_815e-4,
913    2.503_070_890_844_105e-4,
914    1.835_749_193_551_655_8e-4,
915    1.168_390_665_730_266_3e-4,
916    5.019_410_348_676_869_6e-5,
917];
918
/// Which exact-evaluation branch a cell falls into.
///
/// NOTE(review): the classifier that produces these values is outside this
/// chunk. The variant names presumably refer to the degree of
/// `q(z) = 0.5*(z^2 + eta(z)^2)` on the cell — confirm against `branch_cell`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExactCellBranch {
    /// The affine anchor case (`c2 = c3 = 0` per the module notes).
    Affine,
    /// Presumably `c3` negligible and `c2` not, so `q` is quartic — TODO confirm.
    Quartic,
    /// Presumably a non-negligible `c3`, so `q` is sextic — TODO confirm.
    Sextic,
}
925
926/// Auto-tune the per-cell affine/non-affine branch tolerance from the cell's
927/// own coefficient magnitudes.
928///
929/// The legacy `branch_cell` compared the normalized cubic coefficients
930/// `(k2, k3)` against a single global constant.  That constant is calibrated
931/// for cells whose anchor coefficients `(c0, c1)` are O(1).  When the anchor
932/// dominates — e.g. a tail cell with `|c0|, |c1| >> 1` — a relative criterion
933/// against the anchor magnitude is more numerically meaningful than the bare
934/// global threshold, because the affine contribution to `eta` already absorbs
935/// any difference at the chosen scale.
936///
937/// The returned tolerance is always at least [`NORMALIZED_CELL_BRANCH_TOL`],
938/// so cells with O(1) anchors recover bit-identical classification with the
939/// legacy code path.  This preserves numerical equivalence for the
940/// established `cubic_cell_kernel` tests, including the
941/// `tuned_branch_tolerance_matches_legacy_non_affine_transport_grid` grid.
942#[inline]
943fn effective_branch_tol(cell: DenestedCubicCell) -> f64 {
944    let anchor_scale = cell.c0.abs().max(cell.c1.abs()).max(1.0);
945    NORMALIZED_CELL_BRANCH_TOL * anchor_scale
946}
947
/// One cell of the de-nested cubic transport: the interval `[left, right]` in
/// `z` together with the cubic `η(z) = c0 + c1·z + c2·z² + c3·z³` valid on it.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedCubicCell {
    /// Lower `z` bound of the cell.
    pub left: f64,
    /// Upper `z` bound of the cell.
    pub right: f64,
    /// Constant coefficient of `η`.
    pub c0: f64,
    /// Linear coefficient of `η`.
    pub c1: f64,
    /// Quadratic coefficient of `η`.
    pub c2: f64,
    /// Cubic coefficient of `η`.
    pub c3: f64,
}

impl DenestedCubicCell {
    /// Evaluates the cubic `η(z)`.
    ///
    /// The terms are accumulated in ascending-power order with each power
    /// built by repeated multiplication, so the rounding sequence is fixed.
    #[inline]
    pub fn eta(self, z: f64) -> f64 {
        let linear = self.c1 * z;
        let quadratic = self.c2 * z * z;
        let cubic = self.c3 * z * z * z;
        self.c0 + linear + quadratic + cubic
    }

    /// Evaluates the exponent polynomial `q(z) = (z² + η(z)²) / 2`.
    #[inline]
    pub fn q(self, z: f64) -> f64 {
        let transported = self.eta(z);
        let squared_norm = z * z + transported * transported;
        0.5 * squared_norm
    }
}
970
/// Geometric fingerprint of a de-nested cubic cell, as produced by
/// [`cell_moment_fingerprint`].
///
/// Equality and hashing cover both fields, so two fingerprints whose mixed
/// `hash` collides still compare unequal unless their `bins` match too.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentFingerprint {
    /// Mixed 64-bit digest of `bins`.
    pub hash: u64,
    // One word per cell coordinate, in the order
    // `(left, right, c0, c1, c2, c3)`.
    bins: [u64; 6],
}
976
/// Cache key pairing a cell fingerprint with the highest moment degree
/// requested, as built by [`cell_moment_cache_key`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentCacheKey {
    /// Fingerprint of the cell's bounds and coefficients.
    pub fingerprint: CellMomentFingerprint,
    /// Highest moment degree the keyed entry is requested at.
    pub max_degree: usize,
}
982
/// Counters describing how often cell-moment deduplication found a match.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct CellMomentDedupStats {
    /// Total number of lookups performed.
    pub lookups: u64,
    /// Lookups that found an existing entry.
    pub hits: u64,
    /// Lookups that found nothing.
    pub misses: u64,
}

impl CellMomentDedupStats {
    /// Fraction `hits / lookups`, or `0.0` when no lookup has been recorded.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.lookups {
            0 => 0.0,
            lookups => self.hits as f64 / lookups as f64,
        }
    }
}
1000
/// Local shorthand for `gam_linalg::utils::splitmix64_hash`, used as the
/// per-word scrambler when mixing fingerprint words.
#[inline]
fn splitmix64(x: u64) -> u64 {
    gam_linalg::utils::splitmix64_hash(x)
}
1005
1006#[inline]
1007fn mix_fingerprint_words(words: &[u64]) -> u64 {
1008    let mut h = 0xcbf2_9ce4_8422_2325u64;
1009    for &word in words {
1010        h ^= splitmix64(word);
1011        h = h.wrapping_mul(0x100_0000_01b3);
1012    }
1013    h
1014}
1015
/// Maps one cell coordinate to the 64-bit word used in its fingerprint.
///
/// * When `epsilon` is not a positive finite number, or `x` is not finite,
///   the word is the exact IEEE-754 bit pattern of `x`.
/// * Otherwise `x` is binned to the nearest multiple of `epsilon` and the
///   word is the bit pattern of the (integral) bin index `round(x / epsilon)`.
///
/// Two normalisations keep the binned word a function of the bin alone:
///
/// * A bin index of `-0.0` (any `x` in `(-epsilon/2, 0]`) is canonicalised to
///   `+0.0`, so values on either side of zero that share the zero bin share a
///   word.
/// * If `x / epsilon` overflows to ±∞ the bin width is far below the spacing
///   of representable values around `x`, so the exact bit pattern of `x` is
///   used instead of collapsing every such coordinate onto the same word.
#[inline]
fn quantized_cell_word(x: f64, epsilon: f64) -> u64 {
    if epsilon == 0.0 || !epsilon.is_finite() || epsilon < 0.0 || !x.is_finite() {
        return x.to_bits();
    }
    let bin = (x / epsilon).round();
    if !bin.is_finite() {
        return x.to_bits();
    }
    // `-0.0 + 0.0 == +0.0` under round-to-nearest; every other value is
    // unchanged by adding zero.
    (bin + 0.0).to_bits()
}
1023
1024/// Returns a deterministic geometric fingerprint for a de-nested cubic cell.
1025///
1026/// With `epsilon == 0.0`, each coordinate is represented by its exact IEEE-754
1027/// bit pattern, so equal fingerprints imply bit-equal `(left, right, c0, c1,
1028/// c2, c3)` tuples.  With `epsilon > 0`, finite coordinates are binned to the
1029/// nearest multiple of `epsilon`; callers should treat this as an approximate
1030/// cache key and validate the resulting model error for their data.
1031pub fn cell_moment_fingerprint(cell: DenestedCubicCell, epsilon: f64) -> CellMomentFingerprint {
1032    let bins = [
1033        quantized_cell_word(cell.left, epsilon),
1034        quantized_cell_word(cell.right, epsilon),
1035        quantized_cell_word(cell.c0, epsilon),
1036        quantized_cell_word(cell.c1, epsilon),
1037        quantized_cell_word(cell.c2, epsilon),
1038        quantized_cell_word(cell.c3, epsilon),
1039    ];
1040    CellMomentFingerprint {
1041        hash: mix_fingerprint_words(&bins),
1042        bins,
1043    }
1044}
1045
1046#[inline]
1047pub fn cell_moment_cache_key(
1048    cell: DenestedCubicCell,
1049    max_degree: usize,
1050    epsilon: f64,
1051) -> CellMomentCacheKey {
1052    CellMomentCacheKey {
1053        fingerprint: cell_moment_fingerprint(cell, epsilon),
1054        max_degree,
1055    }
1056}
1057
/// A de-nested cubic cell together with the provenance needed to identify it
/// across rows.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedPartitionCell {
    /// Bounds and cubic coefficients of the cell.
    pub cell: DenestedCubicCell,
    /// NOTE(review): presumably the local cubic span of the score spline that
    /// covers this cell — confirm against the partition builder.
    pub score_span: LocalSpanCubic,
    /// NOTE(review): presumably the local cubic span of the link spline that
    /// covers this cell — confirm against the partition builder.
    pub link_span: LocalSpanCubic,
    /// Provenance of the cell's boundaries: a fixed z location (score break
    /// or ±∞ tail) or a link-knot crossing `z = (τ - a)/b`. Together with
    /// `(score_span, link_span)` this identifies the cell's two-parameter
    /// family in `(a, b)` across rows (see
    /// [`crate::cell_moment_family`]).
    pub left_edge: PartitionEdge,
    /// Provenance of the right boundary; same conventions as `left_edge`.
    pub right_edge: PartitionEdge,
}

// Intentionally empty inherent impl: no methods are defined on the type here.
impl DenestedPartitionCell {}
1073
/// Where one boundary of a denested partition cell comes from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PartitionEdge {
    /// A z location that does not depend on the row scalars: a score-spline
    /// break, or ±∞ for tail cells.
    Fixed(f64),
    /// A link-knot crossing: for the row's `(a, b)` the boundary sits at
    /// `z = (τ - a)/b`.
    Crossing { tau: f64 },
}

impl PartitionEdge {
    /// Resolves the boundary to its z location for the row scalars `(a, b)`.
    ///
    /// A `Fixed` edge ignores `a` and `b`. A `Crossing` edge divides by `b`
    /// without any guard, so `b == 0.0` yields an infinite or NaN location.
    #[inline]
    pub fn z_at(self, a: f64, b: f64) -> f64 {
        match self {
            Self::Crossing { tau: knot } => {
                let offset = knot - a;
                offset / b
            }
            Self::Fixed(location) => location,
        }
    }
}
1095
/// Key of the affine-tail moment memo, built by `tail_cell_cache_key`.
///
/// Floating-point inputs are stored as raw bit patterns so the key can derive
/// `Eq` and `Hash`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct TailCellMomentCacheKey {
    // Bit pattern of the cell's constant coefficient `c0`.
    c0_bits: u64,
    // Bit pattern of the cell's linear coefficient `c1`.
    c1_bits: u64,
    // Bit pattern of the tail's single finite endpoint.
    endpoint_bits: u64,
    // Which side is unbounded: `-1` when the left bound is infinite, `1` when
    // the right bound is infinite.
    side: i8,
    // Highest moment degree the entry was requested at.
    max_degree: usize,
}
1104
// Byte budget (64 MiB) handed to the tail-cell moment LRU.
const TAIL_CELL_MOMENT_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;
// Entry-count budget (2^18) handed to the tail-cell moment LRU.
const TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES: usize = 262_144;
1107
/// Point-in-time usage counters of a [`TailCellMomentCache`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct TailCellMomentCacheStats {
    /// Requests answered from a cached or concurrently published entry.
    pub hits: usize,
    /// Requests that had to compute a cold key.
    pub misses: usize,
    /// Number of entries resident when the snapshot was taken.
    pub entries: usize,
}

impl TailCellMomentCacheStats {
    /// Total number of counted requests, `hits + misses`.
    ///
    /// The sum saturates at `usize::MAX` instead of overflowing, so a
    /// diagnostics helper can never panic (debug) or wrap (release).
    #[inline]
    pub fn requests(self) -> usize {
        self.hits.saturating_add(self.misses)
    }

    /// Fraction of counted requests that were hits, or `0.0` when nothing has
    /// been counted.
    ///
    /// The ratio is formed in `f64` so it stays meaningful even when the
    /// integer total would saturate.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        let hits = self.hits as f64;
        let total = hits + self.misses as f64;
        if total == 0.0 {
            0.0
        } else {
            hits / total
        }
    }
}
1131
/// Affine-tail cell-moment memo.
///
/// Stand-alone instances (`TailCellMomentCache::new()`) are useful when a
/// caller needs deterministic hit/miss bookkeeping that is not polluted by
/// concurrent traffic on the global memo. The production path uses the
/// global instance behind [`evaluate_cell_moments`].
///
/// All methods take `&self`: the LRU is internally synchronized (sharded for
/// the concurrent global memo) and the counters are atomics, so the global
/// instance needs no outer `Mutex`. The previous `OnceLock<Mutex<…>>` wrapper
/// serialized every tail-cell evaluation across all rayon workers of the
/// marginal-slope exact-cache build — the same contention class the sharded
/// per-family cell-moment LRU fix removed.
#[derive(Debug)]
pub struct TailCellMomentCache {
    // Resident results, keyed by the affine-tail cache key.
    moments: ByteLruCache<TailCellMomentCacheKey, CellMomentState>,
    // Keys currently being computed. Each slot is filled once by the thread
    // that computes the key; other threads asking for the same key wait on
    // the slot instead of recomputing.
    in_flight: std::sync::Mutex<
        std::collections::HashMap<
            TailCellMomentCacheKey,
            Arc<std::sync::OnceLock<Result<CellMomentState, String>>>,
        >,
    >,
    // Number of requests served without computing.
    hits: std::sync::atomic::AtomicUsize,
    // Number of requests that computed a cold key.
    misses: std::sync::atomic::AtomicUsize,
}
1157
1158impl Default for TailCellMomentCache {
1159    fn default() -> Self {
1160        // Tail-cell entries are small (a short moment vector), so sharding
1161        // the byte/entry budgets is harmless; size the shard count off the
1162        // worker pool exactly like the per-family cell-moment LRU.
1163        let shard_count = std::thread::available_parallelism()
1164            .map(|workers| workers.get().saturating_mul(8))
1165            .unwrap_or(32)
1166            .clamp(8, 256);
1167        Self {
1168            moments: ByteLruCache::with_max_entries_sharded(
1169                TAIL_CELL_MOMENT_CACHE_MAX_BYTES,
1170                TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES,
1171                shard_count,
1172            ),
1173            in_flight: std::sync::Mutex::new(std::collections::HashMap::new()),
1174            hits: std::sync::atomic::AtomicUsize::new(0),
1175            misses: std::sync::atomic::AtomicUsize::new(0),
1176        }
1177    }
1178}
1179
1180impl TailCellMomentCache {
1181    /// Construct an empty cache. Hits/misses start at zero.
1182    #[inline]
1183    pub fn new() -> Self {
1184        Self::default()
1185    }
1186
1187    /// Reset the cache to its empty state. Existing entries are dropped and
1188    /// the hit/miss counters are zeroed.
1189    #[inline]
1190    pub fn clear(&self) {
1191        self.moments.clear();
1192        self.in_flight
1193            .lock()
1194            .unwrap_or_else(|p| p.into_inner())
1195            .clear();
1196        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
1197        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
1198    }
1199
1200    /// Snapshot of the cache's current usage stats.
1201    #[inline]
1202    pub fn stats(&self) -> TailCellMomentCacheStats {
1203        TailCellMomentCacheStats {
1204            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
1205            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
1206            entries: self.moments.len(),
1207        }
1208    }
1209
1210    /// Look up `cell` at `max_degree`, computing and inserting the result on
1211    /// miss. Cells outside the affine-tail keyset bypass the cache and run
1212    /// the uncached evaluator directly without touching the counters.
1213    ///
1214    /// Stat semantics: every request served from an existing resident entry,
1215    /// or from a concurrently published entry for the same key, increments
1216    /// `hits`; a **miss** is counted only for the caller that actually
1217    /// computes a cold key. The compute happens outside the LRU shard lock,
1218    /// but an in-flight table coalesces same-key cold races so followers reuse
1219    /// the leader's published value instead of duplicating work.
1220    pub fn evaluate(
1221        &self,
1222        cell: DenestedCubicCell,
1223        max_degree: usize,
1224    ) -> Result<CellMomentState, String> {
1225        let Some(key) = tail_cell_cache_key(cell, max_degree) else {
1226            return evaluate_cell_moments_uncached(cell, max_degree);
1227        };
1228        if let Some(state) = self.moments.get(&key) {
1229            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1230            return Ok(state);
1231        }
1232
1233        let (slot, leader) = {
1234            let mut in_flight = self.in_flight.lock().unwrap_or_else(|p| p.into_inner());
1235            if let Some(slot) = in_flight.get(&key) {
1236                (Arc::clone(slot), false)
1237            } else {
1238                let slot = Arc::new(std::sync::OnceLock::new());
1239                in_flight.insert(key, Arc::clone(&slot));
1240                (slot, true)
1241            }
1242        };
1243
1244        if !leader {
1245            let state = slot.wait().clone()?;
1246            self.hits
1247                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1248            return Ok(state);
1249        }
1250
1251        let state = evaluate_cell_moments_uncached(cell, max_degree);
1252        if let Ok(state) = &state {
1253            self.moments.insert(key, state.clone());
1254            self.hits
1255                .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1256        }
1257        self.misses
1258            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1259        if let Err(existing_state) = slot.set(state.clone()) {
1260            std::mem::drop(existing_state);
1261        }
1262        self.in_flight
1263            .lock()
1264            .unwrap_or_else(|p| p.into_inner())
1265            .remove(&key);
1266        state
1267    }
1268}
1269
// Process-wide affine-tail moment memo; constructed on first access.
static TAIL_CELL_MOMENT_CACHE: std::sync::OnceLock<TailCellMomentCache> =
    std::sync::OnceLock::new();
// Global on/off switch for the tail-cell memo; starts enabled.
// NOTE(review): the code that reads this flag is outside this section —
// confirm where it is consulted.
static TAIL_CELL_MOMENT_CACHE_ENABLED: std::sync::atomic::AtomicBool =
    std::sync::atomic::AtomicBool::new(true);

/// Returns the process-wide tail-cell memo, building it with
/// `TailCellMomentCache::default` on the first call.
fn tail_cell_moment_cache() -> &'static TailCellMomentCache {
    TAIL_CELL_MOMENT_CACHE.get_or_init(TailCellMomentCache::default)
}
1278
1279#[inline]
1280fn tail_cell_cache_key(
1281    cell: DenestedCubicCell,
1282    max_degree: usize,
1283) -> Option<TailCellMomentCacheKey> {
1284    if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL {
1285        return None;
1286    }
1287    match (!cell.left.is_finite(), !cell.right.is_finite()) {
1288        (true, false) if cell.right.is_finite() => Some(TailCellMomentCacheKey {
1289            c0_bits: cell.c0.to_bits(),
1290            c1_bits: cell.c1.to_bits(),
1291            endpoint_bits: cell.right.to_bits(),
1292            side: -1,
1293            max_degree,
1294        }),
1295        (false, true) if cell.left.is_finite() => Some(TailCellMomentCacheKey {
1296            c0_bits: cell.c0.to_bits(),
1297            c1_bits: cell.c1.to_bits(),
1298            endpoint_bits: cell.left.to_bits(),
1299            side: 1,
1300            max_degree,
1301        }),
1302        _ => None,
1303    }
1304}
1305
/// Sets the global tail-cell memo on/off flag (relaxed atomic store).
///
/// NOTE(review): the flag's readers are outside this section — confirm which
/// evaluation paths honour it.
pub fn set_tail_cell_moment_cache_enabled(enabled: bool) {
    TAIL_CELL_MOMENT_CACHE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed);
}
1309
/// Clears the global tail-cell memo: entries are dropped and the hit/miss
/// counters return to zero. The memo is created first if it does not exist.
pub fn reset_tail_cell_moment_cache() {
    tail_cell_moment_cache().clear();
}
1313
/// Returns a usage snapshot (hits, misses, resident entries) of the global
/// tail-cell memo.
pub fn tail_cell_moment_cache_stats() -> TailCellMomentCacheStats {
    tail_cell_moment_cache().stats()
}
1317
1318#[derive(Clone, Copy, Debug, Eq)]
1319pub struct CellFingerprint {
1320    c0: u64,
1321    c1: u64,
1322    c2: u64,
1323    c3: u64,
1324    left: u64,
1325    right: u64,
1326}
1327
1328impl CellFingerprint {
1329    #[inline]
1330    pub fn new(cell: DenestedCubicCell) -> Self {
1331        Self {
1332            c0: cell.c0.to_bits(),
1333            c1: cell.c1.to_bits(),
1334            c2: cell.c2.to_bits(),
1335            c3: cell.c3.to_bits(),
1336            left: cell.left.to_bits(),
1337            right: cell.right.to_bits(),
1338        }
1339    }
1340}
1341
1342impl PartialEq for CellFingerprint {
1343    #[inline]
1344    fn eq(&self, other: &Self) -> bool {
1345        self.c0 == other.c0
1346            && self.c1 == other.c1
1347            && self.c2 == other.c2
1348            && self.c3 == other.c3
1349            && self.left == other.left
1350            && self.right == other.right
1351    }
1352}
1353
1354impl Hash for CellFingerprint {
1355    #[inline]
1356    fn hash<H: Hasher>(&self, state: &mut H) {
1357        self.c0.hash(state);
1358        self.c1.hash(state);
1359        self.c2.hash(state);
1360        self.c3.hash(state);
1361        self.left.hash(state);
1362        self.right.hash(state);
1363    }
1364}
1365
1366#[derive(Clone, Debug, Default, PartialEq)]
1367pub struct CachedCellMoments {
1368    /// Regular (value) cell moments, populated by
1369    /// `evaluate_cell_moments_cached`. None when only derivative moments
1370    /// have been cached for this cell. Wrapped in `Arc` so `ByteLruCache`
1371    /// returns lookups through cheap refcount bumps instead of deep-cloning
1372    /// the inline `SmallVec<[f64; 10]>` (which spills on every degree-`>= 10`
1373    /// request) on every hot-path LRU hit.
1374    state: Option<Arc<CellMomentState>>,
1375    /// Derivative moments, populated by
1376    /// `evaluate_cell_derivative_moments_cached`. None when only value
1377    /// moments have been cached for this cell. Both variants share the
1378    /// same `CellFingerprint` key so derivative-only callers do not evict
1379    /// pre-cached value entries and vice versa. Same `Arc` wrapping rationale
1380    /// as `state` above.
1381    derivative_state: Option<Arc<CellDerivativeMomentState>>,
1382}
1383
1384impl CachedCellMoments {
1385    #[inline]
1386    pub fn new(state: Arc<CellMomentState>) -> Self {
1387        Self {
1388            state: Some(state),
1389            derivative_state: None,
1390        }
1391    }
1392
1393    #[inline]
1394    pub fn new_derivative(state: Arc<CellDerivativeMomentState>) -> Self {
1395        Self {
1396            state: None,
1397            derivative_state: Some(state),
1398        }
1399    }
1400
1401    #[inline]
1402    pub fn state_for_degree(&self, max_degree: usize) -> Option<CellMomentState> {
1403        let state = self.state.as_ref()?;
1404        if state.moments.len().saturating_sub(1) < max_degree {
1405            return None;
1406        }
1407        // Cached `Arc<CellMomentState>` is shared across LRU hits, so we
1408        // cannot reuse the inner vector in place. Clone the underlying state
1409        // and (rarely) truncate down to the requested degree to honour the
1410        // public moment-length contract.
1411        let mut state = (**state).clone();
1412        state.moments.truncate(max_degree + 1);
1413        Some(state)
1414    }
1415
1416    #[inline]
1417    pub fn derivative_state_for_degree(
1418        &self,
1419        max_degree: usize,
1420    ) -> Option<CellDerivativeMomentState> {
1421        let state = self.derivative_state.as_ref()?;
1422        if state.moments.len().saturating_sub(1) < max_degree {
1423            return None;
1424        }
1425        // See `state_for_degree`: shared `Arc` forces an inner clone here.
1426        let mut state = (**state).clone();
1427        state.moments.truncate(max_degree + 1);
1428        Some(state)
1429    }
1430
1431    #[inline]
1432    pub fn with_value(mut self, state: Arc<CellMomentState>) -> Self {
1433        self.state = Some(state);
1434        self
1435    }
1436
1437    #[inline]
1438    pub fn with_derivative(mut self, state: Arc<CellDerivativeMomentState>) -> Self {
1439        self.derivative_state = Some(state);
1440        self
1441    }
1442}
1443
1444impl ResidentBytes for CachedCellMoments {
1445    fn resident_bytes(&self) -> usize {
1446        let value_bytes = self
1447            .state
1448            .as_ref()
1449            .map_or(0, |state| state.resident_bytes());
1450        let derivative_bytes = self
1451            .derivative_state
1452            .as_ref()
1453            .map_or(0, |state| state.resident_bytes());
1454        std::mem::size_of::<Self>()
1455            .saturating_add(value_bytes)
1456            .saturating_add(derivative_bytes)
1457    }
1458}
1459
/// Monotone hit/miss counters for a cell-moment cache, readable without
/// locking.
#[derive(Debug, Default)]
pub struct CellMomentCacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CellMomentCacheStats {
    /// Returns the current `(hits, misses)` pair.
    ///
    /// The two counters are read separately with relaxed ordering, so the
    /// pair need not describe a single instant under concurrent updates.
    #[inline]
    pub fn snapshot(&self) -> (u64, u64) {
        (
            self.hits.load(Ordering::Relaxed),
            self.misses.load(Ordering::Relaxed),
        )
    }

    /// Returns `(hits, misses, hit_rate)` accumulated since `before`, an
    /// earlier [`Self::snapshot`].
    ///
    /// Each delta saturates at zero if `before` is ahead of the current
    /// counters. The rate is `0.0` when both deltas are zero; it is formed in
    /// `f64` so the hit/miss total cannot overflow `u64`.
    #[inline]
    pub fn hit_rate_delta(&self, before: (u64, u64)) -> (u64, u64, f64) {
        let (hits, misses) = self.snapshot();
        let dh = hits.saturating_sub(before.0);
        let dm = misses.saturating_sub(before.1);
        let total = dh as f64 + dm as f64;
        let rate = if total == 0.0 {
            0.0
        } else {
            dh as f64 / total
        };
        (dh, dm, rate)
    }
}
1489
/// Byte-budgeted LRU mapping an exact cell fingerprint to its cached moments.
pub type CellMomentLruCache = ByteLruCache<CellFingerprint, CachedCellMoments>;

/// Number of moments a [`CellMomentVec`] stores inline before it spills to
/// the heap.
pub const CELL_MOMENT_INLINE_CAPACITY: usize = 10;

/// Moment vector with [`CELL_MOMENT_INLINE_CAPACITY`] inline `f64` slots.
pub type CellMomentVec = SmallVec<[f64; CELL_MOMENT_INLINE_CAPACITY]>;
1495
1496#[derive(Clone, Debug, PartialEq)]
1497pub struct CellMomentState {
1498    pub branch: ExactCellBranch,
1499    pub value: f64,
1500    pub moments: CellMomentVec,
1501}
1502
1503impl ResidentBytes for CellMomentState {
1504    fn resident_bytes(&self) -> usize {
1505        let spilled_bytes = if self.moments.spilled() {
1506            self.moments
1507                .capacity()
1508                .saturating_mul(std::mem::size_of::<f64>())
1509        } else {
1510            0
1511        };
1512        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1513    }
1514}
1515
1516#[derive(Clone, Debug, PartialEq)]
1517pub struct CellDerivativeMomentState {
1518    pub branch: ExactCellBranch,
1519    pub moments: CellMomentVec,
1520}
1521
1522impl ResidentBytes for CellDerivativeMomentState {
1523    fn resident_bytes(&self) -> usize {
1524        let spilled_bytes = if self.moments.spilled() {
1525            self.moments
1526                .capacity()
1527                .saturating_mul(std::mem::size_of::<f64>())
1528        } else {
1529            0
1530        };
1531        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1532    }
1533}
1534
/// Borrowed counterpart of [`CellMomentState`]: the same fields, with the
/// moments exposed as a slice instead of an owned vector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CellMomentStateRef<'a> {
    /// Branch the cell was evaluated on.
    pub branch: ExactCellBranch,
    /// Scalar value reported alongside the moments.
    pub value: f64,
    /// Borrowed moment slice, indexed by degree.
    pub moments: &'a [f64],
}
1541
1542#[derive(Clone, Debug)]
1543pub struct CellMomentScratch {
1544    moments: Vec<f64>,
1545}
1546
1547impl Default for CellMomentScratch {
1548    fn default() -> Self {
1549        // Pre-size to the codebase's max moment degree so steady-state
1550        // `prepare_moments` calls never reallocate. Calls with `len`
1551        // exceeding this still reserve lazily.
1552        Self {
1553            moments: Vec::with_capacity(MAX_AFFINE_ANCHOR_DEGREE + 1),
1554        }
1555    }
1556}
1557
1558impl CellMomentScratch {
1559    pub fn new() -> Self {
1560        Self::default()
1561    }
1562
1563    pub fn with_capacity(max_degree: usize) -> Self {
1564        Self {
1565            moments: Vec::with_capacity(max_degree + 1),
1566        }
1567    }
1568
1569    #[inline]
1570    fn prepare_moments(&mut self, len: usize) -> &mut [f64] {
1571        if self.moments.capacity() < len {
1572            CELL_MOMENT_REALLOCS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1573            self.moments.reserve(len - self.moments.capacity());
1574        }
1575        // Grow monotonically: shorter requests should not truncate the backing
1576        // storage and then zero the old tail when a later request grows again.
1577        // Only the active prefix is scratch for this evaluation.
1578        if self.moments.len() < len {
1579            self.moments.resize(len, 0.0);
1580        }
1581        let out = &mut self.moments[..len];
1582        out.fill(0.0);
1583        out
1584    }
1585}
1586
/// Counter for moment-buffer reallocations in `prepare_moments`. Production
/// code increments this (relaxed) each time a request exceeds the scratch
/// buffer's capacity; the test mod inspects it to assert the steady-state hot
/// loop allocates exactly once per row buffer.
pub(crate) static CELL_MOMENT_REALLOCS: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);
1592
/// Canonical 20-point Gauss–Legendre nodes on [-1, 1] (Abramowitz & Stegun
/// 25.4), tabulated to f64 precision and listed in ascending order
/// (antisymmetric about zero). Used here for the Drezner–Wesolowsky bivariate
/// normal CDF representation, and shared with the cubic-cell B-spline moment
/// parity gate in [`crate::gpu_kernels::cubic_bspline_moments`].
///
/// NOTE(review): an earlier version of this comment claimed ">30-digit
/// accuracy" and exactness to f64 precision for all (h, k, ρ). The nodes are
/// themselves only f64, and that claim is not established anywhere in this
/// file — confirm the achieved accuracy, in particular for |ρ| close to 1.
pub const GL20_NODES: [f64; 20] = [
    -0.993_128_599_185_094_9,
    -0.963_971_927_277_913_8,
    -0.912_234_428_251_326,
    -0.839_116_971_822_218_8,
    -0.746_331_906_460_150_8,
    -0.636_053_680_726_515,
    -0.510_867_001_950_827_1,
    -0.373_706_088_715_419_6,
    -0.227_785_851_141_645_1,
    -0.076_526_521_133_497_33,
    0.076_526_521_133_497_33,
    0.227_785_851_141_645_1,
    0.373_706_088_715_419_6,
    0.510_867_001_950_827_1,
    0.636_053_680_726_515,
    0.746_331_906_460_150_8,
    0.839_116_971_822_218_8,
    0.912_234_428_251_326,
    0.963_971_927_277_913_8,
    0.993_128_599_185_094_9,
];
1621
/// Companion weights to [`GL20_NODES`], in the same order: entry `i` is the
/// weight of node `i`. Symmetric, summing to 2 (the length of [-1, 1]).
pub const GL20_WEIGHTS: [f64; 20] = [
    0.017_614_007_139_152_12,
    0.040_601_429_800_386_94,
    0.062_672_048_334_109_06,
    0.083_276_741_576_704_75,
    0.101_930_119_817_240_4,
    0.118_194_531_961_518_4,
    0.131_688_638_449_176_6,
    0.142_096_109_318_382_1,
    0.149_172_986_472_603_7,
    0.152_753_387_130_725_9,
    0.152_753_387_130_725_9,
    0.149_172_986_472_603_7,
    0.142_096_109_318_382_1,
    0.131_688_638_449_176_6,
    0.118_194_531_961_518_4,
    0.101_930_119_817_240_4,
    0.083_276_741_576_704_75,
    0.062_672_048_334_109_06,
    0.040_601_429_800_386_94,
    0.017_614_007_139_152_12,
];
1645
1646/// Provenance-tagged breakpoint dedup: sorts ascending and merges entries
1647/// coinciding within 1e-12, but when a fixed score break and a link-knot
1648/// crossing coincide (the kink configuration), the surviving entry keeps
1649/// the `Fixed` tag — a deterministic choice; the z location is identical
1650/// either way.
1651fn dedup_sorted_tagged_breakpoints(points: &mut Vec<(f64, PartitionEdge)>) {
1652    points.sort_by(|lhs, rhs| {
1653        lhs.0
1654            .partial_cmp(&rhs.0)
1655            .unwrap_or(std::cmp::Ordering::Equal)
1656    });
1657    points.dedup_by(|lhs, rhs| {
1658        let coincide = if lhs.0 == rhs.0 {
1659            true
1660        } else if lhs.0.is_finite() && rhs.0.is_finite() {
1661            (lhs.0 - rhs.0).abs() <= 1e-12
1662        } else {
1663            false
1664        };
1665        if coincide && matches!(lhs.1, PartitionEdge::Fixed(_)) {
1666            // `dedup_by` keeps `rhs` (the earlier element) — propagate the
1667            // Fixed tag onto the survivor.
1668            rhs.1 = lhs.1;
1669        }
1670        coincide
1671    });
1672}
1673
1674#[inline]
1675pub fn interval_probe_point(left: f64, right: f64) -> Result<f64, String> {
1676    if !(left < right) {
1677        return Err(CubicCellKernelError::invalid_interval(format!(
1678            "interval probe requires ordered bounds, got [{left}, {right}]"
1679        ))
1680        .into());
1681    }
1682    if left.is_finite() && right.is_finite() {
1683        Ok(0.5 * (left + right))
1684    } else if left == f64::NEG_INFINITY && right == f64::INFINITY {
1685        Ok(0.0)
1686    } else if left == f64::NEG_INFINITY && right.is_finite() {
1687        Ok(right - 1.0)
1688    } else if left.is_finite() && right == f64::INFINITY {
1689        Ok(left + 1.0)
1690    } else {
1691        Err(CubicCellKernelError::invalid_interval(format!(
1692            "interval probe requires finite bounds or full infinities, got [{left}, {right}]"
1693        ))
1694        .into())
1695    }
1696}
1697
/// Ascending-power coefficients of `q'(z)` for a quadratic
/// `η(z) = c0 + c1·z + c2·z²`, where `q(z) = (z² + η(z)²) / 2` and therefore
/// `q'(z) = z + η(z)·η'(z)`.
///
/// Index `k` of the result multiplies `z^k`; the last entry is the leading
/// (cubic) coefficient `2·c2²`.
#[inline]
pub fn quartic_qprime_coefficients(c0: f64, c1: f64, c2: f64) -> [f64; 4] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c1 * c2;
    let cubic = 2.0 * c2 * c2;
    [constant, linear, quadratic, cubic]
}
1707
/// Ascending-power coefficients of `q'(z)` for a cubic
/// `η(z) = c0 + c1·z + c2·z² + c3·z³`, where `q(z) = (z² + η(z)²) / 2` and
/// therefore `q'(z) = z + η(z)·η'(z)`.
///
/// Index `k` of the result multiplies `z^k`; the last entry is the leading
/// (quintic) coefficient `3·c3²`.
#[inline]
pub fn sextic_qprime_coefficients(c0: f64, c1: f64, c2: f64, c3: f64) -> [f64; 6] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c0 * c3 + 3.0 * c1 * c2;
    let cubic = 4.0 * c1 * c3 + 2.0 * c2 * c2;
    let quartic = 5.0 * c2 * c3;
    let quintic = 3.0 * c3 * c3;
    [constant, linear, quadratic, cubic, quartic, quintic]
}
1719
1720/// Boundary term `right^n · exp(−q(right)) − left^n · exp(−q(left))` used by
1721/// the moment recurrences. Takes precomputed `left^n` and `right^n` so callers
1722/// can roll the powers across a recurrence — each iteration becomes one
1723/// multiply instead of a fresh `powi(n)`.
1724#[inline]
1725fn moment_boundary_term_with_powers(
1726    cell: DenestedCubicCell,
1727    left_pow_n: f64,
1728    right_pow_n: f64,
1729) -> f64 {
1730    let left_term = if cell.left.is_infinite() {
1731        0.0
1732    } else {
1733        left_pow_n * (-cell.q(cell.left)).exp()
1734    };
1735    let right_term = if cell.right.is_infinite() {
1736        0.0
1737    } else {
1738        right_pow_n * (-cell.q(cell.right)).exp()
1739    };
1740    right_term - left_term
1741}
1742
/// Reports whether `base` agrees with `direct` entry by entry, within the
/// mixed tolerance `1e-10 · (1 + max(|base_i|, |direct_i|))`.
///
/// Only the common prefix of the two slices is compared, so an empty slice
/// matches anything. A NaN difference counts as a mismatch.
#[inline]
fn base_moments_match_direct(base: &[f64], direct: &[f64]) -> bool {
    for (&supplied, &reference) in base.iter().zip(direct) {
        let tolerance = 1e-10 * (1.0 + supplied.abs().max(reference.abs()));
        // Negated `<=` so that a NaN difference is treated as a mismatch.
        if !((supplied - reference).abs() <= tolerance) {
            return false;
        }
    }
    true
}
1749
1750#[inline]
1751fn direct_non_affine_moments_if_base_matches(
1752    cell: DenestedCubicCell,
1753    base: &[f64],
1754    max_degree: usize,
1755) -> Option<Vec<f64>> {
1756    if !cell.left.is_finite() || !cell.right.is_finite() {
1757        return None;
1758    }
1759    // When the supplied base moments are the actual moments of this fixed
1760    // finite cell, prefer the same quadrature-backed evaluator used by the
1761    // public non-affine moment path.  The algebraic raising recurrence is kept
1762    // below for callers that intentionally pass symbolic or otherwise
1763    // non-cell-consistent bases, but repeatedly dividing by the quartic/sextic
1764    // leading coefficient can amplify harmless base-roundoff into high-order
1765    // moment error.
1766    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
1767    if base_moments_match_direct(base, &moments) {
1768        Some(moments.into_vec())
1769    } else {
1770        None
1771    }
1772}
1773
1774pub fn reduce_quartic_moments(
1775    cell: DenestedCubicCell,
1776    base_m0_m2: [f64; 3],
1777    max_degree: usize,
1778) -> Result<Vec<f64>, String> {
1779    if max_degree <= 2 {
1780        return Ok(base_m0_m2[..=max_degree].to_vec());
1781    }
1782    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m2, max_degree)
1783    {
1784        return Ok(moments);
1785    }
1786    let d = quartic_qprime_coefficients(cell.c0, cell.c1, cell.c2);
1787    let lead = d[3];
1788    if !lead.is_finite() || lead.abs() <= 1e-18 {
1789        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1790            "quartic moment reduction requires nonzero leading coefficient, got {lead:.3e}"
1791        ))
1792        .into());
1793    }
1794    let mut moments = vec![0.0; max_degree + 1];
1795    moments[0] = base_m0_m2[0];
1796    moments[1] = base_m0_m2[1];
1797    moments[2] = base_m0_m2[2];
1798    // Roll left^n / right^n across the recurrence rather than calling
1799    // `powi(n)` each iteration. Skip the multiply when an endpoint is
1800    // infinite — the boundary helper ignores the power in that case, and
1801    // ∞·0 would produce a NaN we'd then have to mask off anyway.
1802    let left_finite = cell.left.is_finite();
1803    let right_finite = cell.right.is_finite();
1804    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1805    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1806    for n in 0..=(max_degree - 3) {
1807        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1808        let mut numer = if n == 0 {
1809            0.0
1810        } else {
1811            (n as f64) * moments[n - 1]
1812        };
1813        for j in 0..=2 {
1814            numer -= d[j] * moments[n + j];
1815        }
1816        numer -= b_n;
1817        moments[n + 3] = numer / lead;
1818        if left_finite {
1819            left_pow_n *= cell.left;
1820        }
1821        if right_finite {
1822            right_pow_n *= cell.right;
1823        }
1824    }
1825    Ok(moments)
1826}
1827
1828pub fn reduce_sextic_moments(
1829    cell: DenestedCubicCell,
1830    base_m0_m4: [f64; 5],
1831    max_degree: usize,
1832) -> Result<Vec<f64>, String> {
1833    if max_degree <= 4 {
1834        return Ok(base_m0_m4[..=max_degree].to_vec());
1835    }
1836    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m4, max_degree)
1837    {
1838        return Ok(moments);
1839    }
1840    let d = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3);
1841    let lead = d[5];
1842    if !lead.is_finite() {
1843        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1844            "sextic moment reduction encountered non-finite leading coefficient: {lead:.3e}"
1845        ))
1846        .into());
1847    }
1848    if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
1849        if lower_branch == ExactCellBranch::Quartic {
1850            return evaluate_non_affine_cell_state(
1851                DenestedCubicCell { c3: 0.0, ..cell },
1852                ExactCellBranch::Quartic,
1853                max_degree,
1854            )
1855            .map(|state| state.moments.into_vec());
1856        }
1857        return evaluate_affine_cell_state(
1858            DenestedCubicCell {
1859                left: cell.left,
1860                right: cell.right,
1861                c0: cell.c0,
1862                c1: cell.c1,
1863                c2: 0.0,
1864                c3: 0.0,
1865            },
1866            max_degree,
1867        )
1868        .map(|state| state.moments.into_vec());
1869    }
1870    let mut moments = vec![0.0; max_degree + 1];
1871    for (idx, value) in base_m0_m4.into_iter().enumerate() {
1872        moments[idx] = value;
1873    }
1874    let left_finite = cell.left.is_finite();
1875    let right_finite = cell.right.is_finite();
1876    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1877    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1878    for n in 0..=(max_degree - 5) {
1879        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1880        let mut numer = if n == 0 {
1881            0.0
1882        } else {
1883            (n as f64) * moments[n - 1]
1884        };
1885        for j in 0..=4 {
1886            numer -= d[j] * moments[n + j];
1887        }
1888        numer -= b_n;
1889        moments[n + 5] = numer / lead;
1890        if left_finite {
1891            left_pow_n *= cell.left;
1892        }
1893        if right_finite {
1894            right_pow_n *= cell.right;
1895        }
1896    }
1897    Ok(moments)
1898}
1899
1900#[inline]
1901pub fn cell_first_derivative_from_moments(
1902    derivative_coefficients: &[f64],
1903    moments: &[f64],
1904) -> Result<f64, String> {
1905    let value = moment_dot_with_coefficients(derivative_coefficients, moments, "first derivative")?;
1906    Ok(value * INV_TWO_PI)
1907}
1908
/// Highest moment index that
/// `cell_first_derivative_from_moments(derivative_coefficients, moments)`
/// reads, i.e. the `max_degree` to hand to `evaluate_cell_moments`.
///
/// This is the polynomial degree of `derivative_coefficients`
/// (`len - 1`, or `0` for an empty slice). Callers must request at least this
/// many moments so the moment dot is well-defined; #321 was caused by a call
/// site hardcoding a smaller value.
#[inline]
pub fn cell_first_derivative_required_max_degree(derivative_coefficients: &[f64]) -> usize {
    match derivative_coefficients.len() {
        0 => 0,
        len => len - 1,
    }
}
1920
/// Highest moment index that `cell_second_derivative_from_moments` requires,
/// expressed as a `max_degree` for `evaluate_cell_moments(cell, max_degree)`.
///
/// The kernel itself checks `needed = max(second_deg, product_deg) + 1`
/// entries; this returns `needed - 1`. `product_deg` is the degree of
/// `eta · r · s`: the two first-coefficient degrees plus 3 for the cubic
/// `eta` polynomial of the cell. Empty slices count as degree 0.
#[inline]
pub fn cell_second_derivative_required_max_degree(
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    second_coefficients_rs: &[f64],
) -> usize {
    let degree_of = |coefficients: &[f64]| coefficients.len().saturating_sub(1);
    let eta_r_s_degree = degree_of(first_coefficients_r) + degree_of(first_coefficients_s) + 3;
    degree_of(second_coefficients_rs).max(eta_r_s_degree)
}
1941
1942#[inline]
1943pub fn cell_polynomial_integral_from_moments(
1944    polynomial_coefficients: &[f64],
1945    moments: &[f64],
1946    label: &str,
1947) -> Result<f64, String> {
1948    let value = moment_dot_with_coefficients(polynomial_coefficients, moments, label)?;
1949    Ok(value * INV_TWO_PI)
1950}
1951
1952#[inline]
1953pub fn cell_second_derivative_from_moments(
1954    cell: DenestedCubicCell,
1955    first_coefficients_r: &[f64],
1956    first_coefficients_s: &[f64],
1957    second_coefficients_rs: &[f64],
1958    moments: &[f64],
1959) -> Result<f64, String> {
1960    let second_degree = second_coefficients_rs.len().saturating_sub(1);
1961    let product_degree = first_coefficients_r.len().saturating_sub(1)
1962        + first_coefficients_s.len().saturating_sub(1)
1963        + 3;
1964    let needed = second_degree.max(product_degree) + 1;
1965    if needed > moments.len() {
1966        return Err(CubicCellKernelError::insufficient_moments(format!(
1967            "insufficient reduced moments for second derivative: need {}, have {}",
1968            needed,
1969            moments.len()
1970        ))
1971        .into());
1972    }
1973    let second_term = moment_dot_with_coefficients_unchecked(second_coefficients_rs, moments);
1974    // Fold `Σ_{e,i,j} eta[e]·r[i]·s[j]·moments[e+i+j]` into a single dot
1975    // against `moments`. Convolving `eta ⊗ r ⊗ s` first turns the original
1976    // `len(eta)·len(r)·len(s)` triple loop (typically 4·4·4 = 64 mul-adds
1977    // per call) into `len(eta)·len(r) + (len(eta)+len(r)-1)·len(s) +
1978    // len(out)` ≈ 16 + 28 + 10 = 54 mul-adds, with the inner loops now in
1979    // straight-line FMA-friendly form.
1980    let cubic = [cell.c0, cell.c1, cell.c2, cell.c3];
1981    // Capacity bound: cubic (4) + first_r (≤MAX) + first_s (≤MAX) - 2.
1982    // First-coefficient slices are passed in as `[f64; 4]` from every
1983    // production caller; sizing to 32 covers any realistic test input.
1984    const SCRATCH: usize = 32;
1985    let mut eta_r = [0.0_f64; SCRATCH];
1986    let mut eta_rs = [0.0_f64; SCRATCH];
1987    let er_len = poly_conv_into(&cubic, first_coefficients_r, &mut eta_r);
1988    let ers_len = poly_conv_into(&eta_r[..er_len], first_coefficients_s, &mut eta_rs);
1989    let mut eta_term = 0.0;
1990    for k in 0..ers_len {
1991        eta_term = eta_rs[k].mul_add(moments[k], eta_term);
1992    }
1993    Ok((second_term - eta_term) * INV_TWO_PI)
1994}
1995
1996/// Pointwise value of the cell second-derivative integrand
1997/// `(∂²/∂r∂s) exp(-q(z))/2π` at a single `z`, evaluated from the SAME
1998/// `(r, s, rs)` coefficient polynomials the moment reduction
1999/// [`cell_second_derivative_from_moments`] integrates:
2000///
2001/// ```text
2002///   F_rs(z) = ( c_rs(z) - η(z)·c_r(z)·c_s(z) ) · exp(-q(z)) · 1/2π ,
2003/// ```
2004///
2005/// with `c_•(z) = Σ_k coeff_•[k]·zᵏ`, `η(z)` the cell cubic, and
2006/// `q(z) = ½(z² + η(z)²)`. This is the integrand whose `[cell.left,
2007/// cell.right]` integral the from-moments form returns — needed for the
2008/// Leibniz boundary term when a cell edge (a link-knot crossing
2009/// `z=(τ-a)/b`) moves with a parameter (the slope `b`): the directional
2010/// derivative of `∫_{z_L}^{z_R} F_rs dz` picks up
2011/// `F_rs(z_R)·z_R'(dir) - F_rs(z_L)·z_L'(dir)` on top of the fixed-domain
2012/// part. Coefficient sign convention matches the simpson reference
2013/// (`numeric_ab`): pass the ACTUAL derivative-coefficient polynomials
2014/// `∂c/∂r` etc. (not the negated `neg_dc_d•` the moment path consumes).
2015#[inline]
2016pub fn cell_second_derivative_boundary_integrand(
2017    cell: DenestedCubicCell,
2018    first_coefficients_r: &[f64],
2019    first_coefficients_s: &[f64],
2020    second_coefficients_rs: &[f64],
2021    z: f64,
2022) -> f64 {
2023    let eta = cell.eta(z);
2024    let c_r = poly_eval_at(first_coefficients_r, z);
2025    let c_s = poly_eval_at(first_coefficients_s, z);
2026    let c_rs = poly_eval_at(second_coefficients_rs, z);
2027    (c_rs - eta * c_r * c_s) * (-cell.q(z)).exp() * INV_TWO_PI
2028}
2029
2030/// Pointwise value of the cell third-derivative integrand
2031/// `(∂³/∂r∂s∂t) exp(-q(z))/2π` at a single `z`, evaluated from the same
2032/// `(r, s, t, rs, rt, st, rst)` coefficient polynomials that
2033/// [`cell_third_derivative_from_moments`] integrates:
2034///
2035/// ```text
2036/// F_rst(z) = (
2037///     c_rst(z)
2038///   - η(z)·(c_rs(z)c_t(z) + c_rt(z)c_s(z) + c_st(z)c_r(z))
2039///   + (η(z)² - 1)·c_r(z)c_s(z)c_t(z)
2040/// ) · exp(-q(z)) · 1/2π .
2041/// ```
2042///
2043/// This is the boundary value for differentiating an already-third-order
2044/// fixed-domain integral with respect to a moving edge. The sign convention is
2045/// intentionally identical to [`cell_third_derivative_from_moments`]: callers
2046/// must pass the coefficient slices in the convention of the integral they are
2047/// differentiating. In particular, survival/probit paths that integrate the
2048/// jointly negated cell and coefficient slices must evaluate this boundary
2049/// integrand with the same joint negation; evaluating an un-negated boundary for
2050/// a negated fixed-domain integral flips the sign of this odd-order integrand.
2051#[inline]
2052pub fn cell_third_derivative_boundary_integrand(
2053    cell: DenestedCubicCell,
2054    first_coefficients_r: &[f64],
2055    first_coefficients_s: &[f64],
2056    first_coefficients_t: &[f64],
2057    second_coefficients_rs: &[f64],
2058    second_coefficients_rt: &[f64],
2059    second_coefficients_st: &[f64],
2060    third_coefficients_rst: &[f64],
2061    z: f64,
2062) -> f64 {
2063    let eta = cell.eta(z);
2064    let c_r = poly_eval_at(first_coefficients_r, z);
2065    let c_s = poly_eval_at(first_coefficients_s, z);
2066    let c_t = poly_eval_at(first_coefficients_t, z);
2067    let c_rs = poly_eval_at(second_coefficients_rs, z);
2068    let c_rt = poly_eval_at(second_coefficients_rt, z);
2069    let c_st = poly_eval_at(second_coefficients_st, z);
2070    let c_rst = poly_eval_at(third_coefficients_rst, z);
2071    let amplitude =
2072        c_rst - eta * (c_rs * c_t + c_rt * c_s + c_st * c_r) + (eta * eta - 1.0) * c_r * c_s * c_t;
2073    amplitude * (-cell.q(z)).exp() * INV_TWO_PI
2074}
2075
2076/// Pointwise value of the density-weighted integrand `g(z)·exp(-q(z))/2π` at a
2077/// single `z`, for an arbitrary integrand polynomial `g`.
2078///
2079/// This is the boundary value needed for the moving-domain (Leibniz) term of a
2080/// density-normalization integral `∫ g(z)·exp(-q(z))/2π dz` whose cell edge is a
2081/// link-knot crossing `z=(τ-a)/b` that moves with a parameter direction: the
2082/// directional derivative of the integral picks up
2083/// `g(z_R)·w(z_R)·z_R'(dir) - g(z_L)·w(z_L)·z_L'(dir)` on top of the
2084/// fixed-domain part, with `w(z)=exp(-q(z))/2π` the same weight the moment
2085/// reductions integrate. Unlike the Hessian-integral boundary term (which is
2086/// shared by adjacent cells and cancels across each interior knot), the
2087/// ln-density integrand `D_t`/`D_t,uv` carries a non-shared `g`, so this
2088/// Leibniz term does NOT cancel and must be added (gam#932/#979).
2089pub fn cell_density_boundary_integrand(cell: DenestedCubicCell, g: &[f64], z: f64) -> f64 {
2090    poly_eval_at(g, z) * (-cell.q(z)).exp() * INV_TWO_PI
2091}
2092
/// Evaluates `Σ_k coefficients[k]·zᵏ` by Horner's scheme, highest-degree
/// coefficient first, with one fused multiply-add per coefficient. An empty
/// slice evaluates to `0.0`.
#[inline]
fn poly_eval_at(coefficients: &[f64], z: f64) -> f64 {
    coefficients
        .iter()
        .rev()
        .fold(0.0_f64, |acc, &coefficient| acc.mul_add(z, coefficient))
}
2102
2103#[inline]
2104fn moment_dot_with_coefficients(
2105    coefficients: &[f64],
2106    moments: &[f64],
2107    label: &str,
2108) -> Result<f64, String> {
2109    if coefficients.len() > moments.len() {
2110        return Err(CubicCellKernelError::insufficient_moments(format!(
2111            "insufficient reduced moments for {label}: need {}, have {}",
2112            coefficients.len(),
2113            moments.len()
2114        ))
2115        .into());
2116    }
2117    Ok(moment_dot_with_coefficients_unchecked(
2118        coefficients,
2119        moments,
2120    ))
2121}
2122
/// Dot product `Σ_k coefficients[k] · moments[k]`, accumulated in index order
/// with fused multiply-adds.
///
/// "Unchecked" refers to the missing length validation, not to memory
/// safety: `moments` is still indexed normally, so the call panics if
/// `coefficients` is longer than `moments`.
#[inline]
fn moment_dot_with_coefficients_unchecked(coefficients: &[f64], moments: &[f64]) -> f64 {
    coefficients
        .iter()
        .enumerate()
        .fold(0.0, |acc, (idx, &coeff)| coeff.mul_add(moments[idx], acc))
}
2131
/// Multiplies two polynomials given as coefficient slices, writing the
/// product's coefficients into the front of `out`.
///
/// Returns the number of coefficients written: `lhs.len() + rhs.len() - 1`
/// when both inputs are non-empty, otherwise `0` (and `out` is left
/// untouched). Entries of `out` past the returned length are not modified, so
/// callers must only read the returned prefix.
///
/// The multi-derivative reductions use this to collapse `eta · r · s · …`
/// triple and quadruple sums into a single moment dot, removing the
/// `O(deg^3)`/`O(deg^4)` inner loops that dominated the
/// `cell_*_derivative_from_moments` hot leaves on large-scale fits.
///
/// # Panics
///
/// Panics if `out` is shorter than the product length.
#[inline]
fn poly_conv_into(lhs: &[f64], rhs: &[f64], out: &mut [f64]) -> usize {
    let len = match (lhs.len(), rhs.len()) {
        (0, _) | (_, 0) => return 0,
        (lhs_len, rhs_len) => lhs_len + rhs_len - 1,
    };
    assert!(out.len() >= len);

    let product = &mut out[..len];
    product.fill(0.0);
    for (i, &lv) in lhs.iter().enumerate() {
        // Row `i` of the product lands on slots `i..i + rhs.len()`.
        for (slot, &rv) in product[i..].iter_mut().zip(rhs) {
            *slot = lv.mul_add(rv, *slot);
        }
    }
    len
}
2158
2159#[inline]
2160fn require_moments_degree(
2161    required_degree: usize,
2162    moments: &[f64],
2163    label: &str,
2164) -> Result<(), String> {
2165    if required_degree >= moments.len() {
2166        return Err(CubicCellKernelError::insufficient_moments(format!(
2167            "insufficient reduced moments for {label}: need {}, have {}",
2168            required_degree + 1,
2169            moments.len()
2170        ))
2171        .into());
2172    }
2173    Ok::<(), _>(())
2174}
2175
2176#[inline]
2177fn require_scratch_capacity(
2178    required_len: usize,
2179    capacity: usize,
2180    label: &str,
2181) -> Result<(), String> {
2182    if required_len > capacity {
2183        return Err(CubicCellKernelError::insufficient_moments(format!(
2184            "{label} polynomial convolution scratch too small: need {required_len}, have {capacity}"
2185        ))
2186        .into());
2187    }
2188    Ok::<(), _>(())
2189}
2190
/// Coefficient count of the product of several polynomials whose coefficient
/// counts are `lengths`: the sum of the lengths minus one per multiplication.
///
/// Returns `0` when `lengths` is empty or any factor is empty, matching
/// `poly_conv_into`, which produces nothing for an empty operand.
#[inline]
fn convolution_chain_len(lengths: &[usize]) -> usize {
    if lengths.is_empty() || lengths.iter().any(|&len| len == 0) {
        return 0;
    }
    let total: usize = lengths.iter().sum();
    total - (lengths.len() - 1)
}
2199
/// Polynomial degree (`len - 1`) of a first-derivative coefficient slice.
///
/// `label` identifies the slice (e.g. `"r"`) in the error message.
///
/// # Errors
///
/// Fails when `coefficients` is empty.
#[inline]
fn first_coefficients_degree(label: &str, coefficients: &[f64]) -> Result<usize, String> {
    match coefficients.len() {
        0 => Err(format!("{label} first-derivative coefficients must be non-empty")),
        len => Ok(len - 1),
    }
}
2207
2208#[inline]
2209pub fn cell_third_derivative_from_moments(
2210    cell: DenestedCubicCell,
2211    first_coefficients_r: &[f64],
2212    first_coefficients_s: &[f64],
2213    first_coefficients_t: &[f64],
2214    second_coefficients_rs: &[f64],
2215    second_coefficients_rt: &[f64],
2216    second_coefficients_st: &[f64],
2217    third_coefficients_rst: &[f64],
2218    moments: &[f64],
2219) -> Result<f64, String> {
2220    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2221    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2222    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2223    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2224    let second_sum_degree = [
2225        second_coefficients_rs.len() + first_coefficients_t.len(),
2226        second_coefficients_rt.len() + first_coefficients_s.len(),
2227        second_coefficients_st.len() + first_coefficients_r.len(),
2228    ]
2229    .into_iter()
2230    .max()
2231    .unwrap_or(0)
2232    .saturating_sub(1);
2233    let triple_product_degree = r_degree + s_degree + t_degree;
2234    let needed = (third_coefficients_rst.len().saturating_sub(1))
2235        .max(3 + second_sum_degree)
2236        .max(6 + triple_product_degree);
2237    require_moments_degree(needed, moments, "third derivative")?;
2238
2239    let third_term = moment_dot_with_coefficients_unchecked(third_coefficients_rst, moments);
2240
2241    // This is a deliberately serial leaf kernel: each call performs only a
2242    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2243    // at the surrounding row/cell batch level rather than inside this hot path.
2244    const SCRATCH: usize = 32;
2245    let max_linear_conv_len = [
2246        convolution_chain_len(&[
2247            eta.len(),
2248            second_coefficients_rs.len(),
2249            first_coefficients_t.len(),
2250        ]),
2251        convolution_chain_len(&[
2252            eta.len(),
2253            second_coefficients_rt.len(),
2254            first_coefficients_s.len(),
2255        ]),
2256        convolution_chain_len(&[
2257            eta.len(),
2258            second_coefficients_st.len(),
2259            first_coefficients_r.len(),
2260        ]),
2261    ]
2262    .into_iter()
2263    .max()
2264    .unwrap_or(0);
2265    let max_cubic_conv_len = convolution_chain_len(&[
2266        7,
2267        first_coefficients_r.len(),
2268        first_coefficients_s.len(),
2269        first_coefficients_t.len(),
2270    ]);
2271    require_scratch_capacity(
2272        max_linear_conv_len.max(max_cubic_conv_len),
2273        SCRATCH,
2274        "third derivative",
2275    )?;
2276    let mut buf_a = [0.0_f64; SCRATCH];
2277    let mut buf_b = [0.0_f64; SCRATCH];
2278
2279    // eta_second_term = Σ over (rs⊗t, rt⊗s, st⊗r) of eta⊗product · moments.
2280    // Fold each of the three triple sums into a single moment dot.
2281    let mut eta_second_term = 0.0;
2282    let conv_dot = |first: &[f64],
2283                    second: &[f64],
2284                    buf_a: &mut [f64; SCRATCH],
2285                    buf_b: &mut [f64; SCRATCH]|
2286     -> f64 {
2287        let m = poly_conv_into(first, second, buf_a);
2288        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2289        let mut acc = 0.0;
2290        for k in 0..n {
2291            acc = buf_b[k].mul_add(moments[k], acc);
2292        }
2293        acc
2294    };
2295    eta_second_term += conv_dot(
2296        second_coefficients_rs,
2297        first_coefficients_t,
2298        &mut buf_a,
2299        &mut buf_b,
2300    );
2301    eta_second_term += conv_dot(
2302        second_coefficients_rt,
2303        first_coefficients_s,
2304        &mut buf_a,
2305        &mut buf_b,
2306    );
2307    eta_second_term += conv_dot(
2308        second_coefficients_st,
2309        first_coefficients_r,
2310        &mut buf_a,
2311        &mut buf_b,
2312    );
2313
2314    // cubic_coeff_term = Σ_{e,i,j,k} (eta·eta − 1)[e] · r[i] · s[j] · t[k] · moments[e+i+j+k].
2315    // Convolve r⊗s, then ⊗t, then ⊗(eta·eta − 1), giving a single dot.
2316    let mut eta_sq_minus_one = [0.0_f64; 7];
2317    for (i, &eta_i) in eta.iter().enumerate() {
2318        for (j, &eta_j) in eta.iter().enumerate() {
2319            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2320        }
2321    }
2322    eta_sq_minus_one[0] -= 1.0;
2323
2324    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2325    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2326    // buf_a now reused for (eta_sq_minus_one ⊗ rst).
2327    let final_len = poly_conv_into(&eta_sq_minus_one, &buf_b[..rst_len], &mut buf_a);
2328    let mut cubic_coeff_term = 0.0;
2329    for k in 0..final_len {
2330        cubic_coeff_term = buf_a[k].mul_add(moments[k], cubic_coeff_term);
2331    }
2332
2333    Ok((third_term - eta_second_term + cubic_coeff_term) * INV_TWO_PI)
2334}
2335
2336#[inline]
2337pub fn cell_fourth_derivative_from_moments(
2338    cell: DenestedCubicCell,
2339    first_coefficients_r: &[f64],
2340    first_coefficients_s: &[f64],
2341    first_coefficients_t: &[f64],
2342    first_coefficients_u: &[f64],
2343    second_coefficients_rs: &[f64],
2344    second_coefficients_rt: &[f64],
2345    second_coefficients_ru: &[f64],
2346    second_coefficients_st: &[f64],
2347    second_coefficients_su: &[f64],
2348    second_coefficients_tu: &[f64],
2349    third_coefficients_rst: &[f64],
2350    third_coefficients_rsu: &[f64],
2351    third_coefficients_rtu: &[f64],
2352    third_coefficients_stu: &[f64],
2353    fourth_coefficients_rstu: &[f64],
2354    moments: &[f64],
2355) -> Result<f64, String> {
2356    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2357    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2358    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2359    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2360    let u_degree = first_coefficients_degree("u", first_coefficients_u)?;
2361    let linear_sum_degree = [
2362        third_coefficients_rst.len() + first_coefficients_u.len(),
2363        third_coefficients_rsu.len() + first_coefficients_t.len(),
2364        third_coefficients_rtu.len() + first_coefficients_s.len(),
2365        third_coefficients_stu.len() + first_coefficients_r.len(),
2366        second_coefficients_rs.len() + second_coefficients_tu.len(),
2367        second_coefficients_rt.len() + second_coefficients_su.len(),
2368        second_coefficients_ru.len() + second_coefficients_st.len(),
2369    ]
2370    .into_iter()
2371    .max()
2372    .unwrap_or(0)
2373    .saturating_sub(1);
2374    let quad_sum_degree = [
2375        second_coefficients_rs.len() + first_coefficients_t.len() + first_coefficients_u.len(),
2376        second_coefficients_rt.len() + first_coefficients_s.len() + first_coefficients_u.len(),
2377        second_coefficients_ru.len() + first_coefficients_s.len() + first_coefficients_t.len(),
2378        second_coefficients_st.len() + first_coefficients_r.len() + first_coefficients_u.len(),
2379        second_coefficients_su.len() + first_coefficients_r.len() + first_coefficients_t.len(),
2380        second_coefficients_tu.len() + first_coefficients_r.len() + first_coefficients_s.len(),
2381    ]
2382    .into_iter()
2383    .max()
2384    .unwrap_or(0)
2385    .saturating_sub(2);
2386    let quartic_product_degree = r_degree + s_degree + t_degree + u_degree;
2387    let needed = (fourth_coefficients_rstu.len().saturating_sub(1))
2388        .max(3 + linear_sum_degree)
2389        .max(6 + quad_sum_degree)
2390        .max(9 + quartic_product_degree);
2391    require_moments_degree(needed, moments, "fourth derivative")?;
2392
2393    let fourth_term = moment_dot_with_coefficients_unchecked(fourth_coefficients_rstu, moments);
2394
2395    // This is a deliberately serial leaf kernel: each call performs only a
2396    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2397    // at the surrounding row/cell batch level rather than inside this hot path.
2398    const SCRATCH: usize = 32;
2399    let max_linear_conv_len = [
2400        convolution_chain_len(&[
2401            eta.len(),
2402            third_coefficients_rst.len(),
2403            first_coefficients_u.len(),
2404        ]),
2405        convolution_chain_len(&[
2406            eta.len(),
2407            third_coefficients_rsu.len(),
2408            first_coefficients_t.len(),
2409        ]),
2410        convolution_chain_len(&[
2411            eta.len(),
2412            third_coefficients_rtu.len(),
2413            first_coefficients_s.len(),
2414        ]),
2415        convolution_chain_len(&[
2416            eta.len(),
2417            third_coefficients_stu.len(),
2418            first_coefficients_r.len(),
2419        ]),
2420        convolution_chain_len(&[
2421            eta.len(),
2422            second_coefficients_rs.len(),
2423            second_coefficients_tu.len(),
2424        ]),
2425        convolution_chain_len(&[
2426            eta.len(),
2427            second_coefficients_rt.len(),
2428            second_coefficients_su.len(),
2429        ]),
2430        convolution_chain_len(&[
2431            eta.len(),
2432            second_coefficients_ru.len(),
2433            second_coefficients_st.len(),
2434        ]),
2435    ]
2436    .into_iter()
2437    .max()
2438    .unwrap_or(0);
2439    let max_quad_conv_len = [
2440        convolution_chain_len(&[
2441            7,
2442            second_coefficients_rs.len(),
2443            first_coefficients_t.len(),
2444            first_coefficients_u.len(),
2445        ]),
2446        convolution_chain_len(&[
2447            7,
2448            second_coefficients_rt.len(),
2449            first_coefficients_s.len(),
2450            first_coefficients_u.len(),
2451        ]),
2452        convolution_chain_len(&[
2453            7,
2454            second_coefficients_ru.len(),
2455            first_coefficients_s.len(),
2456            first_coefficients_t.len(),
2457        ]),
2458        convolution_chain_len(&[
2459            7,
2460            second_coefficients_st.len(),
2461            first_coefficients_r.len(),
2462            first_coefficients_u.len(),
2463        ]),
2464        convolution_chain_len(&[
2465            7,
2466            second_coefficients_su.len(),
2467            first_coefficients_r.len(),
2468            first_coefficients_t.len(),
2469        ]),
2470        convolution_chain_len(&[
2471            7,
2472            second_coefficients_tu.len(),
2473            first_coefficients_r.len(),
2474            first_coefficients_s.len(),
2475        ]),
2476    ]
2477    .into_iter()
2478    .max()
2479    .unwrap_or(0);
2480    let max_quartic_conv_len = convolution_chain_len(&[
2481        10,
2482        first_coefficients_r.len(),
2483        first_coefficients_s.len(),
2484        first_coefficients_t.len(),
2485        first_coefficients_u.len(),
2486    ]);
2487    require_scratch_capacity(
2488        max_linear_conv_len
2489            .max(max_quad_conv_len)
2490            .max(max_quartic_conv_len),
2491        SCRATCH,
2492        "fourth derivative",
2493    )?;
2494    let mut buf_a = [0.0_f64; SCRATCH];
2495    let mut buf_b = [0.0_f64; SCRATCH];
2496
2497    // eta_linear_term = Σ over seven (rst⊗u, rsu⊗t, rtu⊗s, stu⊗r, rs⊗tu,
2498    // rt⊗su, ru⊗st) of eta⊗product · moments. Fold each triple sum into
2499    // a single moment dot.
2500    let conv_eta_dot = |first: &[f64],
2501                        second: &[f64],
2502                        buf_a: &mut [f64; SCRATCH],
2503                        buf_b: &mut [f64; SCRATCH]|
2504     -> f64 {
2505        let m = poly_conv_into(first, second, buf_a);
2506        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2507        let mut acc = 0.0;
2508        for k in 0..n {
2509            acc = buf_b[k].mul_add(moments[k], acc);
2510        }
2511        acc
2512    };
2513    let mut eta_linear_term = 0.0;
2514    eta_linear_term += conv_eta_dot(
2515        third_coefficients_rst,
2516        first_coefficients_u,
2517        &mut buf_a,
2518        &mut buf_b,
2519    );
2520    eta_linear_term += conv_eta_dot(
2521        third_coefficients_rsu,
2522        first_coefficients_t,
2523        &mut buf_a,
2524        &mut buf_b,
2525    );
2526    eta_linear_term += conv_eta_dot(
2527        third_coefficients_rtu,
2528        first_coefficients_s,
2529        &mut buf_a,
2530        &mut buf_b,
2531    );
2532    eta_linear_term += conv_eta_dot(
2533        third_coefficients_stu,
2534        first_coefficients_r,
2535        &mut buf_a,
2536        &mut buf_b,
2537    );
2538    eta_linear_term += conv_eta_dot(
2539        second_coefficients_rs,
2540        second_coefficients_tu,
2541        &mut buf_a,
2542        &mut buf_b,
2543    );
2544    eta_linear_term += conv_eta_dot(
2545        second_coefficients_rt,
2546        second_coefficients_su,
2547        &mut buf_a,
2548        &mut buf_b,
2549    );
2550    eta_linear_term += conv_eta_dot(
2551        second_coefficients_ru,
2552        second_coefficients_st,
2553        &mut buf_a,
2554        &mut buf_b,
2555    );
2556
2557    let mut eta_sq_minus_one = [0.0_f64; 7];
2558    for (i, &eta_i) in eta.iter().enumerate() {
2559        for (j, &eta_j) in eta.iter().enumerate() {
2560            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2561        }
2562    }
2563    eta_sq_minus_one[0] -= 1.0;
2564
2565    // quad_coeff_term: six (eta²−1)⊗A⊗B⊗C · moments sums, where the (A,B,C)
2566    // factors are: (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s).
2567    let mut buf_c = [0.0_f64; SCRATCH];
2568    let conv_weighted_triple_dot = |weight: &[f64],
2569                                    a: &[f64],
2570                                    b: &[f64],
2571                                    c: &[f64],
2572                                    buf_a: &mut [f64; SCRATCH],
2573                                    buf_b: &mut [f64; SCRATCH],
2574                                    buf_c: &mut [f64; SCRATCH]|
2575     -> f64 {
2576        let ab_len = poly_conv_into(a, b, buf_a);
2577        let abc_len = poly_conv_into(&buf_a[..ab_len], c, buf_b);
2578        let final_len = poly_conv_into(weight, &buf_b[..abc_len], buf_c);
2579        let mut acc = 0.0;
2580        for k in 0..final_len {
2581            acc = buf_c[k].mul_add(moments[k], acc);
2582        }
2583        acc
2584    };
2585    let mut quad_coeff_term = 0.0;
2586    quad_coeff_term += conv_weighted_triple_dot(
2587        &eta_sq_minus_one,
2588        second_coefficients_rs,
2589        first_coefficients_t,
2590        first_coefficients_u,
2591        &mut buf_a,
2592        &mut buf_b,
2593        &mut buf_c,
2594    );
2595    quad_coeff_term += conv_weighted_triple_dot(
2596        &eta_sq_minus_one,
2597        second_coefficients_rt,
2598        first_coefficients_s,
2599        first_coefficients_u,
2600        &mut buf_a,
2601        &mut buf_b,
2602        &mut buf_c,
2603    );
2604    quad_coeff_term += conv_weighted_triple_dot(
2605        &eta_sq_minus_one,
2606        second_coefficients_ru,
2607        first_coefficients_s,
2608        first_coefficients_t,
2609        &mut buf_a,
2610        &mut buf_b,
2611        &mut buf_c,
2612    );
2613    quad_coeff_term += conv_weighted_triple_dot(
2614        &eta_sq_minus_one,
2615        second_coefficients_st,
2616        first_coefficients_r,
2617        first_coefficients_u,
2618        &mut buf_a,
2619        &mut buf_b,
2620        &mut buf_c,
2621    );
2622    quad_coeff_term += conv_weighted_triple_dot(
2623        &eta_sq_minus_one,
2624        second_coefficients_su,
2625        first_coefficients_r,
2626        first_coefficients_t,
2627        &mut buf_a,
2628        &mut buf_b,
2629        &mut buf_c,
2630    );
2631    quad_coeff_term += conv_weighted_triple_dot(
2632        &eta_sq_minus_one,
2633        second_coefficients_tu,
2634        first_coefficients_r,
2635        first_coefficients_s,
2636        &mut buf_a,
2637        &mut buf_b,
2638        &mut buf_c,
2639    );
2640
2641    // cubic_weight = 3·eta − eta³ (same as the prior expansion: eta_sq*eta
2642    // negated, plus the 3·eta linear correction).
2643    let mut eta_sq = [0.0_f64; 7];
2644    for (i, &eta_i) in eta.iter().enumerate() {
2645        for (j, &eta_j) in eta.iter().enumerate() {
2646            eta_sq[i + j] = eta_i.mul_add(eta_j, eta_sq[i + j]);
2647        }
2648    }
2649    let mut cubic_weight = [0.0_f64; 10];
2650    for (i, &eta_sq_i) in eta_sq.iter().enumerate() {
2651        for (j, &eta_j) in eta.iter().enumerate() {
2652            cubic_weight[i + j] = (-eta_sq_i).mul_add(eta_j, cubic_weight[i + j]);
2653        }
2654    }
2655    for (idx, &eta_coeff) in eta.iter().enumerate() {
2656        cubic_weight[idx] += 3.0 * eta_coeff;
2657    }
2658
2659    // quartic_coeff_term: cubic_weight ⊗ r ⊗ s ⊗ t ⊗ u · moments. The
2660    // original quintuple loop did 10·4·4·4·4 = 2560 mul-adds per call;
2661    // four sequential convolutions plus one moment dot drop this to
2662    // ~16+28+40+52+16 ≈ 152 mul-adds.
2663    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2664    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2665    let rstu_len = poly_conv_into(&buf_b[..rst_len], first_coefficients_u, &mut buf_a);
2666    let final_len = poly_conv_into(&cubic_weight, &buf_a[..rstu_len], &mut buf_b);
2667    let mut quartic_coeff_term = 0.0;
2668    for k in 0..final_len {
2669        quartic_coeff_term = buf_b[k].mul_add(moments[k], quartic_coeff_term);
2670    }
2671
2672    Ok((fourth_term - eta_linear_term + quad_coeff_term + quartic_coeff_term) * INV_TWO_PI)
2673}
2674
2675#[inline]
2676pub fn global_cubic_from_local(span: LocalSpanCubic) -> (f64, f64, f64, f64) {
2677    let left = span.left;
2678    let q0 = span.c0 - span.c1 * left + span.c2 * left * left - span.c3 * left * left * left;
2679    let q1 = span.c1 - 2.0 * span.c2 * left + 3.0 * span.c3 * left * left;
2680    let q2 = span.c2 - 3.0 * span.c3 * left;
2681    let q3 = span.c3;
2682    (q0, q1, q2, q3)
2683}
2684
2685/// Return the cubic polynomial coefficients (in `z`) of
2686/// `f(z) = link_span.evaluate(a + b*z)`.
2687///
2688/// `link_span.evaluate` is a cubic in its argument, so `f(z)` is also a cubic
2689/// in `z` and can be written exactly as
2690///
2691/// ```text
2692///     f(z) = d0 + d1·z + d2·z² + d3·z³
2693/// ```
2694///
2695/// where `(d0, d1, d2, d3)` are the values returned by this function. These
2696/// are **polynomial coefficients**, *not* derivatives of `f` at `z = 0`. The
2697/// relationship to Taylor derivatives is
2698///
2699/// ```text
2700///     d_k = f^(k)(0) / k!
2701/// ```
2702///
2703/// so `d0 = f(0)`, `d1 = f'(0)`, `d2 = ½·f''(0)`, `d3 = ⅙·f'''(0)`. Callers
2704/// such as [`denested_cell_coefficients`] and [`link_basis_cell_coefficients`]
2705/// rely on the polynomial-coefficient convention, since they propagate the
2706/// values directly as the `(c0, c1, c2, c3)` slots of a downstream polynomial
2707/// in `z`.
2708#[inline]
2709pub fn transformed_link_cubic(link_span: LocalSpanCubic, a: f64, b: f64) -> (f64, f64, f64, f64) {
2710    let shift = a - link_span.left;
2711    let d0 = link_span.c0
2712        + link_span.c1 * shift
2713        + link_span.c2 * shift * shift
2714        + link_span.c3 * shift * shift * shift;
2715    let d1 = b * (link_span.c1 + 2.0 * link_span.c2 * shift + 3.0 * link_span.c3 * shift * shift);
2716    let d2 = b * b * (link_span.c2 + 3.0 * link_span.c3 * shift);
2717    let d3 = link_span.c3 * b * b * b;
2718    (d0, d1, d2, d3)
2719}
2720
2721#[inline]
2722pub fn denested_cell_coefficients(
2723    score_span: LocalSpanCubic,
2724    link_span: LocalSpanCubic,
2725    a: f64,
2726    b: f64,
2727) -> [f64; 4] {
2728    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2729    let (d0, d1, d2, d3) = transformed_link_cubic(link_span, a, b);
2730    [a + b * h0 + d0, b + b * h1 + d1, b * h2 + d2, b * h3 + d3]
2731}
2732
2733#[inline]
2734pub fn denested_cell_coefficient_partials(
2735    score_span: LocalSpanCubic,
2736    link_span: LocalSpanCubic,
2737    a: f64,
2738    b: f64,
2739) -> ([f64; 4], [f64; 4]) {
2740    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2741    let shift = a - link_span.left;
2742    let alpha1 = link_span.c1;
2743    let alpha2 = link_span.c2;
2744    let alpha3 = link_span.c3;
2745    let dc_da = [
2746        1.0 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2747        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2748        3.0 * alpha3 * b * b,
2749        0.0,
2750    ];
2751    let dc_db = [
2752        h0,
2753        1.0 + h1 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2754        h2 + 2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2755        h3 + 3.0 * alpha3 * b * b,
2756    ];
2757    (dc_da, dc_db)
2758}
2759
2760#[inline]
2761fn link_cubic_second_partials(
2762    link_span: LocalSpanCubic,
2763    a: f64,
2764    b: f64,
2765) -> ([f64; 4], [f64; 4], [f64; 4]) {
2766    let shift = a - link_span.left;
2767    let alpha2 = link_span.c2;
2768    let alpha3 = link_span.c3;
2769    let dc_daa = [
2770        2.0 * alpha2 + 6.0 * alpha3 * shift,
2771        6.0 * alpha3 * b,
2772        0.0,
2773        0.0,
2774    ];
2775    let dc_dab = [
2776        0.0,
2777        2.0 * alpha2 + 6.0 * alpha3 * shift,
2778        6.0 * alpha3 * b,
2779        0.0,
2780    ];
2781    let dc_dbb = [
2782        0.0,
2783        0.0,
2784        2.0 * (alpha2 + 3.0 * alpha3 * shift),
2785        6.0 * alpha3 * b,
2786    ];
2787    (dc_daa, dc_dab, dc_dbb)
2788}
2789
2790#[inline]
2791pub fn denested_cell_second_partials(
2792    score_span: LocalSpanCubic,
2793    link_span: LocalSpanCubic,
2794    a: f64,
2795    b: f64,
2796) -> ([f64; 4], [f64; 4], [f64; 4]) {
2797    let score_left = score_span.left;
2798    if !score_left.is_finite() {
2799        return ([f64::NAN; 4], [f64::NAN; 4], [f64::NAN; 4]);
2800    }
2801    link_cubic_second_partials(link_span, a, b)
2802}
2803
2804#[inline]
2805fn link_cubic_third_partials(
2806    link_span: LocalSpanCubic,
2807) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2808    let alpha3 = link_span.c3;
2809    (
2810        [6.0 * alpha3, 0.0, 0.0, 0.0],
2811        [0.0, 6.0 * alpha3, 0.0, 0.0],
2812        [0.0, 0.0, 6.0 * alpha3, 0.0],
2813        [0.0, 0.0, 0.0, 6.0 * alpha3],
2814    )
2815}
2816
/// Third partial derivatives of the de-nested cell coefficients with respect
/// to `(a, b)`.
///
/// Pure delegation to [`link_cubic_third_partials`]: no score span is taken,
/// so the result depends on `link_span` alone.
#[inline]
pub fn denested_cell_third_partials(
    link_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_span)
}
2823
2824#[inline]
2825pub fn score_basis_cell_coefficients(score_basis_span: LocalSpanCubic, b: f64) -> [f64; 4] {
2826    let (h0, h1, h2, h3) = global_cubic_from_local(score_basis_span);
2827    [b * h0, b * h1, b * h2, b * h3]
2828}
2829
2830#[inline]
2831pub fn link_basis_cell_coefficients(link_basis_span: LocalSpanCubic, a: f64, b: f64) -> [f64; 4] {
2832    let (d0, d1, d2, d3) = transformed_link_cubic(link_basis_span, a, b);
2833    [d0, d1, d2, d3]
2834}
2835
2836#[inline]
2837pub fn link_basis_cell_coefficient_partials(
2838    link_basis_span: LocalSpanCubic,
2839    a: f64,
2840    b: f64,
2841) -> ([f64; 4], [f64; 4]) {
2842    let shift = a - link_basis_span.left;
2843    let alpha1 = link_basis_span.c1;
2844    let alpha2 = link_basis_span.c2;
2845    let alpha3 = link_basis_span.c3;
2846    let dc_da = [
2847        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2848        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2849        3.0 * alpha3 * b * b,
2850        0.0,
2851    ];
2852    let dc_db = [
2853        0.0,
2854        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2855        2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2856        3.0 * alpha3 * b * b,
2857    ];
2858    (dc_da, dc_db)
2859}
2860
/// Second partial derivatives `(dc_daa, dc_dab, dc_dbb)` of the
/// [`link_basis_cell_coefficients`] vector with respect to `(a, b)`.
///
/// Pure delegation to [`link_cubic_second_partials`].
#[inline]
pub fn link_basis_cell_second_partials(
    link_basis_span: LocalSpanCubic,
    a: f64,
    b: f64,
) -> ([f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_second_partials(link_basis_span, a, b)
}
2869
/// Third partial derivatives of the [`link_basis_cell_coefficients`] vector
/// with respect to `(a, b)`.
///
/// Pure delegation to [`link_cubic_third_partials`]; the result depends only
/// on the span's `c3`.
#[inline]
pub fn link_basis_cell_third_partials(
    link_basis_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_basis_span)
}
2876
/// Build the de-nested partition cells for the affine map `z ↦ a + b·z`.
///
/// This entry point forwards every argument unchanged to
/// [`build_denested_partition_cells_with_tails`] and returns its result, so
/// the two functions behave identically.
///
/// # Errors
///
/// Propagates whatever error string the forwarded call produces.
pub fn build_denested_partition_cells<FS, FL>(
    a: f64,
    b: f64,
    score_breaks: &[f64],
    link_breaks: &[f64],
    score_span_at: FS,
    link_span_at: FL,
) -> Result<Vec<DenestedPartitionCell>, String>
where
    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
{
    build_denested_partition_cells_with_tails(
        a,
        b,
        score_breaks,
        link_breaks,
        score_span_at,
        link_span_at,
    )
}
2898
2899/// Build a partition covering `(-∞, +∞)` with parameter-independent outer
2900/// bounds.  Interior cells use the same finite-cell polynomial algebra.
2901/// The two tail cells are guaranteed affine (c2=c3=0) because both
2902/// deviations saturate to constants outside their knot support.
2903///
2904/// The tail cells' score/link spans come from the same closures evaluated
2905/// at a representative point in the tail region — the closures must return
2906/// constant (c1=c2=c3=0) cubics for points outside support.
2907pub fn build_denested_partition_cells_with_tails<FS, FL>(
2908    a: f64,
2909    b: f64,
2910    score_breaks: &[f64],
2911    link_breaks: &[f64],
2912    mut score_span_at: FS,
2913    mut link_span_at: FL,
2914) -> Result<Vec<DenestedPartitionCell>, String>
2915where
2916    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
2917    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
2918{
2919    // Collect all INTERNAL split points (finite), each tagged with its
2920    // provenance: a fixed score break or a link-knot crossing. Provenance
2921    // identifies the cell's `(a, b)` family for the Chebyshev moment-family
2922    // layer; the z coordinates alone cannot distinguish the two kinds.
2923    let mut split_points: Vec<(f64, PartitionEdge)> = score_breaks
2924        .iter()
2925        .map(|&sigma| (sigma, PartitionEdge::Fixed(sigma)))
2926        .collect();
2927    if b.abs() > 1e-12 {
2928        for &tau in link_breaks {
2929            let z = (tau - a) / b;
2930            if z.is_finite() {
2931                split_points.push((z, PartitionEdge::Crossing { tau }));
2932            }
2933        }
2934    }
2935    dedup_sorted_tagged_breakpoints(&mut split_points);
2936
2937    let mut out = Vec::new();
2938
2939    if split_points.is_empty() {
2940        let score_span = score_span_at(0.0)?;
2941        let link_span = link_span_at(a)?;
2942        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2943        return Ok(vec![DenestedPartitionCell {
2944            cell: DenestedCubicCell {
2945                left: f64::NEG_INFINITY,
2946                right: f64::INFINITY,
2947                c0: coeffs[0],
2948                c1: coeffs[1],
2949                c2: 0.0,
2950                c3: 0.0,
2951            },
2952            score_span,
2953            link_span,
2954            left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2955            right_edge: PartitionEdge::Fixed(f64::INFINITY),
2956        }]);
2957    }
2958
2959    // ── Left tail cell: (-∞, leftmost_split] ──
2960    let (leftmost, leftmost_edge) = split_points[0];
2961    // Evaluate spans at a point just left of the leftmost split.  The
2962    // closures return constant tail cubics for this region.
2963    let left_probe = interval_probe_point(f64::NEG_INFINITY, leftmost)?;
2964    let left_score_span = score_span_at(left_probe)?;
2965    let left_link_span = link_span_at(a + b * left_probe)?;
2966    let left_coeffs = denested_cell_coefficients(left_score_span, left_link_span, a, b);
2967    if left_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2968        || left_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2969    {
2970        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2971            "left tail cell must be affine (deviations constant outside support), \
2972             got c2={:.3e}, c3={:.3e}",
2973            left_coeffs[2], left_coeffs[3]
2974        ))
2975        .into());
2976    }
2977    out.push(DenestedPartitionCell {
2978        cell: DenestedCubicCell {
2979            left: f64::NEG_INFINITY,
2980            right: leftmost,
2981            c0: left_coeffs[0],
2982            c1: left_coeffs[1],
2983            c2: 0.0,
2984            c3: 0.0,
2985        },
2986        score_span: left_score_span,
2987        link_span: left_link_span,
2988        left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2989        right_edge: leftmost_edge,
2990    });
2991
2992    // ── Interior cells (all finite) ──
2993    for window in split_points.windows(2) {
2994        let (left, left_edge) = window[0];
2995        let (right, right_edge) = window[1];
2996        if !left.is_finite() || !right.is_finite() || right - left <= 1e-12 {
2997            continue;
2998        }
2999        let mid = interval_probe_point(left, right)?;
3000        let score_span = score_span_at(mid)?;
3001        let link_span = link_span_at(a + b * mid)?;
3002        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
3003        out.push(DenestedPartitionCell {
3004            cell: DenestedCubicCell {
3005                left,
3006                right,
3007                c0: coeffs[0],
3008                c1: coeffs[1],
3009                c2: coeffs[2],
3010                c3: coeffs[3],
3011            },
3012            score_span,
3013            link_span,
3014            left_edge,
3015            right_edge,
3016        });
3017    }
3018
3019    // ── Right tail cell: [rightmost_split, +∞) ──
3020    let (rightmost, rightmost_edge) = *split_points.last().unwrap();
3021    let right_probe = interval_probe_point(rightmost, f64::INFINITY)?;
3022    let right_score_span = score_span_at(right_probe)?;
3023    let right_link_span = link_span_at(a + b * right_probe)?;
3024    let right_coeffs = denested_cell_coefficients(right_score_span, right_link_span, a, b);
3025    if right_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
3026        || right_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
3027    {
3028        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3029            "right tail cell must be affine (deviations constant outside support), \
3030             got c2={:.3e}, c3={:.3e}",
3031            right_coeffs[2], right_coeffs[3]
3032        ))
3033        .into());
3034    }
3035    out.push(DenestedPartitionCell {
3036        cell: DenestedCubicCell {
3037            left: rightmost,
3038            right: f64::INFINITY,
3039            c0: right_coeffs[0],
3040            c1: right_coeffs[1],
3041            c2: 0.0,
3042            c3: 0.0,
3043        },
3044        score_span: right_score_span,
3045        link_span: right_link_span,
3046        left_edge: rightmost_edge,
3047        right_edge: PartitionEdge::Fixed(f64::INFINITY),
3048    });
3049
3050    Ok(out)
3051}
3052
3053#[inline]
3054pub fn normalized_non_affine_coefficients(
3055    left: f64,
3056    right: f64,
3057    c0: f64,
3058    c1: f64,
3059    c2: f64,
3060    c3: f64,
3061) -> Result<(f64, f64), String> {
3062    let width = right - left;
3063    if !width.is_finite() || width <= 0.0 {
3064        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3065            "normalized cubic coefficients require a positive finite cell width, got left={left}, right={right}"
3066        ))
3067        .into());
3068    }
3069    let anchor_scale = c0.abs() + c1.abs();
3070    if !anchor_scale.is_finite() {
3071        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3072            "normalized cubic coefficients require finite affine coefficients, got c0={c0}, c1={c1}"
3073        ))
3074        .into());
3075    }
3076    let mid = 0.5 * (left + right);
3077    let half = 0.5 * width;
3078    let k2 = half * half * (c2 + 3.0 * c3 * mid);
3079    let k3 = c3 * half * half * half;
3080    Ok((k2, k3))
3081}
3082
3083#[inline]
3084pub fn branch_cell(cell: DenestedCubicCell) -> Result<ExactCellBranch, String> {
3085    let tol = effective_branch_tol(cell);
3086    if !cell.left.is_finite() || !cell.right.is_finite() {
3087        if cell.c2.abs() <= tol && cell.c3.abs() <= tol {
3088            return Ok(ExactCellBranch::Affine);
3089        }
3090        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3091            "non-affine cells require finite bounds, got [{}, {}] with c2={:.6e}, c3={:.6e}",
3092            cell.left, cell.right, cell.c2, cell.c3
3093        ))
3094        .into());
3095    }
3096    let (k2, k3) = normalized_non_affine_coefficients(
3097        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3098    )?;
3099    if k2.abs() <= tol && k3.abs() <= tol {
3100        Ok(ExactCellBranch::Affine)
3101    } else if k3.abs() <= tol {
3102        Ok(ExactCellBranch::Quartic)
3103    } else {
3104        Ok(ExactCellBranch::Sextic)
3105    }
3106}
3107
3108#[inline]
3109fn degenerate_sextic_branch(
3110    cell: DenestedCubicCell,
3111    lead: f64,
3112) -> Result<Option<ExactCellBranch>, String> {
3113    // The sextic recurrence divides by `lead = 3*c3^2`. When that division is
3114    // unstable, lower the polynomial degree without discarding a material
3115    // quadratic coefficient.
3116    let (normalized_k2, normalized_k3) = normalized_non_affine_coefficients(
3117        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3118    )?;
3119    if normalized_k3.abs() > NORMALIZED_CELL_BRANCH_TOL && lead.abs() > 1e-18 {
3120        return Ok(None);
3121    }
3122    if normalized_k2.abs() > NORMALIZED_CELL_BRANCH_TOL {
3123        Ok(Some(ExactCellBranch::Quartic))
3124    } else {
3125        Ok(Some(ExactCellBranch::Affine))
3126    }
3127}
3128
3129#[inline]
3130fn validate_bvn_args(h: f64, k: f64, rho: f64) -> Result<(), String> {
3131    if !h.is_finite() && !h.is_infinite() {
3132        return Err(CubicCellKernelError::bivariate_normal_domain(
3133            "bivariate normal cdf requires finite or infinite h",
3134        )
3135        .into());
3136    }
3137    if !k.is_finite() && !k.is_infinite() {
3138        return Err(CubicCellKernelError::bivariate_normal_domain(
3139            "bivariate normal cdf requires finite or infinite k",
3140        )
3141        .into());
3142    }
3143    if !rho.is_finite() {
3144        return Err(CubicCellKernelError::bivariate_normal_domain(format!(
3145            "bivariate normal cdf requires finite correlation, got {rho}"
3146        ))
3147        .into());
3148    }
3149    Ok::<(), _>(())
3150}
3151
3152#[inline]
3153fn bvn_gl_sum(h: f64, k: f64, rho_clamped: f64, asr: f64) -> f64 {
3154    // The Drezner-Wesolowsky arcsin representation is integrated with the
3155    // same 20-point Gauss-Legendre rule as before, but mirrored node pairs are
3156    // evaluated with one sin_cos for the half-angle offset rather than two
3157    // independent sin calls.  This preserves the quadrature rule (and hence
3158    // the accuracy envelope) while reducing the transcendental work in the
3159    // dominant finite-bound path from 20 sin calls to 11 sin/cos evaluations.
3160    if rho_clamped == 0.0 {
3161        return 0.0;
3162    }
3163    let hs = 0.5 * (h * h + k * k);
3164    let hk = h * k;
3165    let half_asr = 0.5 * asr;
3166    let (sin_mid, cos_mid) = half_asr.sin_cos();
3167    let mut sum = 0.0;
3168    for i in 0..10 {
3169        let node = GL20_NODES[i].abs();
3170        let weight = GL20_WEIGHTS[i];
3171        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3172
3173        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3174        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3175        let expo_lo = ((sn_lo * hk) - hs) / one_minus_lo;
3176
3177        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3178        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3179        let expo_hi = ((sn_hi * hk) - hs) / one_minus_hi;
3180
3181        sum += weight * (expo_lo.exp() + expo_hi.exp());
3182    }
3183    sum
3184}
3185
3186pub fn bivariate_normal_cdf(h: f64, k: f64, rho: f64) -> Result<f64, String> {
3187    validate_bvn_args(h, k, rho)?;
3188    if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
3189        return Ok(0.0);
3190    }
3191    if h == f64::INFINITY {
3192        return Ok(normal_cdf(k));
3193    }
3194    if k == f64::INFINITY {
3195        return Ok(normal_cdf(h));
3196    }
3197
3198    let rho_clamped = rho.clamp(-1.0, 1.0);
3199    if rho_clamped >= 1.0 - 1e-12 {
3200        return Ok(normal_cdf(h.min(k)));
3201    }
3202    if rho_clamped <= -1.0 + 1e-12 {
3203        return Ok((normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0));
3204    }
3205    if rho_clamped == 0.0 {
3206        return Ok((normal_cdf(h) * normal_cdf(k)).clamp(0.0, 1.0));
3207    }
3208    if h == 0.0 && k == 0.0 {
3209        return Ok((0.25 + rho_clamped.asin() / std::f64::consts::TAU).clamp(0.0, 1.0));
3210    }
3211
3212    let asr = rho_clamped.asin();
3213    let sum = bvn_gl_sum(h, k, rho_clamped, asr);
3214    Ok((normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3215}
3216
3217#[inline]
3218fn bvn_gl_sum_interval(h: f64, left: f64, right: f64, rho_clamped: f64, asr: f64) -> f64 {
3219    if rho_clamped == 0.0 {
3220        return 0.0;
3221    }
3222    let h2 = h * h;
3223    let right_hs = 0.5 * (h2 + right * right);
3224    let left_hs = 0.5 * (h2 + left * left);
3225    let half_asr = 0.5 * asr;
3226    let (sin_mid, cos_mid) = half_asr.sin_cos();
3227    let mut sum = 0.0;
3228    for i in 0..10 {
3229        let node = GL20_NODES[i].abs();
3230        let weight = GL20_WEIGHTS[i];
3231        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3232
3233        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3234        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3235        let lo_right = (((sn_lo * h * right) - right_hs) / one_minus_lo).exp();
3236        let lo_left = (((sn_lo * h * left) - left_hs) / one_minus_lo).exp();
3237
3238        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3239        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3240        let hi_right = (((sn_hi * h * right) - right_hs) / one_minus_hi).exp();
3241        let hi_left = (((sn_hi * h * left) - left_hs) / one_minus_hi).exp();
3242
3243        sum += weight * ((lo_right - lo_left) + (hi_right - hi_left));
3244    }
3245    sum
3246}
3247
3248fn bivariate_normal_cdf_interval(h: f64, left: f64, right: f64, rho: f64) -> Result<f64, String> {
3249    if right <= left {
3250        return Ok(0.0);
3251    }
3252    if left == f64::NEG_INFINITY && right == f64::INFINITY {
3253        return Ok(normal_cdf(h));
3254    }
3255    if !left.is_finite() || !right.is_finite() {
3256        let upper = bivariate_normal_cdf(h, right, rho)?;
3257        let lower = bivariate_normal_cdf(h, left, rho)?;
3258        return Ok((upper - lower).clamp(0.0, 1.0));
3259    }
3260    validate_bvn_args(h, left, rho)?;
3261    validate_bvn_args(h, right, rho)?;
3262    if h == f64::NEG_INFINITY {
3263        return Ok(0.0);
3264    }
3265    if h == f64::INFINITY {
3266        return Ok((normal_cdf(right) - normal_cdf(left)).clamp(0.0, 1.0));
3267    }
3268
3269    let rho_clamped = rho.clamp(-1.0, 1.0);
3270    if rho_clamped >= 1.0 - 1e-12 || rho_clamped <= -1.0 + 1e-12 {
3271        let upper = bivariate_normal_cdf(h, right, rho_clamped)?;
3272        let lower = bivariate_normal_cdf(h, left, rho_clamped)?;
3273        return Ok((upper - lower).clamp(0.0, 1.0));
3274    }
3275
3276    let cdf_h = normal_cdf(h);
3277    let normal_part = cdf_h * (normal_cdf(right) - normal_cdf(left));
3278    if rho_clamped == 0.0 {
3279        return Ok(normal_part.clamp(0.0, 1.0));
3280    }
3281    let asr = rho_clamped.asin();
3282    let sum = bvn_gl_sum_interval(h, left, right, rho_clamped, asr);
3283    Ok((normal_part + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3284}
3285
/// Unnormalized standard-normal kernel `exp(−x²/2)`, with `±∞` mapped
/// explicitly to `0.0`.
fn exp_neg_half_square(x: f64) -> f64 {
    if !x.is_infinite() {
        return (-0.5 * x * x).exp();
    }
    0.0
}
3293
3294/// Zeroth truncated standard-normal moment `T_0(a, b) = ∫_a^b e^(−z²/2) dz
3295/// = √(2π)·(Φ(b) − Φ(a))`, evaluated without catastrophic cancellation in
3296/// either tail.
3297///
3298/// Writing `T_0 = √(π/2)·[erf(b/√2) − erf(a/√2)]`, the naive form collapses
3299/// to `0.0` whenever both endpoints lie in the *same* far tail: `erf`
3300/// saturates at the IEEE-754 values `±1.0` for `|x| ≳ 8.3·√2`, so the
3301/// difference of two saturated values is exactly zero even though the
3302/// integral is a strictly positive number well inside the f64 normal range
3303/// (e.g. `∫_{-12}^{-10} ≈ 1.9e-23`). The fix is to reduce the erf difference
3304/// to complementary tail probabilities — `erfc` is evaluated with a dedicated
3305/// tail series, *not* as `1 − erf` — and to pick, by the sign of the
3306/// endpoints, the algebraically-equivalent form whose terms do not cancel
3307/// against one another:
3308///
3309/// ```text
3310/// both ≥ 0 (upper tail):  erf(b/√2) − erf(a/√2) = erfc(a/√2) − erfc(b/√2)
3311/// both ≤ 0 (lower tail):  erf(b/√2) − erf(a/√2) = erfc(−b/√2) − erfc(−a/√2)
3312/// straddling zero:        erf(b/√2) − erf(a/√2)
3313///                        = erf(b/√2) + erf(−a/√2)       near the anchor
3314///                        = 2 − erfc(b/√2) − erfc(−a/√2) otherwise
3315/// ```
3316///
3317/// In each branch every `erfc` argument is `≥ 0`, so the terms are small
3318/// positive tail values, while narrow straddling intervals add two
3319/// non-negative `erf` masses measured outward from the anchor. That avoids
3320/// the `2 − erfc(b/√2) − erfc(−a/√2)` cancellation when both erfc terms round
3321/// to `1.0`, but keeps the erfc-tail form for ordinary/full-line straddling
3322/// intervals. No large quantities cancel and full f64 precision survives down
3323/// to the underflow boundary in either tail and around the affine anchor.
3324///
3325/// Uses `libm::erfc` (msun double-precision implementation, ≤ 1 ulp) rather
3326/// than `statrs::function::erf::erfc` (a 6-term rational approximation that
3327/// carries ~3·10⁻¹¹ relative error around `|x| ≈ 1/√2` — see the existing
3328/// `libm::erfc` consumer at `inference::polya_gamma_core::normal_cdf`). That
3329/// statrs error propagates directly into `T_0`, then through every higher
3330/// moment `T_n` (the recurrence `T_n = a^{n-1}e^{-a²/2} − b^{n-1}e^{-b²/2}
3331/// + (n-1)·T_{n-2}` walks `T_0` up two steps at a time), then through every
3332/// affine-cell moment via `affine_anchor_moment_vector` (whose `out[n]` is a
3333/// linear combination of `T_0..=T_n`), and is the dominant source of error
3334/// in the affine-cell branch of the cubic-cell substrate (CPU/GPU parity
3335/// reference for transformation-normal, bernoulli-marginal-slope, and the
3336/// BMS flex-row higher-derivative reuse path).
3337fn truncated_gaussian_zeroth_moment(a: f64, b: f64) -> f64 {
3338    let inv_sqrt2 = 1.0 / std::f64::consts::SQRT_2;
3339    let za = a * inv_sqrt2;
3340    let zb = b * inv_sqrt2;
3341    let erf_diff = if za >= 0.0 {
3342        libm::erfc(za) - libm::erfc(zb)
3343    } else if zb <= 0.0 {
3344        libm::erfc(-zb) - libm::erfc(-za)
3345    } else if zb <= 0.5 && -za <= 0.5 {
3346        // Near the affine anchor, erfc(zb) and erfc(-za) are both close to
3347        // one; subtracting them from 2.0 can round a tiny but representable
3348        // cell mass to zero. The equivalent erf sum adds small positive
3349        // quantities directly.
3350        libm::erf(zb) + libm::erf(-za)
3351    } else {
3352        2.0 - libm::erfc(zb) - libm::erfc(-za)
3353    };
3354    // √(2π)·½ = √(π/2).
3355    (std::f64::consts::PI / 2.0).sqrt() * erf_diff
3356}
3357
3358/// Fill `out[0..=max_degree]` with the raw truncated standard-normal moments
3359///
3360/// ```text
3361/// T_n(a, b) = ∫_a^b z^n exp(-z²/2) dz
3362/// ```
3363///
3364/// using the integration-by-parts recurrence
3365///
3366/// ```text
3367/// T_0(a, b) = √(2π) (Φ(b) − Φ(a))
3368/// T_1(a, b) = exp(−a²/2) − exp(−b²/2)
3369/// T_n(a, b) = a^(n−1) e^{−a²/2} − b^(n−1) e^{−b²/2} + (n−1) T_{n−2}(a, b)
3370/// ```
3371///
3372/// Computed in one forward sweep so each call evaluates `erf` and
3373/// `exp(−x²/2)` exactly twice (once at `a`, once at `b`) regardless of the
3374/// requested degree. The naive form — calling `T_n` recursively for each
3375/// `n = 0..=max_degree` — re-evaluated `erf`/`exp` about `max_degree²/4`
3376/// times per affine cell, which dominated the wall time of the
3377/// transformation-normal and bernoulli-marginal-slope inner solves with
3378/// `max_degree = 64` (the transport order's required degree budget).
3379fn fill_truncated_gaussian_moments(a: f64, b: f64, out: &mut [f64]) {
3380    if out.is_empty() {
3381        return;
3382    }
3383    out[0] = truncated_gaussian_zeroth_moment(a, b);
3384    if out.len() == 1 {
3385        return;
3386    }
3387    let ea = exp_neg_half_square(a);
3388    let eb = exp_neg_half_square(b);
3389    out[1] = ea - eb;
3390    if out.len() == 2 {
3391        return;
3392    }
3393    let a_finite = a.is_finite();
3394    let b_finite = b.is_finite();
3395    // For n in 2..=max_degree we need a^{n-1} e^{-a²/2} (resp. b). Carry the
3396    // running powers a^{n-1}, b^{n-1} forward by a single multiply per step.
3397    // Infinite endpoints contribute 0 (the integrand decays at the rate of
3398    // exp(−x²/2)), matching the prior `is_infinite` branch in the recursive
3399    // implementation; we still update the running power so the iteration
3400    // stays branchless when both endpoints are finite.
3401    let mut a_pow_n_minus_1 = a; // a^1, used at n = 2
3402    let mut b_pow_n_minus_1 = b;
3403    for n in 2..out.len() {
3404        let left = if a_finite { a_pow_n_minus_1 * ea } else { 0.0 };
3405        let right = if b_finite { b_pow_n_minus_1 * eb } else { 0.0 };
3406        out[n] = left - right + (n as f64 - 1.0) * out[n - 2];
3407        a_pow_n_minus_1 *= a;
3408        b_pow_n_minus_1 *= b;
3409    }
3410}
3411
/// Stack-array bound for `affine_anchor_moment_vector_into`. Public callers
/// use up to ~24 (largest is the bernoulli-margslope outer-step degree-21
/// reduction); 64 leaves comfortable headroom without growing the per-call
/// stack footprint meaningfully.
///
/// The scratch tables in `affine_anchor_moment_vector_into` are sized
/// `MAX_AFFINE_ANCHOR_DEGREE + 1`, and that function asserts
/// `max_degree <= MAX_AFFINE_ANCHOR_DEGREE`, so requesting a higher degree
/// panics.
const MAX_AFFINE_ANCHOR_DEGREE: usize = 64;
3417
/// Allocating convenience wrapper around `affine_anchor_moment_vector_into`.
///
/// Creates a zero-initialised vector of length `max_degree + 1`, lets
/// `affine_anchor_moment_vector_into` fill it for the affine coefficients
/// `(alpha, beta)` on the interval `[left, right]`, and returns it. Entry `n`
/// is the degree-`n` moment, a linear combination of the truncated Gaussian
/// moments `T_0..=T_n`.
///
/// # Panics
///
/// Panics when `max_degree` exceeds `MAX_AFFINE_ANCHOR_DEGREE` (asserted by
/// `affine_anchor_moment_vector_into`).
pub fn affine_anchor_moment_vector(
    alpha: f64,
    beta: f64,
    left: f64,
    right: f64,
    max_degree: usize,
) -> Vec<f64> {
    // One slot per degree 0..=max_degree.
    let mut out = vec![0.0; max_degree + 1];
    affine_anchor_moment_vector_into(alpha, beta, left, right, max_degree, &mut out);
    out
}
3429
3430fn affine_anchor_moment_vector_into(
3431    alpha: f64,
3432    beta: f64,
3433    left: f64,
3434    right: f64,
3435    max_degree: usize,
3436    out: &mut [f64],
3437) {
3438    assert_eq!(out.len(), max_degree + 1);
3439    let s = (1.0 + beta * beta).sqrt();
3440    let mu = -alpha * beta / (1.0 + beta * beta);
3441    let y_left = if left.is_infinite() {
3442        if left.is_sign_positive() {
3443            f64::INFINITY
3444        } else {
3445            f64::NEG_INFINITY
3446        }
3447    } else {
3448        s * (left - mu)
3449    };
3450    let y_right = if right.is_infinite() {
3451        if right.is_sign_positive() {
3452            f64::INFINITY
3453        } else {
3454            f64::NEG_INFINITY
3455        }
3456    } else {
3457        s * (right - mu)
3458    };
3459    let anchor = (-alpha * alpha / (2.0 * s * s)).exp() / s;
3460    assert!(
3461        max_degree <= MAX_AFFINE_ANCHOR_DEGREE,
3462        "affine_anchor_moment_vector max_degree {} exceeds compile-time bound {}",
3463        max_degree,
3464        MAX_AFFINE_ANCHOR_DEGREE
3465    );
3466    let mut t = [0.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3467    fill_truncated_gaussian_moments(y_left, y_right, &mut t[..=max_degree]);
3468    // Build mu^k and s^{-k} tables once. The inner sum is the binomial
3469    // expansion of the affine change-of-variables, and computing the
3470    // binomial coefficient via Pascal's row recurrence + carrying mu/s
3471    // powers eliminates the per-(n, k) `powi` and binomial calls that
3472    // otherwise dominated the inner loop at large `max_degree`.
3473    let mut mu_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3474    for k in 1..=max_degree {
3475        mu_pow[k] = mu_pow[k - 1] * mu;
3476    }
3477    let inv_s = 1.0 / s;
3478    let mut inv_s_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3479    for k in 1..=max_degree {
3480        inv_s_pow[k] = inv_s_pow[k - 1] * inv_s;
3481    }
3482    out.fill(0.0);
3483    for n in 0..=max_degree {
3484        let mut acc = 0.0;
3485        // C(n, k+1) = C(n, k) · (n − k) / (k + 1).
3486        let mut binom = 1.0;
3487        for k in 0..=n {
3488            let term = binom * mu_pow[n - k] * inv_s_pow[k];
3489            acc = term.mul_add(t[k], acc);
3490            if k < n {
3491                binom = binom * (n - k) as f64 / (k + 1) as f64;
3492            }
3493        }
3494        out[n] = anchor * acc;
3495    }
3496}
3497
3498fn affine_value_from_moment_primitive(alpha: f64, beta: f64, left: f64, right: f64) -> f64 {
3499    // Exact formula via bivariate normal CDF.
3500    //
3501    // V(α,β,l,r) = ∫_l^r Φ(α+βz)φ(z)dz
3502    //            = P(U ≤ α+βZ, l ≤ Z ≤ r)    where U,Z iid N(0,1)
3503    //            = Φ₂(h, r; ρ) − Φ₂(h, l; ρ)
3504    //
3505    // with h = α/√(1+β²) and ρ = −β/√(1+β²).
3506    //
3507    // This is exact to floating-point precision via the high-accuracy
3508    // Drezner-Wesolowsky BVN routine, replacing the previous fixed 20-point
3509    // Gauss-Legendre numerical integration of the derivative primitive.
3510    let s = (1.0 + beta * beta).sqrt();
3511    let h = alpha / s;
3512    let rho = -beta / s;
3513    bivariate_normal_cdf_interval(h, left, right, rho).unwrap_or(0.0)
3514}
3515
3516/// Evaluate an affine cell (c2=c3=0) with a value/moment-consistent primitive.
3517///
3518/// Value and moments are now generated from the same affine moment primitive.
3519/// The zero-moment derivative is exact, and `value` is reconstructed by
3520/// integrating `d value / d alpha = INV_TWO_PI * moments[0]` over `alpha`
3521/// on a transformed semi-infinite domain.
3522pub fn evaluate_affine_cell_state(
3523    cell: DenestedCubicCell,
3524    max_degree: usize,
3525) -> Result<CellMomentState, String> {
3526    let alpha = cell.c0;
3527    let beta = cell.c1;
3528    let value = affine_value_from_moment_primitive(alpha, beta, cell.left, cell.right);
3529    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3530    Ok(CellMomentState {
3531        branch: ExactCellBranch::Affine,
3532        value,
3533        moments: moments.into(),
3534    })
3535}
3536
3537fn evaluate_affine_cell_derivative_state(
3538    cell: DenestedCubicCell,
3539    max_degree: usize,
3540) -> Result<CellDerivativeMomentState, String> {
3541    let alpha = cell.c0;
3542    let beta = cell.c1;
3543    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3544    Ok(CellDerivativeMomentState {
3545        branch: ExactCellBranch::Affine,
3546        moments: moments.into(),
3547    })
3548}
3549
/// Add `mw * z^k` to `moments[k]` for every slot `k` of `moments`.
///
/// The power `z^k` is carried as a running product (one multiply per slot)
/// and each slot is updated with a fused multiply-add. The "unrolled4" in
/// the name is historical: this is a plain scalar accumulator, called once
/// per SIMD lane by the quadrature sweep.
#[inline]
fn accumulate_moments_unrolled4(moments: &mut [f64], mw: f64, z: f64) {
    // The fold state is the current power of z; its final value is unused.
    moments.iter_mut().fold(1.0_f64, |power, slot| {
        *slot = mw.mul_add(power, *slot);
        power * z
    });
}
3564
// Shared SIMD Gauss-Legendre core for non-affine cells. The const generic
// `COMPUTE_VALUE` selects whether the un-normalized cell value integral
// `∫ Φ(η(z)) · exp(-½z²) dz` is accumulated alongside the moments (any
// `1/√(2π)` normalization is left to the caller; this function only applies
// the half-width). Monomorphization collapses the const-generic branches at
// compile time, so `COMPUTE_VALUE = false` emits the moment-only path
// verbatim.
//
// Inputs: `cell` supplies the cubic `η(z) = c0 + c1·z + c2·z² + c3·z³` and the
// finite interval `[left, right]`; `gl_nodes`/`gl_weights` are a
// Gauss-Legendre rule on `[-1, 1]` (their lengths must match, otherwise the
// `assert_eq!` below panics). Returns `(moments, value)` where
// `moments[k] ≈ ∫ z^k · exp(-½(z² + η(z)²)) dz` for `k = 0..=max_degree` and
// `value` is `0.0` when `COMPUTE_VALUE` is false.
//
// Single source of truth for the moment SIMD lane ordering, the Horner-with-FMA
// pattern for η(z), the `0.5 * (z² + η²)` quadratic-form evaluation order, the
// unscaled per-node GL moment weights, the post-loop half-width fold, and the
// per-lane `accumulate_moments_unrolled4` call. The previous duplicated code paths
// drifted by 1 ULP whenever any of these details diverged; here both paths
// share the same instructions, eliminating an entire class of regressions
// where a tweak to the quadrature order or the FMA pattern would silently
// re-introduce divergence between the value- and derivative-only callers.
//
// Gauss-Legendre on [left, right] converges geometrically for the analytic
// integrand exp(-q(z)) with quartic/sextic q on a bounded cell; the prior
// adaptive transport path expanded basis_moments via the forward 3-/5-step
// recurrences in reduce_quartic/sextic_moments, which amplify roundoff by
// (1/lead)^n with lead = 2c2²/3c3² and overflow to NaN for small c2/c3 cells
// that arise naturally in production.
//
// The fixed 384-node rule that replaced the transport path is accurate but
// pays ~384 exp evaluations per cell unconditionally. Production cells are
// narrow spline-knot subdivisions where a 12- or 24-node rule is already
// converged to machine precision, and the flex marginal-slope row calculus
// evaluates O(100) such cells per row across n=10⁵–10⁶ rows per criterion
// evaluation — the fixed rule was the dominant cost of the whole fit (#979).
// `evaluate_non_affine_cell_simd` therefore walks a progressive ladder of
// rules (12, 24, 48, 96, 192, 384 nodes) and returns as soon as two
// consecutive rules agree to `NON_AFFINE_LADDER_RTOL` relative to the moment
// vector's own scale. Unlike the old fixed rule — whose error was real but
// uncertified — every accepted ladder result carries an embedded two-rule
// agreement certificate; a cell that never certifies falls through to the
// same 384-node answer the fixed rule produced.
//
// SIMD path: process 4 GL nodes per outer iteration, batching the per-node
// `exp(-q)` evaluations into a single 4-wide `wide::f64x4::exp` invocation.
// All ladder rule sizes are divisible by 4, so the scalar tail loop below is
// only reached for rules whose length is not a multiple of 4. The inner
// moment accumulation then runs scalar per lane through
// `accumulate_moments_unrolled4`, which is a plain running-power accumulator
// over the moment slots.
#[inline(always)]
fn evaluate_non_affine_cell_with_rule<const COMPUTE_VALUE: bool>(
    cell: DenestedCubicCell,
    max_degree: usize,
    gl_nodes: &[f64],
    gl_weights: &[f64],
) -> (CellMomentVec, f64) {
    let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
    let mut value_integral = 0.0_f64;
    // Affine map from the reference interval [-1, 1] onto [left, right].
    let center = 0.5 * (cell.left + cell.right);
    let half_width = 0.5 * (cell.right - cell.left);
    let c0 = cell.c0;
    let c1 = cell.c1;
    let c2 = cell.c2;
    let c3 = cell.c3;
    let moments_slice: &mut [f64] = &mut moments;
    assert_eq!(gl_nodes.len(), gl_weights.len());
    use wide::f64x4;
    let center_v = f64x4::splat(center);
    let half_width_v = f64x4::splat(half_width);
    let c0_v = f64x4::splat(c0);
    let c1_v = f64x4::splat(c1);
    let c2_v = f64x4::splat(c2);
    let c3_v = f64x4::splat(c3);
    let neg_half_v = f64x4::splat(-0.5);
    let n_total = gl_nodes.len();
    // Largest multiple of 4 not exceeding the rule length: the SIMD sweep
    // covers [0, n_simd) and the scalar tail covers [n_simd, n_total).
    let n_simd = n_total - (n_total % 4);
    let mut i = 0;
    while i < n_simd {
        let node_v = f64x4::from([
            gl_nodes[i],
            gl_nodes[i + 1],
            gl_nodes[i + 2],
            gl_nodes[i + 3],
        ]);
        let weight_v = f64x4::from([
            gl_weights[i],
            gl_weights[i + 1],
            gl_weights[i + 2],
            gl_weights[i + 3],
        ]);
        // Fused node map: z = half_width·node + center.
        let z_v = half_width_v.mul_add(node_v, center_v);
        // Horner: ((c3*z + c2)*z + c1)*z + c0
        let eta_v = c3_v
            .mul_add(z_v, c2_v)
            .mul_add(z_v, c1_v)
            .mul_add(z_v, c0_v);
        let z2_v = z_v * z_v;
        // -q(z) = -½ (z² + η(z)²); the moment weight is w · exp(-q).
        let neg_q_v = neg_half_v * (z2_v + eta_v * eta_v);
        let exp_negq_v = neg_q_v.exp();
        let moment_weight_v = weight_v * exp_negq_v;
        let z_arr = z_v.to_array();
        let mw_arr = moment_weight_v.to_array();
        if COMPUTE_VALUE {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
                // The value integrand carries Φ(η)'s erfc, whose systematic
                // per-z error is ~1e-13. To honor the cell-value accuracy
                // contract the value term must be assembled bit-for-bit like
                // the scalar reference: a non-fused node map
                // `z_ref = center + half_width·node`, the expanded
                // `η = c0 + c1·z + c2·z² + c3·z³` (NOT the SIMD Horner-FMA used
                // for the moments), the unscaled GL weight, a scalar `exp(-½z²)`,
                // and a plain `+=`. The SIMD `z_v`/`eta_v` above (fused) feed
                // ONLY the moments and are left untouched. Any single ULP slip
                // here (FMA node map, Horner η, per-term half_width, SIMD exp,
                // FMA accumulation) drifts the 384-node sum by ~1.4e-13 and
                // breaks the contract.
                let node = gl_nodes[i + lane];
                let weight = gl_weights[i + lane];
                let z_ref = center + half_width * node;
                let eta_ref = c0 + c1 * z_ref + c2 * z_ref * z_ref + c3 * z_ref * z_ref * z_ref;
                value_integral += weight * (-0.5 * z_ref * z_ref).exp() * normal_cdf(eta_ref);
            }
        } else {
            // Moment-only path: identical per-lane accumulation, no value term.
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
            }
        }
        i += 4;
    }
    // Scalar tail for the trailing `n_total % 4` nodes.
    while i < n_total {
        let node = gl_nodes[i];
        let weight = gl_weights[i];
        let z = center + half_width * node;
        let eta = c3.mul_add(z, c2).mul_add(z, c1).mul_add(z, c0);
        let q = 0.5 * (z * z + eta * eta);
        let moment_weight = weight * (-q).exp();
        accumulate_moments_unrolled4(moments_slice, moment_weight, z);
        if COMPUTE_VALUE {
            // Bit-for-bit the reference value structure (see SIMD branch): the
            // node map `z = center + half_width·node` here already matches the
            // reference (non-fused), but η must use the expanded reference form
            // rather than the moment path's Horner-FMA.
            let eta_ref = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
            value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta_ref);
        }
        i += 1;
    }
    // Apply the cell half-width to both moment and value integrals ONCE at the
    // end, mirroring the prefold reference. Folding half_width per-term changes
    // f64 rounding enough to show up at the 1e-13 contract.
    for moment in moments_slice.iter_mut() {
        *moment *= half_width;
    }
    // When COMPUTE_VALUE is false `value_integral` was never touched and is
    // returned as its initial 0.0.
    let value = if COMPUTE_VALUE {
        value_integral * half_width
    } else {
        value_integral
    };
    (moments, value)
}
3723
/// Relative agreement threshold for the progressive non-affine quadrature
/// ladder: two consecutive Gauss-Legendre rules must agree on every moment
/// slot to this tolerance, relative to the moment vector's own max
/// magnitude, before the finer rule's result is accepted. Gauss-Legendre
/// error decays geometrically in the node count for the analytic integrand
/// `exp(-q(z))`, so agreement between an n-node and a 2n-node rule certifies
/// that both are converged: the coarse rule's true error is bounded by the
/// observed difference plus the (much smaller) fine-rule error.
///
/// History (#979): a roundoff-floor relaxation of this test (accept when
/// successive rungs agree to `≈ n·ε·scale` rather than the bare `3e-15`) was
/// tried to let smooth cells certify below the terminal 384-node rung. It was
/// reverted: the value-bearing path carries `∫ φ(z)·Φ(η(z)) dz`, and `Φ`'s
/// `erfc` implementation has a *systematic per-z* error of order `1e-13` that
/// each rung's node set samples differently. Only the exact 384-node rule
/// reproduces the reference's erfc-noise realization, so any sub-384 rung
/// drifts from the 384 value by `≈ 1e-13` — a drift that is NOT truncation,
/// does NOT shrink with rung, and is NOT bounded by rung-to-rung agreement.
/// The moment ladder remains independent of the value integral so value- and
/// derivative-only evaluators keep returning bit-identical moments. The scalar
/// value now evaluates on the terminal 384-node rule directly, preserving the
/// `non_affine_cell_state_matches_prefold_reference_to_1e_minus_13` value
/// contract without forcing every derivative-moment caller to use the terminal
/// rung.
///
/// NOTE(review): the history paragraph quotes a bare threshold of `3e-15`
/// while the constant below is `1e-15` — confirm which figure is intended.
const NON_AFFINE_LADDER_RTOL: f64 = 1e-15;
3750
/// Node counts of the progressive ladder below the 384-node terminal rung,
/// in the order the ladder walks them (coarsest first). All divisible by 4
/// so the SIMD sweep needs no scalar tail. Position `i` in this array is
/// also the histogram slot used for rung `i` in
/// `NON_AFFINE_LADDER_CERT_COUNTS`.
const NON_AFFINE_LADDER_RUNGS: [usize; 5] = [12, 24, 48, 96, 192];
3754
3755/// Runtime-generated Gauss-Legendre rules for the ladder rungs, computed
3756/// once per process by Newton iteration on the Legendre polynomial roots
3757/// (standard `gauleg`: cosine initial guess, 3-4 Newton steps to machine
3758/// precision). The terminal 384-node rung reuses the compile-time
3759/// `GL_NODES`/`GL_WEIGHTS` tables, which also remain the single source for
3760/// the GPU kernel.
3761fn non_affine_ladder_rules() -> &'static [(Vec<f64>, Vec<f64>)] {
3762    static RULES: std::sync::OnceLock<Vec<(Vec<f64>, Vec<f64>)>> = std::sync::OnceLock::new();
3763    RULES.get_or_init(|| {
3764        NON_AFFINE_LADDER_RUNGS
3765            .iter()
3766            .map(|&n| gauss_legendre_rule(n))
3767            .collect()
3768    })
3769}
3770
/// Nodes and weights of the `n`-point Gauss-Legendre rule on `[-1, 1]`.
///
/// Each root of `P_n` in the non-negative half is found by Newton iteration
/// (at most 100 steps, stopping once a step moves the iterate by no more
/// than `f64::EPSILON`) from the cosine guess `cos(π(i + 0.75)/(n + 0.5))`.
/// The root and its negation are written to mirrored slots, so the returned
/// nodes are ascending and exactly antisymmetric about 0. Weights are
/// `w_i = 2 / ((1 - x_i²) P_n'(x_i)²)`, using the derivative from the last
/// Newton step. `n = 0` yields two empty vectors.
fn gauss_legendre_rule(n: usize) -> (Vec<f64>, Vec<f64>) {
    let order = n as f64;
    let mut nodes = vec![0.0_f64; n];
    let mut weights = vec![0.0_f64; n];
    let half = n.div_ceil(2);
    for idx in 0..half {
        let mirror = n - 1 - idx;
        let mut root = (std::f64::consts::PI * (idx as f64 + 0.75) / (order + 0.5)).cos();
        let mut slope = 0.0_f64;
        for _ in 0..100 {
            // Three-term Legendre recurrence; on exit `current = P_n(root)`
            // and `previous = P_{n-1}(root)`.
            let mut current = 1.0_f64;
            let mut previous = 0.0_f64;
            for j in 1..=n {
                let older = previous;
                previous = current;
                current =
                    ((2 * j - 1) as f64 * root * previous - (j - 1) as f64 * older) / j as f64;
            }
            // P_n'(root) from the derivative identity, then one Newton step.
            slope = order * (root * current - previous) / (root * root - 1.0);
            let refined = root - current / slope;
            let moved = (refined - root).abs();
            root = refined;
            if moved <= f64::EPSILON {
                break;
            }
        }
        // Negative root first, then the positive mirror; for odd `n` the two
        // slots coincide at the middle and the second write wins.
        nodes[idx] = -root;
        nodes[mirror] = root;
        let weight = 2.0 / ((1.0 - root * root) * slope * slope);
        weights[idx] = weight;
        weights[mirror] = weight;
    }
    (nodes, weights)
}
3807
3808/// Two-rule agreement certificate for the progressive ladder. `true` when
3809/// every MOMENT slot agrees to `NON_AFFINE_LADDER_RTOL` relative to the fine
3810/// result's max magnitude. Non-finite results never certify, so they fall
3811/// through to the terminal 384-node rung and reproduce the fixed rule's
3812/// behavior exactly.
3813///
3814/// The decision is deliberately moment-only and independent of whether the
3815/// caller also computed the cell value: the value- and derivative-only
3816/// evaluators MUST select the same ladder rung so they accumulate the moment
3817/// vector over the same nodes and return bit-identical moments (the
3818/// `derivative_moment_evaluator_matches_value_evaluator_moments` invariant).
3819/// Value-bearing callers evaluate the scalar cell probability separately on
3820/// the terminal 384-node rule; this certificate governs only the reusable
3821/// derivative moment vector.
3822fn non_affine_ladder_converged(coarse: &CellMomentVec, fine: &CellMomentVec) -> bool {
3823    let mut scale = 0.0_f64;
3824    let mut err = 0.0_f64;
3825    for (&c, &f) in coarse.iter().zip(fine.iter()) {
3826        scale = scale.max(f.abs());
3827        err = err.max((c - f).abs());
3828    }
3829    if !(scale.is_finite() && err.is_finite()) {
3830        return false;
3831    }
3832    err <= NON_AFFINE_LADDER_RTOL * scale
3833}
3834
3835/// Per-rung certification histogram for the non-affine ladder, indexed by the
3836/// rung that certified (`NON_AFFINE_LADDER_RUNGS[i]` at index `i`), with the
3837/// final slot counting cells that fell through to the terminal 384-node rule.
3838/// Incremented once per non-affine cell evaluation; the BMS exact-cache build
3839/// logs the distribution so the ladder's real cost (early-certify win vs.
3840/// terminal-fallthrough cost) is observable on every large-scale fit rather
3841/// than assumed. `+1` length for the terminal bucket.
3842pub(crate) static NON_AFFINE_LADDER_CERT_COUNTS: [AtomicU64; NON_AFFINE_LADDER_RUNGS.len() + 1] = [
3843    AtomicU64::new(0),
3844    AtomicU64::new(0),
3845    AtomicU64::new(0),
3846    AtomicU64::new(0),
3847    AtomicU64::new(0),
3848    AtomicU64::new(0),
3849];
3850
3851/// Snapshot the ladder certification histogram as `(rung_node_count, count)`
3852/// pairs plus the terminal-fallthrough count, for logging/inspection.
3853pub fn non_affine_ladder_cert_histogram() -> (Vec<(usize, u64)>, u64) {
3854    let per_rung = NON_AFFINE_LADDER_RUNGS
3855        .iter()
3856        .enumerate()
3857        .map(|(i, &n)| (n, NON_AFFINE_LADDER_CERT_COUNTS[i].load(Ordering::Relaxed)))
3858        .collect();
3859    let terminal =
3860        NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].load(Ordering::Relaxed);
3861    (per_rung, terminal)
3862}
3863
3864/// Progressive-ladder evaluation of a non-affine cell: walk the rule ladder
3865/// from 12 nodes upward and return the first result certified by two-rule
3866/// agreement; a cell that never certifies returns the terminal 384-node
3867/// result, byte-identical to the previous fixed-rule implementation.
3868#[inline]
3869fn evaluate_non_affine_cell_simd<const COMPUTE_VALUE: bool>(
3870    cell: DenestedCubicCell,
3871    max_degree: usize,
3872) -> (CellMomentVec, f64) {
3873    let mut prev: Option<(CellMomentVec, f64)> = None;
3874    for (i, (nodes, weights)) in non_affine_ladder_rules().iter().enumerate() {
3875        let cur =
3876            evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, nodes, weights);
3877        if let Some(prev) = prev.as_ref()
3878            && non_affine_ladder_converged(&prev.0, &cur.0)
3879        {
3880            NON_AFFINE_LADDER_CERT_COUNTS[i].fetch_add(1, Ordering::Relaxed);
3881            return cur;
3882        }
3883        prev = Some(cur);
3884    }
3885    NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].fetch_add(1, Ordering::Relaxed);
3886    evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, &GL_NODES, &GL_WEIGHTS)
3887}
3888
3889/// Value-only evaluation of a non-affine cell on the terminal 384-node rule.
3890///
3891/// Returns the cell probability integral `∫ exp(-½z²)·Φ(η(z)) dz` (pre the
3892/// `1/√τ` normalization) computed bit-for-bit like the value branch of
3893/// [`evaluate_non_affine_cell_with_rule`]: the non-fused node map
3894/// `z = center + half_width·node`, the expanded (non-Horner)
3895/// `η = c0 + c1·z + c2·z² + c3·z³`, the unscaled GL weight, a scalar
3896/// `exp(-½z²)`, a plain `+=` in ascending node order, and a single trailing
3897/// `·half_width`. The terminal rule has 384 nodes (divisible by 4), so the
3898/// general kernel's value path never takes its scalar tail — this loop walks
3899/// the same nodes in the same order and therefore reproduces the reference
3900/// erfc-noise realization the `1e-13` value contract pins down.
3901///
3902/// Computing this through `evaluate_non_affine_cell_with_rule::<true>` at
3903/// `max_degree = 0` would additionally run the 4-wide SIMD `exp(-q)` moment
3904/// sweep and a moment accumulation on every node only to discard the moment
3905/// vector. The survival marginal-slope fit evaluates a value per non-affine
3906/// partition cell, so that discarded moment work is the dominant waste in the
3907/// per-cell pass; this evaluator does only the work the value needs.
3908fn evaluate_non_affine_cell_value_terminal(cell: DenestedCubicCell) -> f64 {
3909    let center = 0.5 * (cell.left + cell.right);
3910    let half_width = 0.5 * (cell.right - cell.left);
3911    let c0 = cell.c0;
3912    let c1 = cell.c1;
3913    let c2 = cell.c2;
3914    let c3 = cell.c3;
3915    let mut value_integral = 0.0_f64;
3916    for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
3917        let z = center + half_width * node;
3918        let eta = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
3919        value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
3920    }
3921    value_integral * half_width
3922}
3923
3924fn evaluate_non_affine_cell_state(
3925    cell: DenestedCubicCell,
3926    branch: ExactCellBranch,
3927    max_degree: usize,
3928) -> Result<CellMomentState, String> {
3929    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3930    let value_integral = evaluate_non_affine_cell_value_terminal(cell);
3931    // Reference structure: `value_integral * half_width / sqrt(TAU)`. The
3932    // half_width factor is already applied inside the rule evaluator, so divide
3933    // by sqrt(TAU) here (a true division, NOT multiply-by-reciprocal) to
3934    // reproduce the reference's final rounding bit-for-bit.
3935    Ok(CellMomentState {
3936        branch,
3937        value: value_integral / (std::f64::consts::TAU).sqrt(),
3938        moments,
3939    })
3940}
3941
3942fn evaluate_non_affine_cell_derivative_state(
3943    cell: DenestedCubicCell,
3944    branch: ExactCellBranch,
3945    max_degree: usize,
3946) -> Result<CellDerivativeMomentState, String> {
3947    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3948    Ok(CellDerivativeMomentState { branch, moments })
3949}
3950
3951/// De-nested cubic cell evaluator.
3952///
3953/// Affine cells use the closed-form affine anchor; non-affine cells (Quartic
3954/// and Sextic branches) are evaluated in a single pass over a fixed
3955/// high-order Gauss-Legendre rule on `[left, right]`.
3956pub fn evaluate_cell_moments(
3957    cell: DenestedCubicCell,
3958    max_degree: usize,
3959) -> Result<CellMomentState, String> {
3960    if !TAIL_CELL_MOMENT_CACHE_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
3961        return evaluate_cell_moments_uncached(cell, max_degree);
3962    }
3963    tail_cell_moment_cache().evaluate(cell, max_degree)
3964}
3965
3966/// Evaluate cell moments without consulting the global affine-tail memo.
3967///
3968/// This is retained for regression tests and before/after microbenchmarks;
3969/// production callers should use [`evaluate_cell_moments`].
3970pub fn evaluate_cell_moments_uncached(
3971    cell: DenestedCubicCell,
3972    max_degree: usize,
3973) -> Result<CellMomentState, String> {
3974    evaluate_cell_state_dispatched(
3975        cell,
3976        max_degree,
3977        evaluate_affine_cell_state,
3978        evaluate_non_affine_cell_state,
3979    )
3980}
3981
3982/// Evaluate only the moment vector needed by derivative contractions.
3983///
3984/// This deliberately does not compute the cell probability value
3985/// `∫ φ(z) Φ(η(z)) dz`. Derivative contractions consume
3986/// `∫ z^k exp(-q(z)) dz` moments only, so keeping the value out of the return
3987/// type prevents this cheaper evaluator from satisfying value-bearing calls.
3988pub fn evaluate_cell_derivative_moments_uncached(
3989    cell: DenestedCubicCell,
3990    max_degree: usize,
3991) -> Result<CellDerivativeMomentState, String> {
3992    evaluate_cell_state_dispatched(
3993        cell,
3994        max_degree,
3995        evaluate_affine_cell_derivative_state,
3996        evaluate_non_affine_cell_derivative_state,
3997    )
3998}
3999
4000/// Shared branch dispatch for the value-bearing and derivative-only cell
4001/// evaluators. Both walk the same decision tree (semi-infinite tail → must
4002/// be affine; finite cell → branch-by-coefficients with the sextic
4003/// degenerate-lowering path), differing only in which pair of
4004/// `(affine, non_affine)` evaluator helpers to delegate to.  The two helpers
4005/// are passed as `fn` pointers so the dispatch monomorphizes per `S` and
4006/// keeps the existing pre-condition errors / unreachable branch handling
4007/// in lockstep across both evaluators.
4008fn evaluate_cell_state_dispatched<S>(
4009    cell: DenestedCubicCell,
4010    max_degree: usize,
4011    affine: fn(DenestedCubicCell, usize) -> Result<S, String>,
4012    non_affine: fn(DenestedCubicCell, ExactCellBranch, usize) -> Result<S, String>,
4013) -> Result<S, String> {
4014    let left_inf = !cell.left.is_finite();
4015    let right_inf = !cell.right.is_finite();
4016    if left_inf || right_inf {
4017        // Semi-infinite tail cells must be affine: the deviation saturates
4018        // to a constant outside support, so c2=c3=0.  Both the BVN CDF
4019        // and the truncated-Gaussian moment vector handle infinite bounds.
4020        if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL
4021        {
4022            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4023                "semi-infinite cell [{}, {}] must be affine (c2=c3=0), got c2={:.3e}, c3={:.3e}",
4024                cell.left, cell.right, cell.c2, cell.c3
4025            ))
4026            .into());
4027        }
4028        return affine(cell, max_degree);
4029    }
4030    if cell.right <= cell.left {
4031        return Err(CubicCellKernelError::invalid_cell_shape(format!(
4032            "finite cell must have left < right, got [{}, {}]",
4033            cell.left, cell.right
4034        ))
4035        .into());
4036    }
4037    let branch = branch_cell(cell)?;
4038    if branch == ExactCellBranch::Affine {
4039        return affine(cell, max_degree);
4040    }
4041    if branch == ExactCellBranch::Sextic {
4042        let lead = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3)[5];
4043        if !lead.is_finite() {
4044            return Err(CubicCellKernelError::invalid_cell_shape(format!(
4045                "sextic cell evaluation encountered non-finite leading coefficient: {lead:.3e}"
4046            ))
4047            .into());
4048        }
4049        if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
4050            return match lower_branch {
4051                ExactCellBranch::Quartic => non_affine(
4052                    DenestedCubicCell { c3: 0.0, ..cell },
4053                    ExactCellBranch::Quartic,
4054                    max_degree,
4055                ),
4056                ExactCellBranch::Affine => affine(
4057                    DenestedCubicCell {
4058                        c2: 0.0,
4059                        c3: 0.0,
4060                        ..cell
4061                    },
4062                    max_degree,
4063                ),
4064                ExactCellBranch::Sextic => Err(CubicCellKernelError::invalid_cell_shape(
4065                    "internal: degenerate_sextic_branch returned Sextic as a lowered branch",
4066                )
4067                .into()),
4068            };
4069        }
4070    }
4071    non_affine(cell, branch, max_degree)
4072}
4073
4074/// Evaluate a de-nested cubic cell through a fit-lifetime byte-limited LRU cache.
4075///
4076/// The fingerprint is an exact bit-cast of `(c0, c1, c2, c3, left, right)`, so
4077/// eviction and reuse cannot alias nearby-but-different cells.  A cached entry
4078/// computed to a higher degree may satisfy a lower-degree request by truncating
4079/// the moment vector, preserving the public [`evaluate_cell_moments`] contract.
4080pub fn evaluate_cell_moments_cached(
4081    cell: DenestedCubicCell,
4082    max_degree: usize,
4083    cache: &CellMomentLruCache,
4084    stats: Option<&CellMomentCacheStats>,
4085) -> Result<CellMomentState, String> {
4086    // Affine cells (every rigid-path cell and every tail cell) evaluate
4087    // through the closed-form anchor — cheaper than a single LRU probe. The
4088    // LRU exists only to amortize the EXPENSIVE non-affine transport across
4089    // recurring cells; at large n the row scalars `(a, b)` are unique per
4090    // row, so affine cells never recur and routing them through the sharded
4091    // mutex was pure cost (320k lock+insert+evict ops per gradient eval, ~0%
4092    // hit — the dominant cost of the rigid n=320k fit, #979). Bypass the
4093    // cache entirely for them.
4094    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4095        if let Some(stats) = stats {
4096            stats.misses.fetch_add(1, Ordering::Relaxed);
4097        }
4098        return evaluate_cell_moments_uncached(cell, max_degree);
4099    }
4100    let key = CellFingerprint::new(cell);
4101    let existing_derivative = match cache.get(&key) {
4102        Some(cached) => {
4103            if let Some(state) = cached.state_for_degree(max_degree) {
4104                if let Some(stats) = stats {
4105                    stats.hits.fetch_add(1, Ordering::Relaxed);
4106                }
4107                return Ok(state);
4108            }
4109            // `cached.derivative_state` is `Option<Arc<_>>`; `.clone()` here
4110            // is the cheap refcount bump the audit-39 fix targets, not a
4111            // full moment-vector deep clone.
4112            cached.derivative_state.clone()
4113        }
4114        None => None,
4115    };
4116    if let Some(stats) = stats {
4117        stats.misses.fetch_add(1, Ordering::Relaxed);
4118    }
4119    let state = evaluate_cell_moments(cell, max_degree)?;
4120    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4121    // through `Arc::clone`, and return the underlying value by unwrapping the
4122    // unique-reference (caller-side) `Arc`. This replaces the prior
4123    // `state.clone()` deep copy at the insert site.
4124    let shared = Arc::new(state);
4125    let mut entry = CachedCellMoments::new(Arc::clone(&shared));
4126    if let Some(derivative) = existing_derivative {
4127        entry = entry.with_derivative(derivative);
4128    }
4129    cache.insert(key, entry);
4130    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4131}
4132
4133/// Derivative-moment counterpart to [`evaluate_cell_moments_cached`]. Shares
4134/// the value-moment LRU by storing both moment kinds in a single
4135/// [`CachedCellMoments`] entry keyed on the cell fingerprint — derivative
4136/// insertions preserve any pre-existing value state and vice versa, so the
4137/// two callers never evict each other's work.
4138pub fn evaluate_cell_derivative_moments_cached(
4139    cell: DenestedCubicCell,
4140    max_degree: usize,
4141    cache: &CellMomentLruCache,
4142    stats: Option<&CellMomentCacheStats>,
4143) -> Result<CellDerivativeMomentState, String> {
4144    // Affine cells bypass the LRU — see `evaluate_cell_moments_cached` for
4145    // why the sharded-mutex memo is pure overhead on the closed-form affine
4146    // path at large n (#979).
4147    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4148        if let Some(stats) = stats {
4149            stats.misses.fetch_add(1, Ordering::Relaxed);
4150        }
4151        return evaluate_cell_derivative_moments_uncached(cell, max_degree);
4152    }
4153    let key = CellFingerprint::new(cell);
4154    let existing_value = match cache.get(&key) {
4155        Some(cached) => {
4156            if let Some(state) = cached.derivative_state_for_degree(max_degree) {
4157                if let Some(stats) = stats {
4158                    stats.hits.fetch_add(1, Ordering::Relaxed);
4159                }
4160                return Ok(state);
4161            }
4162            // `cached.state` is `Option<Arc<_>>`; `.clone()` here is the cheap
4163            // refcount bump the audit-39 fix targets, not a full moment-vector
4164            // deep clone.
4165            cached.state.clone()
4166        }
4167        None => None,
4168    };
4169    if let Some(stats) = stats {
4170        stats.misses.fetch_add(1, Ordering::Relaxed);
4171    }
4172    let state = evaluate_cell_derivative_moments_uncached(cell, max_degree)?;
4173    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4174    // through `Arc::clone`, and return the underlying value by unwrapping the
4175    // unique-reference (caller-side) `Arc`. This replaces the prior
4176    // `state.clone()` deep copy at the insert site.
4177    let shared = Arc::new(state);
4178    let mut entry = CachedCellMoments::new_derivative(Arc::clone(&shared));
4179    if let Some(value) = existing_value {
4180        entry = entry.with_value(value);
4181    }
4182    cache.insert(key, entry);
4183    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4184}
4185
4186/// Scratch-backed variant of [`evaluate_cell_moments`].
4187///
4188/// Reuses the supplied [`CellMomentScratch`] for the returned moments slice,
4189/// so repeated calls with the same scratch (and a sufficient initial capacity)
4190/// avoid per-call `Vec` allocations on the hot inner-PIRLS row-intercept
4191/// solver path. Internal transport allocations are unchanged.
4192pub fn evaluate_cell_moments_with_scratch<'a>(
4193    cell: DenestedCubicCell,
4194    max_degree: usize,
4195    scratch: &'a mut CellMomentScratch,
4196) -> Result<CellMomentStateRef<'a>, String> {
4197    let state = evaluate_cell_moments(cell, max_degree)?;
4198    let out = scratch.prepare_moments(max_degree + 1);
4199    out.copy_from_slice(&state.moments);
4200    Ok(CellMomentStateRef {
4201        branch: state.branch,
4202        value: state.value,
4203        moments: out,
4204    })
4205}
4206
4207#[cfg(test)]
4208mod tests {
4209    use super::*;
4210    use gam_math::probability::normal_pdf;
4211
4212    #[inline]
4213    pub(super) fn polynomial_value(coefficients: &[f64], z: f64) -> f64 {
4214        coefficients
4215            .iter()
4216            .rev()
4217            .fold(0.0, |acc, &coeff| acc * z + coeff)
4218    }
4219
4220    fn reset_cell_moment_test_reallocs() {
4221        super::CELL_MOMENT_REALLOCS.store(0, std::sync::atomic::Ordering::Relaxed);
4222    }
4223
4224    fn cell_moment_test_reallocs() -> usize {
4225        super::CELL_MOMENT_REALLOCS.load(std::sync::atomic::Ordering::Relaxed)
4226    }
4227
4228    fn assert_close_rel(label: &str, actual: f64, expected: f64, tol: f64) {
4229        let denom = expected.abs().max(1.0);
4230        let rel = (actual - expected).abs() / denom;
4231        assert!(
4232            rel <= tol,
4233            "{label}: actual={actual:.17e} expected={expected:.17e} rel={rel:.3e} tol={tol:.3e}"
4234        );
4235    }
4236
4237    // The link-basis cell coefficient `transformed_link_cubic(span, a, b)` is, in
4238    // each of its four output components, a polynomial of TOTAL degree exactly 3 in
4239    // (a, b):
4240    //   d0 = c0 + c1·s + c2·s² + c3·s³            (s = a − left; deg 3 in a)
4241    //   d1 = b·(c1 + 2c2·s + 3c3·s²)              (a²·b → total deg 3)
4242    //   d2 = b²·(c2 + 3c3·s)                       (a·b² → total deg 3)
4243    //   d3 = c3·b³                                 (b³  → total deg 3)
4244    // Therefore EVERY 4th-order total (a,b)-partial (∂⁴/∂aⁱ∂b^{4−i}) is identically
4245    // zero, while the 3rd-order partials (∂³/∂aⁱ∂b^{3−i}) are the highest nonzero
4246    // ones. This is the exact algebraic fact the bidirectional flex jet relies on:
4247    // a "second mixed derivative of a third-a-partial" slot, etc., demands a 4th
4248    // total (a,b)-partial and must be hard-zero — substituting a (nonzero) 3rd
4249    // partial there is a bug. This test certifies BOTH facts by central FD so the
4250    // hard-coded `0.0` fixes are provably correct and provably necessary.
4251    #[test]
4252    fn link_basis_cell_fourth_ab_partials_vanish_third_are_nonzero() {
4253        let span = LocalSpanCubic {
4254            left: -0.4,
4255            right: 1.6,
4256            c0: 0.37,
4257            c1: -0.81,
4258            c2: 0.53,
4259            c3: -0.29,
4260        };
4261        let a0 = 0.23_f64;
4262        let b0 = 0.61_f64;
4263        let h = 1e-2_f64;
4264
4265        // Generic central-difference stencils per derivative order.
4266        let stencil = |order: usize| -> &'static [(i64, f64)] {
4267            match order {
4268                0 => &[(0, 1.0)],
4269                1 => &[(-1, -0.5), (1, 0.5)],
4270                2 => &[(-1, 1.0), (0, -2.0), (1, 1.0)],
4271                3 => &[(-2, -0.5), (-1, 1.0), (1, -1.0), (2, 0.5)],
4272                4 => &[(-2, 1.0), (-1, -4.0), (0, 6.0), (1, -4.0), (2, 1.0)],
4273                _ => &[(0, 1.0)],
4274            }
4275        };
4276        // FD of component `k` of the cell coefficient: ∂^{na+nb}/∂a^{na}∂b^{nb}.
4277        let fd = |k: usize, na: usize, nb: usize| -> f64 {
4278            let mut acc = 0.0;
4279            for &(ia, wa) in stencil(na) {
4280                for &(ib, wb) in stencil(nb) {
4281                    let a = a0 + (ia as f64) * h;
4282                    let b = b0 + (ib as f64) * h;
4283                    acc += wa * wb * link_basis_cell_coefficients(span, a, b)[k];
4284                }
4285            }
4286            acc / h.powi((na + nb) as i32)
4287        };
4288
4289        let (p3_aaa, p3_aab, p3_abb, p3_bbb) = link_basis_cell_third_partials(span);
4290
4291        // (1) The analytic 3rd partials match FD (within FD truncation) — and at
4292        // least one is appreciably nonzero, so these are real signal that a wrong
4293        // slot would inject.
4294        let mut max_third = 0.0_f64;
4295        for k in 0..4 {
4296            for (label, (na, nb), analytic) in [
4297                ("aaa", (3usize, 0usize), p3_aaa[k]),
4298                ("aab", (2, 1), p3_aab[k]),
4299                ("abb", (1, 2), p3_abb[k]),
4300                ("bbb", (0, 3), p3_bbb[k]),
4301            ] {
4302                let got = fd(k, na, nb);
4303                assert!(
4304                    (got - analytic).abs() <= 1e-4 + 1e-3 * analytic.abs(),
4305                    "3rd partial {label}[{k}] analytic {analytic:+.6e} vs FD {got:+.6e}"
4306                );
4307                max_third = max_third.max(analytic.abs());
4308            }
4309        }
4310        assert!(
4311            max_third > 1e-1,
4312            "expected an appreciable nonzero 3rd (a,b)-partial; max |analytic| = {max_third:.3e}"
4313        );
4314
4315        // (2) EVERY 4th-order total (a,b)-partial vanishes (degree-3 polynomial),
4316        // certifying that the hard-coded `0.0` in the bidirectional d12 slots is the
4317        // mathematically required value, not an approximation.
4318        for k in 0..4 {
4319            for (na, nb) in [(4usize, 0usize), (3, 1), (2, 2), (1, 3), (0, 4)] {
4320                let got = fd(k, na, nb);
4321                assert!(
4322                    got.abs() <= 1e-2,
4323                    "4th (a,b)-partial ∂^{na}_a∂^{nb}_b of cell coeff[{k}] must vanish, FD = {got:+.6e}"
4324                );
4325            }
4326        }
4327    }
4328
4329    #[test]
4330    fn non_affine_cell_state_grid_matches_public_cell_moments_reference() {
4331        let cells = [
4332            DenestedCubicCell {
4333                left: -1.25,
4334                right: -0.2,
4335                c0: -0.35,
4336                c1: 0.85,
4337                c2: 0.04,
4338                c3: -0.015,
4339            },
4340            DenestedCubicCell {
4341                left: -0.2,
4342                right: 0.55,
4343                c0: 0.12,
4344                c1: -0.65,
4345                c2: -0.025,
4346                c3: 0.02,
4347            },
4348            DenestedCubicCell {
4349                left: 0.55,
4350                right: 1.6,
4351                c0: 0.42,
4352                c1: 0.35,
4353                c2: 0.018,
4354                c3: 0.012,
4355            },
4356        ];
4357        for cell in cells {
4358            let branch = branch_cell(cell).expect("branch");
4359            assert_ne!(branch, ExactCellBranch::Affine);
4360            for max_degree in [0usize, 2, 4, 9, 16] {
4361                let direct = evaluate_non_affine_cell_state(cell, branch, max_degree)
4362                    .expect("direct non-affine transport");
4363                let public = evaluate_cell_moments(cell, max_degree).expect("public evaluator");
4364                assert_eq!(direct.branch, public.branch);
4365                assert_eq!(direct.moments.len(), public.moments.len());
4366                let value_scale = direct.value.abs().max(public.value.abs()).max(1.0);
4367                assert!(
4368                    (direct.value - public.value).abs() <= 1e-10 * value_scale,
4369                    "value mismatch for {cell:?} degree {max_degree}: direct={} public={}",
4370                    direct.value,
4371                    public.value
4372                );
4373                for (degree, (lhs, rhs)) in
4374                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4375                {
4376                    let scale = lhs.abs().max(rhs.abs()).max(1.0);
4377                    assert!(
4378                        (lhs - rhs).abs() <= 1e-10 * scale,
4379                        "moment {degree} mismatch for {cell:?} degree {max_degree}: {lhs} vs {rhs}"
4380                    );
4381                }
4382            }
4383        }
4384    }
4385
4386    #[test]
4387    fn affine_tail_cell_memo_matches_uncached_grid_and_records_hits() {
4388        // Use a dedicated local cache so the test's hit/miss/entry counters
4389        // are not perturbed by concurrent tests that drive the shared
4390        // global memo through `evaluate_cell_moments`. Asserting on the
4391        // global counters made this test race-flaky when the suite ran in
4392        // parallel.
4393        let cache = TailCellMomentCache::new();
4394        let c0s = [-2.0, -0.25, 0.0, 1.5];
4395        let c1s = [-1.2, -0.05, 0.0, 0.8];
4396        let endpoints = [-4.0, -1.0, 0.0, 2.5, 6.0];
4397        let degrees = [0_usize, 4, 9, 16, 24];
4398
4399        for &c0 in &c0s {
4400            for &c1 in &c1s {
4401                for &endpoint in &endpoints {
4402                    for &max_degree in &degrees {
4403                        for &(left, right) in
4404                            &[(f64::NEG_INFINITY, endpoint), (endpoint, f64::INFINITY)]
4405                        {
4406                            let cell = DenestedCubicCell {
4407                                left,
4408                                right,
4409                                c0,
4410                                c1,
4411                                c2: 0.0,
4412                                c3: 0.0,
4413                            };
4414                            let expected = evaluate_cell_moments_uncached(cell, max_degree)
4415                                .expect("uncached affine tail moments");
4416                            let actual = cache
4417                                .evaluate(cell, max_degree)
4418                                .expect("cached affine tail moments miss");
4419                            let repeat = cache
4420                                .evaluate(cell, max_degree)
4421                                .expect("cached affine tail moments hit");
4422                            assert_eq!(actual.branch, expected.branch);
4423                            assert_eq!(repeat.branch, expected.branch);
4424                            assert_close_rel(
4425                                "tail value miss",
4426                                actual.value,
4427                                expected.value,
4428                                1e-14,
4429                            );
4430                            assert_close_rel("tail value hit", repeat.value, expected.value, 1e-14);
4431                            assert_eq!(actual.moments.len(), expected.moments.len());
4432                            assert_eq!(repeat.moments.len(), expected.moments.len());
4433                            for (idx, ((a, r), e)) in actual
4434                                .moments
4435                                .iter()
4436                                .zip(repeat.moments.iter())
4437                                .zip(expected.moments.iter())
4438                                .enumerate()
4439                            {
4440                                assert_close_rel(
4441                                    &format!("tail moment miss[{idx}]"),
4442                                    *a,
4443                                    *e,
4444                                    1e-14,
4445                                );
4446                                assert_close_rel(&format!("tail moment hit[{idx}]"), *r, *e, 1e-14);
4447                            }
4448                        }
4449                    }
4450                }
4451            }
4452        }
4453
4454        let stats = cache.stats();
4455        assert_eq!(stats.misses, stats.entries);
4456        assert!(
4457            stats.hits >= stats.misses,
4458            "expected repeat hits: {stats:?}"
4459        );
4460        assert!(
4461            stats.hit_rate() >= 0.5,
4462            "unexpected low hit rate: {stats:?}"
4463        );
4464    }
4465
4466    fn reference_bivariate_normal_cdf_20(h: f64, k: f64, rho: f64) -> f64 {
4467        if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
4468            return 0.0;
4469        }
4470        if h == f64::INFINITY {
4471            return normal_cdf(k);
4472        }
4473        if k == f64::INFINITY {
4474            return normal_cdf(h);
4475        }
4476        let rho_clamped = rho.clamp(-1.0, 1.0);
4477        if rho_clamped >= 1.0 - 1e-12 {
4478            return normal_cdf(h.min(k));
4479        }
4480        if rho_clamped <= -1.0 + 1e-12 {
4481            return (normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0);
4482        }
4483
4484        let hs = 0.5 * (h * h + k * k);
4485        let asr = rho_clamped.asin();
4486        let mut sum = 0.0;
4487        for (&node, &weight) in GL20_NODES.iter().zip(GL20_WEIGHTS.iter()) {
4488            let sn = (0.5 * asr * (node + 1.0)).sin();
4489            let one_minus = 1.0 - sn * sn;
4490            let expo = ((sn * h * k) - hs) / one_minus;
4491            sum += weight * expo.exp();
4492        }
4493        (normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0)
4494    }
4495
4496    #[test]
4497    fn non_affine_cell_state_reference_grid_matches_public_moments() {
4498        let c0s = [-0.4, 0.0, 0.35];
4499        let c1s = [-0.8, 0.25, 1.1];
4500        let c2s = [-0.12, 0.08];
4501        let c3s = [-0.04, 0.03];
4502        let intervals = [(-1.25, -0.2), (-0.5, 0.75), (0.1, 1.4)];
4503        let degrees = [3usize, 6, 9, 12];
4504
4505        for &c0 in &c0s {
4506            for &c1 in &c1s {
4507                for &c2 in &c2s {
4508                    for &c3 in &c3s {
4509                        for &(left, right) in &intervals {
4510                            let cell = DenestedCubicCell {
4511                                left,
4512                                right,
4513                                c0,
4514                                c1,
4515                                c2,
4516                                c3,
4517                            };
4518                            let branch = branch_cell(cell).expect("branch");
4519                            assert_ne!(branch, ExactCellBranch::Affine);
4520                            for &degree in &degrees {
4521                                let direct = evaluate_non_affine_cell_state(cell, branch, degree)
4522                                    .expect("direct non-affine state");
4523                                let public = evaluate_cell_moments(cell, degree)
4524                                    .expect("public non-affine state");
4525                                assert_eq!(direct.branch, public.branch);
4526                                let value_scale =
4527                                    direct.value.abs().max(public.value.abs()).max(1.0);
4528                                assert!(
4529                                    (direct.value - public.value).abs() / value_scale <= 1.0e-15,
4530                                    "value mismatch for {cell:?}, degree {degree}: direct={:.17e}, public={:.17e}",
4531                                    direct.value,
4532                                    public.value
4533                                );
4534                                assert_eq!(direct.moments.len(), public.moments.len());
4535                                for (idx, (&a, &b)) in
4536                                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4537                                {
4538                                    let scale = a.abs().max(b.abs()).max(1.0);
4539                                    assert!(
4540                                        (a - b).abs() / scale <= 1.0e-15,
4541                                        "moment {idx} mismatch for {cell:?}, degree {degree}: direct={a:.17e}, public={b:.17e}"
4542                                    );
4543                                }
4544                            }
4545                        }
4546                    }
4547                }
4548            }
4549        }
4550    }
4551
4552    #[test]
4553    fn bivariate_normal_cdf_matches_reference_grid_to_1e_minus_10() {
4554        let hs = [-8.0, -5.0, -3.0, -1.5, -0.5, 0.0, 0.25, 1.0, 2.5, 5.0, 8.0];
4555        let ks = [-8.0, -4.0, -2.0, -0.75, 0.0, 0.4, 1.25, 3.0, 6.0, 8.0];
4556        let rhos = [
4557            -0.999_999_999_999,
4558            -0.999,
4559            -0.95,
4560            -0.7,
4561            -0.3,
4562            -1.0e-12,
4563            0.0,
4564            1.0e-12,
4565            0.3,
4566            0.7,
4567            0.95,
4568            0.999,
4569            0.999_999_999_999,
4570        ];
4571        for &h in &hs {
4572            for &k in &ks {
4573                for &rho in &rhos {
4574                    let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4575                    let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4576                    let scale = expected.abs().max(1.0e-300);
4577                    let rel = (actual - expected).abs() / scale;
4578                    assert!(
4579                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4580                        "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4581                    );
4582                }
4583            }
4584        }
4585    }
4586
4587    #[test]
4588    fn bivariate_normal_cdf_matches_reference_lcg_property_samples() {
4589        let mut seed = 0x5eed_cafe_f00d_u64;
4590        let mut next_unit = || {
4591            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
4592            ((seed >> 11) as f64) * (1.0 / ((1_u64 << 53) as f64))
4593        };
4594        for _ in 0..4096 {
4595            let h = -8.0 + 16.0 * next_unit();
4596            let k = -8.0 + 16.0 * next_unit();
4597            let rho = -0.999 + 1.998 * next_unit();
4598            let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4599            let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4600            let scale = expected.abs().max(1.0e-300);
4601            let rel = (actual - expected).abs() / scale;
4602            assert!(
4603                rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4604                "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4605            );
4606        }
4607    }
4608
4609    #[test]
4610    fn affine_bvn_interval_primitive_matches_two_cdf_difference() {
4611        let hs = [-6.0, -2.0, -0.25, 0.0, 0.8, 3.0, 6.0];
4612        let bounds = [
4613            (-5.0, -2.0),
4614            (-3.0, -0.1),
4615            (-1.0, 0.0),
4616            (-0.25, 0.75),
4617            (0.2, 3.5),
4618            (2.0, 7.0),
4619        ];
4620        let rhos = [-0.98, -0.8, -0.25, 0.0, 0.25, 0.8, 0.98];
4621        for &h in &hs {
4622            for &(left, right) in &bounds {
4623                for &rho in &rhos {
4624                    let actual =
4625                        bivariate_normal_cdf_interval(h, left, right, rho).expect("interval");
4626                    let expected = (reference_bivariate_normal_cdf_20(h, right, rho)
4627                        - reference_bivariate_normal_cdf_20(h, left, rho))
4628                    .clamp(0.0, 1.0);
4629                    let scale = expected.abs().max(1.0e-300);
4630                    let rel = (actual - expected).abs() / scale;
4631                    assert!(
4632                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-12,
4633                        "h={h} left={left} right={right} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4634                    );
4635                }
4636            }
4637        }
4638    }
4639
4640    fn simpson_integral<F>(left: f64, right: f64, steps: usize, f: F) -> f64
4641    where
4642        F: Fn(f64) -> f64,
4643    {
4644        let n = if steps.is_multiple_of(2) {
4645            steps
4646        } else {
4647            steps + 1
4648        };
4649        let h = (right - left) / n as f64;
4650        let mut acc = f(left) + f(right);
4651        for k in 1..n {
4652            let x = left + h * k as f64;
4653            let w = if k % 2 == 0 { 2.0 } else { 4.0 };
4654            acc += w * f(x);
4655        }
4656        acc * h / 3.0
4657    }
4658
4659    #[test]
4660    fn global_transform_preserves_local_span_polynomial() {
4661        let span = LocalSpanCubic {
4662            left: -1.2,
4663            right: 0.8,
4664            c0: 0.3,
4665            c1: -0.25,
4666            c2: 0.11,
4667            c3: -0.04,
4668        };
4669        let (g0, g1, g2, g3) = global_cubic_from_local(span);
4670        for &x in &[-1.2, -0.7, -0.1, 0.4, 0.8] {
4671            let local = span.evaluate(x);
4672            let global = g0 + g1 * x + g2 * x * x + g3 * x * x * x;
4673            assert!((local - global).abs() < 1e-12);
4674        }
4675    }
4676
4677    #[test]
4678    fn bivariate_normal_cdf_independent_factorizes() {
4679        let h = -0.35;
4680        let k = 0.8;
4681        let out = bivariate_normal_cdf(h, k, 0.0).expect("bvn");
4682        let target = normal_cdf(h) * normal_cdf(k);
4683        assert!((out - target).abs() < 1e-12);
4684    }
4685
4686    #[test]
4687    fn evaluate_affine_cell_state_matches_numeric_integrals() {
4688        let cell = DenestedCubicCell {
4689            left: -0.9,
4690            right: 0.8,
4691            c0: 0.15,
4692            c1: -0.35,
4693            c2: 0.0,
4694            c3: 0.0,
4695        };
4696        let state = evaluate_affine_cell_state(cell, 6).expect("affine cell");
4697        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
4698            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
4699        });
4700        assert_eq!(state.branch, ExactCellBranch::Affine);
4701        assert!((state.value - value_numeric).abs() < 1e-9);
4702        for degree in 0..=6 {
4703            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
4704                z.powi(degree as i32) * (-cell.q(z)).exp()
4705            });
4706            assert!((state.moments[degree] - target).abs() < 1e-9);
4707        }
4708    }
4709
4710    #[test]
4711    fn affine_cell_value_matches_zero_moment_derivative() {
4712        let cell = DenestedCubicCell {
4713            left: -1.1,
4714            right: 0.7,
4715            c0: 0.23,
4716            c1: -0.41,
4717            c2: 0.0,
4718            c3: 0.0,
4719        };
4720        let h = 1e-6;
4721        let plus = evaluate_affine_cell_state(
4722            DenestedCubicCell {
4723                c0: cell.c0 + h,
4724                ..cell
4725            },
4726            0,
4727        )
4728        .expect("affine plus");
4729        let minus = evaluate_affine_cell_state(
4730            DenestedCubicCell {
4731                c0: cell.c0 - h,
4732                ..cell
4733            },
4734            0,
4735        )
4736        .expect("affine minus");
4737        let center = evaluate_affine_cell_state(cell, 0).expect("affine center");
4738        let d_value = (plus.value - minus.value) / (2.0 * h);
4739        let target = INV_TWO_PI * center.moments[0];
4740        assert!((d_value - target).abs() < 1e-8);
4741    }
4742
4743    #[test]
4744    fn coefficient_partials_match_exact_span_derivatives() {
4745        let score_span = LocalSpanCubic {
4746            left: -0.75,
4747            right: 0.25,
4748            c0: 0.08,
4749            c1: -0.03,
4750            c2: 0.02,
4751            c3: -0.01,
4752        };
4753        let link_span = LocalSpanCubic {
4754            left: -0.6,
4755            right: 0.9,
4756            c0: -0.05,
4757            c1: 0.04,
4758            c2: -0.02,
4759            c3: 0.015,
4760        };
4761        let a = 0.3;
4762        let b = -0.7;
4763        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
4764        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4765            let u = a + b * z;
4766            let eta_a = 1.0 + link_span.first_derivative(u);
4767            let eta_b = z + score_span.evaluate(z) + z * link_span.first_derivative(u);
4768            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4769            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4770        }
4771    }
4772
4773    #[test]
4774    fn second_coefficient_partials_match_exact_span_derivatives() {
4775        let score_span = LocalSpanCubic {
4776            left: -0.75,
4777            right: 0.25,
4778            c0: 0.08,
4779            c1: -0.03,
4780            c2: 0.02,
4781            c3: -0.01,
4782        };
4783        let link_span = LocalSpanCubic {
4784            left: -0.6,
4785            right: 0.9,
4786            c0: -0.05,
4787            c1: 0.04,
4788            c2: -0.02,
4789            c3: 0.015,
4790        };
4791        let a = 0.3;
4792        let b = -0.7;
4793        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
4794        let dc_daa = second_partials.0;
4795        let dc_dab = second_partials.1;
4796        let dc_dbb = second_partials.2;
4797        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4798            let u = a + b * z;
4799            let eta_aa = link_span.second_derivative(u);
4800            let eta_ab = z * link_span.second_derivative(u);
4801            let eta_bb = z * z * link_span.second_derivative(u);
4802            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4803            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4804            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4805        }
4806    }
4807
4808    #[test]
4809    fn higher_derivative_moment_helpers_reject_empty_first_coefficients() {
4810        let cell = DenestedCubicCell {
4811            left: -1.0,
4812            right: 1.0,
4813            c0: 0.0,
4814            c1: 1.0,
4815            c2: 0.0,
4816            c3: 0.0,
4817        };
4818        let moments = [1.0; 16];
4819
4820        let third_err = cell_third_derivative_from_moments(
4821            cell,
4822            &[],
4823            &[1.0],
4824            &[1.0],
4825            &[],
4826            &[],
4827            &[],
4828            &[],
4829            &moments,
4830        )
4831        .expect_err("empty first coefficients should be rejected");
4832        assert!(third_err.contains("r first-derivative coefficients must be non-empty"));
4833
4834        let fourth_err = cell_fourth_derivative_from_moments(
4835            cell,
4836            &[1.0],
4837            &[],
4838            &[1.0],
4839            &[1.0],
4840            &[],
4841            &[],
4842            &[],
4843            &[],
4844            &[],
4845            &[],
4846            &[],
4847            &[],
4848            &[],
4849            &[],
4850            &[],
4851            &moments,
4852        )
4853        .expect_err("empty first coefficients should be rejected");
4854        assert!(fourth_err.contains("s first-derivative coefficients must be non-empty"));
4855    }
4856
4857    #[test]
4858    fn fourth_derivative_rejects_overlong_scratch_convolutions() {
4859        let cell = DenestedCubicCell {
4860            left: -1.0,
4861            right: 1.0,
4862            c0: 0.0,
4863            c1: 1.0,
4864            c2: 0.0,
4865            c3: 0.0,
4866        };
4867        let long_first = [1.0; 10];
4868        let zero = [0.0; 1];
4869        let moments = [1.0; 64];
4870
4871        let err = cell_fourth_derivative_from_moments(
4872            cell,
4873            &long_first,
4874            &long_first,
4875            &long_first,
4876            &long_first,
4877            &zero,
4878            &zero,
4879            &zero,
4880            &zero,
4881            &zero,
4882            &zero,
4883            &zero,
4884            &zero,
4885            &zero,
4886            &zero,
4887            &zero,
4888            &moments,
4889        )
4890        .expect_err("oversized convolution should be rejected before writing scratch");
4891        assert!(err.contains("fourth derivative polynomial convolution scratch too small"));
4892    }
4893
4894    #[test]
4895    fn score_and_link_basis_cell_coefficients_match_direct_construction() {
4896        let score_basis_span = LocalSpanCubic {
4897            left: -0.7,
4898            right: 0.4,
4899            c0: 0.2,
4900            c1: -0.04,
4901            c2: 0.03,
4902            c3: -0.01,
4903        };
4904        let link_basis_span = LocalSpanCubic {
4905            left: -0.5,
4906            right: 1.1,
4907            c0: -0.03,
4908            c1: 0.05,
4909            c2: -0.02,
4910            c3: 0.01,
4911        };
4912        let a = 0.25;
4913        let b = -0.8;
4914        let score_coeffs = score_basis_cell_coefficients(score_basis_span, b);
4915        let link_coeffs = link_basis_cell_coefficients(link_basis_span, a, b);
4916        for &z in &[-0.7, -0.1, 0.2, 0.4] {
4917            let score_poly = polynomial_value(&score_coeffs, z);
4918            let link_poly = polynomial_value(&link_coeffs, z);
4919            assert!((score_poly - b * score_basis_span.evaluate(z)).abs() < 1e-12);
4920            assert!((link_poly - link_basis_span.evaluate(a + b * z)).abs() < 1e-12);
4921        }
4922    }
4923
4924    #[test]
4925    fn link_basis_partials_match_exact_span_derivatives() {
4926        let link_basis_span = LocalSpanCubic {
4927            left: -0.5,
4928            right: 1.1,
4929            c0: -0.03,
4930            c1: 0.05,
4931            c2: -0.02,
4932            c3: 0.01,
4933        };
4934        let a = 0.25;
4935        let b = -0.8;
4936        let (dc_da, dc_db) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
4937        let (dc_daa, dc_dab, dc_dbb) = link_basis_cell_second_partials(link_basis_span, a, b);
4938        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4939            let u = a + b * z;
4940            let eta_a = link_basis_span.first_derivative(u);
4941            let eta_b = z * link_basis_span.first_derivative(u);
4942            let eta_aa = link_basis_span.second_derivative(u);
4943            let eta_ab = z * link_basis_span.second_derivative(u);
4944            let eta_bb = z * z * link_basis_span.second_derivative(u);
4945            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4946            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4947            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4948            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4949            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4950        }
4951    }
4952
4953    #[test]
4954    fn denested_third_partials_match_exact_span_derivatives() {
4955        let link_span = LocalSpanCubic {
4956            left: -0.6,
4957            right: 0.9,
4958            c0: -0.05,
4959            c1: 0.04,
4960            c2: -0.02,
4961            c3: 0.015,
4962        };
4963        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
4964        let link_third = 6.0 * link_span.c3;
4965        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4966            let eta_aaa = link_third;
4967            let eta_aab = z * link_third;
4968            let eta_abb = z * z * link_third;
4969            let eta_bbb = z * z * z * link_third;
4970            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4971            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4972            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4973            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4974        }
4975    }
4976
4977    #[test]
4978    fn link_basis_third_partials_match_exact_span_derivatives() {
4979        let link_basis_span = LocalSpanCubic {
4980            left: -0.5,
4981            right: 1.1,
4982            c0: -0.03,
4983            c1: 0.05,
4984            c2: -0.02,
4985            c3: 0.01,
4986        };
4987        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = link_basis_cell_third_partials(link_basis_span);
4988        let link_third = 6.0 * link_basis_span.c3;
4989        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4990            let eta_aaa = link_third;
4991            let eta_aab = z * link_third;
4992            let eta_abb = z * z * link_third;
4993            let eta_bbb = z * z * z * link_third;
4994            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4995            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4996            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4997            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4998        }
4999    }
5000
5001    #[test]
5002    fn branch_selection_uses_normalized_non_affine_coefficients() {
5003        let affine = DenestedCubicCell {
5004            left: -1.0,
5005            right: 1.0,
5006            c0: 0.1,
5007            c1: -0.4,
5008            c2: 1e-13,
5009            c3: -1e-13,
5010        };
5011        let quartic = DenestedCubicCell {
5012            c2: 2e-4,
5013            c3: 1e-13,
5014            ..affine
5015        };
5016        let sextic = DenestedCubicCell {
5017            c2: 2e-4,
5018            c3: 5e-3,
5019            ..affine
5020        };
5021        assert_eq!(branch_cell(affine).unwrap(), ExactCellBranch::Affine);
5022        assert_eq!(branch_cell(quartic).unwrap(), ExactCellBranch::Quartic);
5023        assert_eq!(branch_cell(sextic).unwrap(), ExactCellBranch::Sextic);
5024    }
5025
5026    #[test]
5027    fn affine_anchor_moments_match_whole_line_closed_forms() {
5028        let out = affine_anchor_moment_vector(0.0, 0.0, f64::NEG_INFINITY, f64::INFINITY, 4);
5029        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5030        assert!((out[0] - sqrt_2pi).abs() < 1e-12);
5031        assert!(out[1].abs() < 1e-12);
5032        assert!((out[2] - sqrt_2pi).abs() < 1e-12);
5033    }
5034
5035    #[test]
5036    fn affine_anchor_moments_match_shifted_gaussian_whole_line() {
5037        let alpha = 0.7;
5038        let beta = -0.4;
5039        let out = affine_anchor_moment_vector(alpha, beta, f64::NEG_INFINITY, f64::INFINITY, 4);
5040        let s = (1.0 + beta * beta).sqrt();
5041        let mu = -alpha * beta / (1.0 + beta * beta);
5042        let scale = (-alpha * alpha / (2.0 * s * s)).exp() / s;
5043        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
5044        assert!((out[0] - scale * sqrt_2pi).abs() < 1e-12);
5045        assert!((out[1] - scale * sqrt_2pi * mu).abs() < 1e-12);
5046        assert!((out[2] - scale * sqrt_2pi * (mu * mu + 1.0 / (s * s))).abs() < 1e-10);
5047    }
5048
5049    #[test]
5050    fn quartic_recurrence_reduces_higher_moments() {
5051        let cell = DenestedCubicCell {
5052            left: -1.0,
5053            right: 0.9,
5054            c0: 0.2,
5055            c1: -0.3,
5056            c2: 0.18,
5057            c3: 0.0,
5058        };
5059        let exact = |k: usize| {
5060            simpson_integral(cell.left, cell.right, 2000, |z| {
5061                z.powi(k as i32) * (-cell.q(z)).exp()
5062            })
5063        };
5064        let reduced = reduce_quartic_moments(cell, [exact(0), exact(1), exact(2)], 6)
5065            .expect("quartic reduction");
5066        for k in 0..=6 {
5067            let target = exact(k);
5068            assert!(
5069                (reduced[k] - target).abs() < 1e-7,
5070                "quartic reduced moment M{k} mismatch: {} vs {}",
5071                reduced[k],
5072                target
5073            );
5074        }
5075    }
5076
5077    #[test]
5078    fn sextic_recurrence_reduces_higher_moments() {
5079        let cell = DenestedCubicCell {
5080            left: -0.8,
5081            right: 0.7,
5082            c0: -0.1,
5083            c1: 0.25,
5084            c2: -0.14,
5085            c3: 0.22,
5086        };
5087        let exact = |k: usize| {
5088            simpson_integral(cell.left, cell.right, 3000, |z| {
5089                z.powi(k as i32) * (-cell.q(z)).exp()
5090            })
5091        };
5092        let reduced =
5093            reduce_sextic_moments(cell, [exact(0), exact(1), exact(2), exact(3), exact(4)], 9)
5094                .expect("sextic reduction");
5095        for k in 0..=9 {
5096            let target = exact(k);
5097            assert!(
5098                (reduced[k] - target).abs() < 1e-7,
5099                "sextic reduced moment M{k} mismatch: {} vs {}",
5100                reduced[k],
5101                target
5102            );
5103        }
5104    }
5105
5106    #[test]
5107    fn degenerate_sextic_branch_preserves_quadratic_coefficient() {
5108        let cell = DenestedCubicCell {
5109            left: -1.0,
5110            right: 1.0,
5111            c0: 0.0,
5112            c1: 0.0,
5113            c2: 0.1,
5114            c3: 2.0e-10,
5115        };
5116        assert_eq!(branch_cell(cell).unwrap(), ExactCellBranch::Sextic);
5117
5118        let state = evaluate_cell_moments(cell, 9).expect("degenerate sextic cell");
5119        let quartic_cell = DenestedCubicCell { c3: 0.0, ..cell };
5120        let quartic = evaluate_cell_moments(quartic_cell, 9).expect("quartic cell");
5121        let affine = evaluate_affine_cell_state(
5122            DenestedCubicCell {
5123                c2: 0.0,
5124                c3: 0.0,
5125                ..cell
5126            },
5127            9,
5128        )
5129        .expect("affine cell");
5130
5131        assert_eq!(state.branch, ExactCellBranch::Quartic);
5132        for k in 0..=9 {
5133            assert!(
5134                (state.moments[k] - quartic.moments[k]).abs() < 1e-12,
5135                "lowered moment M{k} should match the quartic cell: {} vs {}",
5136                state.moments[k],
5137                quartic.moments[k]
5138            );
5139        }
5140        assert!(
5141            (state.moments[0] - affine.moments[0]).abs() > 1e-4,
5142            "degenerate sextic handling must not drop the nonzero c2 term"
5143        );
5144    }
5145
5146    #[test]
5147    fn moment_reduced_first_and_second_derivatives_match_numeric_integrals() {
5148        let cell = DenestedCubicCell {
5149            left: -0.9,
5150            right: 0.6,
5151            c0: 0.15,
5152            c1: -0.2,
5153            c2: 0.08,
5154            c3: 0.17,
5155        };
5156        let moments = reduce_sextic_moments(
5157            cell,
5158            [
5159                simpson_integral(cell.left, cell.right, 3000, |z| (-cell.q(z)).exp()),
5160                simpson_integral(cell.left, cell.right, 3000, |z| z * (-cell.q(z)).exp()),
5161                simpson_integral(cell.left, cell.right, 3000, |z| z * z * (-cell.q(z)).exp()),
5162                simpson_integral(cell.left, cell.right, 3000, |z| {
5163                    z.powi(3) * (-cell.q(z)).exp()
5164                }),
5165                simpson_integral(cell.left, cell.right, 3000, |z| {
5166                    z.powi(4) * (-cell.q(z)).exp()
5167                }),
5168            ],
5169            9,
5170        )
5171        .expect("reduced moments");
5172
5173        let r = [0.7, -0.1, 0.3];
5174        let s = [0.2, 0.5];
5175        let second = [0.4, -0.2, 0.1];
5176        let exact_first = cell_first_derivative_from_moments(&r, &moments).expect("first");
5177        let exact_second =
5178            cell_second_derivative_from_moments(cell, &r, &s, &second, &moments).expect("second");
5179
5180        let numeric_first = simpson_integral(cell.left, cell.right, 3000, |z| {
5181            polynomial_value(&r, z) * (-cell.q(z)).exp() / (2.0 * std::f64::consts::PI)
5182        });
5183        let numeric_second = simpson_integral(cell.left, cell.right, 3000, |z| {
5184            let eta = cell.eta(z);
5185            (polynomial_value(&second, z) - eta * polynomial_value(&r, z) * polynomial_value(&s, z))
5186                * (-cell.q(z)).exp()
5187                / (2.0 * std::f64::consts::PI)
5188        });
5189
5190        assert!((exact_first - numeric_first).abs() < 1e-7);
5191        assert!((exact_second - numeric_second).abs() < 1e-7);
5192    }
5193
5194    #[test]
5195    fn moment_reduced_third_derivative_matches_numeric_integral() {
5196        let cell = DenestedCubicCell {
5197            left: -0.85,
5198            right: 0.7,
5199            c0: -0.12,
5200            c1: 0.18,
5201            c2: 0.09,
5202            c3: -0.11,
5203        };
5204        let moments = evaluate_cell_moments(cell, 12).expect("cell moments");
5205        let r = [0.35, -0.12, 0.08];
5206        let s = [0.17, 0.09];
5207        let t = [-0.21, 0.14, -0.04];
5208        let rs = [0.11, -0.07, 0.05];
5209        let rt = [-0.06, 0.03];
5210        let st = [0.08, -0.02, 0.01];
5211        let rst = [0.04, -0.05, 0.02];
5212
5213        let exact_third = cell_third_derivative_from_moments(
5214            cell,
5215            &r,
5216            &s,
5217            &t,
5218            &rs,
5219            &rt,
5220            &st,
5221            &rst,
5222            &moments.moments,
5223        )
5224        .expect("third derivative");
5225        let numeric_third = simpson_integral(cell.left, cell.right, 4000, |z| {
5226            let eta = cell.eta(z);
5227            let rz = polynomial_value(&r, z);
5228            let sz = polynomial_value(&s, z);
5229            let tz = polynomial_value(&t, z);
5230            let rsz = polynomial_value(&rs, z);
5231            let rtz = polynomial_value(&rt, z);
5232            let stz = polynomial_value(&st, z);
5233            let rstz = polynomial_value(&rst, z);
5234            (rstz - eta * (rsz * tz + rtz * sz + stz * rz) + (eta * eta - 1.0) * rz * sz * tz)
5235                * (-cell.q(z)).exp()
5236                / (2.0 * std::f64::consts::PI)
5237        });
5238
5239        assert!((exact_third - numeric_third).abs() < 1e-7);
5240    }
5241
5242    #[test]
5243    fn moment_reduced_fourth_derivative_matches_numeric_integral() {
5244        let cell = DenestedCubicCell {
5245            left: -0.8,
5246            right: 0.65,
5247            c0: 0.11,
5248            c1: -0.22,
5249            c2: 0.07,
5250            c3: 0.13,
5251        };
5252        let moments = evaluate_cell_moments(cell, 16).expect("cell moments");
5253        let r = [0.21, -0.13, 0.06];
5254        let s = [-0.18, 0.04];
5255        let t = [0.09, 0.07, -0.03];
5256        let u = [-0.14, 0.05];
5257        let rs = [0.08, -0.03, 0.02];
5258        let rt = [-0.05, 0.01];
5259        let ru = [0.04, -0.02, 0.01];
5260        let st = [0.03, 0.02];
5261        let su = [-0.02, 0.05, -0.01];
5262        let tu = [0.07, -0.04];
5263        let rst = [0.03, -0.01, 0.02];
5264        let rsu = [-0.02, 0.04];
5265        let rtu = [0.01, 0.02, -0.01];
5266        let stu = [-0.03, 0.02];
5267        let rstu = [0.02, -0.01, 0.01];
5268
5269        let exact_fourth = cell_fourth_derivative_from_moments(
5270            cell,
5271            &r,
5272            &s,
5273            &t,
5274            &u,
5275            &rs,
5276            &rt,
5277            &ru,
5278            &st,
5279            &su,
5280            &tu,
5281            &rst,
5282            &rsu,
5283            &rtu,
5284            &stu,
5285            &rstu,
5286            &moments.moments,
5287        )
5288        .expect("fourth derivative");
5289        let numeric_fourth = simpson_integral(cell.left, cell.right, 5000, |z| {
5290            let eta = cell.eta(z);
5291            let rz = polynomial_value(&r, z);
5292            let sz = polynomial_value(&s, z);
5293            let tz = polynomial_value(&t, z);
5294            let uz = polynomial_value(&u, z);
5295            let rsz = polynomial_value(&rs, z);
5296            let rtz = polynomial_value(&rt, z);
5297            let ruz = polynomial_value(&ru, z);
5298            let stz = polynomial_value(&st, z);
5299            let suz = polynomial_value(&su, z);
5300            let tuz = polynomial_value(&tu, z);
5301            let rstz = polynomial_value(&rst, z);
5302            let rsuz = polynomial_value(&rsu, z);
5303            let rtuz = polynomial_value(&rtu, z);
5304            let stuz = polynomial_value(&stu, z);
5305            let rstuz = polynomial_value(&rstu, z);
5306            let linear =
5307                rstz * uz + rsuz * tz + rtuz * sz + stuz * rz + rsz * tuz + rtz * suz + ruz * stz;
5308            let quadratic = rsz * tz * uz
5309                + rtz * sz * uz
5310                + ruz * sz * tz
5311                + stz * rz * uz
5312                + suz * rz * tz
5313                + tuz * rz * sz;
5314            let quartic = rz * sz * tz * uz;
5315            (rstuz - eta * linear
5316                + (eta * eta - 1.0) * quadratic
5317                + (-eta * eta * eta + 3.0 * eta) * quartic)
5318                * (-cell.q(z)).exp()
5319                / (2.0 * std::f64::consts::PI)
5320        });
5321
5322        assert!((exact_fourth - numeric_fourth).abs() < 2e-7);
5323    }
5324
    /// End-to-end check of the (a, b) parameter derivatives of a de-nested cell.
    ///
    /// The cell is built from a score span and a link span at fixed `a`, `b`.
    /// Every derivative up to fourth order is computed twice: once through the
    /// moment-based helpers fed with the coefficient partials (`exact_*`), and
    /// once as a Simpson-rule integral of the pointwise integrand written out
    /// from closed-form partials of eta (`numeric_*`). The two must agree within
    /// tolerances that loosen with derivative order (1e-8, 2e-7, 2e-6).
    #[test]
    fn denested_cell_parameter_derivatives_match_exact_integrands() {
        let score_span = LocalSpanCubic {
            left: -0.75,
            right: 0.25,
            c0: 0.08,
            c1: -0.03,
            c2: 0.02,
            c3: -0.01,
        };
        let link_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: -0.05,
            c1: 0.04,
            c2: -0.02,
            c3: 0.015,
        };
        let a = 0.3;
        let b = -0.7;
        // The cell lives on the score span's interval with the de-nested coefficients.
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        let cell = DenestedCubicCell {
            left: score_span.left,
            right: score_span.right,
            c0: coeffs[0],
            c1: coeffs[1],
            c2: coeffs[2],
            c3: coeffs[3],
        };
        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
        // Coefficient partials of first, second and third order in (a, b).
        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
        let (dc_daa, dc_dab, dc_dbb) = denested_cell_second_partials(score_span, link_span, a, b);
        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
        // Passed as the last coefficient slot of every fourth-derivative call below.
        let zero = [0.0; 4];
        let link_third = 6.0 * link_span.c3;

        // Closed-form partials of eta used by the numeric reference integrands.
        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
        // `+ 0.0 * z` keeps the same one-argument closure shape as its siblings.
        let eta_aaa = |z: f64| link_third + 0.0 * z;
        let eta_aab = |z: f64| z * link_third;
        let eta_abb = |z: f64| z * z * link_third;
        let eta_bbb = |z: f64| z * z * z * link_third;

        // Moment-based first and second derivatives.
        let exact_a = cell_first_derivative_from_moments(&dc_da, &state.moments).expect("a");
        let exact_b = cell_first_derivative_from_moments(&dc_db, &state.moments).expect("b");
        let exact_aa =
            cell_second_derivative_from_moments(cell, &dc_da, &dc_da, &dc_daa, &state.moments)
                .expect("aa");
        let exact_ab =
            cell_second_derivative_from_moments(cell, &dc_da, &dc_db, &dc_dab, &state.moments)
                .expect("ab");
        let exact_bb =
            cell_second_derivative_from_moments(cell, &dc_db, &dc_db, &dc_dbb, &state.moments)
                .expect("bb");
        // Moment-based third derivatives: three first-order slots, three
        // pair slots, then one triple slot.
        let exact_aaa = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daaa,
            &state.moments,
        )
        .expect("aaa");
        let exact_aab = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_daab,
            &state.moments,
        )
        .expect("aab");
        let exact_abb = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_dabb,
            &state.moments,
        )
        .expect("abb");
        let exact_bbb = cell_third_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbbb,
            &state.moments,
        )
        .expect("bbb");
        // Moment-based fourth derivatives: four first-order slots, six pair
        // slots, four triple slots, then the four-index slot (`zero` here).
        let exact_aaaa = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daaa,
            &dc_daaa,
            &dc_daaa,
            &dc_daaa,
            &zero,
            &state.moments,
        )
        .expect("aaaa");
        let exact_aaab = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_daa,
            &dc_daa,
            &dc_dab,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_daaa,
            &dc_daab,
            &dc_daab,
            &dc_daab,
            &zero,
            &state.moments,
        )
        .expect("aaab");
        let exact_aabb = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_daab,
            &dc_daab,
            &dc_dabb,
            &dc_dabb,
            &zero,
            &state.moments,
        )
        .expect("aabb");
        let exact_abbb = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dabb,
            &dc_dabb,
            &dc_dabb,
            &dc_dbbb,
            &zero,
            &state.moments,
        )
        .expect("abbb");
        let exact_bbbb = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbbb,
            &dc_dbbb,
            &dc_dbbb,
            &dc_dbbb,
            &zero,
            &state.moments,
        )
        .expect("bbbb");

        // Numeric references: Simpson-rule integrals over the cell of the
        // pointwise integrand times exp(-q(z)) * INV_TWO_PI.
        let numeric_a = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_a(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_b = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_b(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aa = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_aa(z) - cell.eta(z) * eta_a(z) * eta_a(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_ab = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_ab(z) - cell.eta(z) * eta_a(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_bb = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_bb(z) - cell.eta(z) * eta_b(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        // Third order: the three pair-times-single products collapse into the
        // integer multiplicities (3, or 1 + 2) written below.
        let numeric_aaa = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            (eta_aaa(z) - 3.0 * eta * eta_aa(z) * eta_a(z) + (eta * eta - 1.0) * eta_a(z).powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aab = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_aab(z) - eta * (eta_aa(z) * b_z + 2.0 * eta_ab(z) * a_z)
                + (eta * eta - 1.0) * a_z * a_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_abb(z) - eta * (2.0 * eta_ab(z) * b_z + eta_bb(z) * a_z)
                + (eta * eta - 1.0) * a_z * b_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            (eta_bbb(z) - 3.0 * eta * eta_bb(z) * eta_b(z) + (eta * eta - 1.0) * eta_b(z).powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        // Fourth order: no four-index eta term appears, matching the `zero`
        // slot passed to the moment-based calls above.
        let numeric_aaaa = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let eta_a_z = eta_a(z);
            let eta_aa_z = eta_aa(z);
            let eta_aaa_z = eta_aaa(z);
            (-eta * (4.0 * eta_aaa_z * eta_a_z + 3.0 * eta_aa_z * eta_aa_z)
                + (eta * eta - 1.0) * (6.0 * eta_aa_z * eta_a_z * eta_a_z)
                + (-eta * eta * eta + 3.0 * eta) * eta_a_z.powi(4))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aaab = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let aa_z = eta_aa(z);
            let ab_z = eta_ab(z);
            let aaa_z = eta_aaa(z);
            let aab_z = eta_aab(z);
            (-eta * (aaa_z * b_z + 3.0 * aab_z * a_z + 3.0 * aa_z * ab_z)
                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * b_z + 3.0 * ab_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z.powi(3) * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aabb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let aa_z = eta_aa(z);
            let ab_z = eta_ab(z);
            let bb_z = eta_bb(z);
            let aab_z = eta_aab(z);
            let abb_z = eta_abb(z);
            (-eta * (2.0 * aab_z * b_z + 2.0 * abb_z * a_z + aa_z * bb_z + 2.0 * ab_z * ab_z)
                + (eta * eta - 1.0)
                    * (aa_z * b_z * b_z + 4.0 * ab_z * a_z * b_z + bb_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * b_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let ab_z = eta_ab(z);
            let bb_z = eta_bb(z);
            let abb_z = eta_abb(z);
            let bbb_z = eta_bbb(z);
            (-eta * (3.0 * abb_z * b_z + bbb_z * a_z + 3.0 * ab_z * bb_z)
                + (eta * eta - 1.0) * (3.0 * ab_z * b_z * b_z + 3.0 * bb_z * a_z * b_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z.powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let eta_b_z = eta_b(z);
            let eta_bb_z = eta_bb(z);
            let eta_bbb_z = eta_bbb(z);
            (-eta * (4.0 * eta_bbb_z * eta_b_z + 3.0 * eta_bb_z * eta_bb_z)
                + (eta * eta - 1.0) * (6.0 * eta_bb_z * eta_b_z * eta_b_z)
                + (-eta * eta * eta + 3.0 * eta) * eta_b_z.powi(4))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });

        // Tolerances loosen with derivative order: 1e-8 (first and second),
        // 2e-7 (third), 2e-6 (fourth).
        assert!((exact_a - numeric_a).abs() < 1e-8);
        assert!((exact_b - numeric_b).abs() < 1e-8);
        assert!((exact_aa - numeric_aa).abs() < 1e-8);
        assert!((exact_ab - numeric_ab).abs() < 1e-8);
        assert!((exact_bb - numeric_bb).abs() < 1e-8);
        assert!((exact_aaa - numeric_aaa).abs() < 2e-7);
        assert!((exact_aab - numeric_aab).abs() < 2e-7);
        assert!((exact_abb - numeric_abb).abs() < 2e-7);
        assert!((exact_bbb - numeric_bbb).abs() < 2e-7);
        assert!((exact_aaaa - numeric_aaaa).abs() < 2e-6);
        assert!((exact_aaab - numeric_aaab).abs() < 2e-6);
        assert!((exact_aabb - numeric_aabb).abs() < 2e-6);
        assert!((exact_abbb - numeric_abbb).abs() < 2e-6);
        assert!((exact_bbbb - numeric_bbbb).abs() < 2e-6);
    }
5658
    /// Cross-checks the moment-based derivative evaluators against direct
    /// quadrature for every derivative combination in this test that involves
    /// the link-basis direction `w` (orders one through four, mixed with the
    /// `a` and `b` directions).
    ///
    /// Each `exact_*` value is produced by one of the
    /// `cell_*_derivative_from_moments` helpers, fed with coefficient arrays
    /// for the relevant partials of eta. Each `numeric_*` value integrates a
    /// hand-written integrand over `[cell.left, cell.right]` with
    /// `simpson_integral`. The two must agree within the tolerances asserted at
    /// the end of the test.
    ///
    /// All reference integrands share one shape: products of eta partials
    /// grouped under the factors `1`, `-eta`, `eta^2 - 1` and
    /// `-eta^3 + 3*eta`, multiplied by `exp(-cell.q(z)) * INV_TWO_PI`.
    ///
    /// NOTE(review): those factors match successive derivatives of a standard
    /// normal density in eta, which presumes `q(z) = (z^2 + eta(z)^2) / 2` —
    /// confirm against `DenestedCubicCell::q`.
    ///
    /// NOTE(review): judging by the call sites below, the third-order helper
    /// takes `(cell, u, v, w, uv, uw, vw, uvw, moments)` and the fourth-order
    /// helper takes `(cell, u, v, w, x, uv, uw, ux, vw, vx, wx, uvw, uvx, uwx,
    /// vwx, uvwx, moments)` — confirm against the helper signatures before
    /// reordering any argument.
    #[test]
    fn link_basis_cell_derivatives_match_exact_integrands() {
        // Fixture: one score span, one link span, and one link-basis span that
        // shares the link span's interval.
        let score_span = LocalSpanCubic {
            left: -0.75,
            right: 0.25,
            c0: 0.08,
            c1: -0.03,
            c2: 0.02,
            c3: -0.01,
        };
        let link_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: -0.05,
            c1: 0.04,
            c2: -0.02,
            c3: 0.015,
        };
        let link_basis_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: 0.02,
            c1: -0.01,
            c2: 0.03,
            c3: -0.02,
        };
        let a = 0.3;
        let b = -0.7;
        // The cell lives on the score span's interval and takes its four
        // coefficients from the denested combination of both spans at (a, b).
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        let cell = DenestedCubicCell {
            left: score_span.left,
            right: score_span.right,
            c0: coeffs[0],
            c1: coeffs[1],
            c2: coeffs[2],
            c3: coeffs[3],
        };
        // NOTE(review): 24 is presumably the highest moment order requested —
        // confirm against `evaluate_cell_moments`.
        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
        // Coefficient arrays for the a/b partials of the cell polynomial, up to
        // third order. Only the pure `aaa` (index 0) and `bbb` (index 3)
        // entries of the third-order tuple are needed here.
        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
        let dc_daa = second_partials.0;
        let dc_dab = second_partials.1;
        let dc_dbb = second_partials.2;
        let denested_third = denested_cell_third_partials(link_span);
        let dc_daaa = denested_third.0;
        let dc_dbbb = denested_third.3;

        // Coefficient arrays for the link-basis direction `w` and its mixed
        // partials with a and b.
        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
        let (coeff_aaw, coeff_abw, coeff_bbw) =
            link_basis_cell_second_partials(link_basis_span, a, b);
        let link_basis_third = link_basis_cell_third_partials(link_basis_span);
        let coeff_aaaw = link_basis_third.0;
        let coeff_bbbw = link_basis_third.3;
        // Passed wherever a partial is treated as identically zero, e.g. every
        // partial that differentiates twice in `w`.
        let zero = [0.0; 4];
        // Used below as the constant value of `eta_aaaw` and as the z^3 factor
        // of `eta_bbbw`.
        // NOTE(review): 6 * c3 is the third derivative of a cubic whose cubic
        // coefficient is c3 — confirm that is what `c3` denotes.
        let basis_third = 6.0 * link_basis_span.c3;

        // Pointwise partials of eta used by the reference integrands. Link and
        // link-basis cubics are evaluated at `a + b * z`; each differentiation
        // in `b` contributes one extra factor of `z`.
        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
        let eta_aaw = |z: f64| link_basis_span.second_derivative(a + b * z);
        let eta_abw = |z: f64| z * link_basis_span.second_derivative(a + b * z);
        let eta_bbw = |z: f64| z * z * link_basis_span.second_derivative(a + b * z);
        // `+ 0.0 * z` keeps `z` in use so this constant closure has the same
        // shape as its siblings.
        let eta_aaaw = |z: f64| basis_third + 0.0 * z;
        let eta_bbbw = |z: f64| z * z * z * basis_third;

        // ---- Moment-based ("exact") values -------------------------------
        let exact_w = cell_first_derivative_from_moments(&coeff_w, &state.moments).expect("w");
        let exact_aw =
            cell_second_derivative_from_moments(cell, &dc_da, &coeff_w, &coeff_aw, &state.moments)
                .expect("aw");
        let exact_bw =
            cell_second_derivative_from_moments(cell, &dc_db, &coeff_w, &coeff_bw, &state.moments)
                .expect("bw");
        let exact_ww =
            cell_second_derivative_from_moments(cell, &coeff_w, &coeff_w, &zero, &state.moments)
                .expect("ww");
        let exact_aaw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aaw,
            &state.moments,
        )
        .expect("aaw");
        let exact_abw = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_bw,
            &coeff_abw,
            &state.moments,
        )
        .expect("abw");
        let exact_bbw = cell_third_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bbw,
            &state.moments,
        )
        .expect("bbw");
        let exact_www = cell_third_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("www");
        let exact_aaaw = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &coeff_w,
            &dc_daa,
            &dc_daa,
            &coeff_aw,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &dc_daaa,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaw,
            &coeff_aaaw,
            &state.moments,
        )
        .expect("aaaw");
        let exact_aaww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &coeff_w,
            &coeff_w,
            &dc_daa,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &coeff_aw,
            &zero,
            &coeff_aaw,
            &coeff_aaw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("aaww");
        let exact_abww = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dab,
            &coeff_aw,
            &coeff_aw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_abw,
            &coeff_abw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("abww");
        let exact_bbww = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &coeff_w,
            &coeff_w,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &coeff_bw,
            &zero,
            &coeff_bbw,
            &coeff_bbw,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("bbww");
        let exact_bbbw = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &coeff_w,
            &dc_dbb,
            &dc_dbb,
            &coeff_bw,
            &dc_dbb,
            &coeff_bw,
            &coeff_bw,
            &dc_dbbb,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbw,
            &coeff_bbbw,
            &state.moments,
        )
        .expect("bbbw");
        let exact_wwww = cell_fourth_derivative_from_moments(
            cell,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &coeff_w,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &zero,
            &state.moments,
        )
        .expect("wwww");

        // ---- Quadrature reference values ---------------------------------
        // NOTE(review): 5000 is presumably the Simpson subinterval count —
        // confirm against `simpson_integral`.
        let numeric_w = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_w(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_aw(z) - cell.eta(z) * eta_a(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_bw = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_bw(z) - cell.eta(z) * eta_b(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        // No second partial in (w, w) appears here, matching the `zero` passed
        // to the moment helper for `exact_ww`.
        let numeric_ww = simpson_integral(cell.left, cell.right, 5000, |z| {
            (-cell.eta(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_aaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            (eta_aaw(z) - eta * (eta_aa(z) * w_z + 2.0 * eta_aw(z) * a_z)
                + (eta * eta - 1.0) * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_abw(z) - eta * (eta_ab(z) * w_z + eta_aw(z) * b_z + eta_bw(z) * a_z)
                + (eta * eta - 1.0) * a_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            let b_z = eta_b(z);
            (eta_bbw(z) - eta * (eta_bb(z) * w_z + 2.0 * eta_bw(z) * b_z)
                + (eta * eta - 1.0) * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_www = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((eta * eta - 1.0) * w_z * w_z * w_z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        // Fourth order. The `-eta` group collects (third partial x first
        // partial) and (second partial x second partial) products; the
        // `eta^2 - 1` group collects (second x first x first) products.
        let numeric_aaaw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aa_z = eta_aa(z);
            let aw_z = eta_aw(z);
            // `dc_daaa[0]` stands in for the pure third a-partial of eta,
            // which is used here as a constant in z.
            (eta_aaaw(z)
                - eta * ((dc_daaa[0] + 0.0 * z) * w_z + 3.0 * eta_aaw(z) * a_z + 3.0 * aa_z * aw_z)
                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * w_z + 3.0 * aw_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * a_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aaww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            (-(2.0 * eta * (eta_aaw(z) * w_z + aw_z * aw_z))
                + (eta * eta - 1.0) * (eta_aa(z) * w_z * w_z + 4.0 * aw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let aw_z = eta_aw(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_abw(z) * w_z + aw_z * bw_z))
                + (eta * eta - 1.0)
                    * (eta_ab(z) * w_z * w_z + 2.0 * aw_z * b_z * w_z + 2.0 * bw_z * a_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bw_z = eta_bw(z);
            (-(2.0 * eta * (eta_bbw(z) * w_z + bw_z * bw_z))
                + (eta * eta - 1.0) * (eta_bb(z) * w_z * w_z + 4.0 * bw_z * b_z * w_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbbw = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let b_z = eta_b(z);
            let w_z = eta_w(z);
            let bb_z = eta_bb(z);
            let bw_z = eta_bw(z);
            // `dc_dbbb[3] * z^3` stands in for the pure third b-partial of eta.
            (eta_bbbw(z)
                - eta
                    * ((dc_dbbb[3] * z * z * z) * w_z + 3.0 * eta_bbw(z) * b_z + 3.0 * bb_z * bw_z)
                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * w_z + 3.0 * bw_z * b_z * b_z)
                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_wwww = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let w_z = eta_w(z);
            ((-eta * eta * eta + 3.0 * eta) * w_z * w_z * w_z * w_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });

        // Absolute tolerances are looser for higher derivative orders.
        assert!((exact_w - numeric_w).abs() < 1e-8);
        assert!((exact_aw - numeric_aw).abs() < 1e-7);
        assert!((exact_bw - numeric_bw).abs() < 1e-7);
        assert!((exact_ww - numeric_ww).abs() < 1e-7);
        assert!((exact_aaw - numeric_aaw).abs() < 2e-6);
        assert!((exact_abw - numeric_abw).abs() < 2e-6);
        assert!((exact_bbw - numeric_bbw).abs() < 2e-6);
        assert!((exact_www - numeric_www).abs() < 2e-6);
        assert!((exact_aaaw - numeric_aaaw).abs() < 3e-6);
        assert!((exact_aaww - numeric_aaww).abs() < 3e-6);
        assert!((exact_abww - numeric_abww).abs() < 3e-6);
        assert!((exact_bbww - numeric_bbww).abs() < 3e-6);
        assert!((exact_bbbw - numeric_bbbw).abs() < 3e-6);
        assert!((exact_wwww - numeric_wwww).abs() < 3e-6);
    }
6040
6041    #[test]
6042    fn score_basis_cell_derivatives_match_exact_integrands() {
6043        let score_span = LocalSpanCubic {
6044            left: -0.75,
6045            right: 0.25,
6046            c0: 0.08,
6047            c1: -0.03,
6048            c2: 0.02,
6049            c3: -0.01,
6050        };
6051        let score_basis_span = LocalSpanCubic {
6052            left: -0.75,
6053            right: 0.25,
6054            c0: -0.04,
6055            c1: 0.06,
6056            c2: -0.01,
6057            c3: 0.02,
6058        };
6059        let link_span = LocalSpanCubic {
6060            left: -0.6,
6061            right: 0.9,
6062            c0: -0.05,
6063            c1: 0.04,
6064            c2: -0.02,
6065            c3: 0.015,
6066        };
6067        let a = 0.3;
6068        let b = -0.7;
6069        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6070        let cell = DenestedCubicCell {
6071            left: score_span.left,
6072            right: score_span.right,
6073            c0: coeffs[0],
6074            c1: coeffs[1],
6075            c2: coeffs[2],
6076            c3: coeffs[3],
6077        };
6078        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6079        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6080        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
6081        let dc_daa = second_partials.0;
6082        let dc_dab = second_partials.1;
6083        let dc_dbb = second_partials.2;
6084        let denested_third = denested_cell_third_partials(link_span);
6085        let dc_dbbb = denested_third.3;
6086
6087        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6088        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6089        let zero = [0.0; 4];
6090
6091        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6092        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6093        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6094        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
6095        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6096        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6097
6098        let exact_h = cell_first_derivative_from_moments(&coeff_h, &state.moments).expect("h");
6099        let exact_ah =
6100            cell_second_derivative_from_moments(cell, &dc_da, &coeff_h, &zero, &state.moments)
6101                .expect("ah");
6102        let exact_bh =
6103            cell_second_derivative_from_moments(cell, &dc_db, &coeff_h, &coeff_bh, &state.moments)
6104                .expect("bh");
6105        let exact_hh =
6106            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_h, &zero, &state.moments)
6107                .expect("hh");
6108        let exact_abh = cell_third_derivative_from_moments(
6109            cell,
6110            &dc_da,
6111            &dc_db,
6112            &coeff_h,
6113            &dc_dab,
6114            &zero,
6115            &coeff_bh,
6116            &zero,
6117            &state.moments,
6118        )
6119        .expect("abh");
6120        let exact_bbh = cell_third_derivative_from_moments(
6121            cell,
6122            &dc_db,
6123            &dc_db,
6124            &coeff_h,
6125            &dc_dbb,
6126            &coeff_bh,
6127            &coeff_bh,
6128            &zero,
6129            &state.moments,
6130        )
6131        .expect("bbh");
6132        let exact_bhh = cell_third_derivative_from_moments(
6133            cell,
6134            &dc_db,
6135            &coeff_h,
6136            &coeff_h,
6137            &coeff_bh,
6138            &coeff_bh,
6139            &zero,
6140            &zero,
6141            &state.moments,
6142        )
6143        .expect("bhh");
6144        let exact_hhh = cell_third_derivative_from_moments(
6145            cell,
6146            &coeff_h,
6147            &coeff_h,
6148            &coeff_h,
6149            &zero,
6150            &zero,
6151            &zero,
6152            &zero,
6153            &state.moments,
6154        )
6155        .expect("hhh");
6156        let exact_bbbh = cell_fourth_derivative_from_moments(
6157            cell,
6158            &dc_db,
6159            &dc_db,
6160            &dc_db,
6161            &coeff_h,
6162            &dc_dbb,
6163            &dc_dbb,
6164            &coeff_bh,
6165            &dc_dbb,
6166            &coeff_bh,
6167            &coeff_bh,
6168            &dc_dbbb,
6169            &zero,
6170            &zero,
6171            &zero,
6172            &zero,
6173            &state.moments,
6174        )
6175        .expect("bbbh");
6176        let exact_aahh = cell_fourth_derivative_from_moments(
6177            cell,
6178            &dc_da,
6179            &dc_da,
6180            &coeff_h,
6181            &coeff_h,
6182            &dc_daa,
6183            &zero,
6184            &zero,
6185            &zero,
6186            &zero,
6187            &zero,
6188            &zero,
6189            &zero,
6190            &zero,
6191            &zero,
6192            &zero,
6193            &state.moments,
6194        )
6195        .expect("aahh");
6196        let exact_abhh = cell_fourth_derivative_from_moments(
6197            cell,
6198            &dc_da,
6199            &dc_db,
6200            &coeff_h,
6201            &coeff_h,
6202            &dc_dab,
6203            &zero,
6204            &zero,
6205            &coeff_bh,
6206            &coeff_bh,
6207            &zero,
6208            &zero,
6209            &zero,
6210            &zero,
6211            &zero,
6212            &zero,
6213            &state.moments,
6214        )
6215        .expect("abhh");
6216        let exact_bbhh = cell_fourth_derivative_from_moments(
6217            cell,
6218            &dc_db,
6219            &dc_db,
6220            &coeff_h,
6221            &coeff_h,
6222            &dc_dbb,
6223            &coeff_bh,
6224            &coeff_bh,
6225            &coeff_bh,
6226            &coeff_bh,
6227            &zero,
6228            &zero,
6229            &zero,
6230            &zero,
6231            &zero,
6232            &zero,
6233            &state.moments,
6234        )
6235        .expect("bbhh");
6236        let exact_bhhh = cell_fourth_derivative_from_moments(
6237            cell,
6238            &dc_db,
6239            &coeff_h,
6240            &coeff_h,
6241            &coeff_h,
6242            &coeff_bh,
6243            &coeff_bh,
6244            &coeff_bh,
6245            &zero,
6246            &zero,
6247            &zero,
6248            &zero,
6249            &zero,
6250            &zero,
6251            &zero,
6252            &zero,
6253            &state.moments,
6254        )
6255        .expect("bhhh");
6256        let exact_hhhh = cell_fourth_derivative_from_moments(
6257            cell,
6258            &coeff_h,
6259            &coeff_h,
6260            &coeff_h,
6261            &coeff_h,
6262            &zero,
6263            &zero,
6264            &zero,
6265            &zero,
6266            &zero,
6267            &zero,
6268            &zero,
6269            &zero,
6270            &zero,
6271            &zero,
6272            &zero,
6273            &state.moments,
6274        )
6275        .expect("hhhh");
6276
6277        let numeric_h = simpson_integral(cell.left, cell.right, 5000, |z| {
6278            eta_h(z) * (-cell.q(z)).exp() * INV_TWO_PI
6279        });
6280        let numeric_ah = simpson_integral(cell.left, cell.right, 5000, |z| {
6281            (-cell.eta(z) * eta_a(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6282        });
6283        let numeric_bh = simpson_integral(cell.left, cell.right, 5000, |z| {
6284            (eta_bh(z) - cell.eta(z) * eta_b(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6285        });
6286        let numeric_hh = simpson_integral(cell.left, cell.right, 5000, |z| {
6287            (-cell.eta(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6288        });
6289        let numeric_abh = simpson_integral(cell.left, cell.right, 5000, |z| {
6290            let eta = cell.eta(z);
6291            (-(eta * (eta_ab(z) * eta_h(z) + eta_bh(z) * eta_a(z)))
6292                + (eta * eta - 1.0) * eta_a(z) * eta_b(z) * eta_h(z))
6293                * (-cell.q(z)).exp()
6294                * INV_TWO_PI
6295        });
6296        let numeric_bbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6297            let eta = cell.eta(z);
6298            (-(eta * (eta_bb(z) * eta_h(z) + 2.0 * eta_bh(z) * eta_b(z)))
6299                + (eta * eta - 1.0) * eta_b(z) * eta_b(z) * eta_h(z))
6300                * (-cell.q(z)).exp()
6301                * INV_TWO_PI
6302        });
6303        let numeric_bhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6304            let eta = cell.eta(z);
6305            (-(2.0 * eta * eta_bh(z) * eta_h(z))
6306                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_h(z))
6307                * (-cell.q(z)).exp()
6308                * INV_TWO_PI
6309        });
6310        let numeric_hhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6311            let eta = cell.eta(z);
6312            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6313        });
6314        let numeric_bbbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6315            let eta = cell.eta(z);
6316            let b_z = eta_b(z);
6317            let h_z = eta_h(z);
6318            let bb_z = eta_bb(z);
6319            let bh_z = eta_bh(z);
6320            (-(eta * ((dc_dbbb[3] * z * z * z) * h_z + 3.0 * bb_z * bh_z))
6321                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * h_z + 3.0 * bh_z * b_z * b_z)
6322                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * h_z)
6323                * (-cell.q(z)).exp()
6324                * INV_TWO_PI
6325        });
6326        let numeric_aahh = simpson_integral(cell.left, cell.right, 5000, |z| {
6327            let eta = cell.eta(z);
6328            let a_z = eta_a(z);
6329            let h_z = eta_h(z);
6330            ((eta * eta - 1.0) * polynomial_value(&dc_daa, z) * h_z * h_z
6331                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * h_z * h_z)
6332                * (-cell.q(z)).exp()
6333                * INV_TWO_PI
6334        });
6335        let numeric_abhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6336            let eta = cell.eta(z);
6337            let a_z = eta_a(z);
6338            let b_z = eta_b(z);
6339            let h_z = eta_h(z);
6340            ((eta * eta - 1.0) * (eta_ab(z) * h_z * h_z + 2.0 * eta_bh(z) * a_z * h_z)
6341                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * h_z * h_z)
6342                * (-cell.q(z)).exp()
6343                * INV_TWO_PI
6344        });
6345        let numeric_bbhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6346            let eta = cell.eta(z);
6347            let b_z = eta_b(z);
6348            let h_z = eta_h(z);
6349            let bh_z = eta_bh(z);
6350            (-(2.0 * eta * bh_z * bh_z)
6351                + (eta * eta - 1.0) * (eta_bb(z) * h_z * h_z + 4.0 * bh_z * b_z * h_z)
6352                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * h_z * h_z)
6353                * (-cell.q(z)).exp()
6354                * INV_TWO_PI
6355        });
6356        let numeric_bhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6357            let eta = cell.eta(z);
6358            let h_z = eta_h(z);
6359            (-(eta * (3.0 * eta_bh(z) * h_z * h_z))
6360                + (eta * eta - 1.0) * (3.0 * eta_bh(z) * h_z * h_z)
6361                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * h_z * h_z)
6362                * (-cell.q(z)).exp()
6363                * INV_TWO_PI
6364        });
6365        let numeric_hhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6366            let eta = cell.eta(z);
6367            let h_z = eta_h(z);
6368            ((-eta * eta * eta + 3.0 * eta) * h_z * h_z * h_z * h_z)
6369                * (-cell.q(z)).exp()
6370                * INV_TWO_PI
6371        });
6372
6373        assert!((exact_h - numeric_h).abs() < 1e-8);
6374        assert!((exact_ah - numeric_ah).abs() < 1e-7);
6375        assert!((exact_bh - numeric_bh).abs() < 1e-7);
6376        assert!((exact_hh - numeric_hh).abs() < 1e-7);
6377        assert!((exact_abh - numeric_abh).abs() < 2e-6);
6378        assert!((exact_bbh - numeric_bbh).abs() < 2e-6);
6379        assert!((exact_bhh - numeric_bhh).abs() < 2e-6);
6380        assert!((exact_hhh - numeric_hhh).abs() < 2e-6);
6381        assert!((exact_bbbh - numeric_bbbh).abs() < 3e-6);
6382        assert!((exact_aahh - numeric_aahh).abs() < 3e-6);
6383        assert!((exact_abhh - numeric_abhh).abs() < 3e-6);
6384        assert!((exact_bbhh - numeric_bbhh).abs() < 3e-6);
6385        assert!((exact_bhhh - numeric_bhhh).abs() < 3e-6);
6386        assert!((exact_hhhh - numeric_hhhh).abs() < 3e-6);
6387    }
6388
6389    #[test]
6390    fn cross_basis_cell_derivatives_match_exact_integrands() {
6391        let score_span = LocalSpanCubic {
6392            left: -0.75,
6393            right: 0.25,
6394            c0: 0.08,
6395            c1: -0.03,
6396            c2: 0.02,
6397            c3: -0.01,
6398        };
6399        let score_basis_span = LocalSpanCubic {
6400            left: -0.75,
6401            right: 0.25,
6402            c0: -0.04,
6403            c1: 0.06,
6404            c2: -0.01,
6405            c3: 0.02,
6406        };
6407        let link_span = LocalSpanCubic {
6408            left: -0.6,
6409            right: 0.9,
6410            c0: -0.05,
6411            c1: 0.04,
6412            c2: -0.02,
6413            c3: 0.015,
6414        };
6415        let link_basis_span = LocalSpanCubic {
6416            left: -0.6,
6417            right: 0.9,
6418            c0: 0.02,
6419            c1: -0.01,
6420            c2: 0.03,
6421            c3: -0.02,
6422        };
6423        let a = 0.3;
6424        let b = -0.7;
6425        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6426        let cell = DenestedCubicCell {
6427            left: score_span.left,
6428            right: score_span.right,
6429            c0: coeffs[0],
6430            c1: coeffs[1],
6431            c2: coeffs[2],
6432            c3: coeffs[3],
6433        };
6434        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6435        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6436        let (dc_daa, dc_dab, _) = denested_cell_second_partials(score_span, link_span, a, b);
6437
6438        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6439        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6440        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
6441        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
6442        let (coeff_aaw, coeff_abw, _) = link_basis_cell_second_partials(link_basis_span, a, b);
6443        let zero = [0.0; 4];
6444
6445        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6446        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6447        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6448        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6449        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
6450        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6451        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
6452        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
6453
6454        let exact_hw =
6455            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_w, &zero, &state.moments)
6456                .expect("hw");
6457        let exact_ahw = cell_third_derivative_from_moments(
6458            cell,
6459            &dc_da,
6460            &coeff_h,
6461            &coeff_w,
6462            &zero,
6463            &coeff_aw,
6464            &zero,
6465            &zero,
6466            &state.moments,
6467        )
6468        .expect("ahw");
6469        let exact_bhw = cell_third_derivative_from_moments(
6470            cell,
6471            &dc_db,
6472            &coeff_h,
6473            &coeff_w,
6474            &coeff_bh,
6475            &coeff_bw,
6476            &zero,
6477            &zero,
6478            &state.moments,
6479        )
6480        .expect("bhw");
6481        let exact_hhw = cell_third_derivative_from_moments(
6482            cell,
6483            &coeff_h,
6484            &coeff_h,
6485            &coeff_w,
6486            &zero,
6487            &zero,
6488            &zero,
6489            &zero,
6490            &state.moments,
6491        )
6492        .expect("hhw");
6493        let exact_hww = cell_third_derivative_from_moments(
6494            cell,
6495            &coeff_h,
6496            &coeff_w,
6497            &coeff_w,
6498            &zero,
6499            &zero,
6500            &zero,
6501            &zero,
6502            &state.moments,
6503        )
6504        .expect("hww");
6505        let exact_aahw = cell_fourth_derivative_from_moments(
6506            cell,
6507            &dc_da,
6508            &dc_da,
6509            &coeff_h,
6510            &coeff_w,
6511            &dc_daa,
6512            &zero,
6513            &coeff_aw,
6514            &zero,
6515            &coeff_aw,
6516            &zero,
6517            &zero,
6518            &coeff_aaw,
6519            &zero,
6520            &zero,
6521            &zero,
6522            &state.moments,
6523        )
6524        .expect("aahw");
6525        let exact_hhww = cell_fourth_derivative_from_moments(
6526            cell,
6527            &coeff_h,
6528            &coeff_h,
6529            &coeff_w,
6530            &coeff_w,
6531            &zero,
6532            &zero,
6533            &zero,
6534            &zero,
6535            &zero,
6536            &zero,
6537            &zero,
6538            &zero,
6539            &zero,
6540            &zero,
6541            &zero,
6542            &state.moments,
6543        )
6544        .expect("hhww");
6545        let exact_hhhw = cell_fourth_derivative_from_moments(
6546            cell,
6547            &coeff_h,
6548            &coeff_h,
6549            &coeff_h,
6550            &coeff_w,
6551            &zero,
6552            &zero,
6553            &zero,
6554            &zero,
6555            &zero,
6556            &zero,
6557            &zero,
6558            &zero,
6559            &zero,
6560            &zero,
6561            &zero,
6562            &state.moments,
6563        )
6564        .expect("hhhw");
6565        let exact_abhw = cell_fourth_derivative_from_moments(
6566            cell,
6567            &dc_da,
6568            &dc_db,
6569            &coeff_h,
6570            &coeff_w,
6571            &dc_dab,
6572            &zero,
6573            &coeff_aw,
6574            &coeff_bh,
6575            &coeff_bw,
6576            &zero,
6577            &zero,
6578            &coeff_abw,
6579            &zero,
6580            &zero,
6581            &zero,
6582            &state.moments,
6583        )
6584        .expect("abhw");
6585        let exact_ahww = cell_fourth_derivative_from_moments(
6586            cell,
6587            &dc_da,
6588            &coeff_h,
6589            &coeff_w,
6590            &coeff_w,
6591            &zero,
6592            &coeff_aw,
6593            &coeff_aw,
6594            &zero,
6595            &zero,
6596            &zero,
6597            &zero,
6598            &zero,
6599            &zero,
6600            &zero,
6601            &zero,
6602            &state.moments,
6603        )
6604        .expect("ahww");
6605        let exact_bhww = cell_fourth_derivative_from_moments(
6606            cell,
6607            &dc_db,
6608            &coeff_h,
6609            &coeff_w,
6610            &coeff_w,
6611            &coeff_bh,
6612            &coeff_bw,
6613            &coeff_bw,
6614            &zero,
6615            &zero,
6616            &zero,
6617            &zero,
6618            &zero,
6619            &zero,
6620            &zero,
6621            &zero,
6622            &state.moments,
6623        )
6624        .expect("bhww");
6625        let exact_hwww = cell_fourth_derivative_from_moments(
6626            cell,
6627            &coeff_h,
6628            &coeff_w,
6629            &coeff_w,
6630            &coeff_w,
6631            &zero,
6632            &zero,
6633            &zero,
6634            &zero,
6635            &zero,
6636            &zero,
6637            &zero,
6638            &zero,
6639            &zero,
6640            &zero,
6641            &zero,
6642            &state.moments,
6643        )
6644        .expect("hwww");
6645
6646        let numeric_hw = simpson_integral(cell.left, cell.right, 5000, |z| {
6647            (-cell.eta(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6648        });
6649        let numeric_ahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6650            let eta = cell.eta(z);
6651            (-(eta * eta_aw(z) * eta_h(z)) + (eta * eta - 1.0) * eta_a(z) * eta_h(z) * eta_w(z))
6652                * (-cell.q(z)).exp()
6653                * INV_TWO_PI
6654        });
6655        let numeric_bhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6656            let eta = cell.eta(z);
6657            (-(eta * (eta_bh(z) * eta_w(z) + eta_bw(z) * eta_h(z)))
6658                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_w(z))
6659                * (-cell.q(z)).exp()
6660                * INV_TWO_PI
6661        });
6662        let numeric_hhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6663            let eta = cell.eta(z);
6664            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6665        });
6666        let numeric_hww = simpson_integral(cell.left, cell.right, 5000, |z| {
6667            let eta = cell.eta(z);
6668            ((eta * eta - 1.0) * eta_h(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6669        });
6670        let numeric_aahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6671            let eta = cell.eta(z);
6672            (-(eta * polynomial_value(&coeff_aaw, z) * eta_h(z))
6673                + (eta * eta - 1.0)
6674                    * (polynomial_value(&dc_daa, z) * eta_h(z) * eta_w(z)
6675                        + 2.0 * eta_aw(z) * eta_a(z) * eta_h(z))
6676                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_a(z) * eta_h(z) * eta_w(z))
6677                * (-cell.q(z)).exp()
6678                * INV_TWO_PI
6679        });
6680        let numeric_hhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6681            let eta = cell.eta(z);
6682            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_w(z) * eta_w(z))
6683                * (-cell.q(z)).exp()
6684                * INV_TWO_PI
6685        });
6686        let numeric_hhhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6687            let eta = cell.eta(z);
6688            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_h(z) * eta_w(z))
6689                * (-cell.q(z)).exp()
6690                * INV_TWO_PI
6691        });
6692        let numeric_abhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6693            let eta = cell.eta(z);
6694            (-(eta * polynomial_value(&coeff_abw, z) * eta_h(z) + eta * eta_aw(z) * eta_bh(z))
6695                + (eta * eta - 1.0)
6696                    * (eta_ab(z) * eta_h(z) * eta_w(z)
6697                        + eta_aw(z) * eta_b(z) * eta_h(z)
6698                        + eta_bh(z) * eta_a(z) * eta_w(z)
6699                        + eta_bw(z) * eta_a(z) * eta_h(z))
6700                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_b(z) * eta_h(z) * eta_w(z))
6701                * (-cell.q(z)).exp()
6702                * INV_TWO_PI
6703        });
6704        let numeric_ahww = simpson_integral(cell.left, cell.right, 5000, |z| {
6705            let eta = cell.eta(z);
6706            (2.0 * (eta * eta - 1.0) * eta_aw(z) * eta_h(z) * eta_w(z)
6707                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_h(z) * eta_w(z) * eta_w(z))
6708                * (-cell.q(z)).exp()
6709                * INV_TWO_PI
6710        });
6711        let numeric_bhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6712            let eta = cell.eta(z);
6713            let h_z = eta_h(z);
6714            let w_z = eta_w(z);
6715            ((eta * eta - 1.0) * (eta_bh(z) * w_z * w_z + 2.0 * eta_bw(z) * h_z * w_z)
6716                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * w_z * w_z)
6717                * (-cell.q(z)).exp()
6718                * INV_TWO_PI
6719        });
6720        let numeric_hwww = simpson_integral(cell.left, cell.right, 5000, |z| {
6721            let eta = cell.eta(z);
6722            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_w(z) * eta_w(z) * eta_w(z))
6723                * (-cell.q(z)).exp()
6724                * INV_TWO_PI
6725        });
6726
6727        assert!((exact_hw - numeric_hw).abs() < 1e-7);
6728        assert!((exact_ahw - numeric_ahw).abs() < 2e-6);
6729        assert!((exact_bhw - numeric_bhw).abs() < 2e-6);
6730        assert!((exact_hhw - numeric_hhw).abs() < 2e-6);
6731        assert!((exact_hww - numeric_hww).abs() < 2e-6);
6732        assert!((exact_aahw - numeric_aahw).abs() < 3e-6);
6733        assert!((exact_hhww - numeric_hhww).abs() < 3e-6);
6734        assert!((exact_hhhw - numeric_hhhw).abs() < 3e-6);
6735        assert!((exact_abhw - numeric_abhw).abs() < 3e-6);
6736        assert!((exact_ahww - numeric_ahww).abs() < 3e-6);
6737        assert!((exact_bhww - numeric_bhww).abs() < 3e-6);
6738        assert!((exact_hwww - numeric_hwww).abs() < 3e-6);
6739    }
6740
6741    #[test]
6742    fn cell_moment_scratch_reuses_buffers_under_margslope_like_pressure() {
6743        let cells = [
6744            DenestedCubicCell {
6745                left: -1.2,
6746                right: -0.35,
6747                c0: 0.18,
6748                c1: 0.72,
6749                c2: -0.045,
6750                c3: 0.018,
6751            },
6752            DenestedCubicCell {
6753                left: -0.35,
6754                right: 0.48,
6755                c0: -0.08,
6756                c1: 0.91,
6757                c2: 0.038,
6758                c3: -0.014,
6759            },
6760            DenestedCubicCell {
6761                left: 0.48,
6762                right: 1.4,
6763                c0: 0.11,
6764                c1: 0.83,
6765                c2: 0.022,
6766                c3: 0.012,
6767            },
6768        ];
6769        let mut scratch = CellMomentScratch::with_capacity(MAX_AFFINE_ANCHOR_DEGREE);
6770        for cell in cells {
6771            let baseline = evaluate_cell_moments(cell, 9).expect("baseline moments");
6772            let scratch_state =
6773                evaluate_cell_moments_with_scratch(cell, 9, &mut scratch).expect("scratch moments");
6774            assert_eq!(baseline.branch, scratch_state.branch);
6775            assert!((baseline.value - scratch_state.value).abs() <= 1e-10);
6776            assert_eq!(baseline.moments.len(), scratch_state.moments.len());
6777            for (lhs, rhs) in baseline.moments.iter().zip(scratch_state.moments.iter()) {
6778                assert!((lhs - rhs).abs() <= 1e-10, "{lhs} vs {rhs}");
6779            }
6780        }
6781
6782        reset_cell_moment_test_reallocs();
6783        let mut checksum = 0.0;
6784        for i in 0..5_000 {
6785            let cell = cells[i % cells.len()];
6786            let state = evaluate_cell_moments_with_scratch(cell, 9, &mut scratch)
6787                .expect("scratch moments under repeated pressure");
6788            checksum += state.value + state.moments[0] * 1e-12;
6789        }
6790        assert!(checksum.is_finite());
6791        assert_eq!(
6792            cell_moment_test_reallocs(),
6793            0,
6794            "scratch-backed inner cell-moment calls should not grow Vec buffers"
6795        );
6796    }
6797
6798    #[test]
6799    fn evaluate_cell_moments_matches_numeric_integrals() {
6800        let cell = DenestedCubicCell {
6801            left: -0.9,
6802            right: 0.8,
6803            c0: 0.15,
6804            c1: -0.35,
6805            c2: 0.11,
6806            c3: -0.07,
6807        };
6808        let state = evaluate_cell_moments(cell, 6).expect("cell moments");
6809        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
6810            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
6811        });
6812        assert!((state.value - value_numeric).abs() < 1e-9);
6813        for degree in 0..=6 {
6814            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
6815                z.powi(degree as i32) * (-cell.q(z)).exp()
6816            });
6817            assert!((state.moments[degree] - target).abs() < 1e-9);
6818        }
6819    }
6820
6821    #[test]
6822    fn partition_builder_moves_link_preimages_with_intercept() {
6823        let score_breaks = [-2.0, -1.0, 0.0, 1.0, 2.0];
6824        let link_breaks = [-1.5, -0.5, 0.5, 1.5];
6825        let score_span = |z: f64| {
6826            let left = if z < -1.0 {
6827                -2.0
6828            } else if z < 0.0 {
6829                -1.0
6830            } else if z < 1.0 {
6831                0.0
6832            } else {
6833                1.0
6834            };
6835            Ok(LocalSpanCubic {
6836                left,
6837                right: left + 1.0,
6838                c0: 0.1,
6839                c1: 0.2,
6840                c2: 0.0,
6841                c3: 0.0,
6842            })
6843        };
6844        let link_span = |u: f64| {
6845            let left = if u < -0.5 {
6846                -1.5
6847            } else if u < 0.5 {
6848                -0.5
6849            } else {
6850                0.5
6851            };
6852            Ok(LocalSpanCubic {
6853                left,
6854                right: left + 1.0,
6855                c0: -0.05,
6856                c1: 0.1,
6857                c2: 0.0,
6858                c3: 0.0,
6859            })
6860        };
6861        let cells_a0 = build_denested_partition_cells(
6862            0.25,
6863            0.9,
6864            &score_breaks,
6865            &link_breaks,
6866            score_span,
6867            link_span,
6868        )
6869        .expect("cells a0");
6870        let cells_a1 = build_denested_partition_cells(
6871            0.55,
6872            0.9,
6873            &score_breaks,
6874            &link_breaks,
6875            score_span,
6876            link_span,
6877        )
6878        .expect("cells a1");
6879        assert!(cells_a0.len() >= score_breaks.len() - 1);
6880        assert!(
6881            cells_a0
6882                .windows(2)
6883                .all(|w| (w[0].cell.right - w[1].cell.left).abs() <= 1e-12)
6884        );
6885        assert!(
6886            cells_a0
6887                .iter()
6888                .zip(cells_a1.iter())
6889                .any(|(lhs, rhs)| (lhs.cell.left - rhs.cell.left).abs() > 1e-10)
6890        );
6891        assert!(cells_a0.first().unwrap().cell.left.is_infinite());
6892        assert!(cells_a0.last().unwrap().cell.right.is_infinite());
6893    }
6894
6895    #[test]
6896    fn partition_builder_without_breaks_returns_single_global_cell() {
6897        let cells = build_denested_partition_cells_with_tails(
6898            0.3,
6899            -0.4,
6900            &[],
6901            &[],
6902            |z| {
6903                if z.is_nan() {
6904                    return Err("probe z is NaN".to_string());
6905                }
6906                Ok(LocalSpanCubic {
6907                    left: 0.0,
6908                    right: 1.0,
6909                    c0: 0.0,
6910                    c1: 0.0,
6911                    c2: 0.0,
6912                    c3: 0.0,
6913                })
6914            },
6915            |u| {
6916                if u.is_nan() {
6917                    return Err("probe u is NaN".to_string());
6918                }
6919                Ok(LocalSpanCubic {
6920                    left: 0.0,
6921                    right: 1.0,
6922                    c0: 0.0,
6923                    c1: 0.0,
6924                    c2: 0.0,
6925                    c3: 0.0,
6926                })
6927            },
6928        )
6929        .expect("global cell");
6930        assert_eq!(cells.len(), 1);
6931        assert_eq!(cells[0].cell.left, f64::NEG_INFINITY);
6932        assert_eq!(cells[0].cell.right, f64::INFINITY);
6933        assert!(cells[0].cell.c2.abs() < 1e-12);
6934        assert!(cells[0].cell.c3.abs() < 1e-12);
6935    }
6936
6937    #[test]
6938    fn polynomial_integral_helper_matches_moment_sum() {
6939        let cell = DenestedCubicCell {
6940            left: -1.5,
6941            right: 1.25,
6942            c0: 0.2,
6943            c1: -0.4,
6944            c2: 0.15,
6945            c3: 0.03,
6946        };
6947        let state = evaluate_cell_moments(cell, 8).expect("cell moments");
6948        let coeffs = [1.5, -0.25, 0.75, 0.1];
6949        let expected = INV_TWO_PI
6950            * coeffs
6951                .iter()
6952                .enumerate()
6953                .map(|(idx, coeff)| coeff * state.moments[idx])
6954                .sum::<f64>();
6955        let got = cell_polynomial_integral_from_moments(&coeffs, &state.moments, "test poly")
6956            .expect("poly integral");
6957        assert!((got - expected).abs() < 1e-14);
6958    }
6959
6960    #[test]
6961    fn batched_cell_moment_max_degree_matches_direct_non_affine_grid() {
6962        let cells = [
6963            DenestedCubicCell {
6964                left: -2.0,
6965                right: -0.25,
6966                c0: -0.7,
6967                c1: 0.8,
6968                c2: 0.015,
6969                c3: -0.004,
6970            },
6971            DenestedCubicCell {
6972                left: -0.5,
6973                right: 0.75,
6974                c0: 0.2,
6975                c1: -0.35,
6976                c2: -0.025,
6977                c3: 0.0,
6978            },
6979            DenestedCubicCell {
6980                left: 0.1,
6981                right: 1.6,
6982                c0: 0.4,
6983                c1: 0.25,
6984                c2: 0.01,
6985                c3: 0.006,
6986            },
6987            DenestedCubicCell {
6988                left: -1.25,
6989                right: 2.25,
6990                c0: -0.1,
6991                c1: 0.55,
6992                c2: -0.012,
6993                c3: 0.003,
6994            },
6995        ];
6996        for cell in cells {
6997            let branch = branch_cell(cell).expect("branch");
6998            if branch == ExactCellBranch::Affine {
6999                continue;
7000            }
7001            let batched =
7002                evaluate_non_affine_cell_state(cell, branch, 21).expect("degree-21 state");
7003            for degree in [9usize, 15, 21] {
7004                let direct =
7005                    evaluate_non_affine_cell_state(cell, branch, degree).expect("direct state");
7006                assert_eq!(batched.branch, direct.branch);
7007                let denom = direct.value.abs().max(1.0);
7008                assert!(((batched.value - direct.value).abs() / denom) < 1e-10);
7009                for k in 0..=degree {
7010                    let denom = direct.moments[k].abs().max(1.0);
7011                    let rel = (batched.moments[k] - direct.moments[k]).abs() / denom;
7012                    assert!(
7013                        rel < 1e-10,
7014                        "cell={cell:?} degree={degree} moment={k} rel={rel:e}"
7015                    );
7016                }
7017            }
7018        }
7019    }
7020
7021    #[test]
7022    fn derivative_moment_evaluator_matches_value_evaluator_moments() {
7023        let cells = [
7024            DenestedCubicCell {
7025                left: -2.0,
7026                right: -0.4,
7027                c0: 0.15,
7028                c1: -0.8,
7029                c2: 0.0,
7030                c3: 0.0,
7031            },
7032            DenestedCubicCell {
7033                left: -0.75,
7034                right: 1.4,
7035                c0: -0.25,
7036                c1: 0.6,
7037                c2: 0.12,
7038                c3: 0.0,
7039            },
7040            DenestedCubicCell {
7041                left: -1.1,
7042                right: 0.9,
7043                c0: 0.35,
7044                c1: -0.3,
7045                c2: 0.05,
7046                c3: -0.015,
7047            },
7048        ];
7049        for cell in cells {
7050            for degree in [4usize, 9, 15, 21] {
7051                let full = evaluate_cell_moments_uncached(cell, degree).expect("full moments");
7052                let derivative = evaluate_cell_derivative_moments_uncached(cell, degree)
7053                    .expect("derivative moments");
7054                assert_eq!(full.branch, derivative.branch);
7055                assert_eq!(full.moments.len(), derivative.moments.len());
7056                for k in 0..full.moments.len() {
7057                    assert_eq!(full.moments[k].to_bits(), derivative.moments[k].to_bits());
7058                }
7059            }
7060        }
7061    }
7062
7063    #[test]
7064    fn cell_moment_lru_matches_uncached_non_affine_grid() {
7065        let cache = CellMomentLruCache::new(16 * 1024 * 1024);
7066        let stats = CellMomentCacheStats::default();
7067        let c0s = [-0.75, 0.0, 0.5];
7068        let c1s = [-1.2, 0.25, 1.1];
7069        let c2s = [-0.18, 0.07];
7070        let c3s = [0.0, 0.025];
7071        let bounds = [(-2.0, -0.5), (-0.25, 1.5)];
7072        let degrees = [4usize, 9, 15, 21];
7073        for &c0 in &c0s {
7074            for &c1 in &c1s {
7075                for &c2 in &c2s {
7076                    for &c3 in &c3s {
7077                        for &(left, right) in &bounds {
7078                            for &max_degree in &degrees {
7079                                let cell = DenestedCubicCell {
7080                                    left,
7081                                    right,
7082                                    c0,
7083                                    c1,
7084                                    c2,
7085                                    c3,
7086                                };
7087                                let branch = branch_cell(cell).expect("branch");
7088                                if branch == ExactCellBranch::Affine {
7089                                    continue;
7090                                }
7091                                let expected =
7092                                    evaluate_non_affine_cell_state(cell, branch, max_degree)
7093                                        .expect("uncached non-affine moments");
7094                                let got = evaluate_cell_moments_cached(
7095                                    cell,
7096                                    max_degree,
7097                                    &cache,
7098                                    Some(&stats),
7099                                )
7100                                .expect("cached moments");
7101                                assert_eq!(got.branch, expected.branch);
7102                                assert_eq!(got.moments.len(), max_degree + 1);
7103                                let denom = expected.value.abs().max(1.0);
7104                                assert!(
7105                                    ((got.value - expected.value).abs() / denom) < 1e-10,
7106                                    "value mismatch for {cell:?} degree {max_degree}: got {} expected {}",
7107                                    got.value,
7108                                    expected.value
7109                                );
7110                                for (idx, (&lhs, &rhs)) in
7111                                    got.moments.iter().zip(expected.moments.iter()).enumerate()
7112                                {
7113                                    let denom = rhs.abs().max(1.0);
7114                                    assert!(
7115                                        ((lhs - rhs).abs() / denom) < 1e-10,
7116                                        "moment {idx} mismatch for {cell:?} degree {max_degree}: got {lhs} expected {rhs}"
7117                                    );
7118                                }
7119                                let warm = evaluate_cell_moments_cached(
7120                                    cell,
7121                                    max_degree,
7122                                    &cache,
7123                                    Some(&stats),
7124                                )
7125                                .expect("warm cached moments");
7126                                assert_eq!(warm, got);
7127                            }
7128                        }
7129                    }
7130                }
7131            }
7132        }
7133        let (hits, misses) = stats.snapshot();
7134        assert!(hits > 0, "expected warm LRU hits");
7135        assert!(misses > 0, "expected cold LRU misses");
7136    }
7137
7138    #[test]
7139    fn cell_moment_fingerprint_exact_cache_matches_current_evaluator() {
7140        let cells = [
7141            DenestedCubicCell {
7142                left: -1.75,
7143                right: -0.25,
7144                c0: 0.15,
7145                c1: -0.35,
7146                c2: 0.08,
7147                c3: -0.015,
7148            },
7149            DenestedCubicCell {
7150                left: -0.5,
7151                right: 0.8,
7152                c0: -0.2,
7153                c1: 0.45,
7154                c2: -0.12,
7155                c3: 0.025,
7156            },
7157            DenestedCubicCell {
7158                left: 0.1,
7159                right: 1.6,
7160                c0: 0.05,
7161                c1: 0.2,
7162                c2: 0.03,
7163                c3: 0.004,
7164            },
7165        ];
7166        let mut cache = std::collections::HashMap::new();
7167        for max_degree in [0usize, 3, 4, 9, 16] {
7168            for cell in cells {
7169                let baseline = evaluate_cell_moments(cell, max_degree).expect("baseline moments");
7170                let key = cell_moment_cache_key(cell, max_degree, 0.0);
7171                let cached = cache.entry(key).or_insert_with(|| {
7172                    evaluate_cell_moments(cell, max_degree).expect("cached moments")
7173                });
7174                assert_eq!(baseline.branch, cached.branch);
7175                assert_eq!(baseline.value.to_bits(), cached.value.to_bits());
7176                assert_eq!(baseline.moments.len(), cached.moments.len());
7177                for (lhs, rhs) in baseline.moments.iter().zip(cached.moments.iter()) {
7178                    assert_eq!(lhs.to_bits(), rhs.to_bits());
7179                }
7180            }
7181        }
7182    }
7183
7184    #[test]
7185    fn fuzzy_cell_moment_fingerprint_error_scales_with_epsilon() {
7186        for epsilon in [1e-8, 1e-6] {
7187            let base = DenestedCubicCell {
7188                left: -1.25,
7189                right: 1.1,
7190                c0: 0.1,
7191                c1: -0.25,
7192                c2: 0.04,
7193                c3: -0.006,
7194            };
7195            let perturbed = DenestedCubicCell {
7196                left: base.left + 0.001 * epsilon,
7197                right: base.right - 0.001 * epsilon,
7198                c0: base.c0 + 0.001 * epsilon,
7199                c1: base.c1 - 0.001 * epsilon,
7200                c2: base.c2 + 0.001 * epsilon,
7201                c3: base.c3 - 0.001 * epsilon,
7202            };
7203            assert_eq!(
7204                cell_moment_cache_key(base, 9, epsilon),
7205                cell_moment_cache_key(perturbed, 9, epsilon)
7206            );
7207            let lhs = evaluate_cell_moments(base, 9).expect("base moments");
7208            let rhs = evaluate_cell_moments(perturbed, 9).expect("perturbed moments");
7209            let max_rel = lhs
7210                .moments
7211                .iter()
7212                .zip(rhs.moments.iter())
7213                .map(|(a, b)| (a - b).abs() / a.abs().max(b.abs()).max(1.0))
7214                .fold(0.0_f64, f64::max);
7215            assert!(
7216                max_rel <= 10.0 * epsilon,
7217                "epsilon={epsilon:.1e} max_rel={max_rel:.3e}"
7218            );
7219        }
7220    }
7221
7222    /// Locks in numerical equivalence of the optimized
7223    /// `evaluate_non_affine_cell_state` against an inline reference
7224    /// implementation that mirrors the prior pre-fold structure
7225    /// (separate `cell.eta(z)` / `cell.q(z)` calls; post-loop
7226    /// `* half_width`; trailing `value_integral * half_width / sqrt(TAU)`).
7227    /// Any drift larger than 1e-13 relative would indicate the hot-path
7228    /// rewrite changed the math.
7229    #[test]
7230    fn non_affine_cell_state_matches_prefold_reference_to_1e_minus_13() {
7231        // Reference: byte-for-byte the structure of the previous
7232        // implementation. Kept local to this test to avoid leaking a second
7233        // public surface.
7234        fn reference(
7235            cell: DenestedCubicCell,
7236            branch: ExactCellBranch,
7237            max_degree: usize,
7238        ) -> CellMomentState {
7239            let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
7240            let mut value_integral = 0.0_f64;
7241            let center = 0.5 * (cell.left + cell.right);
7242            let half_width = 0.5 * (cell.right - cell.left);
7243            for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
7244                let z = center + half_width * node;
7245                let eta = cell.eta(z);
7246                let moment_weight = weight * (-cell.q(z)).exp();
7247                let mut z_pow = 1.0_f64;
7248                for moment in &mut moments {
7249                    *moment = moment_weight.mul_add(z_pow, *moment);
7250                    z_pow *= z;
7251                }
7252                value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
7253            }
7254            for moment in &mut moments {
7255                *moment *= half_width;
7256            }
7257            CellMomentState {
7258                branch,
7259                value: value_integral * half_width / (std::f64::consts::TAU).sqrt(),
7260                moments,
7261            }
7262        }
7263
7264        // Hand-rolled inputs that cross both Quartic and Sextic branches and
7265        // exercise positive/negative coefficients, asymmetric intervals, and
7266        // a wide degree range (matches survival_marginal_slope's degree=9
7267        // production call as well as the bernoulli outer-step degree=24).
7268        let cells = [
7269            DenestedCubicCell {
7270                left: -1.25,
7271                right: -0.2,
7272                c0: -0.35,
7273                c1: 0.85,
7274                c2: 0.04,
7275                c3: -0.015,
7276            },
7277            DenestedCubicCell {
7278                left: -0.2,
7279                right: 0.55,
7280                c0: 0.12,
7281                c1: -0.65,
7282                c2: -0.025,
7283                c3: 0.02,
7284            },
7285            DenestedCubicCell {
7286                left: 0.55,
7287                right: 1.6,
7288                c0: 0.42,
7289                c1: 0.35,
7290                c2: 0.018,
7291                c3: 0.012,
7292            },
7293            DenestedCubicCell {
7294                left: -3.0,
7295                right: -1.0,
7296                c0: 1.7,
7297                c1: -0.4,
7298                c2: 0.11,
7299                c3: -0.07,
7300            },
7301        ];
7302        let degrees = [0_usize, 4, 9, 16, 24];
7303        for cell in cells {
7304            let branch = branch_cell(cell).expect("branch");
7305            assert_ne!(branch, ExactCellBranch::Affine);
7306            for max_degree in degrees {
7307                let actual = evaluate_non_affine_cell_state(cell, branch, max_degree)
7308                    .expect("optimized non-affine");
7309                let expected = reference(cell, branch, max_degree);
7310                assert_eq!(actual.branch, expected.branch);
7311                assert_eq!(actual.moments.len(), expected.moments.len());
7312                let denom_v = expected.value.abs().max(1.0);
7313                let rel_v = (actual.value - expected.value).abs() / denom_v;
7314                let actual_v = actual.value;
7315                let expected_v = expected.value;
7316                assert!(
7317                    rel_v <= 1e-13,
7318                    "value rel mismatch for {cell:?} degree {max_degree}: \
7319                     actual={actual_v:.17e} expected={expected_v:.17e} rel={rel_v:.3e}"
7320                );
7321                for (k, (lhs, rhs)) in actual
7322                    .moments
7323                    .iter()
7324                    .zip(expected.moments.iter())
7325                    .enumerate()
7326                {
7327                    let denom = rhs.abs().max(1.0);
7328                    let rel = (lhs - rhs).abs() / denom;
7329                    assert!(
7330                        rel <= 1e-13,
7331                        "moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7332                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7333                    );
7334                }
7335
7336                // Also lock in the derivative-state path on the same
7337                // inputs so the (parallel) edit there can't drift.
7338                let actual_deriv =
7339                    evaluate_non_affine_cell_derivative_state(cell, branch, max_degree)
7340                        .expect("optimized derivative");
7341                for (k, (lhs, rhs)) in actual_deriv
7342                    .moments
7343                    .iter()
7344                    .zip(expected.moments.iter())
7345                    .enumerate()
7346                {
7347                    let denom = rhs.abs().max(1.0);
7348                    let rel = (lhs - rhs).abs() / denom;
7349                    assert!(
7350                        rel <= 1e-13,
7351                        "deriv moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7352                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7353                    );
7354                }
7355            }
7356        }
7357    }
7358
7359    /// DECISIVE: the third-derivative kernel must equal the FD of the
7360    /// second-derivative kernel w.r.t. a parameter that perturbs `eta`,
7361    /// RE-EVALUATING the moments at each step (the moments depend on `eta`
7362    /// via the `exp(-q)` weight). This isolates the kernel from all survival
7363    /// partition/cross machinery (gam#979 f_uv_dir localization).
7364    #[test]
7365    fn third_derivative_kernel_matches_fd_of_second_with_eta_perturbation() {
7366        // A finite, non-affine cell.
7367        let base = DenestedCubicCell {
7368            left: -0.6,
7369            right: 0.9,
7370            c0: 0.30,
7371            c1: 0.45,
7372            c2: -0.20,
7373            c3: 0.12,
7374        };
7375        // Synthetic parameter directions as cubic-in-z perturbations of eta:
7376        //   eta_u = ∂eta/∂u, eta_v = ∂eta/∂v, eta_t = ∂eta/∂t (the dir).
7377        let eta_u = [0.11_f64, -0.07, 0.05, 0.02];
7378        let eta_v = [-0.09_f64, 0.13, -0.04, 0.03];
7379        let eta_t = [0.17_f64, 0.06, -0.10, 0.04]; // the "b-like" direction
7380        // Second crosses ∂²eta/∂{·}{·} (pick small non-zero cubics).
7381        let eta_uv = [0.02_f64, 0.01, -0.015, 0.005];
7382        let eta_ut = [-0.01_f64, 0.02, 0.007, -0.003];
7383        let eta_vt = [0.015_f64, -0.008, 0.01, 0.004];
7384        // Third cross ∂³eta/∂u∂v∂t.
7385        let eta_uvt = [0.003_f64, -0.002, 0.001, 0.0005];
7386
7387        let neg = |a: &[f64; 4]| a.map(|v| -v);
7388        let max_degree = 15usize;
7389
7390        // f_uv(s) where param s shifts eta by s·(eta_t + ½ s²... ) — here we
7391        // build the cell at eta + s·eta_t + s²·eta_vt-style is NOT needed; we
7392        // only need the t-direction to first order for ∂/∂t. To FD ∂(f_uv)/∂t
7393        // we perturb eta along eta_t AND carry the s-dependence of the u,v
7394        // crosses: eta_u(s)=eta_u + s·eta_ut, eta_v(s)=eta_v + s·eta_vt,
7395        // eta_uv(s)=eta_uv + s·eta_uvt. The cell cubic shifts by s·eta_t.
7396        let f_uv_at = |s: f64| -> f64 {
7397            let cell_s = DenestedCubicCell {
7398                c0: base.c0 + s * eta_t[0],
7399                c1: base.c1 + s * eta_t[1],
7400                c2: base.c2 + s * eta_t[2],
7401                c3: base.c3 + s * eta_t[3],
7402                ..base
7403            };
7404            // Moments MUST be recomputed at the perturbed eta.
7405            let st = evaluate_cell_moments(cell_s, max_degree).unwrap();
7406            let neg_cell = DenestedCubicCell {
7407                c0: -cell_s.c0,
7408                c1: -cell_s.c1,
7409                c2: -cell_s.c2,
7410                c3: -cell_s.c3,
7411                ..cell_s
7412            };
7413            let u_s = [
7414                eta_u[0] + s * eta_ut[0],
7415                eta_u[1] + s * eta_ut[1],
7416                eta_u[2] + s * eta_ut[2],
7417                eta_u[3] + s * eta_ut[3],
7418            ];
7419            let v_s = [
7420                eta_v[0] + s * eta_vt[0],
7421                eta_v[1] + s * eta_vt[1],
7422                eta_v[2] + s * eta_vt[2],
7423                eta_v[3] + s * eta_vt[3],
7424            ];
7425            let uv_s = [
7426                eta_uv[0] + s * eta_uvt[0],
7427                eta_uv[1] + s * eta_uvt[1],
7428                eta_uv[2] + s * eta_uvt[2],
7429                eta_uv[3] + s * eta_uvt[3],
7430            ];
7431            cell_second_derivative_from_moments(
7432                neg_cell,
7433                &neg(&u_s),
7434                &neg(&v_s),
7435                &neg(&uv_s),
7436                &st.moments,
7437            )
7438            .unwrap()
7439        };
7440
7441        let h = 1e-5;
7442        let fd = (f_uv_at(h) - f_uv_at(-h)) / (2.0 * h);
7443
7444        // Analytic third via the kernel (negated cell + negated crosses, as the
7445        // survival path does).
7446        let st0 = evaluate_cell_moments(base, max_degree).unwrap();
7447        let neg_cell0 = DenestedCubicCell {
7448            c0: -base.c0,
7449            c1: -base.c1,
7450            c2: -base.c2,
7451            c3: -base.c3,
7452            ..base
7453        };
7454        let analytic = cell_third_derivative_from_moments(
7455            neg_cell0,
7456            &neg(&eta_u),
7457            &neg(&eta_v),
7458            &neg(&eta_t),
7459            &neg(&eta_uv),
7460            &neg(&eta_ut),
7461            &neg(&eta_vt),
7462            &neg(&eta_uvt),
7463            &st0.moments,
7464        )
7465        .unwrap();
7466
7467        let denom = fd.abs().max(1e-3);
7468        let rel = (analytic - fd).abs() / denom;
7469        assert!(
7470            rel <= 1e-5,
7471            "third kernel vs FD-of-second mismatch: analytic={analytic:.12e} fd={fd:.12e} rel={rel:.3e}"
7472        );
7473    }
7474
7475    #[test]
7476    fn moving_shared_edge_second_integral_derivative_has_leibniz_jump_sign() {
7477        let edge0 = 0.2_f64;
7478        let edge_velocity = -0.37_f64;
7479
7480        let left_eta = [0.22_f64, -0.18, 0.09, 0.03];
7481        let right_eta = [-0.11_f64, 0.26, -0.04, 0.02];
7482        let left_r = [0.08_f64, -0.05, 0.03, 0.01];
7483        let left_s = [-0.06_f64, 0.04, 0.02, -0.015];
7484        let left_rs = [0.025_f64, -0.012, 0.006, 0.004];
7485        let right_r = [-0.03_f64, 0.07, -0.02, 0.012];
7486        let right_s = [0.05_f64, -0.025, 0.018, 0.007];
7487        let right_rs = [-0.018_f64, 0.014, -0.005, 0.003];
7488
7489        let integral_at = |shift: f64| -> f64 {
7490            let edge = edge0 + edge_velocity * shift;
7491            let left = DenestedCubicCell {
7492                left: -0.7,
7493                right: edge,
7494                c0: left_eta[0],
7495                c1: left_eta[1],
7496                c2: left_eta[2],
7497                c3: left_eta[3],
7498            };
7499            let right = DenestedCubicCell {
7500                left: edge,
7501                right: 1.1,
7502                c0: right_eta[0],
7503                c1: right_eta[1],
7504                c2: right_eta[2],
7505                c3: right_eta[3],
7506            };
7507            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7508            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7509            cell_second_derivative_from_moments(
7510                left,
7511                &left_r,
7512                &left_s,
7513                &left_rs,
7514                &left_state.moments,
7515            )
7516            .expect("left second")
7517                + cell_second_derivative_from_moments(
7518                    right,
7519                    &right_r,
7520                    &right_s,
7521                    &right_rs,
7522                    &right_state.moments,
7523                )
7524                .expect("right second")
7525        };
7526
7527        let h = 1e-5;
7528        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7529
7530        let left = DenestedCubicCell {
7531            left: -0.7,
7532            right: edge0,
7533            c0: left_eta[0],
7534            c1: left_eta[1],
7535            c2: left_eta[2],
7536            c3: left_eta[3],
7537        };
7538        let right = DenestedCubicCell {
7539            left: edge0,
7540            right: 1.1,
7541            c0: right_eta[0],
7542            c1: right_eta[1],
7543            c2: right_eta[2],
7544            c3: right_eta[3],
7545        };
7546        let f_left =
7547            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7548        let f_right =
7549            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7550        let analytic = edge_velocity * (f_left - f_right);
7551
7552        let denom = analytic.abs().max(1e-8);
7553        let rel = (fd - analytic).abs() / denom;
7554        assert!(
7555            rel <= 5e-8,
7556            "moving edge sign mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7557        );
7558    }
7559
7560    #[test]
7561    fn moving_shared_edge_second_integral_mixed_derivative_has_full_leibniz_terms() {
7562        let edge0 = -0.15_f64;
7563        let edge_d1 = 0.31_f64;
7564        let edge_d2 = -0.27_f64;
7565        let edge_d12 = 0.19_f64;
7566
7567        let left_eta = [0.16_f64, -0.21, 0.07, -0.025];
7568        let right_eta = [-0.09_f64, 0.18, -0.055, 0.018];
7569        let left_r = [0.075_f64, -0.045, 0.018, 0.009];
7570        let left_s = [-0.052_f64, 0.033, 0.014, -0.011];
7571        let left_rs = [0.021_f64, -0.009, 0.005, 0.0025];
7572        let right_r = [-0.028_f64, 0.063, -0.017, 0.010];
7573        let right_s = [0.047_f64, -0.023, 0.016, 0.006];
7574        let right_rs = [-0.015_f64, 0.012, -0.004, 0.002];
7575
7576        let integral_at = |s1: f64, s2: f64| -> f64 {
7577            let edge = edge0 + edge_d1 * s1 + edge_d2 * s2 + edge_d12 * s1 * s2;
7578            let left = DenestedCubicCell {
7579                left: -0.8,
7580                right: edge,
7581                c0: left_eta[0],
7582                c1: left_eta[1],
7583                c2: left_eta[2],
7584                c3: left_eta[3],
7585            };
7586            let right = DenestedCubicCell {
7587                left: edge,
7588                right: 0.9,
7589                c0: right_eta[0],
7590                c1: right_eta[1],
7591                c2: right_eta[2],
7592                c3: right_eta[3],
7593            };
7594            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7595            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7596            cell_second_derivative_from_moments(
7597                left,
7598                &left_r,
7599                &left_s,
7600                &left_rs,
7601                &left_state.moments,
7602            )
7603            .expect("left second")
7604                + cell_second_derivative_from_moments(
7605                    right,
7606                    &right_r,
7607                    &right_s,
7608                    &right_rs,
7609                    &right_state.moments,
7610                )
7611                .expect("right second")
7612        };
7613
7614        let h = 2e-4;
7615        let fd = (integral_at(h, h) - integral_at(h, -h) - integral_at(-h, h)
7616            + integral_at(-h, -h))
7617            / (4.0 * h * h);
7618
7619        let left = DenestedCubicCell {
7620            left: -0.8,
7621            right: edge0,
7622            c0: left_eta[0],
7623            c1: left_eta[1],
7624            c2: left_eta[2],
7625            c3: left_eta[3],
7626        };
7627        let right = DenestedCubicCell {
7628            left: edge0,
7629            right: 0.9,
7630            c0: right_eta[0],
7631            c1: right_eta[1],
7632            c2: right_eta[2],
7633            c3: right_eta[3],
7634        };
7635
7636        let boundary_z_derivative =
7637            |cell: DenestedCubicCell, r: &[f64], s: &[f64], rs: &[f64]| -> f64 {
7638                let eta = cell.eta(edge0);
7639                let eta_z = cell.c1 + 2.0 * cell.c2 * edge0 + 3.0 * cell.c3 * edge0 * edge0;
7640                let cr = poly_eval_at(r, edge0);
7641                let cs = poly_eval_at(s, edge0);
7642                let crs = poly_eval_at(rs, edge0);
7643                let cr_z = r.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7644                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7645                });
7646                let cs_z = s.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7647                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7648                });
7649                let crs_z = rs.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7650                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7651                });
7652                let amp = crs - eta * cr * cs;
7653                let amp_z = crs_z - eta_z * cr * cs - eta * cr_z * cs - eta * cr * cs_z;
7654                let q_z = edge0 + eta * eta_z;
7655                (amp_z - amp * q_z) * (-cell.q(edge0)).exp() * INV_TWO_PI
7656            };
7657
7658        let f_left =
7659            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7660        let f_right =
7661            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7662        let fz_left = boundary_z_derivative(left, &left_r, &left_s, &left_rs);
7663        let fz_right = boundary_z_derivative(right, &right_r, &right_s, &right_rs);
7664        let analytic = edge_d12 * (f_left - f_right) + edge_d1 * edge_d2 * (fz_left - fz_right);
7665
7666        let denom = analytic.abs().max(1e-8);
7667        let rel = (fd - analytic).abs() / denom;
7668        assert!(
7669            rel <= 2e-7,
7670            "moving edge mixed term mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7671        );
7672    }
7673
7674    // gam#1454 resolution. The reported defect ("survival flex directional
7675    // third[g,w0] wrong: candidate f_au_dir/f_aa_dir missing self-flux") posited
7676    // a MISSING third-order Leibniz self-flux at the moving link-knot crossings.
7677    // This regression establishes the two facts that, together, prove the
7678    // implicit-intercept third-order tower
7679    // (`row_primary_third_contracted_recompute*`) is CORRECT to add no such flux:
7680    //
7681    //   (1) The third-derivative integrand `F_rst` genuinely DOES jump across a
7682    //       C²-link knot — its third coefficient slice carries `c_rst ∝ 6·α₃`,
7683    //       and `α₃` (the spline's third `z`-derivative) is the one piece a C²
7684    //       cubic spline leaves discontinuous. So the jump is real and the
7685    //       `cell_third_derivative_boundary_integrand` flux formula is exact
7686    //       (verified by FD of a direct ∂/∂edge of the third-integral sum —
7687    //       a FOURTH-order scenario that pins the integrand, not the tower).
7688    //
7689    //   (2) Every boundary term in the Leibniz expansion of a THIRD derivative,
7690    //       however, evaluates an integrand of order ≤ 2 at the moving edge
7691    //       (one of the three differentiations is spent moving the boundary).
7692    //       The second-derivative integrand `F_rs` is CONTINUOUS across the same
7693    //       C² knot (its slices reach at most `α₂ + 3α₃·shift`, i.e. ½·η''(u*),
7694    //       which a C² spline keeps continuous). Hence the shared-edge flux
7695    //       `velocity·(F_rs^L − F_rs^R)` telescopes to ZERO, and the tower's
7696    //       third-order self-flux is a genuine no-op. The real residual lives in
7697    //       the interior implicit-intercept assembly, not at the boundary.
7698    #[test]
7699    fn third_order_self_flux_telescopes_but_third_integrand_jumps_at_c2_knot_1454() {
7700        let edge0 = 0.13_f64;
7701        let edge_velocity = -0.41_f64;
7702
7703        // Build η continuous to C² at edge0 but with a jump in the cubic (3rd
7704        // derivative) coefficient. Pick the left cubic freely; choose the right
7705        // cubic to match value+1st+2nd derivative at edge0, then perturb its c3.
7706        let left_eta = [0.18_f64, -0.12, 0.07, 0.04];
7707        let right_c3 = 0.04_f64 + 0.09; // α₃ jump across the knot.
7708        // Match η, η', η'' at edge0 for the right piece given its c3:
7709        //   η(z)  = c0 + c1 z + c2 z² + c3 z³
7710        //   η'(z) = c1 + 2 c2 z + 3 c3 z²
7711        //   η''(z)= 2 c2 + 6 c3 z
7712        // Solve right (c0,c1,c2) so the three values equal the left ones at edge0.
7713        let l0 = left_eta[0];
7714        let l1 = left_eta[1];
7715        let l2 = left_eta[2];
7716        let l3 = left_eta[3];
7717        let e = edge0;
7718        let eta_val = l0 + l1 * e + l2 * e * e + l3 * e * e * e;
7719        let eta_d1 = l1 + 2.0 * l2 * e + 3.0 * l3 * e * e;
7720        let eta_d2 = 2.0 * l2 + 6.0 * l3 * e;
7721        let rc2 = (eta_d2 - 6.0 * right_c3 * e) / 2.0;
7722        let rc1 = eta_d1 - 2.0 * rc2 * e - 3.0 * right_c3 * e * e;
7723        let rc0 = eta_val - rc1 * e - rc2 * e * e - right_c3 * e * e * e;
7724        let right_eta = [rc0, rc1, rc2, right_c3];
7725
7726        // Coefficient slices. The first/second slices we keep continuous at the
7727        // edge (mimicking c_r=1+η', c_rs∝η'' which a C² spline matches), so the
7728        // 2nd-order flux would cancel. The third-order slice `rst` carries the
7729        // jumping α₃ and is DIFFERENT across the edge — this is the term that
7730        // breaks cancellation.
7731        let common_r = [0.06_f64, -0.04, 0.02, 0.0];
7732        let common_s = [-0.05_f64, 0.03, 0.015, 0.0];
7733        let common_t = [0.08_f64, 0.05, -0.03, 0.0];
7734        let common_rs = [0.02_f64, -0.01, 0.005, 0.0];
7735        let common_rt = [-0.012_f64, 0.008, 0.004, 0.0];
7736        let common_st = [0.015_f64, -0.006, 0.003, 0.0];
7737        // rst ∝ 6·α₃ in the real path: left and right differ by the α₃ jump.
7738        let left_rst = [6.0 * l3, 0.0, 0.0, 0.0];
7739        let right_rst = [6.0 * right_c3, 0.0, 0.0, 0.0];
7740
7741        let max_degree = 15usize;
7742        let neg = |a: &[f64; 4]| a.map(|v| -v);
7743
7744        // The integral sum over the two cells sharing the moving edge, computed
7745        // via the fixed-domain moment reduction with the SURVIVAL/probit sign
7746        // convention (negated cell + negated coefficient slices), exactly as the
7747        // production `row_primary_third_contracted_recompute` path does.
7748        let integral_at = |shift: f64| -> f64 {
7749            let edge = edge0 + edge_velocity * shift;
7750            let left = DenestedCubicCell {
7751                left: -0.7,
7752                right: edge,
7753                c0: left_eta[0],
7754                c1: left_eta[1],
7755                c2: left_eta[2],
7756                c3: left_eta[3],
7757            };
7758            let right = DenestedCubicCell {
7759                left: edge,
7760                right: 1.0,
7761                c0: right_eta[0],
7762                c1: right_eta[1],
7763                c2: right_eta[2],
7764                c3: right_eta[3],
7765            };
7766            let lst = evaluate_cell_moments(left, max_degree).unwrap();
7767            let rst_m = evaluate_cell_moments(right, max_degree).unwrap();
7768            let neg_left = DenestedCubicCell {
7769                c0: -left.c0,
7770                c1: -left.c1,
7771                c2: -left.c2,
7772                c3: -left.c3,
7773                ..left
7774            };
7775            let neg_right = DenestedCubicCell {
7776                c0: -right.c0,
7777                c1: -right.c1,
7778                c2: -right.c2,
7779                c3: -right.c3,
7780                ..right
7781            };
7782            let li = cell_third_derivative_from_moments(
7783                neg_left,
7784                &neg(&common_r),
7785                &neg(&common_s),
7786                &neg(&common_t),
7787                &neg(&common_rs),
7788                &neg(&common_rt),
7789                &neg(&common_st),
7790                &neg(&left_rst),
7791                &lst.moments,
7792            )
7793            .unwrap();
7794            let ri = cell_third_derivative_from_moments(
7795                neg_right,
7796                &neg(&common_r),
7797                &neg(&common_s),
7798                &neg(&common_t),
7799                &neg(&common_rs),
7800                &neg(&common_rt),
7801                &neg(&common_st),
7802                &neg(&right_rst),
7803                &rst_m.moments,
7804            )
7805            .unwrap();
7806            li + ri
7807        };
7808
7809        let h = 1e-5;
7810        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7811
7812        // Fixed-domain part: differentiate ONLY the integrands (domain frozen at
7813        // edge0). Its directional derivative is the analytic Leibniz flux alone,
7814        // since the integrand coefficients here are edge-independent:
7815        //   flux = velocity · ( F_rst^L(edge0) − F_rst^R(edge0) ).
7816        //
7817        // CONVENTION: the finite-difference `integral_at` above integrates the
7818        // SURVIVAL/probit sign convention — negated cell (η→−η) AND negated
7819        // coefficient slices — exactly as the production
7820        // `row_primary_third_contracted_recompute` path does. The Leibniz
7821        // boundary integrand must therefore be evaluated in that SAME negated
7822        // convention: the third-derivative integrand is ODD under the joint
7823        // (η→−η, coeff→−coeff) negation (its `rst`, `η·rs·t`, and `(η²−1)·r·s·t`
7824        // terms each flip sign an odd number of times), so evaluating the flux
7825        // with un-negated cells/coeffs yields exactly the opposite sign and the
7826        // Leibniz identity `fd = flux` fails as `fd = −flux`. (The
7827        // second-derivative sibling test `moving_shared_edge_second_integral_
7828        // derivative_has_leibniz_jump_sign` keeps BOTH sides un-negated and so
7829        // stays self-consistent; this test keeps BOTH sides negated.)
7830        let neg_eta = |eta: &[f64; 4]| [-eta[0], -eta[1], -eta[2], -eta[3]];
7831        let left_eta_neg = neg_eta(&left_eta);
7832        let right_eta_neg = neg_eta(&right_eta);
7833        let left0 = DenestedCubicCell {
7834            left: -0.7,
7835            right: edge0,
7836            c0: left_eta_neg[0],
7837            c1: left_eta_neg[1],
7838            c2: left_eta_neg[2],
7839            c3: left_eta_neg[3],
7840        };
7841        let right0 = DenestedCubicCell {
7842            left: edge0,
7843            right: 1.0,
7844            c0: right_eta_neg[0],
7845            c1: right_eta_neg[1],
7846            c2: right_eta_neg[2],
7847            c3: right_eta_neg[3],
7848        };
7849        let f_left = cell_third_derivative_boundary_integrand(
7850            left0,
7851            &neg(&common_r),
7852            &neg(&common_s),
7853            &neg(&common_t),
7854            &neg(&common_rs),
7855            &neg(&common_rt),
7856            &neg(&common_st),
7857            &neg(&left_rst),
7858            edge0,
7859        );
7860        let f_right = cell_third_derivative_boundary_integrand(
7861            right0,
7862            &neg(&common_r),
7863            &neg(&common_s),
7864            &neg(&common_t),
7865            &neg(&common_rs),
7866            &neg(&common_rt),
7867            &neg(&common_st),
7868            &neg(&right_rst),
7869            edge0,
7870        );
7871
7872        // The integrand DOES jump across this C² knot (the α₃ third-coefficient
7873        // term is the only discontinuous piece). Confirm the jump is genuine —
7874        // if it were zero the flux would be a no-op and #1454 would not exist.
7875        let jump = f_left - f_right;
7876        assert!(
7877            jump.abs() > 1e-4,
7878            "third-derivative integrand must jump across the C² knot (α₃ discontinuity); \
7879             got jump={jump:.3e}"
7880        );
7881
7882        let analytic_flux = edge_velocity * jump;
7883        let denom = fd.abs().max(1e-6);
7884        let rel = (fd - analytic_flux).abs() / denom;
7885        assert!(
7886            rel <= 1e-5,
7887            "moving-edge third-derivative flux mismatch (#1454): fd={fd:.12e} \
7888             analytic_flux={analytic_flux:.12e} rel={rel:.3e}"
7889        );
7890
7891        // ---- Fact (2): the SECOND-derivative integrand telescopes to zero. ----
7892        // A 3rd-derivative Leibniz boundary term spends one differentiation on
7893        // the moving edge and evaluates a ≤2nd-order integrand there. The
7894        // hardest such term is the slope-slope Hessian integrand `F_bb`, whose
7895        // coefficient slice is the link cubic's b-b partial
7896        //   dc_dbb(z) = [0, 0, 2(α₂ + 3 α₃·shift), 6 α₃·b]·(z⁰..z³)
7897        //             = z²·η''(u),  with u = a + b·z, shift = a − knot.
7898        // Across a C² knot α₂, α₃, and `shift` all jump, yet η''(u*) is
7899        // continuous — so the EVALUATED slice `c_bb(z*) = z*²·η''(u*)` matches on
7900        // both sides and `F_bb` is continuous. Build the two pieces' raw dc_dbb
7901        // decompositions from `link_cubic_second_partials` and confirm the
7902        // second-derivative integrand carries no jump (flux telescopes to 0).
7903        let a_row = 0.21_f64;
7904        let b_row = 1.37_f64;
7905        let knot = a_row + b_row * edge0; // u-location of the crossing.
7906        // Left/right link pieces: choose α₂,α₃ freely on the left; pick the
7907        // right piece's α₂ so η''(knot) is continuous given a jumped α₃.
7908        let left_link = LocalSpanCubic {
7909            left: knot - 0.6,
7910            right: knot + 0.6,
7911            c0: 0.0,
7912            c1: 0.0,
7913            c2: 0.08,
7914            c3: -0.05,
7915        };
7916        let right_alpha3 = -0.05_f64 + 0.11; // α₃ jump.
7917        // η''(knot) continuity:  2α₂ᴸ + 6α₃ᴸ·(knot−leftᴸ) = 2α₂ᴿ + 6α₃ᴿ·(knot−leftᴿ).
7918        let right_left_coord = knot - 0.4;
7919        let lhs = 2.0 * left_link.c2 + 6.0 * left_link.c3 * (knot - left_link.left);
7920        let right_alpha2 = (lhs - 6.0 * right_alpha3 * (knot - right_left_coord)) / 2.0;
7921        let right_link = LocalSpanCubic {
7922            left: right_left_coord,
7923            right: right_left_coord + 0.8,
7924            c0: 0.0,
7925            c1: 0.0,
7926            c2: right_alpha2,
7927            c3: right_alpha3,
7928        };
7929        let (_, _, dc_dbb_left) = link_cubic_second_partials(left_link, a_row, b_row);
7930        let (_, _, dc_dbb_right) = link_cubic_second_partials(right_link, a_row, b_row);
7931        // The per-coefficient arrays differ (α₃ jumped)...
7932        assert!(
7933            (dc_dbb_left[3] - dc_dbb_right[3]).abs() > 1e-3,
7934            "α₃ jump must make the raw dc_dbb coefficient arrays differ"
7935        );
7936        // ...but the EVALUATED second-order slice at the crossing matches, so the
7937        // F_bb boundary integrand carries no jump and the flux telescopes to 0.
7938        let c_bb_left = poly_eval_at(&dc_dbb_left, edge0);
7939        let c_bb_right = poly_eval_at(&dc_dbb_right, edge0);
7940        assert!(
7941            (c_bb_left - c_bb_right).abs() <= 1e-12,
7942            "second-derivative slope-slope integrand must be CONTINUOUS across the \
7943             C² knot (telescoping self-flux): left={c_bb_left:.15e} right={c_bb_right:.15e}"
7944        );
7945    }
7946}
gam_model_kernels/cubic_cell_kernel.rs

gam_model_kernels/
cubic_cell_kernel.rs