gam_model_kernels/
cubic_cell_kernel.rs

1use gam_math::probability::normal_cdf;
2use gam_runtime::resource::{ByteLruCache, ResidentBytes};
3use smallvec::{SmallVec, smallvec};
4use std::hash::{Hash, Hasher};
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7
/// Error type for the de-nested cubic transport kernel.
///
/// The kernel's public functions keep returning `Result<_, String>` because
/// the sibling families (`bernoulli_marginal_slope`,
/// `survival_marginal_slope`, `marginal_slope_shared`) consume them in that
/// form. Failures are built as typed values internally and only turned into a
/// `String` at the public boundary, through
/// `From<CubicCellKernelError> for String`, which keeps those callers
/// source-compatible. The `Display` output is byte-for-byte the text the
/// earlier `format!(...)`-based messages produced.
///
/// Every variant carries one human-readable `reason`.
#[derive(Debug, Clone)]
pub enum CubicCellKernelError {
    /// An interval probe or a pair of cell bounds broke a precondition:
    /// bounds out of order, an unsupported pattern of infinities, or a width
    /// that is not positive and finite.
    InvalidInterval { reason: String },
    /// The cell has an unusable shape or could not be classified into a
    /// branch: a tail cell that is not affine, a finite cell with
    /// non-positive width, non-finite affine coefficients, a non-affine cell
    /// with infinite bounds, a degenerate leading coefficient in the moment
    /// recurrence, and similar conditions.
    InvalidCellShape { reason: String },
    /// The reduced moment vector (or the polynomial-convolution scratch
    /// buffer) holds fewer entries than the polynomial degree the leaf has to
    /// evaluate.
    InsufficientMoments { reason: String },
    /// Domain validation for the bivariate-normal CDF failed: an argument
    /// that is neither finite nor infinite, or a non-finite correlation.
    BivariateNormalDomain { reason: String },
}
33
// Invokes the project macro `impl_reason_error_boilerplate!` (defined outside
// this file view) for `CubicCellKernelError`, listing each of its four
// `{ reason: String }` variants.
// NOTE(review): the enum's documentation relies on a `Display` impl and a
// `From<CubicCellKernelError> for String` conversion; presumably this macro
// is what generates them. Confirm against the macro definition. When a
// variant is added to the enum it most likely has to be listed here as well.
34impl_reason_error_boilerplate! {
35    CubicCellKernelError {
36        InvalidInterval,
37        InvalidCellShape,
38        InsufficientMoments,
39        BivariateNormalDomain,
40    }
41}
42
43impl CubicCellKernelError {
44    #[inline]
45    fn invalid_interval(reason: impl Into<String>) -> Self {
46        CubicCellKernelError::InvalidInterval {
47            reason: reason.into(),
48        }
49    }
50    #[inline]
51    fn invalid_cell_shape(reason: impl Into<String>) -> Self {
52        CubicCellKernelError::InvalidCellShape {
53            reason: reason.into(),
54        }
55    }
56    #[inline]
57    fn insufficient_moments(reason: impl Into<String>) -> Self {
58        CubicCellKernelError::InsufficientMoments {
59            reason: reason.into(),
60        }
61    }
62    #[inline]
63    fn bivariate_normal_domain(reason: impl Into<String>) -> Self {
64        CubicCellKernelError::BivariateNormalDomain {
65            reason: reason.into(),
66        }
67    }
68}
69
70// De-nested cubic transport kernel.
71//
72// This module implements the de-nested flexible-link/score-warp model
73//
74//   eta(z) = a + b*z + b*delta_h(z) + delta_w(a + b*z)
75//
76// where delta_h is the score warp and delta_w is the link deviation.
77// This is not the literal nested composition L(a + b*H(z)); it is an
78// additive-correction model around the affine core a + b*z.
79//
80// On each partition cell, both deviations are cubic polynomials, so eta is
81// at most cubic in z and q(z) = 0.5*(z^2 + eta^2) is at most sextic (degree 6).
82// The integral of exp(-q(z)) is evaluated by transporting from the affine
83// anchor (c2=c3=0, where q is Gaussian and the integral reduces to BVN)
84// to the target non-affine cell via the polynomial moment recurrence.
85//
86// The partition covers (-∞, +∞) with:
87//   • two semi-infinite affine TAIL cells (outside all deviation support),
88//   • finitely many interior cells (each a sextic microcell).
89// Because tail cells have constant deviations (c2=c3=0), their bounds
90// are parameter-independent, so no Leibniz boundary-motion corrections
91// appear in the derivatives.
92//
93// Shared by bernoulli_marginal_slope and survival_marginal_slope families.
94
/// A cubic polynomial attached to one span, written in powers of the offset
/// `t = x - left`:
///
/// `c0 + c1*t + c2*t^2 + c3*t^3`
///
/// `right` is stored alongside the coefficients but is not read by any of the
/// methods below; presumably it is the span's upper endpoint (confirm with
/// the code that builds these values). None of the methods check that `x`
/// lies inside the span.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LocalSpanCubic {
    pub left: f64,
    pub right: f64,
    pub c0: f64,
    pub c1: f64,
    pub c2: f64,
    pub c3: f64,
}

impl LocalSpanCubic {
    /// Value of the cubic at `x`.
    ///
    /// The terms are accumulated in the order constant, linear, quadratic,
    /// cubic, each power formed by repeated multiplication, so the rounding
    /// behavior is fixed.
    #[inline]
    pub fn evaluate(self, x: f64) -> f64 {
        let Self { left, c0, c1, c2, c3, .. } = self;
        let offset = x - left;
        let linear = c1 * offset;
        let quadratic = c2 * offset * offset;
        let cubic = c3 * offset * offset * offset;
        c0 + linear + quadratic + cubic
    }

    /// First derivative with respect to `x`, evaluated at `x`:
    /// `c1 + 2*c2*t + 3*c3*t^2`.
    #[inline]
    pub fn first_derivative(self, x: f64) -> f64 {
        let Self { left, c1, c2, c3, .. } = self;
        let offset = x - left;
        let from_quadratic = 2.0 * c2 * offset;
        let from_cubic = 3.0 * c3 * offset * offset;
        c1 + from_quadratic + from_cubic
    }

    /// Second derivative with respect to `x`, evaluated at `x`:
    /// `2*c2 + 6*c3*t`.
    #[inline]
    pub fn second_derivative(self, x: f64) -> f64 {
        let Self { left, c2, c3, .. } = self;
        let offset = x - left;
        let from_quadratic = 2.0 * c2;
        let from_cubic = 6.0 * c3 * offset;
        from_quadratic + from_cubic
    }
}
124
/// String tag naming this kernel.
// NOTE(review): where the tag is consumed is not visible in this part of the file.
pub const ANCHORED_DEVIATION_KERNEL: &'static str = "DenestedCubicTransport";

/// Default tolerance on the normalized non-affine coefficients, used by
/// [`branch_cell`].
///
/// The cutoff is deliberately a named, tunable constant. The large-scale
/// cycle-0 sweep compared `{1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3}` against
/// the legacy transport path. Promoting one of the more aggressive candidates
/// requires an end-to-end beta acceptance run first, so the default stays at
/// the legacy `1e-10` and model behavior remains bit-for-bit unchanged.
pub const NORMALIZED_CELL_BRANCH_TOL: f64 = 1.0e-10;

/// `1 / (2π)`, computed as the reciprocal of `TAU`.
const INV_TWO_PI: f64 = 1.0 / std::f64::consts::TAU;
136
137/// 384-point Gauss–Legendre nodes, re-exported for the GPU cubic-cell kernel
138/// (`src/gpu/cubic_cell/kernel_src.rs`) to embed as `__constant__` device
139/// memory. Linux-only because the kernel emitter is Linux-only.
140#[cfg(target_os = "linux")]
141pub const GL_NODES_FOR_GPU_KERNEL: &[f64; 384] = &GL_NODES;
142/// Companion weights to [`GL_NODES_FOR_GPU_KERNEL`].
143#[cfg(target_os = "linux")]
144pub const GL_WEIGHTS_FOR_GPU_KERNEL: &[f64; 384] = &GL_WEIGHTS;
145
146const GL_NODES: [f64; 384] = [
147    -9.999_804_411_726_474e-1,
148    -9.998_969_471_378_596e-1,
149    -9.997_467_408_113_523e-1,
150    -9.995_297_988_558_859e-1,
151    -9.992_461_316_671_845e-1,
152    -9.988_957_572_063_257e-1,
153    -9.984_786_985_384_589e-1,
154    -9.979_949_833_727_938e-1,
155    -9.974_446_439_389_107e-1,
156    -9.968_277_169_440_913e-1,
157    -9.961_442_435_551_087e-1,
158    -9.953_942_693_885_953e-1,
159    -9.945_778_445_047_068e-1,
160    -9.936_950_234_020_883e-1,
161    -9.927_458_650_133_153e-1,
162    -9.917_304_327_004_32e-1,
163    -9.906_487_942_504_061e-1,
164    -9.895_010_218_704_087e-1,
165    -9.882_871_921_828_699e-1,
166    -9.870_073_862_202_815e-1,
167    -9.856_616_894_197_333e-1,
168    -9.842_501_916_171_713e-1,
169    -9.827_729_870_413_743e-1,
170    -9.812_301_743_076_443e-1,
171    -9.796_218_564_112_101e-1,
172    -9.779_481_407_203_411e-1,
173    -9.762_091_389_691_724e-1,
174    -9.744_049_672_502_397e-1,
175    -9.725_357_460_067_257e-1,
176    -9.706_016_000_244_151e-1,
177    -9.686_026_584_233_628e-1,
178    -9.665_390_546_492_71e-1,
179    -9.644_109_264_645_802e-1,
180    -9.622_184_159_392_698e-1,
181    -9.599_616_694_413_742e-1,
182    -9.576_408_376_272_095e-1,
183    -9.552_560_754_313_16e-1,
184    -9.528_075_420_561_144e-1,
185    -9.502_954_009_612_771e-1,
186    -9.477_198_198_528_157e-1,
187    -9.450_809_706_718_851e-1,
188    -9.423_790_295_833_044e-1,
189    -9.396_141_769_637_963e-1,
190    -9.367_865_973_899_459e-1,
191    -9.338_964_796_258_775e-1,
192    -9.309_440_166_106_54e-1,
193    -9.279_294_054_453_956e-1,
194    -9.248_528_473_801_222e-1,
195    -9.217_145_478_003_181e-1,
196    -9.185_147_162_132_208e-1,
197    -9.152_535_662_338_34e-1,
198    -9.119_313_155_706_682e-1,
199    -9.085_481_860_112_055e-1,
200    -9.051_044_034_070_944e-1,
201    -9.016_001_976_590_722e-1,
202    -8.980_358_027_016_164e-1,
203    -8.944_114_564_873_288e-1,
204    -8.907_274_009_710_492e-1,
205    -8.869_838_820_937_034e-1,
206    -8.831_811_497_658_847e-1,
207    -8.793_194_578_511_7e-1,
208    -8.753_990_641_491_725e-1,
209    -8.714_202_303_783_312e-1,
210    -8.673_832_221_584_393e-1,
211    -8.632_883_089_929_12e-1,
212    -8.591_357_642_507_945e-1,
213    -8.549_258_651_485_127e-1,
214    -8.506_588_927_313_666e-1,
215    -8.463_351_318_547_683e-1,
216    -8.419_548_711_652_254e-1,
217    -8.375_184_030_810_715e-1,
218    -8.330_260_237_729_452e-1,
219    -8.284_780_331_440_178e-1,
220    -8.238_747_348_099_726e-1,
221    -8.192_164_360_787_36e-1,
222    -8.145_034_479_299_62e-1,
223    -8.097_360_849_942_72e-1,
224    -8.049_146_655_322_506e-1,
225    -8.000_395_114_131_988e-1,
226    -7.951_109_480_936_471e-1,
227    -7.901_293_045_956_28e-1,
228    -7.850_949_134_847_117e-1,
229    -7.800_081_108_478_04e-1,
230    -7.748_692_362_707_1e-1,
231    -7.696_786_328_154_644e-1,
232    -7.644_366_469_974_285e-1,
233    -7.591_436_287_621_58e-1,
234    -7.537_999_314_620_412e-1,
235    -7.484_059_118_327_094e-1,
236    -7.429_619_299_692_227e-1,
237    -7.374_683_493_020_299e-1,
238    -7.319_255_365_727_068e-1,
239    -7.263_338_618_094_733e-1,
240    -7.206_936_983_024_912e-1,
241    -7.150_054_225_789_432e-1,
242    -7.092_694_143_778_975e-1,
243    -7.034_860_566_249_567e-1,
244    -6.976_557_354_066_943e-1,
245    -6.917_788_399_448_808e-1,
246    -6.858_557_625_704_99e-1,
247    -6.798_868_986_975_534e-1,
248    -6.738_726_467_966_731e-1,
249    -6.678_134_083_685_102e-1,
250    -6.617_095_879_169_366e-1,
251    -6.555_615_929_220_4e-1,
252    -6.493_698_338_129_212e-1,
253    -6.431_347_239_402_948e-1,
254    -6.368_566_795_488_945e-1,
255    -6.305_361_197_496_849e-1,
256    -6.241_734_664_918_837e-1,
257    -6.177_691_445_347_913e-1,
258    -6.113_235_814_194_364e-1,
259    -6.048_372_074_400_329e-1,
260    -5.983_104_556_152_549e-1,
261    -5.917_437_616_593_286e-1,
262    -5.851_375_639_529_456e-1,
263    -5.784_923_035_139_965e-1,
264    -5.718_084_239_681_3e-1,
265    -5.650_863_715_191_369e-1,
266    -5.583_265_949_191_623e-1,
267    -5.515_295_454_387_482e-1,
268    -5.446_956_768_367_068e-1,
269    -5.378_254_453_298_289e-1,
270    -5.309_193_095_624_275e-1,
271    -5.239_777_305_757_194e-1,
272    -5.170_011_717_770_473e-1,
273    -5.099_900_989_089_429e-1,
274    -5.029_449_800_180_356e-1,
275    -4.958_662_854_238_058_4e-1,
276    -4.887_544_876_871_878e-1,
277    -4.816_100_615_790_221e-1,
278    -4.744_334_840_483_605_5e-1,
279    -4.672_252_341_906_264e-1,
280    -4.599_857_932_156_304e-1,
281    -4.527_156_444_154_463_7e-1,
282    -4.454_152_731_321_473_5e-1,
283    -4.380_851_667_254_05e-1,
284    -4.307_258_145_399_544_5e-1,
285    -4.233_377_078_729_265e-1,
286    -4.159_213_399_410_494e-1,
287    -4.084_772_058_477_228e-1,
288    -4.010_058_025_499_653e-1,
289    -3.935_076_288_252_386e-1,
290    -3.859_831_852_381_500_6e-1,
291    -3.784_329_741_070_358_6e-1,
292    -3.708_574_994_704_271e-1,
293    -3.632_572_670_534_011e-1,
294    -3.556_327_842_338_202e-1,
295    -3.479_845_600_084_600_6e-1,
296    -3.403_131_049_590_297e-1,
297    -3.326_189_312_180_866e-1,
298    -3.249_025_524_348_469_5e-1,
299    -3.171_644_837_408_958_4e-1,
300    -3.094_052_417_157_978e-1,
301    -3.016_253_443_526_109e-1,
302    -2.938_253_110_233_064_5e-1,
303    -2.860_056_624_440_967_5e-1,
304    -2.781_669_206_406_729e-1,
305    -2.703_096_089_133_553e-1,
306    -2.624_342_518_021_592_4e-1,
307    -2.545_413_750_517_773e-1,
308    -2.466_315_055_764_817_5e-1,
309    -2.387_051_714_249_486_3e-1,
310    -2.307_629_017_450_062e-1,
311    -2.228_052_267_483_099_4e-1,
312    -2.148_326_776_749_466_5e-1,
313    -2.068_457_867_579_697_5e-1,
314    -1.988_450_871_878_683_4e-1,
315    -1.908_311_130_769_724_5e-1,
316    -1.828_043_994_237_965_6e-1,
317    -1.747_654_820_773_241_2e-1,
318    -1.667_148_977_012_352_4e-1,
319    -1.586_531_837_380_799_3e-1,
320    -1.505_808_783_733_995e-1,
321    -1.424_985_204_997_981_4e-1,
322    -1.344_066_496_809_674_7e-1,
323    -1.263_058_061_156_663e-1,
324    -1.181_965_306_016_578_4e-1,
325    -1.100_793_644_996_070_4e-1,
326    -1.019_548_496_969_403_7e-1,
327    -9.382_352_857_167_028e-2,
328    -8.568_594_395_618_719e-2,
329    -7.754_263_910_102_077e-2,
330    -6.939_415_763_857_37e-2,
331    -6.124_104_354_682_962e-2,
332    -5.308_384_111_303_817_6e-2,
333    -4.492_309_489_737_94e-2,
334    -3.675_934_969_660_982e-2,
335    -2.859_315_050_769_284_7e-2,
336    -2.042_504_249_141_571e-2,
337    -1.225_557_093_599_553_8e-2,
338    -4.085_281_220_676_868e-3,
339    4.085_281_220_676_868e-3,
340    1.225_557_093_599_553_8e-2,
341    2.042_504_249_141_571e-2,
342    2.859_315_050_769_284_7e-2,
343    3.675_934_969_660_982e-2,
344    4.492_309_489_737_94e-2,
345    5.308_384_111_303_817_6e-2,
346    6.124_104_354_682_962e-2,
347    6.939_415_763_857_37e-2,
348    7.754_263_910_102_077e-2,
349    8.568_594_395_618_719e-2,
350    9.382_352_857_167_028e-2,
351    1.019_548_496_969_403_7e-1,
352    1.100_793_644_996_070_4e-1,
353    1.181_965_306_016_578_4e-1,
354    1.263_058_061_156_663e-1,
355    1.344_066_496_809_674_7e-1,
356    1.424_985_204_997_981_4e-1,
357    1.505_808_783_733_995e-1,
358    1.586_531_837_380_799_3e-1,
359    1.667_148_977_012_352_4e-1,
360    1.747_654_820_773_241_2e-1,
361    1.828_043_994_237_965_6e-1,
362    1.908_311_130_769_724_5e-1,
363    1.988_450_871_878_683_4e-1,
364    2.068_457_867_579_697_5e-1,
365    2.148_326_776_749_466_5e-1,
366    2.228_052_267_483_099_4e-1,
367    2.307_629_017_450_062e-1,
368    2.387_051_714_249_486_3e-1,
369    2.466_315_055_764_817_5e-1,
370    2.545_413_750_517_773e-1,
371    2.624_342_518_021_592_4e-1,
372    2.703_096_089_133_553e-1,
373    2.781_669_206_406_729e-1,
374    2.860_056_624_440_967_5e-1,
375    2.938_253_110_233_064_5e-1,
376    3.016_253_443_526_109e-1,
377    3.094_052_417_157_978e-1,
378    3.171_644_837_408_958_4e-1,
379    3.249_025_524_348_469_5e-1,
380    3.326_189_312_180_866e-1,
381    3.403_131_049_590_297e-1,
382    3.479_845_600_084_600_6e-1,
383    3.556_327_842_338_202e-1,
384    3.632_572_670_534_011e-1,
385    3.708_574_994_704_271e-1,
386    3.784_329_741_070_358_6e-1,
387    3.859_831_852_381_500_6e-1,
388    3.935_076_288_252_386e-1,
389    4.010_058_025_499_653e-1,
390    4.084_772_058_477_228e-1,
391    4.159_213_399_410_494e-1,
392    4.233_377_078_729_265e-1,
393    4.307_258_145_399_544_5e-1,
394    4.380_851_667_254_05e-1,
395    4.454_152_731_321_473_5e-1,
396    4.527_156_444_154_463_7e-1,
397    4.599_857_932_156_304e-1,
398    4.672_252_341_906_264e-1,
399    4.744_334_840_483_605_5e-1,
400    4.816_100_615_790_221e-1,
401    4.887_544_876_871_878e-1,
402    4.958_662_854_238_058_4e-1,
403    5.029_449_800_180_356e-1,
404    5.099_900_989_089_429e-1,
405    5.170_011_717_770_473e-1,
406    5.239_777_305_757_194e-1,
407    5.309_193_095_624_275e-1,
408    5.378_254_453_298_289e-1,
409    5.446_956_768_367_068e-1,
410    5.515_295_454_387_482e-1,
411    5.583_265_949_191_623e-1,
412    5.650_863_715_191_369e-1,
413    5.718_084_239_681_3e-1,
414    5.784_923_035_139_965e-1,
415    5.851_375_639_529_456e-1,
416    5.917_437_616_593_286e-1,
417    5.983_104_556_152_549e-1,
418    6.048_372_074_400_329e-1,
419    6.113_235_814_194_364e-1,
420    6.177_691_445_347_913e-1,
421    6.241_734_664_918_837e-1,
422    6.305_361_197_496_849e-1,
423    6.368_566_795_488_945e-1,
424    6.431_347_239_402_948e-1,
425    6.493_698_338_129_212e-1,
426    6.555_615_929_220_4e-1,
427    6.617_095_879_169_366e-1,
428    6.678_134_083_685_102e-1,
429    6.738_726_467_966_731e-1,
430    6.798_868_986_975_534e-1,
431    6.858_557_625_704_99e-1,
432    6.917_788_399_448_808e-1,
433    6.976_557_354_066_943e-1,
434    7.034_860_566_249_567e-1,
435    7.092_694_143_778_975e-1,
436    7.150_054_225_789_432e-1,
437    7.206_936_983_024_912e-1,
438    7.263_338_618_094_733e-1,
439    7.319_255_365_727_068e-1,
440    7.374_683_493_020_299e-1,
441    7.429_619_299_692_227e-1,
442    7.484_059_118_327_094e-1,
443    7.537_999_314_620_412e-1,
444    7.591_436_287_621_58e-1,
445    7.644_366_469_974_285e-1,
446    7.696_786_328_154_644e-1,
447    7.748_692_362_707_1e-1,
448    7.800_081_108_478_04e-1,
449    7.850_949_134_847_117e-1,
450    7.901_293_045_956_28e-1,
451    7.951_109_480_936_471e-1,
452    8.000_395_114_131_988e-1,
453    8.049_146_655_322_506e-1,
454    8.097_360_849_942_72e-1,
455    8.145_034_479_299_62e-1,
456    8.192_164_360_787_36e-1,
457    8.238_747_348_099_726e-1,
458    8.284_780_331_440_178e-1,
459    8.330_260_237_729_452e-1,
460    8.375_184_030_810_715e-1,
461    8.419_548_711_652_254e-1,
462    8.463_351_318_547_683e-1,
463    8.506_588_927_313_666e-1,
464    8.549_258_651_485_127e-1,
465    8.591_357_642_507_945e-1,
466    8.632_883_089_929_12e-1,
467    8.673_832_221_584_393e-1,
468    8.714_202_303_783_312e-1,
469    8.753_990_641_491_725e-1,
470    8.793_194_578_511_7e-1,
471    8.831_811_497_658_847e-1,
472    8.869_838_820_937_034e-1,
473    8.907_274_009_710_492e-1,
474    8.944_114_564_873_288e-1,
475    8.980_358_027_016_164e-1,
476    9.016_001_976_590_722e-1,
477    9.051_044_034_070_944e-1,
478    9.085_481_860_112_055e-1,
479    9.119_313_155_706_682e-1,
480    9.152_535_662_338_34e-1,
481    9.185_147_162_132_208e-1,
482    9.217_145_478_003_181e-1,
483    9.248_528_473_801_222e-1,
484    9.279_294_054_453_956e-1,
485    9.309_440_166_106_54e-1,
486    9.338_964_796_258_775e-1,
487    9.367_865_973_899_459e-1,
488    9.396_141_769_637_963e-1,
489    9.423_790_295_833_044e-1,
490    9.450_809_706_718_851e-1,
491    9.477_198_198_528_157e-1,
492    9.502_954_009_612_771e-1,
493    9.528_075_420_561_144e-1,
494    9.552_560_754_313_16e-1,
495    9.576_408_376_272_095e-1,
496    9.599_616_694_413_742e-1,
497    9.622_184_159_392_698e-1,
498    9.644_109_264_645_802e-1,
499    9.665_390_546_492_71e-1,
500    9.686_026_584_233_628e-1,
501    9.706_016_000_244_151e-1,
502    9.725_357_460_067_257e-1,
503    9.744_049_672_502_397e-1,
504    9.762_091_389_691_724e-1,
505    9.779_481_407_203_411e-1,
506    9.796_218_564_112_101e-1,
507    9.812_301_743_076_443e-1,
508    9.827_729_870_413_743e-1,
509    9.842_501_916_171_713e-1,
510    9.856_616_894_197_333e-1,
511    9.870_073_862_202_815e-1,
512    9.882_871_921_828_699e-1,
513    9.895_010_218_704_087e-1,
514    9.906_487_942_504_061e-1,
515    9.917_304_327_004_32e-1,
516    9.927_458_650_133_153e-1,
517    9.936_950_234_020_883e-1,
518    9.945_778_445_047_068e-1,
519    9.953_942_693_885_953e-1,
520    9.961_442_435_551_087e-1,
521    9.968_277_169_440_913e-1,
522    9.974_446_439_389_107e-1,
523    9.979_949_833_727_938e-1,
524    9.984_786_985_384_589e-1,
525    9.988_957_572_063_257e-1,
526    9.992_461_316_671_845e-1,
527    9.995_297_988_558_859e-1,
528    9.997_467_408_113_523e-1,
529    9.998_969_471_378_596e-1,
530    9.999_804_411_726_474e-1,
531];
532const GL_WEIGHTS: [f64; 384] = [
533    5.019_410_348_676_869_6e-5,
534    1.168_390_665_730_266_3e-4,
535    1.835_749_193_551_655_8e-4,
536    2.503_070_890_844_105e-4,
537    3.170_242_698_112_815e-4,
538    3.837_208_020_912_921_4e-4,
539    4.503_919_137_716_827e-4,
540    5.170_330_453_491_649e-4,
541    5.836_397_042_630_135e-4,
542    6.502_074_240_969_948e-4,
543    7.167_317_509_947_801e-4,
544    7.832_082_385_905_168e-4,
545    8.496_324_460_039_209e-4,
546    9.159_999_370_632_641e-4,
547    9.823_062_800_663_463e-4,
548    1.048_547_047_793_689_5e-3,
549    1.114_717_817_647_310_6e-3,
550    1.180_814_171_855_922e-3,
551    1.246_831_697_715_441_5e-3,
552    1.312_765_987_850_66e-3,
553    1.378_612_640_487_646_8e-3,
554    1.444_367_259_734_736e-3,
555    1.510_025_455_865_810_3e-3,
556    1.575_582_845_607_936_8e-3,
557    1.641_035_052_429_271_5e-3,
558    1.706_377_706_828_447_1e-3,
559    1.771_606_446_623_834_7e-3,
560    1.836_716_917_243_567_5e-3,
561    1.901_704_772_014_899_2e-3,
562    1.966_565_672_453_437e-3,
563    2.031_295_288_552_398_4e-3,
564    2.095_889_299_071_020_6e-3,
565    2.160_343_391_822_734_3e-3,
566    2.224_653_263_962_713e-3,
567    2.288_814_622_274_955e-3,
568    2.352_823_183_458_769e-3,
569    2.416_674_674_414_340_5e-3,
570    2.480_364_832_528_265_6e-3,
571    2.543_889_405_957_74e-3,
572    2.607_244_153_914_452e-3,
573    2.670_424_846_947_554e-3,
574    2.733_427_267_226_093_3e-3,
575    2.796_247_208_820_428e-3,
576    2.858_880_477_983_06e-3,
577    2.921_322_893_428_515_3e-3,
578    2.983_570_286_612_554_5e-3,
579    3.045_618_502_010_327_8e-3,
580    3.107_463_397_393_755_5e-3,
581    3.169_100_844_108_32e-3,
582    3.230_526_727_348_174e-3,
583    3.291_736_946_431_361e-3,
584    3.352_727_415_073_250_3e-3,
585    3.413_494_061_659_418_4e-3,
586    3.474_032_829_517_317e-3,
587    3.534_339_677_187_348_4e-3,
588    3.594_410_578_692_452e-3,
589    3.654_241_523_806_987e-3,
590    3.713_828_518_324_312_5e-3,
591    3.773_167_584_323_583_5e-3,
592    3.832_254_760_435_171e-3,
593    3.891_086_102_105_193_4e-3,
594    3.949_657_681_858_895e-3,
595    4.007_965_589_562_678e-3,
596    4.066_005_932_685_269e-3,
597    4.123_774_836_557_6e-3,
598    4.181_268_444_631_281e-3,
599    4.238_482_918_736_289e-3,
600    4.295_414_439_336_925e-3,
601    4.352_059_205_787_275e-3,
602    4.408_413_436_584_285e-3,
603    4.464_473_369_620_78e-3,
604    4.520_235_262_436_235e-3,
605    4.575_695_392_466_791e-3,
606    4.630_850_057_293_894e-3,
607    4.685_695_574_891_041e-3,
608    4.740_228_283_870_022e-3,
609    4.794_444_543_725_102e-3,
610    4.848_340_735_076_109e-3,
611    4.901_913_259_910_197e-3,
612    4.955_158_541_821_682_4e-3,
613    5.008_073_026_251_332e-3,
614    5.060_653_180_723_101_4e-3,
615    5.112_895_495_080_397e-3,
616    5.164_796_481_720_011e-3,
617    5.216_352_675_825_451e-3,
618    5.267_560_635_597_735e-3,
619    5.318_416_942_485_385e-3,
620    5.368_918_201_412_827e-3,
621    5.419_061_041_006_627e-3,
622    5.468_842_113_820_941e-3,
623    5.518_258_096_560_71e-3,
624    5.567_305_690_303_767e-3,
625    5.615_981_620_720_803e-3,
626    5.664_282_638_294_182e-3,
627    5.712_205_518_534_655e-3,
628    5.759_747_062_196_925_5e-3,
629    5.806_904_095_492_818e-3,
630    5.853_673_470_303_617_4e-3,
631    5.900_052_064_389_824e-3,
632    5.946_036_781_599_814e-3,
633    5.991_624_552_076_468e-3,
634    6.036_812_332_462_087e-3,
635    6.081_597_106_101_673e-3,
636    6.125_975_883_244_196e-3,
637    6.169_945_701_242_237e-3,
638    6.213_503_624_749_591e-3,
639    6.256_646_745_917_723e-3,
640    6.299_372_184_589_237e-3,
641    6.341_677_088_490_664e-3,
642    6.383_558_633_422_572e-3,
643    6.425_014_023_448_273e-3,
644    6.466_040_491_080_434e-3,
645    6.506_635_297_465_724e-3,
646    6.546_795_732_567_842_5e-3,
647    6.586_519_115_348_261e-3,
648    6.625_802_793_945_317e-3,
649    6.664_644_145_851_14e-3,
650    6.703_040_578_086_941e-3,
651    6.740_989_527_375_895e-3,
652    6.778_488_460_314_126e-3,
653    6.815_534_873_540_5e-3,
654    6.852_126_293_902_878e-3,
655    6.888_260_278_623_754e-3,
656    6.923_934_415_463_31e-3,
657    6.959_146_322_880_146_5e-3,
658    6.993_893_650_190_702e-3,
659    7.028_174_077_725_734e-3,
660    7.061_985_316_985_506e-3,
661    7.095_325_110_792_439e-3,
662    7.128_191_233_441_844e-3,
663    7.160_581_490_850_321e-3,
664    7.192_493_720_702_486e-3,
665    7.223_925_792_595_309e-3,
666    7.254_875_608_179_984e-3,
667    7.285_341_101_302_512e-3,
668    7.315_320_238_141_324_5e-3,
669    7.344_811_017_343_063e-3,
670    7.373_811_470_156_258e-3,
671    7.402_319_660_562_818e-3,
672    7.430_333_685_407_178e-3,
673    7.457_851_674_523_319e-3,
674    7.484_871_790_859_79e-3,
675    7.511_392_230_602_079e-3,
676    7.537_411_223_293_362e-3,
677    7.562_927_031_952_382e-3,
678    7.587_937_953_189_561_5e-3,
679    7.612_442_317_320_796e-3,
680    7.636_438_488_478_739e-3,
681    7.659_924_864_722_064e-3,
682    7.682_899_878_142_539e-3,
683    7.705_361_994_969_524e-3,
684    7.727_309_715_672_44e-3,
685    7.748_741_575_060_914e-3,
686    7.769_656_142_382_462e-3,
687    7.790_052_021_418_226e-3,
688    7.809_927_850_575_903e-3,
689    7.829_282_302_980_82e-3,
690    7.848_114_086_564_56e-3,
691    7.866_421_944_151_094e-3,
692    7.884_204_653_540_665e-3,
693    7.901_461_027_591_6e-3,
694    7.918_189_914_299_318e-3,
695    7.934_390_196_873_448e-3,
696    7.950_060_793_812_204e-3,
697    7.965_200_658_974_709e-3,
698    7.979_808_781_650_77e-3,
699    7.993_884_186_628_266e-3,
700    8.007_425_934_258_548e-3,
701    8.020_433_120_518_866e-3,
702    8.032_904_877_072_8e-3,
703    8.044_840_371_328_26e-3,
704    8.056_238_806_493_175e-3,
705    8.067_099_421_628_42e-3,
706    8.077_421_491_698_82e-3,
707    8.087_204_327_621_594e-3,
708    8.096_447_276_312_202e-3,
709    8.105_149_720_727_933e-3,
710    8.113_311_079_909_208e-3,
711    8.120_930_809_018_415e-3,
712    8.128_008_399_376_085e-3,
713    8.134_543_378_495_033e-3,
714    8.140_535_310_111_77e-3,
715    8.145_983_794_215_77e-3,
716    8.150_888_467_075_875e-3,
717    8.155_249_001_265_092e-3,
718    8.159_065_105_681_899e-3,
719    8.162_336_525_570_1e-3,
720    8.165_063_042_535_465e-3,
721    8.167_244_474_560_707e-3,
722    8.168_880_676_017_344e-3,
723    8.169_971_537_675_47e-3,
724    8.170_516_986_711_104e-3,
725    8.170_516_986_711_104e-3,
726    8.169_971_537_675_47e-3,
727    8.168_880_676_017_344e-3,
728    8.167_244_474_560_707e-3,
729    8.165_063_042_535_465e-3,
730    8.162_336_525_570_1e-3,
731    8.159_065_105_681_899e-3,
732    8.155_249_001_265_092e-3,
733    8.150_888_467_075_875e-3,
734    8.145_983_794_215_77e-3,
735    8.140_535_310_111_77e-3,
736    8.134_543_378_495_033e-3,
737    8.128_008_399_376_085e-3,
738    8.120_930_809_018_415e-3,
739    8.113_311_079_909_208e-3,
740    8.105_149_720_727_933e-3,
741    8.096_447_276_312_202e-3,
742    8.087_204_327_621_594e-3,
743    8.077_421_491_698_82e-3,
744    8.067_099_421_628_42e-3,
745    8.056_238_806_493_175e-3,
746    8.044_840_371_328_26e-3,
747    8.032_904_877_072_8e-3,
748    8.020_433_120_518_866e-3,
749    8.007_425_934_258_548e-3,
750    7.993_884_186_628_266e-3,
751    7.979_808_781_650_77e-3,
752    7.965_200_658_974_709e-3,
753    7.950_060_793_812_204e-3,
754    7.934_390_196_873_448e-3,
755    7.918_189_914_299_318e-3,
756    7.901_461_027_591_6e-3,
757    7.884_204_653_540_665e-3,
758    7.866_421_944_151_094e-3,
759    7.848_114_086_564_56e-3,
760    7.829_282_302_980_82e-3,
761    7.809_927_850_575_903e-3,
762    7.790_052_021_418_226e-3,
763    7.769_656_142_382_462e-3,
764    7.748_741_575_060_914e-3,
765    7.727_309_715_672_44e-3,
766    7.705_361_994_969_524e-3,
767    7.682_899_878_142_539e-3,
768    7.659_924_864_722_064e-3,
769    7.636_438_488_478_739e-3,
770    7.612_442_317_320_796e-3,
771    7.587_937_953_189_561_5e-3,
772    7.562_927_031_952_382e-3,
773    7.537_411_223_293_362e-3,
774    7.511_392_230_602_079e-3,
775    7.484_871_790_859_79e-3,
776    7.457_851_674_523_319e-3,
777    7.430_333_685_407_178e-3,
778    7.402_319_660_562_818e-3,
779    7.373_811_470_156_258e-3,
780    7.344_811_017_343_063e-3,
781    7.315_320_238_141_324_5e-3,
782    7.285_341_101_302_512e-3,
783    7.254_875_608_179_984e-3,
784    7.223_925_792_595_309e-3,
785    7.192_493_720_702_486e-3,
786    7.160_581_490_850_321e-3,
787    7.128_191_233_441_844e-3,
788    7.095_325_110_792_439e-3,
789    7.061_985_316_985_506e-3,
790    7.028_174_077_725_734e-3,
791    6.993_893_650_190_702e-3,
792    6.959_146_322_880_146_5e-3,
793    6.923_934_415_463_31e-3,
794    6.888_260_278_623_754e-3,
795    6.852_126_293_902_878e-3,
796    6.815_534_873_540_5e-3,
797    6.778_488_460_314_126e-3,
798    6.740_989_527_375_895e-3,
799    6.703_040_578_086_941e-3,
800    6.664_644_145_851_14e-3,
801    6.625_802_793_945_317e-3,
802    6.586_519_115_348_261e-3,
803    6.546_795_732_567_842_5e-3,
804    6.506_635_297_465_724e-3,
805    6.466_040_491_080_434e-3,
806    6.425_014_023_448_273e-3,
807    6.383_558_633_422_572e-3,
808    6.341_677_088_490_664e-3,
809    6.299_372_184_589_237e-3,
810    6.256_646_745_917_723e-3,
811    6.213_503_624_749_591e-3,
812    6.169_945_701_242_237e-3,
813    6.125_975_883_244_196e-3,
814    6.081_597_106_101_673e-3,
815    6.036_812_332_462_087e-3,
816    5.991_624_552_076_468e-3,
817    5.946_036_781_599_814e-3,
818    5.900_052_064_389_824e-3,
819    5.853_673_470_303_617_4e-3,
820    5.806_904_095_492_818e-3,
821    5.759_747_062_196_925_5e-3,
822    5.712_205_518_534_655e-3,
823    5.664_282_638_294_182e-3,
824    5.615_981_620_720_803e-3,
825    5.567_305_690_303_767e-3,
826    5.518_258_096_560_71e-3,
827    5.468_842_113_820_941e-3,
828    5.419_061_041_006_627e-3,
829    5.368_918_201_412_827e-3,
830    5.318_416_942_485_385e-3,
831    5.267_560_635_597_735e-3,
832    5.216_352_675_825_451e-3,
833    5.164_796_481_720_011e-3,
834    5.112_895_495_080_397e-3,
835    5.060_653_180_723_101_4e-3,
836    5.008_073_026_251_332e-3,
837    4.955_158_541_821_682_4e-3,
838    4.901_913_259_910_197e-3,
839    4.848_340_735_076_109e-3,
840    4.794_444_543_725_102e-3,
841    4.740_228_283_870_022e-3,
842    4.685_695_574_891_041e-3,
843    4.630_850_057_293_894e-3,
844    4.575_695_392_466_791e-3,
845    4.520_235_262_436_235e-3,
846    4.464_473_369_620_78e-3,
847    4.408_413_436_584_285e-3,
848    4.352_059_205_787_275e-3,
849    4.295_414_439_336_925e-3,
850    4.238_482_918_736_289e-3,
851    4.181_268_444_631_281e-3,
852    4.123_774_836_557_6e-3,
853    4.066_005_932_685_269e-3,
854    4.007_965_589_562_678e-3,
855    3.949_657_681_858_895e-3,
856    3.891_086_102_105_193_4e-3,
857    3.832_254_760_435_171e-3,
858    3.773_167_584_323_583_5e-3,
859    3.713_828_518_324_312_5e-3,
860    3.654_241_523_806_987e-3,
861    3.594_410_578_692_452e-3,
862    3.534_339_677_187_348_4e-3,
863    3.474_032_829_517_317e-3,
864    3.413_494_061_659_418_4e-3,
865    3.352_727_415_073_250_3e-3,
866    3.291_736_946_431_361e-3,
867    3.230_526_727_348_174e-3,
868    3.169_100_844_108_32e-3,
869    3.107_463_397_393_755_5e-3,
870    3.045_618_502_010_327_8e-3,
871    2.983_570_286_612_554_5e-3,
872    2.921_322_893_428_515_3e-3,
873    2.858_880_477_983_06e-3,
874    2.796_247_208_820_428e-3,
875    2.733_427_267_226_093_3e-3,
876    2.670_424_846_947_554e-3,
877    2.607_244_153_914_452e-3,
878    2.543_889_405_957_74e-3,
879    2.480_364_832_528_265_6e-3,
880    2.416_674_674_414_340_5e-3,
881    2.352_823_183_458_769e-3,
882    2.288_814_622_274_955e-3,
883    2.224_653_263_962_713e-3,
884    2.160_343_391_822_734_3e-3,
885    2.095_889_299_071_020_6e-3,
886    2.031_295_288_552_398_4e-3,
887    1.966_565_672_453_437e-3,
888    1.901_704_772_014_899_2e-3,
889    1.836_716_917_243_567_5e-3,
890    1.771_606_446_623_834_7e-3,
891    1.706_377_706_828_447_1e-3,
892    1.641_035_052_429_271_5e-3,
893    1.575_582_845_607_936_8e-3,
894    1.510_025_455_865_810_3e-3,
895    1.444_367_259_734_736e-3,
896    1.378_612_640_487_646_8e-3,
897    1.312_765_987_850_66e-3,
898    1.246_831_697_715_441_5e-3,
899    1.180_814_171_855_922e-3,
900    1.114_717_817_647_310_6e-3,
901    1.048_547_047_793_689_5e-3,
902    9.823_062_800_663_463e-4,
903    9.159_999_370_632_641e-4,
904    8.496_324_460_039_209e-4,
905    7.832_082_385_905_168e-4,
906    7.167_317_509_947_801e-4,
907    6.502_074_240_969_948e-4,
908    5.836_397_042_630_135e-4,
909    5.170_330_453_491_649e-4,
910    4.503_919_137_716_827e-4,
911    3.837_208_020_912_921_4e-4,
912    3.170_242_698_112_815e-4,
913    2.503_070_890_844_105e-4,
914    1.835_749_193_551_655_8e-4,
915    1.168_390_665_730_266_3e-4,
916    5.019_410_348_676_869_6e-5,
917];
918
/// Which exact-evaluation branch a cell falls into.
// NOTE(review): the module commentary describes the affine anchor as
// `c2 = c3 = 0`. The exact criteria separating `Quartic` from `Sextic` are
// applied by the classification code outside this view; the names presumably
// refer to the degree of `q(z)`. Confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExactCellBranch {
    Affine,
    Quartic,
    Sextic,
}
925
926/// Auto-tune the per-cell affine/non-affine branch tolerance from the cell's
927/// own coefficient magnitudes.
928///
929/// The legacy `branch_cell` compared the normalized cubic coefficients
930/// `(k2, k3)` against a single global constant.  That constant is calibrated
931/// for cells whose anchor coefficients `(c0, c1)` are O(1).  When the anchor
932/// dominates — e.g. a tail cell with `|c0|, |c1| >> 1` — a relative criterion
933/// against the anchor magnitude is more numerically meaningful than the bare
934/// global threshold, because the affine contribution to `eta` already absorbs
935/// any difference at the chosen scale.
936///
937/// The returned tolerance is always at least [`NORMALIZED_CELL_BRANCH_TOL`],
938/// so cells with O(1) anchors recover bit-identical classification with the
939/// legacy code path.  This preserves numerical equivalence for the
940/// established `cubic_cell_kernel` tests, including the
941/// `tuned_branch_tolerance_matches_legacy_non_affine_transport_grid` grid.
942#[inline]
943fn effective_branch_tol(cell: DenestedCubicCell) -> f64 {
944    let anchor_scale = cell.c0.abs().max(cell.c1.abs()).max(1.0);
945    NORMALIZED_CELL_BRANCH_TOL * anchor_scale
946}
947
/// One cell of the de-nested cubic: the bounds `left`/`right` in `z` (either
/// may be infinite for a tail cell) and the coefficients of
/// `eta(z) = c0 + c1·z + c2·z² + c3·z³`.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedCubicCell {
    pub left: f64,
    pub right: f64,
    pub c0: f64,
    pub c1: f64,
    pub c2: f64,
    pub c3: f64,
}

impl DenestedCubicCell {
    /// Evaluates the cell cubic `eta` at `z`.
    ///
    /// The terms are accumulated in ascending-degree order with the powers
    /// formed by repeated multiplication; the order is part of the numerical
    /// contract and must not be rearranged (e.g. into Horner form).
    #[inline]
    pub fn eta(self, z: f64) -> f64 {
        let Self { c0, c1, c2, c3, .. } = self;
        let linear = c1 * z;
        let quadratic = c2 * z * z;
        let cubic = c3 * z * z * z;
        c0 + linear + quadratic + cubic
    }

    /// Evaluates `q(z) = ½·(z² + eta(z)²)`.
    #[inline]
    pub fn q(self, z: f64) -> f64 {
        let eta_at_z = self.eta(z);
        let squared_norm = z * z + eta_at_z * eta_at_z;
        0.5 * squared_norm
    }
}
970
/// Fingerprint of a de-nested cubic cell as built by
/// [`cell_moment_fingerprint`]: the six per-coordinate words plus a 64-bit
/// digest of them.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentFingerprint {
    /// Digest of `bins` produced by `mix_fingerprint_words`.
    pub hash: u64,
    /// Words for `(left, right, c0, c1, c2, c3)`, in that order, as returned
    /// by `quantized_cell_word`.
    bins: [u64; 6],
}
976
/// Cache key pairing a cell fingerprint with the requested moment degree;
/// built by [`cell_moment_cache_key`].
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentCacheKey {
    /// Geometric fingerprint of the cell.
    pub fingerprint: CellMomentFingerprint,
    /// The `max_degree` the key was built for.
    pub max_degree: usize,
}
982
/// Plain counters describing cell-moment dedup lookups.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct CellMomentDedupStats {
    pub lookups: u64,
    pub hits: u64,
    pub misses: u64,
}

impl CellMomentDedupStats {
    /// Fraction `hits / lookups`, or `0.0` when no lookup was recorded.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.lookups {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1000
/// Local shorthand for `gam_linalg::utils::splitmix64_hash`, used to scramble
/// each fingerprint word before it is folded into the digest.
#[inline]
fn splitmix64(x: u64) -> u64 {
    gam_linalg::utils::splitmix64_hash(x)
}
1005
1006#[inline]
1007fn mix_fingerprint_words(words: &[u64]) -> u64 {
1008    let mut h = 0xcbf2_9ce4_8422_2325u64;
1009    for &word in words {
1010        h ^= splitmix64(word);
1011        h = h.wrapping_mul(0x100_0000_01b3);
1012    }
1013    h
1014}
1015
/// Maps one cell coordinate to the 64-bit word used in a cell fingerprint.
///
/// * When `epsilon` is not a positive finite number (zero, negative,
///   infinite or NaN), or when `x` itself is not finite, the word is the
///   exact IEEE-754 bit pattern of `x`.
/// * Otherwise `x` is binned to the nearest multiple of `epsilon` and the
///   word is the bit pattern of the (integer-valued) bin index
///   `round(x / epsilon)`.
///
/// The zero bin is canonicalized to `+0.0`: rounding a small negative
/// quotient produces `-0.0`, whose bit pattern differs from `+0.0`, which
/// would otherwise give two different words to values that lie in the same
/// bin on opposite sides of zero.
#[inline]
fn quantized_cell_word(x: f64, epsilon: f64) -> u64 {
    // `epsilon > 0.0` is false for NaN, zero (of either sign) and negatives.
    let binning_enabled = epsilon.is_finite() && epsilon > 0.0 && x.is_finite();
    if !binning_enabled {
        return x.to_bits();
    }
    let bin = (x / epsilon).round();
    // `-0.0 == 0.0` is true, so this collapses both zeros onto `+0.0`.
    let bin = if bin == 0.0 { 0.0 } else { bin };
    bin.to_bits()
}
1023
1024/// Returns a deterministic geometric fingerprint for a de-nested cubic cell.
1025///
1026/// With `epsilon == 0.0`, each coordinate is represented by its exact IEEE-754
1027/// bit pattern, so equal fingerprints imply bit-equal `(left, right, c0, c1,
1028/// c2, c3)` tuples.  With `epsilon > 0`, finite coordinates are binned to the
1029/// nearest multiple of `epsilon`; callers should treat this as an approximate
1030/// cache key and validate the resulting model error for their data.
1031pub fn cell_moment_fingerprint(cell: DenestedCubicCell, epsilon: f64) -> CellMomentFingerprint {
1032    let bins = [
1033        quantized_cell_word(cell.left, epsilon),
1034        quantized_cell_word(cell.right, epsilon),
1035        quantized_cell_word(cell.c0, epsilon),
1036        quantized_cell_word(cell.c1, epsilon),
1037        quantized_cell_word(cell.c2, epsilon),
1038        quantized_cell_word(cell.c3, epsilon),
1039    ];
1040    CellMomentFingerprint {
1041        hash: mix_fingerprint_words(&bins),
1042        bins,
1043    }
1044}
1045
1046#[inline]
1047pub fn cell_moment_cache_key(
1048    cell: DenestedCubicCell,
1049    max_degree: usize,
1050    epsilon: f64,
1051) -> CellMomentCacheKey {
1052    CellMomentCacheKey {
1053        fingerprint: cell_moment_fingerprint(cell, epsilon),
1054        max_degree,
1055    }
1056}
1057
/// A [`DenestedCubicCell`] bundled with the two [`LocalSpanCubic`] spans and
/// the boundary provenance that identify where the cell came from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedPartitionCell {
    /// The de-nested cubic cell itself (bounds and `eta` coefficients).
    pub cell: DenestedCubicCell,
    /// NOTE(review): presumably the score-spline span this cell lies in —
    /// confirm at the construction site.
    pub score_span: LocalSpanCubic,
    /// NOTE(review): presumably the link-spline span this cell lies in —
    /// confirm at the construction site.
    pub link_span: LocalSpanCubic,
    /// Provenance of the cell's boundaries: a fixed z location (score break
    /// or ±∞ tail) or a link-knot crossing `z = (τ - a)/b`. Together with
    /// `(score_span, link_span)` this identifies the cell's two-parameter
    /// family in `(a, b)` across rows (see
    /// [`crate::cell_moment_family`]).
    pub left_edge: PartitionEdge,
    /// Provenance of the right boundary; same conventions as `left_edge`.
    pub right_edge: PartitionEdge,
}

// Intentionally empty here: no inherent methods are defined in this block.
impl DenestedPartitionCell {}
1073
/// Where one boundary of a denested partition cell comes from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PartitionEdge {
    /// A z location that does not depend on the row scalars: a score-spline
    /// break, or ±∞ for tail cells.
    Fixed(f64),
    /// A link-knot crossing: for a row with scalars `(a, b)` the boundary
    /// sits at `z = (τ - a)/b`.
    Crossing { tau: f64 },
}

impl PartitionEdge {
    /// Resolves the boundary to its z location for the row scalars `(a, b)`.
    ///
    /// `Fixed` edges ignore both scalars; `Crossing` edges evaluate
    /// `(tau - a) / b` with plain floating-point division (no guard on `b`).
    #[inline]
    pub fn z_at(self, a: f64, b: f64) -> f64 {
        match self {
            Self::Crossing { tau: knot } => {
                let offset = knot - a;
                offset / b
            }
            Self::Fixed(location) => location,
        }
    }
}
1095
/// Exact-bits key for the affine-tail moment memo, built by
/// `tail_cell_cache_key`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct TailCellMomentCacheKey {
    /// `c0.to_bits()` of the cell.
    c0_bits: u64,
    /// `c1.to_bits()` of the cell.
    c1_bits: u64,
    /// Bit pattern of the tail's single finite endpoint.
    endpoint_bits: u64,
    /// `-1` for a tail open on the left (the finite endpoint is `right`),
    /// `1` for a tail open on the right (the finite endpoint is `left`).
    side: i8,
    /// Moment degree the entry was computed for.
    max_degree: usize,
}
1104
/// Byte budget (64 MiB) handed to the tail-cell memo's `ByteLruCache`.
const TAIL_CELL_MOMENT_CACHE_MAX_BYTES: usize = 64 * 1024 * 1024;
/// Entry-count budget handed to the tail-cell memo's `ByteLruCache`.
const TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES: usize = 262_144;
1107
/// Point-in-time usage counters of a [`TailCellMomentCache`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct TailCellMomentCacheStats {
    pub hits: usize,
    pub misses: usize,
    pub entries: usize,
}

impl TailCellMomentCacheStats {
    /// Total number of counted requests, `hits + misses`.
    #[inline]
    pub fn requests(self) -> usize {
        let Self { hits, misses, .. } = self;
        hits + misses
    }

    /// Fraction of counted requests that were hits; `0.0` when nothing has
    /// been counted yet.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.requests() {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1131
/// Affine-tail cell-moment memo.
///
/// Stand-alone instances (`TailCellMomentCache::new()`) are useful when a
/// caller needs deterministic hit/miss bookkeeping that is not polluted by
/// concurrent traffic on the global memo. The production path uses the
/// global instance behind [`evaluate_cell_moments`].
///
/// All methods take `&self`: the LRU is internally synchronized (sharded for
/// the concurrent global memo) and the counters are atomics, so the global
/// instance needs no outer `Mutex`. The previous `OnceLock<Mutex<…>>` wrapper
/// serialized every tail-cell evaluation across all rayon workers of the
/// marginal-slope exact-cache build — the same contention class the sharded
/// per-family cell-moment LRU fix removed.
#[derive(Debug)]
pub struct TailCellMomentCache {
    /// LRU store from tail-cell key to the computed moment state.
    moments: ByteLruCache<TailCellMomentCacheKey, CellMomentState>,
    /// Number of `evaluate` calls answered from `moments`.
    hits: std::sync::atomic::AtomicUsize,
    /// Number of `evaluate` calls that computed the value and inserted it.
    misses: std::sync::atomic::AtomicUsize,
}
1151
1152impl Default for TailCellMomentCache {
1153    fn default() -> Self {
1154        // Tail-cell entries are small (a short moment vector), so sharding
1155        // the byte/entry budgets is harmless; size the shard count off the
1156        // worker pool exactly like the per-family cell-moment LRU.
1157        let shard_count = std::thread::available_parallelism()
1158            .map(|workers| workers.get().saturating_mul(8))
1159            .unwrap_or(32)
1160            .clamp(8, 256);
1161        Self {
1162            moments: ByteLruCache::with_max_entries_sharded(
1163                TAIL_CELL_MOMENT_CACHE_MAX_BYTES,
1164                TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES,
1165                shard_count,
1166            ),
1167            hits: std::sync::atomic::AtomicUsize::new(0),
1168            misses: std::sync::atomic::AtomicUsize::new(0),
1169        }
1170    }
1171}
1172
1173impl TailCellMomentCache {
1174    /// Construct an empty cache. Hits/misses start at zero.
1175    #[inline]
1176    pub fn new() -> Self {
1177        Self::default()
1178    }
1179
1180    /// Reset the cache to its empty state. Existing entries are dropped and
1181    /// the hit/miss counters are zeroed.
1182    #[inline]
1183    pub fn clear(&self) {
1184        self.moments.clear();
1185        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
1186        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
1187    }
1188
1189    /// Snapshot of the cache's current usage stats.
1190    #[inline]
1191    pub fn stats(&self) -> TailCellMomentCacheStats {
1192        TailCellMomentCacheStats {
1193            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
1194            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
1195            entries: self.moments.len(),
1196        }
1197    }
1198
1199    /// Look up `cell` at `max_degree`, computing and inserting the result on
1200    /// miss. Cells outside the affine-tail keyset bypass the cache and run
1201    /// the uncached evaluator directly without touching the counters.
1202    ///
1203    /// Stat semantics: every cache hit increments `hits`; a **miss** is
1204    /// counted when this call computed the value itself. Under concurrent
1205    /// access two workers racing on the same cold key may both count a miss
1206    /// (each computes the identical pure-function value); single-threaded
1207    /// bookkeeping is exact.
1208    pub fn evaluate(
1209        &self,
1210        cell: DenestedCubicCell,
1211        max_degree: usize,
1212    ) -> Result<CellMomentState, String> {
1213        let Some(key) = tail_cell_cache_key(cell, max_degree) else {
1214            return evaluate_cell_moments_uncached(cell, max_degree);
1215        };
1216        if let Some(state) = self.moments.get(&key) {
1217            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1218            return Ok(state);
1219        }
1220        let state = evaluate_cell_moments_uncached(cell, max_degree)?;
1221        self.misses
1222            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1223        self.moments.insert(key, state.clone());
1224        Ok(state)
1225    }
1226}
1227
/// Process-wide tail-cell memo; constructed lazily on first access through
/// `tail_cell_moment_cache`.
static TAIL_CELL_MOMENT_CACHE: std::sync::OnceLock<TailCellMomentCache> =
    std::sync::OnceLock::new();
/// Global on/off flag for the tail-cell memo. Starts enabled and is written
/// by `set_tail_cell_moment_cache_enabled`.
/// NOTE(review): the code that reads this flag is outside this section.
static TAIL_CELL_MOMENT_CACHE_ENABLED: std::sync::atomic::AtomicBool =
    std::sync::atomic::AtomicBool::new(true);
1232
1233fn tail_cell_moment_cache() -> &'static TailCellMomentCache {
1234    TAIL_CELL_MOMENT_CACHE.get_or_init(TailCellMomentCache::default)
1235}
1236
1237#[inline]
1238fn tail_cell_cache_key(
1239    cell: DenestedCubicCell,
1240    max_degree: usize,
1241) -> Option<TailCellMomentCacheKey> {
1242    if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL {
1243        return None;
1244    }
1245    match (!cell.left.is_finite(), !cell.right.is_finite()) {
1246        (true, false) if cell.right.is_finite() => Some(TailCellMomentCacheKey {
1247            c0_bits: cell.c0.to_bits(),
1248            c1_bits: cell.c1.to_bits(),
1249            endpoint_bits: cell.right.to_bits(),
1250            side: -1,
1251            max_degree,
1252        }),
1253        (false, true) if cell.left.is_finite() => Some(TailCellMomentCacheKey {
1254            c0_bits: cell.c0.to_bits(),
1255            c1_bits: cell.c1.to_bits(),
1256            endpoint_bits: cell.left.to_bits(),
1257            side: 1,
1258            max_degree,
1259        }),
1260        _ => None,
1261    }
1262}
1263
/// Stores `enabled` into the global tail-cell memo flag (relaxed ordering).
///
/// NOTE(review): the flag's reader is outside this section — presumably the
/// cached evaluation entry point consults it; confirm there.
pub fn set_tail_cell_moment_cache_enabled(enabled: bool) {
    TAIL_CELL_MOMENT_CACHE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed);
}
1267
/// Clears the global tail-cell memo: drops its entries and zeroes its
/// hit/miss counters (see [`TailCellMomentCache::clear`]).
pub fn reset_tail_cell_moment_cache() {
    tail_cell_moment_cache().clear();
}
1271
/// Returns a snapshot of the global tail-cell memo's counters and entry
/// count (see [`TailCellMomentCache::stats`]).
pub fn tail_cell_moment_cache_stats() -> TailCellMomentCacheStats {
    tail_cell_moment_cache().stats()
}
1275
/// Exact-bits identity of a [`DenestedCubicCell`]: every field holds the
/// IEEE-754 bit pattern (`f64::to_bits`) of the like-named cell field.
/// Equality and hashing are implemented by hand below over all six words.
#[derive(Clone, Copy, Debug, Eq)]
pub struct CellFingerprint {
    c0: u64,
    c1: u64,
    c2: u64,
    c3: u64,
    left: u64,
    right: u64,
}
1285
1286impl CellFingerprint {
1287    #[inline]
1288    pub fn new(cell: DenestedCubicCell) -> Self {
1289        Self {
1290            c0: cell.c0.to_bits(),
1291            c1: cell.c1.to_bits(),
1292            c2: cell.c2.to_bits(),
1293            c3: cell.c3.to_bits(),
1294            left: cell.left.to_bits(),
1295            right: cell.right.to_bits(),
1296        }
1297    }
1298}
1299
1300impl PartialEq for CellFingerprint {
1301    #[inline]
1302    fn eq(&self, other: &Self) -> bool {
1303        self.c0 == other.c0
1304            && self.c1 == other.c1
1305            && self.c2 == other.c2
1306            && self.c3 == other.c3
1307            && self.left == other.left
1308            && self.right == other.right
1309    }
1310}
1311
1312impl Hash for CellFingerprint {
1313    #[inline]
1314    fn hash<H: Hasher>(&self, state: &mut H) {
1315        self.c0.hash(state);
1316        self.c1.hash(state);
1317        self.c2.hash(state);
1318        self.c3.hash(state);
1319        self.left.hash(state);
1320        self.right.hash(state);
1321    }
1322}
1323
/// LRU payload for one cell: optionally its value moments and optionally its
/// derivative moments, each behind an `Arc`. `Default` yields an entry with
/// neither populated.
#[derive(Clone, Debug, Default, PartialEq)]
pub struct CachedCellMoments {
    /// Regular (value) cell moments, populated by
    /// `evaluate_cell_moments_cached`. None when only derivative moments
    /// have been cached for this cell. Wrapped in `Arc` so `ByteLruCache`
    /// returns lookups through cheap refcount bumps instead of deep-cloning
    /// the inline `SmallVec<[f64; 10]>` (which spills on every degree-`>= 10`
    /// request) on every hot-path LRU hit.
    state: Option<Arc<CellMomentState>>,
    /// Derivative moments, populated by
    /// `evaluate_cell_derivative_moments_cached`. None when only value
    /// moments have been cached for this cell. Both variants share the
    /// same `CellFingerprint` key so derivative-only callers do not evict
    /// pre-cached value entries and vice versa. Same `Arc` wrapping rationale
    /// as `state` above.
    derivative_state: Option<Arc<CellDerivativeMomentState>>,
}
1341
1342impl CachedCellMoments {
1343    #[inline]
1344    pub fn new(state: Arc<CellMomentState>) -> Self {
1345        Self {
1346            state: Some(state),
1347            derivative_state: None,
1348        }
1349    }
1350
1351    #[inline]
1352    pub fn new_derivative(state: Arc<CellDerivativeMomentState>) -> Self {
1353        Self {
1354            state: None,
1355            derivative_state: Some(state),
1356        }
1357    }
1358
1359    #[inline]
1360    pub fn state_for_degree(&self, max_degree: usize) -> Option<CellMomentState> {
1361        let state = self.state.as_ref()?;
1362        if state.moments.len().saturating_sub(1) < max_degree {
1363            return None;
1364        }
1365        // Cached `Arc<CellMomentState>` is shared across LRU hits, so we
1366        // cannot reuse the inner vector in place. Clone the underlying state
1367        // and (rarely) truncate down to the requested degree to honour the
1368        // public moment-length contract.
1369        let mut state = (**state).clone();
1370        state.moments.truncate(max_degree + 1);
1371        Some(state)
1372    }
1373
1374    #[inline]
1375    pub fn derivative_state_for_degree(
1376        &self,
1377        max_degree: usize,
1378    ) -> Option<CellDerivativeMomentState> {
1379        let state = self.derivative_state.as_ref()?;
1380        if state.moments.len().saturating_sub(1) < max_degree {
1381            return None;
1382        }
1383        // See `state_for_degree`: shared `Arc` forces an inner clone here.
1384        let mut state = (**state).clone();
1385        state.moments.truncate(max_degree + 1);
1386        Some(state)
1387    }
1388
1389    #[inline]
1390    pub fn with_value(mut self, state: Arc<CellMomentState>) -> Self {
1391        self.state = Some(state);
1392        self
1393    }
1394
1395    #[inline]
1396    pub fn with_derivative(mut self, state: Arc<CellDerivativeMomentState>) -> Self {
1397        self.derivative_state = Some(state);
1398        self
1399    }
1400}
1401
1402impl ResidentBytes for CachedCellMoments {
1403    fn resident_bytes(&self) -> usize {
1404        let value_bytes = self
1405            .state
1406            .as_ref()
1407            .map_or(0, |state| state.resident_bytes());
1408        let derivative_bytes = self
1409            .derivative_state
1410            .as_ref()
1411            .map_or(0, |state| state.resident_bytes());
1412        std::mem::size_of::<Self>()
1413            .saturating_add(value_bytes)
1414            .saturating_add(derivative_bytes)
1415    }
1416}
1417
/// Atomic hit/miss counters for a cell-moment cache.
#[derive(Debug, Default)]
pub struct CellMomentCacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CellMomentCacheStats {
    /// Current `(hits, misses)`, each read with relaxed ordering.
    #[inline]
    pub fn snapshot(&self) -> (u64, u64) {
        let hits = self.hits.load(Ordering::Relaxed);
        let misses = self.misses.load(Ordering::Relaxed);
        (hits, misses)
    }

    /// Hits and misses accumulated since the earlier snapshot `before`,
    /// together with the hit rate over that window.
    ///
    /// Both differences saturate at zero, and the rate is `0.0` when the
    /// window contains no traffic.
    #[inline]
    pub fn hit_rate_delta(&self, before: (u64, u64)) -> (u64, u64, f64) {
        let (hits_before, misses_before) = before;
        let (hits_now, misses_now) = self.snapshot();
        let new_hits = hits_now.saturating_sub(hits_before);
        let new_misses = misses_now.saturating_sub(misses_before);
        let rate = match new_hits + new_misses {
            0 => 0.0,
            total => new_hits as f64 / total as f64,
        };
        (new_hits, new_misses, rate)
    }
}
1447
/// LRU cache from an exact [`CellFingerprint`] to its [`CachedCellMoments`].
pub type CellMomentLruCache = ByteLruCache<CellFingerprint, CachedCellMoments>;

/// Number of moments a [`CellMomentVec`] can hold inline before it spills to
/// the heap.
pub const CELL_MOMENT_INLINE_CAPACITY: usize = 10;

/// Moment vector with inline storage for [`CELL_MOMENT_INLINE_CAPACITY`]
/// values.
pub type CellMomentVec = SmallVec<[f64; CELL_MOMENT_INLINE_CAPACITY]>;
1453
/// Owned result of a cell-moment evaluation.
#[derive(Clone, Debug, PartialEq)]
pub struct CellMomentState {
    /// Branch the evaluation was classified under.
    pub branch: ExactCellBranch,
    /// NOTE(review): scalar cell value produced by the evaluator; its exact
    /// definition is not visible in this section — confirm at the producer.
    pub value: f64,
    /// Moments indexed by degree; a vector of length `n` serves requests up
    /// to `max_degree = n - 1`.
    pub moments: CellMomentVec,
}
1460
1461impl ResidentBytes for CellMomentState {
1462    fn resident_bytes(&self) -> usize {
1463        let spilled_bytes = if self.moments.spilled() {
1464            self.moments
1465                .capacity()
1466                .saturating_mul(std::mem::size_of::<f64>())
1467        } else {
1468            0
1469        };
1470        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1471    }
1472}
1473
/// Owned result of a cell derivative-moment evaluation; like
/// [`CellMomentState`] but without the scalar `value`.
#[derive(Clone, Debug, PartialEq)]
pub struct CellDerivativeMomentState {
    /// Branch the evaluation was classified under.
    pub branch: ExactCellBranch,
    /// Derivative moments indexed by degree.
    pub moments: CellMomentVec,
}
1479
1480impl ResidentBytes for CellDerivativeMomentState {
1481    fn resident_bytes(&self) -> usize {
1482        let spilled_bytes = if self.moments.spilled() {
1483            self.moments
1484                .capacity()
1485                .saturating_mul(std::mem::size_of::<f64>())
1486        } else {
1487            0
1488        };
1489        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1490    }
1491}
1492
/// Borrowed counterpart of [`CellMomentState`]: the same `branch` and
/// `value`, with the moments exposed as a slice instead of an owned vector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CellMomentStateRef<'a> {
    pub branch: ExactCellBranch,
    pub value: f64,
    pub moments: &'a [f64],
}
1499
/// Reusable scratch space for moment evaluation; `prepare_moments` hands out
/// a zero-filled slice of the requested length backed by this buffer.
#[derive(Clone, Debug)]
pub struct CellMomentScratch {
    /// Backing buffer, resized and zeroed on every `prepare_moments` call.
    moments: Vec<f64>,
}
1504
1505impl Default for CellMomentScratch {
1506    fn default() -> Self {
1507        // Pre-size to the codebase's max moment degree so steady-state
1508        // `prepare_moments` calls never reallocate. Calls with `len`
1509        // exceeding this still reserve lazily.
1510        Self {
1511            moments: Vec::with_capacity(MAX_AFFINE_ANCHOR_DEGREE + 1),
1512        }
1513    }
1514}
1515
1516impl CellMomentScratch {
1517    pub fn new() -> Self {
1518        Self::default()
1519    }
1520
1521    pub fn with_capacity(max_degree: usize) -> Self {
1522        Self {
1523            moments: Vec::with_capacity(max_degree + 1),
1524        }
1525    }
1526
1527    #[inline]
1528    fn prepare_moments(&mut self, len: usize) -> &mut [f64] {
1529        if self.moments.capacity() < len {
1530            CELL_MOMENT_REALLOCS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1531            self.moments.reserve(len - self.moments.capacity());
1532        }
1533        self.moments.resize(len, 0.0);
1534        self.moments.fill(0.0);
1535        &mut self.moments
1536    }
1537}
1538
/// Counter for moment-buffer reallocations in `prepare_moments`. Production
/// code increments this (relaxed ordering) each time a request exceeds the
/// scratch buffer's capacity; the test mod inspects it to assert the
/// steady-state hot loop allocates exactly once per row buffer.
pub(crate) static CELL_MOMENT_REALLOCS: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);
1544
/// Canonical 20-point Gauss–Legendre nodes on [-1, 1] (Abramowitz & Stegun
/// 25.4), tabulated to f64 precision. Used here for the Drezner–Wesolowsky
/// bivariate normal CDF representation — 20 points give >30-digit accuracy for
/// the smooth arcsin-transformed integrand, ensuring the BVN value is exact to
/// f64 precision for all (h, k, ρ) — and shared with the cubic-cell B-spline
/// moment parity gate in [`crate::gpu_kernels::cubic_bspline_moments`].
///
/// The nodes are listed in ascending order and are symmetric about zero;
/// entry `i` pairs with `GL20_WEIGHTS[i]`.
pub const GL20_NODES: [f64; 20] = [
    -0.993_128_599_185_094_9,
    -0.963_971_927_277_913_8,
    -0.912_234_428_251_326,
    -0.839_116_971_822_218_8,
    -0.746_331_906_460_150_8,
    -0.636_053_680_726_515,
    -0.510_867_001_950_827_1,
    -0.373_706_088_715_419_6,
    -0.227_785_851_141_645_1,
    -0.076_526_521_133_497_33,
    0.076_526_521_133_497_33,
    0.227_785_851_141_645_1,
    0.373_706_088_715_419_6,
    0.510_867_001_950_827_1,
    0.636_053_680_726_515,
    0.746_331_906_460_150_8,
    0.839_116_971_822_218_8,
    0.912_234_428_251_326,
    0.963_971_927_277_913_8,
    0.993_128_599_185_094_9,
];
1573
/// Companion weights to [`GL20_NODES`]. Symmetric, summing to 2.
///
/// Entry `i` is the quadrature weight of `GL20_NODES[i]`.
pub const GL20_WEIGHTS: [f64; 20] = [
    0.017_614_007_139_152_12,
    0.040_601_429_800_386_94,
    0.062_672_048_334_109_06,
    0.083_276_741_576_704_75,
    0.101_930_119_817_240_4,
    0.118_194_531_961_518_4,
    0.131_688_638_449_176_6,
    0.142_096_109_318_382_1,
    0.149_172_986_472_603_7,
    0.152_753_387_130_725_9,
    0.152_753_387_130_725_9,
    0.149_172_986_472_603_7,
    0.142_096_109_318_382_1,
    0.131_688_638_449_176_6,
    0.118_194_531_961_518_4,
    0.101_930_119_817_240_4,
    0.083_276_741_576_704_75,
    0.062_672_048_334_109_06,
    0.040_601_429_800_386_94,
    0.017_614_007_139_152_12,
];
1597
1598/// Provenance-tagged breakpoint dedup: sorts ascending and merges entries
1599/// coinciding within 1e-12, but when a fixed score break and a link-knot
1600/// crossing coincide (the kink configuration), the surviving entry keeps
1601/// the `Fixed` tag — a deterministic choice; the z location is identical
1602/// either way.
1603fn dedup_sorted_tagged_breakpoints(points: &mut Vec<(f64, PartitionEdge)>) {
1604    points.sort_by(|lhs, rhs| {
1605        lhs.0
1606            .partial_cmp(&rhs.0)
1607            .unwrap_or(std::cmp::Ordering::Equal)
1608    });
1609    points.dedup_by(|lhs, rhs| {
1610        let coincide = if lhs.0 == rhs.0 {
1611            true
1612        } else if lhs.0.is_finite() && rhs.0.is_finite() {
1613            (lhs.0 - rhs.0).abs() <= 1e-12
1614        } else {
1615            false
1616        };
1617        if coincide && matches!(lhs.1, PartitionEdge::Fixed(_)) {
1618            // `dedup_by` keeps `rhs` (the earlier element) — propagate the
1619            // Fixed tag onto the survivor.
1620            rhs.1 = lhs.1;
1621        }
1622        coincide
1623    });
1624}
1625
1626#[inline]
1627pub fn interval_probe_point(left: f64, right: f64) -> Result<f64, String> {
1628    if !(left < right) {
1629        return Err(CubicCellKernelError::invalid_interval(format!(
1630            "interval probe requires ordered bounds, got [{left}, {right}]"
1631        ))
1632        .into());
1633    }
1634    if left.is_finite() && right.is_finite() {
1635        Ok(0.5 * (left + right))
1636    } else if left == f64::NEG_INFINITY && right == f64::INFINITY {
1637        Ok(0.0)
1638    } else if left == f64::NEG_INFINITY && right.is_finite() {
1639        Ok(right - 1.0)
1640    } else if left.is_finite() && right == f64::INFINITY {
1641        Ok(left + 1.0)
1642    } else {
1643        Err(CubicCellKernelError::invalid_interval(format!(
1644            "interval probe requires finite bounds or full infinities, got [{left}, {right}]"
1645        ))
1646        .into())
1647    }
1648}
1649
/// Ascending-power coefficients of `q'(z)` for a quadratic
/// `eta(z) = c0 + c1·z + c2·z²`, where `q(z) = ½·(z² + eta(z)²)` and hence
/// `q'(z) = z + eta(z)·eta'(z)`.
///
/// Entry `k` of the result multiplies `z^k`; the last entry, `2·c2²`, is the
/// leading coefficient.
#[inline]
pub fn quartic_qprime_coefficients(c0: f64, c1: f64, c2: f64) -> [f64; 4] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c1 * c2;
    let cubic = 2.0 * c2 * c2;
    [constant, linear, quadratic, cubic]
}
1659
/// Ascending-power coefficients of `q'(z)` for the full cubic
/// `eta(z) = c0 + c1·z + c2·z² + c3·z³`, where `q(z) = ½·(z² + eta(z)²)` and
/// hence `q'(z) = z + eta(z)·eta'(z)`.
///
/// Entry `k` of the result multiplies `z^k`; the last entry, `3·c3²`, is the
/// leading coefficient.
#[inline]
pub fn sextic_qprime_coefficients(c0: f64, c1: f64, c2: f64, c3: f64) -> [f64; 6] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c0 * c3 + 3.0 * c1 * c2;
    let cubic = 4.0 * c1 * c3 + 2.0 * c2 * c2;
    let quartic = 5.0 * c2 * c3;
    let quintic = 3.0 * c3 * c3;
    [constant, linear, quadratic, cubic, quartic, quintic]
}
1671
1672/// Boundary term `right^n · exp(−q(right)) − left^n · exp(−q(left))` used by
1673/// the moment recurrences. Takes precomputed `left^n` and `right^n` so callers
1674/// can roll the powers across a recurrence — each iteration becomes one
1675/// multiply instead of a fresh `powi(n)`.
1676#[inline]
1677fn moment_boundary_term_with_powers(
1678    cell: DenestedCubicCell,
1679    left_pow_n: f64,
1680    right_pow_n: f64,
1681) -> f64 {
1682    let left_term = if cell.left.is_infinite() {
1683        0.0
1684    } else {
1685        left_pow_n * (-cell.q(cell.left)).exp()
1686    };
1687    let right_term = if cell.right.is_infinite() {
1688        0.0
1689    } else {
1690        right_pow_n * (-cell.q(cell.right)).exp()
1691    };
1692    right_term - left_term
1693}
1694
1695pub fn reduce_quartic_moments(
1696    cell: DenestedCubicCell,
1697    base_m0_m2: [f64; 3],
1698    max_degree: usize,
1699) -> Result<Vec<f64>, String> {
1700    if max_degree <= 2 {
1701        return Ok(base_m0_m2[..=max_degree].to_vec());
1702    }
1703    let d = quartic_qprime_coefficients(cell.c0, cell.c1, cell.c2);
1704    let lead = d[3];
1705    if !lead.is_finite() || lead.abs() <= 1e-18 {
1706        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1707            "quartic moment reduction requires nonzero leading coefficient, got {lead:.3e}"
1708        ))
1709        .into());
1710    }
1711    let mut moments = vec![0.0; max_degree + 1];
1712    moments[0] = base_m0_m2[0];
1713    moments[1] = base_m0_m2[1];
1714    moments[2] = base_m0_m2[2];
1715    // Roll left^n / right^n across the recurrence rather than calling
1716    // `powi(n)` each iteration. Skip the multiply when an endpoint is
1717    // infinite — the boundary helper ignores the power in that case, and
1718    // ∞·0 would produce a NaN we'd then have to mask off anyway.
1719    let left_finite = cell.left.is_finite();
1720    let right_finite = cell.right.is_finite();
1721    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1722    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1723    for n in 0..=(max_degree - 3) {
1724        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1725        let mut numer = if n == 0 {
1726            0.0
1727        } else {
1728            (n as f64) * moments[n - 1]
1729        };
1730        for j in 0..=2 {
1731            numer -= d[j] * moments[n + j];
1732        }
1733        numer -= b_n;
1734        moments[n + 3] = numer / lead;
1735        if left_finite {
1736            left_pow_n *= cell.left;
1737        }
1738        if right_finite {
1739            right_pow_n *= cell.right;
1740        }
1741    }
1742    Ok(moments)
1743}
1744
1745pub fn reduce_sextic_moments(
1746    cell: DenestedCubicCell,
1747    base_m0_m4: [f64; 5],
1748    max_degree: usize,
1749) -> Result<Vec<f64>, String> {
1750    if max_degree <= 4 {
1751        return Ok(base_m0_m4[..=max_degree].to_vec());
1752    }
1753    let d = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3);
1754    let lead = d[5];
1755    if !lead.is_finite() {
1756        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1757            "sextic moment reduction encountered non-finite leading coefficient: {lead:.3e}"
1758        ))
1759        .into());
1760    }
1761    if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
1762        if lower_branch == ExactCellBranch::Quartic {
1763            return evaluate_non_affine_cell_state(
1764                DenestedCubicCell { c3: 0.0, ..cell },
1765                ExactCellBranch::Quartic,
1766                max_degree,
1767            )
1768            .map(|state| state.moments.into_vec());
1769        }
1770        return evaluate_affine_cell_state(
1771            DenestedCubicCell {
1772                left: cell.left,
1773                right: cell.right,
1774                c0: cell.c0,
1775                c1: cell.c1,
1776                c2: 0.0,
1777                c3: 0.0,
1778            },
1779            max_degree,
1780        )
1781        .map(|state| state.moments.into_vec());
1782    }
1783    let mut moments = vec![0.0; max_degree + 1];
1784    for (idx, value) in base_m0_m4.into_iter().enumerate() {
1785        moments[idx] = value;
1786    }
1787    let left_finite = cell.left.is_finite();
1788    let right_finite = cell.right.is_finite();
1789    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1790    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1791    for n in 0..=(max_degree - 5) {
1792        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1793        let mut numer = if n == 0 {
1794            0.0
1795        } else {
1796            (n as f64) * moments[n - 1]
1797        };
1798        for j in 0..=4 {
1799            numer -= d[j] * moments[n + j];
1800        }
1801        numer -= b_n;
1802        moments[n + 5] = numer / lead;
1803        if left_finite {
1804            left_pow_n *= cell.left;
1805        }
1806        if right_finite {
1807            right_pow_n *= cell.right;
1808        }
1809    }
1810    Ok(moments)
1811}
1812
1813#[inline]
1814pub fn cell_first_derivative_from_moments(
1815    derivative_coefficients: &[f64],
1816    moments: &[f64],
1817) -> Result<f64, String> {
1818    let value = moment_dot_with_coefficients(derivative_coefficients, moments, "first derivative")?;
1819    Ok(value * INV_TWO_PI)
1820}
1821
/// Highest moment index (the `max_degree` argument of
/// `evaluate_cell_moments`) needed to evaluate
/// `cell_first_derivative_from_moments(derivative_coefficients, moments)`.
///
/// This is the degree of the coefficient polynomial, i.e. its length minus
/// one, and `0` for an empty slice. Callers must request at least this
/// degree so the moment dot is well-defined; #321 was caused by a call site
/// that hardcoded a smaller value.
#[inline]
pub fn cell_first_derivative_required_max_degree(derivative_coefficients: &[f64]) -> usize {
    match derivative_coefficients.len() {
        0 => 0,
        count => count - 1,
    }
}
1833
/// Highest moment index that `cell_second_derivative_from_moments` reads.
///
/// This is the kernel's internal `needed = max(second_deg, product_deg) + 1`
/// minus one, so the value can be passed straight to
/// `evaluate_cell_moments(cell, max_degree)`. The product degree adds the two
/// first-coefficient degrees to the degree of the cell's `eta` cubic, which is
/// where the constant `3` comes from. Empty slices count as degree zero.
#[inline]
pub fn cell_second_derivative_required_max_degree(
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    second_coefficients_rs: &[f64],
) -> usize {
    let degree_of = |coefficients: &[f64]| coefficients.len().saturating_sub(1);
    let eta_product_degree = degree_of(first_coefficients_r) + degree_of(first_coefficients_s) + 3;
    usize::max(degree_of(second_coefficients_rs), eta_product_degree)
}
1854
1855#[inline]
1856pub fn cell_polynomial_integral_from_moments(
1857    polynomial_coefficients: &[f64],
1858    moments: &[f64],
1859    label: &str,
1860) -> Result<f64, String> {
1861    let value = moment_dot_with_coefficients(polynomial_coefficients, moments, label)?;
1862    Ok(value * INV_TWO_PI)
1863}
1864
1865#[inline]
1866pub fn cell_second_derivative_from_moments(
1867    cell: DenestedCubicCell,
1868    first_coefficients_r: &[f64],
1869    first_coefficients_s: &[f64],
1870    second_coefficients_rs: &[f64],
1871    moments: &[f64],
1872) -> Result<f64, String> {
1873    let second_degree = second_coefficients_rs.len().saturating_sub(1);
1874    let product_degree = first_coefficients_r.len().saturating_sub(1)
1875        + first_coefficients_s.len().saturating_sub(1)
1876        + 3;
1877    let needed = second_degree.max(product_degree) + 1;
1878    if needed > moments.len() {
1879        return Err(CubicCellKernelError::insufficient_moments(format!(
1880            "insufficient reduced moments for second derivative: need {}, have {}",
1881            needed,
1882            moments.len()
1883        ))
1884        .into());
1885    }
1886    let second_term = moment_dot_with_coefficients_unchecked(second_coefficients_rs, moments);
1887    // Fold `Σ_{e,i,j} eta[e]·r[i]·s[j]·moments[e+i+j]` into a single dot
1888    // against `moments`. Convolving `eta ⊗ r ⊗ s` first turns the original
1889    // `len(eta)·len(r)·len(s)` triple loop (typically 4·4·4 = 64 mul-adds
1890    // per call) into `len(eta)·len(r) + (len(eta)+len(r)-1)·len(s) +
1891    // len(out)` ≈ 16 + 28 + 10 = 54 mul-adds, with the inner loops now in
1892    // straight-line FMA-friendly form.
1893    let cubic = [cell.c0, cell.c1, cell.c2, cell.c3];
1894    // Capacity bound: cubic (4) + first_r (≤MAX) + first_s (≤MAX) - 2.
1895    // First-coefficient slices are passed in as `[f64; 4]` from every
1896    // production caller; sizing to 32 covers any realistic test input.
1897    const SCRATCH: usize = 32;
1898    let mut eta_r = [0.0_f64; SCRATCH];
1899    let mut eta_rs = [0.0_f64; SCRATCH];
1900    let er_len = poly_conv_into(&cubic, first_coefficients_r, &mut eta_r);
1901    let ers_len = poly_conv_into(&eta_r[..er_len], first_coefficients_s, &mut eta_rs);
1902    let mut eta_term = 0.0;
1903    for k in 0..ers_len {
1904        eta_term = eta_rs[k].mul_add(moments[k], eta_term);
1905    }
1906    Ok((second_term - eta_term) * INV_TWO_PI)
1907}
1908
1909/// Pointwise value of the cell second-derivative integrand
1910/// `(∂²/∂r∂s) exp(-q(z))/2π` at a single `z`, evaluated from the SAME
1911/// `(r, s, rs)` coefficient polynomials the moment reduction
1912/// [`cell_second_derivative_from_moments`] integrates:
1913///
1914/// ```text
1915///   F_rs(z) = ( c_rs(z) - η(z)·c_r(z)·c_s(z) ) · exp(-q(z)) · 1/2π ,
1916/// ```
1917///
1918/// with `c_•(z) = Σ_k coeff_•[k]·zᵏ`, `η(z)` the cell cubic, and
1919/// `q(z) = ½(z² + η(z)²)`. This is the integrand whose `[cell.left,
1920/// cell.right]` integral the from-moments form returns — needed for the
1921/// Leibniz boundary term when a cell edge (a link-knot crossing
1922/// `z=(τ-a)/b`) moves with a parameter (the slope `b`): the directional
1923/// derivative of `∫_{z_L}^{z_R} F_rs dz` picks up
1924/// `F_rs(z_R)·z_R'(dir) - F_rs(z_L)·z_L'(dir)` on top of the fixed-domain
1925/// part. Coefficient sign convention matches the simpson reference
1926/// (`numeric_ab`): pass the ACTUAL derivative-coefficient polynomials
1927/// `∂c/∂r` etc. (not the negated `neg_dc_d•` the moment path consumes).
1928#[inline]
1929pub fn cell_second_derivative_boundary_integrand(
1930    cell: DenestedCubicCell,
1931    first_coefficients_r: &[f64],
1932    first_coefficients_s: &[f64],
1933    second_coefficients_rs: &[f64],
1934    z: f64,
1935) -> f64 {
1936    let eta = cell.eta(z);
1937    let c_r = poly_eval_at(first_coefficients_r, z);
1938    let c_s = poly_eval_at(first_coefficients_s, z);
1939    let c_rs = poly_eval_at(second_coefficients_rs, z);
1940    (c_rs - eta * c_r * c_s) * (-cell.q(z)).exp() * INV_TWO_PI
1941}
1942
1943/// Pointwise value of the density-weighted integrand `g(z)·exp(-q(z))/2π` at a
1944/// single `z`, for an arbitrary integrand polynomial `g`.
1945///
1946/// This is the boundary value needed for the moving-domain (Leibniz) term of a
1947/// density-normalization integral `∫ g(z)·exp(-q(z))/2π dz` whose cell edge is a
1948/// link-knot crossing `z=(τ-a)/b` that moves with a parameter direction: the
1949/// directional derivative of the integral picks up
1950/// `g(z_R)·w(z_R)·z_R'(dir) - g(z_L)·w(z_L)·z_L'(dir)` on top of the
1951/// fixed-domain part, with `w(z)=exp(-q(z))/2π` the same weight the moment
1952/// reductions integrate. Unlike the Hessian-integral boundary term (which is
1953/// shared by adjacent cells and cancels across each interior knot), the
1954/// ln-density integrand `D_t`/`D_t,uv` carries a non-shared `g`, so this
1955/// Leibniz term does NOT cancel and must be added (gam#932/#979).
1956pub fn cell_density_boundary_integrand(cell: DenestedCubicCell, g: &[f64], z: f64) -> f64 {
1957    poly_eval_at(g, z) * (-cell.q(z)).exp() * INV_TWO_PI
1958}
1959
/// Evaluates `Σ_k coefficients[k]·zᵏ` by Horner's rule, highest power first,
/// with one fused multiply-add per coefficient. An empty slice evaluates to
/// `0.0`.
#[inline]
fn poly_eval_at(coefficients: &[f64], z: f64) -> f64 {
    coefficients
        .iter()
        .rev()
        .fold(0.0_f64, |partial, &coefficient| partial.mul_add(z, coefficient))
}
1969
1970#[inline]
1971fn moment_dot_with_coefficients(
1972    coefficients: &[f64],
1973    moments: &[f64],
1974    label: &str,
1975) -> Result<f64, String> {
1976    if coefficients.len() > moments.len() {
1977        return Err(CubicCellKernelError::insufficient_moments(format!(
1978            "insufficient reduced moments for {label}: need {}, have {}",
1979            coefficients.len(),
1980            moments.len()
1981        ))
1982        .into());
1983    }
1984    Ok(moment_dot_with_coefficients_unchecked(
1985        coefficients,
1986        moments,
1987    ))
1988}
1989
/// Dot product `Σ_k coefficients[k]·moments[k]`, accumulated in index order
/// with one fused multiply-add per term.
///
/// "Unchecked" means the caller has already validated lengths: `moments` is
/// indexed directly, so a too-short `moments` slice panics.
#[inline]
fn moment_dot_with_coefficients_unchecked(coefficients: &[f64], moments: &[f64]) -> f64 {
    coefficients
        .iter()
        .enumerate()
        .fold(0.0_f64, |running, (k, &coefficient)| {
            coefficient.mul_add(moments[k], running)
        })
}
1998
/// Writes the polynomial product of `lhs` and `rhs` (both as coefficient
/// slices, lowest power first) into the front of `out` and returns how many
/// slots were populated: `lhs.len() + rhs.len() - 1`, or `0` when either
/// input is empty (in which case `out` is left untouched).
///
/// Slots of `out` past the returned length are not cleared, so callers must
/// read only the returned prefix.
///
/// The multi-derivative reductions use this to collapse `eta · r · s · …`
/// triple and quadruple sums into one moment dot, removing the
/// `O(deg^3)`/`O(deg^4)` inner loops that dominated the
/// `cell_*_derivative_from_moments` hot leaves on large-scale fits.
///
/// # Panics
///
/// Panics if `out` is shorter than the product length.
#[inline]
fn poly_conv_into(lhs: &[f64], rhs: &[f64], out: &mut [f64]) -> usize {
    let (lhs_len, rhs_len) = (lhs.len(), rhs.len());
    if lhs_len == 0 || rhs_len == 0 {
        return 0;
    }
    let len = lhs_len + rhs_len - 1;
    assert!(out.len() >= len);
    out[..len].fill(0.0);
    for (offset, &lv) in lhs.iter().enumerate() {
        // Each `lhs` term contributes to the `rhs_len` slots starting at its
        // own power.
        let window = &mut out[offset..offset + rhs_len];
        for (slot, &rv) in window.iter_mut().zip(rhs) {
            *slot = lv.mul_add(rv, *slot);
        }
    }
    len
}
2025
2026#[inline]
2027fn require_moments_degree(
2028    required_degree: usize,
2029    moments: &[f64],
2030    label: &str,
2031) -> Result<(), String> {
2032    if required_degree >= moments.len() {
2033        return Err(CubicCellKernelError::insufficient_moments(format!(
2034            "insufficient reduced moments for {label}: need {}, have {}",
2035            required_degree + 1,
2036            moments.len()
2037        ))
2038        .into());
2039    }
2040    Ok::<(), _>(())
2041}
2042
2043#[inline]
2044fn require_scratch_capacity(
2045    required_len: usize,
2046    capacity: usize,
2047    label: &str,
2048) -> Result<(), String> {
2049    if required_len > capacity {
2050        return Err(CubicCellKernelError::insufficient_moments(format!(
2051            "{label} polynomial convolution scratch too small: need {required_len}, have {capacity}"
2052        ))
2053        .into());
2054    }
2055    Ok::<(), _>(())
2056}
2057
/// Coefficient count of the product of polynomials whose coefficient counts
/// are `lengths`: the sum of the lengths minus one per multiplication.
///
/// Returns `0` when `lengths` is empty or any factor has length zero.
#[inline]
fn convolution_chain_len(lengths: &[usize]) -> usize {
    if lengths.iter().any(|&factor_len| factor_len == 0) {
        return 0;
    }
    match lengths.len() {
        0 => 0,
        factors => lengths.iter().sum::<usize>() - (factors - 1),
    }
}
2066
/// Polynomial degree (`len - 1`) of a first-derivative coefficient slice.
///
/// # Errors
///
/// Returns a message naming `label` when `coefficients` is empty.
#[inline]
fn first_coefficients_degree(label: &str, coefficients: &[f64]) -> Result<usize, String> {
    match coefficients.len() {
        0 => Err(format!("{label} first-derivative coefficients must be non-empty")),
        count => Ok(count - 1),
    }
}
2074
2075#[inline]
2076pub fn cell_third_derivative_from_moments(
2077    cell: DenestedCubicCell,
2078    first_coefficients_r: &[f64],
2079    first_coefficients_s: &[f64],
2080    first_coefficients_t: &[f64],
2081    second_coefficients_rs: &[f64],
2082    second_coefficients_rt: &[f64],
2083    second_coefficients_st: &[f64],
2084    third_coefficients_rst: &[f64],
2085    moments: &[f64],
2086) -> Result<f64, String> {
2087    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2088    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2089    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2090    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2091    let second_sum_degree = [
2092        second_coefficients_rs.len() + first_coefficients_t.len(),
2093        second_coefficients_rt.len() + first_coefficients_s.len(),
2094        second_coefficients_st.len() + first_coefficients_r.len(),
2095    ]
2096    .into_iter()
2097    .max()
2098    .unwrap_or(0)
2099    .saturating_sub(1);
2100    let triple_product_degree = r_degree + s_degree + t_degree;
2101    let needed = (third_coefficients_rst.len().saturating_sub(1))
2102        .max(3 + second_sum_degree)
2103        .max(6 + triple_product_degree);
2104    require_moments_degree(needed, moments, "third derivative")?;
2105
2106    let third_term = moment_dot_with_coefficients_unchecked(third_coefficients_rst, moments);
2107
2108    // This is a deliberately serial leaf kernel: each call performs only a
2109    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2110    // at the surrounding row/cell batch level rather than inside this hot path.
2111    const SCRATCH: usize = 32;
2112    let max_linear_conv_len = [
2113        convolution_chain_len(&[
2114            eta.len(),
2115            second_coefficients_rs.len(),
2116            first_coefficients_t.len(),
2117        ]),
2118        convolution_chain_len(&[
2119            eta.len(),
2120            second_coefficients_rt.len(),
2121            first_coefficients_s.len(),
2122        ]),
2123        convolution_chain_len(&[
2124            eta.len(),
2125            second_coefficients_st.len(),
2126            first_coefficients_r.len(),
2127        ]),
2128    ]
2129    .into_iter()
2130    .max()
2131    .unwrap_or(0);
2132    let max_cubic_conv_len = convolution_chain_len(&[
2133        7,
2134        first_coefficients_r.len(),
2135        first_coefficients_s.len(),
2136        first_coefficients_t.len(),
2137    ]);
2138    require_scratch_capacity(
2139        max_linear_conv_len.max(max_cubic_conv_len),
2140        SCRATCH,
2141        "third derivative",
2142    )?;
2143    let mut buf_a = [0.0_f64; SCRATCH];
2144    let mut buf_b = [0.0_f64; SCRATCH];
2145
2146    // eta_second_term = Σ over (rs⊗t, rt⊗s, st⊗r) of eta⊗product · moments.
2147    // Fold each of the three triple sums into a single moment dot.
2148    let mut eta_second_term = 0.0;
2149    let conv_dot = |first: &[f64],
2150                    second: &[f64],
2151                    buf_a: &mut [f64; SCRATCH],
2152                    buf_b: &mut [f64; SCRATCH]|
2153     -> f64 {
2154        let m = poly_conv_into(first, second, buf_a);
2155        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2156        let mut acc = 0.0;
2157        for k in 0..n {
2158            acc = buf_b[k].mul_add(moments[k], acc);
2159        }
2160        acc
2161    };
2162    eta_second_term += conv_dot(
2163        second_coefficients_rs,
2164        first_coefficients_t,
2165        &mut buf_a,
2166        &mut buf_b,
2167    );
2168    eta_second_term += conv_dot(
2169        second_coefficients_rt,
2170        first_coefficients_s,
2171        &mut buf_a,
2172        &mut buf_b,
2173    );
2174    eta_second_term += conv_dot(
2175        second_coefficients_st,
2176        first_coefficients_r,
2177        &mut buf_a,
2178        &mut buf_b,
2179    );
2180
2181    // cubic_coeff_term = Σ_{e,i,j,k} (eta·eta − 1)[e] · r[i] · s[j] · t[k] · moments[e+i+j+k].
2182    // Convolve r⊗s, then ⊗t, then ⊗(eta·eta − 1), giving a single dot.
2183    let mut eta_sq_minus_one = [0.0_f64; 7];
2184    for (i, &eta_i) in eta.iter().enumerate() {
2185        for (j, &eta_j) in eta.iter().enumerate() {
2186            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2187        }
2188    }
2189    eta_sq_minus_one[0] -= 1.0;
2190
2191    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2192    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2193    // buf_a now reused for (eta_sq_minus_one ⊗ rst).
2194    let final_len = poly_conv_into(&eta_sq_minus_one, &buf_b[..rst_len], &mut buf_a);
2195    let mut cubic_coeff_term = 0.0;
2196    for k in 0..final_len {
2197        cubic_coeff_term = buf_a[k].mul_add(moments[k], cubic_coeff_term);
2198    }
2199
2200    Ok((third_term - eta_second_term + cubic_coeff_term) * INV_TWO_PI)
2201}
2202
2203#[inline]
2204pub fn cell_fourth_derivative_from_moments(
2205    cell: DenestedCubicCell,
2206    first_coefficients_r: &[f64],
2207    first_coefficients_s: &[f64],
2208    first_coefficients_t: &[f64],
2209    first_coefficients_u: &[f64],
2210    second_coefficients_rs: &[f64],
2211    second_coefficients_rt: &[f64],
2212    second_coefficients_ru: &[f64],
2213    second_coefficients_st: &[f64],
2214    second_coefficients_su: &[f64],
2215    second_coefficients_tu: &[f64],
2216    third_coefficients_rst: &[f64],
2217    third_coefficients_rsu: &[f64],
2218    third_coefficients_rtu: &[f64],
2219    third_coefficients_stu: &[f64],
2220    fourth_coefficients_rstu: &[f64],
2221    moments: &[f64],
2222) -> Result<f64, String> {
2223    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
2224    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
2225    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
2226    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
2227    let u_degree = first_coefficients_degree("u", first_coefficients_u)?;
2228    let linear_sum_degree = [
2229        third_coefficients_rst.len() + first_coefficients_u.len(),
2230        third_coefficients_rsu.len() + first_coefficients_t.len(),
2231        third_coefficients_rtu.len() + first_coefficients_s.len(),
2232        third_coefficients_stu.len() + first_coefficients_r.len(),
2233        second_coefficients_rs.len() + second_coefficients_tu.len(),
2234        second_coefficients_rt.len() + second_coefficients_su.len(),
2235        second_coefficients_ru.len() + second_coefficients_st.len(),
2236    ]
2237    .into_iter()
2238    .max()
2239    .unwrap_or(0)
2240    .saturating_sub(1);
2241    let quad_sum_degree = [
2242        second_coefficients_rs.len() + first_coefficients_t.len() + first_coefficients_u.len(),
2243        second_coefficients_rt.len() + first_coefficients_s.len() + first_coefficients_u.len(),
2244        second_coefficients_ru.len() + first_coefficients_s.len() + first_coefficients_t.len(),
2245        second_coefficients_st.len() + first_coefficients_r.len() + first_coefficients_u.len(),
2246        second_coefficients_su.len() + first_coefficients_r.len() + first_coefficients_t.len(),
2247        second_coefficients_tu.len() + first_coefficients_r.len() + first_coefficients_s.len(),
2248    ]
2249    .into_iter()
2250    .max()
2251    .unwrap_or(0)
2252    .saturating_sub(2);
2253    let quartic_product_degree = r_degree + s_degree + t_degree + u_degree;
2254    let needed = (fourth_coefficients_rstu.len().saturating_sub(1))
2255        .max(3 + linear_sum_degree)
2256        .max(6 + quad_sum_degree)
2257        .max(9 + quartic_product_degree);
2258    require_moments_degree(needed, moments, "fourth derivative")?;
2259
2260    let fourth_term = moment_dot_with_coefficients_unchecked(fourth_coefficients_rstu, moments);
2261
2262    // This is a deliberately serial leaf kernel: each call performs only a
2263    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
2264    // at the surrounding row/cell batch level rather than inside this hot path.
2265    const SCRATCH: usize = 32;
2266    let max_linear_conv_len = [
2267        convolution_chain_len(&[
2268            eta.len(),
2269            third_coefficients_rst.len(),
2270            first_coefficients_u.len(),
2271        ]),
2272        convolution_chain_len(&[
2273            eta.len(),
2274            third_coefficients_rsu.len(),
2275            first_coefficients_t.len(),
2276        ]),
2277        convolution_chain_len(&[
2278            eta.len(),
2279            third_coefficients_rtu.len(),
2280            first_coefficients_s.len(),
2281        ]),
2282        convolution_chain_len(&[
2283            eta.len(),
2284            third_coefficients_stu.len(),
2285            first_coefficients_r.len(),
2286        ]),
2287        convolution_chain_len(&[
2288            eta.len(),
2289            second_coefficients_rs.len(),
2290            second_coefficients_tu.len(),
2291        ]),
2292        convolution_chain_len(&[
2293            eta.len(),
2294            second_coefficients_rt.len(),
2295            second_coefficients_su.len(),
2296        ]),
2297        convolution_chain_len(&[
2298            eta.len(),
2299            second_coefficients_ru.len(),
2300            second_coefficients_st.len(),
2301        ]),
2302    ]
2303    .into_iter()
2304    .max()
2305    .unwrap_or(0);
2306    let max_quad_conv_len = [
2307        convolution_chain_len(&[
2308            7,
2309            second_coefficients_rs.len(),
2310            first_coefficients_t.len(),
2311            first_coefficients_u.len(),
2312        ]),
2313        convolution_chain_len(&[
2314            7,
2315            second_coefficients_rt.len(),
2316            first_coefficients_s.len(),
2317            first_coefficients_u.len(),
2318        ]),
2319        convolution_chain_len(&[
2320            7,
2321            second_coefficients_ru.len(),
2322            first_coefficients_s.len(),
2323            first_coefficients_t.len(),
2324        ]),
2325        convolution_chain_len(&[
2326            7,
2327            second_coefficients_st.len(),
2328            first_coefficients_r.len(),
2329            first_coefficients_u.len(),
2330        ]),
2331        convolution_chain_len(&[
2332            7,
2333            second_coefficients_su.len(),
2334            first_coefficients_r.len(),
2335            first_coefficients_t.len(),
2336        ]),
2337        convolution_chain_len(&[
2338            7,
2339            second_coefficients_tu.len(),
2340            first_coefficients_r.len(),
2341            first_coefficients_s.len(),
2342        ]),
2343    ]
2344    .into_iter()
2345    .max()
2346    .unwrap_or(0);
2347    let max_quartic_conv_len = convolution_chain_len(&[
2348        10,
2349        first_coefficients_r.len(),
2350        first_coefficients_s.len(),
2351        first_coefficients_t.len(),
2352        first_coefficients_u.len(),
2353    ]);
2354    require_scratch_capacity(
2355        max_linear_conv_len
2356            .max(max_quad_conv_len)
2357            .max(max_quartic_conv_len),
2358        SCRATCH,
2359        "fourth derivative",
2360    )?;
2361    let mut buf_a = [0.0_f64; SCRATCH];
2362    let mut buf_b = [0.0_f64; SCRATCH];
2363
2364    // eta_linear_term = Σ over seven (rst⊗u, rsu⊗t, rtu⊗s, stu⊗r, rs⊗tu,
2365    // rt⊗su, ru⊗st) of eta⊗product · moments. Fold each triple sum into
2366    // a single moment dot.
2367    let conv_eta_dot = |first: &[f64],
2368                        second: &[f64],
2369                        buf_a: &mut [f64; SCRATCH],
2370                        buf_b: &mut [f64; SCRATCH]|
2371     -> f64 {
2372        let m = poly_conv_into(first, second, buf_a);
2373        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
2374        let mut acc = 0.0;
2375        for k in 0..n {
2376            acc = buf_b[k].mul_add(moments[k], acc);
2377        }
2378        acc
2379    };
2380    let mut eta_linear_term = 0.0;
2381    eta_linear_term += conv_eta_dot(
2382        third_coefficients_rst,
2383        first_coefficients_u,
2384        &mut buf_a,
2385        &mut buf_b,
2386    );
2387    eta_linear_term += conv_eta_dot(
2388        third_coefficients_rsu,
2389        first_coefficients_t,
2390        &mut buf_a,
2391        &mut buf_b,
2392    );
2393    eta_linear_term += conv_eta_dot(
2394        third_coefficients_rtu,
2395        first_coefficients_s,
2396        &mut buf_a,
2397        &mut buf_b,
2398    );
2399    eta_linear_term += conv_eta_dot(
2400        third_coefficients_stu,
2401        first_coefficients_r,
2402        &mut buf_a,
2403        &mut buf_b,
2404    );
2405    eta_linear_term += conv_eta_dot(
2406        second_coefficients_rs,
2407        second_coefficients_tu,
2408        &mut buf_a,
2409        &mut buf_b,
2410    );
2411    eta_linear_term += conv_eta_dot(
2412        second_coefficients_rt,
2413        second_coefficients_su,
2414        &mut buf_a,
2415        &mut buf_b,
2416    );
2417    eta_linear_term += conv_eta_dot(
2418        second_coefficients_ru,
2419        second_coefficients_st,
2420        &mut buf_a,
2421        &mut buf_b,
2422    );
2423
2424    let mut eta_sq_minus_one = [0.0_f64; 7];
2425    for (i, &eta_i) in eta.iter().enumerate() {
2426        for (j, &eta_j) in eta.iter().enumerate() {
2427            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
2428        }
2429    }
2430    eta_sq_minus_one[0] -= 1.0;
2431
2432    // quad_coeff_term: six (eta²−1)⊗A⊗B⊗C · moments sums, where the (A,B,C)
2433    // factors are: (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s).
2434    let mut buf_c = [0.0_f64; SCRATCH];
2435    let conv_weighted_triple_dot = |weight: &[f64],
2436                                    a: &[f64],
2437                                    b: &[f64],
2438                                    c: &[f64],
2439                                    buf_a: &mut [f64; SCRATCH],
2440                                    buf_b: &mut [f64; SCRATCH],
2441                                    buf_c: &mut [f64; SCRATCH]|
2442     -> f64 {
2443        let ab_len = poly_conv_into(a, b, buf_a);
2444        let abc_len = poly_conv_into(&buf_a[..ab_len], c, buf_b);
2445        let final_len = poly_conv_into(weight, &buf_b[..abc_len], buf_c);
2446        let mut acc = 0.0;
2447        for k in 0..final_len {
2448            acc = buf_c[k].mul_add(moments[k], acc);
2449        }
2450        acc
2451    };
2452    let mut quad_coeff_term = 0.0;
2453    quad_coeff_term += conv_weighted_triple_dot(
2454        &eta_sq_minus_one,
2455        second_coefficients_rs,
2456        first_coefficients_t,
2457        first_coefficients_u,
2458        &mut buf_a,
2459        &mut buf_b,
2460        &mut buf_c,
2461    );
2462    quad_coeff_term += conv_weighted_triple_dot(
2463        &eta_sq_minus_one,
2464        second_coefficients_rt,
2465        first_coefficients_s,
2466        first_coefficients_u,
2467        &mut buf_a,
2468        &mut buf_b,
2469        &mut buf_c,
2470    );
2471    quad_coeff_term += conv_weighted_triple_dot(
2472        &eta_sq_minus_one,
2473        second_coefficients_ru,
2474        first_coefficients_s,
2475        first_coefficients_t,
2476        &mut buf_a,
2477        &mut buf_b,
2478        &mut buf_c,
2479    );
2480    quad_coeff_term += conv_weighted_triple_dot(
2481        &eta_sq_minus_one,
2482        second_coefficients_st,
2483        first_coefficients_r,
2484        first_coefficients_u,
2485        &mut buf_a,
2486        &mut buf_b,
2487        &mut buf_c,
2488    );
2489    quad_coeff_term += conv_weighted_triple_dot(
2490        &eta_sq_minus_one,
2491        second_coefficients_su,
2492        first_coefficients_r,
2493        first_coefficients_t,
2494        &mut buf_a,
2495        &mut buf_b,
2496        &mut buf_c,
2497    );
2498    quad_coeff_term += conv_weighted_triple_dot(
2499        &eta_sq_minus_one,
2500        second_coefficients_tu,
2501        first_coefficients_r,
2502        first_coefficients_s,
2503        &mut buf_a,
2504        &mut buf_b,
2505        &mut buf_c,
2506    );
2507
2508    // cubic_weight = 3·eta − eta³ (same as the prior expansion: eta_sq*eta
2509    // negated, plus the 3·eta linear correction).
2510    let mut eta_sq = [0.0_f64; 7];
2511    for (i, &eta_i) in eta.iter().enumerate() {
2512        for (j, &eta_j) in eta.iter().enumerate() {
2513            eta_sq[i + j] = eta_i.mul_add(eta_j, eta_sq[i + j]);
2514        }
2515    }
2516    let mut cubic_weight = [0.0_f64; 10];
2517    for (i, &eta_sq_i) in eta_sq.iter().enumerate() {
2518        for (j, &eta_j) in eta.iter().enumerate() {
2519            cubic_weight[i + j] = (-eta_sq_i).mul_add(eta_j, cubic_weight[i + j]);
2520        }
2521    }
2522    for (idx, &eta_coeff) in eta.iter().enumerate() {
2523        cubic_weight[idx] += 3.0 * eta_coeff;
2524    }
2525
2526    // quartic_coeff_term: cubic_weight ⊗ r ⊗ s ⊗ t ⊗ u · moments. The
2527    // original quintuple loop did 10·4·4·4·4 = 2560 mul-adds per call;
2528    // four sequential convolutions plus one moment dot drop this to
2529    // ~16+28+40+52+16 ≈ 152 mul-adds.
2530    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
2531    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
2532    let rstu_len = poly_conv_into(&buf_b[..rst_len], first_coefficients_u, &mut buf_a);
2533    let final_len = poly_conv_into(&cubic_weight, &buf_a[..rstu_len], &mut buf_b);
2534    let mut quartic_coeff_term = 0.0;
2535    for k in 0..final_len {
2536        quartic_coeff_term = buf_b[k].mul_add(moments[k], quartic_coeff_term);
2537    }
2538
2539    Ok((fourth_term - eta_linear_term + quad_coeff_term + quartic_coeff_term) * INV_TWO_PI)
2540}
2541
2542#[inline]
2543pub fn global_cubic_from_local(span: LocalSpanCubic) -> (f64, f64, f64, f64) {
2544    let left = span.left;
2545    let q0 = span.c0 - span.c1 * left + span.c2 * left * left - span.c3 * left * left * left;
2546    let q1 = span.c1 - 2.0 * span.c2 * left + 3.0 * span.c3 * left * left;
2547    let q2 = span.c2 - 3.0 * span.c3 * left;
2548    let q3 = span.c3;
2549    (q0, q1, q2, q3)
2550}
2551
2552/// Return the cubic polynomial coefficients (in `z`) of
2553/// `f(z) = link_span.evaluate(a + b*z)`.
2554///
2555/// `link_span.evaluate` is a cubic in its argument, so `f(z)` is also a cubic
2556/// in `z` and can be written exactly as
2557///
2558/// ```text
2559///     f(z) = d0 + d1·z + d2·z² + d3·z³
2560/// ```
2561///
2562/// where `(d0, d1, d2, d3)` are the values returned by this function. These
2563/// are **polynomial coefficients**, *not* derivatives of `f` at `z = 0`. The
2564/// relationship to Taylor derivatives is
2565///
2566/// ```text
2567///     d_k = f^(k)(0) / k!
2568/// ```
2569///
2570/// so `d0 = f(0)`, `d1 = f'(0)`, `d2 = ½·f''(0)`, `d3 = ⅙·f'''(0)`. Callers
2571/// such as [`denested_cell_coefficients`] and [`link_basis_cell_coefficients`]
2572/// rely on the polynomial-coefficient convention, since they propagate the
2573/// values directly as the `(c0, c1, c2, c3)` slots of a downstream polynomial
2574/// in `z`.
2575#[inline]
2576pub fn transformed_link_cubic(link_span: LocalSpanCubic, a: f64, b: f64) -> (f64, f64, f64, f64) {
2577    let shift = a - link_span.left;
2578    let d0 = link_span.c0
2579        + link_span.c1 * shift
2580        + link_span.c2 * shift * shift
2581        + link_span.c3 * shift * shift * shift;
2582    let d1 = b * (link_span.c1 + 2.0 * link_span.c2 * shift + 3.0 * link_span.c3 * shift * shift);
2583    let d2 = b * b * (link_span.c2 + 3.0 * link_span.c3 * shift);
2584    let d3 = link_span.c3 * b * b * b;
2585    (d0, d1, d2, d3)
2586}
2587
2588#[inline]
2589pub fn denested_cell_coefficients(
2590    score_span: LocalSpanCubic,
2591    link_span: LocalSpanCubic,
2592    a: f64,
2593    b: f64,
2594) -> [f64; 4] {
2595    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2596    let (d0, d1, d2, d3) = transformed_link_cubic(link_span, a, b);
2597    [a + b * h0 + d0, b + b * h1 + d1, b * h2 + d2, b * h3 + d3]
2598}
2599
2600#[inline]
2601pub fn denested_cell_coefficient_partials(
2602    score_span: LocalSpanCubic,
2603    link_span: LocalSpanCubic,
2604    a: f64,
2605    b: f64,
2606) -> ([f64; 4], [f64; 4]) {
2607    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2608    let shift = a - link_span.left;
2609    let alpha1 = link_span.c1;
2610    let alpha2 = link_span.c2;
2611    let alpha3 = link_span.c3;
2612    let dc_da = [
2613        1.0 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2614        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2615        3.0 * alpha3 * b * b,
2616        0.0,
2617    ];
2618    let dc_db = [
2619        h0,
2620        1.0 + h1 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2621        h2 + 2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2622        h3 + 3.0 * alpha3 * b * b,
2623    ];
2624    (dc_da, dc_db)
2625}
2626
2627#[inline]
2628fn link_cubic_second_partials(
2629    link_span: LocalSpanCubic,
2630    a: f64,
2631    b: f64,
2632) -> ([f64; 4], [f64; 4], [f64; 4]) {
2633    let shift = a - link_span.left;
2634    let alpha2 = link_span.c2;
2635    let alpha3 = link_span.c3;
2636    let dc_daa = [
2637        2.0 * alpha2 + 6.0 * alpha3 * shift,
2638        6.0 * alpha3 * b,
2639        0.0,
2640        0.0,
2641    ];
2642    let dc_dab = [
2643        0.0,
2644        2.0 * alpha2 + 6.0 * alpha3 * shift,
2645        6.0 * alpha3 * b,
2646        0.0,
2647    ];
2648    let dc_dbb = [
2649        0.0,
2650        0.0,
2651        2.0 * (alpha2 + 3.0 * alpha3 * shift),
2652        6.0 * alpha3 * b,
2653    ];
2654    (dc_daa, dc_dab, dc_dbb)
2655}
2656
2657#[inline]
2658pub fn denested_cell_second_partials(
2659    score_span: LocalSpanCubic,
2660    link_span: LocalSpanCubic,
2661    a: f64,
2662    b: f64,
2663) -> ([f64; 4], [f64; 4], [f64; 4]) {
2664    let score_left = score_span.left;
2665    if !score_left.is_finite() {
2666        return ([f64::NAN; 4], [f64::NAN; 4], [f64::NAN; 4]);
2667    }
2668    link_cubic_second_partials(link_span, a, b)
2669}
2670
2671#[inline]
2672fn link_cubic_third_partials(
2673    link_span: LocalSpanCubic,
2674) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2675    let alpha3 = link_span.c3;
2676    (
2677        [6.0 * alpha3, 0.0, 0.0, 0.0],
2678        [0.0, 6.0 * alpha3, 0.0, 0.0],
2679        [0.0, 0.0, 6.0 * alpha3, 0.0],
2680        [0.0, 0.0, 0.0, 6.0 * alpha3],
2681    )
2682}
2683
2684#[inline]
2685pub fn denested_cell_third_partials(
2686    link_span: LocalSpanCubic,
2687) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2688    link_cubic_third_partials(link_span)
2689}
2690
2691#[inline]
2692pub fn score_basis_cell_coefficients(score_basis_span: LocalSpanCubic, b: f64) -> [f64; 4] {
2693    let (h0, h1, h2, h3) = global_cubic_from_local(score_basis_span);
2694    [b * h0, b * h1, b * h2, b * h3]
2695}
2696
2697#[inline]
2698pub fn link_basis_cell_coefficients(link_basis_span: LocalSpanCubic, a: f64, b: f64) -> [f64; 4] {
2699    let (d0, d1, d2, d3) = transformed_link_cubic(link_basis_span, a, b);
2700    [d0, d1, d2, d3]
2701}
2702
2703#[inline]
2704pub fn link_basis_cell_coefficient_partials(
2705    link_basis_span: LocalSpanCubic,
2706    a: f64,
2707    b: f64,
2708) -> ([f64; 4], [f64; 4]) {
2709    let shift = a - link_basis_span.left;
2710    let alpha1 = link_basis_span.c1;
2711    let alpha2 = link_basis_span.c2;
2712    let alpha3 = link_basis_span.c3;
2713    let dc_da = [
2714        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2715        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2716        3.0 * alpha3 * b * b,
2717        0.0,
2718    ];
2719    let dc_db = [
2720        0.0,
2721        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2722        2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2723        3.0 * alpha3 * b * b,
2724    ];
2725    (dc_da, dc_db)
2726}
2727
2728#[inline]
2729pub fn link_basis_cell_second_partials(
2730    link_basis_span: LocalSpanCubic,
2731    a: f64,
2732    b: f64,
2733) -> ([f64; 4], [f64; 4], [f64; 4]) {
2734    link_cubic_second_partials(link_basis_span, a, b)
2735}
2736
2737#[inline]
2738pub fn link_basis_cell_third_partials(
2739    link_basis_span: LocalSpanCubic,
2740) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2741    link_cubic_third_partials(link_basis_span)
2742}
2743
2744pub fn build_denested_partition_cells<FS, FL>(
2745    a: f64,
2746    b: f64,
2747    score_breaks: &[f64],
2748    link_breaks: &[f64],
2749    score_span_at: FS,
2750    link_span_at: FL,
2751) -> Result<Vec<DenestedPartitionCell>, String>
2752where
2753    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
2754    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
2755{
2756    build_denested_partition_cells_with_tails(
2757        a,
2758        b,
2759        score_breaks,
2760        link_breaks,
2761        score_span_at,
2762        link_span_at,
2763    )
2764}
2765
2766/// Build a partition covering `(-∞, +∞)` with parameter-independent outer
2767/// bounds.  Interior cells use the same finite-cell polynomial algebra.
2768/// The two tail cells are guaranteed affine (c2=c3=0) because both
2769/// deviations saturate to constants outside their knot support.
2770///
2771/// The tail cells' score/link spans come from the same closures evaluated
2772/// at a representative point in the tail region — the closures must return
2773/// constant (c1=c2=c3=0) cubics for points outside support.
2774pub fn build_denested_partition_cells_with_tails<FS, FL>(
2775    a: f64,
2776    b: f64,
2777    score_breaks: &[f64],
2778    link_breaks: &[f64],
2779    mut score_span_at: FS,
2780    mut link_span_at: FL,
2781) -> Result<Vec<DenestedPartitionCell>, String>
2782where
2783    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
2784    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
2785{
2786    // Collect all INTERNAL split points (finite), each tagged with its
2787    // provenance: a fixed score break or a link-knot crossing. Provenance
2788    // identifies the cell's `(a, b)` family for the Chebyshev moment-family
2789    // layer; the z coordinates alone cannot distinguish the two kinds.
2790    let mut split_points: Vec<(f64, PartitionEdge)> = score_breaks
2791        .iter()
2792        .map(|&sigma| (sigma, PartitionEdge::Fixed(sigma)))
2793        .collect();
2794    if b.abs() > 1e-12 {
2795        for &tau in link_breaks {
2796            let z = (tau - a) / b;
2797            if z.is_finite() {
2798                split_points.push((z, PartitionEdge::Crossing { tau }));
2799            }
2800        }
2801    }
2802    dedup_sorted_tagged_breakpoints(&mut split_points);
2803
2804    let mut out = Vec::new();
2805
2806    if split_points.is_empty() {
2807        let score_span = score_span_at(0.0)?;
2808        let link_span = link_span_at(a)?;
2809        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2810        return Ok(vec![DenestedPartitionCell {
2811            cell: DenestedCubicCell {
2812                left: f64::NEG_INFINITY,
2813                right: f64::INFINITY,
2814                c0: coeffs[0],
2815                c1: coeffs[1],
2816                c2: 0.0,
2817                c3: 0.0,
2818            },
2819            score_span,
2820            link_span,
2821            left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2822            right_edge: PartitionEdge::Fixed(f64::INFINITY),
2823        }]);
2824    }
2825
2826    // ── Left tail cell: (-∞, leftmost_split] ──
2827    let (leftmost, leftmost_edge) = split_points[0];
2828    // Evaluate spans at a point just left of the leftmost split.  The
2829    // closures return constant tail cubics for this region.
2830    let left_probe = interval_probe_point(f64::NEG_INFINITY, leftmost)?;
2831    let left_score_span = score_span_at(left_probe)?;
2832    let left_link_span = link_span_at(a + b * left_probe)?;
2833    let left_coeffs = denested_cell_coefficients(left_score_span, left_link_span, a, b);
2834    if left_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2835        || left_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2836    {
2837        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2838            "left tail cell must be affine (deviations constant outside support), \
2839             got c2={:.3e}, c3={:.3e}",
2840            left_coeffs[2], left_coeffs[3]
2841        ))
2842        .into());
2843    }
2844    out.push(DenestedPartitionCell {
2845        cell: DenestedCubicCell {
2846            left: f64::NEG_INFINITY,
2847            right: leftmost,
2848            c0: left_coeffs[0],
2849            c1: left_coeffs[1],
2850            c2: 0.0,
2851            c3: 0.0,
2852        },
2853        score_span: left_score_span,
2854        link_span: left_link_span,
2855        left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2856        right_edge: leftmost_edge,
2857    });
2858
2859    // ── Interior cells (all finite) ──
2860    for window in split_points.windows(2) {
2861        let (left, left_edge) = window[0];
2862        let (right, right_edge) = window[1];
2863        if !left.is_finite() || !right.is_finite() || right - left <= 1e-12 {
2864            continue;
2865        }
2866        let mid = interval_probe_point(left, right)?;
2867        let score_span = score_span_at(mid)?;
2868        let link_span = link_span_at(a + b * mid)?;
2869        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2870        out.push(DenestedPartitionCell {
2871            cell: DenestedCubicCell {
2872                left,
2873                right,
2874                c0: coeffs[0],
2875                c1: coeffs[1],
2876                c2: coeffs[2],
2877                c3: coeffs[3],
2878            },
2879            score_span,
2880            link_span,
2881            left_edge,
2882            right_edge,
2883        });
2884    }
2885
2886    // ── Right tail cell: [rightmost_split, +∞) ──
2887    let (rightmost, rightmost_edge) = *split_points.last().unwrap();
2888    let right_probe = interval_probe_point(rightmost, f64::INFINITY)?;
2889    let right_score_span = score_span_at(right_probe)?;
2890    let right_link_span = link_span_at(a + b * right_probe)?;
2891    let right_coeffs = denested_cell_coefficients(right_score_span, right_link_span, a, b);
2892    if right_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2893        || right_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2894    {
2895        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2896            "right tail cell must be affine (deviations constant outside support), \
2897             got c2={:.3e}, c3={:.3e}",
2898            right_coeffs[2], right_coeffs[3]
2899        ))
2900        .into());
2901    }
2902    out.push(DenestedPartitionCell {
2903        cell: DenestedCubicCell {
2904            left: rightmost,
2905            right: f64::INFINITY,
2906            c0: right_coeffs[0],
2907            c1: right_coeffs[1],
2908            c2: 0.0,
2909            c3: 0.0,
2910        },
2911        score_span: right_score_span,
2912        link_span: right_link_span,
2913        left_edge: rightmost_edge,
2914        right_edge: PartitionEdge::Fixed(f64::INFINITY),
2915    });
2916
2917    Ok(out)
2918}
2919
2920#[inline]
2921pub fn normalized_non_affine_coefficients(
2922    left: f64,
2923    right: f64,
2924    c0: f64,
2925    c1: f64,
2926    c2: f64,
2927    c3: f64,
2928) -> Result<(f64, f64), String> {
2929    let width = right - left;
2930    if !width.is_finite() || width <= 0.0 {
2931        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2932            "normalized cubic coefficients require a positive finite cell width, got left={left}, right={right}"
2933        ))
2934        .into());
2935    }
2936    let anchor_scale = c0.abs() + c1.abs();
2937    if !anchor_scale.is_finite() {
2938        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2939            "normalized cubic coefficients require finite affine coefficients, got c0={c0}, c1={c1}"
2940        ))
2941        .into());
2942    }
2943    let mid = 0.5 * (left + right);
2944    let half = 0.5 * width;
2945    let k2 = half * half * (c2 + 3.0 * c3 * mid);
2946    let k3 = c3 * half * half * half;
2947    Ok((k2, k3))
2948}
2949
2950#[inline]
2951pub fn branch_cell(cell: DenestedCubicCell) -> Result<ExactCellBranch, String> {
2952    let tol = effective_branch_tol(cell);
2953    if !cell.left.is_finite() || !cell.right.is_finite() {
2954        if cell.c2.abs() <= tol && cell.c3.abs() <= tol {
2955            return Ok(ExactCellBranch::Affine);
2956        }
2957        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2958            "non-affine cells require finite bounds, got [{}, {}] with c2={:.6e}, c3={:.6e}",
2959            cell.left, cell.right, cell.c2, cell.c3
2960        ))
2961        .into());
2962    }
2963    let (k2, k3) = normalized_non_affine_coefficients(
2964        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
2965    )?;
2966    if k2.abs() <= tol && k3.abs() <= tol {
2967        Ok(ExactCellBranch::Affine)
2968    } else if k3.abs() <= tol {
2969        Ok(ExactCellBranch::Quartic)
2970    } else {
2971        Ok(ExactCellBranch::Sextic)
2972    }
2973}
2974
2975#[inline]
2976fn degenerate_sextic_branch(
2977    cell: DenestedCubicCell,
2978    lead: f64,
2979) -> Result<Option<ExactCellBranch>, String> {
2980    // The sextic recurrence divides by `lead = 3*c3^2`. When that division is
2981    // unstable, lower the polynomial degree without discarding a material
2982    // quadratic coefficient.
2983    let (normalized_k2, normalized_k3) = normalized_non_affine_coefficients(
2984        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
2985    )?;
2986    if normalized_k3.abs() > NORMALIZED_CELL_BRANCH_TOL && lead.abs() > 1e-18 {
2987        return Ok(None);
2988    }
2989    if normalized_k2.abs() > NORMALIZED_CELL_BRANCH_TOL {
2990        Ok(Some(ExactCellBranch::Quartic))
2991    } else {
2992        Ok(Some(ExactCellBranch::Affine))
2993    }
2994}
2995
2996#[inline]
2997fn validate_bvn_args(h: f64, k: f64, rho: f64) -> Result<(), String> {
2998    if !h.is_finite() && !h.is_infinite() {
2999        return Err(CubicCellKernelError::bivariate_normal_domain(
3000            "bivariate normal cdf requires finite or infinite h",
3001        )
3002        .into());
3003    }
3004    if !k.is_finite() && !k.is_infinite() {
3005        return Err(CubicCellKernelError::bivariate_normal_domain(
3006            "bivariate normal cdf requires finite or infinite k",
3007        )
3008        .into());
3009    }
3010    if !rho.is_finite() {
3011        return Err(CubicCellKernelError::bivariate_normal_domain(format!(
3012            "bivariate normal cdf requires finite correlation, got {rho}"
3013        ))
3014        .into());
3015    }
3016    Ok::<(), _>(())
3017}
3018
3019#[inline]
3020fn bvn_gl_sum(h: f64, k: f64, rho_clamped: f64, asr: f64) -> f64 {
3021    // The Drezner-Wesolowsky arcsin representation is integrated with the
3022    // same 20-point Gauss-Legendre rule as before, but mirrored node pairs are
3023    // evaluated with one sin_cos for the half-angle offset rather than two
3024    // independent sin calls.  This preserves the quadrature rule (and hence
3025    // the accuracy envelope) while reducing the transcendental work in the
3026    // dominant finite-bound path from 20 sin calls to 11 sin/cos evaluations.
3027    if rho_clamped == 0.0 {
3028        return 0.0;
3029    }
3030    let hs = 0.5 * (h * h + k * k);
3031    let hk = h * k;
3032    let half_asr = 0.5 * asr;
3033    let (sin_mid, cos_mid) = half_asr.sin_cos();
3034    let mut sum = 0.0;
3035    for i in 0..10 {
3036        let node = GL20_NODES[i].abs();
3037        let weight = GL20_WEIGHTS[i];
3038        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3039
3040        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3041        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3042        let expo_lo = ((sn_lo * hk) - hs) / one_minus_lo;
3043
3044        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3045        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3046        let expo_hi = ((sn_hi * hk) - hs) / one_minus_hi;
3047
3048        sum += weight * (expo_lo.exp() + expo_hi.exp());
3049    }
3050    sum
3051}
3052
3053pub fn bivariate_normal_cdf(h: f64, k: f64, rho: f64) -> Result<f64, String> {
3054    validate_bvn_args(h, k, rho)?;
3055    if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
3056        return Ok(0.0);
3057    }
3058    if h == f64::INFINITY {
3059        return Ok(normal_cdf(k));
3060    }
3061    if k == f64::INFINITY {
3062        return Ok(normal_cdf(h));
3063    }
3064
3065    let rho_clamped = rho.clamp(-1.0, 1.0);
3066    if rho_clamped >= 1.0 - 1e-12 {
3067        return Ok(normal_cdf(h.min(k)));
3068    }
3069    if rho_clamped <= -1.0 + 1e-12 {
3070        return Ok((normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0));
3071    }
3072    if rho_clamped == 0.0 {
3073        return Ok((normal_cdf(h) * normal_cdf(k)).clamp(0.0, 1.0));
3074    }
3075    if h == 0.0 && k == 0.0 {
3076        return Ok((0.25 + rho_clamped.asin() / std::f64::consts::TAU).clamp(0.0, 1.0));
3077    }
3078
3079    let asr = rho_clamped.asin();
3080    let sum = bvn_gl_sum(h, k, rho_clamped, asr);
3081    Ok((normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3082}
3083
3084#[inline]
3085fn bvn_gl_sum_interval(h: f64, left: f64, right: f64, rho_clamped: f64, asr: f64) -> f64 {
3086    if rho_clamped == 0.0 {
3087        return 0.0;
3088    }
3089    let h2 = h * h;
3090    let right_hs = 0.5 * (h2 + right * right);
3091    let left_hs = 0.5 * (h2 + left * left);
3092    let half_asr = 0.5 * asr;
3093    let (sin_mid, cos_mid) = half_asr.sin_cos();
3094    let mut sum = 0.0;
3095    for i in 0..10 {
3096        let node = GL20_NODES[i].abs();
3097        let weight = GL20_WEIGHTS[i];
3098        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3099
3100        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3101        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3102        let lo_right = (((sn_lo * h * right) - right_hs) / one_minus_lo).exp();
3103        let lo_left = (((sn_lo * h * left) - left_hs) / one_minus_lo).exp();
3104
3105        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3106        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3107        let hi_right = (((sn_hi * h * right) - right_hs) / one_minus_hi).exp();
3108        let hi_left = (((sn_hi * h * left) - left_hs) / one_minus_hi).exp();
3109
3110        sum += weight * ((lo_right - lo_left) + (hi_right - hi_left));
3111    }
3112    sum
3113}
3114
3115fn bivariate_normal_cdf_interval(h: f64, left: f64, right: f64, rho: f64) -> Result<f64, String> {
3116    if right <= left {
3117        return Ok(0.0);
3118    }
3119    if left == f64::NEG_INFINITY && right == f64::INFINITY {
3120        return Ok(normal_cdf(h));
3121    }
3122    if !left.is_finite() || !right.is_finite() {
3123        let upper = bivariate_normal_cdf(h, right, rho)?;
3124        let lower = bivariate_normal_cdf(h, left, rho)?;
3125        return Ok((upper - lower).clamp(0.0, 1.0));
3126    }
3127    validate_bvn_args(h, left, rho)?;
3128    validate_bvn_args(h, right, rho)?;
3129    if h == f64::NEG_INFINITY {
3130        return Ok(0.0);
3131    }
3132    if h == f64::INFINITY {
3133        return Ok((normal_cdf(right) - normal_cdf(left)).clamp(0.0, 1.0));
3134    }
3135
3136    let rho_clamped = rho.clamp(-1.0, 1.0);
3137    if rho_clamped >= 1.0 - 1e-12 || rho_clamped <= -1.0 + 1e-12 {
3138        let upper = bivariate_normal_cdf(h, right, rho_clamped)?;
3139        let lower = bivariate_normal_cdf(h, left, rho_clamped)?;
3140        return Ok((upper - lower).clamp(0.0, 1.0));
3141    }
3142
3143    let cdf_h = normal_cdf(h);
3144    let normal_part = cdf_h * (normal_cdf(right) - normal_cdf(left));
3145    if rho_clamped == 0.0 {
3146        return Ok(normal_part.clamp(0.0, 1.0));
3147    }
3148    let asr = rho_clamped.asin();
3149    let sum = bvn_gl_sum_interval(h, left, right, rho_clamped, asr);
3150    Ok((normal_part + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3151}
3152
/// `exp(-x²/2)`, with `±∞` mapped explicitly to `0.0`.
///
/// NaN is not infinite, so it takes the arithmetic path and yields NaN.
fn exp_neg_half_square(x: f64) -> f64 {
    if x.is_infinite() {
        return 0.0;
    }
    let exponent = -0.5 * x * x;
    exponent.exp()
}
3160
3161/// Zeroth truncated standard-normal moment `T_0(a, b) = ∫_a^b e^(−z²/2) dz
3162/// = √(2π)·(Φ(b) − Φ(a))`, evaluated without catastrophic cancellation in
3163/// either tail.
3164///
3165/// Writing `T_0 = √(π/2)·[erf(b/√2) − erf(a/√2)]`, the naive form collapses
3166/// to `0.0` whenever both endpoints lie in the *same* far tail: `erf`
3167/// saturates at the IEEE-754 values `±1.0` for `|x| ≳ 8.3·√2`, so the
3168/// difference of two saturated values is exactly zero even though the
3169/// integral is a strictly positive number well inside the f64 normal range
3170/// (e.g. `∫_{-12}^{-10} ≈ 1.9e-23`). The fix is to reduce the erf difference
3171/// to complementary tail probabilities — `erfc` is evaluated with a dedicated
3172/// tail series, *not* as `1 − erf` — and to pick, by the sign of the
3173/// endpoints, the algebraically-equivalent form whose terms do not cancel
3174/// against one another:
3175///
3176/// ```text
3177/// both ≥ 0 (upper tail):  erf(b/√2) − erf(a/√2) = erfc(a/√2) − erfc(b/√2)
3178/// both ≤ 0 (lower tail):  erf(b/√2) − erf(a/√2) = erfc(−b/√2) − erfc(−a/√2)
3179/// straddling zero:        erf(b/√2) − erf(a/√2) = 2 − erfc(b/√2) − erfc(−a/√2)
3180/// ```
3181///
3182/// In each branch every `erfc` argument is `≥ 0`, so the terms are small
3183/// positive tail values (or an O(1) constant minus two values `≤ 1`); no
3184/// large quantities cancel and full f64 precision survives down to the
3185/// underflow boundary in either tail. Infinite endpoints fall out via the
3186/// `erfc` limits (`erfc(+∞)=0`, `erfc(−∞)=2`) with no special casing.
3187///
3188/// Uses `libm::erfc` (msun double-precision implementation, ≤ 1 ulp) rather
3189/// than `statrs::function::erf::erfc` (a 6-term rational approximation that
3190/// carries ~3·10⁻¹¹ relative error around `|x| ≈ 1/√2` — see the existing
3191/// `libm::erfc` consumer at `inference::polya_gamma_core::normal_cdf`). That
3192/// statrs error propagates directly into `T_0`, then through every higher
3193/// moment `T_n` (the recurrence `T_n = a^{n-1}e^{-a²/2} − b^{n-1}e^{-b²/2}
3194/// + (n-1)·T_{n-2}` walks `T_0` up two steps at a time), then through every
3195/// affine-cell moment via `affine_anchor_moment_vector` (whose `out[n]` is a
3196/// linear combination of `T_0..=T_n`), and is the dominant source of error
3197/// in the affine-cell branch of the cubic-cell substrate (CPU/GPU parity
3198/// reference for transformation-normal, bernoulli-marginal-slope, and the
3199/// BMS flex-row higher-derivative reuse path).
3200fn truncated_gaussian_zeroth_moment(a: f64, b: f64) -> f64 {
3201    let inv_sqrt2 = 1.0 / std::f64::consts::SQRT_2;
3202    let za = a * inv_sqrt2;
3203    let zb = b * inv_sqrt2;
3204    let erf_diff = if za >= 0.0 {
3205        libm::erfc(za) - libm::erfc(zb)
3206    } else if zb <= 0.0 {
3207        libm::erfc(-zb) - libm::erfc(-za)
3208    } else {
3209        2.0 - libm::erfc(zb) - libm::erfc(-za)
3210    };
3211    // √(2π)·½ = √(π/2).
3212    (std::f64::consts::PI / 2.0).sqrt() * erf_diff
3213}
3214
3215/// Fill `out[0..=max_degree]` with the raw truncated standard-normal moments
3216///
3217/// ```text
3218/// T_n(a, b) = ∫_a^b z^n exp(-z²/2) dz
3219/// ```
3220///
3221/// using the integration-by-parts recurrence
3222///
3223/// ```text
3224/// T_0(a, b) = √(2π) (Φ(b) − Φ(a))
3225/// T_1(a, b) = exp(−a²/2) − exp(−b²/2)
3226/// T_n(a, b) = a^(n−1) e^{−a²/2} − b^(n−1) e^{−b²/2} + (n−1) T_{n−2}(a, b)
3227/// ```
3228///
3229/// Computed in one forward sweep so each call evaluates `erf` and
3230/// `exp(−x²/2)` exactly twice (once at `a`, once at `b`) regardless of the
3231/// requested degree. The naive form — calling `T_n` recursively for each
3232/// `n = 0..=max_degree` — re-evaluated `erf`/`exp` about `max_degree²/4`
3233/// times per affine cell, which dominated the wall time of the
3234/// transformation-normal and bernoulli-marginal-slope inner solves with
3235/// `max_degree = 64` (the transport order's required degree budget).
3236fn fill_truncated_gaussian_moments(a: f64, b: f64, out: &mut [f64]) {
3237    if out.is_empty() {
3238        return;
3239    }
3240    out[0] = truncated_gaussian_zeroth_moment(a, b);
3241    if out.len() == 1 {
3242        return;
3243    }
3244    let ea = exp_neg_half_square(a);
3245    let eb = exp_neg_half_square(b);
3246    out[1] = ea - eb;
3247    if out.len() == 2 {
3248        return;
3249    }
3250    let a_finite = a.is_finite();
3251    let b_finite = b.is_finite();
3252    // For n in 2..=max_degree we need a^{n-1} e^{-a²/2} (resp. b). Carry the
3253    // running powers a^{n-1}, b^{n-1} forward by a single multiply per step.
3254    // Infinite endpoints contribute 0 (the integrand decays at the rate of
3255    // exp(−x²/2)), matching the prior `is_infinite` branch in the recursive
3256    // implementation; we still update the running power so the iteration
3257    // stays branchless when both endpoints are finite.
3258    let mut a_pow_n_minus_1 = a; // a^1, used at n = 2
3259    let mut b_pow_n_minus_1 = b;
3260    for n in 2..out.len() {
3261        let left = if a_finite { a_pow_n_minus_1 * ea } else { 0.0 };
3262        let right = if b_finite { b_pow_n_minus_1 * eb } else { 0.0 };
3263        out[n] = left - right + (n as f64 - 1.0) * out[n - 2];
3264        a_pow_n_minus_1 *= a;
3265        b_pow_n_minus_1 *= b;
3266    }
3267}
3268
/// Largest `max_degree` accepted by `affine_anchor_moment_vector_into`, which
/// sizes its fixed stack scratch arrays as `MAX_AFFINE_ANCHOR_DEGREE + 1` and
/// asserts that the requested degree does not exceed this bound.
///
/// Per the original sizing note, public callers request up to about degree 24
/// (the largest being a degree-21 reduction in the bernoulli marginal-slope
/// outer step), so 64 leaves headroom at a small per-call stack cost.
const MAX_AFFINE_ANCHOR_DEGREE: usize = 64;
3274
3275pub fn affine_anchor_moment_vector(
3276    alpha: f64,
3277    beta: f64,
3278    left: f64,
3279    right: f64,
3280    max_degree: usize,
3281) -> Vec<f64> {
3282    let mut out = vec![0.0; max_degree + 1];
3283    affine_anchor_moment_vector_into(alpha, beta, left, right, max_degree, &mut out);
3284    out
3285}
3286
3287fn affine_anchor_moment_vector_into(
3288    alpha: f64,
3289    beta: f64,
3290    left: f64,
3291    right: f64,
3292    max_degree: usize,
3293    out: &mut [f64],
3294) {
3295    assert_eq!(out.len(), max_degree + 1);
3296    let s = (1.0 + beta * beta).sqrt();
3297    let mu = -alpha * beta / (1.0 + beta * beta);
3298    let y_left = if left.is_infinite() {
3299        if left.is_sign_positive() {
3300            f64::INFINITY
3301        } else {
3302            f64::NEG_INFINITY
3303        }
3304    } else {
3305        s * (left - mu)
3306    };
3307    let y_right = if right.is_infinite() {
3308        if right.is_sign_positive() {
3309            f64::INFINITY
3310        } else {
3311            f64::NEG_INFINITY
3312        }
3313    } else {
3314        s * (right - mu)
3315    };
3316    let anchor = (-alpha * alpha / (2.0 * s * s)).exp() / s;
3317    assert!(
3318        max_degree <= MAX_AFFINE_ANCHOR_DEGREE,
3319        "affine_anchor_moment_vector max_degree {} exceeds compile-time bound {}",
3320        max_degree,
3321        MAX_AFFINE_ANCHOR_DEGREE
3322    );
3323    let mut t = [0.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3324    fill_truncated_gaussian_moments(y_left, y_right, &mut t[..=max_degree]);
3325    // Build mu^k and s^{-k} tables once. The inner sum is the binomial
3326    // expansion of the affine change-of-variables, and computing the
3327    // binomial coefficient via Pascal's row recurrence + carrying mu/s
3328    // powers eliminates the per-(n, k) `powi` and binomial calls that
3329    // otherwise dominated the inner loop at large `max_degree`.
3330    let mut mu_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3331    for k in 1..=max_degree {
3332        mu_pow[k] = mu_pow[k - 1] * mu;
3333    }
3334    let inv_s = 1.0 / s;
3335    let mut inv_s_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3336    for k in 1..=max_degree {
3337        inv_s_pow[k] = inv_s_pow[k - 1] * inv_s;
3338    }
3339    out.fill(0.0);
3340    for n in 0..=max_degree {
3341        let mut acc = 0.0;
3342        // C(n, k+1) = C(n, k) · (n − k) / (k + 1).
3343        let mut binom = 1.0;
3344        for k in 0..=n {
3345            let term = binom * mu_pow[n - k] * inv_s_pow[k];
3346            acc = term.mul_add(t[k], acc);
3347            if k < n {
3348                binom = binom * (n - k) as f64 / (k + 1) as f64;
3349            }
3350        }
3351        out[n] = anchor * acc;
3352    }
3353}
3354
3355fn affine_value_from_moment_primitive(alpha: f64, beta: f64, left: f64, right: f64) -> f64 {
3356    // Exact formula via bivariate normal CDF.
3357    //
3358    // V(α,β,l,r) = ∫_l^r Φ(α+βz)φ(z)dz
3359    //            = P(U ≤ α+βZ, l ≤ Z ≤ r)    where U,Z iid N(0,1)
3360    //            = Φ₂(h, r; ρ) − Φ₂(h, l; ρ)
3361    //
3362    // with h = α/√(1+β²) and ρ = −β/√(1+β²).
3363    //
3364    // This is exact to floating-point precision via the high-accuracy
3365    // Drezner-Wesolowsky BVN routine, replacing the previous fixed 20-point
3366    // Gauss-Legendre numerical integration of the derivative primitive.
3367    let s = (1.0 + beta * beta).sqrt();
3368    let h = alpha / s;
3369    let rho = -beta / s;
3370    bivariate_normal_cdf_interval(h, left, right, rho).unwrap_or(0.0)
3371}
3372
3373/// Evaluate an affine cell (c2=c3=0) with a value/moment-consistent primitive.
3374///
3375/// Value and moments are now generated from the same affine moment primitive.
3376/// The zero-moment derivative is exact, and `value` is reconstructed by
3377/// integrating `d value / d alpha = INV_TWO_PI * moments[0]` over `alpha`
3378/// on a transformed semi-infinite domain.
3379pub fn evaluate_affine_cell_state(
3380    cell: DenestedCubicCell,
3381    max_degree: usize,
3382) -> Result<CellMomentState, String> {
3383    let alpha = cell.c0;
3384    let beta = cell.c1;
3385    let value = affine_value_from_moment_primitive(alpha, beta, cell.left, cell.right);
3386    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3387    Ok(CellMomentState {
3388        branch: ExactCellBranch::Affine,
3389        value,
3390        moments: moments.into(),
3391    })
3392}
3393
3394fn evaluate_affine_cell_derivative_state(
3395    cell: DenestedCubicCell,
3396    max_degree: usize,
3397) -> Result<CellDerivativeMomentState, String> {
3398    let alpha = cell.c0;
3399    let beta = cell.c1;
3400    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3401    Ok(CellDerivativeMomentState {
3402        branch: ExactCellBranch::Affine,
3403        moments: moments.into(),
3404    })
3405}
3406
/// Add `mw * z^k` to `moments[k]` for every index `k` of `moments`.
///
/// Each update is a fused multiply-add, and the power of `z` is carried
/// forward with one multiply per slot. Despite the historical "unrolled4"
/// name this is a plain scalar pass: per the original note, moment vectors
/// are short enough that manual unrolling brought no measurable gain.
#[inline]
fn accumulate_moments_unrolled4(moments: &mut [f64], mw: f64, z: f64) {
    // The fold state is the running power z^k; the final power is unused.
    let _ = moments.iter_mut().fold(1.0_f64, |z_pow, slot| {
        *slot = mw.mul_add(z_pow, *slot);
        z_pow * z
    });
}
3421
3422// Shared SIMD Gauss-Legendre core for non-affine cells. The const generic
3423// `COMPUTE_VALUE` selects whether the cell value integral
3424// `∫ φ(η(z)) · exp(-½z²) dz / √(2π)` is accumulated alongside the moments.
3425// Monomorphization collapses the const-generic branches at compile time, so
3426// `COMPUTE_VALUE = false` emits the moment-only path verbatim.
3427//
3428// Single source of truth for the moment SIMD lane ordering, the Horner-with-FMA
3429// pattern for η(z), the `0.5 * (z² + η²)` quadratic-form evaluation order, the
3430// unscaled per-node GL moment weights, the post-loop half-width fold, and the
3431// per-lane `accumulate_moments_unrolled4` call. The previous duplicated code paths
3432// drifted by 1 ULP whenever any of these details diverged; here both paths
3433// share the same instructions, eliminating an entire class of regressions
3434// where a tweak to the quadrature order or the FMA pattern would silently
3435// re-introduce divergence between the value- and derivative-only callers.
3436//
3437// Gauss-Legendre on [left, right] converges geometrically for the analytic
3438// integrand exp(-q(z)) with quartic/sextic q on a bounded cell; the prior
3439// adaptive transport path expanded basis_moments via the forward 3-/5-step
3440// recurrences in reduce_quartic/sextic_moments, which amplify roundoff by
3441// (1/lead)^n with lead = 2c2²/3c3² and overflow to NaN for small c2/c3 cells
3442// that arise naturally in production.
3443//
3444// The fixed 384-node rule that replaced the transport path is accurate but
3445// pays ~384 exp evaluations per cell unconditionally. Production cells are
3446// narrow spline-knot subdivisions where a 12- or 24-node rule is already
3447// converged to machine precision, and the flex marginal-slope row calculus
3448// evaluates O(100) such cells per row across n=10⁵–10⁶ rows per criterion
3449// evaluation — the fixed rule was the dominant cost of the whole fit (#979).
3450// `evaluate_non_affine_cell_simd` therefore walks a progressive ladder of
3451// rules (12, 24, 48, 96, 192, 384 nodes) and returns as soon as two
3452// consecutive rules agree to `NON_AFFINE_LADDER_RTOL` relative to the moment
3453// vector's own scale. Unlike the old fixed rule — whose error was real but
3454// uncertified — every accepted ladder result carries an embedded two-rule
3455// agreement certificate; a cell that never certifies falls through to the
3456// same 384-node answer the fixed rule produced.
3457//
// SIMD path: process 4 GL nodes per outer iteration, batching the four
// per-node `exp(-q)` evaluations into a single 4-wide `wide::f64x4::exp`
// invocation. All ladder rule sizes are divisible by 4, so the scalar tail
// loop never runs for the GL sweep; it is kept only for rules whose length
// is not a multiple of 4. The inner moment accumulation then runs scalar per
// lane through `accumulate_moments_unrolled4`, which (despite its historical
// name) is a plain sequential loop over the moment slots.
/// Evaluates one Gauss-Legendre rule on a finite non-affine cell.
///
/// Each rule node is mapped from `[-1, 1]` onto `[cell.left, cell.right]`.
/// The sweep accumulates the moment sums
/// `Σ wᵢ · exp(-½(zᵢ² + η(zᵢ)²)) · zᵢᵏ` for `k = 0..=max_degree` and, when
/// `COMPUTE_VALUE` is `true`, the value sum
/// `Σ wᵢ · exp(-½zᵢ²) · normal_cdf(η(zᵢ))`, with
/// `η(z) = c0 + c1·z + c2·z² + c3·z³`. Both sums are multiplied by the cell
/// half-width exactly once after the sweep.
///
/// * `cell` — supplies the bounds `left`/`right` and the coefficients
///   `c0..c3`.
/// * `max_degree` — highest moment power; the returned vector has
///   `max_degree + 1` slots.
/// * `gl_nodes` / `gl_weights` — nodes and weights of the rule. Nodes are
///   consumed four at a time by the SIMD loop; any remainder goes through the
///   scalar tail loop.
///
/// Returns `(moments, value)`. `value` is `0.0` when `COMPUTE_VALUE` is
/// `false`.
///
/// # Panics
///
/// Panics if `gl_nodes` and `gl_weights` have different lengths.
#[inline(always)]
fn evaluate_non_affine_cell_with_rule<const COMPUTE_VALUE: bool>(
    cell: DenestedCubicCell,
    max_degree: usize,
    gl_nodes: &[f64],
    gl_weights: &[f64],
) -> (CellMomentVec, f64) {
    let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
    let mut value_integral = 0.0_f64;
    // Affine map from the reference interval [-1, 1] onto the cell:
    // z = center + half_width * node.
    let center = 0.5 * (cell.left + cell.right);
    let half_width = 0.5 * (cell.right - cell.left);
    let c0 = cell.c0;
    let c1 = cell.c1;
    let c2 = cell.c2;
    let c3 = cell.c3;
    let moments_slice: &mut [f64] = &mut moments;
    assert_eq!(gl_nodes.len(), gl_weights.len());
    use wide::f64x4;
    // Broadcast the per-cell scalars once so the SIMD loop body only loads
    // nodes and weights.
    let center_v = f64x4::splat(center);
    let half_width_v = f64x4::splat(half_width);
    let c0_v = f64x4::splat(c0);
    let c1_v = f64x4::splat(c1);
    let c2_v = f64x4::splat(c2);
    let c3_v = f64x4::splat(c3);
    let neg_half_v = f64x4::splat(-0.5);
    let n_total = gl_nodes.len();
    // Largest multiple of 4 not exceeding the node count; the remaining
    // nodes (if any) are handled by the scalar tail loop below.
    let n_simd = n_total - (n_total % 4);
    let mut i = 0;
    while i < n_simd {
        let node_v = f64x4::from([
            gl_nodes[i],
            gl_nodes[i + 1],
            gl_nodes[i + 2],
            gl_nodes[i + 3],
        ]);
        let weight_v = f64x4::from([
            gl_weights[i],
            gl_weights[i + 1],
            gl_weights[i + 2],
            gl_weights[i + 3],
        ]);
        // Fused node map for the moment path.
        let z_v = half_width_v.mul_add(node_v, center_v);
        // Horner: ((c3*z + c2)*z + c1)*z + c0
        let eta_v = c3_v
            .mul_add(z_v, c2_v)
            .mul_add(z_v, c1_v)
            .mul_add(z_v, c0_v);
        let z2_v = z_v * z_v;
        // -q(z) = -½ (z² + η²); one 4-wide exp covers all four nodes.
        let neg_q_v = neg_half_v * (z2_v + eta_v * eta_v);
        let exp_negq_v = neg_q_v.exp();
        let moment_weight_v = weight_v * exp_negq_v;
        // Spill the lanes so the moment accumulation can run scalar per node.
        let z_arr = z_v.to_array();
        let mw_arr = moment_weight_v.to_array();
        if COMPUTE_VALUE {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
                // The value integrand carries Φ(η)'s erfc, whose systematic
                // per-z error is ~1e-13. To honor the cell-value accuracy
                // contract the value term must be assembled bit-for-bit like
                // the scalar reference: a non-fused node map
                // `z_ref = center + half_width·node`, the expanded
                // `η = c0 + c1·z + c2·z² + c3·z³` (NOT the SIMD Horner-FMA used
                // for the moments), the unscaled GL weight, a scalar `exp(-½z²)`,
                // and a plain `+=`. The SIMD `z_v`/`eta_v` above (fused) feed
                // ONLY the moments and are left untouched. Any single ULP slip
                // here (FMA node map, Horner η, per-term half_width, SIMD exp,
                // FMA accumulation) drifts the 384-node sum by ~1.4e-13 and
                // breaks the contract.
                let node = gl_nodes[i + lane];
                let weight = gl_weights[i + lane];
                let z_ref = center + half_width * node;
                let eta_ref = c0 + c1 * z_ref + c2 * z_ref * z_ref + c3 * z_ref * z_ref * z_ref;
                value_integral += weight * (-0.5 * z_ref * z_ref).exp() * normal_cdf(eta_ref);
            }
        } else {
            // Moment-only path: same per-lane accumulation, no value term.
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
            }
        }
        i += 4;
    }
    // Scalar tail for rules whose node count is not a multiple of 4.
    while i < n_total {
        let node = gl_nodes[i];
        let weight = gl_weights[i];
        let z = center + half_width * node;
        let eta = c3.mul_add(z, c2).mul_add(z, c1).mul_add(z, c0);
        let q = 0.5 * (z * z + eta * eta);
        let moment_weight = weight * (-q).exp();
        accumulate_moments_unrolled4(moments_slice, moment_weight, z);
        if COMPUTE_VALUE {
            // Bit-for-bit the reference value structure (see SIMD branch): the
            // node map `z = center + half_width·node` here already matches the
            // reference (non-fused), but η must use the expanded reference form
            // rather than the moment path's Horner-FMA.
            let eta_ref = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
            value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta_ref);
        }
        i += 1;
    }
    // Apply the cell half-width to both moment and value integrals ONCE at the
    // end, mirroring the prefold reference. Folding half_width per-term changes
    // f64 rounding enough to show up at the 1e-13 contract.
    for moment in moments_slice.iter_mut() {
        *moment *= half_width;
    }
    // Without COMPUTE_VALUE the accumulator was never touched and stays 0.0.
    let value = if COMPUTE_VALUE {
        value_integral * half_width
    } else {
        value_integral
    };
    (moments, value)
}
3580
/// Relative agreement threshold for the progressive non-affine quadrature
/// ladder: two consecutive Gauss-Legendre rules must agree on every moment
/// slot to this tolerance, relative to the moment vector's own max magnitude,
/// before the finer rule's result is accepted. Gauss-Legendre error decays
/// geometrically in the node count for the analytic integrand `exp(-q(z))`,
/// so agreement between an n-node and a 2n-node rule certifies that both are
/// converged: the coarse rule's true error is bounded by the observed
/// difference plus the (much smaller) fine-rule error.
///
/// History (#979): a roundoff-floor relaxation of this test (accept when
/// successive rungs agree to `≈ n·ε·scale` rather than the bare `3e-15`) was
/// tried to let smooth cells certify below the terminal 384-node rung. It was
/// reverted: the value-bearing path carries `∫ φ(z)·Φ(η(z)) dz`, and `Φ`'s
/// `erfc` implementation has a *systematic per-z* error of order `1e-13` that
/// each rung's node set samples differently. Only the exact 384-node rule
/// reproduces the reference's erfc-noise realization, so any sub-384 rung
/// drifts from the 384 value by `≈ 1e-13` — a drift that is NOT truncation,
/// does NOT shrink with rung, and is NOT bounded by rung-to-rung agreement.
/// The moment ladder remains independent of the value integral so value- and
/// derivative-only evaluators keep returning bit-identical moments. The scalar
/// value now evaluates on the terminal 384-node rule directly, preserving the
/// `non_affine_cell_state_matches_prefold_reference_to_1e_minus_13` value
/// contract without forcing every derivative-moment caller to use the terminal
/// rung.
///
/// NOTE(review): the history paragraph cites a bare `3e-15` threshold while
/// the constant below is `1e-15` — confirm which figure is current.
const NON_AFFINE_LADDER_RTOL: f64 = 1e-15;
3607
/// Node counts of the progressive ladder below the 384-node terminal rung.
/// All divisible by 4 so the SIMD sweep needs no scalar tail. Each rung
/// doubles the node count of the one before it, and the array length also
/// sizes the certification histogram (`NON_AFFINE_LADDER_CERT_COUNTS` has one
/// extra slot for the terminal rule).
const NON_AFFINE_LADDER_RUNGS: [usize; 5] = [12, 24, 48, 96, 192];
3611
3612/// Runtime-generated Gauss-Legendre rules for the ladder rungs, computed
3613/// once per process by Newton iteration on the Legendre polynomial roots
3614/// (standard `gauleg`: cosine initial guess, 3-4 Newton steps to machine
3615/// precision). The terminal 384-node rung reuses the compile-time
3616/// `GL_NODES`/`GL_WEIGHTS` tables, which also remain the single source for
3617/// the GPU kernel.
3618fn non_affine_ladder_rules() -> &'static [(Vec<f64>, Vec<f64>)] {
3619    static RULES: std::sync::OnceLock<Vec<(Vec<f64>, Vec<f64>)>> = std::sync::OnceLock::new();
3620    RULES.get_or_init(|| {
3621        NON_AFFINE_LADDER_RUNGS
3622            .iter()
3623            .map(|&n| gauss_legendre_rule(n))
3624            .collect()
3625    })
3626}
3627
/// Builds the `n`-point Gauss-Legendre rule on `[-1, 1]`.
///
/// Returns `(nodes, weights)`, each of length `n`, with the nodes in
/// ascending order. Only the non-negative half of the roots is solved for:
/// root `k` starts from the cosine guess `cos(π(k + 0.75)/(n + 0.5))` and is
/// refined by Newton iteration on `P_n` (at most 100 steps, stopping once a
/// step moves the root by no more than `f64::EPSILON`). Each root is mirrored
/// to `-root`, so the node set is exactly antisymmetric about 0 and mirrored
/// nodes share one weight `2 / ((1 - x²) · P_n'(x)²)`. `n == 0` yields two
/// empty vectors.
fn gauss_legendre_rule(n: usize) -> (Vec<f64>, Vec<f64>) {
    // Three-term Legendre recurrence; returns (P_n(x), P_{n-1}(x)).
    let legendre_pair = |x: f64| -> (f64, f64) {
        let mut current = 1.0_f64;
        let mut previous = 0.0_f64;
        for order in 1..=n {
            let older = previous;
            previous = current;
            current =
                ((2 * order - 1) as f64 * x * previous - (order - 1) as f64 * older) / order as f64;
        }
        (current, previous)
    };

    let mut nodes = vec![0.0_f64; n];
    let mut weights = vec![0.0_f64; n];
    let half_count = n.div_ceil(2);
    for k in 0..half_count {
        let mirror = n - 1 - k;
        let mut root = (std::f64::consts::PI * (k as f64 + 0.75) / (n as f64 + 0.5)).cos();
        // Derivative P_n'(x) at the point the last Newton step started from.
        let mut slope = 0.0_f64;
        for _step in 0..100 {
            let (p_n, p_prev) = legendre_pair(root);
            slope = n as f64 * (root * p_n - p_prev) / (root * root - 1.0);
            let refined = root - p_n / slope;
            let moved = (refined - root).abs();
            root = refined;
            if moved <= f64::EPSILON {
                break;
            }
        }
        // Write the negative root first: for odd `n` the middle index equals
        // its own mirror, and the second store leaves the non-negated root.
        nodes[k] = -root;
        nodes[mirror] = root;
        let weight = 2.0 / ((1.0 - root * root) * slope * slope);
        weights[k] = weight;
        weights[mirror] = weight;
    }
    (nodes, weights)
}
3664
3665/// Two-rule agreement certificate for the progressive ladder. `true` when
3666/// every MOMENT slot agrees to `NON_AFFINE_LADDER_RTOL` relative to the fine
3667/// result's max magnitude. Non-finite results never certify, so they fall
3668/// through to the terminal 384-node rung and reproduce the fixed rule's
3669/// behavior exactly.
3670///
3671/// The decision is deliberately moment-only and independent of whether the
3672/// caller also computed the cell value: the value- and derivative-only
3673/// evaluators MUST select the same ladder rung so they accumulate the moment
3674/// vector over the same nodes and return bit-identical moments (the
3675/// `derivative_moment_evaluator_matches_value_evaluator_moments` invariant).
3676/// Value-bearing callers evaluate the scalar cell probability separately on
3677/// the terminal 384-node rule; this certificate governs only the reusable
3678/// derivative moment vector.
3679fn non_affine_ladder_converged(coarse: &CellMomentVec, fine: &CellMomentVec) -> bool {
3680    let mut scale = 0.0_f64;
3681    let mut err = 0.0_f64;
3682    for (&c, &f) in coarse.iter().zip(fine.iter()) {
3683        scale = scale.max(f.abs());
3684        err = err.max((c - f).abs());
3685    }
3686    if !(scale.is_finite() && err.is_finite()) {
3687        return false;
3688    }
3689    err <= NON_AFFINE_LADDER_RTOL * scale
3690}
3691
3692/// Per-rung certification histogram for the non-affine ladder, indexed by the
3693/// rung that certified (`NON_AFFINE_LADDER_RUNGS[i]` at index `i`), with the
3694/// final slot counting cells that fell through to the terminal 384-node rule.
3695/// Incremented once per non-affine cell evaluation; the BMS exact-cache build
3696/// logs the distribution so the ladder's real cost (early-certify win vs.
3697/// terminal-fallthrough cost) is observable on every large-scale fit rather
3698/// than assumed. `+1` length for the terminal bucket.
3699pub(crate) static NON_AFFINE_LADDER_CERT_COUNTS: [AtomicU64; NON_AFFINE_LADDER_RUNGS.len() + 1] = [
3700    AtomicU64::new(0),
3701    AtomicU64::new(0),
3702    AtomicU64::new(0),
3703    AtomicU64::new(0),
3704    AtomicU64::new(0),
3705    AtomicU64::new(0),
3706];
3707
3708/// Snapshot the ladder certification histogram as `(rung_node_count, count)`
3709/// pairs plus the terminal-fallthrough count, for logging/inspection.
3710pub fn non_affine_ladder_cert_histogram() -> (Vec<(usize, u64)>, u64) {
3711    let per_rung = NON_AFFINE_LADDER_RUNGS
3712        .iter()
3713        .enumerate()
3714        .map(|(i, &n)| (n, NON_AFFINE_LADDER_CERT_COUNTS[i].load(Ordering::Relaxed)))
3715        .collect();
3716    let terminal =
3717        NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].load(Ordering::Relaxed);
3718    (per_rung, terminal)
3719}
3720
3721/// Progressive-ladder evaluation of a non-affine cell: walk the rule ladder
3722/// from 12 nodes upward and return the first result certified by two-rule
3723/// agreement; a cell that never certifies returns the terminal 384-node
3724/// result, byte-identical to the previous fixed-rule implementation.
3725#[inline]
3726fn evaluate_non_affine_cell_simd<const COMPUTE_VALUE: bool>(
3727    cell: DenestedCubicCell,
3728    max_degree: usize,
3729) -> (CellMomentVec, f64) {
3730    let mut prev: Option<(CellMomentVec, f64)> = None;
3731    for (i, (nodes, weights)) in non_affine_ladder_rules().iter().enumerate() {
3732        let cur =
3733            evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, nodes, weights);
3734        if let Some(prev) = prev.as_ref()
3735            && non_affine_ladder_converged(&prev.0, &cur.0)
3736        {
3737            NON_AFFINE_LADDER_CERT_COUNTS[i].fetch_add(1, Ordering::Relaxed);
3738            return cur;
3739        }
3740        prev = Some(cur);
3741    }
3742    NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].fetch_add(1, Ordering::Relaxed);
3743    evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, &GL_NODES, &GL_WEIGHTS)
3744}
3745
3746/// Value-only evaluation of a non-affine cell on the terminal 384-node rule.
3747///
3748/// Returns the cell probability integral `∫ exp(-½z²)·Φ(η(z)) dz` (pre the
3749/// `1/√τ` normalization) computed bit-for-bit like the value branch of
3750/// [`evaluate_non_affine_cell_with_rule`]: the non-fused node map
3751/// `z = center + half_width·node`, the expanded (non-Horner)
3752/// `η = c0 + c1·z + c2·z² + c3·z³`, the unscaled GL weight, a scalar
3753/// `exp(-½z²)`, a plain `+=` in ascending node order, and a single trailing
3754/// `·half_width`. The terminal rule has 384 nodes (divisible by 4), so the
3755/// general kernel's value path never takes its scalar tail — this loop walks
3756/// the same nodes in the same order and therefore reproduces the reference
3757/// erfc-noise realization the `1e-13` value contract pins down.
3758///
3759/// Computing this through `evaluate_non_affine_cell_with_rule::<true>` at
3760/// `max_degree = 0` would additionally run the 4-wide SIMD `exp(-q)` moment
3761/// sweep and a moment accumulation on every node only to discard the moment
3762/// vector. The survival marginal-slope fit evaluates a value per non-affine
3763/// partition cell, so that discarded moment work is the dominant waste in the
3764/// per-cell pass; this evaluator does only the work the value needs.
3765fn evaluate_non_affine_cell_value_terminal(cell: DenestedCubicCell) -> f64 {
3766    let center = 0.5 * (cell.left + cell.right);
3767    let half_width = 0.5 * (cell.right - cell.left);
3768    let c0 = cell.c0;
3769    let c1 = cell.c1;
3770    let c2 = cell.c2;
3771    let c3 = cell.c3;
3772    let mut value_integral = 0.0_f64;
3773    for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
3774        let z = center + half_width * node;
3775        let eta = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
3776        value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
3777    }
3778    value_integral * half_width
3779}
3780
3781fn evaluate_non_affine_cell_state(
3782    cell: DenestedCubicCell,
3783    branch: ExactCellBranch,
3784    max_degree: usize,
3785) -> Result<CellMomentState, String> {
3786    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3787    let value_integral = evaluate_non_affine_cell_value_terminal(cell);
3788    // Reference structure: `value_integral * half_width / sqrt(TAU)`. The
3789    // half_width factor is already applied inside the rule evaluator, so divide
3790    // by sqrt(TAU) here (a true division, NOT multiply-by-reciprocal) to
3791    // reproduce the reference's final rounding bit-for-bit.
3792    Ok(CellMomentState {
3793        branch,
3794        value: value_integral / (std::f64::consts::TAU).sqrt(),
3795        moments,
3796    })
3797}
3798
3799fn evaluate_non_affine_cell_derivative_state(
3800    cell: DenestedCubicCell,
3801    branch: ExactCellBranch,
3802    max_degree: usize,
3803) -> Result<CellDerivativeMomentState, String> {
3804    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3805    Ok(CellDerivativeMomentState { branch, moments })
3806}
3807
3808/// De-nested cubic cell evaluator.
3809///
3810/// Affine cells use the closed-form affine anchor; non-affine cells (Quartic
3811/// and Sextic branches) are evaluated in a single pass over a fixed
3812/// high-order Gauss-Legendre rule on `[left, right]`.
3813pub fn evaluate_cell_moments(
3814    cell: DenestedCubicCell,
3815    max_degree: usize,
3816) -> Result<CellMomentState, String> {
3817    if !TAIL_CELL_MOMENT_CACHE_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
3818        return evaluate_cell_moments_uncached(cell, max_degree);
3819    }
3820    tail_cell_moment_cache().evaluate(cell, max_degree)
3821}
3822
3823/// Evaluate cell moments without consulting the global affine-tail memo.
3824///
3825/// This is retained for regression tests and before/after microbenchmarks;
3826/// production callers should use [`evaluate_cell_moments`].
3827pub fn evaluate_cell_moments_uncached(
3828    cell: DenestedCubicCell,
3829    max_degree: usize,
3830) -> Result<CellMomentState, String> {
3831    evaluate_cell_state_dispatched(
3832        cell,
3833        max_degree,
3834        evaluate_affine_cell_state,
3835        evaluate_non_affine_cell_state,
3836    )
3837}
3838
3839/// Evaluate only the moment vector needed by derivative contractions.
3840///
3841/// This deliberately does not compute the cell probability value
3842/// `∫ φ(z) Φ(η(z)) dz`. Derivative contractions consume
3843/// `∫ z^k exp(-q(z)) dz` moments only, so keeping the value out of the return
3844/// type prevents this cheaper evaluator from satisfying value-bearing calls.
3845pub fn evaluate_cell_derivative_moments_uncached(
3846    cell: DenestedCubicCell,
3847    max_degree: usize,
3848) -> Result<CellDerivativeMomentState, String> {
3849    evaluate_cell_state_dispatched(
3850        cell,
3851        max_degree,
3852        evaluate_affine_cell_derivative_state,
3853        evaluate_non_affine_cell_derivative_state,
3854    )
3855}
3856
3857/// Shared branch dispatch for the value-bearing and derivative-only cell
3858/// evaluators. Both walk the same decision tree (semi-infinite tail → must
3859/// be affine; finite cell → branch-by-coefficients with the sextic
3860/// degenerate-lowering path), differing only in which pair of
3861/// `(affine, non_affine)` evaluator helpers to delegate to.  The two helpers
3862/// are passed as `fn` pointers so the dispatch monomorphizes per `S` and
3863/// keeps the existing pre-condition errors / unreachable branch handling
3864/// in lockstep across both evaluators.
3865fn evaluate_cell_state_dispatched<S>(
3866    cell: DenestedCubicCell,
3867    max_degree: usize,
3868    affine: fn(DenestedCubicCell, usize) -> Result<S, String>,
3869    non_affine: fn(DenestedCubicCell, ExactCellBranch, usize) -> Result<S, String>,
3870) -> Result<S, String> {
3871    let left_inf = !cell.left.is_finite();
3872    let right_inf = !cell.right.is_finite();
3873    if left_inf || right_inf {
3874        // Semi-infinite tail cells must be affine: the deviation saturates
3875        // to a constant outside support, so c2=c3=0.  Both the BVN CDF
3876        // and the truncated-Gaussian moment vector handle infinite bounds.
3877        if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL
3878        {
3879            return Err(CubicCellKernelError::invalid_cell_shape(format!(
3880                "semi-infinite cell [{}, {}] must be affine (c2=c3=0), got c2={:.3e}, c3={:.3e}",
3881                cell.left, cell.right, cell.c2, cell.c3
3882            ))
3883            .into());
3884        }
3885        return affine(cell, max_degree);
3886    }
3887    if cell.right <= cell.left {
3888        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3889            "finite cell must have left < right, got [{}, {}]",
3890            cell.left, cell.right
3891        ))
3892        .into());
3893    }
3894    let branch = branch_cell(cell)?;
3895    if branch == ExactCellBranch::Affine {
3896        return affine(cell, max_degree);
3897    }
3898    if branch == ExactCellBranch::Sextic {
3899        let lead = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3)[5];
3900        if !lead.is_finite() {
3901            return Err(CubicCellKernelError::invalid_cell_shape(format!(
3902                "sextic cell evaluation encountered non-finite leading coefficient: {lead:.3e}"
3903            ))
3904            .into());
3905        }
3906        if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
3907            return match lower_branch {
3908                ExactCellBranch::Quartic => non_affine(
3909                    DenestedCubicCell { c3: 0.0, ..cell },
3910                    ExactCellBranch::Quartic,
3911                    max_degree,
3912                ),
3913                ExactCellBranch::Affine => affine(
3914                    DenestedCubicCell {
3915                        c2: 0.0,
3916                        c3: 0.0,
3917                        ..cell
3918                    },
3919                    max_degree,
3920                ),
3921                ExactCellBranch::Sextic => Err(CubicCellKernelError::invalid_cell_shape(
3922                    "internal: degenerate_sextic_branch returned Sextic as a lowered branch",
3923                )
3924                .into()),
3925            };
3926        }
3927    }
3928    non_affine(cell, branch, max_degree)
3929}
3930
3931/// Evaluate a de-nested cubic cell through a fit-lifetime byte-limited LRU cache.
3932///
3933/// The fingerprint is an exact bit-cast of `(c0, c1, c2, c3, left, right)`, so
3934/// eviction and reuse cannot alias nearby-but-different cells.  A cached entry
3935/// computed to a higher degree may satisfy a lower-degree request by truncating
3936/// the moment vector, preserving the public [`evaluate_cell_moments`] contract.
3937pub fn evaluate_cell_moments_cached(
3938    cell: DenestedCubicCell,
3939    max_degree: usize,
3940    cache: &CellMomentLruCache,
3941    stats: Option<&CellMomentCacheStats>,
3942) -> Result<CellMomentState, String> {
3943    // Affine cells (every rigid-path cell and every tail cell) evaluate
3944    // through the closed-form anchor — cheaper than a single LRU probe. The
3945    // LRU exists only to amortize the EXPENSIVE non-affine transport across
3946    // recurring cells; at large n the row scalars `(a, b)` are unique per
3947    // row, so affine cells never recur and routing them through the sharded
3948    // mutex was pure cost (320k lock+insert+evict ops per gradient eval, ~0%
3949    // hit — the dominant cost of the rigid n=320k fit, #979). Bypass the
3950    // cache entirely for them.
3951    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
3952        if let Some(stats) = stats {
3953            stats.misses.fetch_add(1, Ordering::Relaxed);
3954        }
3955        return evaluate_cell_moments_uncached(cell, max_degree);
3956    }
3957    let key = CellFingerprint::new(cell);
3958    let existing_derivative = match cache.get(&key) {
3959        Some(cached) => {
3960            if let Some(state) = cached.state_for_degree(max_degree) {
3961                if let Some(stats) = stats {
3962                    stats.hits.fetch_add(1, Ordering::Relaxed);
3963                }
3964                return Ok(state);
3965            }
3966            // `cached.derivative_state` is `Option<Arc<_>>`; `.clone()` here
3967            // is the cheap refcount bump the audit-39 fix targets, not a
3968            // full moment-vector deep clone.
3969            cached.derivative_state.clone()
3970        }
3971        None => None,
3972    };
3973    if let Some(stats) = stats {
3974        stats.misses.fetch_add(1, Ordering::Relaxed);
3975    }
3976    let state = evaluate_cell_moments(cell, max_degree)?;
3977    // Wrap the freshly-computed state in `Arc` once, share it with the cache
3978    // through `Arc::clone`, and return the underlying value by unwrapping the
3979    // unique-reference (caller-side) `Arc`. This replaces the prior
3980    // `state.clone()` deep copy at the insert site.
3981    let shared = Arc::new(state);
3982    let mut entry = CachedCellMoments::new(Arc::clone(&shared));
3983    if let Some(derivative) = existing_derivative {
3984        entry = entry.with_derivative(derivative);
3985    }
3986    cache.insert(key, entry);
3987    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
3988}
3989
3990/// Derivative-moment counterpart to [`evaluate_cell_moments_cached`]. Shares
3991/// the value-moment LRU by storing both moment kinds in a single
3992/// [`CachedCellMoments`] entry keyed on the cell fingerprint — derivative
3993/// insertions preserve any pre-existing value state and vice versa, so the
3994/// two callers never evict each other's work.
3995pub fn evaluate_cell_derivative_moments_cached(
3996    cell: DenestedCubicCell,
3997    max_degree: usize,
3998    cache: &CellMomentLruCache,
3999    stats: Option<&CellMomentCacheStats>,
4000) -> Result<CellDerivativeMomentState, String> {
4001    // Affine cells bypass the LRU — see `evaluate_cell_moments_cached` for
4002    // why the sharded-mutex memo is pure overhead on the closed-form affine
4003    // path at large n (#979).
4004    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4005        if let Some(stats) = stats {
4006            stats.misses.fetch_add(1, Ordering::Relaxed);
4007        }
4008        return evaluate_cell_derivative_moments_uncached(cell, max_degree);
4009    }
4010    let key = CellFingerprint::new(cell);
4011    let existing_value = match cache.get(&key) {
4012        Some(cached) => {
4013            if let Some(state) = cached.derivative_state_for_degree(max_degree) {
4014                if let Some(stats) = stats {
4015                    stats.hits.fetch_add(1, Ordering::Relaxed);
4016                }
4017                return Ok(state);
4018            }
4019            // `cached.state` is `Option<Arc<_>>`; `.clone()` here is the cheap
4020            // refcount bump the audit-39 fix targets, not a full moment-vector
4021            // deep clone.
4022            cached.state.clone()
4023        }
4024        None => None,
4025    };
4026    if let Some(stats) = stats {
4027        stats.misses.fetch_add(1, Ordering::Relaxed);
4028    }
4029    let state = evaluate_cell_derivative_moments_uncached(cell, max_degree)?;
4030    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4031    // through `Arc::clone`, and return the underlying value by unwrapping the
4032    // unique-reference (caller-side) `Arc`. This replaces the prior
4033    // `state.clone()` deep copy at the insert site.
4034    let shared = Arc::new(state);
4035    let mut entry = CachedCellMoments::new_derivative(Arc::clone(&shared));
4036    if let Some(value) = existing_value {
4037        entry = entry.with_value(value);
4038    }
4039    cache.insert(key, entry);
4040    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4041}
4042
4043/// Scratch-backed variant of [`evaluate_cell_moments`].
4044///
4045/// Reuses the supplied [`CellMomentScratch`] for the returned moments slice,
4046/// so repeated calls with the same scratch (and a sufficient initial capacity)
4047/// avoid per-call `Vec` allocations on the hot inner-PIRLS row-intercept
4048/// solver path. Internal transport allocations are unchanged.
4049pub fn evaluate_cell_moments_with_scratch<'a>(
4050    cell: DenestedCubicCell,
4051    max_degree: usize,
4052    scratch: &'a mut CellMomentScratch,
4053) -> Result<CellMomentStateRef<'a>, String> {
4054    let state = evaluate_cell_moments(cell, max_degree)?;
4055    let out = scratch.prepare_moments(max_degree + 1);
4056    out.copy_from_slice(&state.moments);
4057    Ok(CellMomentStateRef {
4058        branch: state.branch,
4059        value: state.value,
4060        moments: out,
4061    })
4062}
4063
4064#[cfg(test)]
4065mod tests {
4066    use super::*;
4067    use gam_math::probability::normal_pdf;
4068
4069    /// Pointwise value of the cell THIRD-derivative integrand
4070    /// `(d3/dr ds dt) exp(-q(z))/2pi` at a single `z`, evaluated from the SAME
4071    /// `(r, s, t, rs, rt, st, rst)` coefficient polynomials the moment reduction
4072    /// `cell_third_derivative_from_moments` integrates. Unlike the
4073    /// second-derivative integrand this one does NOT cancel across an interior
4074    /// C2-link knot crossing (the `c_rst` third coefficient jumps), so it backs
4075    /// the C2-telescoping regression below. Test-only; no production consumer.
4076    #[inline]
4077    fn cell_third_derivative_boundary_integrand(
4078        cell: DenestedCubicCell,
4079        first_coefficients_r: &[f64],
4080        first_coefficients_s: &[f64],
4081        first_coefficients_t: &[f64],
4082        second_coefficients_rs: &[f64],
4083        second_coefficients_rt: &[f64],
4084        second_coefficients_st: &[f64],
4085        third_coefficients_rst: &[f64],
4086        z: f64,
4087    ) -> f64 {
4088        let eta = cell.eta(z);
4089        let c_r = poly_eval_at(first_coefficients_r, z);
4090        let c_s = poly_eval_at(first_coefficients_s, z);
4091        let c_t = poly_eval_at(first_coefficients_t, z);
4092        let c_rs = poly_eval_at(second_coefficients_rs, z);
4093        let c_rt = poly_eval_at(second_coefficients_rt, z);
4094        let c_st = poly_eval_at(second_coefficients_st, z);
4095        let c_rst = poly_eval_at(third_coefficients_rst, z);
4096        let amplitude = c_rst - eta * (c_rs * c_t + c_rt * c_s + c_st * c_r)
4097            + (eta * eta - 1.0) * c_r * c_s * c_t;
4098        amplitude * (-cell.q(z)).exp() * INV_TWO_PI
4099    }
4100
4101    #[inline]
4102    pub(super) fn polynomial_value(coefficients: &[f64], z: f64) -> f64 {
4103        coefficients
4104            .iter()
4105            .rev()
4106            .fold(0.0, |acc, &coeff| acc * z + coeff)
4107    }
4108
4109    fn reset_cell_moment_test_reallocs() {
4110        super::CELL_MOMENT_REALLOCS.store(0, std::sync::atomic::Ordering::Relaxed);
4111    }
4112
4113    fn cell_moment_test_reallocs() -> usize {
4114        super::CELL_MOMENT_REALLOCS.load(std::sync::atomic::Ordering::Relaxed)
4115    }
4116
4117    fn assert_close_rel(label: &str, actual: f64, expected: f64, tol: f64) {
4118        let denom = expected.abs().max(1.0);
4119        let rel = (actual - expected).abs() / denom;
4120        assert!(
4121            rel <= tol,
4122            "{label}: actual={actual:.17e} expected={expected:.17e} rel={rel:.3e} tol={tol:.3e}"
4123        );
4124    }
4125
4126    // The link-basis cell coefficient `transformed_link_cubic(span, a, b)` is, in
4127    // each of its four output components, a polynomial of TOTAL degree exactly 3 in
4128    // (a, b):
4129    //   d0 = c0 + c1·s + c2·s² + c3·s³            (s = a − left; deg 3 in a)
4130    //   d1 = b·(c1 + 2c2·s + 3c3·s²)              (a²·b → total deg 3)
4131    //   d2 = b²·(c2 + 3c3·s)                       (a·b² → total deg 3)
4132    //   d3 = c3·b³                                 (b³  → total deg 3)
4133    // Therefore EVERY 4th-order total (a,b)-partial (∂⁴/∂aⁱ∂b^{4−i}) is identically
4134    // zero, while the 3rd-order partials (∂³/∂aⁱ∂b^{3−i}) are the highest nonzero
4135    // ones. This is the exact algebraic fact the bidirectional flex jet relies on:
4136    // a "second mixed derivative of a third-a-partial" slot, etc., demands a 4th
4137    // total (a,b)-partial and must be hard-zero — substituting a (nonzero) 3rd
4138    // partial there is a bug. This test certifies BOTH facts by central FD so the
4139    // hard-coded `0.0` fixes are provably correct and provably necessary.
4140    #[test]
4141    fn link_basis_cell_fourth_ab_partials_vanish_third_are_nonzero() {
4142        let span = LocalSpanCubic {
4143            left: -0.4,
4144            right: 1.6,
4145            c0: 0.37,
4146            c1: -0.81,
4147            c2: 0.53,
4148            c3: -0.29,
4149        };
4150        let a0 = 0.23_f64;
4151        let b0 = 0.61_f64;
4152        let h = 1e-2_f64;
4153
4154        // Generic central-difference stencils per derivative order.
4155        let stencil = |order: usize| -> &'static [(i64, f64)] {
4156            match order {
4157                0 => &[(0, 1.0)],
4158                1 => &[(-1, -0.5), (1, 0.5)],
4159                2 => &[(-1, 1.0), (0, -2.0), (1, 1.0)],
4160                3 => &[(-2, -0.5), (-1, 1.0), (1, -1.0), (2, 0.5)],
4161                4 => &[(-2, 1.0), (-1, -4.0), (0, 6.0), (1, -4.0), (2, 1.0)],
4162                _ => &[(0, 1.0)],
4163            }
4164        };
4165        // FD of component `k` of the cell coefficient: ∂^{na+nb}/∂a^{na}∂b^{nb}.
4166        let fd = |k: usize, na: usize, nb: usize| -> f64 {
4167            let mut acc = 0.0;
4168            for &(ia, wa) in stencil(na) {
4169                for &(ib, wb) in stencil(nb) {
4170                    let a = a0 + (ia as f64) * h;
4171                    let b = b0 + (ib as f64) * h;
4172                    acc += wa * wb * link_basis_cell_coefficients(span, a, b)[k];
4173                }
4174            }
4175            acc / h.powi((na + nb) as i32)
4176        };
4177
4178        let (p3_aaa, p3_aab, p3_abb, p3_bbb) = link_basis_cell_third_partials(span);
4179
4180        // (1) The analytic 3rd partials match FD (within FD truncation) — and at
4181        // least one is appreciably nonzero, so these are real signal that a wrong
4182        // slot would inject.
4183        let mut max_third = 0.0_f64;
4184        for k in 0..4 {
4185            for (label, (na, nb), analytic) in [
4186                ("aaa", (3usize, 0usize), p3_aaa[k]),
4187                ("aab", (2, 1), p3_aab[k]),
4188                ("abb", (1, 2), p3_abb[k]),
4189                ("bbb", (0, 3), p3_bbb[k]),
4190            ] {
4191                let got = fd(k, na, nb);
4192                assert!(
4193                    (got - analytic).abs() <= 1e-4 + 1e-3 * analytic.abs(),
4194                    "3rd partial {label}[{k}] analytic {analytic:+.6e} vs FD {got:+.6e}"
4195                );
4196                max_third = max_third.max(analytic.abs());
4197            }
4198        }
4199        assert!(
4200            max_third > 1e-1,
4201            "expected an appreciable nonzero 3rd (a,b)-partial; max |analytic| = {max_third:.3e}"
4202        );
4203
4204        // (2) EVERY 4th-order total (a,b)-partial vanishes (degree-3 polynomial),
4205        // certifying that the hard-coded `0.0` in the bidirectional d12 slots is the
4206        // mathematically required value, not an approximation.
4207        for k in 0..4 {
4208            for (na, nb) in [(4usize, 0usize), (3, 1), (2, 2), (1, 3), (0, 4)] {
4209                let got = fd(k, na, nb);
4210                assert!(
4211                    got.abs() <= 1e-2,
4212                    "4th (a,b)-partial ∂^{na}_a∂^{nb}_b of cell coeff[{k}] must vanish, FD = {got:+.6e}"
4213                );
4214            }
4215        }
4216    }
4217
4218    #[test]
4219    fn non_affine_cell_state_grid_matches_public_cell_moments_reference() {
4220        let cells = [
4221            DenestedCubicCell {
4222                left: -1.25,
4223                right: -0.2,
4224                c0: -0.35,
4225                c1: 0.85,
4226                c2: 0.04,
4227                c3: -0.015,
4228            },
4229            DenestedCubicCell {
4230                left: -0.2,
4231                right: 0.55,
4232                c0: 0.12,
4233                c1: -0.65,
4234                c2: -0.025,
4235                c3: 0.02,
4236            },
4237            DenestedCubicCell {
4238                left: 0.55,
4239                right: 1.6,
4240                c0: 0.42,
4241                c1: 0.35,
4242                c2: 0.018,
4243                c3: 0.012,
4244            },
4245        ];
4246        for cell in cells {
4247            let branch = branch_cell(cell).expect("branch");
4248            assert_ne!(branch, ExactCellBranch::Affine);
4249            for max_degree in [0usize, 2, 4, 9, 16] {
4250                let direct = evaluate_non_affine_cell_state(cell, branch, max_degree)
4251                    .expect("direct non-affine transport");
4252                let public = evaluate_cell_moments(cell, max_degree).expect("public evaluator");
4253                assert_eq!(direct.branch, public.branch);
4254                assert_eq!(direct.moments.len(), public.moments.len());
4255                let value_scale = direct.value.abs().max(public.value.abs()).max(1.0);
4256                assert!(
4257                    (direct.value - public.value).abs() <= 1e-10 * value_scale,
4258                    "value mismatch for {cell:?} degree {max_degree}: direct={} public={}",
4259                    direct.value,
4260                    public.value
4261                );
4262                for (degree, (lhs, rhs)) in
4263                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4264                {
4265                    let scale = lhs.abs().max(rhs.abs()).max(1.0);
4266                    assert!(
4267                        (lhs - rhs).abs() <= 1e-10 * scale,
4268                        "moment {degree} mismatch for {cell:?} degree {max_degree}: {lhs} vs {rhs}"
4269                    );
4270                }
4271            }
4272        }
4273    }
4274
4275    #[test]
4276    fn affine_tail_cell_memo_matches_uncached_grid_and_records_hits() {
4277        // Use a dedicated local cache so the test's hit/miss/entry counters
4278        // are not perturbed by concurrent tests that drive the shared
4279        // global memo through `evaluate_cell_moments`. Asserting on the
4280        // global counters made this test race-flaky when the suite ran in
4281        // parallel.
4282        let cache = TailCellMomentCache::new();
4283        let c0s = [-2.0, -0.25, 0.0, 1.5];
4284        let c1s = [-1.2, -0.05, 0.0, 0.8];
4285        let endpoints = [-4.0, -1.0, 0.0, 2.5, 6.0];
4286        let degrees = [0_usize, 4, 9, 16, 24];
4287
4288        for &c0 in &c0s {
4289            for &c1 in &c1s {
4290                for &endpoint in &endpoints {
4291                    for &max_degree in &degrees {
4292                        for &(left, right) in
4293                            &[(f64::NEG_INFINITY, endpoint), (endpoint, f64::INFINITY)]
4294                        {
4295                            let cell = DenestedCubicCell {
4296                                left,
4297                                right,
4298                                c0,
4299                                c1,
4300                                c2: 0.0,
4301                                c3: 0.0,
4302                            };
4303                            let expected = evaluate_cell_moments_uncached(cell, max_degree)
4304                                .expect("uncached affine tail moments");
4305                            let actual = cache
4306                                .evaluate(cell, max_degree)
4307                                .expect("cached affine tail moments miss");
4308                            let repeat = cache
4309                                .evaluate(cell, max_degree)
4310                                .expect("cached affine tail moments hit");
4311                            assert_eq!(actual.branch, expected.branch);
4312                            assert_eq!(repeat.branch, expected.branch);
4313                            assert_close_rel(
4314                                "tail value miss",
4315                                actual.value,
4316                                expected.value,
4317                                1e-14,
4318                            );
4319                            assert_close_rel("tail value hit", repeat.value, expected.value, 1e-14);
4320                            assert_eq!(actual.moments.len(), expected.moments.len());
4321                            assert_eq!(repeat.moments.len(), expected.moments.len());
4322                            for (idx, ((a, r), e)) in actual
4323                                .moments
4324                                .iter()
4325                                .zip(repeat.moments.iter())
4326                                .zip(expected.moments.iter())
4327                                .enumerate()
4328                            {
4329                                assert_close_rel(
4330                                    &format!("tail moment miss[{idx}]"),
4331                                    *a,
4332                                    *e,
4333                                    1e-14,
4334                                );
4335                                assert_close_rel(&format!("tail moment hit[{idx}]"), *r, *e, 1e-14);
4336                            }
4337                        }
4338                    }
4339                }
4340            }
4341        }
4342
4343        let stats = cache.stats();
4344        assert_eq!(stats.misses, stats.entries);
4345        assert!(
4346            stats.hits >= stats.misses,
4347            "expected repeat hits: {stats:?}"
4348        );
4349        assert!(
4350            stats.hit_rate() >= 0.5,
4351            "unexpected low hit rate: {stats:?}"
4352        );
4353    }
4354
4355    fn reference_bivariate_normal_cdf_20(h: f64, k: f64, rho: f64) -> f64 {
4356        if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
4357            return 0.0;
4358        }
4359        if h == f64::INFINITY {
4360            return normal_cdf(k);
4361        }
4362        if k == f64::INFINITY {
4363            return normal_cdf(h);
4364        }
4365        let rho_clamped = rho.clamp(-1.0, 1.0);
4366        if rho_clamped >= 1.0 - 1e-12 {
4367            return normal_cdf(h.min(k));
4368        }
4369        if rho_clamped <= -1.0 + 1e-12 {
4370            return (normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0);
4371        }
4372
4373        let hs = 0.5 * (h * h + k * k);
4374        let asr = rho_clamped.asin();
4375        let mut sum = 0.0;
4376        for (&node, &weight) in GL20_NODES.iter().zip(GL20_WEIGHTS.iter()) {
4377            let sn = (0.5 * asr * (node + 1.0)).sin();
4378            let one_minus = 1.0 - sn * sn;
4379            let expo = ((sn * h * k) - hs) / one_minus;
4380            sum += weight * expo.exp();
4381        }
4382        (normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0)
4383    }
4384
4385    #[test]
4386    fn non_affine_cell_state_reference_grid_matches_public_moments() {
4387        let c0s = [-0.4, 0.0, 0.35];
4388        let c1s = [-0.8, 0.25, 1.1];
4389        let c2s = [-0.12, 0.08];
4390        let c3s = [-0.04, 0.03];
4391        let intervals = [(-1.25, -0.2), (-0.5, 0.75), (0.1, 1.4)];
4392        let degrees = [3usize, 6, 9, 12];
4393
4394        for &c0 in &c0s {
4395            for &c1 in &c1s {
4396                for &c2 in &c2s {
4397                    for &c3 in &c3s {
4398                        for &(left, right) in &intervals {
4399                            let cell = DenestedCubicCell {
4400                                left,
4401                                right,
4402                                c0,
4403                                c1,
4404                                c2,
4405                                c3,
4406                            };
4407                            let branch = branch_cell(cell).expect("branch");
4408                            assert_ne!(branch, ExactCellBranch::Affine);
4409                            for &degree in &degrees {
4410                                let direct = evaluate_non_affine_cell_state(cell, branch, degree)
4411                                    .expect("direct non-affine state");
4412                                let public = evaluate_cell_moments(cell, degree)
4413                                    .expect("public non-affine state");
4414                                assert_eq!(direct.branch, public.branch);
4415                                let value_scale =
4416                                    direct.value.abs().max(public.value.abs()).max(1.0);
4417                                assert!(
4418                                    (direct.value - public.value).abs() / value_scale <= 1.0e-15,
4419                                    "value mismatch for {cell:?}, degree {degree}: direct={:.17e}, public={:.17e}",
4420                                    direct.value,
4421                                    public.value
4422                                );
4423                                assert_eq!(direct.moments.len(), public.moments.len());
4424                                for (idx, (&a, &b)) in
4425                                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4426                                {
4427                                    let scale = a.abs().max(b.abs()).max(1.0);
4428                                    assert!(
4429                                        (a - b).abs() / scale <= 1.0e-15,
4430                                        "moment {idx} mismatch for {cell:?}, degree {degree}: direct={a:.17e}, public={b:.17e}"
4431                                    );
4432                                }
4433                            }
4434                        }
4435                    }
4436                }
4437            }
4438        }
4439    }
4440
4441    #[test]
4442    fn bivariate_normal_cdf_matches_reference_grid_to_1e_minus_10() {
4443        let hs = [-8.0, -5.0, -3.0, -1.5, -0.5, 0.0, 0.25, 1.0, 2.5, 5.0, 8.0];
4444        let ks = [-8.0, -4.0, -2.0, -0.75, 0.0, 0.4, 1.25, 3.0, 6.0, 8.0];
4445        let rhos = [
4446            -0.999_999_999_999,
4447            -0.999,
4448            -0.95,
4449            -0.7,
4450            -0.3,
4451            -1.0e-12,
4452            0.0,
4453            1.0e-12,
4454            0.3,
4455            0.7,
4456            0.95,
4457            0.999,
4458            0.999_999_999_999,
4459        ];
4460        for &h in &hs {
4461            for &k in &ks {
4462                for &rho in &rhos {
4463                    let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4464                    let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4465                    let scale = expected.abs().max(1.0e-300);
4466                    let rel = (actual - expected).abs() / scale;
4467                    assert!(
4468                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4469                        "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4470                    );
4471                }
4472            }
4473        }
4474    }
4475
4476    #[test]
4477    fn bivariate_normal_cdf_matches_reference_lcg_property_samples() {
4478        let mut seed = 0x5eed_cafe_f00d_u64;
4479        let mut next_unit = || {
4480            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
4481            ((seed >> 11) as f64) * (1.0 / ((1_u64 << 53) as f64))
4482        };
4483        for _ in 0..4096 {
4484            let h = -8.0 + 16.0 * next_unit();
4485            let k = -8.0 + 16.0 * next_unit();
4486            let rho = -0.999 + 1.998 * next_unit();
4487            let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4488            let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4489            let scale = expected.abs().max(1.0e-300);
4490            let rel = (actual - expected).abs() / scale;
4491            assert!(
4492                rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4493                "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4494            );
4495        }
4496    }
4497
4498    #[test]
4499    fn affine_bvn_interval_primitive_matches_two_cdf_difference() {
4500        let hs = [-6.0, -2.0, -0.25, 0.0, 0.8, 3.0, 6.0];
4501        let bounds = [
4502            (-5.0, -2.0),
4503            (-3.0, -0.1),
4504            (-1.0, 0.0),
4505            (-0.25, 0.75),
4506            (0.2, 3.5),
4507            (2.0, 7.0),
4508        ];
4509        let rhos = [-0.98, -0.8, -0.25, 0.0, 0.25, 0.8, 0.98];
4510        for &h in &hs {
4511            for &(left, right) in &bounds {
4512                for &rho in &rhos {
4513                    let actual =
4514                        bivariate_normal_cdf_interval(h, left, right, rho).expect("interval");
4515                    let expected = (reference_bivariate_normal_cdf_20(h, right, rho)
4516                        - reference_bivariate_normal_cdf_20(h, left, rho))
4517                    .clamp(0.0, 1.0);
4518                    let scale = expected.abs().max(1.0e-300);
4519                    let rel = (actual - expected).abs() / scale;
4520                    assert!(
4521                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-12,
4522                        "h={h} left={left} right={right} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4523                    );
4524                }
4525            }
4526        }
4527    }
4528
4529    fn simpson_integral<F>(left: f64, right: f64, steps: usize, f: F) -> f64
4530    where
4531        F: Fn(f64) -> f64,
4532    {
4533        let n = if steps.is_multiple_of(2) {
4534            steps
4535        } else {
4536            steps + 1
4537        };
4538        let h = (right - left) / n as f64;
4539        let mut acc = f(left) + f(right);
4540        for k in 1..n {
4541            let x = left + h * k as f64;
4542            let w = if k % 2 == 0 { 2.0 } else { 4.0 };
4543            acc += w * f(x);
4544        }
4545        acc * h / 3.0
4546    }
4547
4548    #[test]
4549    fn global_transform_preserves_local_span_polynomial() {
4550        let span = LocalSpanCubic {
4551            left: -1.2,
4552            right: 0.8,
4553            c0: 0.3,
4554            c1: -0.25,
4555            c2: 0.11,
4556            c3: -0.04,
4557        };
4558        let (g0, g1, g2, g3) = global_cubic_from_local(span);
4559        for &x in &[-1.2, -0.7, -0.1, 0.4, 0.8] {
4560            let local = span.evaluate(x);
4561            let global = g0 + g1 * x + g2 * x * x + g3 * x * x * x;
4562            assert!((local - global).abs() < 1e-12);
4563        }
4564    }
4565
4566    #[test]
4567    fn bivariate_normal_cdf_independent_factorizes() {
4568        let h = -0.35;
4569        let k = 0.8;
4570        let out = bivariate_normal_cdf(h, k, 0.0).expect("bvn");
4571        let target = normal_cdf(h) * normal_cdf(k);
4572        assert!((out - target).abs() < 1e-12);
4573    }
4574
4575    #[test]
4576    fn evaluate_affine_cell_state_matches_numeric_integrals() {
4577        let cell = DenestedCubicCell {
4578            left: -0.9,
4579            right: 0.8,
4580            c0: 0.15,
4581            c1: -0.35,
4582            c2: 0.0,
4583            c3: 0.0,
4584        };
4585        let state = evaluate_affine_cell_state(cell, 6).expect("affine cell");
4586        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
4587            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
4588        });
4589        assert_eq!(state.branch, ExactCellBranch::Affine);
4590        assert!((state.value - value_numeric).abs() < 1e-9);
4591        for degree in 0..=6 {
4592            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
4593                z.powi(degree as i32) * (-cell.q(z)).exp()
4594            });
4595            assert!((state.moments[degree] - target).abs() < 1e-9);
4596        }
4597    }
4598
4599    #[test]
4600    fn affine_cell_value_matches_zero_moment_derivative() {
4601        let cell = DenestedCubicCell {
4602            left: -1.1,
4603            right: 0.7,
4604            c0: 0.23,
4605            c1: -0.41,
4606            c2: 0.0,
4607            c3: 0.0,
4608        };
4609        let h = 1e-6;
4610        let plus = evaluate_affine_cell_state(
4611            DenestedCubicCell {
4612                c0: cell.c0 + h,
4613                ..cell
4614            },
4615            0,
4616        )
4617        .expect("affine plus");
4618        let minus = evaluate_affine_cell_state(
4619            DenestedCubicCell {
4620                c0: cell.c0 - h,
4621                ..cell
4622            },
4623            0,
4624        )
4625        .expect("affine minus");
4626        let center = evaluate_affine_cell_state(cell, 0).expect("affine center");
4627        let d_value = (plus.value - minus.value) / (2.0 * h);
4628        let target = INV_TWO_PI * center.moments[0];
4629        assert!((d_value - target).abs() < 1e-8);
4630    }
4631
4632    #[test]
4633    fn coefficient_partials_match_exact_span_derivatives() {
4634        let score_span = LocalSpanCubic {
4635            left: -0.75,
4636            right: 0.25,
4637            c0: 0.08,
4638            c1: -0.03,
4639            c2: 0.02,
4640            c3: -0.01,
4641        };
4642        let link_span = LocalSpanCubic {
4643            left: -0.6,
4644            right: 0.9,
4645            c0: -0.05,
4646            c1: 0.04,
4647            c2: -0.02,
4648            c3: 0.015,
4649        };
4650        let a = 0.3;
4651        let b = -0.7;
4652        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
4653        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4654            let u = a + b * z;
4655            let eta_a = 1.0 + link_span.first_derivative(u);
4656            let eta_b = z + score_span.evaluate(z) + z * link_span.first_derivative(u);
4657            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4658            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4659        }
4660    }
4661
4662    #[test]
4663    fn second_coefficient_partials_match_exact_span_derivatives() {
4664        let score_span = LocalSpanCubic {
4665            left: -0.75,
4666            right: 0.25,
4667            c0: 0.08,
4668            c1: -0.03,
4669            c2: 0.02,
4670            c3: -0.01,
4671        };
4672        let link_span = LocalSpanCubic {
4673            left: -0.6,
4674            right: 0.9,
4675            c0: -0.05,
4676            c1: 0.04,
4677            c2: -0.02,
4678            c3: 0.015,
4679        };
4680        let a = 0.3;
4681        let b = -0.7;
4682        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
4683        let dc_daa = second_partials.0;
4684        let dc_dab = second_partials.1;
4685        let dc_dbb = second_partials.2;
4686        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4687            let u = a + b * z;
4688            let eta_aa = link_span.second_derivative(u);
4689            let eta_ab = z * link_span.second_derivative(u);
4690            let eta_bb = z * z * link_span.second_derivative(u);
4691            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4692            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4693            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4694        }
4695    }
4696
4697    #[test]
4698    fn higher_derivative_moment_helpers_reject_empty_first_coefficients() {
4699        let cell = DenestedCubicCell {
4700            left: -1.0,
4701            right: 1.0,
4702            c0: 0.0,
4703            c1: 1.0,
4704            c2: 0.0,
4705            c3: 0.0,
4706        };
4707        let moments = [1.0; 16];
4708
4709        let third_err = cell_third_derivative_from_moments(
4710            cell,
4711            &[],
4712            &[1.0],
4713            &[1.0],
4714            &[],
4715            &[],
4716            &[],
4717            &[],
4718            &moments,
4719        )
4720        .expect_err("empty first coefficients should be rejected");
4721        assert!(third_err.contains("r first-derivative coefficients must be non-empty"));
4722
4723        let fourth_err = cell_fourth_derivative_from_moments(
4724            cell,
4725            &[1.0],
4726            &[],
4727            &[1.0],
4728            &[1.0],
4729            &[],
4730            &[],
4731            &[],
4732            &[],
4733            &[],
4734            &[],
4735            &[],
4736            &[],
4737            &[],
4738            &[],
4739            &[],
4740            &moments,
4741        )
4742        .expect_err("empty first coefficients should be rejected");
4743        assert!(fourth_err.contains("s first-derivative coefficients must be non-empty"));
4744    }
4745
4746    #[test]
4747    fn fourth_derivative_rejects_overlong_scratch_convolutions() {
4748        let cell = DenestedCubicCell {
4749            left: -1.0,
4750            right: 1.0,
4751            c0: 0.0,
4752            c1: 1.0,
4753            c2: 0.0,
4754            c3: 0.0,
4755        };
4756        let long_first = [1.0; 10];
4757        let zero = [0.0; 1];
4758        let moments = [1.0; 64];
4759
4760        let err = cell_fourth_derivative_from_moments(
4761            cell,
4762            &long_first,
4763            &long_first,
4764            &long_first,
4765            &long_first,
4766            &zero,
4767            &zero,
4768            &zero,
4769            &zero,
4770            &zero,
4771            &zero,
4772            &zero,
4773            &zero,
4774            &zero,
4775            &zero,
4776            &zero,
4777            &moments,
4778        )
4779        .expect_err("oversized convolution should be rejected before writing scratch");
4780        assert!(err.contains("fourth derivative polynomial convolution scratch too small"));
4781    }
4782
4783    #[test]
4784    fn score_and_link_basis_cell_coefficients_match_direct_construction() {
4785        let score_basis_span = LocalSpanCubic {
4786            left: -0.7,
4787            right: 0.4,
4788            c0: 0.2,
4789            c1: -0.04,
4790            c2: 0.03,
4791            c3: -0.01,
4792        };
4793        let link_basis_span = LocalSpanCubic {
4794            left: -0.5,
4795            right: 1.1,
4796            c0: -0.03,
4797            c1: 0.05,
4798            c2: -0.02,
4799            c3: 0.01,
4800        };
4801        let a = 0.25;
4802        let b = -0.8;
4803        let score_coeffs = score_basis_cell_coefficients(score_basis_span, b);
4804        let link_coeffs = link_basis_cell_coefficients(link_basis_span, a, b);
4805        for &z in &[-0.7, -0.1, 0.2, 0.4] {
4806            let score_poly = polynomial_value(&score_coeffs, z);
4807            let link_poly = polynomial_value(&link_coeffs, z);
4808            assert!((score_poly - b * score_basis_span.evaluate(z)).abs() < 1e-12);
4809            assert!((link_poly - link_basis_span.evaluate(a + b * z)).abs() < 1e-12);
4810        }
4811    }
4812
4813    #[test]
4814    fn link_basis_partials_match_exact_span_derivatives() {
4815        let link_basis_span = LocalSpanCubic {
4816            left: -0.5,
4817            right: 1.1,
4818            c0: -0.03,
4819            c1: 0.05,
4820            c2: -0.02,
4821            c3: 0.01,
4822        };
4823        let a = 0.25;
4824        let b = -0.8;
4825        let (dc_da, dc_db) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
4826        let (dc_daa, dc_dab, dc_dbb) = link_basis_cell_second_partials(link_basis_span, a, b);
4827        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4828            let u = a + b * z;
4829            let eta_a = link_basis_span.first_derivative(u);
4830            let eta_b = z * link_basis_span.first_derivative(u);
4831            let eta_aa = link_basis_span.second_derivative(u);
4832            let eta_ab = z * link_basis_span.second_derivative(u);
4833            let eta_bb = z * z * link_basis_span.second_derivative(u);
4834            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4835            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4836            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4837            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4838            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4839        }
4840    }
4841
4842    #[test]
4843    fn denested_third_partials_match_exact_span_derivatives() {
4844        let link_span = LocalSpanCubic {
4845            left: -0.6,
4846            right: 0.9,
4847            c0: -0.05,
4848            c1: 0.04,
4849            c2: -0.02,
4850            c3: 0.015,
4851        };
4852        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
4853        let link_third = 6.0 * link_span.c3;
4854        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4855            let eta_aaa = link_third;
4856            let eta_aab = z * link_third;
4857            let eta_abb = z * z * link_third;
4858            let eta_bbb = z * z * z * link_third;
4859            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4860            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4861            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4862            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4863        }
4864    }
4865
4866    #[test]
4867    fn link_basis_third_partials_match_exact_span_derivatives() {
4868        let link_basis_span = LocalSpanCubic {
4869            left: -0.5,
4870            right: 1.1,
4871            c0: -0.03,
4872            c1: 0.05,
4873            c2: -0.02,
4874            c3: 0.01,
4875        };
4876        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = link_basis_cell_third_partials(link_basis_span);
4877        let link_third = 6.0 * link_basis_span.c3;
4878        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4879            let eta_aaa = link_third;
4880            let eta_aab = z * link_third;
4881            let eta_abb = z * z * link_third;
4882            let eta_bbb = z * z * z * link_third;
4883            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4884            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4885            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4886            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4887        }
4888    }
4889
4890    #[test]
4891    fn branch_selection_uses_normalized_non_affine_coefficients() {
4892        let affine = DenestedCubicCell {
4893            left: -1.0,
4894            right: 1.0,
4895            c0: 0.1,
4896            c1: -0.4,
4897            c2: 1e-13,
4898            c3: -1e-13,
4899        };
4900        let quartic = DenestedCubicCell {
4901            c2: 2e-4,
4902            c3: 1e-13,
4903            ..affine
4904        };
4905        let sextic = DenestedCubicCell {
4906            c2: 2e-4,
4907            c3: 5e-3,
4908            ..affine
4909        };
4910        assert_eq!(branch_cell(affine).unwrap(), ExactCellBranch::Affine);
4911        assert_eq!(branch_cell(quartic).unwrap(), ExactCellBranch::Quartic);
4912        assert_eq!(branch_cell(sextic).unwrap(), ExactCellBranch::Sextic);
4913    }
4914
4915    #[test]
4916    fn affine_anchor_moments_match_whole_line_closed_forms() {
4917        let out = affine_anchor_moment_vector(0.0, 0.0, f64::NEG_INFINITY, f64::INFINITY, 4);
4918        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
4919        assert!((out[0] - sqrt_2pi).abs() < 1e-12);
4920        assert!(out[1].abs() < 1e-12);
4921        assert!((out[2] - sqrt_2pi).abs() < 1e-12);
4922    }
4923
4924    #[test]
4925    fn affine_anchor_moments_match_shifted_gaussian_whole_line() {
4926        let alpha = 0.7;
4927        let beta = -0.4;
4928        let out = affine_anchor_moment_vector(alpha, beta, f64::NEG_INFINITY, f64::INFINITY, 4);
4929        let s = (1.0 + beta * beta).sqrt();
4930        let mu = -alpha * beta / (1.0 + beta * beta);
4931        let scale = (-alpha * alpha / (2.0 * s * s)).exp() / s;
4932        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
4933        assert!((out[0] - scale * sqrt_2pi).abs() < 1e-12);
4934        assert!((out[1] - scale * sqrt_2pi * mu).abs() < 1e-12);
4935        assert!((out[2] - scale * sqrt_2pi * (mu * mu + 1.0 / (s * s))).abs() < 1e-10);
4936    }
4937
4938    #[test]
4939    fn quartic_recurrence_reduces_higher_moments() {
4940        let cell = DenestedCubicCell {
4941            left: -1.0,
4942            right: 0.9,
4943            c0: 0.2,
4944            c1: -0.3,
4945            c2: 0.18,
4946            c3: 0.0,
4947        };
4948        let exact = |k: usize| {
4949            simpson_integral(cell.left, cell.right, 2000, |z| {
4950                z.powi(k as i32) * (-cell.q(z)).exp()
4951            })
4952        };
4953        let reduced = reduce_quartic_moments(cell, [exact(0), exact(1), exact(2)], 6)
4954            .expect("quartic reduction");
4955        for k in 0..=6 {
4956            let target = exact(k);
4957            assert!(
4958                (reduced[k] - target).abs() < 1e-7,
4959                "quartic reduced moment M{k} mismatch: {} vs {}",
4960                reduced[k],
4961                target
4962            );
4963        }
4964    }
4965
4966    #[test]
4967    fn sextic_recurrence_reduces_higher_moments() {
4968        let cell = DenestedCubicCell {
4969            left: -0.8,
4970            right: 0.7,
4971            c0: -0.1,
4972            c1: 0.25,
4973            c2: -0.14,
4974            c3: 0.22,
4975        };
4976        let exact = |k: usize| {
4977            simpson_integral(cell.left, cell.right, 3000, |z| {
4978                z.powi(k as i32) * (-cell.q(z)).exp()
4979            })
4980        };
4981        let reduced =
4982            reduce_sextic_moments(cell, [exact(0), exact(1), exact(2), exact(3), exact(4)], 9)
4983                .expect("sextic reduction");
4984        for k in 0..=9 {
4985            let target = exact(k);
4986            assert!(
4987                (reduced[k] - target).abs() < 1e-7,
4988                "sextic reduced moment M{k} mismatch: {} vs {}",
4989                reduced[k],
4990                target
4991            );
4992        }
4993    }
4994
4995    #[test]
4996    fn degenerate_sextic_branch_preserves_quadratic_coefficient() {
4997        let cell = DenestedCubicCell {
4998            left: -1.0,
4999            right: 1.0,
5000            c0: 0.0,
5001            c1: 0.0,
5002            c2: 0.1,
5003            c3: 2.0e-10,
5004        };
5005        assert_eq!(branch_cell(cell).unwrap(), ExactCellBranch::Sextic);
5006
5007        let state = evaluate_cell_moments(cell, 9).expect("degenerate sextic cell");
5008        let quartic_cell = DenestedCubicCell { c3: 0.0, ..cell };
5009        let quartic = evaluate_cell_moments(quartic_cell, 9).expect("quartic cell");
5010        let affine = evaluate_affine_cell_state(
5011            DenestedCubicCell {
5012                c2: 0.0,
5013                c3: 0.0,
5014                ..cell
5015            },
5016            9,
5017        )
5018        .expect("affine cell");
5019
5020        assert_eq!(state.branch, ExactCellBranch::Quartic);
5021        for k in 0..=9 {
5022            assert!(
5023                (state.moments[k] - quartic.moments[k]).abs() < 1e-12,
5024                "lowered moment M{k} should match the quartic cell: {} vs {}",
5025                state.moments[k],
5026                quartic.moments[k]
5027            );
5028        }
5029        assert!(
5030            (state.moments[0] - affine.moments[0]).abs() > 1e-4,
5031            "degenerate sextic handling must not drop the nonzero c2 term"
5032        );
5033    }
5034
5035    #[test]
5036    fn moment_reduced_first_and_second_derivatives_match_numeric_integrals() {
5037        let cell = DenestedCubicCell {
5038            left: -0.9,
5039            right: 0.6,
5040            c0: 0.15,
5041            c1: -0.2,
5042            c2: 0.08,
5043            c3: 0.17,
5044        };
5045        let moments = reduce_sextic_moments(
5046            cell,
5047            [
5048                simpson_integral(cell.left, cell.right, 3000, |z| (-cell.q(z)).exp()),
5049                simpson_integral(cell.left, cell.right, 3000, |z| z * (-cell.q(z)).exp()),
5050                simpson_integral(cell.left, cell.right, 3000, |z| z * z * (-cell.q(z)).exp()),
5051                simpson_integral(cell.left, cell.right, 3000, |z| {
5052                    z.powi(3) * (-cell.q(z)).exp()
5053                }),
5054                simpson_integral(cell.left, cell.right, 3000, |z| {
5055                    z.powi(4) * (-cell.q(z)).exp()
5056                }),
5057            ],
5058            9,
5059        )
5060        .expect("reduced moments");
5061
5062        let r = [0.7, -0.1, 0.3];
5063        let s = [0.2, 0.5];
5064        let second = [0.4, -0.2, 0.1];
5065        let exact_first = cell_first_derivative_from_moments(&r, &moments).expect("first");
5066        let exact_second =
5067            cell_second_derivative_from_moments(cell, &r, &s, &second, &moments).expect("second");
5068
5069        let numeric_first = simpson_integral(cell.left, cell.right, 3000, |z| {
5070            polynomial_value(&r, z) * (-cell.q(z)).exp() / (2.0 * std::f64::consts::PI)
5071        });
5072        let numeric_second = simpson_integral(cell.left, cell.right, 3000, |z| {
5073            let eta = cell.eta(z);
5074            (polynomial_value(&second, z) - eta * polynomial_value(&r, z) * polynomial_value(&s, z))
5075                * (-cell.q(z)).exp()
5076                / (2.0 * std::f64::consts::PI)
5077        });
5078
5079        assert!((exact_first - numeric_first).abs() < 1e-7);
5080        assert!((exact_second - numeric_second).abs() < 1e-7);
5081    }
5082
5083    #[test]
5084    fn moment_reduced_third_derivative_matches_numeric_integral() {
5085        let cell = DenestedCubicCell {
5086            left: -0.85,
5087            right: 0.7,
5088            c0: -0.12,
5089            c1: 0.18,
5090            c2: 0.09,
5091            c3: -0.11,
5092        };
5093        let moments = evaluate_cell_moments(cell, 12).expect("cell moments");
5094        let r = [0.35, -0.12, 0.08];
5095        let s = [0.17, 0.09];
5096        let t = [-0.21, 0.14, -0.04];
5097        let rs = [0.11, -0.07, 0.05];
5098        let rt = [-0.06, 0.03];
5099        let st = [0.08, -0.02, 0.01];
5100        let rst = [0.04, -0.05, 0.02];
5101
5102        let exact_third = cell_third_derivative_from_moments(
5103            cell,
5104            &r,
5105            &s,
5106            &t,
5107            &rs,
5108            &rt,
5109            &st,
5110            &rst,
5111            &moments.moments,
5112        )
5113        .expect("third derivative");
5114        let numeric_third = simpson_integral(cell.left, cell.right, 4000, |z| {
5115            let eta = cell.eta(z);
5116            let rz = polynomial_value(&r, z);
5117            let sz = polynomial_value(&s, z);
5118            let tz = polynomial_value(&t, z);
5119            let rsz = polynomial_value(&rs, z);
5120            let rtz = polynomial_value(&rt, z);
5121            let stz = polynomial_value(&st, z);
5122            let rstz = polynomial_value(&rst, z);
5123            (rstz - eta * (rsz * tz + rtz * sz + stz * rz) + (eta * eta - 1.0) * rz * sz * tz)
5124                * (-cell.q(z)).exp()
5125                / (2.0 * std::f64::consts::PI)
5126        });
5127
5128        assert!((exact_third - numeric_third).abs() < 1e-7);
5129    }
5130
5131    #[test]
5132    fn moment_reduced_fourth_derivative_matches_numeric_integral() {
5133        let cell = DenestedCubicCell {
5134            left: -0.8,
5135            right: 0.65,
5136            c0: 0.11,
5137            c1: -0.22,
5138            c2: 0.07,
5139            c3: 0.13,
5140        };
5141        let moments = evaluate_cell_moments(cell, 16).expect("cell moments");
5142        let r = [0.21, -0.13, 0.06];
5143        let s = [-0.18, 0.04];
5144        let t = [0.09, 0.07, -0.03];
5145        let u = [-0.14, 0.05];
5146        let rs = [0.08, -0.03, 0.02];
5147        let rt = [-0.05, 0.01];
5148        let ru = [0.04, -0.02, 0.01];
5149        let st = [0.03, 0.02];
5150        let su = [-0.02, 0.05, -0.01];
5151        let tu = [0.07, -0.04];
5152        let rst = [0.03, -0.01, 0.02];
5153        let rsu = [-0.02, 0.04];
5154        let rtu = [0.01, 0.02, -0.01];
5155        let stu = [-0.03, 0.02];
5156        let rstu = [0.02, -0.01, 0.01];
5157
5158        let exact_fourth = cell_fourth_derivative_from_moments(
5159            cell,
5160            &r,
5161            &s,
5162            &t,
5163            &u,
5164            &rs,
5165            &rt,
5166            &ru,
5167            &st,
5168            &su,
5169            &tu,
5170            &rst,
5171            &rsu,
5172            &rtu,
5173            &stu,
5174            &rstu,
5175            &moments.moments,
5176        )
5177        .expect("fourth derivative");
5178        let numeric_fourth = simpson_integral(cell.left, cell.right, 5000, |z| {
5179            let eta = cell.eta(z);
5180            let rz = polynomial_value(&r, z);
5181            let sz = polynomial_value(&s, z);
5182            let tz = polynomial_value(&t, z);
5183            let uz = polynomial_value(&u, z);
5184            let rsz = polynomial_value(&rs, z);
5185            let rtz = polynomial_value(&rt, z);
5186            let ruz = polynomial_value(&ru, z);
5187            let stz = polynomial_value(&st, z);
5188            let suz = polynomial_value(&su, z);
5189            let tuz = polynomial_value(&tu, z);
5190            let rstz = polynomial_value(&rst, z);
5191            let rsuz = polynomial_value(&rsu, z);
5192            let rtuz = polynomial_value(&rtu, z);
5193            let stuz = polynomial_value(&stu, z);
5194            let rstuz = polynomial_value(&rstu, z);
5195            let linear =
5196                rstz * uz + rsuz * tz + rtuz * sz + stuz * rz + rsz * tuz + rtz * suz + ruz * stz;
5197            let quadratic = rsz * tz * uz
5198                + rtz * sz * uz
5199                + ruz * sz * tz
5200                + stz * rz * uz
5201                + suz * rz * tz
5202                + tuz * rz * sz;
5203            let quartic = rz * sz * tz * uz;
5204            (rstuz - eta * linear
5205                + (eta * eta - 1.0) * quadratic
5206                + (-eta * eta * eta + 3.0 * eta) * quartic)
5207                * (-cell.q(z)).exp()
5208                / (2.0 * std::f64::consts::PI)
5209        });
5210
5211        assert!((exact_fourth - numeric_fourth).abs() < 2e-7);
5212    }
5213
    // End-to-end check of the moment-based derivative formulas with respect to
    // the parameters `a` and `b`, for every order from first through fourth.
    //
    // The test builds a cell from a score span and a link span, obtains the
    // coefficient partials from the `denested_cell_*_partials` helpers, and
    // feeds them to the `cell_*_derivative_from_moments` functions. Each result
    // is compared with Simpson quadrature of a hand-written integrand built
    // from pointwise reference partials (`eta_a`, `eta_b`, ...) and the weight
    // `exp(-cell.q(z)) * INV_TWO_PI`.
    #[test]
    fn denested_cell_parameter_derivatives_match_exact_integrands() {
        let score_span = LocalSpanCubic {
            left: -0.75,
            right: 0.25,
            c0: 0.08,
            c1: -0.03,
            c2: 0.02,
            c3: -0.01,
        };
        let link_span = LocalSpanCubic {
            left: -0.6,
            right: 0.9,
            c0: -0.05,
            c1: 0.04,
            c2: -0.02,
            c3: 0.015,
        };
        let a = 0.3;
        let b = -0.7;
        // The cell takes its bounds from the score span and its cubic
        // coefficients from `denested_cell_coefficients`.
        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
        let cell = DenestedCubicCell {
            left: score_span.left,
            right: score_span.right,
            c0: coeffs[0],
            c1: coeffs[1],
            c2: coeffs[2],
            c3: coeffs[3],
        };
        // Moments are requested up to degree 24 so the fourth-order formulas
        // below have enough entries to work with.
        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
        let (dc_daa, dc_dab, dc_dbb) = denested_cell_second_partials(score_span, link_span, a, b);
        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
        // The third partials are computed from `link_span` alone (no `a`/`b`
        // argument), so the fourth-order coefficient partial is passed as zero.
        let zero = [0.0; 4];
        let link_third = 6.0 * link_span.c3;

        // Pointwise reference partials used by the quadrature integrands.
        // NOTE(review): these presumably correspond to
        // eta(z) = a + b * (z + score(z)) + link(a + b * z); confirm against
        // `denested_cell_coefficients`.
        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
        // `+ 0.0 * z` leaves the constant unchanged for finite z; it only keeps
        // the closure shaped like its siblings.
        let eta_aaa = |z: f64| link_third + 0.0 * z;
        let eta_aab = |z: f64| z * link_third;
        let eta_abb = |z: f64| z * z * link_third;
        let eta_bbb = |z: f64| z * z * z * link_third;

        // First- and second-order derivatives from the reduced moments.
        let exact_a = cell_first_derivative_from_moments(&dc_da, &state.moments).expect("a");
        let exact_b = cell_first_derivative_from_moments(&dc_db, &state.moments).expect("b");
        let exact_aa =
            cell_second_derivative_from_moments(cell, &dc_da, &dc_da, &dc_daa, &state.moments)
                .expect("aa");
        let exact_ab =
            cell_second_derivative_from_moments(cell, &dc_da, &dc_db, &dc_dab, &state.moments)
                .expect("ab");
        let exact_bb =
            cell_second_derivative_from_moments(cell, &dc_db, &dc_db, &dc_dbb, &state.moments)
                .expect("bb");
        // Third-order derivatives. Argument order, as exercised by
        // `moment_reduced_third_derivative_matches_numeric_integral`:
        // (cell, r, s, t, rs, rt, st, rst, moments).
        let exact_aaa = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daaa,
            &state.moments,
        )
        .expect("aaa");
        // (r, s, t) = (a, a, b).
        let exact_aab = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_daab,
            &state.moments,
        )
        .expect("aab");
        // (r, s, t) = (a, b, b).
        let exact_abb = cell_third_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_dabb,
            &state.moments,
        )
        .expect("abb");
        let exact_bbb = cell_third_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbbb,
            &state.moments,
        )
        .expect("bbb");
        // Fourth-order derivatives. Argument order, as exercised by
        // `moment_reduced_fourth_derivative_matches_numeric_integral`:
        // (cell, r, s, t, u, rs, rt, ru, st, su, tu, rst, rsu, rtu, stu, rstu,
        // moments). The `rstu` slot is always `zero` here.
        let exact_aaaa = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daa,
            &dc_daaa,
            &dc_daaa,
            &dc_daaa,
            &dc_daaa,
            &zero,
            &state.moments,
        )
        .expect("aaaa");
        // (r, s, t, u) = (a, a, a, b).
        let exact_aaab = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_daa,
            &dc_daa,
            &dc_dab,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_daaa,
            &dc_daab,
            &dc_daab,
            &dc_daab,
            &zero,
            &state.moments,
        )
        .expect("aaab");
        // (r, s, t, u) = (a, a, b, b).
        let exact_aabb = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_daa,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_daab,
            &dc_daab,
            &dc_dabb,
            &dc_dabb,
            &zero,
            &state.moments,
        )
        .expect("aabb");
        // (r, s, t, u) = (a, b, b, b).
        let exact_abbb = cell_fourth_derivative_from_moments(
            cell,
            &dc_da,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dab,
            &dc_dab,
            &dc_dab,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dabb,
            &dc_dabb,
            &dc_dabb,
            &dc_dbbb,
            &zero,
            &state.moments,
        )
        .expect("abbb");
        let exact_bbbb = cell_fourth_derivative_from_moments(
            cell,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_db,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbb,
            &dc_dbbb,
            &dc_dbbb,
            &dc_dbbb,
            &dc_dbbb,
            &zero,
            &state.moments,
        )
        .expect("bbbb");

        // Quadrature references. Every integrand is a combination of the
        // pointwise partials above times exp(-cell.q(z)) * INV_TWO_PI
        // (INV_TWO_PI is defined elsewhere in the file; presumably 1 / (2*pi)).
        let numeric_a = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_a(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_b = simpson_integral(cell.left, cell.right, 5000, |z| {
            eta_b(z) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        // Second order: eta_xy - eta * eta_x * eta_y.
        let numeric_aa = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_aa(z) - cell.eta(z) * eta_a(z) * eta_a(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_ab = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_ab(z) - cell.eta(z) * eta_a(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        let numeric_bb = simpson_integral(cell.left, cell.right, 5000, |z| {
            (eta_bb(z) - cell.eta(z) * eta_b(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
        });
        // Third order: the generic (rs*t + rt*s + st*r) sum collapses to the
        // factors 3 (pure) or 1 + 2 (mixed) once arguments repeat.
        let numeric_aaa = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            (eta_aaa(z) - 3.0 * eta * eta_aa(z) * eta_a(z) + (eta * eta - 1.0) * eta_a(z).powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aab = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_aab(z) - eta * (eta_aa(z) * b_z + 2.0 * eta_ab(z) * a_z)
                + (eta * eta - 1.0) * a_z * a_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            (eta_abb(z) - eta * (2.0 * eta_ab(z) * b_z + eta_bb(z) * a_z)
                + (eta * eta - 1.0) * a_z * b_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            (eta_bbb(z) - 3.0 * eta * eta_bb(z) * eta_b(z) + (eta * eta - 1.0) * eta_b(z).powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        // Fourth order: there is no leading fourth-partial term (it is zero
        // here), so each integrand starts with the -eta group.
        let numeric_aaaa = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let eta_a_z = eta_a(z);
            let eta_aa_z = eta_aa(z);
            let eta_aaa_z = eta_aaa(z);
            (-eta * (4.0 * eta_aaa_z * eta_a_z + 3.0 * eta_aa_z * eta_aa_z)
                + (eta * eta - 1.0) * (6.0 * eta_aa_z * eta_a_z * eta_a_z)
                + (-eta * eta * eta + 3.0 * eta) * eta_a_z.powi(4))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aaab = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let aa_z = eta_aa(z);
            let ab_z = eta_ab(z);
            let aaa_z = eta_aaa(z);
            let aab_z = eta_aab(z);
            (-eta * (aaa_z * b_z + 3.0 * aab_z * a_z + 3.0 * aa_z * ab_z)
                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * b_z + 3.0 * ab_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z.powi(3) * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_aabb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let aa_z = eta_aa(z);
            let ab_z = eta_ab(z);
            let bb_z = eta_bb(z);
            let aab_z = eta_aab(z);
            let abb_z = eta_abb(z);
            (-eta * (2.0 * aab_z * b_z + 2.0 * abb_z * a_z + aa_z * bb_z + 2.0 * ab_z * ab_z)
                + (eta * eta - 1.0)
                    * (aa_z * b_z * b_z + 4.0 * ab_z * a_z * b_z + bb_z * a_z * a_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * b_z * b_z)
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_abbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let a_z = eta_a(z);
            let b_z = eta_b(z);
            let ab_z = eta_ab(z);
            let bb_z = eta_bb(z);
            let abb_z = eta_abb(z);
            let bbb_z = eta_bbb(z);
            (-eta * (3.0 * abb_z * b_z + bbb_z * a_z + 3.0 * ab_z * bb_z)
                + (eta * eta - 1.0) * (3.0 * ab_z * b_z * b_z + 3.0 * bb_z * a_z * b_z)
                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z.powi(3))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });
        let numeric_bbbb = simpson_integral(cell.left, cell.right, 5000, |z| {
            let eta = cell.eta(z);
            let eta_b_z = eta_b(z);
            let eta_bb_z = eta_bb(z);
            let eta_bbb_z = eta_bbb(z);
            (-eta * (4.0 * eta_bbb_z * eta_b_z + 3.0 * eta_bb_z * eta_bb_z)
                + (eta * eta - 1.0) * (6.0 * eta_bb_z * eta_b_z * eta_b_z)
                + (-eta * eta * eta + 3.0 * eta) * eta_b_z.powi(4))
                * (-cell.q(z)).exp()
                * INV_TWO_PI
        });

        // Tolerances loosen with derivative order: 1e-8 for first/second,
        // 2e-7 for third, 2e-6 for fourth.
        assert!((exact_a - numeric_a).abs() < 1e-8);
        assert!((exact_b - numeric_b).abs() < 1e-8);
        assert!((exact_aa - numeric_aa).abs() < 1e-8);
        assert!((exact_ab - numeric_ab).abs() < 1e-8);
        assert!((exact_bb - numeric_bb).abs() < 1e-8);
        assert!((exact_aaa - numeric_aaa).abs() < 2e-7);
        assert!((exact_aab - numeric_aab).abs() < 2e-7);
        assert!((exact_abb - numeric_abb).abs() < 2e-7);
        assert!((exact_bbb - numeric_bbb).abs() < 2e-7);
        assert!((exact_aaaa - numeric_aaaa).abs() < 2e-6);
        assert!((exact_aaab - numeric_aaab).abs() < 2e-6);
        assert!((exact_aabb - numeric_aabb).abs() < 2e-6);
        assert!((exact_abbb - numeric_abbb).abs() < 2e-6);
        assert!((exact_bbbb - numeric_bbbb).abs() < 2e-6);
    }
5547
5548    #[test]
5549    fn link_basis_cell_derivatives_match_exact_integrands() {
5550        let score_span = LocalSpanCubic {
5551            left: -0.75,
5552            right: 0.25,
5553            c0: 0.08,
5554            c1: -0.03,
5555            c2: 0.02,
5556            c3: -0.01,
5557        };
5558        let link_span = LocalSpanCubic {
5559            left: -0.6,
5560            right: 0.9,
5561            c0: -0.05,
5562            c1: 0.04,
5563            c2: -0.02,
5564            c3: 0.015,
5565        };
5566        let link_basis_span = LocalSpanCubic {
5567            left: -0.6,
5568            right: 0.9,
5569            c0: 0.02,
5570            c1: -0.01,
5571            c2: 0.03,
5572            c3: -0.02,
5573        };
5574        let a = 0.3;
5575        let b = -0.7;
5576        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5577        let cell = DenestedCubicCell {
5578            left: score_span.left,
5579            right: score_span.right,
5580            c0: coeffs[0],
5581            c1: coeffs[1],
5582            c2: coeffs[2],
5583            c3: coeffs[3],
5584        };
5585        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5586        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5587        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
5588        let dc_daa = second_partials.0;
5589        let dc_dab = second_partials.1;
5590        let dc_dbb = second_partials.2;
5591        let denested_third = denested_cell_third_partials(link_span);
5592        let dc_daaa = denested_third.0;
5593        let dc_dbbb = denested_third.3;
5594
5595        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
5596        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
5597        let (coeff_aaw, coeff_abw, coeff_bbw) =
5598            link_basis_cell_second_partials(link_basis_span, a, b);
5599        let link_basis_third = link_basis_cell_third_partials(link_basis_span);
5600        let coeff_aaaw = link_basis_third.0;
5601        let coeff_bbbw = link_basis_third.3;
5602        let zero = [0.0; 4];
5603        let basis_third = 6.0 * link_basis_span.c3;
5604
5605        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5606        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5607        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
5608        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5609        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5610        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
5611        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
5612        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
5613        let eta_aaw = |z: f64| link_basis_span.second_derivative(a + b * z);
5614        let eta_abw = |z: f64| z * link_basis_span.second_derivative(a + b * z);
5615        let eta_bbw = |z: f64| z * z * link_basis_span.second_derivative(a + b * z);
5616        let eta_aaaw = |z: f64| basis_third + 0.0 * z;
5617        let eta_bbbw = |z: f64| z * z * z * basis_third;
5618
5619        let exact_w = cell_first_derivative_from_moments(&coeff_w, &state.moments).expect("w");
5620        let exact_aw =
5621            cell_second_derivative_from_moments(cell, &dc_da, &coeff_w, &coeff_aw, &state.moments)
5622                .expect("aw");
5623        let exact_bw =
5624            cell_second_derivative_from_moments(cell, &dc_db, &coeff_w, &coeff_bw, &state.moments)
5625                .expect("bw");
5626        let exact_ww =
5627            cell_second_derivative_from_moments(cell, &coeff_w, &coeff_w, &zero, &state.moments)
5628                .expect("ww");
5629        let exact_aaw = cell_third_derivative_from_moments(
5630            cell,
5631            &dc_da,
5632            &dc_da,
5633            &coeff_w,
5634            &dc_daa,
5635            &coeff_aw,
5636            &coeff_aw,
5637            &coeff_aaw,
5638            &state.moments,
5639        )
5640        .expect("aaw");
5641        let exact_abw = cell_third_derivative_from_moments(
5642            cell,
5643            &dc_da,
5644            &dc_db,
5645            &coeff_w,
5646            &dc_dab,
5647            &coeff_aw,
5648            &coeff_bw,
5649            &coeff_abw,
5650            &state.moments,
5651        )
5652        .expect("abw");
5653        let exact_bbw = cell_third_derivative_from_moments(
5654            cell,
5655            &dc_db,
5656            &dc_db,
5657            &coeff_w,
5658            &dc_dbb,
5659            &coeff_bw,
5660            &coeff_bw,
5661            &coeff_bbw,
5662            &state.moments,
5663        )
5664        .expect("bbw");
5665        let exact_www = cell_third_derivative_from_moments(
5666            cell,
5667            &coeff_w,
5668            &coeff_w,
5669            &coeff_w,
5670            &zero,
5671            &zero,
5672            &zero,
5673            &zero,
5674            &state.moments,
5675        )
5676        .expect("www");
5677        let exact_aaaw = cell_fourth_derivative_from_moments(
5678            cell,
5679            &dc_da,
5680            &dc_da,
5681            &dc_da,
5682            &coeff_w,
5683            &dc_daa,
5684            &dc_daa,
5685            &coeff_aw,
5686            &dc_daa,
5687            &coeff_aw,
5688            &coeff_aw,
5689            &dc_daaa,
5690            &coeff_aaw,
5691            &coeff_aaw,
5692            &coeff_aaw,
5693            &coeff_aaaw,
5694            &state.moments,
5695        )
5696        .expect("aaaw");
5697        let exact_aaww = cell_fourth_derivative_from_moments(
5698            cell,
5699            &dc_da,
5700            &dc_da,
5701            &coeff_w,
5702            &coeff_w,
5703            &dc_daa,
5704            &coeff_aw,
5705            &coeff_aw,
5706            &coeff_aw,
5707            &coeff_aw,
5708            &zero,
5709            &coeff_aaw,
5710            &coeff_aaw,
5711            &zero,
5712            &zero,
5713            &zero,
5714            &state.moments,
5715        )
5716        .expect("aaww");
5717        let exact_abww = cell_fourth_derivative_from_moments(
5718            cell,
5719            &dc_da,
5720            &dc_db,
5721            &coeff_w,
5722            &coeff_w,
5723            &dc_dab,
5724            &coeff_aw,
5725            &coeff_aw,
5726            &coeff_bw,
5727            &coeff_bw,
5728            &zero,
5729            &coeff_abw,
5730            &coeff_abw,
5731            &zero,
5732            &zero,
5733            &zero,
5734            &state.moments,
5735        )
5736        .expect("abww");
5737        let exact_bbww = cell_fourth_derivative_from_moments(
5738            cell,
5739            &dc_db,
5740            &dc_db,
5741            &coeff_w,
5742            &coeff_w,
5743            &dc_dbb,
5744            &coeff_bw,
5745            &coeff_bw,
5746            &coeff_bw,
5747            &coeff_bw,
5748            &zero,
5749            &coeff_bbw,
5750            &coeff_bbw,
5751            &zero,
5752            &zero,
5753            &zero,
5754            &state.moments,
5755        )
5756        .expect("bbww");
5757        let exact_bbbw = cell_fourth_derivative_from_moments(
5758            cell,
5759            &dc_db,
5760            &dc_db,
5761            &dc_db,
5762            &coeff_w,
5763            &dc_dbb,
5764            &dc_dbb,
5765            &coeff_bw,
5766            &dc_dbb,
5767            &coeff_bw,
5768            &coeff_bw,
5769            &dc_dbbb,
5770            &coeff_bbw,
5771            &coeff_bbw,
5772            &coeff_bbw,
5773            &coeff_bbbw,
5774            &state.moments,
5775        )
5776        .expect("bbbw");
5777        let exact_wwww = cell_fourth_derivative_from_moments(
5778            cell,
5779            &coeff_w,
5780            &coeff_w,
5781            &coeff_w,
5782            &coeff_w,
5783            &zero,
5784            &zero,
5785            &zero,
5786            &zero,
5787            &zero,
5788            &zero,
5789            &zero,
5790            &zero,
5791            &zero,
5792            &zero,
5793            &zero,
5794            &state.moments,
5795        )
5796        .expect("wwww");
5797
5798        let numeric_w = simpson_integral(cell.left, cell.right, 5000, |z| {
5799            eta_w(z) * (-cell.q(z)).exp() * INV_TWO_PI
5800        });
5801        let numeric_aw = simpson_integral(cell.left, cell.right, 5000, |z| {
5802            (eta_aw(z) - cell.eta(z) * eta_a(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5803        });
5804        let numeric_bw = simpson_integral(cell.left, cell.right, 5000, |z| {
5805            (eta_bw(z) - cell.eta(z) * eta_b(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5806        });
5807        let numeric_ww = simpson_integral(cell.left, cell.right, 5000, |z| {
5808            (-cell.eta(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5809        });
5810        let numeric_aaw = simpson_integral(cell.left, cell.right, 5000, |z| {
5811            let eta = cell.eta(z);
5812            let w_z = eta_w(z);
5813            let a_z = eta_a(z);
5814            (eta_aaw(z) - eta * (eta_aa(z) * w_z + 2.0 * eta_aw(z) * a_z)
5815                + (eta * eta - 1.0) * a_z * a_z * w_z)
5816                * (-cell.q(z)).exp()
5817                * INV_TWO_PI
5818        });
5819        let numeric_abw = simpson_integral(cell.left, cell.right, 5000, |z| {
5820            let eta = cell.eta(z);
5821            let w_z = eta_w(z);
5822            let a_z = eta_a(z);
5823            let b_z = eta_b(z);
5824            (eta_abw(z) - eta * (eta_ab(z) * w_z + eta_aw(z) * b_z + eta_bw(z) * a_z)
5825                + (eta * eta - 1.0) * a_z * b_z * w_z)
5826                * (-cell.q(z)).exp()
5827                * INV_TWO_PI
5828        });
5829        let numeric_bbw = simpson_integral(cell.left, cell.right, 5000, |z| {
5830            let eta = cell.eta(z);
5831            let w_z = eta_w(z);
5832            let b_z = eta_b(z);
5833            (eta_bbw(z) - eta * (eta_bb(z) * w_z + 2.0 * eta_bw(z) * b_z)
5834                + (eta * eta - 1.0) * b_z * b_z * w_z)
5835                * (-cell.q(z)).exp()
5836                * INV_TWO_PI
5837        });
5838        let numeric_www = simpson_integral(cell.left, cell.right, 5000, |z| {
5839            let eta = cell.eta(z);
5840            let w_z = eta_w(z);
5841            ((eta * eta - 1.0) * w_z * w_z * w_z) * (-cell.q(z)).exp() * INV_TWO_PI
5842        });
5843        let numeric_aaaw = simpson_integral(cell.left, cell.right, 5000, |z| {
5844            let eta = cell.eta(z);
5845            let a_z = eta_a(z);
5846            let w_z = eta_w(z);
5847            let aa_z = eta_aa(z);
5848            let aw_z = eta_aw(z);
5849            (eta_aaaw(z)
5850                - eta * ((dc_daaa[0] + 0.0 * z) * w_z + 3.0 * eta_aaw(z) * a_z + 3.0 * aa_z * aw_z)
5851                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * w_z + 3.0 * aw_z * a_z * a_z)
5852                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * a_z * w_z)
5853                * (-cell.q(z)).exp()
5854                * INV_TWO_PI
5855        });
5856        let numeric_aaww = simpson_integral(cell.left, cell.right, 5000, |z| {
5857            let eta = cell.eta(z);
5858            let a_z = eta_a(z);
5859            let w_z = eta_w(z);
5860            let aw_z = eta_aw(z);
5861            (-(2.0 * eta * (eta_aaw(z) * w_z + aw_z * aw_z))
5862                + (eta * eta - 1.0) * (eta_aa(z) * w_z * w_z + 4.0 * aw_z * a_z * w_z)
5863                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * w_z * w_z)
5864                * (-cell.q(z)).exp()
5865                * INV_TWO_PI
5866        });
5867        let numeric_abww = simpson_integral(cell.left, cell.right, 5000, |z| {
5868            let eta = cell.eta(z);
5869            let a_z = eta_a(z);
5870            let b_z = eta_b(z);
5871            let w_z = eta_w(z);
5872            let aw_z = eta_aw(z);
5873            let bw_z = eta_bw(z);
5874            (-(2.0 * eta * (eta_abw(z) * w_z + aw_z * bw_z))
5875                + (eta * eta - 1.0)
5876                    * (eta_ab(z) * w_z * w_z + 2.0 * aw_z * b_z * w_z + 2.0 * bw_z * a_z * w_z)
5877                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * w_z * w_z)
5878                * (-cell.q(z)).exp()
5879                * INV_TWO_PI
5880        });
5881        let numeric_bbww = simpson_integral(cell.left, cell.right, 5000, |z| {
5882            let eta = cell.eta(z);
5883            let b_z = eta_b(z);
5884            let w_z = eta_w(z);
5885            let bw_z = eta_bw(z);
5886            (-(2.0 * eta * (eta_bbw(z) * w_z + bw_z * bw_z))
5887                + (eta * eta - 1.0) * (eta_bb(z) * w_z * w_z + 4.0 * bw_z * b_z * w_z)
5888                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * w_z * w_z)
5889                * (-cell.q(z)).exp()
5890                * INV_TWO_PI
5891        });
5892        let numeric_bbbw = simpson_integral(cell.left, cell.right, 5000, |z| {
5893            let eta = cell.eta(z);
5894            let b_z = eta_b(z);
5895            let w_z = eta_w(z);
5896            let bb_z = eta_bb(z);
5897            let bw_z = eta_bw(z);
5898            (eta_bbbw(z)
5899                - eta
5900                    * ((dc_dbbb[3] * z * z * z) * w_z + 3.0 * eta_bbw(z) * b_z + 3.0 * bb_z * bw_z)
5901                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * w_z + 3.0 * bw_z * b_z * b_z)
5902                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * w_z)
5903                * (-cell.q(z)).exp()
5904                * INV_TWO_PI
5905        });
5906        let numeric_wwww = simpson_integral(cell.left, cell.right, 5000, |z| {
5907            let eta = cell.eta(z);
5908            let w_z = eta_w(z);
5909            ((-eta * eta * eta + 3.0 * eta) * w_z * w_z * w_z * w_z)
5910                * (-cell.q(z)).exp()
5911                * INV_TWO_PI
5912        });
5913
5914        assert!((exact_w - numeric_w).abs() < 1e-8);
5915        assert!((exact_aw - numeric_aw).abs() < 1e-7);
5916        assert!((exact_bw - numeric_bw).abs() < 1e-7);
5917        assert!((exact_ww - numeric_ww).abs() < 1e-7);
5918        assert!((exact_aaw - numeric_aaw).abs() < 2e-6);
5919        assert!((exact_abw - numeric_abw).abs() < 2e-6);
5920        assert!((exact_bbw - numeric_bbw).abs() < 2e-6);
5921        assert!((exact_www - numeric_www).abs() < 2e-6);
5922        assert!((exact_aaaw - numeric_aaaw).abs() < 3e-6);
5923        assert!((exact_aaww - numeric_aaww).abs() < 3e-6);
5924        assert!((exact_abww - numeric_abww).abs() < 3e-6);
5925        assert!((exact_bbww - numeric_bbww).abs() < 3e-6);
5926        assert!((exact_bbbw - numeric_bbbw).abs() < 3e-6);
5927        assert!((exact_wwww - numeric_wwww).abs() < 3e-6);
5928    }
5929
5930    #[test]
5931    fn score_basis_cell_derivatives_match_exact_integrands() {
5932        let score_span = LocalSpanCubic {
5933            left: -0.75,
5934            right: 0.25,
5935            c0: 0.08,
5936            c1: -0.03,
5937            c2: 0.02,
5938            c3: -0.01,
5939        };
5940        let score_basis_span = LocalSpanCubic {
5941            left: -0.75,
5942            right: 0.25,
5943            c0: -0.04,
5944            c1: 0.06,
5945            c2: -0.01,
5946            c3: 0.02,
5947        };
5948        let link_span = LocalSpanCubic {
5949            left: -0.6,
5950            right: 0.9,
5951            c0: -0.05,
5952            c1: 0.04,
5953            c2: -0.02,
5954            c3: 0.015,
5955        };
5956        let a = 0.3;
5957        let b = -0.7;
5958        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5959        let cell = DenestedCubicCell {
5960            left: score_span.left,
5961            right: score_span.right,
5962            c0: coeffs[0],
5963            c1: coeffs[1],
5964            c2: coeffs[2],
5965            c3: coeffs[3],
5966        };
5967        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5968        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5969        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
5970        let dc_daa = second_partials.0;
5971        let dc_dab = second_partials.1;
5972        let dc_dbb = second_partials.2;
5973        let denested_third = denested_cell_third_partials(link_span);
5974        let dc_dbbb = denested_third.3;
5975
5976        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
5977        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
5978        let zero = [0.0; 4];
5979
5980        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5981        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5982        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5983        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5984        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
5985        let eta_bh = |z: f64| score_basis_span.evaluate(z);
5986
5987        let exact_h = cell_first_derivative_from_moments(&coeff_h, &state.moments).expect("h");
5988        let exact_ah =
5989            cell_second_derivative_from_moments(cell, &dc_da, &coeff_h, &zero, &state.moments)
5990                .expect("ah");
5991        let exact_bh =
5992            cell_second_derivative_from_moments(cell, &dc_db, &coeff_h, &coeff_bh, &state.moments)
5993                .expect("bh");
5994        let exact_hh =
5995            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_h, &zero, &state.moments)
5996                .expect("hh");
5997        let exact_abh = cell_third_derivative_from_moments(
5998            cell,
5999            &dc_da,
6000            &dc_db,
6001            &coeff_h,
6002            &dc_dab,
6003            &zero,
6004            &coeff_bh,
6005            &zero,
6006            &state.moments,
6007        )
6008        .expect("abh");
6009        let exact_bbh = cell_third_derivative_from_moments(
6010            cell,
6011            &dc_db,
6012            &dc_db,
6013            &coeff_h,
6014            &dc_dbb,
6015            &coeff_bh,
6016            &coeff_bh,
6017            &zero,
6018            &state.moments,
6019        )
6020        .expect("bbh");
6021        let exact_bhh = cell_third_derivative_from_moments(
6022            cell,
6023            &dc_db,
6024            &coeff_h,
6025            &coeff_h,
6026            &coeff_bh,
6027            &coeff_bh,
6028            &zero,
6029            &zero,
6030            &state.moments,
6031        )
6032        .expect("bhh");
6033        let exact_hhh = cell_third_derivative_from_moments(
6034            cell,
6035            &coeff_h,
6036            &coeff_h,
6037            &coeff_h,
6038            &zero,
6039            &zero,
6040            &zero,
6041            &zero,
6042            &state.moments,
6043        )
6044        .expect("hhh");
6045        let exact_bbbh = cell_fourth_derivative_from_moments(
6046            cell,
6047            &dc_db,
6048            &dc_db,
6049            &dc_db,
6050            &coeff_h,
6051            &dc_dbb,
6052            &dc_dbb,
6053            &coeff_bh,
6054            &dc_dbb,
6055            &coeff_bh,
6056            &coeff_bh,
6057            &dc_dbbb,
6058            &zero,
6059            &zero,
6060            &zero,
6061            &zero,
6062            &state.moments,
6063        )
6064        .expect("bbbh");
6065        let exact_aahh = cell_fourth_derivative_from_moments(
6066            cell,
6067            &dc_da,
6068            &dc_da,
6069            &coeff_h,
6070            &coeff_h,
6071            &dc_daa,
6072            &zero,
6073            &zero,
6074            &zero,
6075            &zero,
6076            &zero,
6077            &zero,
6078            &zero,
6079            &zero,
6080            &zero,
6081            &zero,
6082            &state.moments,
6083        )
6084        .expect("aahh");
6085        let exact_abhh = cell_fourth_derivative_from_moments(
6086            cell,
6087            &dc_da,
6088            &dc_db,
6089            &coeff_h,
6090            &coeff_h,
6091            &dc_dab,
6092            &zero,
6093            &zero,
6094            &coeff_bh,
6095            &coeff_bh,
6096            &zero,
6097            &zero,
6098            &zero,
6099            &zero,
6100            &zero,
6101            &zero,
6102            &state.moments,
6103        )
6104        .expect("abhh");
6105        let exact_bbhh = cell_fourth_derivative_from_moments(
6106            cell,
6107            &dc_db,
6108            &dc_db,
6109            &coeff_h,
6110            &coeff_h,
6111            &dc_dbb,
6112            &coeff_bh,
6113            &coeff_bh,
6114            &coeff_bh,
6115            &coeff_bh,
6116            &zero,
6117            &zero,
6118            &zero,
6119            &zero,
6120            &zero,
6121            &zero,
6122            &state.moments,
6123        )
6124        .expect("bbhh");
6125        let exact_bhhh = cell_fourth_derivative_from_moments(
6126            cell,
6127            &dc_db,
6128            &coeff_h,
6129            &coeff_h,
6130            &coeff_h,
6131            &coeff_bh,
6132            &coeff_bh,
6133            &coeff_bh,
6134            &zero,
6135            &zero,
6136            &zero,
6137            &zero,
6138            &zero,
6139            &zero,
6140            &zero,
6141            &zero,
6142            &state.moments,
6143        )
6144        .expect("bhhh");
6145        let exact_hhhh = cell_fourth_derivative_from_moments(
6146            cell,
6147            &coeff_h,
6148            &coeff_h,
6149            &coeff_h,
6150            &coeff_h,
6151            &zero,
6152            &zero,
6153            &zero,
6154            &zero,
6155            &zero,
6156            &zero,
6157            &zero,
6158            &zero,
6159            &zero,
6160            &zero,
6161            &zero,
6162            &state.moments,
6163        )
6164        .expect("hhhh");
6165
6166        let numeric_h = simpson_integral(cell.left, cell.right, 5000, |z| {
6167            eta_h(z) * (-cell.q(z)).exp() * INV_TWO_PI
6168        });
6169        let numeric_ah = simpson_integral(cell.left, cell.right, 5000, |z| {
6170            (-cell.eta(z) * eta_a(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6171        });
6172        let numeric_bh = simpson_integral(cell.left, cell.right, 5000, |z| {
6173            (eta_bh(z) - cell.eta(z) * eta_b(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6174        });
6175        let numeric_hh = simpson_integral(cell.left, cell.right, 5000, |z| {
6176            (-cell.eta(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6177        });
6178        let numeric_abh = simpson_integral(cell.left, cell.right, 5000, |z| {
6179            let eta = cell.eta(z);
6180            (-(eta * (eta_ab(z) * eta_h(z) + eta_bh(z) * eta_a(z)))
6181                + (eta * eta - 1.0) * eta_a(z) * eta_b(z) * eta_h(z))
6182                * (-cell.q(z)).exp()
6183                * INV_TWO_PI
6184        });
6185        let numeric_bbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6186            let eta = cell.eta(z);
6187            (-(eta * (eta_bb(z) * eta_h(z) + 2.0 * eta_bh(z) * eta_b(z)))
6188                + (eta * eta - 1.0) * eta_b(z) * eta_b(z) * eta_h(z))
6189                * (-cell.q(z)).exp()
6190                * INV_TWO_PI
6191        });
6192        let numeric_bhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6193            let eta = cell.eta(z);
6194            (-(2.0 * eta * eta_bh(z) * eta_h(z))
6195                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_h(z))
6196                * (-cell.q(z)).exp()
6197                * INV_TWO_PI
6198        });
6199        let numeric_hhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6200            let eta = cell.eta(z);
6201            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6202        });
6203        let numeric_bbbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6204            let eta = cell.eta(z);
6205            let b_z = eta_b(z);
6206            let h_z = eta_h(z);
6207            let bb_z = eta_bb(z);
6208            let bh_z = eta_bh(z);
6209            (-(eta * ((dc_dbbb[3] * z * z * z) * h_z + 3.0 * bb_z * bh_z))
6210                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * h_z + 3.0 * bh_z * b_z * b_z)
6211                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * h_z)
6212                * (-cell.q(z)).exp()
6213                * INV_TWO_PI
6214        });
6215        let numeric_aahh = simpson_integral(cell.left, cell.right, 5000, |z| {
6216            let eta = cell.eta(z);
6217            let a_z = eta_a(z);
6218            let h_z = eta_h(z);
6219            ((eta * eta - 1.0) * polynomial_value(&dc_daa, z) * h_z * h_z
6220                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * h_z * h_z)
6221                * (-cell.q(z)).exp()
6222                * INV_TWO_PI
6223        });
6224        let numeric_abhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6225            let eta = cell.eta(z);
6226            let a_z = eta_a(z);
6227            let b_z = eta_b(z);
6228            let h_z = eta_h(z);
6229            ((eta * eta - 1.0) * (eta_ab(z) * h_z * h_z + 2.0 * eta_bh(z) * a_z * h_z)
6230                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * h_z * h_z)
6231                * (-cell.q(z)).exp()
6232                * INV_TWO_PI
6233        });
6234        let numeric_bbhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6235            let eta = cell.eta(z);
6236            let b_z = eta_b(z);
6237            let h_z = eta_h(z);
6238            let bh_z = eta_bh(z);
6239            (-(2.0 * eta * bh_z * bh_z)
6240                + (eta * eta - 1.0) * (eta_bb(z) * h_z * h_z + 4.0 * bh_z * b_z * h_z)
6241                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * h_z * h_z)
6242                * (-cell.q(z)).exp()
6243                * INV_TWO_PI
6244        });
6245        let numeric_bhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6246            let eta = cell.eta(z);
6247            let h_z = eta_h(z);
6248            (-(eta * (3.0 * eta_bh(z) * h_z * h_z))
6249                + (eta * eta - 1.0) * (3.0 * eta_bh(z) * h_z * h_z)
6250                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * h_z * h_z)
6251                * (-cell.q(z)).exp()
6252                * INV_TWO_PI
6253        });
6254        let numeric_hhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6255            let eta = cell.eta(z);
6256            let h_z = eta_h(z);
6257            ((-eta * eta * eta + 3.0 * eta) * h_z * h_z * h_z * h_z)
6258                * (-cell.q(z)).exp()
6259                * INV_TWO_PI
6260        });
6261
6262        assert!((exact_h - numeric_h).abs() < 1e-8);
6263        assert!((exact_ah - numeric_ah).abs() < 1e-7);
6264        assert!((exact_bh - numeric_bh).abs() < 1e-7);
6265        assert!((exact_hh - numeric_hh).abs() < 1e-7);
6266        assert!((exact_abh - numeric_abh).abs() < 2e-6);
6267        assert!((exact_bbh - numeric_bbh).abs() < 2e-6);
6268        assert!((exact_bhh - numeric_bhh).abs() < 2e-6);
6269        assert!((exact_hhh - numeric_hhh).abs() < 2e-6);
6270        assert!((exact_bbbh - numeric_bbbh).abs() < 3e-6);
6271        assert!((exact_aahh - numeric_aahh).abs() < 3e-6);
6272        assert!((exact_abhh - numeric_abhh).abs() < 3e-6);
6273        assert!((exact_bbhh - numeric_bbhh).abs() < 3e-6);
6274        assert!((exact_bhhh - numeric_bhhh).abs() < 3e-6);
6275        assert!((exact_hhhh - numeric_hhhh).abs() < 3e-6);
6276    }
6277
6278    #[test]
6279    fn cross_basis_cell_derivatives_match_exact_integrands() {
6280        let score_span = LocalSpanCubic {
6281            left: -0.75,
6282            right: 0.25,
6283            c0: 0.08,
6284            c1: -0.03,
6285            c2: 0.02,
6286            c3: -0.01,
6287        };
6288        let score_basis_span = LocalSpanCubic {
6289            left: -0.75,
6290            right: 0.25,
6291            c0: -0.04,
6292            c1: 0.06,
6293            c2: -0.01,
6294            c3: 0.02,
6295        };
6296        let link_span = LocalSpanCubic {
6297            left: -0.6,
6298            right: 0.9,
6299            c0: -0.05,
6300            c1: 0.04,
6301            c2: -0.02,
6302            c3: 0.015,
6303        };
6304        let link_basis_span = LocalSpanCubic {
6305            left: -0.6,
6306            right: 0.9,
6307            c0: 0.02,
6308            c1: -0.01,
6309            c2: 0.03,
6310            c3: -0.02,
6311        };
6312        let a = 0.3;
6313        let b = -0.7;
6314        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6315        let cell = DenestedCubicCell {
6316            left: score_span.left,
6317            right: score_span.right,
6318            c0: coeffs[0],
6319            c1: coeffs[1],
6320            c2: coeffs[2],
6321            c3: coeffs[3],
6322        };
6323        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6324        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6325        let (dc_daa, dc_dab, _) = denested_cell_second_partials(score_span, link_span, a, b);
6326
6327        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6328        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6329        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
6330        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
6331        let (coeff_aaw, coeff_abw, _) = link_basis_cell_second_partials(link_basis_span, a, b);
6332        let zero = [0.0; 4];
6333
6334        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6335        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6336        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6337        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6338        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
6339        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6340        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
6341        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
6342
6343        let exact_hw =
6344            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_w, &zero, &state.moments)
6345                .expect("hw");
6346        let exact_ahw = cell_third_derivative_from_moments(
6347            cell,
6348            &dc_da,
6349            &coeff_h,
6350            &coeff_w,
6351            &zero,
6352            &coeff_aw,
6353            &zero,
6354            &zero,
6355            &state.moments,
6356        )
6357        .expect("ahw");
6358        let exact_bhw = cell_third_derivative_from_moments(
6359            cell,
6360            &dc_db,
6361            &coeff_h,
6362            &coeff_w,
6363            &coeff_bh,
6364            &coeff_bw,
6365            &zero,
6366            &zero,
6367            &state.moments,
6368        )
6369        .expect("bhw");
6370        let exact_hhw = cell_third_derivative_from_moments(
6371            cell,
6372            &coeff_h,
6373            &coeff_h,
6374            &coeff_w,
6375            &zero,
6376            &zero,
6377            &zero,
6378            &zero,
6379            &state.moments,
6380        )
6381        .expect("hhw");
6382        let exact_hww = cell_third_derivative_from_moments(
6383            cell,
6384            &coeff_h,
6385            &coeff_w,
6386            &coeff_w,
6387            &zero,
6388            &zero,
6389            &zero,
6390            &zero,
6391            &state.moments,
6392        )
6393        .expect("hww");
6394        let exact_aahw = cell_fourth_derivative_from_moments(
6395            cell,
6396            &dc_da,
6397            &dc_da,
6398            &coeff_h,
6399            &coeff_w,
6400            &dc_daa,
6401            &zero,
6402            &coeff_aw,
6403            &zero,
6404            &coeff_aw,
6405            &zero,
6406            &zero,
6407            &coeff_aaw,
6408            &zero,
6409            &zero,
6410            &zero,
6411            &state.moments,
6412        )
6413        .expect("aahw");
6414        let exact_hhww = cell_fourth_derivative_from_moments(
6415            cell,
6416            &coeff_h,
6417            &coeff_h,
6418            &coeff_w,
6419            &coeff_w,
6420            &zero,
6421            &zero,
6422            &zero,
6423            &zero,
6424            &zero,
6425            &zero,
6426            &zero,
6427            &zero,
6428            &zero,
6429            &zero,
6430            &zero,
6431            &state.moments,
6432        )
6433        .expect("hhww");
6434        let exact_hhhw = cell_fourth_derivative_from_moments(
6435            cell,
6436            &coeff_h,
6437            &coeff_h,
6438            &coeff_h,
6439            &coeff_w,
6440            &zero,
6441            &zero,
6442            &zero,
6443            &zero,
6444            &zero,
6445            &zero,
6446            &zero,
6447            &zero,
6448            &zero,
6449            &zero,
6450            &zero,
6451            &state.moments,
6452        )
6453        .expect("hhhw");
6454        let exact_abhw = cell_fourth_derivative_from_moments(
6455            cell,
6456            &dc_da,
6457            &dc_db,
6458            &coeff_h,
6459            &coeff_w,
6460            &dc_dab,
6461            &zero,
6462            &coeff_aw,
6463            &coeff_bh,
6464            &coeff_bw,
6465            &zero,
6466            &zero,
6467            &coeff_abw,
6468            &zero,
6469            &zero,
6470            &zero,
6471            &state.moments,
6472        )
6473        .expect("abhw");
6474        let exact_ahww = cell_fourth_derivative_from_moments(
6475            cell,
6476            &dc_da,
6477            &coeff_h,
6478            &coeff_w,
6479            &coeff_w,
6480            &zero,
6481            &coeff_aw,
6482            &coeff_aw,
6483            &zero,
6484            &zero,
6485            &zero,
6486            &zero,
6487            &zero,
6488            &zero,
6489            &zero,
6490            &zero,
6491            &state.moments,
6492        )
6493        .expect("ahww");
6494        let exact_bhww = cell_fourth_derivative_from_moments(
6495            cell,
6496            &dc_db,
6497            &coeff_h,
6498            &coeff_w,
6499            &coeff_w,
6500            &coeff_bh,
6501            &coeff_bw,
6502            &coeff_bw,
6503            &zero,
6504            &zero,
6505            &zero,
6506            &zero,
6507            &zero,
6508            &zero,
6509            &zero,
6510            &zero,
6511            &state.moments,
6512        )
6513        .expect("bhww");
6514        let exact_hwww = cell_fourth_derivative_from_moments(
6515            cell,
6516            &coeff_h,
6517            &coeff_w,
6518            &coeff_w,
6519            &coeff_w,
6520            &zero,
6521            &zero,
6522            &zero,
6523            &zero,
6524            &zero,
6525            &zero,
6526            &zero,
6527            &zero,
6528            &zero,
6529            &zero,
6530            &zero,
6531            &state.moments,
6532        )
6533        .expect("hwww");
6534
6535        let numeric_hw = simpson_integral(cell.left, cell.right, 5000, |z| {
6536            (-cell.eta(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6537        });
6538        let numeric_ahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6539            let eta = cell.eta(z);
6540            (-(eta * eta_aw(z) * eta_h(z)) + (eta * eta - 1.0) * eta_a(z) * eta_h(z) * eta_w(z))
6541                * (-cell.q(z)).exp()
6542                * INV_TWO_PI
6543        });
6544        let numeric_bhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6545            let eta = cell.eta(z);
6546            (-(eta * (eta_bh(z) * eta_w(z) + eta_bw(z) * eta_h(z)))
6547                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_w(z))
6548                * (-cell.q(z)).exp()
6549                * INV_TWO_PI
6550        });
6551        let numeric_hhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6552            let eta = cell.eta(z);
6553            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6554        });
6555        let numeric_hww = simpson_integral(cell.left, cell.right, 5000, |z| {
6556            let eta = cell.eta(z);
6557            ((eta * eta - 1.0) * eta_h(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6558        });
6559        let numeric_aahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6560            let eta = cell.eta(z);
6561            (-(eta * polynomial_value(&coeff_aaw, z) * eta_h(z))
6562                + (eta * eta - 1.0)
6563                    * (polynomial_value(&dc_daa, z) * eta_h(z) * eta_w(z)
6564                        + 2.0 * eta_aw(z) * eta_a(z) * eta_h(z))
6565                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_a(z) * eta_h(z) * eta_w(z))
6566                * (-cell.q(z)).exp()
6567                * INV_TWO_PI
6568        });
6569        let numeric_hhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6570            let eta = cell.eta(z);
6571            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_w(z) * eta_w(z))
6572                * (-cell.q(z)).exp()
6573                * INV_TWO_PI
6574        });
6575        let numeric_hhhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6576            let eta = cell.eta(z);
6577            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_h(z) * eta_w(z))
6578                * (-cell.q(z)).exp()
6579                * INV_TWO_PI
6580        });
6581        let numeric_abhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6582            let eta = cell.eta(z);
6583            (-(eta * polynomial_value(&coeff_abw, z) * eta_h(z) + eta * eta_aw(z) * eta_bh(z))
6584                + (eta * eta - 1.0)
6585                    * (eta_ab(z) * eta_h(z) * eta_w(z)
6586                        + eta_aw(z) * eta_b(z) * eta_h(z)
6587                        + eta_bh(z) * eta_a(z) * eta_w(z)
6588                        + eta_bw(z) * eta_a(z) * eta_h(z))
6589                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_b(z) * eta_h(z) * eta_w(z))
6590                * (-cell.q(z)).exp()
6591                * INV_TWO_PI
6592        });
6593        let numeric_ahww = simpson_integral(cell.left, cell.right, 5000, |z| {
6594            let eta = cell.eta(z);
6595            (2.0 * (eta * eta - 1.0) * eta_aw(z) * eta_h(z) * eta_w(z)
6596                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_h(z) * eta_w(z) * eta_w(z))
6597                * (-cell.q(z)).exp()
6598                * INV_TWO_PI
6599        });
6600        let numeric_bhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6601            let eta = cell.eta(z);
6602            let h_z = eta_h(z);
6603            let w_z = eta_w(z);
6604            ((eta * eta - 1.0) * (eta_bh(z) * w_z * w_z + 2.0 * eta_bw(z) * h_z * w_z)
6605                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * w_z * w_z)
6606                * (-cell.q(z)).exp()
6607                * INV_TWO_PI
6608        });
6609        let numeric_hwww = simpson_integral(cell.left, cell.right, 5000, |z| {
6610            let eta = cell.eta(z);
6611            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_w(z) * eta_w(z) * eta_w(z))
6612                * (-cell.q(z)).exp()
6613                * INV_TWO_PI
6614        });
6615
6616        assert!((exact_hw - numeric_hw).abs() < 1e-7);
6617        assert!((exact_ahw - numeric_ahw).abs() < 2e-6);
6618        assert!((exact_bhw - numeric_bhw).abs() < 2e-6);
6619        assert!((exact_hhw - numeric_hhw).abs() < 2e-6);
6620        assert!((exact_hww - numeric_hww).abs() < 2e-6);
6621        assert!((exact_aahw - numeric_aahw).abs() < 3e-6);
6622        assert!((exact_hhww - numeric_hhww).abs() < 3e-6);
6623        assert!((exact_hhhw - numeric_hhhw).abs() < 3e-6);
6624        assert!((exact_abhw - numeric_abhw).abs() < 3e-6);
6625        assert!((exact_ahww - numeric_ahww).abs() < 3e-6);
6626        assert!((exact_bhww - numeric_bhww).abs() < 3e-6);
6627        assert!((exact_hwww - numeric_hwww).abs() < 3e-6);
6628    }
6629
6630    #[test]
6631    fn cell_moment_scratch_reuses_buffers_under_margslope_like_pressure() {
6632        let cells = [
6633            DenestedCubicCell {
6634                left: -1.2,
6635                right: -0.35,
6636                c0: 0.18,
6637                c1: 0.72,
6638                c2: -0.045,
6639                c3: 0.018,
6640            },
6641            DenestedCubicCell {
6642                left: -0.35,
6643                right: 0.48,
6644                c0: -0.08,
6645                c1: 0.91,
6646                c2: 0.038,
6647                c3: -0.014,
6648            },
6649            DenestedCubicCell {
6650                left: 0.48,
6651                right: 1.4,
6652                c0: 0.11,
6653                c1: 0.83,
6654                c2: 0.022,
6655                c3: 0.012,
6656            },
6657        ];
6658        let mut scratch = CellMomentScratch::with_capacity(MAX_AFFINE_ANCHOR_DEGREE);
6659        for cell in cells {
6660            let baseline = evaluate_cell_moments(cell, 9).expect("baseline moments");
6661            let scratch_state =
6662                evaluate_cell_moments_with_scratch(cell, 9, &mut scratch).expect("scratch moments");
6663            assert_eq!(baseline.branch, scratch_state.branch);
6664            assert!((baseline.value - scratch_state.value).abs() <= 1e-10);
6665            assert_eq!(baseline.moments.len(), scratch_state.moments.len());
6666            for (lhs, rhs) in baseline.moments.iter().zip(scratch_state.moments.iter()) {
6667                assert!((lhs - rhs).abs() <= 1e-10, "{lhs} vs {rhs}");
6668            }
6669        }
6670
6671        reset_cell_moment_test_reallocs();
6672        let mut checksum = 0.0;
6673        for i in 0..5_000 {
6674            let cell = cells[i % cells.len()];
6675            let state = evaluate_cell_moments_with_scratch(cell, 9, &mut scratch)
6676                .expect("scratch moments under repeated pressure");
6677            checksum += state.value + state.moments[0] * 1e-12;
6678        }
6679        assert!(checksum.is_finite());
6680        assert_eq!(
6681            cell_moment_test_reallocs(),
6682            0,
6683            "scratch-backed inner cell-moment calls should not grow Vec buffers"
6684        );
6685    }
6686
6687    #[test]
6688    fn evaluate_cell_moments_matches_numeric_integrals() {
6689        let cell = DenestedCubicCell {
6690            left: -0.9,
6691            right: 0.8,
6692            c0: 0.15,
6693            c1: -0.35,
6694            c2: 0.11,
6695            c3: -0.07,
6696        };
6697        let state = evaluate_cell_moments(cell, 6).expect("cell moments");
6698        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
6699            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
6700        });
6701        assert!((state.value - value_numeric).abs() < 1e-9);
6702        for degree in 0..=6 {
6703            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
6704                z.powi(degree as i32) * (-cell.q(z)).exp()
6705            });
6706            assert!((state.moments[degree] - target).abs() < 1e-9);
6707        }
6708    }
6709
6710    #[test]
6711    fn partition_builder_moves_link_preimages_with_intercept() {
6712        let score_breaks = [-2.0, -1.0, 0.0, 1.0, 2.0];
6713        let link_breaks = [-1.5, -0.5, 0.5, 1.5];
6714        let score_span = |z: f64| {
6715            let left = if z < -1.0 {
6716                -2.0
6717            } else if z < 0.0 {
6718                -1.0
6719            } else if z < 1.0 {
6720                0.0
6721            } else {
6722                1.0
6723            };
6724            Ok(LocalSpanCubic {
6725                left,
6726                right: left + 1.0,
6727                c0: 0.1,
6728                c1: 0.2,
6729                c2: 0.0,
6730                c3: 0.0,
6731            })
6732        };
6733        let link_span = |u: f64| {
6734            let left = if u < -0.5 {
6735                -1.5
6736            } else if u < 0.5 {
6737                -0.5
6738            } else {
6739                0.5
6740            };
6741            Ok(LocalSpanCubic {
6742                left,
6743                right: left + 1.0,
6744                c0: -0.05,
6745                c1: 0.1,
6746                c2: 0.0,
6747                c3: 0.0,
6748            })
6749        };
6750        let cells_a0 = build_denested_partition_cells(
6751            0.25,
6752            0.9,
6753            &score_breaks,
6754            &link_breaks,
6755            score_span,
6756            link_span,
6757        )
6758        .expect("cells a0");
6759        let cells_a1 = build_denested_partition_cells(
6760            0.55,
6761            0.9,
6762            &score_breaks,
6763            &link_breaks,
6764            score_span,
6765            link_span,
6766        )
6767        .expect("cells a1");
6768        assert!(cells_a0.len() >= score_breaks.len() - 1);
6769        assert!(
6770            cells_a0
6771                .windows(2)
6772                .all(|w| (w[0].cell.right - w[1].cell.left).abs() <= 1e-12)
6773        );
6774        assert!(
6775            cells_a0
6776                .iter()
6777                .zip(cells_a1.iter())
6778                .any(|(lhs, rhs)| (lhs.cell.left - rhs.cell.left).abs() > 1e-10)
6779        );
6780        assert!(cells_a0.first().unwrap().cell.left.is_infinite());
6781        assert!(cells_a0.last().unwrap().cell.right.is_infinite());
6782    }
6783
6784    #[test]
6785    fn partition_builder_without_breaks_returns_single_global_cell() {
6786        let cells = build_denested_partition_cells_with_tails(
6787            0.3,
6788            -0.4,
6789            &[],
6790            &[],
6791            |z| {
6792                if z.is_nan() {
6793                    return Err("probe z is NaN".to_string());
6794                }
6795                Ok(LocalSpanCubic {
6796                    left: 0.0,
6797                    right: 1.0,
6798                    c0: 0.0,
6799                    c1: 0.0,
6800                    c2: 0.0,
6801                    c3: 0.0,
6802                })
6803            },
6804            |u| {
6805                if u.is_nan() {
6806                    return Err("probe u is NaN".to_string());
6807                }
6808                Ok(LocalSpanCubic {
6809                    left: 0.0,
6810                    right: 1.0,
6811                    c0: 0.0,
6812                    c1: 0.0,
6813                    c2: 0.0,
6814                    c3: 0.0,
6815                })
6816            },
6817        )
6818        .expect("global cell");
6819        assert_eq!(cells.len(), 1);
6820        assert_eq!(cells[0].cell.left, f64::NEG_INFINITY);
6821        assert_eq!(cells[0].cell.right, f64::INFINITY);
6822        assert!(cells[0].cell.c2.abs() < 1e-12);
6823        assert!(cells[0].cell.c3.abs() < 1e-12);
6824    }
6825
6826    #[test]
6827    fn polynomial_integral_helper_matches_moment_sum() {
6828        let cell = DenestedCubicCell {
6829            left: -1.5,
6830            right: 1.25,
6831            c0: 0.2,
6832            c1: -0.4,
6833            c2: 0.15,
6834            c3: 0.03,
6835        };
6836        let state = evaluate_cell_moments(cell, 8).expect("cell moments");
6837        let coeffs = [1.5, -0.25, 0.75, 0.1];
6838        let expected = INV_TWO_PI
6839            * coeffs
6840                .iter()
6841                .enumerate()
6842                .map(|(idx, coeff)| coeff * state.moments[idx])
6843                .sum::<f64>();
6844        let got = cell_polynomial_integral_from_moments(&coeffs, &state.moments, "test poly")
6845            .expect("poly integral");
6846        assert!((got - expected).abs() < 1e-14);
6847    }
6848
6849    #[test]
6850    fn batched_cell_moment_max_degree_matches_direct_non_affine_grid() {
6851        let cells = [
6852            DenestedCubicCell {
6853                left: -2.0,
6854                right: -0.25,
6855                c0: -0.7,
6856                c1: 0.8,
6857                c2: 0.015,
6858                c3: -0.004,
6859            },
6860            DenestedCubicCell {
6861                left: -0.5,
6862                right: 0.75,
6863                c0: 0.2,
6864                c1: -0.35,
6865                c2: -0.025,
6866                c3: 0.0,
6867            },
6868            DenestedCubicCell {
6869                left: 0.1,
6870                right: 1.6,
6871                c0: 0.4,
6872                c1: 0.25,
6873                c2: 0.01,
6874                c3: 0.006,
6875            },
6876            DenestedCubicCell {
6877                left: -1.25,
6878                right: 2.25,
6879                c0: -0.1,
6880                c1: 0.55,
6881                c2: -0.012,
6882                c3: 0.003,
6883            },
6884        ];
6885        for cell in cells {
6886            let branch = branch_cell(cell).expect("branch");
6887            if branch == ExactCellBranch::Affine {
6888                continue;
6889            }
6890            let batched =
6891                evaluate_non_affine_cell_state(cell, branch, 21).expect("degree-21 state");
6892            for degree in [9usize, 15, 21] {
6893                let direct =
6894                    evaluate_non_affine_cell_state(cell, branch, degree).expect("direct state");
6895                assert_eq!(batched.branch, direct.branch);
6896                let denom = direct.value.abs().max(1.0);
6897                assert!(((batched.value - direct.value).abs() / denom) < 1e-10);
6898                for k in 0..=degree {
6899                    let denom = direct.moments[k].abs().max(1.0);
6900                    let rel = (batched.moments[k] - direct.moments[k]).abs() / denom;
6901                    assert!(
6902                        rel < 1e-10,
6903                        "cell={cell:?} degree={degree} moment={k} rel={rel:e}"
6904                    );
6905                }
6906            }
6907        }
6908    }
6909
6910    #[test]
6911    fn derivative_moment_evaluator_matches_value_evaluator_moments() {
6912        let cells = [
6913            DenestedCubicCell {
6914                left: -2.0,
6915                right: -0.4,
6916                c0: 0.15,
6917                c1: -0.8,
6918                c2: 0.0,
6919                c3: 0.0,
6920            },
6921            DenestedCubicCell {
6922                left: -0.75,
6923                right: 1.4,
6924                c0: -0.25,
6925                c1: 0.6,
6926                c2: 0.12,
6927                c3: 0.0,
6928            },
6929            DenestedCubicCell {
6930                left: -1.1,
6931                right: 0.9,
6932                c0: 0.35,
6933                c1: -0.3,
6934                c2: 0.05,
6935                c3: -0.015,
6936            },
6937        ];
6938        for cell in cells {
6939            for degree in [4usize, 9, 15, 21] {
6940                let full = evaluate_cell_moments_uncached(cell, degree).expect("full moments");
6941                let derivative = evaluate_cell_derivative_moments_uncached(cell, degree)
6942                    .expect("derivative moments");
6943                assert_eq!(full.branch, derivative.branch);
6944                assert_eq!(full.moments.len(), derivative.moments.len());
6945                for k in 0..full.moments.len() {
6946                    assert_eq!(full.moments[k].to_bits(), derivative.moments[k].to_bits());
6947                }
6948            }
6949        }
6950    }
6951
6952    #[test]
6953    fn cell_moment_lru_matches_uncached_non_affine_grid() {
6954        let cache = CellMomentLruCache::new(16 * 1024 * 1024);
6955        let stats = CellMomentCacheStats::default();
6956        let c0s = [-0.75, 0.0, 0.5];
6957        let c1s = [-1.2, 0.25, 1.1];
6958        let c2s = [-0.18, 0.07];
6959        let c3s = [0.0, 0.025];
6960        let bounds = [(-2.0, -0.5), (-0.25, 1.5)];
6961        let degrees = [4usize, 9, 15, 21];
6962        for &c0 in &c0s {
6963            for &c1 in &c1s {
6964                for &c2 in &c2s {
6965                    for &c3 in &c3s {
6966                        for &(left, right) in &bounds {
6967                            for &max_degree in &degrees {
6968                                let cell = DenestedCubicCell {
6969                                    left,
6970                                    right,
6971                                    c0,
6972                                    c1,
6973                                    c2,
6974                                    c3,
6975                                };
6976                                let branch = branch_cell(cell).expect("branch");
6977                                if branch == ExactCellBranch::Affine {
6978                                    continue;
6979                                }
6980                                let expected =
6981                                    evaluate_non_affine_cell_state(cell, branch, max_degree)
6982                                        .expect("uncached non-affine moments");
6983                                let got = evaluate_cell_moments_cached(
6984                                    cell,
6985                                    max_degree,
6986                                    &cache,
6987                                    Some(&stats),
6988                                )
6989                                .expect("cached moments");
6990                                assert_eq!(got.branch, expected.branch);
6991                                assert_eq!(got.moments.len(), max_degree + 1);
6992                                let denom = expected.value.abs().max(1.0);
6993                                assert!(
6994                                    ((got.value - expected.value).abs() / denom) < 1e-10,
6995                                    "value mismatch for {cell:?} degree {max_degree}: got {} expected {}",
6996                                    got.value,
6997                                    expected.value
6998                                );
6999                                for (idx, (&lhs, &rhs)) in
7000                                    got.moments.iter().zip(expected.moments.iter()).enumerate()
7001                                {
7002                                    let denom = rhs.abs().max(1.0);
7003                                    assert!(
7004                                        ((lhs - rhs).abs() / denom) < 1e-10,
7005                                        "moment {idx} mismatch for {cell:?} degree {max_degree}: got {lhs} expected {rhs}"
7006                                    );
7007                                }
7008                                let warm = evaluate_cell_moments_cached(
7009                                    cell,
7010                                    max_degree,
7011                                    &cache,
7012                                    Some(&stats),
7013                                )
7014                                .expect("warm cached moments");
7015                                assert_eq!(warm, got);
7016                            }
7017                        }
7018                    }
7019                }
7020            }
7021        }
7022        let (hits, misses) = stats.snapshot();
7023        assert!(hits > 0, "expected warm LRU hits");
7024        assert!(misses > 0, "expected cold LRU misses");
7025    }
7026
7027    #[test]
7028    fn cell_moment_fingerprint_exact_cache_matches_current_evaluator() {
7029        let cells = [
7030            DenestedCubicCell {
7031                left: -1.75,
7032                right: -0.25,
7033                c0: 0.15,
7034                c1: -0.35,
7035                c2: 0.08,
7036                c3: -0.015,
7037            },
7038            DenestedCubicCell {
7039                left: -0.5,
7040                right: 0.8,
7041                c0: -0.2,
7042                c1: 0.45,
7043                c2: -0.12,
7044                c3: 0.025,
7045            },
7046            DenestedCubicCell {
7047                left: 0.1,
7048                right: 1.6,
7049                c0: 0.05,
7050                c1: 0.2,
7051                c2: 0.03,
7052                c3: 0.004,
7053            },
7054        ];
7055        let mut cache = std::collections::HashMap::new();
7056        for max_degree in [0usize, 3, 4, 9, 16] {
7057            for cell in cells {
7058                let baseline = evaluate_cell_moments(cell, max_degree).expect("baseline moments");
7059                let key = cell_moment_cache_key(cell, max_degree, 0.0);
7060                let cached = cache.entry(key).or_insert_with(|| {
7061                    evaluate_cell_moments(cell, max_degree).expect("cached moments")
7062                });
7063                assert_eq!(baseline.branch, cached.branch);
7064                assert_eq!(baseline.value.to_bits(), cached.value.to_bits());
7065                assert_eq!(baseline.moments.len(), cached.moments.len());
7066                for (lhs, rhs) in baseline.moments.iter().zip(cached.moments.iter()) {
7067                    assert_eq!(lhs.to_bits(), rhs.to_bits());
7068                }
7069            }
7070        }
7071    }
7072
7073    #[test]
7074    fn fuzzy_cell_moment_fingerprint_error_scales_with_epsilon() {
7075        for epsilon in [1e-8, 1e-6] {
7076            let base = DenestedCubicCell {
7077                left: -1.25,
7078                right: 1.1,
7079                c0: 0.1,
7080                c1: -0.25,
7081                c2: 0.04,
7082                c3: -0.006,
7083            };
7084            let perturbed = DenestedCubicCell {
7085                left: base.left + 0.001 * epsilon,
7086                right: base.right - 0.001 * epsilon,
7087                c0: base.c0 + 0.001 * epsilon,
7088                c1: base.c1 - 0.001 * epsilon,
7089                c2: base.c2 + 0.001 * epsilon,
7090                c3: base.c3 - 0.001 * epsilon,
7091            };
7092            assert_eq!(
7093                cell_moment_cache_key(base, 9, epsilon),
7094                cell_moment_cache_key(perturbed, 9, epsilon)
7095            );
7096            let lhs = evaluate_cell_moments(base, 9).expect("base moments");
7097            let rhs = evaluate_cell_moments(perturbed, 9).expect("perturbed moments");
7098            let max_rel = lhs
7099                .moments
7100                .iter()
7101                .zip(rhs.moments.iter())
7102                .map(|(a, b)| (a - b).abs() / a.abs().max(b.abs()).max(1.0))
7103                .fold(0.0_f64, f64::max);
7104            assert!(
7105                max_rel <= 10.0 * epsilon,
7106                "epsilon={epsilon:.1e} max_rel={max_rel:.3e}"
7107            );
7108        }
7109    }
7110
7111    /// Locks in numerical equivalence of the optimized
7112    /// `evaluate_non_affine_cell_state` against an inline reference
7113    /// implementation that mirrors the prior pre-fold structure
7114    /// (separate `cell.eta(z)` / `cell.q(z)` calls; post-loop
7115    /// `* half_width`; trailing `value_integral * half_width / sqrt(TAU)`).
7116    /// Any drift larger than 1e-13 relative would indicate the hot-path
7117    /// rewrite changed the math.
7118    #[test]
7119    fn non_affine_cell_state_matches_prefold_reference_to_1e_minus_13() {
7120        // Reference: byte-for-byte the structure of the previous
7121        // implementation. Kept local to this test to avoid leaking a second
7122        // public surface.
7123        fn reference(
7124            cell: DenestedCubicCell,
7125            branch: ExactCellBranch,
7126            max_degree: usize,
7127        ) -> CellMomentState {
7128            let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
7129            let mut value_integral = 0.0_f64;
7130            let center = 0.5 * (cell.left + cell.right);
7131            let half_width = 0.5 * (cell.right - cell.left);
7132            for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
7133                let z = center + half_width * node;
7134                let eta = cell.eta(z);
7135                let moment_weight = weight * (-cell.q(z)).exp();
7136                let mut z_pow = 1.0_f64;
7137                for moment in &mut moments {
7138                    *moment = moment_weight.mul_add(z_pow, *moment);
7139                    z_pow *= z;
7140                }
7141                value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
7142            }
7143            for moment in &mut moments {
7144                *moment *= half_width;
7145            }
7146            CellMomentState {
7147                branch,
7148                value: value_integral * half_width / (std::f64::consts::TAU).sqrt(),
7149                moments,
7150            }
7151        }
7152
7153        // Hand-rolled inputs that cross both Quartic and Sextic branches and
7154        // exercise positive/negative coefficients, asymmetric intervals, and
7155        // a wide degree range (matches survival_marginal_slope's degree=9
7156        // production call as well as the bernoulli outer-step degree=24).
7157        let cells = [
7158            DenestedCubicCell {
7159                left: -1.25,
7160                right: -0.2,
7161                c0: -0.35,
7162                c1: 0.85,
7163                c2: 0.04,
7164                c3: -0.015,
7165            },
7166            DenestedCubicCell {
7167                left: -0.2,
7168                right: 0.55,
7169                c0: 0.12,
7170                c1: -0.65,
7171                c2: -0.025,
7172                c3: 0.02,
7173            },
7174            DenestedCubicCell {
7175                left: 0.55,
7176                right: 1.6,
7177                c0: 0.42,
7178                c1: 0.35,
7179                c2: 0.018,
7180                c3: 0.012,
7181            },
7182            DenestedCubicCell {
7183                left: -3.0,
7184                right: -1.0,
7185                c0: 1.7,
7186                c1: -0.4,
7187                c2: 0.11,
7188                c3: -0.07,
7189            },
7190        ];
7191        let degrees = [0_usize, 4, 9, 16, 24];
7192        for cell in cells {
7193            let branch = branch_cell(cell).expect("branch");
7194            assert_ne!(branch, ExactCellBranch::Affine);
7195            for max_degree in degrees {
7196                let actual = evaluate_non_affine_cell_state(cell, branch, max_degree)
7197                    .expect("optimized non-affine");
7198                let expected = reference(cell, branch, max_degree);
7199                assert_eq!(actual.branch, expected.branch);
7200                assert_eq!(actual.moments.len(), expected.moments.len());
7201                let denom_v = expected.value.abs().max(1.0);
7202                let rel_v = (actual.value - expected.value).abs() / denom_v;
7203                let actual_v = actual.value;
7204                let expected_v = expected.value;
7205                assert!(
7206                    rel_v <= 1e-13,
7207                    "value rel mismatch for {cell:?} degree {max_degree}: \
7208                     actual={actual_v:.17e} expected={expected_v:.17e} rel={rel_v:.3e}"
7209                );
7210                for (k, (lhs, rhs)) in actual
7211                    .moments
7212                    .iter()
7213                    .zip(expected.moments.iter())
7214                    .enumerate()
7215                {
7216                    let denom = rhs.abs().max(1.0);
7217                    let rel = (lhs - rhs).abs() / denom;
7218                    assert!(
7219                        rel <= 1e-13,
7220                        "moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7221                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7222                    );
7223                }
7224
7225                // Also lock in the derivative-state path on the same
7226                // inputs so the (parallel) edit there can't drift.
7227                let actual_deriv =
7228                    evaluate_non_affine_cell_derivative_state(cell, branch, max_degree)
7229                        .expect("optimized derivative");
7230                for (k, (lhs, rhs)) in actual_deriv
7231                    .moments
7232                    .iter()
7233                    .zip(expected.moments.iter())
7234                    .enumerate()
7235                {
7236                    let denom = rhs.abs().max(1.0);
7237                    let rel = (lhs - rhs).abs() / denom;
7238                    assert!(
7239                        rel <= 1e-13,
7240                        "deriv moment {k} rel mismatch for {cell:?} degree {max_degree}: \
7241                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
7242                    );
7243                }
7244            }
7245        }
7246    }
7247
7248    /// DECISIVE: the third-derivative kernel must equal the FD of the
7249    /// second-derivative kernel w.r.t. a parameter that perturbs `eta`,
7250    /// RE-EVALUATING the moments at each step (the moments depend on `eta`
7251    /// via the `exp(-q)` weight). This isolates the kernel from all survival
7252    /// partition/cross machinery (gam#979 f_uv_dir localization).
7253    #[test]
7254    fn third_derivative_kernel_matches_fd_of_second_with_eta_perturbation() {
7255        // A finite, non-affine cell.
7256        let base = DenestedCubicCell {
7257            left: -0.6,
7258            right: 0.9,
7259            c0: 0.30,
7260            c1: 0.45,
7261            c2: -0.20,
7262            c3: 0.12,
7263        };
7264        // Synthetic parameter directions as cubic-in-z perturbations of eta:
7265        //   eta_u = ∂eta/∂u, eta_v = ∂eta/∂v, eta_t = ∂eta/∂t (the dir).
7266        let eta_u = [0.11_f64, -0.07, 0.05, 0.02];
7267        let eta_v = [-0.09_f64, 0.13, -0.04, 0.03];
7268        let eta_t = [0.17_f64, 0.06, -0.10, 0.04]; // the "b-like" direction
7269        // Second crosses ∂²eta/∂{·}{·} (pick small non-zero cubics).
7270        let eta_uv = [0.02_f64, 0.01, -0.015, 0.005];
7271        let eta_ut = [-0.01_f64, 0.02, 0.007, -0.003];
7272        let eta_vt = [0.015_f64, -0.008, 0.01, 0.004];
7273        // Third cross ∂³eta/∂u∂v∂t.
7274        let eta_uvt = [0.003_f64, -0.002, 0.001, 0.0005];
7275
7276        let neg = |a: &[f64; 4]| a.map(|v| -v);
7277        let max_degree = 15usize;
7278
        // f_uv(s): the second-derivative kernel evaluated with eta shifted by
        // s·eta_t. Only the first-order dependence on s is needed to FD
        // ∂(f_uv)/∂t, so no s² terms are carried. The u,v crosses carry their
        // own s-dependence:
        //   eta_u(s) = eta_u + s·eta_ut, eta_v(s) = eta_v + s·eta_vt,
        //   eta_uv(s) = eta_uv + s·eta_uvt. The cell cubic shifts by s·eta_t.
7285        let f_uv_at = |s: f64| -> f64 {
7286            let cell_s = DenestedCubicCell {
7287                c0: base.c0 + s * eta_t[0],
7288                c1: base.c1 + s * eta_t[1],
7289                c2: base.c2 + s * eta_t[2],
7290                c3: base.c3 + s * eta_t[3],
7291                ..base
7292            };
7293            // Moments MUST be recomputed at the perturbed eta.
7294            let st = evaluate_cell_moments(cell_s, max_degree).unwrap();
7295            let neg_cell = DenestedCubicCell {
7296                c0: -cell_s.c0,
7297                c1: -cell_s.c1,
7298                c2: -cell_s.c2,
7299                c3: -cell_s.c3,
7300                ..cell_s
7301            };
7302            let u_s = [
7303                eta_u[0] + s * eta_ut[0],
7304                eta_u[1] + s * eta_ut[1],
7305                eta_u[2] + s * eta_ut[2],
7306                eta_u[3] + s * eta_ut[3],
7307            ];
7308            let v_s = [
7309                eta_v[0] + s * eta_vt[0],
7310                eta_v[1] + s * eta_vt[1],
7311                eta_v[2] + s * eta_vt[2],
7312                eta_v[3] + s * eta_vt[3],
7313            ];
7314            let uv_s = [
7315                eta_uv[0] + s * eta_uvt[0],
7316                eta_uv[1] + s * eta_uvt[1],
7317                eta_uv[2] + s * eta_uvt[2],
7318                eta_uv[3] + s * eta_uvt[3],
7319            ];
7320            cell_second_derivative_from_moments(
7321                neg_cell,
7322                &neg(&u_s),
7323                &neg(&v_s),
7324                &neg(&uv_s),
7325                &st.moments,
7326            )
7327            .unwrap()
7328        };
7329
7330        let h = 1e-5;
7331        let fd = (f_uv_at(h) - f_uv_at(-h)) / (2.0 * h);
7332
7333        // Analytic third via the kernel (negated cell + negated crosses, as the
7334        // survival path does).
7335        let st0 = evaluate_cell_moments(base, max_degree).unwrap();
7336        let neg_cell0 = DenestedCubicCell {
7337            c0: -base.c0,
7338            c1: -base.c1,
7339            c2: -base.c2,
7340            c3: -base.c3,
7341            ..base
7342        };
7343        let analytic = cell_third_derivative_from_moments(
7344            neg_cell0,
7345            &neg(&eta_u),
7346            &neg(&eta_v),
7347            &neg(&eta_t),
7348            &neg(&eta_uv),
7349            &neg(&eta_ut),
7350            &neg(&eta_vt),
7351            &neg(&eta_uvt),
7352            &st0.moments,
7353        )
7354        .unwrap();
7355
7356        let denom = fd.abs().max(1e-3);
7357        let rel = (analytic - fd).abs() / denom;
7358        assert!(
7359            rel <= 1e-5,
7360            "third kernel vs FD-of-second mismatch: analytic={analytic:.12e} fd={fd:.12e} rel={rel:.3e}"
7361        );
7362    }
7363
7364    #[test]
7365    fn moving_shared_edge_second_integral_derivative_has_leibniz_jump_sign() {
7366        let edge0 = 0.2_f64;
7367        let edge_velocity = -0.37_f64;
7368
7369        let left_eta = [0.22_f64, -0.18, 0.09, 0.03];
7370        let right_eta = [-0.11_f64, 0.26, -0.04, 0.02];
7371        let left_r = [0.08_f64, -0.05, 0.03, 0.01];
7372        let left_s = [-0.06_f64, 0.04, 0.02, -0.015];
7373        let left_rs = [0.025_f64, -0.012, 0.006, 0.004];
7374        let right_r = [-0.03_f64, 0.07, -0.02, 0.012];
7375        let right_s = [0.05_f64, -0.025, 0.018, 0.007];
7376        let right_rs = [-0.018_f64, 0.014, -0.005, 0.003];
7377
7378        let integral_at = |shift: f64| -> f64 {
7379            let edge = edge0 + edge_velocity * shift;
7380            let left = DenestedCubicCell {
7381                left: -0.7,
7382                right: edge,
7383                c0: left_eta[0],
7384                c1: left_eta[1],
7385                c2: left_eta[2],
7386                c3: left_eta[3],
7387            };
7388            let right = DenestedCubicCell {
7389                left: edge,
7390                right: 1.1,
7391                c0: right_eta[0],
7392                c1: right_eta[1],
7393                c2: right_eta[2],
7394                c3: right_eta[3],
7395            };
7396            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7397            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7398            cell_second_derivative_from_moments(
7399                left,
7400                &left_r,
7401                &left_s,
7402                &left_rs,
7403                &left_state.moments,
7404            )
7405            .expect("left second")
7406                + cell_second_derivative_from_moments(
7407                    right,
7408                    &right_r,
7409                    &right_s,
7410                    &right_rs,
7411                    &right_state.moments,
7412                )
7413                .expect("right second")
7414        };
7415
7416        let h = 1e-5;
7417        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7418
7419        let left = DenestedCubicCell {
7420            left: -0.7,
7421            right: edge0,
7422            c0: left_eta[0],
7423            c1: left_eta[1],
7424            c2: left_eta[2],
7425            c3: left_eta[3],
7426        };
7427        let right = DenestedCubicCell {
7428            left: edge0,
7429            right: 1.1,
7430            c0: right_eta[0],
7431            c1: right_eta[1],
7432            c2: right_eta[2],
7433            c3: right_eta[3],
7434        };
7435        let f_left =
7436            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7437        let f_right =
7438            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7439        let analytic = edge_velocity * (f_left - f_right);
7440
7441        let denom = analytic.abs().max(1e-8);
7442        let rel = (fd - analytic).abs() / denom;
7443        assert!(
7444            rel <= 5e-8,
7445            "moving edge sign mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7446        );
7447    }
7448
7449    #[test]
7450    fn moving_shared_edge_second_integral_mixed_derivative_has_full_leibniz_terms() {
7451        let edge0 = -0.15_f64;
7452        let edge_d1 = 0.31_f64;
7453        let edge_d2 = -0.27_f64;
7454        let edge_d12 = 0.19_f64;
7455
7456        let left_eta = [0.16_f64, -0.21, 0.07, -0.025];
7457        let right_eta = [-0.09_f64, 0.18, -0.055, 0.018];
7458        let left_r = [0.075_f64, -0.045, 0.018, 0.009];
7459        let left_s = [-0.052_f64, 0.033, 0.014, -0.011];
7460        let left_rs = [0.021_f64, -0.009, 0.005, 0.0025];
7461        let right_r = [-0.028_f64, 0.063, -0.017, 0.010];
7462        let right_s = [0.047_f64, -0.023, 0.016, 0.006];
7463        let right_rs = [-0.015_f64, 0.012, -0.004, 0.002];
7464
7465        let integral_at = |s1: f64, s2: f64| -> f64 {
7466            let edge = edge0 + edge_d1 * s1 + edge_d2 * s2 + edge_d12 * s1 * s2;
7467            let left = DenestedCubicCell {
7468                left: -0.8,
7469                right: edge,
7470                c0: left_eta[0],
7471                c1: left_eta[1],
7472                c2: left_eta[2],
7473                c3: left_eta[3],
7474            };
7475            let right = DenestedCubicCell {
7476                left: edge,
7477                right: 0.9,
7478                c0: right_eta[0],
7479                c1: right_eta[1],
7480                c2: right_eta[2],
7481                c3: right_eta[3],
7482            };
7483            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7484            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7485            cell_second_derivative_from_moments(
7486                left,
7487                &left_r,
7488                &left_s,
7489                &left_rs,
7490                &left_state.moments,
7491            )
7492            .expect("left second")
7493                + cell_second_derivative_from_moments(
7494                    right,
7495                    &right_r,
7496                    &right_s,
7497                    &right_rs,
7498                    &right_state.moments,
7499                )
7500                .expect("right second")
7501        };
7502
7503        let h = 2e-4;
7504        let fd = (integral_at(h, h) - integral_at(h, -h) - integral_at(-h, h)
7505            + integral_at(-h, -h))
7506            / (4.0 * h * h);
7507
7508        let left = DenestedCubicCell {
7509            left: -0.8,
7510            right: edge0,
7511            c0: left_eta[0],
7512            c1: left_eta[1],
7513            c2: left_eta[2],
7514            c3: left_eta[3],
7515        };
7516        let right = DenestedCubicCell {
7517            left: edge0,
7518            right: 0.9,
7519            c0: right_eta[0],
7520            c1: right_eta[1],
7521            c2: right_eta[2],
7522            c3: right_eta[3],
7523        };
7524
7525        let boundary_z_derivative =
7526            |cell: DenestedCubicCell, r: &[f64], s: &[f64], rs: &[f64]| -> f64 {
7527                let eta = cell.eta(edge0);
7528                let eta_z = cell.c1 + 2.0 * cell.c2 * edge0 + 3.0 * cell.c3 * edge0 * edge0;
7529                let cr = poly_eval_at(r, edge0);
7530                let cs = poly_eval_at(s, edge0);
7531                let crs = poly_eval_at(rs, edge0);
7532                let cr_z = r.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7533                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7534                });
7535                let cs_z = s.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7536                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7537                });
7538                let crs_z = rs.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7539                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7540                });
7541                let amp = crs - eta * cr * cs;
7542                let amp_z = crs_z - eta_z * cr * cs - eta * cr_z * cs - eta * cr * cs_z;
7543                let q_z = edge0 + eta * eta_z;
7544                (amp_z - amp * q_z) * (-cell.q(edge0)).exp() * INV_TWO_PI
7545            };
7546
7547        let f_left =
7548            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7549        let f_right =
7550            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7551        let fz_left = boundary_z_derivative(left, &left_r, &left_s, &left_rs);
7552        let fz_right = boundary_z_derivative(right, &right_r, &right_s, &right_rs);
7553        let analytic = edge_d12 * (f_left - f_right) + edge_d1 * edge_d2 * (fz_left - fz_right);
7554
7555        let denom = analytic.abs().max(1e-8);
7556        let rel = (fd - analytic).abs() / denom;
7557        assert!(
7558            rel <= 2e-7,
7559            "moving edge mixed term mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7560        );
7561    }
7562
7563    // gam#1454 resolution. The reported defect ("survival flex directional
7564    // third[g,w0] wrong: candidate f_au_dir/f_aa_dir missing self-flux") posited
7565    // a MISSING third-order Leibniz self-flux at the moving link-knot crossings.
7566    // This regression establishes the two facts that, together, prove the
7567    // implicit-intercept third-order tower
7568    // (`row_primary_third_contracted_recompute*`) is CORRECT to add no such flux:
7569    //
7570    //   (1) The third-derivative integrand `F_rst` genuinely DOES jump across a
7571    //       C²-link knot — its third coefficient slice carries `c_rst ∝ 6·α₃`,
7572    //       and `α₃` (the spline's third `z`-derivative) is the one piece a C²
7573    //       cubic spline leaves discontinuous. So the jump is real and the
7574    //       `cell_third_derivative_boundary_integrand` flux formula is exact
7575    //       (verified by FD of a direct ∂/∂edge of the third-integral sum —
7576    //       a FOURTH-order scenario that pins the integrand, not the tower).
7577    //
7578    //   (2) Every boundary term in the Leibniz expansion of a THIRD derivative,
7579    //       however, evaluates an integrand of order ≤ 2 at the moving edge
7580    //       (one of the three differentiations is spent moving the boundary).
7581    //       The second-derivative integrand `F_rs` is CONTINUOUS across the same
7582    //       C² knot (its slices reach at most `α₂ + 3α₃·shift`, i.e. ½·η''(u*),
7583    //       which a C² spline keeps continuous). Hence the shared-edge flux
7584    //       `velocity·(F_rs^L − F_rs^R)` telescopes to ZERO, and the tower's
7585    //       third-order self-flux is a genuine no-op. The real residual lives in
7586    //       the interior implicit-intercept assembly, not at the boundary.
7587    #[test]
7588    fn third_order_self_flux_telescopes_but_third_integrand_jumps_at_c2_knot_1454() {
7589        let edge0 = 0.13_f64;
7590        let edge_velocity = -0.41_f64;
7591
7592        // Build η continuous to C² at edge0 but with a jump in the cubic (3rd
7593        // derivative) coefficient. Pick the left cubic freely; choose the right
7594        // cubic to match value+1st+2nd derivative at edge0, then perturb its c3.
7595        let left_eta = [0.18_f64, -0.12, 0.07, 0.04];
7596        let right_c3 = 0.04_f64 + 0.09; // α₃ jump across the knot.
7597        // Match η, η', η'' at edge0 for the right piece given its c3:
7598        //   η(z)  = c0 + c1 z + c2 z² + c3 z³
7599        //   η'(z) = c1 + 2 c2 z + 3 c3 z²
7600        //   η''(z)= 2 c2 + 6 c3 z
7601        // Solve right (c0,c1,c2) so the three values equal the left ones at edge0.
7602        let l0 = left_eta[0];
7603        let l1 = left_eta[1];
7604        let l2 = left_eta[2];
7605        let l3 = left_eta[3];
7606        let e = edge0;
7607        let eta_val = l0 + l1 * e + l2 * e * e + l3 * e * e * e;
7608        let eta_d1 = l1 + 2.0 * l2 * e + 3.0 * l3 * e * e;
7609        let eta_d2 = 2.0 * l2 + 6.0 * l3 * e;
7610        let rc2 = (eta_d2 - 6.0 * right_c3 * e) / 2.0;
7611        let rc1 = eta_d1 - 2.0 * rc2 * e - 3.0 * right_c3 * e * e;
7612        let rc0 = eta_val - rc1 * e - rc2 * e * e - right_c3 * e * e * e;
7613        let right_eta = [rc0, rc1, rc2, right_c3];
7614
7615        // Coefficient slices. The first/second slices we keep continuous at the
7616        // edge (mimicking c_r=1+η', c_rs∝η'' which a C² spline matches), so the
7617        // 2nd-order flux would cancel. The third-order slice `rst` carries the
7618        // jumping α₃ and is DIFFERENT across the edge — this is the term that
7619        // breaks cancellation.
7620        let common_r = [0.06_f64, -0.04, 0.02, 0.0];
7621        let common_s = [-0.05_f64, 0.03, 0.015, 0.0];
7622        let common_t = [0.08_f64, 0.05, -0.03, 0.0];
7623        let common_rs = [0.02_f64, -0.01, 0.005, 0.0];
7624        let common_rt = [-0.012_f64, 0.008, 0.004, 0.0];
7625        let common_st = [0.015_f64, -0.006, 0.003, 0.0];
7626        // rst ∝ 6·α₃ in the real path: left and right differ by the α₃ jump.
7627        let left_rst = [6.0 * l3, 0.0, 0.0, 0.0];
7628        let right_rst = [6.0 * right_c3, 0.0, 0.0, 0.0];
7629
7630        let max_degree = 15usize;
7631        let neg = |a: &[f64; 4]| a.map(|v| -v);
7632
7633        // The integral sum over the two cells sharing the moving edge, computed
7634        // via the fixed-domain moment reduction with the SURVIVAL/probit sign
7635        // convention (negated cell + negated coefficient slices), exactly as the
7636        // production `row_primary_third_contracted_recompute` path does.
7637        let integral_at = |shift: f64| -> f64 {
7638            let edge = edge0 + edge_velocity * shift;
7639            let left = DenestedCubicCell {
7640                left: -0.7,
7641                right: edge,
7642                c0: left_eta[0],
7643                c1: left_eta[1],
7644                c2: left_eta[2],
7645                c3: left_eta[3],
7646            };
7647            let right = DenestedCubicCell {
7648                left: edge,
7649                right: 1.0,
7650                c0: right_eta[0],
7651                c1: right_eta[1],
7652                c2: right_eta[2],
7653                c3: right_eta[3],
7654            };
7655            let lst = evaluate_cell_moments(left, max_degree).unwrap();
7656            let rst_m = evaluate_cell_moments(right, max_degree).unwrap();
7657            let neg_left = DenestedCubicCell {
7658                c0: -left.c0,
7659                c1: -left.c1,
7660                c2: -left.c2,
7661                c3: -left.c3,
7662                ..left
7663            };
7664            let neg_right = DenestedCubicCell {
7665                c0: -right.c0,
7666                c1: -right.c1,
7667                c2: -right.c2,
7668                c3: -right.c3,
7669                ..right
7670            };
7671            let li = cell_third_derivative_from_moments(
7672                neg_left,
7673                &neg(&common_r),
7674                &neg(&common_s),
7675                &neg(&common_t),
7676                &neg(&common_rs),
7677                &neg(&common_rt),
7678                &neg(&common_st),
7679                &neg(&left_rst),
7680                &lst.moments,
7681            )
7682            .unwrap();
7683            let ri = cell_third_derivative_from_moments(
7684                neg_right,
7685                &neg(&common_r),
7686                &neg(&common_s),
7687                &neg(&common_t),
7688                &neg(&common_rs),
7689                &neg(&common_rt),
7690                &neg(&common_st),
7691                &neg(&right_rst),
7692                &rst_m.moments,
7693            )
7694            .unwrap();
7695            li + ri
7696        };
7697
7698        let h = 1e-5;
7699        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7700
7701        // Fixed-domain part: differentiate ONLY the integrands (domain frozen at
7702        // edge0). We approximate it with the same moment reduction but treating
7703        // the edge as fixed — i.e. its directional derivative is captured by the
7704        // analytic Leibniz flux alone, since the integrand coefficients here are
7705        // edge-independent. So the analytic prediction is pure flux:
7706        //   flux = velocity · ( F_rst^L(edge0) − F_rst^R(edge0) ),
7707        // using the UN-negated cells/coeffs (the boundary integrand convention).
7708        let left0 = DenestedCubicCell {
7709            left: -0.7,
7710            right: edge0,
7711            c0: left_eta[0],
7712            c1: left_eta[1],
7713            c2: left_eta[2],
7714            c3: left_eta[3],
7715        };
7716        let right0 = DenestedCubicCell {
7717            left: edge0,
7718            right: 1.0,
7719            c0: right_eta[0],
7720            c1: right_eta[1],
7721            c2: right_eta[2],
7722            c3: right_eta[3],
7723        };
7724        let f_left = cell_third_derivative_boundary_integrand(
7725            left0, &common_r, &common_s, &common_t, &common_rs, &common_rt, &common_st, &left_rst,
7726            edge0,
7727        );
7728        let f_right = cell_third_derivative_boundary_integrand(
7729            right0, &common_r, &common_s, &common_t, &common_rs, &common_rt, &common_st,
7730            &right_rst, edge0,
7731        );
7732
7733        // The integrand DOES jump across this C² knot (the α₃ third-coefficient
7734        // term is the only discontinuous piece). Confirm the jump is genuine —
7735        // if it were zero the flux would be a no-op and #1454 would not exist.
7736        let jump = f_left - f_right;
7737        assert!(
7738            jump.abs() > 1e-4,
7739            "third-derivative integrand must jump across the C² knot (α₃ discontinuity); \
7740             got jump={jump:.3e}"
7741        );
7742
7743        let analytic_flux = edge_velocity * jump;
7744        let denom = fd.abs().max(1e-6);
7745        let rel = (fd - analytic_flux).abs() / denom;
7746        assert!(
7747            rel <= 1e-5,
7748            "moving-edge third-derivative flux mismatch (#1454): fd={fd:.12e} \
7749             analytic_flux={analytic_flux:.12e} rel={rel:.3e}"
7750        );
7751
7752        // ---- Fact (2): the SECOND-derivative integrand telescopes to zero. ----
7753        // A 3rd-derivative Leibniz boundary term spends one differentiation on
7754        // the moving edge and evaluates a ≤2nd-order integrand there. The
7755        // hardest such term is the slope-slope Hessian integrand `F_bb`, whose
7756        // coefficient slice is the link cubic's b-b partial
7757        //   dc_dbb(z) = [0, 0, 2(α₂ + 3 α₃·shift), 6 α₃·b]·(z⁰..z³)
7758        //             = z²·η''(u),  with u = a + b·z, shift = a − knot.
7759        // Across a C² knot α₂, α₃, and `shift` all jump, yet η''(u*) is
7760        // continuous — so the EVALUATED slice `c_bb(z*) = z*²·η''(u*)` matches on
7761        // both sides and `F_bb` is continuous. Build the two pieces' raw dc_dbb
7762        // decompositions from `link_cubic_second_partials` and confirm the
7763        // second-derivative integrand carries no jump (flux telescopes to 0).
7764        let a_row = 0.21_f64;
7765        let b_row = 1.37_f64;
7766        let knot = a_row + b_row * edge0; // u-location of the crossing.
7767        // Left/right link pieces: choose α₂,α₃ freely on the left; pick the
7768        // right piece's α₂ so η''(knot) is continuous given a jumped α₃.
7769        let left_link = LocalSpanCubic {
7770            left: knot - 0.6,
7771            right: knot + 0.6,
7772            c0: 0.0,
7773            c1: 0.0,
7774            c2: 0.08,
7775            c3: -0.05,
7776        };
7777        let right_alpha3 = -0.05_f64 + 0.11; // α₃ jump.
7778        // η''(knot) continuity:  2α₂ᴸ + 6α₃ᴸ·(knot−leftᴸ) = 2α₂ᴿ + 6α₃ᴿ·(knot−leftᴿ).
7779        let right_left_coord = knot - 0.4;
7780        let lhs = 2.0 * left_link.c2 + 6.0 * left_link.c3 * (knot - left_link.left);
7781        let right_alpha2 = (lhs - 6.0 * right_alpha3 * (knot - right_left_coord)) / 2.0;
7782        let right_link = LocalSpanCubic {
7783            left: right_left_coord,
7784            right: right_left_coord + 0.8,
7785            c0: 0.0,
7786            c1: 0.0,
7787            c2: right_alpha2,
7788            c3: right_alpha3,
7789        };
7790        let (_, _, dc_dbb_left) = link_cubic_second_partials(left_link, a_row, b_row);
7791        let (_, _, dc_dbb_right) = link_cubic_second_partials(right_link, a_row, b_row);
7792        // The per-coefficient arrays differ (α₃ jumped)...
7793        assert!(
7794            (dc_dbb_left[3] - dc_dbb_right[3]).abs() > 1e-3,
7795            "α₃ jump must make the raw dc_dbb coefficient arrays differ"
7796        );
7797        // ...but the EVALUATED second-order slice at the crossing matches, so the
7798        // F_bb boundary integrand carries no jump and the flux telescopes to 0.
7799        let c_bb_left = poly_eval_at(&dc_dbb_left, edge0);
7800        let c_bb_right = poly_eval_at(&dc_dbb_right, edge0);
7801        assert!(
7802            (c_bb_left - c_bb_right).abs() <= 1e-12,
7803            "second-derivative slope-slope integrand must be CONTINUOUS across the \
7804             C² knot (telescoping self-flux): left={c_bb_left:.15e} right={c_bb_right:.15e}"
7805        );
7806    }
7807}
gam_model_kernels/cubic_cell_kernel.rs

gam_model_kernels/
cubic_cell_kernel.rs