gam_model_kernels/
cubic_cell_kernel.rs

1use gam_math::probability::normal_cdf;
2use gam_runtime::resource::{ByteLruCache, ResidentBytes};
3use smallvec::{SmallVec, smallvec};
4use std::hash::{Hash, Hasher};
5use std::sync::Arc;
6use std::sync::atomic::{AtomicU64, Ordering};
7
/// Error type for the de-nested cubic transport kernel.
///
/// The sibling families (`bernoulli_marginal_slope`, `survival_marginal_slope`,
/// `marginal_slope_shared`) still call into this kernel through
/// `Result<_, String>` signatures. To remain source-compatible, the public
/// functions keep returning `Result<_, String>` and values of this type are
/// turned into a `String` at that boundary through
/// `From<CubicCellKernelError> for String`. The `Display` output is
/// byte-for-byte the text the earlier `format!(...)` calls produced.
#[derive(Debug, Clone)]
pub enum CubicCellKernelError {
    /// An interval-probe or cell-bounds precondition was violated: bounds out
    /// of order, an unsupported pattern of infinite bounds, or a width that is
    /// not positive and finite.
    InvalidInterval { reason: String },
    /// The cell shape or its branch classification is unusable: a tail cell
    /// that is not affine, a finite cell whose width is not positive,
    /// non-finite affine coefficients, a non-affine cell with infinite bounds,
    /// a degenerate leading coefficient in the moment recurrence, and similar.
    InvalidCellShape { reason: String },
    /// The reduced moment vector (or the polynomial-convolution scratch space)
    /// has fewer entries than the polynomial degree the leaf has to evaluate.
    InsufficientMoments { reason: String },
    /// Domain check of the bivariate-normal CDF failed: an argument that is
    /// neither finite nor infinite, or a correlation that is not finite.
    BivariateNormalDomain { reason: String },
}
33
34impl_reason_error_boilerplate! {
35    CubicCellKernelError {
36        InvalidInterval,
37        InvalidCellShape,
38        InsufficientMoments,
39        BivariateNormalDomain,
40    }
41}
42
43impl CubicCellKernelError {
44    #[inline]
45    fn invalid_interval(reason: impl Into<String>) -> Self {
46        CubicCellKernelError::InvalidInterval {
47            reason: reason.into(),
48        }
49    }
50    #[inline]
51    fn invalid_cell_shape(reason: impl Into<String>) -> Self {
52        CubicCellKernelError::InvalidCellShape {
53            reason: reason.into(),
54        }
55    }
56    #[inline]
57    fn insufficient_moments(reason: impl Into<String>) -> Self {
58        CubicCellKernelError::InsufficientMoments {
59            reason: reason.into(),
60        }
61    }
62    #[inline]
63    fn bivariate_normal_domain(reason: impl Into<String>) -> Self {
64        CubicCellKernelError::BivariateNormalDomain {
65            reason: reason.into(),
66        }
67    }
68}
69
70// De-nested cubic transport kernel.
71//
72// This module implements the de-nested flexible-link/score-warp model
73//
74//   eta(z) = a + b*z + b*delta_h(z) + delta_w(a + b*z)
75//
76// where delta_h is the score warp and delta_w is the link deviation.
77// This is not the literal nested composition L(a + b*H(z)); it is an
78// additive-correction model around the affine core a + b*z.
79//
// On each partition cell, both deviations are cubic polynomials, so eta is
// at most cubic in z and q(z) = 0.5*(z^2 + eta^2) is at most sextic (degree 6).
82// The integral of exp(-q(z)) is evaluated by transporting from the affine
83// anchor (c2=c3=0, where q is Gaussian and the integral reduces to BVN)
84// to the target non-affine cell via the polynomial moment recurrence.
85//
86// The partition covers (-∞, +∞) with:
87//   • two semi-infinite affine TAIL cells (outside all deviation support),
88//   • finitely many interior cells (each a sextic microcell).
89// Because tail cells have constant deviations (c2=c3=0), their bounds
90// are parameter-independent, so no Leibniz boundary-motion corrections
91// appear in the derivatives.
92//
93// Shared by bernoulli_marginal_slope and survival_marginal_slope families.
94
/// A single cubic polynomial piece attached to the span `left..right`.
///
/// The coefficients are expressed in the local variable `t = x - left`, i.e.
/// the piece is `c0 + c1*t + c2*t^2 + c3*t^3`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LocalSpanCubic {
    /// Expansion point of the polynomial (every method measures `x` from here).
    pub left: f64,
    /// Stored alongside `left`; not read by any method in this block
    /// (presumably the right end of the span — confirm against callers).
    pub right: f64,
    /// Constant coefficient: the value at `x == left`.
    pub c0: f64,
    /// Coefficient of `t`.
    pub c1: f64,
    /// Coefficient of `t^2`.
    pub c2: f64,
    /// Coefficient of `t^3`.
    pub c3: f64,
}

impl LocalSpanCubic {
    /// Value of the cubic at `x`.
    ///
    /// No range check is made against `left`/`right`; the polynomial is
    /// simply evaluated at `t = x - left`.
    #[inline]
    pub fn evaluate(self, x: f64) -> f64 {
        let Self { left, c0, c1, c2, c3, .. } = self;
        let t = x - left;
        // Power-basis terms, summed left to right (not Horner) so rounding
        // matches the historical expression exactly.
        let linear = c1 * t;
        let quadratic = c2 * t * t;
        let cubic = c3 * t * t * t;
        c0 + linear + quadratic + cubic
    }

    /// First derivative with respect to `x`, evaluated at `x`.
    #[inline]
    pub fn first_derivative(self, x: f64) -> f64 {
        let Self { left, c1, c2, c3, .. } = self;
        let t = x - left;
        let from_quadratic = 2.0 * c2 * t;
        let from_cubic = 3.0 * c3 * t * t;
        c1 + from_quadratic + from_cubic
    }

    /// Second derivative with respect to `x`, evaluated at `x`.
    #[inline]
    pub fn second_derivative(self, x: f64) -> f64 {
        let Self { left, c2, c3, .. } = self;
        let t = x - left;
        let from_quadratic = 2.0 * c2;
        let from_cubic = 6.0 * c3 * t;
        from_quadratic + from_cubic
    }
}
124
/// Identifier string of this kernel implementation.
pub const ANCHORED_DEVIATION_KERNEL: &str = "DenestedCubicTransport";

/// Default cutoff on the normalized non-affine coefficients, consumed by
/// [`branch_cell`].
///
/// The cutoff is deliberately a named, tunable constant: the large-scale
/// cycle-0 sweep tried `{1e-12, 1e-10, 1e-8, 1e-6, 1e-4, 1e-3}` against the
/// legacy transport path.  Promoting one of the more aggressive candidates
/// needs an end-to-end beta acceptance run first, so the default stays at the
/// legacy `1e-10` to keep model behavior bit-for-bit unchanged.
pub const NORMALIZED_CELL_BRANCH_TOL: f64 = 1e-10;

/// `1 / (2π)`.
const INV_TWO_PI: f64 = 1.0_f64 / std::f64::consts::TAU;
136
137/// 384-point Gauss–Legendre nodes, re-exported for the GPU cubic-cell kernel
138/// (`src/gpu/cubic_cell/kernel_src.rs`) to embed as `__constant__` device
139/// memory. Linux-only because the kernel emitter is Linux-only.
140#[cfg(target_os = "linux")]
141pub const GL_NODES_FOR_GPU_KERNEL: &[f64; 384] = &GL_NODES;
142/// Companion weights to [`GL_NODES_FOR_GPU_KERNEL`].
143#[cfg(target_os = "linux")]
144pub const GL_WEIGHTS_FOR_GPU_KERNEL: &[f64; 384] = &GL_WEIGHTS;
145
/// Negative half of the 384-point Gauss–Legendre node set, in ascending
/// order (most negative first). The positive half is the exact mirror image,
/// so only these 192 values are tabulated.
const GL_NODES_NEGATIVE_HALF: [f64; 192] = [
    -9.999_804_411_726_474e-1,
    -9.998_969_471_378_596e-1,
    -9.997_467_408_113_523e-1,
    -9.995_297_988_558_859e-1,
    -9.992_461_316_671_845e-1,
    -9.988_957_572_063_257e-1,
    -9.984_786_985_384_589e-1,
    -9.979_949_833_727_938e-1,
    -9.974_446_439_389_107e-1,
    -9.968_277_169_440_913e-1,
    -9.961_442_435_551_087e-1,
    -9.953_942_693_885_953e-1,
    -9.945_778_445_047_068e-1,
    -9.936_950_234_020_883e-1,
    -9.927_458_650_133_153e-1,
    -9.917_304_327_004_32e-1,
    -9.906_487_942_504_061e-1,
    -9.895_010_218_704_087e-1,
    -9.882_871_921_828_699e-1,
    -9.870_073_862_202_815e-1,
    -9.856_616_894_197_333e-1,
    -9.842_501_916_171_713e-1,
    -9.827_729_870_413_743e-1,
    -9.812_301_743_076_443e-1,
    -9.796_218_564_112_101e-1,
    -9.779_481_407_203_411e-1,
    -9.762_091_389_691_724e-1,
    -9.744_049_672_502_397e-1,
    -9.725_357_460_067_257e-1,
    -9.706_016_000_244_151e-1,
    -9.686_026_584_233_628e-1,
    -9.665_390_546_492_71e-1,
    -9.644_109_264_645_802e-1,
    -9.622_184_159_392_698e-1,
    -9.599_616_694_413_742e-1,
    -9.576_408_376_272_095e-1,
    -9.552_560_754_313_16e-1,
    -9.528_075_420_561_144e-1,
    -9.502_954_009_612_771e-1,
    -9.477_198_198_528_157e-1,
    -9.450_809_706_718_851e-1,
    -9.423_790_295_833_044e-1,
    -9.396_141_769_637_963e-1,
    -9.367_865_973_899_459e-1,
    -9.338_964_796_258_775e-1,
    -9.309_440_166_106_54e-1,
    -9.279_294_054_453_956e-1,
    -9.248_528_473_801_222e-1,
    -9.217_145_478_003_181e-1,
    -9.185_147_162_132_208e-1,
    -9.152_535_662_338_34e-1,
    -9.119_313_155_706_682e-1,
    -9.085_481_860_112_055e-1,
    -9.051_044_034_070_944e-1,
    -9.016_001_976_590_722e-1,
    -8.980_358_027_016_164e-1,
    -8.944_114_564_873_288e-1,
    -8.907_274_009_710_492e-1,
    -8.869_838_820_937_034e-1,
    -8.831_811_497_658_847e-1,
    -8.793_194_578_511_7e-1,
    -8.753_990_641_491_725e-1,
    -8.714_202_303_783_312e-1,
    -8.673_832_221_584_393e-1,
    -8.632_883_089_929_12e-1,
    -8.591_357_642_507_945e-1,
    -8.549_258_651_485_127e-1,
    -8.506_588_927_313_666e-1,
    -8.463_351_318_547_683e-1,
    -8.419_548_711_652_254e-1,
    -8.375_184_030_810_715e-1,
    -8.330_260_237_729_452e-1,
    -8.284_780_331_440_178e-1,
    -8.238_747_348_099_726e-1,
    -8.192_164_360_787_36e-1,
    -8.145_034_479_299_62e-1,
    -8.097_360_849_942_72e-1,
    -8.049_146_655_322_506e-1,
    -8.000_395_114_131_988e-1,
    -7.951_109_480_936_471e-1,
    -7.901_293_045_956_28e-1,
    -7.850_949_134_847_117e-1,
    -7.800_081_108_478_04e-1,
    -7.748_692_362_707_1e-1,
    -7.696_786_328_154_644e-1,
    -7.644_366_469_974_285e-1,
    -7.591_436_287_621_58e-1,
    -7.537_999_314_620_412e-1,
    -7.484_059_118_327_094e-1,
    -7.429_619_299_692_227e-1,
    -7.374_683_493_020_299e-1,
    -7.319_255_365_727_068e-1,
    -7.263_338_618_094_733e-1,
    -7.206_936_983_024_912e-1,
    -7.150_054_225_789_432e-1,
    -7.092_694_143_778_975e-1,
    -7.034_860_566_249_567e-1,
    -6.976_557_354_066_943e-1,
    -6.917_788_399_448_808e-1,
    -6.858_557_625_704_99e-1,
    -6.798_868_986_975_534e-1,
    -6.738_726_467_966_731e-1,
    -6.678_134_083_685_102e-1,
    -6.617_095_879_169_366e-1,
    -6.555_615_929_220_4e-1,
    -6.493_698_338_129_212e-1,
    -6.431_347_239_402_948e-1,
    -6.368_566_795_488_945e-1,
    -6.305_361_197_496_849e-1,
    -6.241_734_664_918_837e-1,
    -6.177_691_445_347_913e-1,
    -6.113_235_814_194_364e-1,
    -6.048_372_074_400_329e-1,
    -5.983_104_556_152_549e-1,
    -5.917_437_616_593_286e-1,
    -5.851_375_639_529_456e-1,
    -5.784_923_035_139_965e-1,
    -5.718_084_239_681_3e-1,
    -5.650_863_715_191_369e-1,
    -5.583_265_949_191_623e-1,
    -5.515_295_454_387_482e-1,
    -5.446_956_768_367_068e-1,
    -5.378_254_453_298_289e-1,
    -5.309_193_095_624_275e-1,
    -5.239_777_305_757_194e-1,
    -5.170_011_717_770_473e-1,
    -5.099_900_989_089_429e-1,
    -5.029_449_800_180_356e-1,
    -4.958_662_854_238_058_4e-1,
    -4.887_544_876_871_878e-1,
    -4.816_100_615_790_221e-1,
    -4.744_334_840_483_605_5e-1,
    -4.672_252_341_906_264e-1,
    -4.599_857_932_156_304e-1,
    -4.527_156_444_154_463_7e-1,
    -4.454_152_731_321_473_5e-1,
    -4.380_851_667_254_05e-1,
    -4.307_258_145_399_544_5e-1,
    -4.233_377_078_729_265e-1,
    -4.159_213_399_410_494e-1,
    -4.084_772_058_477_228e-1,
    -4.010_058_025_499_653e-1,
    -3.935_076_288_252_386e-1,
    -3.859_831_852_381_500_6e-1,
    -3.784_329_741_070_358_6e-1,
    -3.708_574_994_704_271e-1,
    -3.632_572_670_534_011e-1,
    -3.556_327_842_338_202e-1,
    -3.479_845_600_084_600_6e-1,
    -3.403_131_049_590_297e-1,
    -3.326_189_312_180_866e-1,
    -3.249_025_524_348_469_5e-1,
    -3.171_644_837_408_958_4e-1,
    -3.094_052_417_157_978e-1,
    -3.016_253_443_526_109e-1,
    -2.938_253_110_233_064_5e-1,
    -2.860_056_624_440_967_5e-1,
    -2.781_669_206_406_729e-1,
    -2.703_096_089_133_553e-1,
    -2.624_342_518_021_592_4e-1,
    -2.545_413_750_517_773e-1,
    -2.466_315_055_764_817_5e-1,
    -2.387_051_714_249_486_3e-1,
    -2.307_629_017_450_062e-1,
    -2.228_052_267_483_099_4e-1,
    -2.148_326_776_749_466_5e-1,
    -2.068_457_867_579_697_5e-1,
    -1.988_450_871_878_683_4e-1,
    -1.908_311_130_769_724_5e-1,
    -1.828_043_994_237_965_6e-1,
    -1.747_654_820_773_241_2e-1,
    -1.667_148_977_012_352_4e-1,
    -1.586_531_837_380_799_3e-1,
    -1.505_808_783_733_995e-1,
    -1.424_985_204_997_981_4e-1,
    -1.344_066_496_809_674_7e-1,
    -1.263_058_061_156_663e-1,
    -1.181_965_306_016_578_4e-1,
    -1.100_793_644_996_070_4e-1,
    -1.019_548_496_969_403_7e-1,
    -9.382_352_857_167_028e-2,
    -8.568_594_395_618_719e-2,
    -7.754_263_910_102_077e-2,
    -6.939_415_763_857_37e-2,
    -6.124_104_354_682_962e-2,
    -5.308_384_111_303_817_6e-2,
    -4.492_309_489_737_94e-2,
    -3.675_934_969_660_982e-2,
    -2.859_315_050_769_284_7e-2,
    -2.042_504_249_141_571e-2,
    -1.225_557_093_599_553_8e-2,
    -4.085_281_220_676_868e-3,
];

/// 384-point Gauss–Legendre nodes on `[-1, 1]`, in ascending order.
///
/// Assembled at compile time from [`GL_NODES_NEGATIVE_HALF`]: entry `i` is
/// copied as-is and entry `383 - i` is its exact negation, which yields the
/// same 384 values as a fully written-out table.
const GL_NODES: [f64; 384] = {
    let mut nodes = [0.0_f64; 384];
    let mut i = 0;
    while i < 192 {
        let x = GL_NODES_NEGATIVE_HALF[i];
        nodes[i] = x;
        nodes[383 - i] = -x;
        i += 1;
    }
    nodes
};
/// First half of the 384-point Gauss–Legendre weight set, ordered to match
/// the ascending node table. The second half is the exact mirror image, so
/// only these 192 values are tabulated.
const GL_WEIGHTS_FIRST_HALF: [f64; 192] = [
    5.019_410_348_676_869_6e-5,
    1.168_390_665_730_266_3e-4,
    1.835_749_193_551_655_8e-4,
    2.503_070_890_844_105e-4,
    3.170_242_698_112_815e-4,
    3.837_208_020_912_921_4e-4,
    4.503_919_137_716_827e-4,
    5.170_330_453_491_649e-4,
    5.836_397_042_630_135e-4,
    6.502_074_240_969_948e-4,
    7.167_317_509_947_801e-4,
    7.832_082_385_905_168e-4,
    8.496_324_460_039_209e-4,
    9.159_999_370_632_641e-4,
    9.823_062_800_663_463e-4,
    1.048_547_047_793_689_5e-3,
    1.114_717_817_647_310_6e-3,
    1.180_814_171_855_922e-3,
    1.246_831_697_715_441_5e-3,
    1.312_765_987_850_66e-3,
    1.378_612_640_487_646_8e-3,
    1.444_367_259_734_736e-3,
    1.510_025_455_865_810_3e-3,
    1.575_582_845_607_936_8e-3,
    1.641_035_052_429_271_5e-3,
    1.706_377_706_828_447_1e-3,
    1.771_606_446_623_834_7e-3,
    1.836_716_917_243_567_5e-3,
    1.901_704_772_014_899_2e-3,
    1.966_565_672_453_437e-3,
    2.031_295_288_552_398_4e-3,
    2.095_889_299_071_020_6e-3,
    2.160_343_391_822_734_3e-3,
    2.224_653_263_962_713e-3,
    2.288_814_622_274_955e-3,
    2.352_823_183_458_769e-3,
    2.416_674_674_414_340_5e-3,
    2.480_364_832_528_265_6e-3,
    2.543_889_405_957_74e-3,
    2.607_244_153_914_452e-3,
    2.670_424_846_947_554e-3,
    2.733_427_267_226_093_3e-3,
    2.796_247_208_820_428e-3,
    2.858_880_477_983_06e-3,
    2.921_322_893_428_515_3e-3,
    2.983_570_286_612_554_5e-3,
    3.045_618_502_010_327_8e-3,
    3.107_463_397_393_755_5e-3,
    3.169_100_844_108_32e-3,
    3.230_526_727_348_174e-3,
    3.291_736_946_431_361e-3,
    3.352_727_415_073_250_3e-3,
    3.413_494_061_659_418_4e-3,
    3.474_032_829_517_317e-3,
    3.534_339_677_187_348_4e-3,
    3.594_410_578_692_452e-3,
    3.654_241_523_806_987e-3,
    3.713_828_518_324_312_5e-3,
    3.773_167_584_323_583_5e-3,
    3.832_254_760_435_171e-3,
    3.891_086_102_105_193_4e-3,
    3.949_657_681_858_895e-3,
    4.007_965_589_562_678e-3,
    4.066_005_932_685_269e-3,
    4.123_774_836_557_6e-3,
    4.181_268_444_631_281e-3,
    4.238_482_918_736_289e-3,
    4.295_414_439_336_925e-3,
    4.352_059_205_787_275e-3,
    4.408_413_436_584_285e-3,
    4.464_473_369_620_78e-3,
    4.520_235_262_436_235e-3,
    4.575_695_392_466_791e-3,
    4.630_850_057_293_894e-3,
    4.685_695_574_891_041e-3,
    4.740_228_283_870_022e-3,
    4.794_444_543_725_102e-3,
    4.848_340_735_076_109e-3,
    4.901_913_259_910_197e-3,
    4.955_158_541_821_682_4e-3,
    5.008_073_026_251_332e-3,
    5.060_653_180_723_101_4e-3,
    5.112_895_495_080_397e-3,
    5.164_796_481_720_011e-3,
    5.216_352_675_825_451e-3,
    5.267_560_635_597_735e-3,
    5.318_416_942_485_385e-3,
    5.368_918_201_412_827e-3,
    5.419_061_041_006_627e-3,
    5.468_842_113_820_941e-3,
    5.518_258_096_560_71e-3,
    5.567_305_690_303_767e-3,
    5.615_981_620_720_803e-3,
    5.664_282_638_294_182e-3,
    5.712_205_518_534_655e-3,
    5.759_747_062_196_925_5e-3,
    5.806_904_095_492_818e-3,
    5.853_673_470_303_617_4e-3,
    5.900_052_064_389_824e-3,
    5.946_036_781_599_814e-3,
    5.991_624_552_076_468e-3,
    6.036_812_332_462_087e-3,
    6.081_597_106_101_673e-3,
    6.125_975_883_244_196e-3,
    6.169_945_701_242_237e-3,
    6.213_503_624_749_591e-3,
    6.256_646_745_917_723e-3,
    6.299_372_184_589_237e-3,
    6.341_677_088_490_664e-3,
    6.383_558_633_422_572e-3,
    6.425_014_023_448_273e-3,
    6.466_040_491_080_434e-3,
    6.506_635_297_465_724e-3,
    6.546_795_732_567_842_5e-3,
    6.586_519_115_348_261e-3,
    6.625_802_793_945_317e-3,
    6.664_644_145_851_14e-3,
    6.703_040_578_086_941e-3,
    6.740_989_527_375_895e-3,
    6.778_488_460_314_126e-3,
    6.815_534_873_540_5e-3,
    6.852_126_293_902_878e-3,
    6.888_260_278_623_754e-3,
    6.923_934_415_463_31e-3,
    6.959_146_322_880_146_5e-3,
    6.993_893_650_190_702e-3,
    7.028_174_077_725_734e-3,
    7.061_985_316_985_506e-3,
    7.095_325_110_792_439e-3,
    7.128_191_233_441_844e-3,
    7.160_581_490_850_321e-3,
    7.192_493_720_702_486e-3,
    7.223_925_792_595_309e-3,
    7.254_875_608_179_984e-3,
    7.285_341_101_302_512e-3,
    7.315_320_238_141_324_5e-3,
    7.344_811_017_343_063e-3,
    7.373_811_470_156_258e-3,
    7.402_319_660_562_818e-3,
    7.430_333_685_407_178e-3,
    7.457_851_674_523_319e-3,
    7.484_871_790_859_79e-3,
    7.511_392_230_602_079e-3,
    7.537_411_223_293_362e-3,
    7.562_927_031_952_382e-3,
    7.587_937_953_189_561_5e-3,
    7.612_442_317_320_796e-3,
    7.636_438_488_478_739e-3,
    7.659_924_864_722_064e-3,
    7.682_899_878_142_539e-3,
    7.705_361_994_969_524e-3,
    7.727_309_715_672_44e-3,
    7.748_741_575_060_914e-3,
    7.769_656_142_382_462e-3,
    7.790_052_021_418_226e-3,
    7.809_927_850_575_903e-3,
    7.829_282_302_980_82e-3,
    7.848_114_086_564_56e-3,
    7.866_421_944_151_094e-3,
    7.884_204_653_540_665e-3,
    7.901_461_027_591_6e-3,
    7.918_189_914_299_318e-3,
    7.934_390_196_873_448e-3,
    7.950_060_793_812_204e-3,
    7.965_200_658_974_709e-3,
    7.979_808_781_650_77e-3,
    7.993_884_186_628_266e-3,
    8.007_425_934_258_548e-3,
    8.020_433_120_518_866e-3,
    8.032_904_877_072_8e-3,
    8.044_840_371_328_26e-3,
    8.056_238_806_493_175e-3,
    8.067_099_421_628_42e-3,
    8.077_421_491_698_82e-3,
    8.087_204_327_621_594e-3,
    8.096_447_276_312_202e-3,
    8.105_149_720_727_933e-3,
    8.113_311_079_909_208e-3,
    8.120_930_809_018_415e-3,
    8.128_008_399_376_085e-3,
    8.134_543_378_495_033e-3,
    8.140_535_310_111_77e-3,
    8.145_983_794_215_77e-3,
    8.150_888_467_075_875e-3,
    8.155_249_001_265_092e-3,
    8.159_065_105_681_899e-3,
    8.162_336_525_570_1e-3,
    8.165_063_042_535_465e-3,
    8.167_244_474_560_707e-3,
    8.168_880_676_017_344e-3,
    8.169_971_537_675_47e-3,
    8.170_516_986_711_104e-3,
];

/// 384-point Gauss–Legendre weights, index-aligned with the ascending node
/// table.
///
/// Assembled at compile time from [`GL_WEIGHTS_FIRST_HALF`]: entries `i` and
/// `383 - i` both receive the `i`-th tabulated weight, which yields the same
/// 384 values as a fully written-out table.
const GL_WEIGHTS: [f64; 384] = {
    let mut weights = [0.0_f64; 384];
    let mut i = 0;
    while i < 192 {
        let w = GL_WEIGHTS_FIRST_HALF[i];
        weights[i] = w;
        weights[383 - i] = w;
        i += 1;
    }
    weights
};
918
/// Which exact evaluation branch a cell falls into.
///
/// NOTE(review): the variant names presumably track the degree of
/// `q(z) = 0.5*(z^2 + eta^2)` for the cell — affine `eta` (Gaussian `q`),
/// quadratic `eta` (quartic `q`), cubic `eta` (sextic `q`). The classifier
/// itself is outside this view; confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExactCellBranch {
    /// Cell treated as affine.
    Affine,
    /// Cell on the quartic branch.
    Quartic,
    /// Cell on the sextic branch.
    Sextic,
}
925
926/// Auto-tune the per-cell affine/non-affine branch tolerance from the cell's
927/// own coefficient magnitudes.
928///
929/// The legacy `branch_cell` compared the normalized cubic coefficients
930/// `(k2, k3)` against a single global constant.  That constant is calibrated
931/// for cells whose anchor coefficients `(c0, c1)` are O(1).  When the anchor
932/// dominates — e.g. a tail cell with `|c0|, |c1| >> 1` — a relative criterion
933/// against the anchor magnitude is more numerically meaningful than the bare
934/// global threshold, because the affine contribution to `eta` already absorbs
935/// any difference at the chosen scale.
936///
937/// The returned tolerance is always at least [`NORMALIZED_CELL_BRANCH_TOL`],
938/// so cells with O(1) anchors recover bit-identical classification with the
939/// legacy code path.  This preserves numerical equivalence for the
940/// established `cubic_cell_kernel` tests, including the
941/// `tuned_branch_tolerance_matches_legacy_non_affine_transport_grid` grid.
942#[inline]
943fn effective_branch_tol(cell: DenestedCubicCell) -> f64 {
944    let anchor_scale = cell.c0.abs().max(cell.c1.abs()).max(1.0);
945    NORMALIZED_CELL_BRANCH_TOL * anchor_scale
946}
947
/// One cell of the de-nested cubic transport: an interval `[left, right]` in
/// `z` together with the coefficients of the cubic
/// `η(z) = c0 + c1·z + c2·z² + c3·z³` that applies on it.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedCubicCell {
    /// Lower bound of the cell in `z`.
    pub left: f64,
    /// Upper bound of the cell in `z`.
    pub right: f64,
    /// Constant coefficient of `η`.
    pub c0: f64,
    /// Linear coefficient of `η`.
    pub c1: f64,
    /// Quadratic coefficient of `η`.
    pub c2: f64,
    /// Cubic coefficient of `η`.
    pub c3: f64,
}

impl DenestedCubicCell {
    /// Evaluates the cell cubic `η(z)` term by term in monomial form.
    ///
    /// The summation order is kept fixed (constant, linear, quadratic, cubic)
    /// so results stay bit-for-bit reproducible.
    #[inline]
    pub fn eta(self, z: f64) -> f64 {
        let linear = self.c1 * z;
        let quadratic = self.c2 * z * z;
        let cubic = self.c3 * z * z * z;
        self.c0 + linear + quadratic + cubic
    }

    /// Evaluates the exponent `q(z) = (z² + η(z)²) / 2`.
    #[inline]
    pub fn q(self, z: f64) -> f64 {
        let eta = self.eta(z);
        let sum_of_squares = z * z + eta * eta;
        0.5 * sum_of_squares
    }
}
970
/// Geometric fingerprint of a de-nested cubic cell, as produced by
/// `cell_moment_fingerprint`.
///
/// Equality and hashing are derived over both fields, so two fingerprints
/// whose mixed `hash` collides still compare unequal unless all six `bins`
/// agree.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentFingerprint {
    /// Mixed 64-bit digest of `bins`.
    pub hash: u64,
    /// Per-coordinate words in the order `left, right, c0, c1, c2, c3`
    /// (exact bit patterns or quantized bins, depending on the `epsilon`
    /// used to build the fingerprint).
    bins: [u64; 6],
}
976
/// Cache key pairing a cell fingerprint with the highest moment degree that
/// was requested for it; built by `cell_moment_cache_key`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct CellMomentCacheKey {
    /// Fingerprint of the cell geometry and coefficients.
    pub fingerprint: CellMomentFingerprint,
    /// Highest moment degree requested for the cell.
    pub max_degree: usize,
}
982
/// Plain counters describing how often cell-moment deduplication found an
/// existing entry.
#[derive(Clone, Copy, Debug, Default, PartialEq)]
pub struct CellMomentDedupStats {
    /// Total number of lookups.
    pub lookups: u64,
    /// Lookups that found an existing entry.
    pub hits: u64,
    /// Lookups that did not find an existing entry.
    pub misses: u64,
}

impl CellMomentDedupStats {
    /// Fraction of lookups that were hits; `0.0` when nothing was looked up.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.lookups {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1000
/// Scrambles one 64-bit word by delegating to
/// `gam_linalg::utils::splitmix64_hash`; kept as a local alias so the
/// fingerprint mixer reads compactly.
#[inline]
fn splitmix64(x: u64) -> u64 {
    gam_linalg::utils::splitmix64_hash(x)
}
1005
1006#[inline]
1007fn mix_fingerprint_words(words: &[u64]) -> u64 {
1008    let mut h = 0xcbf2_9ce4_8422_2325u64;
1009    for &word in words {
1010        h ^= splitmix64(word);
1011        h = h.wrapping_mul(0x100_0000_01b3);
1012    }
1013    h
1014}
1015
/// Maps one cell coordinate to the 64-bit word used in a fingerprint.
///
/// * Exact mode: when `epsilon` is not a positive finite number, or `x` is
///   not finite, the word is the raw IEEE-754 bit pattern of `x`.
/// * Quantized mode: otherwise the word is the bit pattern of
///   `round(x / epsilon)`, i.e. the index of the nearest multiple of
///   `epsilon`.
///
/// In quantized mode the zero bin is canonicalized to `+0.0`: `round` keeps
/// the sign of its argument, so without this step values slightly below zero
/// (and `-0.0` itself) would land in a different word than values slightly
/// above zero even though both are nearest to the same multiple.
#[inline]
fn quantized_cell_word(x: f64, epsilon: f64) -> u64 {
    let quantize = epsilon > 0.0 && epsilon.is_finite() && x.is_finite();
    if !quantize {
        return x.to_bits();
    }
    let bin = (x / epsilon).round();
    // `-0.0 == 0.0` is true, so this folds both zeros onto `+0.0`.
    let bin = if bin == 0.0 { 0.0 } else { bin };
    bin.to_bits()
}
1023
1024/// Returns a deterministic geometric fingerprint for a de-nested cubic cell.
1025///
1026/// With `epsilon == 0.0`, each coordinate is represented by its exact IEEE-754
1027/// bit pattern, so equal fingerprints imply bit-equal `(left, right, c0, c1,
1028/// c2, c3)` tuples.  With `epsilon > 0`, finite coordinates are binned to the
1029/// nearest multiple of `epsilon`; callers should treat this as an approximate
1030/// cache key and validate the resulting model error for their data.
1031pub fn cell_moment_fingerprint(cell: DenestedCubicCell, epsilon: f64) -> CellMomentFingerprint {
1032    let bins = [
1033        quantized_cell_word(cell.left, epsilon),
1034        quantized_cell_word(cell.right, epsilon),
1035        quantized_cell_word(cell.c0, epsilon),
1036        quantized_cell_word(cell.c1, epsilon),
1037        quantized_cell_word(cell.c2, epsilon),
1038        quantized_cell_word(cell.c3, epsilon),
1039    ];
1040    CellMomentFingerprint {
1041        hash: mix_fingerprint_words(&bins),
1042        bins,
1043    }
1044}
1045
1046#[inline]
1047pub fn cell_moment_cache_key(
1048    cell: DenestedCubicCell,
1049    max_degree: usize,
1050    epsilon: f64,
1051) -> CellMomentCacheKey {
1052    CellMomentCacheKey {
1053        fingerprint: cell_moment_fingerprint(cell, epsilon),
1054        max_degree,
1055    }
1056}
1057
/// A de-nested cell together with the spline spans and boundary provenance
/// it was built from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct DenestedPartitionCell {
    /// The cell's interval and cubic coefficients.
    pub cell: DenestedCubicCell,
    /// Score-side local cubic span associated with this cell.
    pub score_span: LocalSpanCubic,
    /// Link-side local cubic span associated with this cell.
    pub link_span: LocalSpanCubic,
    /// Provenance of the cell's boundaries: a fixed z location (score break
    /// or ±∞ tail) or a link-knot crossing `z = (τ - a)/b`. Together with
    /// `(score_span, link_span)` this identifies the cell's two-parameter
    /// family in `(a, b)` across rows (see
    /// [`crate::cell_moment_family`]).
    pub left_edge: PartitionEdge,
    /// Provenance of the right boundary; same conventions as `left_edge`.
    pub right_edge: PartitionEdge,
}

// No inherent methods are defined at present.
impl DenestedPartitionCell {}
1073
/// Where one boundary of a denested partition cell comes from.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum PartitionEdge {
    /// A z location that does not depend on the row scalars: a score-spline
    /// break, or ±∞ for tail cells.
    Fixed(f64),
    /// A link-knot crossing: for the row's `(a, b)` the boundary sits at
    /// `z = (τ - a)/b`.
    Crossing { tau: f64 },
}

impl PartitionEdge {
    /// Resolves the boundary to its z location for the row scalars `(a, b)`.
    ///
    /// `Fixed` edges ignore `(a, b)`; `Crossing` edges evaluate
    /// `(tau - a) / b` with plain floating-point division.
    #[inline]
    pub fn z_at(self, a: f64, b: f64) -> f64 {
        match self {
            Self::Crossing { tau } => {
                let offset = tau - a;
                offset / b
            }
            Self::Fixed(location) => location,
        }
    }
}
1095
/// Exact-bit key for the affine-tail moment memo; built by
/// `tail_cell_cache_key`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
struct TailCellMomentCacheKey {
    /// Bit pattern of the cell's `c0`.
    c0_bits: u64,
    /// Bit pattern of the cell's `c1`.
    c1_bits: u64,
    /// Bit pattern of the cell's single finite endpoint.
    endpoint_bits: u64,
    /// `-1` for a cell unbounded on the left (finite endpoint is `right`),
    /// `1` for a cell unbounded on the right (finite endpoint is `left`).
    side: i8,
    /// Highest moment degree requested.
    max_degree: usize,
}
1104
/// Byte budget of the affine-tail moment memo: 64 MiB.
const TAIL_CELL_MOMENT_CACHE_MAX_BYTES: usize = 64 << 20;
/// Entry budget of the affine-tail moment memo: 2^18 entries.
const TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES: usize = 1 << 18;
1107
/// Point-in-time usage counters of a [`TailCellMomentCache`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct TailCellMomentCacheStats {
    /// Lookups answered from the cache.
    pub hits: usize,
    /// Lookups that had to compute the value.
    pub misses: usize,
    /// Number of entries resident when the snapshot was taken.
    pub entries: usize,
}

impl TailCellMomentCacheStats {
    /// Total number of counted lookups (`hits + misses`).
    #[inline]
    pub fn requests(self) -> usize {
        self.hits + self.misses
    }

    /// Fraction of counted lookups that were hits; `0.0` when there were no
    /// lookups at all.
    #[inline]
    pub fn hit_rate(self) -> f64 {
        match self.requests() {
            0 => 0.0,
            total => self.hits as f64 / total as f64,
        }
    }
}
1131
/// Affine-tail cell-moment memo.
///
/// Stand-alone instances (`TailCellMomentCache::new()`) are useful when a
/// caller needs deterministic hit/miss bookkeeping that is not polluted by
/// concurrent traffic on the global memo. The production path uses the
/// global instance behind [`evaluate_cell_moments`].
///
/// All methods take `&self`: the LRU is internally synchronized (sharded for
/// the concurrent global memo) and the counters are atomics, so the global
/// instance needs no outer `Mutex`. The previous `OnceLock<Mutex<…>>` wrapper
/// serialized every tail-cell evaluation across all rayon workers of the
/// marginal-slope exact-cache build — the same contention class the sharded
/// per-family cell-moment LRU fix removed.
#[derive(Debug)]
pub struct TailCellMomentCache {
    /// Memoized moment states, keyed by the affine-tail key produced in
    /// `tail_cell_cache_key`.
    moments: ByteLruCache<TailCellMomentCacheKey, CellMomentState>,
    /// Count of lookups answered from `moments`.
    hits: std::sync::atomic::AtomicUsize,
    /// Count of lookups that ran the uncached evaluator and stored its result.
    misses: std::sync::atomic::AtomicUsize,
}
1151
1152impl Default for TailCellMomentCache {
1153    fn default() -> Self {
1154        // Tail-cell entries are small (a short moment vector), so sharding
1155        // the byte/entry budgets is harmless; size the shard count off the
1156        // worker pool exactly like the per-family cell-moment LRU.
1157        let shard_count = std::thread::available_parallelism()
1158            .map(|workers| workers.get().saturating_mul(8))
1159            .unwrap_or(32)
1160            .clamp(8, 256);
1161        Self {
1162            moments: ByteLruCache::with_max_entries_sharded(
1163                TAIL_CELL_MOMENT_CACHE_MAX_BYTES,
1164                TAIL_CELL_MOMENT_CACHE_MAX_ENTRIES,
1165                shard_count,
1166            ),
1167            hits: std::sync::atomic::AtomicUsize::new(0),
1168            misses: std::sync::atomic::AtomicUsize::new(0),
1169        }
1170    }
1171}
1172
1173impl TailCellMomentCache {
1174    /// Construct an empty cache. Hits/misses start at zero.
1175    #[inline]
1176    pub fn new() -> Self {
1177        Self::default()
1178    }
1179
1180    /// Reset the cache to its empty state. Existing entries are dropped and
1181    /// the hit/miss counters are zeroed.
1182    #[inline]
1183    pub fn clear(&self) {
1184        self.moments.clear();
1185        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
1186        self.misses.store(0, std::sync::atomic::Ordering::Relaxed);
1187    }
1188
1189    /// Snapshot of the cache's current usage stats.
1190    #[inline]
1191    pub fn stats(&self) -> TailCellMomentCacheStats {
1192        TailCellMomentCacheStats {
1193            hits: self.hits.load(std::sync::atomic::Ordering::Relaxed),
1194            misses: self.misses.load(std::sync::atomic::Ordering::Relaxed),
1195            entries: self.moments.len(),
1196        }
1197    }
1198
1199    /// Look up `cell` at `max_degree`, computing and inserting the result on
1200    /// miss. Cells outside the affine-tail keyset bypass the cache and run
1201    /// the uncached evaluator directly without touching the counters.
1202    ///
1203    /// Stat semantics: every cache hit increments `hits`; a **miss** is
1204    /// counted when this call computed the value itself. Under concurrent
1205    /// access two workers racing on the same cold key may both count a miss
1206    /// (each computes the identical pure-function value); single-threaded
1207    /// bookkeeping is exact.
1208    pub fn evaluate(
1209        &self,
1210        cell: DenestedCubicCell,
1211        max_degree: usize,
1212    ) -> Result<CellMomentState, String> {
1213        let Some(key) = tail_cell_cache_key(cell, max_degree) else {
1214            return evaluate_cell_moments_uncached(cell, max_degree);
1215        };
1216        if let Some(state) = self.moments.get(&key) {
1217            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1218            return Ok(state);
1219        }
1220        let state = evaluate_cell_moments_uncached(cell, max_degree)?;
1221        self.misses
1222            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1223        self.moments.insert(key, state.clone());
1224        Ok(state)
1225    }
1226}
1227
1228static TAIL_CELL_MOMENT_CACHE: std::sync::OnceLock<TailCellMomentCache> =
1229    std::sync::OnceLock::new();
1230static TAIL_CELL_MOMENT_CACHE_ENABLED: std::sync::atomic::AtomicBool =
1231    std::sync::atomic::AtomicBool::new(true);
1232
1233fn tail_cell_moment_cache() -> &'static TailCellMomentCache {
1234    TAIL_CELL_MOMENT_CACHE.get_or_init(TailCellMomentCache::default)
1235}
1236
1237#[inline]
1238fn tail_cell_cache_key(
1239    cell: DenestedCubicCell,
1240    max_degree: usize,
1241) -> Option<TailCellMomentCacheKey> {
1242    if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL {
1243        return None;
1244    }
1245    match (!cell.left.is_finite(), !cell.right.is_finite()) {
1246        (true, false) if cell.right.is_finite() => Some(TailCellMomentCacheKey {
1247            c0_bits: cell.c0.to_bits(),
1248            c1_bits: cell.c1.to_bits(),
1249            endpoint_bits: cell.right.to_bits(),
1250            side: -1,
1251            max_degree,
1252        }),
1253        (false, true) if cell.left.is_finite() => Some(TailCellMomentCacheKey {
1254            c0_bits: cell.c0.to_bits(),
1255            c1_bits: cell.c1.to_bits(),
1256            endpoint_bits: cell.left.to_bits(),
1257            side: 1,
1258            max_degree,
1259        }),
1260        _ => None,
1261    }
1262}
1263
1264pub fn set_tail_cell_moment_cache_enabled(enabled: bool) {
1265    TAIL_CELL_MOMENT_CACHE_ENABLED.store(enabled, std::sync::atomic::Ordering::Relaxed);
1266}
1267
1268pub fn reset_tail_cell_moment_cache() {
1269    tail_cell_moment_cache().clear();
1270}
1271
1272pub fn tail_cell_moment_cache_stats() -> TailCellMomentCacheStats {
1273    tail_cell_moment_cache().stats()
1274}
1275
1276#[derive(Clone, Copy, Debug, Eq)]
1277pub struct CellFingerprint {
1278    c0: u64,
1279    c1: u64,
1280    c2: u64,
1281    c3: u64,
1282    left: u64,
1283    right: u64,
1284}
1285
1286impl CellFingerprint {
1287    #[inline]
1288    pub fn new(cell: DenestedCubicCell) -> Self {
1289        Self {
1290            c0: cell.c0.to_bits(),
1291            c1: cell.c1.to_bits(),
1292            c2: cell.c2.to_bits(),
1293            c3: cell.c3.to_bits(),
1294            left: cell.left.to_bits(),
1295            right: cell.right.to_bits(),
1296        }
1297    }
1298}
1299
1300impl PartialEq for CellFingerprint {
1301    #[inline]
1302    fn eq(&self, other: &Self) -> bool {
1303        self.c0 == other.c0
1304            && self.c1 == other.c1
1305            && self.c2 == other.c2
1306            && self.c3 == other.c3
1307            && self.left == other.left
1308            && self.right == other.right
1309    }
1310}
1311
1312impl Hash for CellFingerprint {
1313    #[inline]
1314    fn hash<H: Hasher>(&self, state: &mut H) {
1315        self.c0.hash(state);
1316        self.c1.hash(state);
1317        self.c2.hash(state);
1318        self.c3.hash(state);
1319        self.left.hash(state);
1320        self.right.hash(state);
1321    }
1322}
1323
1324#[derive(Clone, Debug, Default, PartialEq)]
1325pub struct CachedCellMoments {
1326    /// Regular (value) cell moments, populated by
1327    /// `evaluate_cell_moments_cached`. None when only derivative moments
1328    /// have been cached for this cell. Wrapped in `Arc` so `ByteLruCache`
1329    /// returns lookups through cheap refcount bumps instead of deep-cloning
1330    /// the inline `SmallVec<[f64; 10]>` (which spills on every degree-`>= 10`
1331    /// request) on every hot-path LRU hit.
1332    state: Option<Arc<CellMomentState>>,
1333    /// Derivative moments, populated by
1334    /// `evaluate_cell_derivative_moments_cached`. None when only value
1335    /// moments have been cached for this cell. Both variants share the
1336    /// same `CellFingerprint` key so derivative-only callers do not evict
1337    /// pre-cached value entries and vice versa. Same `Arc` wrapping rationale
1338    /// as `state` above.
1339    derivative_state: Option<Arc<CellDerivativeMomentState>>,
1340}
1341
1342impl CachedCellMoments {
1343    #[inline]
1344    pub fn new(state: Arc<CellMomentState>) -> Self {
1345        Self {
1346            state: Some(state),
1347            derivative_state: None,
1348        }
1349    }
1350
1351    #[inline]
1352    pub fn new_derivative(state: Arc<CellDerivativeMomentState>) -> Self {
1353        Self {
1354            state: None,
1355            derivative_state: Some(state),
1356        }
1357    }
1358
1359    #[inline]
1360    pub fn state_for_degree(&self, max_degree: usize) -> Option<CellMomentState> {
1361        let state = self.state.as_ref()?;
1362        if state.moments.len().saturating_sub(1) < max_degree {
1363            return None;
1364        }
1365        // Cached `Arc<CellMomentState>` is shared across LRU hits, so we
1366        // cannot reuse the inner vector in place. Clone the underlying state
1367        // and (rarely) truncate down to the requested degree to honour the
1368        // public moment-length contract.
1369        let mut state = (**state).clone();
1370        state.moments.truncate(max_degree + 1);
1371        Some(state)
1372    }
1373
1374    #[inline]
1375    pub fn derivative_state_for_degree(
1376        &self,
1377        max_degree: usize,
1378    ) -> Option<CellDerivativeMomentState> {
1379        let state = self.derivative_state.as_ref()?;
1380        if state.moments.len().saturating_sub(1) < max_degree {
1381            return None;
1382        }
1383        // See `state_for_degree`: shared `Arc` forces an inner clone here.
1384        let mut state = (**state).clone();
1385        state.moments.truncate(max_degree + 1);
1386        Some(state)
1387    }
1388
1389    #[inline]
1390    pub fn with_value(mut self, state: Arc<CellMomentState>) -> Self {
1391        self.state = Some(state);
1392        self
1393    }
1394
1395    #[inline]
1396    pub fn with_derivative(mut self, state: Arc<CellDerivativeMomentState>) -> Self {
1397        self.derivative_state = Some(state);
1398        self
1399    }
1400}
1401
1402impl ResidentBytes for CachedCellMoments {
1403    fn resident_bytes(&self) -> usize {
1404        let value_bytes = self
1405            .state
1406            .as_ref()
1407            .map_or(0, |state| state.resident_bytes());
1408        let derivative_bytes = self
1409            .derivative_state
1410            .as_ref()
1411            .map_or(0, |state| state.resident_bytes());
1412        std::mem::size_of::<Self>()
1413            .saturating_add(value_bytes)
1414            .saturating_add(derivative_bytes)
1415    }
1416}
1417
/// Atomic hit/miss counters for a cell-moment cache.
#[derive(Debug, Default)]
pub struct CellMomentCacheStats {
    hits: AtomicU64,
    misses: AtomicU64,
}

impl CellMomentCacheStats {
    /// Reads the current `(hits, misses)` pair with relaxed loads. The two
    /// loads are independent, so the pair is not an atomic snapshot.
    #[inline]
    pub fn snapshot(&self) -> (u64, u64) {
        let hits = self.hits.load(Ordering::Relaxed);
        let misses = self.misses.load(Ordering::Relaxed);
        (hits, misses)
    }

    /// Returns `(new_hits, new_misses, hit_rate)` accumulated since the
    /// earlier snapshot `before`.
    ///
    /// Differences saturate at zero if a counter is below its `before` value,
    /// and the rate is `0.0` when nothing was recorded in between.
    #[inline]
    pub fn hit_rate_delta(&self, before: (u64, u64)) -> (u64, u64, f64) {
        let (hits_before, misses_before) = before;
        let (hits_now, misses_now) = self.snapshot();
        let dh = hits_now.saturating_sub(hits_before);
        let dm = misses_now.saturating_sub(misses_before);
        let rate = match dh + dm {
            0 => 0.0,
            total => dh as f64 / total as f64,
        };
        (dh, dm, rate)
    }
}
1447
/// Byte-budgeted cache of per-cell moment entries, keyed by the exact-bit
/// [`CellFingerprint`].
pub type CellMomentLruCache = ByteLruCache<CellFingerprint, CachedCellMoments>;

/// Number of moments a [`CellMomentVec`] holds inline before it spills to
/// the heap.
pub const CELL_MOMENT_INLINE_CAPACITY: usize = 10;

/// Moment vector with inline storage for [`CELL_MOMENT_INLINE_CAPACITY`]
/// values.
pub type CellMomentVec = SmallVec<[f64; CELL_MOMENT_INLINE_CAPACITY]>;
1453
1454#[derive(Clone, Debug, PartialEq)]
1455pub struct CellMomentState {
1456    pub branch: ExactCellBranch,
1457    pub value: f64,
1458    pub moments: CellMomentVec,
1459}
1460
1461impl ResidentBytes for CellMomentState {
1462    fn resident_bytes(&self) -> usize {
1463        let spilled_bytes = if self.moments.spilled() {
1464            self.moments
1465                .capacity()
1466                .saturating_mul(std::mem::size_of::<f64>())
1467        } else {
1468            0
1469        };
1470        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1471    }
1472}
1473
1474#[derive(Clone, Debug, PartialEq)]
1475pub struct CellDerivativeMomentState {
1476    pub branch: ExactCellBranch,
1477    pub moments: CellMomentVec,
1478}
1479
1480impl ResidentBytes for CellDerivativeMomentState {
1481    fn resident_bytes(&self) -> usize {
1482        let spilled_bytes = if self.moments.spilled() {
1483            self.moments
1484                .capacity()
1485                .saturating_mul(std::mem::size_of::<f64>())
1486        } else {
1487            0
1488        };
1489        std::mem::size_of::<Self>().saturating_add(spilled_bytes)
1490    }
1491}
1492
/// Borrowed counterpart of [`CellMomentState`]: the same `branch` and
/// `value`, with the moments exposed as a slice borrowed for `'a` instead of
/// an owned vector.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct CellMomentStateRef<'a> {
    /// Branch the cell was evaluated on.
    pub branch: ExactCellBranch,
    /// Scalar value reported by the evaluator alongside the moments.
    pub value: f64,
    /// Moments indexed by degree, starting at degree 0.
    pub moments: &'a [f64],
}
1499
1500#[derive(Clone, Debug)]
1501pub struct CellMomentScratch {
1502    moments: Vec<f64>,
1503}
1504
1505impl Default for CellMomentScratch {
1506    fn default() -> Self {
1507        // Pre-size to the codebase's max moment degree so steady-state
1508        // `prepare_moments` calls never reallocate. Calls with `len`
1509        // exceeding this still reserve lazily.
1510        Self {
1511            moments: Vec::with_capacity(MAX_AFFINE_ANCHOR_DEGREE + 1),
1512        }
1513    }
1514}
1515
1516impl CellMomentScratch {
1517    pub fn new() -> Self {
1518        Self::default()
1519    }
1520
1521    pub fn with_capacity(max_degree: usize) -> Self {
1522        Self {
1523            moments: Vec::with_capacity(max_degree + 1),
1524        }
1525    }
1526
1527    #[inline]
1528    fn prepare_moments(&mut self, len: usize) -> &mut [f64] {
1529        if self.moments.capacity() < len {
1530            CELL_MOMENT_REALLOCS.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
1531            self.moments.reserve(len - self.moments.capacity());
1532        }
1533        self.moments.resize(len, 0.0);
1534        self.moments.fill(0.0);
1535        &mut self.moments
1536    }
1537}
1538
/// Counter for moment-buffer reallocations in `prepare_moments`. Production
/// code increments this on every buffer growth; the test mod inspects it to
/// assert the steady-state hot loop allocates exactly once per row buffer.
///
/// The counter is process-wide and updated with relaxed ordering: it is a
/// monotonic tally, not a synchronization point.
pub(crate) static CELL_MOMENT_REALLOCS: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(0);
1544
/// Canonical 20-point Gauss–Legendre nodes on [-1, 1] (Abramowitz & Stegun
/// 25.4), tabulated to f64 precision. Used here for the Drezner–Wesolowsky
/// bivariate normal CDF representation — 20 points give >30-digit accuracy for
/// the smooth arcsin-transformed integrand, ensuring the BVN value is exact to
/// f64 precision for all (h, k, ρ) — and shared with the cubic-cell B-spline
/// moment parity gate in [`crate::gpu_kernels::cubic_bspline_moments`].
///
/// The nodes are listed in ascending order and are antisymmetric about zero:
/// entry `i` is the negation of entry `19 - i`.
///
/// NOTE(review): the accuracy statement above is carried over from the
/// original comment and is not verified in this file.
pub const GL20_NODES: [f64; 20] = [
    -0.993_128_599_185_094_9,
    -0.963_971_927_277_913_8,
    -0.912_234_428_251_326,
    -0.839_116_971_822_218_8,
    -0.746_331_906_460_150_8,
    -0.636_053_680_726_515,
    -0.510_867_001_950_827_1,
    -0.373_706_088_715_419_6,
    -0.227_785_851_141_645_1,
    -0.076_526_521_133_497_33,
    0.076_526_521_133_497_33,
    0.227_785_851_141_645_1,
    0.373_706_088_715_419_6,
    0.510_867_001_950_827_1,
    0.636_053_680_726_515,
    0.746_331_906_460_150_8,
    0.839_116_971_822_218_8,
    0.912_234_428_251_326,
    0.963_971_927_277_913_8,
    0.993_128_599_185_094_9,
];
1573
/// Companion weights to [`GL20_NODES`]. Symmetric, summing to 2.
///
/// Entry `i` is the weight of node `i`; because the table is symmetric,
/// entries `i` and `19 - i` are equal.
pub const GL20_WEIGHTS: [f64; 20] = [
    0.017_614_007_139_152_12,
    0.040_601_429_800_386_94,
    0.062_672_048_334_109_06,
    0.083_276_741_576_704_75,
    0.101_930_119_817_240_4,
    0.118_194_531_961_518_4,
    0.131_688_638_449_176_6,
    0.142_096_109_318_382_1,
    0.149_172_986_472_603_7,
    0.152_753_387_130_725_9,
    0.152_753_387_130_725_9,
    0.149_172_986_472_603_7,
    0.142_096_109_318_382_1,
    0.131_688_638_449_176_6,
    0.118_194_531_961_518_4,
    0.101_930_119_817_240_4,
    0.083_276_741_576_704_75,
    0.062_672_048_334_109_06,
    0.040_601_429_800_386_94,
    0.017_614_007_139_152_12,
];
1597
1598/// Provenance-tagged breakpoint dedup: sorts ascending and merges entries
1599/// coinciding within 1e-12, but when a fixed score break and a link-knot
1600/// crossing coincide (the kink configuration), the surviving entry keeps
1601/// the `Fixed` tag — a deterministic choice; the z location is identical
1602/// either way.
1603fn dedup_sorted_tagged_breakpoints(points: &mut Vec<(f64, PartitionEdge)>) {
1604    points.sort_by(|lhs, rhs| {
1605        lhs.0
1606            .partial_cmp(&rhs.0)
1607            .unwrap_or(std::cmp::Ordering::Equal)
1608    });
1609    points.dedup_by(|lhs, rhs| {
1610        let coincide = if lhs.0 == rhs.0 {
1611            true
1612        } else if lhs.0.is_finite() && rhs.0.is_finite() {
1613            (lhs.0 - rhs.0).abs() <= 1e-12
1614        } else {
1615            false
1616        };
1617        if coincide && matches!(lhs.1, PartitionEdge::Fixed(_)) {
1618            // `dedup_by` keeps `rhs` (the earlier element) — propagate the
1619            // Fixed tag onto the survivor.
1620            rhs.1 = lhs.1;
1621        }
1622        coincide
1623    });
1624}
1625
1626#[inline]
1627pub fn interval_probe_point(left: f64, right: f64) -> Result<f64, String> {
1628    if !(left < right) {
1629        return Err(CubicCellKernelError::invalid_interval(format!(
1630            "interval probe requires ordered bounds, got [{left}, {right}]"
1631        ))
1632        .into());
1633    }
1634    if left.is_finite() && right.is_finite() {
1635        Ok(0.5 * (left + right))
1636    } else if left == f64::NEG_INFINITY && right == f64::INFINITY {
1637        Ok(0.0)
1638    } else if left == f64::NEG_INFINITY && right.is_finite() {
1639        Ok(right - 1.0)
1640    } else if left.is_finite() && right == f64::INFINITY {
1641        Ok(left + 1.0)
1642    } else {
1643        Err(CubicCellKernelError::invalid_interval(format!(
1644            "interval probe requires finite bounds or full infinities, got [{left}, {right}]"
1645        ))
1646        .into())
1647    }
1648}
1649
/// Coefficients of `q'(z)`, lowest degree first, for a quadratic
/// `η(z) = c0 + c1·z + c2·z²` and `q(z) = (z² + η(z)²) / 2`.
///
/// Since `q' = z + η·η'`, the result is the cubic
/// `c0·c1 + (1 + c1² + 2·c0·c2)·z + 3·c1·c2·z² + 2·c2²·z³`.
#[inline]
pub fn quartic_qprime_coefficients(c0: f64, c1: f64, c2: f64) -> [f64; 4] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c1 * c2;
    let cubic = 2.0 * c2 * c2;
    [constant, linear, quadratic, cubic]
}
1659
/// Coefficients of `q'(z)`, lowest degree first, for a cubic
/// `η(z) = c0 + c1·z + c2·z² + c3·z³` and `q(z) = (z² + η(z)²) / 2`.
///
/// Since `q' = z + η·η'`, the result is a quintic whose leading coefficient
/// is `3·c3²`.
#[inline]
pub fn sextic_qprime_coefficients(c0: f64, c1: f64, c2: f64, c3: f64) -> [f64; 6] {
    let constant = c0 * c1;
    let linear = 1.0 + c1 * c1 + 2.0 * c0 * c2;
    let quadratic = 3.0 * c0 * c3 + 3.0 * c1 * c2;
    let cubic = 4.0 * c1 * c3 + 2.0 * c2 * c2;
    let quartic = 5.0 * c2 * c3;
    let quintic = 3.0 * c3 * c3;
    [constant, linear, quadratic, cubic, quartic, quintic]
}
1671
1672/// Boundary term `right^n · exp(−q(right)) − left^n · exp(−q(left))` used by
1673/// the moment recurrences. Takes precomputed `left^n` and `right^n` so callers
1674/// can roll the powers across a recurrence — each iteration becomes one
1675/// multiply instead of a fresh `powi(n)`.
1676#[inline]
1677fn moment_boundary_term_with_powers(
1678    cell: DenestedCubicCell,
1679    left_pow_n: f64,
1680    right_pow_n: f64,
1681) -> f64 {
1682    let left_term = if cell.left.is_infinite() {
1683        0.0
1684    } else {
1685        left_pow_n * (-cell.q(cell.left)).exp()
1686    };
1687    let right_term = if cell.right.is_infinite() {
1688        0.0
1689    } else {
1690        right_pow_n * (-cell.q(cell.right)).exp()
1691    };
1692    right_term - left_term
1693}
1694
/// Checks whether the supplied base moments agree with directly evaluated
/// moments on their common prefix.
///
/// Each pair must satisfy `|a − b| ≤ 1e-10 · (1 + max(|a|, |b|))`. Only as
/// many entries as the shorter slice holds are compared, and a pair whose
/// difference is NaN counts as a mismatch.
#[inline]
fn base_moments_match_direct(base: &[f64], direct: &[f64]) -> bool {
    base.iter().zip(direct).all(|(&supplied, &computed)| {
        let scale = 1.0 + supplied.abs().max(computed.abs());
        let gap = (supplied - computed).abs();
        gap <= 1e-10 * scale
    })
}
1701
1702#[inline]
1703fn direct_non_affine_moments_if_base_matches(
1704    cell: DenestedCubicCell,
1705    base: &[f64],
1706    max_degree: usize,
1707) -> Option<Vec<f64>> {
1708    if !cell.left.is_finite() || !cell.right.is_finite() {
1709        return None;
1710    }
1711    // When the supplied base moments are the actual moments of this fixed
1712    // finite cell, prefer the same quadrature-backed evaluator used by the
1713    // public non-affine moment path.  The algebraic raising recurrence is kept
1714    // below for callers that intentionally pass symbolic or otherwise
1715    // non-cell-consistent bases, but repeatedly dividing by the quartic/sextic
1716    // leading coefficient can amplify harmless base-roundoff into high-order
1717    // moment error.
1718    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
1719    if base_moments_match_direct(base, &moments) {
1720        Some(moments.into_vec())
1721    } else {
1722        None
1723    }
1724}
1725
1726pub fn reduce_quartic_moments(
1727    cell: DenestedCubicCell,
1728    base_m0_m2: [f64; 3],
1729    max_degree: usize,
1730) -> Result<Vec<f64>, String> {
1731    if max_degree <= 2 {
1732        return Ok(base_m0_m2[..=max_degree].to_vec());
1733    }
1734    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m2, max_degree)
1735    {
1736        return Ok(moments);
1737    }
1738    let d = quartic_qprime_coefficients(cell.c0, cell.c1, cell.c2);
1739    let lead = d[3];
1740    if !lead.is_finite() || lead.abs() <= 1e-18 {
1741        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1742            "quartic moment reduction requires nonzero leading coefficient, got {lead:.3e}"
1743        ))
1744        .into());
1745    }
1746    let mut moments = vec![0.0; max_degree + 1];
1747    moments[0] = base_m0_m2[0];
1748    moments[1] = base_m0_m2[1];
1749    moments[2] = base_m0_m2[2];
1750    // Roll left^n / right^n across the recurrence rather than calling
1751    // `powi(n)` each iteration. Skip the multiply when an endpoint is
1752    // infinite — the boundary helper ignores the power in that case, and
1753    // ∞·0 would produce a NaN we'd then have to mask off anyway.
1754    let left_finite = cell.left.is_finite();
1755    let right_finite = cell.right.is_finite();
1756    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1757    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1758    for n in 0..=(max_degree - 3) {
1759        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1760        let mut numer = if n == 0 {
1761            0.0
1762        } else {
1763            (n as f64) * moments[n - 1]
1764        };
1765        for j in 0..=2 {
1766            numer -= d[j] * moments[n + j];
1767        }
1768        numer -= b_n;
1769        moments[n + 3] = numer / lead;
1770        if left_finite {
1771            left_pow_n *= cell.left;
1772        }
1773        if right_finite {
1774            right_pow_n *= cell.right;
1775        }
1776    }
1777    Ok(moments)
1778}
1779
1780pub fn reduce_sextic_moments(
1781    cell: DenestedCubicCell,
1782    base_m0_m4: [f64; 5],
1783    max_degree: usize,
1784) -> Result<Vec<f64>, String> {
1785    if max_degree <= 4 {
1786        return Ok(base_m0_m4[..=max_degree].to_vec());
1787    }
1788    if let Some(moments) = direct_non_affine_moments_if_base_matches(cell, &base_m0_m4, max_degree)
1789    {
1790        return Ok(moments);
1791    }
1792    let d = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3);
1793    let lead = d[5];
1794    if !lead.is_finite() {
1795        return Err(CubicCellKernelError::invalid_cell_shape(format!(
1796            "sextic moment reduction encountered non-finite leading coefficient: {lead:.3e}"
1797        ))
1798        .into());
1799    }
1800    if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
1801        if lower_branch == ExactCellBranch::Quartic {
1802            return evaluate_non_affine_cell_state(
1803                DenestedCubicCell { c3: 0.0, ..cell },
1804                ExactCellBranch::Quartic,
1805                max_degree,
1806            )
1807            .map(|state| state.moments.into_vec());
1808        }
1809        return evaluate_affine_cell_state(
1810            DenestedCubicCell {
1811                left: cell.left,
1812                right: cell.right,
1813                c0: cell.c0,
1814                c1: cell.c1,
1815                c2: 0.0,
1816                c3: 0.0,
1817            },
1818            max_degree,
1819        )
1820        .map(|state| state.moments.into_vec());
1821    }
1822    let mut moments = vec![0.0; max_degree + 1];
1823    for (idx, value) in base_m0_m4.into_iter().enumerate() {
1824        moments[idx] = value;
1825    }
1826    let left_finite = cell.left.is_finite();
1827    let right_finite = cell.right.is_finite();
1828    let mut left_pow_n = if left_finite { 1.0 } else { 0.0 };
1829    let mut right_pow_n = if right_finite { 1.0 } else { 0.0 };
1830    for n in 0..=(max_degree - 5) {
1831        let b_n = moment_boundary_term_with_powers(cell, left_pow_n, right_pow_n);
1832        let mut numer = if n == 0 {
1833            0.0
1834        } else {
1835            (n as f64) * moments[n - 1]
1836        };
1837        for j in 0..=4 {
1838            numer -= d[j] * moments[n + j];
1839        }
1840        numer -= b_n;
1841        moments[n + 5] = numer / lead;
1842        if left_finite {
1843            left_pow_n *= cell.left;
1844        }
1845        if right_finite {
1846            right_pow_n *= cell.right;
1847        }
1848    }
1849    Ok(moments)
1850}
1851
1852#[inline]
1853pub fn cell_first_derivative_from_moments(
1854    derivative_coefficients: &[f64],
1855    moments: &[f64],
1856) -> Result<f64, String> {
1857    let value = moment_dot_with_coefficients(derivative_coefficients, moments, "first derivative")?;
1858    Ok(value * INV_TWO_PI)
1859}
1860
/// Highest moment index that
/// `cell_first_derivative_from_moments(derivative_coefficients, moments)`
/// reads, i.e. the `max_degree` to pass to `evaluate_cell_moments`.
///
/// This is `derivative_coefficients.len() - 1`, or `0` for an empty slice.
/// Callers must request at least this degree so the moment dot is
/// well-defined; #321 was caused by hardcoding a smaller value at one call
/// site.
#[inline]
pub fn cell_first_derivative_required_max_degree(derivative_coefficients: &[f64]) -> usize {
    match derivative_coefficients {
        [] => 0,
        [_, higher_order @ ..] => higher_order.len(),
    }
}
1872
/// Highest moment index read by `cell_second_derivative_from_moments`.
///
/// That kernel computes `needed = max(second_deg, product_deg) + 1` internally;
/// this helper returns `needed - 1` so the value can be passed straight to
/// `evaluate_cell_moments(cell, max_degree)`. `product_deg` is the degree of
/// `eta · r · s`: the two first-coefficient degrees plus 3 for the cubic
/// `eta`. An empty slice counts as degree 0.
#[inline]
pub fn cell_second_derivative_required_max_degree(
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    second_coefficients_rs: &[f64],
) -> usize {
    let degree_of = |coefficients: &[f64]| coefficients.len().saturating_sub(1);
    let eta_r_s_degree = degree_of(first_coefficients_r) + degree_of(first_coefficients_s) + 3;
    degree_of(second_coefficients_rs).max(eta_r_s_degree)
}
1893
1894#[inline]
1895pub fn cell_polynomial_integral_from_moments(
1896    polynomial_coefficients: &[f64],
1897    moments: &[f64],
1898    label: &str,
1899) -> Result<f64, String> {
1900    let value = moment_dot_with_coefficients(polynomial_coefficients, moments, label)?;
1901    Ok(value * INV_TWO_PI)
1902}
1903
1904#[inline]
1905pub fn cell_second_derivative_from_moments(
1906    cell: DenestedCubicCell,
1907    first_coefficients_r: &[f64],
1908    first_coefficients_s: &[f64],
1909    second_coefficients_rs: &[f64],
1910    moments: &[f64],
1911) -> Result<f64, String> {
1912    let second_degree = second_coefficients_rs.len().saturating_sub(1);
1913    let product_degree = first_coefficients_r.len().saturating_sub(1)
1914        + first_coefficients_s.len().saturating_sub(1)
1915        + 3;
1916    let needed = second_degree.max(product_degree) + 1;
1917    if needed > moments.len() {
1918        return Err(CubicCellKernelError::insufficient_moments(format!(
1919            "insufficient reduced moments for second derivative: need {}, have {}",
1920            needed,
1921            moments.len()
1922        ))
1923        .into());
1924    }
1925    let second_term = moment_dot_with_coefficients_unchecked(second_coefficients_rs, moments);
1926    // Fold `Σ_{e,i,j} eta[e]·r[i]·s[j]·moments[e+i+j]` into a single dot
1927    // against `moments`. Convolving `eta ⊗ r ⊗ s` first turns the original
1928    // `len(eta)·len(r)·len(s)` triple loop (typically 4·4·4 = 64 mul-adds
1929    // per call) into `len(eta)·len(r) + (len(eta)+len(r)-1)·len(s) +
1930    // len(out)` ≈ 16 + 28 + 10 = 54 mul-adds, with the inner loops now in
1931    // straight-line FMA-friendly form.
1932    let cubic = [cell.c0, cell.c1, cell.c2, cell.c3];
1933    // Capacity bound: cubic (4) + first_r (≤MAX) + first_s (≤MAX) - 2.
1934    // First-coefficient slices are passed in as `[f64; 4]` from every
1935    // production caller; sizing to 32 covers any realistic test input.
1936    const SCRATCH: usize = 32;
1937    let mut eta_r = [0.0_f64; SCRATCH];
1938    let mut eta_rs = [0.0_f64; SCRATCH];
1939    let er_len = poly_conv_into(&cubic, first_coefficients_r, &mut eta_r);
1940    let ers_len = poly_conv_into(&eta_r[..er_len], first_coefficients_s, &mut eta_rs);
1941    let mut eta_term = 0.0;
1942    for k in 0..ers_len {
1943        eta_term = eta_rs[k].mul_add(moments[k], eta_term);
1944    }
1945    Ok((second_term - eta_term) * INV_TWO_PI)
1946}
1947
1948/// Pointwise value of the cell second-derivative integrand
1949/// `(∂²/∂r∂s) exp(-q(z))/2π` at a single `z`, evaluated from the SAME
1950/// `(r, s, rs)` coefficient polynomials the moment reduction
1951/// [`cell_second_derivative_from_moments`] integrates:
1952///
1953/// ```text
1954///   F_rs(z) = ( c_rs(z) - η(z)·c_r(z)·c_s(z) ) · exp(-q(z)) · 1/2π ,
1955/// ```
1956///
1957/// with `c_•(z) = Σ_k coeff_•[k]·zᵏ`, `η(z)` the cell cubic, and
1958/// `q(z) = ½(z² + η(z)²)`. This is the integrand whose `[cell.left,
1959/// cell.right]` integral the from-moments form returns — needed for the
1960/// Leibniz boundary term when a cell edge (a link-knot crossing
1961/// `z=(τ-a)/b`) moves with a parameter (the slope `b`): the directional
1962/// derivative of `∫_{z_L}^{z_R} F_rs dz` picks up
1963/// `F_rs(z_R)·z_R'(dir) - F_rs(z_L)·z_L'(dir)` on top of the fixed-domain
1964/// part. Coefficient sign convention matches the simpson reference
1965/// (`numeric_ab`): pass the ACTUAL derivative-coefficient polynomials
1966/// `∂c/∂r` etc. (not the negated `neg_dc_d•` the moment path consumes).
1967#[inline]
1968pub fn cell_second_derivative_boundary_integrand(
1969    cell: DenestedCubicCell,
1970    first_coefficients_r: &[f64],
1971    first_coefficients_s: &[f64],
1972    second_coefficients_rs: &[f64],
1973    z: f64,
1974) -> f64 {
1975    let eta = cell.eta(z);
1976    let c_r = poly_eval_at(first_coefficients_r, z);
1977    let c_s = poly_eval_at(first_coefficients_s, z);
1978    let c_rs = poly_eval_at(second_coefficients_rs, z);
1979    (c_rs - eta * c_r * c_s) * (-cell.q(z)).exp() * INV_TWO_PI
1980}
1981
1982/// Pointwise value of the density-weighted integrand `g(z)·exp(-q(z))/2π` at a
1983/// single `z`, for an arbitrary integrand polynomial `g`.
1984///
1985/// This is the boundary value needed for the moving-domain (Leibniz) term of a
1986/// density-normalization integral `∫ g(z)·exp(-q(z))/2π dz` whose cell edge is a
1987/// link-knot crossing `z=(τ-a)/b` that moves with a parameter direction: the
1988/// directional derivative of the integral picks up
1989/// `g(z_R)·w(z_R)·z_R'(dir) - g(z_L)·w(z_L)·z_L'(dir)` on top of the
1990/// fixed-domain part, with `w(z)=exp(-q(z))/2π` the same weight the moment
1991/// reductions integrate. Unlike the Hessian-integral boundary term (which is
1992/// shared by adjacent cells and cancels across each interior knot), the
1993/// ln-density integrand `D_t`/`D_t,uv` carries a non-shared `g`, so this
1994/// Leibniz term does NOT cancel and must be added (gam#932/#979).
1995pub fn cell_density_boundary_integrand(cell: DenestedCubicCell, g: &[f64], z: f64) -> f64 {
1996    poly_eval_at(g, z) * (-cell.q(z)).exp() * INV_TWO_PI
1997}
1998
/// Evaluates `Σ_k coefficients[k]·zᵏ` with Horner's scheme, one fused
/// multiply-add per coefficient, starting from the highest order. An empty
/// slice evaluates to `0.0`.
#[inline]
fn poly_eval_at(coefficients: &[f64], z: f64) -> f64 {
    coefficients
        .iter()
        .rev()
        .fold(0.0_f64, |partial, &coefficient| partial.mul_add(z, coefficient))
}
2008
2009#[inline]
2010fn moment_dot_with_coefficients(
2011    coefficients: &[f64],
2012    moments: &[f64],
2013    label: &str,
2014) -> Result<f64, String> {
2015    if coefficients.len() > moments.len() {
2016        return Err(CubicCellKernelError::insufficient_moments(format!(
2017            "insufficient reduced moments for {label}: need {}, have {}",
2018            coefficients.len(),
2019            moments.len()
2020        ))
2021        .into());
2022    }
2023    Ok(moment_dot_with_coefficients_unchecked(
2024        coefficients,
2025        moments,
2026    ))
2027}
2028
/// Dot product `Σ_k coefficients[k]·moments[k]`, accumulated in ascending `k`
/// with fused multiply-adds.
///
/// # Panics
///
/// Panics when `moments` is shorter than `coefficients`; callers are expected
/// to have validated the lengths already.
#[inline]
fn moment_dot_with_coefficients_unchecked(coefficients: &[f64], moments: &[f64]) -> f64 {
    // Taking the prefix up front keeps the out-of-range panic of the indexed
    // form instead of silently truncating in `zip`.
    let paired_moments = &moments[..coefficients.len()];
    coefficients
        .iter()
        .zip(paired_moments)
        .fold(0.0_f64, |sum, (&coeff, &moment)| coeff.mul_add(moment, sum))
}
2037
/// Multiplies two polynomials given as ascending coefficient slices and
/// writes the product's coefficients into the front of `out`.
///
/// Returns the number of coefficients written: `lhs.len() + rhs.len() - 1`
/// when both inputs are non-empty, `0` (with `out` untouched) otherwise.
/// Entries of `out` past the returned length are left as they were; callers
/// must use only the returned prefix.
///
/// The multi-derivative reductions use this to fold `eta · r · s · …` triple
/// and quadruple sums into a single moment dot, avoiding the
/// `O(deg^3)`/`O(deg^4)` inner-loop work that dominated the
/// `cell_*_derivative_from_moments` hot leaves on large-scale fits.
///
/// # Panics
///
/// Panics when `out` is too short to hold the product.
#[inline]
fn poly_conv_into(lhs: &[f64], rhs: &[f64], out: &mut [f64]) -> usize {
    if let (0, _) | (_, 0) = (lhs.len(), rhs.len()) {
        return 0;
    }
    let len = lhs.len() + rhs.len() - 1;
    assert!(out.len() >= len);

    let product = &mut out[..len];
    product.fill(0.0);
    for (shift, &lv) in lhs.iter().enumerate() {
        // Row `shift` of the product lands on slots `shift..shift + rhs.len()`.
        let row = &mut product[shift..shift + rhs.len()];
        for (slot, &rv) in row.iter_mut().zip(rhs) {
            *slot = lv.mul_add(rv, *slot);
        }
    }
    len
}
2064
2065#[inline]
2066fn require_moments_degree(
2067    required_degree: usize,
2068    moments: &[f64],
2069    label: &str,
2070) -> Result<(), String> {
2071    if required_degree >= moments.len() {
2072        return Err(CubicCellKernelError::insufficient_moments(format!(
2073            "insufficient reduced moments for {label}: need {}, have {}",
2074            required_degree + 1,
2075            moments.len()
2076        ))
2077        .into());
2078    }
2079    Ok::<(), _>(())
2080}
2081
2082#[inline]
2083fn require_scratch_capacity(
2084    required_len: usize,
2085    capacity: usize,
2086    label: &str,
2087) -> Result<(), String> {
2088    if required_len > capacity {
2089        return Err(CubicCellKernelError::insufficient_moments(format!(
2090            "{label} polynomial convolution scratch too small: need {required_len}, have {capacity}"
2091        ))
2092        .into());
2093    }
2094    Ok::<(), _>(())
2095}
2096
/// Coefficient count of the product of polynomials whose coefficient counts
/// are `lengths`: the sum of the lengths minus one per multiplication.
///
/// Returns `0` when `lengths` is empty or any factor is empty.
#[inline]
fn convolution_chain_len(lengths: &[usize]) -> usize {
    match lengths {
        [] => 0,
        _ if lengths.contains(&0) => 0,
        _ => {
            let total: usize = lengths.iter().sum();
            let joins = lengths.len() - 1;
            total - joins
        }
    }
}
2105
/// Polynomial degree (`len - 1`) of a first-derivative coefficient slice.
///
/// # Errors
///
/// Returns a message naming `label` when `coefficients` is empty.
#[inline]
fn first_coefficients_degree(label: &str, coefficients: &[f64]) -> Result<usize, String> {
    match coefficients.len() {
        0 => Err(format!(
            "{label} first-derivative coefficients must be non-empty"
        )),
        count => Ok(count - 1),
    }
}
2113
/// Third-derivative cell integral from reduced moments.
///
/// With `eta = [c0, c1, c2, c3]` the cell cubic, `⊗` polynomial
/// multiplication and `P·M = Σ_k P[k]·moments[k]`, the returned value is
///
/// ```text
///   ( rst·M
///     − Σ_{(A,B) ∈ {(rs,t), (rt,s), (st,r)}} (eta ⊗ A ⊗ B)·M
///     + ((eta⊗eta − 1) ⊗ r ⊗ s ⊗ t)·M ) · INV_TWO_PI
/// ```
///
/// # Errors
///
/// * any of the three first-coefficient slices is empty;
/// * `moments` is too short for the highest index computed below (`needed`);
/// * a convolution result would not fit the 32-slot scratch buffers.
#[inline]
pub fn cell_third_derivative_from_moments(
    cell: DenestedCubicCell,
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    first_coefficients_t: &[f64],
    second_coefficients_rs: &[f64],
    second_coefficients_rt: &[f64],
    second_coefficients_st: &[f64],
    third_coefficients_rst: &[f64],
    moments: &[f64],
) -> Result<f64, String> {
    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
    // NOTE(review): this is (largest summed length) − 1, i.e. the coefficient
    // COUNT of the longest second⊗first product, which is one more than its
    // degree (`len_a + len_b − 2`). The `3 + second_sum_degree` bound below is
    // therefore one larger than strictly required — conservative, and not the
    // binding term whenever the triple-product bound dominates.
    let second_sum_degree = [
        second_coefficients_rs.len() + first_coefficients_t.len(),
        second_coefficients_rt.len() + first_coefficients_s.len(),
        second_coefficients_st.len() + first_coefficients_r.len(),
    ]
    .into_iter()
    .max()
    .unwrap_or(0)
    .saturating_sub(1);
    let triple_product_degree = r_degree + s_degree + t_degree;
    // Highest moment index any term may touch: the plain third-order dot,
    // eta (degree 3) times a second⊗first product, and eta² − 1 (degree 6)
    // times r⊗s⊗t.
    let needed = (third_coefficients_rst.len().saturating_sub(1))
        .max(3 + second_sum_degree)
        .max(6 + triple_product_degree);
    require_moments_degree(needed, moments, "third derivative")?;

    let third_term = moment_dot_with_coefficients_unchecked(third_coefficients_rst, moments);

    // This is a deliberately serial leaf kernel: each call performs only a
    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
    // at the surrounding row/cell batch level rather than inside this hot path.
    const SCRATCH: usize = 32;
    // Longest result among the three eta ⊗ second ⊗ first convolutions.
    let max_linear_conv_len = [
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_rs.len(),
            first_coefficients_t.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_rt.len(),
            first_coefficients_s.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_st.len(),
            first_coefficients_r.len(),
        ]),
    ]
    .into_iter()
    .max()
    .unwrap_or(0);
    // Length of (eta² − 1) ⊗ r ⊗ s ⊗ t; 7 is the coefficient count of eta² − 1.
    let max_cubic_conv_len = convolution_chain_len(&[
        7,
        first_coefficients_r.len(),
        first_coefficients_s.len(),
        first_coefficients_t.len(),
    ]);
    // Reject oversized inputs up front so `poly_conv_into` never hits its
    // capacity assertion.
    require_scratch_capacity(
        max_linear_conv_len.max(max_cubic_conv_len),
        SCRATCH,
        "third derivative",
    )?;
    let mut buf_a = [0.0_f64; SCRATCH];
    let mut buf_b = [0.0_f64; SCRATCH];

    // eta_second_term = Σ over (rs⊗t, rt⊗s, st⊗r) of eta⊗product · moments.
    // Fold each of the three triple sums into a single moment dot.
    let mut eta_second_term = 0.0;
    // Computes (eta ⊗ first ⊗ second)·moments using the two scratch buffers:
    // `buf_a` holds first⊗second, `buf_b` the product with eta.
    let conv_dot = |first: &[f64],
                    second: &[f64],
                    buf_a: &mut [f64; SCRATCH],
                    buf_b: &mut [f64; SCRATCH]|
     -> f64 {
        let m = poly_conv_into(first, second, buf_a);
        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
        let mut acc = 0.0;
        for k in 0..n {
            acc = buf_b[k].mul_add(moments[k], acc);
        }
        acc
    };
    eta_second_term += conv_dot(
        second_coefficients_rs,
        first_coefficients_t,
        &mut buf_a,
        &mut buf_b,
    );
    eta_second_term += conv_dot(
        second_coefficients_rt,
        first_coefficients_s,
        &mut buf_a,
        &mut buf_b,
    );
    eta_second_term += conv_dot(
        second_coefficients_st,
        first_coefficients_r,
        &mut buf_a,
        &mut buf_b,
    );

    // cubic_coeff_term = Σ_{e,i,j,k} (eta·eta − 1)[e] · r[i] · s[j] · t[k] · moments[e+i+j+k].
    // Convolve r⊗s, then ⊗t, then ⊗(eta·eta − 1), giving a single dot.
    let mut eta_sq_minus_one = [0.0_f64; 7];
    for (i, &eta_i) in eta.iter().enumerate() {
        for (j, &eta_j) in eta.iter().enumerate() {
            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
        }
    }
    // Subtract the constant 1 from the degree-0 coefficient of eta².
    eta_sq_minus_one[0] -= 1.0;

    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
    // buf_a now reused for (eta_sq_minus_one ⊗ rst).
    let final_len = poly_conv_into(&eta_sq_minus_one, &buf_b[..rst_len], &mut buf_a);
    let mut cubic_coeff_term = 0.0;
    for k in 0..final_len {
        cubic_coeff_term = buf_a[k].mul_add(moments[k], cubic_coeff_term);
    }

    Ok((third_term - eta_second_term + cubic_coeff_term) * INV_TWO_PI)
}
2241
/// Fourth-derivative cell integral from reduced moments.
///
/// With `eta = [c0, c1, c2, c3]` the cell cubic, `⊗` polynomial
/// multiplication and `P·M = Σ_k P[k]·moments[k]`, the returned value is
///
/// ```text
///   ( rstu·M
///     − Σ over (rst,u), (rsu,t), (rtu,s), (stu,r), (rs,tu), (rt,su), (ru,st)
///           of (eta ⊗ A ⊗ B)·M
///     + Σ over (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s)
///           of ((eta⊗eta − 1) ⊗ A ⊗ B ⊗ C)·M
///     + ((3·eta − eta³) ⊗ r ⊗ s ⊗ t ⊗ u)·M ) · INV_TWO_PI
/// ```
///
/// # Errors
///
/// * any of the four first-coefficient slices is empty;
/// * `moments` is too short for the highest index computed below (`needed`);
/// * a convolution result would not fit the 32-slot scratch buffers.
#[inline]
pub fn cell_fourth_derivative_from_moments(
    cell: DenestedCubicCell,
    first_coefficients_r: &[f64],
    first_coefficients_s: &[f64],
    first_coefficients_t: &[f64],
    first_coefficients_u: &[f64],
    second_coefficients_rs: &[f64],
    second_coefficients_rt: &[f64],
    second_coefficients_ru: &[f64],
    second_coefficients_st: &[f64],
    second_coefficients_su: &[f64],
    second_coefficients_tu: &[f64],
    third_coefficients_rst: &[f64],
    third_coefficients_rsu: &[f64],
    third_coefficients_rtu: &[f64],
    third_coefficients_stu: &[f64],
    fourth_coefficients_rstu: &[f64],
    moments: &[f64],
) -> Result<f64, String> {
    let eta = [cell.c0, cell.c1, cell.c2, cell.c3];
    let r_degree = first_coefficients_degree("r", first_coefficients_r)?;
    let s_degree = first_coefficients_degree("s", first_coefficients_s)?;
    let t_degree = first_coefficients_degree("t", first_coefficients_t)?;
    let u_degree = first_coefficients_degree("u", first_coefficients_u)?;
    // NOTE(review): (largest summed length) − 1 is the coefficient COUNT of
    // the longest two-factor product, one more than its degree; the
    // `3 + linear_sum_degree` bound below is conservative by one.
    let linear_sum_degree = [
        third_coefficients_rst.len() + first_coefficients_u.len(),
        third_coefficients_rsu.len() + first_coefficients_t.len(),
        third_coefficients_rtu.len() + first_coefficients_s.len(),
        third_coefficients_stu.len() + first_coefficients_r.len(),
        second_coefficients_rs.len() + second_coefficients_tu.len(),
        second_coefficients_rt.len() + second_coefficients_su.len(),
        second_coefficients_ru.len() + second_coefficients_st.len(),
    ]
    .into_iter()
    .max()
    .unwrap_or(0)
    .saturating_sub(1);
    // NOTE(review): likewise (largest summed length) − 2 is the coefficient
    // count of the longest three-factor product, one more than its degree.
    let quad_sum_degree = [
        second_coefficients_rs.len() + first_coefficients_t.len() + first_coefficients_u.len(),
        second_coefficients_rt.len() + first_coefficients_s.len() + first_coefficients_u.len(),
        second_coefficients_ru.len() + first_coefficients_s.len() + first_coefficients_t.len(),
        second_coefficients_st.len() + first_coefficients_r.len() + first_coefficients_u.len(),
        second_coefficients_su.len() + first_coefficients_r.len() + first_coefficients_t.len(),
        second_coefficients_tu.len() + first_coefficients_r.len() + first_coefficients_s.len(),
    ]
    .into_iter()
    .max()
    .unwrap_or(0)
    .saturating_sub(2);
    let quartic_product_degree = r_degree + s_degree + t_degree + u_degree;
    // Highest moment index any term may touch: the plain fourth-order dot,
    // eta (degree 3) times a two-factor product, eta² − 1 (degree 6) times a
    // three-factor product, and 3·eta − eta³ (degree 9) times r⊗s⊗t⊗u.
    let needed = (fourth_coefficients_rstu.len().saturating_sub(1))
        .max(3 + linear_sum_degree)
        .max(6 + quad_sum_degree)
        .max(9 + quartic_product_degree);
    require_moments_degree(needed, moments, "fourth derivative")?;

    let fourth_term = moment_dot_with_coefficients_unchecked(fourth_coefficients_rstu, moments);

    // This is a deliberately serial leaf kernel: each call performs only a
    // handful of fixed-size polynomial convolutions, so Rayon fan-out belongs
    // at the surrounding row/cell batch level rather than inside this hot path.
    const SCRATCH: usize = 32;
    // Longest result among the seven eta ⊗ A ⊗ B convolutions.
    let max_linear_conv_len = [
        convolution_chain_len(&[
            eta.len(),
            third_coefficients_rst.len(),
            first_coefficients_u.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            third_coefficients_rsu.len(),
            first_coefficients_t.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            third_coefficients_rtu.len(),
            first_coefficients_s.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            third_coefficients_stu.len(),
            first_coefficients_r.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_rs.len(),
            second_coefficients_tu.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_rt.len(),
            second_coefficients_su.len(),
        ]),
        convolution_chain_len(&[
            eta.len(),
            second_coefficients_ru.len(),
            second_coefficients_st.len(),
        ]),
    ]
    .into_iter()
    .max()
    .unwrap_or(0);
    // Longest result among the six (eta² − 1) ⊗ A ⊗ B ⊗ C convolutions; 7 is
    // the coefficient count of eta² − 1.
    let max_quad_conv_len = [
        convolution_chain_len(&[
            7,
            second_coefficients_rs.len(),
            first_coefficients_t.len(),
            first_coefficients_u.len(),
        ]),
        convolution_chain_len(&[
            7,
            second_coefficients_rt.len(),
            first_coefficients_s.len(),
            first_coefficients_u.len(),
        ]),
        convolution_chain_len(&[
            7,
            second_coefficients_ru.len(),
            first_coefficients_s.len(),
            first_coefficients_t.len(),
        ]),
        convolution_chain_len(&[
            7,
            second_coefficients_st.len(),
            first_coefficients_r.len(),
            first_coefficients_u.len(),
        ]),
        convolution_chain_len(&[
            7,
            second_coefficients_su.len(),
            first_coefficients_r.len(),
            first_coefficients_t.len(),
        ]),
        convolution_chain_len(&[
            7,
            second_coefficients_tu.len(),
            first_coefficients_r.len(),
            first_coefficients_s.len(),
        ]),
    ]
    .into_iter()
    .max()
    .unwrap_or(0);
    // Length of (3·eta − eta³) ⊗ r ⊗ s ⊗ t ⊗ u; 10 is the coefficient count
    // of the degree-9 weight.
    let max_quartic_conv_len = convolution_chain_len(&[
        10,
        first_coefficients_r.len(),
        first_coefficients_s.len(),
        first_coefficients_t.len(),
        first_coefficients_u.len(),
    ]);
    // Reject oversized inputs up front so `poly_conv_into` never hits its
    // capacity assertion.
    require_scratch_capacity(
        max_linear_conv_len
            .max(max_quad_conv_len)
            .max(max_quartic_conv_len),
        SCRATCH,
        "fourth derivative",
    )?;
    let mut buf_a = [0.0_f64; SCRATCH];
    let mut buf_b = [0.0_f64; SCRATCH];

    // eta_linear_term = Σ over seven (rst⊗u, rsu⊗t, rtu⊗s, stu⊗r, rs⊗tu,
    // rt⊗su, ru⊗st) of eta⊗product · moments. Fold each triple sum into
    // a single moment dot.
    // `buf_a` receives first⊗second, `buf_b` the product with eta.
    let conv_eta_dot = |first: &[f64],
                        second: &[f64],
                        buf_a: &mut [f64; SCRATCH],
                        buf_b: &mut [f64; SCRATCH]|
     -> f64 {
        let m = poly_conv_into(first, second, buf_a);
        let n = poly_conv_into(&eta, &buf_a[..m], buf_b);
        let mut acc = 0.0;
        for k in 0..n {
            acc = buf_b[k].mul_add(moments[k], acc);
        }
        acc
    };
    let mut eta_linear_term = 0.0;
    eta_linear_term += conv_eta_dot(
        third_coefficients_rst,
        first_coefficients_u,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        third_coefficients_rsu,
        first_coefficients_t,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        third_coefficients_rtu,
        first_coefficients_s,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        third_coefficients_stu,
        first_coefficients_r,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        second_coefficients_rs,
        second_coefficients_tu,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        second_coefficients_rt,
        second_coefficients_su,
        &mut buf_a,
        &mut buf_b,
    );
    eta_linear_term += conv_eta_dot(
        second_coefficients_ru,
        second_coefficients_st,
        &mut buf_a,
        &mut buf_b,
    );

    // Coefficients of eta² − 1: square eta, then subtract 1 from the
    // constant term.
    let mut eta_sq_minus_one = [0.0_f64; 7];
    for (i, &eta_i) in eta.iter().enumerate() {
        for (j, &eta_j) in eta.iter().enumerate() {
            eta_sq_minus_one[i + j] = eta_i.mul_add(eta_j, eta_sq_minus_one[i + j]);
        }
    }
    eta_sq_minus_one[0] -= 1.0;

    // quad_coeff_term: six (eta²−1)⊗A⊗B⊗C · moments sums, where the (A,B,C)
    // factors are: (rs,t,u), (rt,s,u), (ru,s,t), (st,r,u), (su,r,t), (tu,r,s).
    let mut buf_c = [0.0_f64; SCRATCH];
    // Computes (weight ⊗ a ⊗ b ⊗ c)·moments: `buf_a` holds a⊗b, `buf_b`
    // holds a⊗b⊗c, `buf_c` the final weighted product.
    let conv_weighted_triple_dot = |weight: &[f64],
                                    a: &[f64],
                                    b: &[f64],
                                    c: &[f64],
                                    buf_a: &mut [f64; SCRATCH],
                                    buf_b: &mut [f64; SCRATCH],
                                    buf_c: &mut [f64; SCRATCH]|
     -> f64 {
        let ab_len = poly_conv_into(a, b, buf_a);
        let abc_len = poly_conv_into(&buf_a[..ab_len], c, buf_b);
        let final_len = poly_conv_into(weight, &buf_b[..abc_len], buf_c);
        let mut acc = 0.0;
        for k in 0..final_len {
            acc = buf_c[k].mul_add(moments[k], acc);
        }
        acc
    };
    let mut quad_coeff_term = 0.0;
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_rs,
        first_coefficients_t,
        first_coefficients_u,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_rt,
        first_coefficients_s,
        first_coefficients_u,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_ru,
        first_coefficients_s,
        first_coefficients_t,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_st,
        first_coefficients_r,
        first_coefficients_u,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_su,
        first_coefficients_r,
        first_coefficients_t,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );
    quad_coeff_term += conv_weighted_triple_dot(
        &eta_sq_minus_one,
        second_coefficients_tu,
        first_coefficients_r,
        first_coefficients_s,
        &mut buf_a,
        &mut buf_b,
        &mut buf_c,
    );

    // cubic_weight = 3·eta − eta³ (same as the prior expansion: eta_sq*eta
    // negated, plus the 3·eta linear correction).
    let mut eta_sq = [0.0_f64; 7];
    for (i, &eta_i) in eta.iter().enumerate() {
        for (j, &eta_j) in eta.iter().enumerate() {
            eta_sq[i + j] = eta_i.mul_add(eta_j, eta_sq[i + j]);
        }
    }
    let mut cubic_weight = [0.0_f64; 10];
    for (i, &eta_sq_i) in eta_sq.iter().enumerate() {
        for (j, &eta_j) in eta.iter().enumerate() {
            cubic_weight[i + j] = (-eta_sq_i).mul_add(eta_j, cubic_weight[i + j]);
        }
    }
    for (idx, &eta_coeff) in eta.iter().enumerate() {
        cubic_weight[idx] += 3.0 * eta_coeff;
    }

    // quartic_coeff_term: cubic_weight ⊗ r ⊗ s ⊗ t ⊗ u · moments. A direct
    // quintuple loop costs 10·4·4·4·4 = 2560 mul-adds per call for length-4
    // first-coefficient slices; four sequential convolutions plus one moment
    // dot cost 16 + 28 + 40 + 130 + 22 = 236 mul-adds for the same inputs.
    let rs_len = poly_conv_into(first_coefficients_r, first_coefficients_s, &mut buf_a);
    let rst_len = poly_conv_into(&buf_a[..rs_len], first_coefficients_t, &mut buf_b);
    let rstu_len = poly_conv_into(&buf_b[..rst_len], first_coefficients_u, &mut buf_a);
    let final_len = poly_conv_into(&cubic_weight, &buf_a[..rstu_len], &mut buf_b);
    let mut quartic_coeff_term = 0.0;
    for k in 0..final_len {
        quartic_coeff_term = buf_b[k].mul_add(moments[k], quartic_coeff_term);
    }

    Ok((fourth_term - eta_linear_term + quad_coeff_term + quartic_coeff_term) * INV_TWO_PI)
}
2580
2581#[inline]
2582pub fn global_cubic_from_local(span: LocalSpanCubic) -> (f64, f64, f64, f64) {
2583    let left = span.left;
2584    let q0 = span.c0 - span.c1 * left + span.c2 * left * left - span.c3 * left * left * left;
2585    let q1 = span.c1 - 2.0 * span.c2 * left + 3.0 * span.c3 * left * left;
2586    let q2 = span.c2 - 3.0 * span.c3 * left;
2587    let q3 = span.c3;
2588    (q0, q1, q2, q3)
2589}
2590
2591/// Return the cubic polynomial coefficients (in `z`) of
2592/// `f(z) = link_span.evaluate(a + b*z)`.
2593///
2594/// `link_span.evaluate` is a cubic in its argument, so `f(z)` is also a cubic
2595/// in `z` and can be written exactly as
2596///
2597/// ```text
2598///     f(z) = d0 + d1·z + d2·z² + d3·z³
2599/// ```
2600///
2601/// where `(d0, d1, d2, d3)` are the values returned by this function. These
2602/// are **polynomial coefficients**, *not* derivatives of `f` at `z = 0`. The
2603/// relationship to Taylor derivatives is
2604///
2605/// ```text
2606///     d_k = f^(k)(0) / k!
2607/// ```
2608///
2609/// so `d0 = f(0)`, `d1 = f'(0)`, `d2 = ½·f''(0)`, `d3 = ⅙·f'''(0)`. Callers
2610/// such as [`denested_cell_coefficients`] and [`link_basis_cell_coefficients`]
2611/// rely on the polynomial-coefficient convention, since they propagate the
2612/// values directly as the `(c0, c1, c2, c3)` slots of a downstream polynomial
2613/// in `z`.
2614#[inline]
2615pub fn transformed_link_cubic(link_span: LocalSpanCubic, a: f64, b: f64) -> (f64, f64, f64, f64) {
2616    let shift = a - link_span.left;
2617    let d0 = link_span.c0
2618        + link_span.c1 * shift
2619        + link_span.c2 * shift * shift
2620        + link_span.c3 * shift * shift * shift;
2621    let d1 = b * (link_span.c1 + 2.0 * link_span.c2 * shift + 3.0 * link_span.c3 * shift * shift);
2622    let d2 = b * b * (link_span.c2 + 3.0 * link_span.c3 * shift);
2623    let d3 = link_span.c3 * b * b * b;
2624    (d0, d1, d2, d3)
2625}
2626
2627#[inline]
2628pub fn denested_cell_coefficients(
2629    score_span: LocalSpanCubic,
2630    link_span: LocalSpanCubic,
2631    a: f64,
2632    b: f64,
2633) -> [f64; 4] {
2634    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2635    let (d0, d1, d2, d3) = transformed_link_cubic(link_span, a, b);
2636    [a + b * h0 + d0, b + b * h1 + d1, b * h2 + d2, b * h3 + d3]
2637}
2638
2639#[inline]
2640pub fn denested_cell_coefficient_partials(
2641    score_span: LocalSpanCubic,
2642    link_span: LocalSpanCubic,
2643    a: f64,
2644    b: f64,
2645) -> ([f64; 4], [f64; 4]) {
2646    let (h0, h1, h2, h3) = global_cubic_from_local(score_span);
2647    let shift = a - link_span.left;
2648    let alpha1 = link_span.c1;
2649    let alpha2 = link_span.c2;
2650    let alpha3 = link_span.c3;
2651    let dc_da = [
2652        1.0 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2653        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2654        3.0 * alpha3 * b * b,
2655        0.0,
2656    ];
2657    let dc_db = [
2658        h0,
2659        1.0 + h1 + alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2660        h2 + 2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2661        h3 + 3.0 * alpha3 * b * b,
2662    ];
2663    (dc_da, dc_db)
2664}
2665
2666#[inline]
2667fn link_cubic_second_partials(
2668    link_span: LocalSpanCubic,
2669    a: f64,
2670    b: f64,
2671) -> ([f64; 4], [f64; 4], [f64; 4]) {
2672    let shift = a - link_span.left;
2673    let alpha2 = link_span.c2;
2674    let alpha3 = link_span.c3;
2675    let dc_daa = [
2676        2.0 * alpha2 + 6.0 * alpha3 * shift,
2677        6.0 * alpha3 * b,
2678        0.0,
2679        0.0,
2680    ];
2681    let dc_dab = [
2682        0.0,
2683        2.0 * alpha2 + 6.0 * alpha3 * shift,
2684        6.0 * alpha3 * b,
2685        0.0,
2686    ];
2687    let dc_dbb = [
2688        0.0,
2689        0.0,
2690        2.0 * (alpha2 + 3.0 * alpha3 * shift),
2691        6.0 * alpha3 * b,
2692    ];
2693    (dc_daa, dc_dab, dc_dbb)
2694}
2695
2696#[inline]
2697pub fn denested_cell_second_partials(
2698    score_span: LocalSpanCubic,
2699    link_span: LocalSpanCubic,
2700    a: f64,
2701    b: f64,
2702) -> ([f64; 4], [f64; 4], [f64; 4]) {
2703    let score_left = score_span.left;
2704    if !score_left.is_finite() {
2705        return ([f64::NAN; 4], [f64::NAN; 4], [f64::NAN; 4]);
2706    }
2707    link_cubic_second_partials(link_span, a, b)
2708}
2709
2710#[inline]
2711fn link_cubic_third_partials(
2712    link_span: LocalSpanCubic,
2713) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
2714    let alpha3 = link_span.c3;
2715    (
2716        [6.0 * alpha3, 0.0, 0.0, 0.0],
2717        [0.0, 6.0 * alpha3, 0.0, 0.0],
2718        [0.0, 0.0, 6.0 * alpha3, 0.0],
2719        [0.0, 0.0, 0.0, 6.0 * alpha3],
2720    )
2721}
2722
/// Third-order partials of the de-nested cell coefficients.
///
/// Forwards unchanged to `link_cubic_third_partials`; only `link_span`
/// enters, and the result depends on it solely through `link_span.c3`.
#[inline]
pub fn denested_cell_third_partials(
    link_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_span)
}
2729
2730#[inline]
2731pub fn score_basis_cell_coefficients(score_basis_span: LocalSpanCubic, b: f64) -> [f64; 4] {
2732    let (h0, h1, h2, h3) = global_cubic_from_local(score_basis_span);
2733    [b * h0, b * h1, b * h2, b * h3]
2734}
2735
2736#[inline]
2737pub fn link_basis_cell_coefficients(link_basis_span: LocalSpanCubic, a: f64, b: f64) -> [f64; 4] {
2738    let (d0, d1, d2, d3) = transformed_link_cubic(link_basis_span, a, b);
2739    [d0, d1, d2, d3]
2740}
2741
2742#[inline]
2743pub fn link_basis_cell_coefficient_partials(
2744    link_basis_span: LocalSpanCubic,
2745    a: f64,
2746    b: f64,
2747) -> ([f64; 4], [f64; 4]) {
2748    let shift = a - link_basis_span.left;
2749    let alpha1 = link_basis_span.c1;
2750    let alpha2 = link_basis_span.c2;
2751    let alpha3 = link_basis_span.c3;
2752    let dc_da = [
2753        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2754        b * (2.0 * alpha2 + 6.0 * alpha3 * shift),
2755        3.0 * alpha3 * b * b,
2756        0.0,
2757    ];
2758    let dc_db = [
2759        0.0,
2760        alpha1 + 2.0 * alpha2 * shift + 3.0 * alpha3 * shift * shift,
2761        2.0 * b * (alpha2 + 3.0 * alpha3 * shift),
2762        3.0 * alpha3 * b * b,
2763    ];
2764    (dc_da, dc_db)
2765}
2766
/// Second-order `(a, b)` partials of the link-basis cell coefficients, as
/// `(dc_daa, dc_dab, dc_dbb)`.
///
/// Forwards unchanged to `link_cubic_second_partials`.
#[inline]
pub fn link_basis_cell_second_partials(
    link_basis_span: LocalSpanCubic,
    a: f64,
    b: f64,
) -> ([f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_second_partials(link_basis_span, a, b)
}
2775
/// Third-order `(a, b)` partials of the link-basis cell coefficients.
///
/// Forwards unchanged to `link_cubic_third_partials`; the result depends on
/// the span only through `link_basis_span.c3`.
#[inline]
pub fn link_basis_cell_third_partials(
    link_basis_span: LocalSpanCubic,
) -> ([f64; 4], [f64; 4], [f64; 4], [f64; 4]) {
    link_cubic_third_partials(link_basis_span)
}
2782
/// Build the de-nested partition cells for the affine parameters `(a, b)`.
///
/// Thin entry point: every argument is passed through unchanged to
/// [`build_denested_partition_cells_with_tails`], whose result (including
/// any error) is returned as-is.
pub fn build_denested_partition_cells<FS, FL>(
    a: f64,
    b: f64,
    score_breaks: &[f64],
    link_breaks: &[f64],
    score_span_at: FS,
    link_span_at: FL,
) -> Result<Vec<DenestedPartitionCell>, String>
where
    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
{
    build_denested_partition_cells_with_tails(
        a,
        b,
        score_breaks,
        link_breaks,
        score_span_at,
        link_span_at,
    )
}
2804
2805/// Build a partition covering `(-∞, +∞)` with parameter-independent outer
2806/// bounds.  Interior cells use the same finite-cell polynomial algebra.
2807/// The two tail cells are guaranteed affine (c2=c3=0) because both
2808/// deviations saturate to constants outside their knot support.
2809///
2810/// The tail cells' score/link spans come from the same closures evaluated
2811/// at a representative point in the tail region — the closures must return
2812/// constant (c1=c2=c3=0) cubics for points outside support.
2813pub fn build_denested_partition_cells_with_tails<FS, FL>(
2814    a: f64,
2815    b: f64,
2816    score_breaks: &[f64],
2817    link_breaks: &[f64],
2818    mut score_span_at: FS,
2819    mut link_span_at: FL,
2820) -> Result<Vec<DenestedPartitionCell>, String>
2821where
2822    FS: FnMut(f64) -> Result<LocalSpanCubic, String>,
2823    FL: FnMut(f64) -> Result<LocalSpanCubic, String>,
2824{
2825    // Collect all INTERNAL split points (finite), each tagged with its
2826    // provenance: a fixed score break or a link-knot crossing. Provenance
2827    // identifies the cell's `(a, b)` family for the Chebyshev moment-family
2828    // layer; the z coordinates alone cannot distinguish the two kinds.
2829    let mut split_points: Vec<(f64, PartitionEdge)> = score_breaks
2830        .iter()
2831        .map(|&sigma| (sigma, PartitionEdge::Fixed(sigma)))
2832        .collect();
2833    if b.abs() > 1e-12 {
2834        for &tau in link_breaks {
2835            let z = (tau - a) / b;
2836            if z.is_finite() {
2837                split_points.push((z, PartitionEdge::Crossing { tau }));
2838            }
2839        }
2840    }
2841    dedup_sorted_tagged_breakpoints(&mut split_points);
2842
2843    let mut out = Vec::new();
2844
2845    if split_points.is_empty() {
2846        let score_span = score_span_at(0.0)?;
2847        let link_span = link_span_at(a)?;
2848        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2849        return Ok(vec![DenestedPartitionCell {
2850            cell: DenestedCubicCell {
2851                left: f64::NEG_INFINITY,
2852                right: f64::INFINITY,
2853                c0: coeffs[0],
2854                c1: coeffs[1],
2855                c2: 0.0,
2856                c3: 0.0,
2857            },
2858            score_span,
2859            link_span,
2860            left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2861            right_edge: PartitionEdge::Fixed(f64::INFINITY),
2862        }]);
2863    }
2864
2865    // ── Left tail cell: (-∞, leftmost_split] ──
2866    let (leftmost, leftmost_edge) = split_points[0];
2867    // Evaluate spans at a point just left of the leftmost split.  The
2868    // closures return constant tail cubics for this region.
2869    let left_probe = interval_probe_point(f64::NEG_INFINITY, leftmost)?;
2870    let left_score_span = score_span_at(left_probe)?;
2871    let left_link_span = link_span_at(a + b * left_probe)?;
2872    let left_coeffs = denested_cell_coefficients(left_score_span, left_link_span, a, b);
2873    if left_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2874        || left_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2875    {
2876        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2877            "left tail cell must be affine (deviations constant outside support), \
2878             got c2={:.3e}, c3={:.3e}",
2879            left_coeffs[2], left_coeffs[3]
2880        ))
2881        .into());
2882    }
2883    out.push(DenestedPartitionCell {
2884        cell: DenestedCubicCell {
2885            left: f64::NEG_INFINITY,
2886            right: leftmost,
2887            c0: left_coeffs[0],
2888            c1: left_coeffs[1],
2889            c2: 0.0,
2890            c3: 0.0,
2891        },
2892        score_span: left_score_span,
2893        link_span: left_link_span,
2894        left_edge: PartitionEdge::Fixed(f64::NEG_INFINITY),
2895        right_edge: leftmost_edge,
2896    });
2897
2898    // ── Interior cells (all finite) ──
2899    for window in split_points.windows(2) {
2900        let (left, left_edge) = window[0];
2901        let (right, right_edge) = window[1];
2902        if !left.is_finite() || !right.is_finite() || right - left <= 1e-12 {
2903            continue;
2904        }
2905        let mid = interval_probe_point(left, right)?;
2906        let score_span = score_span_at(mid)?;
2907        let link_span = link_span_at(a + b * mid)?;
2908        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
2909        out.push(DenestedPartitionCell {
2910            cell: DenestedCubicCell {
2911                left,
2912                right,
2913                c0: coeffs[0],
2914                c1: coeffs[1],
2915                c2: coeffs[2],
2916                c3: coeffs[3],
2917            },
2918            score_span,
2919            link_span,
2920            left_edge,
2921            right_edge,
2922        });
2923    }
2924
2925    // ── Right tail cell: [rightmost_split, +∞) ──
2926    let (rightmost, rightmost_edge) = *split_points.last().unwrap();
2927    let right_probe = interval_probe_point(rightmost, f64::INFINITY)?;
2928    let right_score_span = score_span_at(right_probe)?;
2929    let right_link_span = link_span_at(a + b * right_probe)?;
2930    let right_coeffs = denested_cell_coefficients(right_score_span, right_link_span, a, b);
2931    if right_coeffs[2].abs() > NORMALIZED_CELL_BRANCH_TOL
2932        || right_coeffs[3].abs() > NORMALIZED_CELL_BRANCH_TOL
2933    {
2934        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2935            "right tail cell must be affine (deviations constant outside support), \
2936             got c2={:.3e}, c3={:.3e}",
2937            right_coeffs[2], right_coeffs[3]
2938        ))
2939        .into());
2940    }
2941    out.push(DenestedPartitionCell {
2942        cell: DenestedCubicCell {
2943            left: rightmost,
2944            right: f64::INFINITY,
2945            c0: right_coeffs[0],
2946            c1: right_coeffs[1],
2947            c2: 0.0,
2948            c3: 0.0,
2949        },
2950        score_span: right_score_span,
2951        link_span: right_link_span,
2952        left_edge: rightmost_edge,
2953        right_edge: PartitionEdge::Fixed(f64::INFINITY),
2954    });
2955
2956    Ok(out)
2957}
2958
2959#[inline]
2960pub fn normalized_non_affine_coefficients(
2961    left: f64,
2962    right: f64,
2963    c0: f64,
2964    c1: f64,
2965    c2: f64,
2966    c3: f64,
2967) -> Result<(f64, f64), String> {
2968    let width = right - left;
2969    if !width.is_finite() || width <= 0.0 {
2970        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2971            "normalized cubic coefficients require a positive finite cell width, got left={left}, right={right}"
2972        ))
2973        .into());
2974    }
2975    let anchor_scale = c0.abs() + c1.abs();
2976    if !anchor_scale.is_finite() {
2977        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2978            "normalized cubic coefficients require finite affine coefficients, got c0={c0}, c1={c1}"
2979        ))
2980        .into());
2981    }
2982    let mid = 0.5 * (left + right);
2983    let half = 0.5 * width;
2984    let k2 = half * half * (c2 + 3.0 * c3 * mid);
2985    let k3 = c3 * half * half * half;
2986    Ok((k2, k3))
2987}
2988
2989#[inline]
2990pub fn branch_cell(cell: DenestedCubicCell) -> Result<ExactCellBranch, String> {
2991    let tol = effective_branch_tol(cell);
2992    if !cell.left.is_finite() || !cell.right.is_finite() {
2993        if cell.c2.abs() <= tol && cell.c3.abs() <= tol {
2994            return Ok(ExactCellBranch::Affine);
2995        }
2996        return Err(CubicCellKernelError::invalid_cell_shape(format!(
2997            "non-affine cells require finite bounds, got [{}, {}] with c2={:.6e}, c3={:.6e}",
2998            cell.left, cell.right, cell.c2, cell.c3
2999        ))
3000        .into());
3001    }
3002    let (k2, k3) = normalized_non_affine_coefficients(
3003        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3004    )?;
3005    if k2.abs() <= tol && k3.abs() <= tol {
3006        Ok(ExactCellBranch::Affine)
3007    } else if k3.abs() <= tol {
3008        Ok(ExactCellBranch::Quartic)
3009    } else {
3010        Ok(ExactCellBranch::Sextic)
3011    }
3012}
3013
3014#[inline]
3015fn degenerate_sextic_branch(
3016    cell: DenestedCubicCell,
3017    lead: f64,
3018) -> Result<Option<ExactCellBranch>, String> {
3019    // The sextic recurrence divides by `lead = 3*c3^2`. When that division is
3020    // unstable, lower the polynomial degree without discarding a material
3021    // quadratic coefficient.
3022    let (normalized_k2, normalized_k3) = normalized_non_affine_coefficients(
3023        cell.left, cell.right, cell.c0, cell.c1, cell.c2, cell.c3,
3024    )?;
3025    if normalized_k3.abs() > NORMALIZED_CELL_BRANCH_TOL && lead.abs() > 1e-18 {
3026        return Ok(None);
3027    }
3028    if normalized_k2.abs() > NORMALIZED_CELL_BRANCH_TOL {
3029        Ok(Some(ExactCellBranch::Quartic))
3030    } else {
3031        Ok(Some(ExactCellBranch::Affine))
3032    }
3033}
3034
3035#[inline]
3036fn validate_bvn_args(h: f64, k: f64, rho: f64) -> Result<(), String> {
3037    if !h.is_finite() && !h.is_infinite() {
3038        return Err(CubicCellKernelError::bivariate_normal_domain(
3039            "bivariate normal cdf requires finite or infinite h",
3040        )
3041        .into());
3042    }
3043    if !k.is_finite() && !k.is_infinite() {
3044        return Err(CubicCellKernelError::bivariate_normal_domain(
3045            "bivariate normal cdf requires finite or infinite k",
3046        )
3047        .into());
3048    }
3049    if !rho.is_finite() {
3050        return Err(CubicCellKernelError::bivariate_normal_domain(format!(
3051            "bivariate normal cdf requires finite correlation, got {rho}"
3052        ))
3053        .into());
3054    }
3055    Ok::<(), _>(())
3056}
3057
3058#[inline]
3059fn bvn_gl_sum(h: f64, k: f64, rho_clamped: f64, asr: f64) -> f64 {
3060    // The Drezner-Wesolowsky arcsin representation is integrated with the
3061    // same 20-point Gauss-Legendre rule as before, but mirrored node pairs are
3062    // evaluated with one sin_cos for the half-angle offset rather than two
3063    // independent sin calls.  This preserves the quadrature rule (and hence
3064    // the accuracy envelope) while reducing the transcendental work in the
3065    // dominant finite-bound path from 20 sin calls to 11 sin/cos evaluations.
3066    if rho_clamped == 0.0 {
3067        return 0.0;
3068    }
3069    let hs = 0.5 * (h * h + k * k);
3070    let hk = h * k;
3071    let half_asr = 0.5 * asr;
3072    let (sin_mid, cos_mid) = half_asr.sin_cos();
3073    let mut sum = 0.0;
3074    for i in 0..10 {
3075        let node = GL20_NODES[i].abs();
3076        let weight = GL20_WEIGHTS[i];
3077        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3078
3079        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3080        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3081        let expo_lo = ((sn_lo * hk) - hs) / one_minus_lo;
3082
3083        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3084        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3085        let expo_hi = ((sn_hi * hk) - hs) / one_minus_hi;
3086
3087        sum += weight * (expo_lo.exp() + expo_hi.exp());
3088    }
3089    sum
3090}
3091
3092pub fn bivariate_normal_cdf(h: f64, k: f64, rho: f64) -> Result<f64, String> {
3093    validate_bvn_args(h, k, rho)?;
3094    if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
3095        return Ok(0.0);
3096    }
3097    if h == f64::INFINITY {
3098        return Ok(normal_cdf(k));
3099    }
3100    if k == f64::INFINITY {
3101        return Ok(normal_cdf(h));
3102    }
3103
3104    let rho_clamped = rho.clamp(-1.0, 1.0);
3105    if rho_clamped >= 1.0 - 1e-12 {
3106        return Ok(normal_cdf(h.min(k)));
3107    }
3108    if rho_clamped <= -1.0 + 1e-12 {
3109        return Ok((normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0));
3110    }
3111    if rho_clamped == 0.0 {
3112        return Ok((normal_cdf(h) * normal_cdf(k)).clamp(0.0, 1.0));
3113    }
3114    if h == 0.0 && k == 0.0 {
3115        return Ok((0.25 + rho_clamped.asin() / std::f64::consts::TAU).clamp(0.0, 1.0));
3116    }
3117
3118    let asr = rho_clamped.asin();
3119    let sum = bvn_gl_sum(h, k, rho_clamped, asr);
3120    Ok((normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3121}
3122
3123#[inline]
3124fn bvn_gl_sum_interval(h: f64, left: f64, right: f64, rho_clamped: f64, asr: f64) -> f64 {
3125    if rho_clamped == 0.0 {
3126        return 0.0;
3127    }
3128    let h2 = h * h;
3129    let right_hs = 0.5 * (h2 + right * right);
3130    let left_hs = 0.5 * (h2 + left * left);
3131    let half_asr = 0.5 * asr;
3132    let (sin_mid, cos_mid) = half_asr.sin_cos();
3133    let mut sum = 0.0;
3134    for i in 0..10 {
3135        let node = GL20_NODES[i].abs();
3136        let weight = GL20_WEIGHTS[i];
3137        let (sin_delta, cos_delta) = (half_asr * node).sin_cos();
3138
3139        let sn_lo = sin_mid * cos_delta - cos_mid * sin_delta;
3140        let one_minus_lo = 1.0 - sn_lo * sn_lo;
3141        let lo_right = (((sn_lo * h * right) - right_hs) / one_minus_lo).exp();
3142        let lo_left = (((sn_lo * h * left) - left_hs) / one_minus_lo).exp();
3143
3144        let sn_hi = sin_mid * cos_delta + cos_mid * sin_delta;
3145        let one_minus_hi = 1.0 - sn_hi * sn_hi;
3146        let hi_right = (((sn_hi * h * right) - right_hs) / one_minus_hi).exp();
3147        let hi_left = (((sn_hi * h * left) - left_hs) / one_minus_hi).exp();
3148
3149        sum += weight * ((lo_right - lo_left) + (hi_right - hi_left));
3150    }
3151    sum
3152}
3153
3154fn bivariate_normal_cdf_interval(h: f64, left: f64, right: f64, rho: f64) -> Result<f64, String> {
3155    if right <= left {
3156        return Ok(0.0);
3157    }
3158    if left == f64::NEG_INFINITY && right == f64::INFINITY {
3159        return Ok(normal_cdf(h));
3160    }
3161    if !left.is_finite() || !right.is_finite() {
3162        let upper = bivariate_normal_cdf(h, right, rho)?;
3163        let lower = bivariate_normal_cdf(h, left, rho)?;
3164        return Ok((upper - lower).clamp(0.0, 1.0));
3165    }
3166    validate_bvn_args(h, left, rho)?;
3167    validate_bvn_args(h, right, rho)?;
3168    if h == f64::NEG_INFINITY {
3169        return Ok(0.0);
3170    }
3171    if h == f64::INFINITY {
3172        return Ok((normal_cdf(right) - normal_cdf(left)).clamp(0.0, 1.0));
3173    }
3174
3175    let rho_clamped = rho.clamp(-1.0, 1.0);
3176    if rho_clamped >= 1.0 - 1e-12 || rho_clamped <= -1.0 + 1e-12 {
3177        let upper = bivariate_normal_cdf(h, right, rho_clamped)?;
3178        let lower = bivariate_normal_cdf(h, left, rho_clamped)?;
3179        return Ok((upper - lower).clamp(0.0, 1.0));
3180    }
3181
3182    let cdf_h = normal_cdf(h);
3183    let normal_part = cdf_h * (normal_cdf(right) - normal_cdf(left));
3184    if rho_clamped == 0.0 {
3185        return Ok(normal_part.clamp(0.0, 1.0));
3186    }
3187    let asr = rho_clamped.asin();
3188    let sum = bvn_gl_sum_interval(h, left, right, rho_clamped, asr);
3189    Ok((normal_part + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0))
3190}
3191
/// Unnormalized Gaussian factor `exp(-x²/2)`, with `0.0` returned
/// explicitly for `x = ±∞`.
fn exp_neg_half_square(x: f64) -> f64 {
    match x.is_infinite() {
        true => 0.0,
        false => (-0.5 * x * x).exp(),
    }
}
3199
3200/// Zeroth truncated standard-normal moment `T_0(a, b) = ∫_a^b e^(−z²/2) dz
3201/// = √(2π)·(Φ(b) − Φ(a))`, evaluated without catastrophic cancellation in
3202/// either tail.
3203///
3204/// Writing `T_0 = √(π/2)·[erf(b/√2) − erf(a/√2)]`, the naive form collapses
3205/// to `0.0` whenever both endpoints lie in the *same* far tail: `erf`
3206/// saturates at the IEEE-754 values `±1.0` for `|x| ≳ 8.3·√2`, so the
3207/// difference of two saturated values is exactly zero even though the
3208/// integral is a strictly positive number well inside the f64 normal range
3209/// (e.g. `∫_{-12}^{-10} ≈ 1.9e-23`). The fix is to reduce the erf difference
3210/// to complementary tail probabilities — `erfc` is evaluated with a dedicated
3211/// tail series, *not* as `1 − erf` — and to pick, by the sign of the
3212/// endpoints, the algebraically-equivalent form whose terms do not cancel
3213/// against one another:
3214///
3215/// ```text
3216/// both ≥ 0 (upper tail):  erf(b/√2) − erf(a/√2) = erfc(a/√2) − erfc(b/√2)
3217/// both ≤ 0 (lower tail):  erf(b/√2) − erf(a/√2) = erfc(−b/√2) − erfc(−a/√2)
3218/// straddling zero:        erf(b/√2) − erf(a/√2) = 2 − erfc(b/√2) − erfc(−a/√2)
3219/// ```
3220///
3221/// In each branch every `erfc` argument is `≥ 0`, so the terms are small
3222/// positive tail values (or an O(1) constant minus two values `≤ 1`); no
3223/// large quantities cancel and full f64 precision survives down to the
3224/// underflow boundary in either tail. Infinite endpoints fall out via the
3225/// `erfc` limits (`erfc(+∞)=0`, `erfc(−∞)=2`) with no special casing.
3226///
3227/// Uses `libm::erfc` (msun double-precision implementation, ≤ 1 ulp) rather
3228/// than `statrs::function::erf::erfc` (a 6-term rational approximation that
3229/// carries ~3·10⁻¹¹ relative error around `|x| ≈ 1/√2` — see the existing
3230/// `libm::erfc` consumer at `inference::polya_gamma_core::normal_cdf`). That
3231/// statrs error propagates directly into `T_0`, then through every higher
3232/// moment `T_n` (the recurrence `T_n = a^{n-1}e^{-a²/2} − b^{n-1}e^{-b²/2}
3233/// + (n-1)·T_{n-2}` walks `T_0` up two steps at a time), then through every
3234/// affine-cell moment via `affine_anchor_moment_vector` (whose `out[n]` is a
3235/// linear combination of `T_0..=T_n`), and is the dominant source of error
3236/// in the affine-cell branch of the cubic-cell substrate (CPU/GPU parity
3237/// reference for transformation-normal, bernoulli-marginal-slope, and the
3238/// BMS flex-row higher-derivative reuse path).
3239fn truncated_gaussian_zeroth_moment(a: f64, b: f64) -> f64 {
3240    let inv_sqrt2 = 1.0 / std::f64::consts::SQRT_2;
3241    let za = a * inv_sqrt2;
3242    let zb = b * inv_sqrt2;
3243    let erf_diff = if za >= 0.0 {
3244        libm::erfc(za) - libm::erfc(zb)
3245    } else if zb <= 0.0 {
3246        libm::erfc(-zb) - libm::erfc(-za)
3247    } else {
3248        2.0 - libm::erfc(zb) - libm::erfc(-za)
3249    };
3250    // √(2π)·½ = √(π/2).
3251    (std::f64::consts::PI / 2.0).sqrt() * erf_diff
3252}
3253
3254/// Fill `out[0..=max_degree]` with the raw truncated standard-normal moments
3255///
3256/// ```text
3257/// T_n(a, b) = ∫_a^b z^n exp(-z²/2) dz
3258/// ```
3259///
3260/// using the integration-by-parts recurrence
3261///
3262/// ```text
3263/// T_0(a, b) = √(2π) (Φ(b) − Φ(a))
3264/// T_1(a, b) = exp(−a²/2) − exp(−b²/2)
3265/// T_n(a, b) = a^(n−1) e^{−a²/2} − b^(n−1) e^{−b²/2} + (n−1) T_{n−2}(a, b)
3266/// ```
3267///
3268/// Computed in one forward sweep so each call evaluates `erf` and
3269/// `exp(−x²/2)` exactly twice (once at `a`, once at `b`) regardless of the
3270/// requested degree. The naive form — calling `T_n` recursively for each
3271/// `n = 0..=max_degree` — re-evaluated `erf`/`exp` about `max_degree²/4`
3272/// times per affine cell, which dominated the wall time of the
3273/// transformation-normal and bernoulli-marginal-slope inner solves with
3274/// `max_degree = 64` (the transport order's required degree budget).
3275fn fill_truncated_gaussian_moments(a: f64, b: f64, out: &mut [f64]) {
3276    if out.is_empty() {
3277        return;
3278    }
3279    out[0] = truncated_gaussian_zeroth_moment(a, b);
3280    if out.len() == 1 {
3281        return;
3282    }
3283    let ea = exp_neg_half_square(a);
3284    let eb = exp_neg_half_square(b);
3285    out[1] = ea - eb;
3286    if out.len() == 2 {
3287        return;
3288    }
3289    let a_finite = a.is_finite();
3290    let b_finite = b.is_finite();
3291    // For n in 2..=max_degree we need a^{n-1} e^{-a²/2} (resp. b). Carry the
3292    // running powers a^{n-1}, b^{n-1} forward by a single multiply per step.
3293    // Infinite endpoints contribute 0 (the integrand decays at the rate of
3294    // exp(−x²/2)), matching the prior `is_infinite` branch in the recursive
3295    // implementation; we still update the running power so the iteration
3296    // stays branchless when both endpoints are finite.
3297    let mut a_pow_n_minus_1 = a; // a^1, used at n = 2
3298    let mut b_pow_n_minus_1 = b;
3299    for n in 2..out.len() {
3300        let left = if a_finite { a_pow_n_minus_1 * ea } else { 0.0 };
3301        let right = if b_finite { b_pow_n_minus_1 * eb } else { 0.0 };
3302        out[n] = left - right + (n as f64 - 1.0) * out[n - 2];
3303        a_pow_n_minus_1 *= a;
3304        b_pow_n_minus_1 *= b;
3305    }
3306}
3307
/// Stack-array bound for `affine_anchor_moment_vector_into`. Public callers
/// use up to ~24 (largest is the bernoulli-margslope outer-step degree-21
/// reduction); 64 leaves comfortable headroom without growing the per-call
/// stack footprint meaningfully.
///
/// `affine_anchor_moment_vector_into` sizes its scratch arrays as
/// `MAX_AFFINE_ANCHOR_DEGREE + 1` and asserts `max_degree` does not exceed
/// this bound, so a larger request panics rather than truncating.
const MAX_AFFINE_ANCHOR_DEGREE: usize = 64;
3313
3314pub fn affine_anchor_moment_vector(
3315    alpha: f64,
3316    beta: f64,
3317    left: f64,
3318    right: f64,
3319    max_degree: usize,
3320) -> Vec<f64> {
3321    let mut out = vec![0.0; max_degree + 1];
3322    affine_anchor_moment_vector_into(alpha, beta, left, right, max_degree, &mut out);
3323    out
3324}
3325
3326fn affine_anchor_moment_vector_into(
3327    alpha: f64,
3328    beta: f64,
3329    left: f64,
3330    right: f64,
3331    max_degree: usize,
3332    out: &mut [f64],
3333) {
3334    assert_eq!(out.len(), max_degree + 1);
3335    let s = (1.0 + beta * beta).sqrt();
3336    let mu = -alpha * beta / (1.0 + beta * beta);
3337    let y_left = if left.is_infinite() {
3338        if left.is_sign_positive() {
3339            f64::INFINITY
3340        } else {
3341            f64::NEG_INFINITY
3342        }
3343    } else {
3344        s * (left - mu)
3345    };
3346    let y_right = if right.is_infinite() {
3347        if right.is_sign_positive() {
3348            f64::INFINITY
3349        } else {
3350            f64::NEG_INFINITY
3351        }
3352    } else {
3353        s * (right - mu)
3354    };
3355    let anchor = (-alpha * alpha / (2.0 * s * s)).exp() / s;
3356    assert!(
3357        max_degree <= MAX_AFFINE_ANCHOR_DEGREE,
3358        "affine_anchor_moment_vector max_degree {} exceeds compile-time bound {}",
3359        max_degree,
3360        MAX_AFFINE_ANCHOR_DEGREE
3361    );
3362    let mut t = [0.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3363    fill_truncated_gaussian_moments(y_left, y_right, &mut t[..=max_degree]);
3364    // Build mu^k and s^{-k} tables once. The inner sum is the binomial
3365    // expansion of the affine change-of-variables, and computing the
3366    // binomial coefficient via Pascal's row recurrence + carrying mu/s
3367    // powers eliminates the per-(n, k) `powi` and binomial calls that
3368    // otherwise dominated the inner loop at large `max_degree`.
3369    let mut mu_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3370    for k in 1..=max_degree {
3371        mu_pow[k] = mu_pow[k - 1] * mu;
3372    }
3373    let inv_s = 1.0 / s;
3374    let mut inv_s_pow = [1.0_f64; MAX_AFFINE_ANCHOR_DEGREE + 1];
3375    for k in 1..=max_degree {
3376        inv_s_pow[k] = inv_s_pow[k - 1] * inv_s;
3377    }
3378    out.fill(0.0);
3379    for n in 0..=max_degree {
3380        let mut acc = 0.0;
3381        // C(n, k+1) = C(n, k) · (n − k) / (k + 1).
3382        let mut binom = 1.0;
3383        for k in 0..=n {
3384            let term = binom * mu_pow[n - k] * inv_s_pow[k];
3385            acc = term.mul_add(t[k], acc);
3386            if k < n {
3387                binom = binom * (n - k) as f64 / (k + 1) as f64;
3388            }
3389        }
3390        out[n] = anchor * acc;
3391    }
3392}
3393
3394fn affine_value_from_moment_primitive(alpha: f64, beta: f64, left: f64, right: f64) -> f64 {
3395    // Exact formula via bivariate normal CDF.
3396    //
3397    // V(α,β,l,r) = ∫_l^r Φ(α+βz)φ(z)dz
3398    //            = P(U ≤ α+βZ, l ≤ Z ≤ r)    where U,Z iid N(0,1)
3399    //            = Φ₂(h, r; ρ) − Φ₂(h, l; ρ)
3400    //
3401    // with h = α/√(1+β²) and ρ = −β/√(1+β²).
3402    //
3403    // This is exact to floating-point precision via the high-accuracy
3404    // Drezner-Wesolowsky BVN routine, replacing the previous fixed 20-point
3405    // Gauss-Legendre numerical integration of the derivative primitive.
3406    let s = (1.0 + beta * beta).sqrt();
3407    let h = alpha / s;
3408    let rho = -beta / s;
3409    bivariate_normal_cdf_interval(h, left, right, rho).unwrap_or(0.0)
3410}
3411
3412/// Evaluate an affine cell (c2=c3=0) with a value/moment-consistent primitive.
3413///
3414/// Value and moments are now generated from the same affine moment primitive.
3415/// The zero-moment derivative is exact, and `value` is reconstructed by
3416/// integrating `d value / d alpha = INV_TWO_PI * moments[0]` over `alpha`
3417/// on a transformed semi-infinite domain.
3418pub fn evaluate_affine_cell_state(
3419    cell: DenestedCubicCell,
3420    max_degree: usize,
3421) -> Result<CellMomentState, String> {
3422    let alpha = cell.c0;
3423    let beta = cell.c1;
3424    let value = affine_value_from_moment_primitive(alpha, beta, cell.left, cell.right);
3425    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3426    Ok(CellMomentState {
3427        branch: ExactCellBranch::Affine,
3428        value,
3429        moments: moments.into(),
3430    })
3431}
3432
3433fn evaluate_affine_cell_derivative_state(
3434    cell: DenestedCubicCell,
3435    max_degree: usize,
3436) -> Result<CellDerivativeMomentState, String> {
3437    let alpha = cell.c0;
3438    let beta = cell.c1;
3439    let moments = affine_anchor_moment_vector(alpha, beta, cell.left, cell.right, max_degree);
3440    Ok(CellDerivativeMomentState {
3441        branch: ExactCellBranch::Affine,
3442        moments: moments.into(),
3443    })
3444}
3445
/// Add `mw * z^k` to `moments[k]` for every `k` in `0..moments.len()`.
///
/// The "unrolled4" in the name is historical: this is the plain scalar
/// accumulator that the SIMD outer loop calls once per lane. Moment counts
/// are small (max_degree + 1 <= ~10), so an explicit 4-way unroll does not
/// measurably beat the iterator path; the kernel's runtime is dominated by
/// the wide::f64x4::exp savings in the SIMD outer loop.
#[inline]
fn accumulate_moments_unrolled4(moments: &mut [f64], mw: f64, z: f64) {
    // Thread the running power z^k through the fold; each slot gets a single
    // fused multiply-add.
    let _ = moments.iter_mut().fold(1.0_f64, |power, slot| {
        *slot = mw.mul_add(power, *slot);
        power * z
    });
}
3460
// Shared SIMD Gauss-Legendre core for non-affine cells. The const generic
// `COMPUTE_VALUE` selects whether the cell value integral
// `∫ φ(η(z)) · exp(-½z²) dz / √(2π)` is accumulated alongside the moments.
// Monomorphization collapses the const-generic branches at compile time, so
// `COMPUTE_VALUE = false` emits the moment-only path verbatim.
//
// Inputs: `cell` supplies the bounds `[left, right]` and the cubic
// coefficients `c0..c3` of η(z); `max_degree` fixes the moment vector length
// at `max_degree + 1`; `gl_nodes`/`gl_weights` are a Gauss-Legendre rule on
// `[-1, 1]` (the function panics if the two slices differ in length).
//
// Returns `(moments, value)`, both already multiplied by the cell half-width.
// `moments[k]` is the quadrature estimate of `∫ z^k exp(-q(z)) dz` with
// `q = ½(z² + η(z)²)`. `value` is the quadrature estimate of
// `∫ exp(-½z²)·Φ(η(z)) dz` (no `1/√(2π)` applied here) when `COMPUTE_VALUE`
// is true, and `0.0` otherwise.
//
// Single source of truth for the moment SIMD lane ordering, the Horner-with-FMA
// pattern for η(z), the `0.5 * (z² + η²)` quadratic-form evaluation order, the
// unscaled per-node GL moment weights, the post-loop half-width fold, and the
// per-lane `accumulate_moments_unrolled4` call. The previous duplicated code paths
// drifted by 1 ULP whenever any of these details diverged; here both paths
// share the same instructions, eliminating an entire class of regressions
// where a tweak to the quadrature order or the FMA pattern would silently
// re-introduce divergence between the value- and derivative-only callers.
//
// Gauss-Legendre on [left, right] converges geometrically for the analytic
// integrand exp(-q(z)) with quartic/sextic q on a bounded cell; the prior
// adaptive transport path expanded basis_moments via the forward 3-/5-step
// recurrences in reduce_quartic/sextic_moments, which amplify roundoff by
// (1/lead)^n with lead = 2c2²/3c3² and overflow to NaN for small c2/c3 cells
// that arise naturally in production.
//
// The fixed 384-node rule that replaced the transport path is accurate but
// pays ~384 exp evaluations per cell unconditionally. Production cells are
// narrow spline-knot subdivisions where a 12- or 24-node rule is already
// converged to machine precision, and the flex marginal-slope row calculus
// evaluates O(100) such cells per row across n=10⁵–10⁶ rows per criterion
// evaluation — the fixed rule was the dominant cost of the whole fit (#979).
// `evaluate_non_affine_cell_simd` therefore walks a progressive ladder of
// rules (12, 24, 48, 96, 192, 384 nodes) and returns as soon as two
// consecutive rules agree to `NON_AFFINE_LADDER_RTOL` relative to the moment
// vector's own scale. Unlike the old fixed rule — whose error was real but
// uncertified — every accepted ladder result carries an embedded two-rule
// agreement certificate; a cell that never certifies falls through to the
// same 384-node answer the fixed rule produced.
//
// SIMD path: process 4 GL nodes per outer iteration, evaluating `exp(-q)`
// for all four lanes with one `wide::f64x4::exp` call. All ladder rule sizes
// are divisible by 4, so the scalar tail loop below is never entered for
// them; it exists so a rule whose length is not a multiple of 4 is still
// integrated completely. The inner moment accumulation runs scalar per lane
// through `accumulate_moments_unrolled4`, which (despite its historical
// name) is a plain sequential power accumulator.
#[inline(always)]
fn evaluate_non_affine_cell_with_rule<const COMPUTE_VALUE: bool>(
    cell: DenestedCubicCell,
    max_degree: usize,
    gl_nodes: &[f64],
    gl_weights: &[f64],
) -> (CellMomentVec, f64) {
    let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
    let mut value_integral = 0.0_f64;
    // Affine map from the reference interval [-1, 1] onto [left, right].
    let center = 0.5 * (cell.left + cell.right);
    let half_width = 0.5 * (cell.right - cell.left);
    let c0 = cell.c0;
    let c1 = cell.c1;
    let c2 = cell.c2;
    let c3 = cell.c3;
    let moments_slice: &mut [f64] = &mut moments;
    // Nodes and weights are consumed pairwise; a length mismatch is a caller bug.
    assert_eq!(gl_nodes.len(), gl_weights.len());
    use wide::f64x4;
    let center_v = f64x4::splat(center);
    let half_width_v = f64x4::splat(half_width);
    let c0_v = f64x4::splat(c0);
    let c1_v = f64x4::splat(c1);
    let c2_v = f64x4::splat(c2);
    let c3_v = f64x4::splat(c3);
    let neg_half_v = f64x4::splat(-0.5);
    let n_total = gl_nodes.len();
    // Largest multiple of 4 not exceeding the node count.
    let n_simd = n_total - (n_total % 4);
    let mut i = 0;
    while i < n_simd {
        let node_v = f64x4::from([
            gl_nodes[i],
            gl_nodes[i + 1],
            gl_nodes[i + 2],
            gl_nodes[i + 3],
        ]);
        let weight_v = f64x4::from([
            gl_weights[i],
            gl_weights[i + 1],
            gl_weights[i + 2],
            gl_weights[i + 3],
        ]);
        // Fused node map for the moment path: z = half_width * node + center.
        let z_v = half_width_v.mul_add(node_v, center_v);
        // Horner: ((c3*z + c2)*z + c1)*z + c0
        let eta_v = c3_v
            .mul_add(z_v, c2_v)
            .mul_add(z_v, c1_v)
            .mul_add(z_v, c0_v);
        let z2_v = z_v * z_v;
        // -q(z) = -½ (z² + η²), exponentiated four lanes at a time.
        let neg_q_v = neg_half_v * (z2_v + eta_v * eta_v);
        let exp_negq_v = neg_q_v.exp();
        let moment_weight_v = weight_v * exp_negq_v;
        let z_arr = z_v.to_array();
        let mw_arr = moment_weight_v.to_array();
        if COMPUTE_VALUE {
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
                // The value integrand carries Φ(η)'s erfc, whose systematic
                // per-z error is ~1e-13. To honor the cell-value accuracy
                // contract the value term must be assembled bit-for-bit like
                // the scalar reference: a non-fused node map
                // `z_ref = center + half_width·node`, the expanded
                // `η = c0 + c1·z + c2·z² + c3·z³` (NOT the SIMD Horner-FMA used
                // for the moments), the unscaled GL weight, a scalar `exp(-½z²)`,
                // and a plain `+=`. The SIMD `z_v`/`eta_v` above (fused) feed
                // ONLY the moments and are left untouched. Any single ULP slip
                // here (FMA node map, Horner η, per-term half_width, SIMD exp,
                // FMA accumulation) drifts the 384-node sum by ~1.4e-13 and
                // breaks the contract.
                let node = gl_nodes[i + lane];
                let weight = gl_weights[i + lane];
                let z_ref = center + half_width * node;
                let eta_ref = c0 + c1 * z_ref + c2 * z_ref * z_ref + c3 * z_ref * z_ref * z_ref;
                value_integral += weight * (-0.5 * z_ref * z_ref).exp() * normal_cdf(eta_ref);
            }
        } else {
            // Moment-only instantiation: same per-lane accumulation, no value term.
            for lane in 0..4 {
                let z = z_arr[lane];
                let mw = mw_arr[lane];
                accumulate_moments_unrolled4(moments_slice, mw, z);
            }
        }
        i += 4;
    }
    // Scalar tail for the `n_total % 4` leftover nodes (empty for every ladder
    // rule). Note the node map here is non-fused while η still uses Horner-FMA.
    while i < n_total {
        let node = gl_nodes[i];
        let weight = gl_weights[i];
        let z = center + half_width * node;
        let eta = c3.mul_add(z, c2).mul_add(z, c1).mul_add(z, c0);
        let q = 0.5 * (z * z + eta * eta);
        let moment_weight = weight * (-q).exp();
        accumulate_moments_unrolled4(moments_slice, moment_weight, z);
        if COMPUTE_VALUE {
            // Bit-for-bit the reference value structure (see SIMD branch): the
            // node map `z = center + half_width·node` here already matches the
            // reference (non-fused), but η must use the expanded reference form
            // rather than the moment path's Horner-FMA.
            let eta_ref = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
            value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta_ref);
        }
        i += 1;
    }
    // Apply the cell half-width to both moment and value integrals ONCE at the
    // end, mirroring the prefold reference. Folding half_width per-term changes
    // f64 rounding enough to show up at the 1e-13 contract.
    for moment in moments_slice.iter_mut() {
        *moment *= half_width;
    }
    // With COMPUTE_VALUE = false the accumulator was never touched, so this
    // returns the initial 0.0.
    let value = if COMPUTE_VALUE {
        value_integral * half_width
    } else {
        value_integral
    };
    (moments, value)
}
3619
/// Relative agreement threshold for the progressive non-affine quadrature
/// ladder: two consecutive Gauss-Legendre rules must agree on every moment
/// slot to this tolerance, relative to the moment vector's own max magnitude,
/// before the finer rule's result is accepted (see
/// `non_affine_ladder_converged`, which scales by the finer rule's largest
/// absolute moment). Gauss-Legendre error decays geometrically in the node
/// count for the analytic integrand `exp(-q(z))`, so agreement between an
/// n-node and a 2n-node rule certifies that both are converged: the coarse
/// rule's true error is bounded by the observed difference plus the (much
/// smaller) fine-rule error.
///
/// History (#979): a roundoff-floor relaxation of this test (accept when
/// successive rungs agree to `≈ n·ε·scale` rather than the bare `3e-15`) was
/// tried to let smooth cells certify below the terminal 384-node rung. It was
/// reverted: the value-bearing path carries `∫ φ(z)·Φ(η(z)) dz`, and `Φ`'s
/// `erfc` implementation has a *systematic per-z* error of order `1e-13` that
/// each rung's node set samples differently. Only the exact 384-node rule
/// reproduces the reference's erfc-noise realization, so any sub-384 rung
/// drifts from the 384 value by `≈ 1e-13` — a drift that is NOT truncation,
/// does NOT shrink with rung, and is NOT bounded by rung-to-rung agreement.
/// The moment ladder remains independent of the value integral so value- and
/// derivative-only evaluators keep returning bit-identical moments. The scalar
/// value now evaluates on the terminal 384-node rule directly, preserving the
/// `non_affine_cell_state_matches_prefold_reference_to_1e_minus_13` value
/// contract without forcing every derivative-moment caller to use the terminal
/// rung.
///
/// NOTE(review): the history paragraph quotes a bare threshold of `3e-15`,
/// while the constant below is `1e-15` — confirm which value the note should
/// cite.
const NON_AFFINE_LADDER_RTOL: f64 = 1e-15;
3646
/// Node counts of the progressive ladder below the 384-node terminal rung,
/// in the order they are tried. All divisible by 4 so the SIMD sweep needs no
/// scalar tail.
///
/// The position of a rung in this array is also its slot in
/// `NON_AFFINE_LADDER_CERT_COUNTS`. The first rung has no coarser predecessor
/// to be compared against, so it only seeds the comparison and can never be
/// the certifying rung.
const NON_AFFINE_LADDER_RUNGS: [usize; 5] = [12, 24, 48, 96, 192];
3650
3651/// Runtime-generated Gauss-Legendre rules for the ladder rungs, computed
3652/// once per process by Newton iteration on the Legendre polynomial roots
3653/// (standard `gauleg`: cosine initial guess, 3-4 Newton steps to machine
3654/// precision). The terminal 384-node rung reuses the compile-time
3655/// `GL_NODES`/`GL_WEIGHTS` tables, which also remain the single source for
3656/// the GPU kernel.
3657fn non_affine_ladder_rules() -> &'static [(Vec<f64>, Vec<f64>)] {
3658    static RULES: std::sync::OnceLock<Vec<(Vec<f64>, Vec<f64>)>> = std::sync::OnceLock::new();
3659    RULES.get_or_init(|| {
3660        NON_AFFINE_LADDER_RUNGS
3661            .iter()
3662            .map(|&n| gauss_legendre_rule(n))
3663            .collect()
3664    })
3665}
3666
/// Nodes and weights of the `n`-point Gauss-Legendre rule on `[-1, 1]`.
///
/// Each root of the Legendre polynomial `P_n` is found by Newton iteration
/// from the cosine initial guess `cos(π(i + 0.75)/(n + 0.5))`, capped at 100
/// steps and stopped early once a step moves the root by at most
/// `f64::EPSILON`. Weights follow from `w_i = 2 / ((1 - x_i²) P_n'(x_i)²)`,
/// using the derivative from the last Newton step.
///
/// Only the first `ceil(n / 2)` roots are solved; each is written as `-x` at
/// index `i` and `+x` at the mirrored index `n - 1 - i`, so nodes come out in
/// ascending order and the rule is exactly antisymmetric about 0. For odd `n`
/// both writes hit the middle slot and the `+x` value is the one kept.
/// `n = 0` yields two empty vectors.
fn gauss_legendre_rule(n: usize) -> (Vec<f64>, Vec<f64>) {
    let mut nodes = vec![0.0_f64; n];
    let mut weights = vec![0.0_f64; n];

    // Three-term Legendre recurrence; returns `(P_n(x), P_{n-1}(x))`.
    let legendre_pair = |x: f64| -> (f64, f64) {
        (1..=n).fold((1.0_f64, 0.0_f64), |(current, previous), j| {
            let next = ((2 * j - 1) as f64 * x * current - (j - 1) as f64 * previous) / j as f64;
            (next, current)
        })
    };

    let solved_roots = n.div_ceil(2);
    for i in 0..solved_roots {
        let mut root = (std::f64::consts::PI * (i as f64 + 0.75) / (n as f64 + 0.5)).cos();
        let mut derivative = 0.0_f64;
        for _ in 0..100 {
            let (p_n, p_n_minus_1) = legendre_pair(root);
            // P_n'(x) from the standard derivative identity.
            derivative = n as f64 * (root * p_n - p_n_minus_1) / (root * root - 1.0);
            let before = root;
            root = before - p_n / derivative;
            if (root - before).abs() <= f64::EPSILON {
                break;
            }
        }
        let mirror = n - 1 - i;
        let weight = 2.0 / ((1.0 - root * root) * derivative * derivative);
        nodes[i] = -root;
        nodes[mirror] = root;
        weights[i] = weight;
        weights[mirror] = weight;
    }
    (nodes, weights)
}
3703
3704/// Two-rule agreement certificate for the progressive ladder. `true` when
3705/// every MOMENT slot agrees to `NON_AFFINE_LADDER_RTOL` relative to the fine
3706/// result's max magnitude. Non-finite results never certify, so they fall
3707/// through to the terminal 384-node rung and reproduce the fixed rule's
3708/// behavior exactly.
3709///
3710/// The decision is deliberately moment-only and independent of whether the
3711/// caller also computed the cell value: the value- and derivative-only
3712/// evaluators MUST select the same ladder rung so they accumulate the moment
3713/// vector over the same nodes and return bit-identical moments (the
3714/// `derivative_moment_evaluator_matches_value_evaluator_moments` invariant).
3715/// Value-bearing callers evaluate the scalar cell probability separately on
3716/// the terminal 384-node rule; this certificate governs only the reusable
3717/// derivative moment vector.
3718fn non_affine_ladder_converged(coarse: &CellMomentVec, fine: &CellMomentVec) -> bool {
3719    let mut scale = 0.0_f64;
3720    let mut err = 0.0_f64;
3721    for (&c, &f) in coarse.iter().zip(fine.iter()) {
3722        scale = scale.max(f.abs());
3723        err = err.max((c - f).abs());
3724    }
3725    if !(scale.is_finite() && err.is_finite()) {
3726        return false;
3727    }
3728    err <= NON_AFFINE_LADDER_RTOL * scale
3729}
3730
3731/// Per-rung certification histogram for the non-affine ladder, indexed by the
3732/// rung that certified (`NON_AFFINE_LADDER_RUNGS[i]` at index `i`), with the
3733/// final slot counting cells that fell through to the terminal 384-node rule.
3734/// Incremented once per non-affine cell evaluation; the BMS exact-cache build
3735/// logs the distribution so the ladder's real cost (early-certify win vs.
3736/// terminal-fallthrough cost) is observable on every large-scale fit rather
3737/// than assumed. `+1` length for the terminal bucket.
3738pub(crate) static NON_AFFINE_LADDER_CERT_COUNTS: [AtomicU64; NON_AFFINE_LADDER_RUNGS.len() + 1] = [
3739    AtomicU64::new(0),
3740    AtomicU64::new(0),
3741    AtomicU64::new(0),
3742    AtomicU64::new(0),
3743    AtomicU64::new(0),
3744    AtomicU64::new(0),
3745];
3746
3747/// Snapshot the ladder certification histogram as `(rung_node_count, count)`
3748/// pairs plus the terminal-fallthrough count, for logging/inspection.
3749pub fn non_affine_ladder_cert_histogram() -> (Vec<(usize, u64)>, u64) {
3750    let per_rung = NON_AFFINE_LADDER_RUNGS
3751        .iter()
3752        .enumerate()
3753        .map(|(i, &n)| (n, NON_AFFINE_LADDER_CERT_COUNTS[i].load(Ordering::Relaxed)))
3754        .collect();
3755    let terminal =
3756        NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].load(Ordering::Relaxed);
3757    (per_rung, terminal)
3758}
3759
3760/// Progressive-ladder evaluation of a non-affine cell: walk the rule ladder
3761/// from 12 nodes upward and return the first result certified by two-rule
3762/// agreement; a cell that never certifies returns the terminal 384-node
3763/// result, byte-identical to the previous fixed-rule implementation.
3764#[inline]
3765fn evaluate_non_affine_cell_simd<const COMPUTE_VALUE: bool>(
3766    cell: DenestedCubicCell,
3767    max_degree: usize,
3768) -> (CellMomentVec, f64) {
3769    let mut prev: Option<(CellMomentVec, f64)> = None;
3770    for (i, (nodes, weights)) in non_affine_ladder_rules().iter().enumerate() {
3771        let cur =
3772            evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, nodes, weights);
3773        if let Some(prev) = prev.as_ref()
3774            && non_affine_ladder_converged(&prev.0, &cur.0)
3775        {
3776            NON_AFFINE_LADDER_CERT_COUNTS[i].fetch_add(1, Ordering::Relaxed);
3777            return cur;
3778        }
3779        prev = Some(cur);
3780    }
3781    NON_AFFINE_LADDER_CERT_COUNTS[NON_AFFINE_LADDER_RUNGS.len()].fetch_add(1, Ordering::Relaxed);
3782    evaluate_non_affine_cell_with_rule::<COMPUTE_VALUE>(cell, max_degree, &GL_NODES, &GL_WEIGHTS)
3783}
3784
3785/// Value-only evaluation of a non-affine cell on the terminal 384-node rule.
3786///
3787/// Returns the cell probability integral `∫ exp(-½z²)·Φ(η(z)) dz` (pre the
3788/// `1/√τ` normalization) computed bit-for-bit like the value branch of
3789/// [`evaluate_non_affine_cell_with_rule`]: the non-fused node map
3790/// `z = center + half_width·node`, the expanded (non-Horner)
3791/// `η = c0 + c1·z + c2·z² + c3·z³`, the unscaled GL weight, a scalar
3792/// `exp(-½z²)`, a plain `+=` in ascending node order, and a single trailing
3793/// `·half_width`. The terminal rule has 384 nodes (divisible by 4), so the
3794/// general kernel's value path never takes its scalar tail — this loop walks
3795/// the same nodes in the same order and therefore reproduces the reference
3796/// erfc-noise realization the `1e-13` value contract pins down.
3797///
3798/// Computing this through `evaluate_non_affine_cell_with_rule::<true>` at
3799/// `max_degree = 0` would additionally run the 4-wide SIMD `exp(-q)` moment
3800/// sweep and a moment accumulation on every node only to discard the moment
3801/// vector. The survival marginal-slope fit evaluates a value per non-affine
3802/// partition cell, so that discarded moment work is the dominant waste in the
3803/// per-cell pass; this evaluator does only the work the value needs.
3804fn evaluate_non_affine_cell_value_terminal(cell: DenestedCubicCell) -> f64 {
3805    let center = 0.5 * (cell.left + cell.right);
3806    let half_width = 0.5 * (cell.right - cell.left);
3807    let c0 = cell.c0;
3808    let c1 = cell.c1;
3809    let c2 = cell.c2;
3810    let c3 = cell.c3;
3811    let mut value_integral = 0.0_f64;
3812    for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
3813        let z = center + half_width * node;
3814        let eta = c0 + c1 * z + c2 * z * z + c3 * z * z * z;
3815        value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
3816    }
3817    value_integral * half_width
3818}
3819
3820fn evaluate_non_affine_cell_state(
3821    cell: DenestedCubicCell,
3822    branch: ExactCellBranch,
3823    max_degree: usize,
3824) -> Result<CellMomentState, String> {
3825    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3826    let value_integral = evaluate_non_affine_cell_value_terminal(cell);
3827    // Reference structure: `value_integral * half_width / sqrt(TAU)`. The
3828    // half_width factor is already applied inside the rule evaluator, so divide
3829    // by sqrt(TAU) here (a true division, NOT multiply-by-reciprocal) to
3830    // reproduce the reference's final rounding bit-for-bit.
3831    Ok(CellMomentState {
3832        branch,
3833        value: value_integral / (std::f64::consts::TAU).sqrt(),
3834        moments,
3835    })
3836}
3837
3838fn evaluate_non_affine_cell_derivative_state(
3839    cell: DenestedCubicCell,
3840    branch: ExactCellBranch,
3841    max_degree: usize,
3842) -> Result<CellDerivativeMomentState, String> {
3843    let (moments, _) = evaluate_non_affine_cell_simd::<false>(cell, max_degree);
3844    Ok(CellDerivativeMomentState { branch, moments })
3845}
3846
3847/// De-nested cubic cell evaluator.
3848///
3849/// Affine cells use the closed-form affine anchor; non-affine cells (Quartic
3850/// and Sextic branches) are evaluated in a single pass over a fixed
3851/// high-order Gauss-Legendre rule on `[left, right]`.
3852pub fn evaluate_cell_moments(
3853    cell: DenestedCubicCell,
3854    max_degree: usize,
3855) -> Result<CellMomentState, String> {
3856    if !TAIL_CELL_MOMENT_CACHE_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
3857        return evaluate_cell_moments_uncached(cell, max_degree);
3858    }
3859    tail_cell_moment_cache().evaluate(cell, max_degree)
3860}
3861
3862/// Evaluate cell moments without consulting the global affine-tail memo.
3863///
3864/// This is retained for regression tests and before/after microbenchmarks;
3865/// production callers should use [`evaluate_cell_moments`].
3866pub fn evaluate_cell_moments_uncached(
3867    cell: DenestedCubicCell,
3868    max_degree: usize,
3869) -> Result<CellMomentState, String> {
3870    evaluate_cell_state_dispatched(
3871        cell,
3872        max_degree,
3873        evaluate_affine_cell_state,
3874        evaluate_non_affine_cell_state,
3875    )
3876}
3877
3878/// Evaluate only the moment vector needed by derivative contractions.
3879///
3880/// This deliberately does not compute the cell probability value
3881/// `∫ φ(z) Φ(η(z)) dz`. Derivative contractions consume
3882/// `∫ z^k exp(-q(z)) dz` moments only, so keeping the value out of the return
3883/// type prevents this cheaper evaluator from satisfying value-bearing calls.
3884pub fn evaluate_cell_derivative_moments_uncached(
3885    cell: DenestedCubicCell,
3886    max_degree: usize,
3887) -> Result<CellDerivativeMomentState, String> {
3888    evaluate_cell_state_dispatched(
3889        cell,
3890        max_degree,
3891        evaluate_affine_cell_derivative_state,
3892        evaluate_non_affine_cell_derivative_state,
3893    )
3894}
3895
3896/// Shared branch dispatch for the value-bearing and derivative-only cell
3897/// evaluators. Both walk the same decision tree (semi-infinite tail → must
3898/// be affine; finite cell → branch-by-coefficients with the sextic
3899/// degenerate-lowering path), differing only in which pair of
3900/// `(affine, non_affine)` evaluator helpers to delegate to.  The two helpers
3901/// are passed as `fn` pointers so the dispatch monomorphizes per `S` and
3902/// keeps the existing pre-condition errors / unreachable branch handling
3903/// in lockstep across both evaluators.
3904fn evaluate_cell_state_dispatched<S>(
3905    cell: DenestedCubicCell,
3906    max_degree: usize,
3907    affine: fn(DenestedCubicCell, usize) -> Result<S, String>,
3908    non_affine: fn(DenestedCubicCell, ExactCellBranch, usize) -> Result<S, String>,
3909) -> Result<S, String> {
3910    let left_inf = !cell.left.is_finite();
3911    let right_inf = !cell.right.is_finite();
3912    if left_inf || right_inf {
3913        // Semi-infinite tail cells must be affine: the deviation saturates
3914        // to a constant outside support, so c2=c3=0.  Both the BVN CDF
3915        // and the truncated-Gaussian moment vector handle infinite bounds.
3916        if cell.c2.abs() > NORMALIZED_CELL_BRANCH_TOL || cell.c3.abs() > NORMALIZED_CELL_BRANCH_TOL
3917        {
3918            return Err(CubicCellKernelError::invalid_cell_shape(format!(
3919                "semi-infinite cell [{}, {}] must be affine (c2=c3=0), got c2={:.3e}, c3={:.3e}",
3920                cell.left, cell.right, cell.c2, cell.c3
3921            ))
3922            .into());
3923        }
3924        return affine(cell, max_degree);
3925    }
3926    if cell.right <= cell.left {
3927        return Err(CubicCellKernelError::invalid_cell_shape(format!(
3928            "finite cell must have left < right, got [{}, {}]",
3929            cell.left, cell.right
3930        ))
3931        .into());
3932    }
3933    let branch = branch_cell(cell)?;
3934    if branch == ExactCellBranch::Affine {
3935        return affine(cell, max_degree);
3936    }
3937    if branch == ExactCellBranch::Sextic {
3938        let lead = sextic_qprime_coefficients(cell.c0, cell.c1, cell.c2, cell.c3)[5];
3939        if !lead.is_finite() {
3940            return Err(CubicCellKernelError::invalid_cell_shape(format!(
3941                "sextic cell evaluation encountered non-finite leading coefficient: {lead:.3e}"
3942            ))
3943            .into());
3944        }
3945        if let Some(lower_branch) = degenerate_sextic_branch(cell, lead)? {
3946            return match lower_branch {
3947                ExactCellBranch::Quartic => non_affine(
3948                    DenestedCubicCell { c3: 0.0, ..cell },
3949                    ExactCellBranch::Quartic,
3950                    max_degree,
3951                ),
3952                ExactCellBranch::Affine => affine(
3953                    DenestedCubicCell {
3954                        c2: 0.0,
3955                        c3: 0.0,
3956                        ..cell
3957                    },
3958                    max_degree,
3959                ),
3960                ExactCellBranch::Sextic => Err(CubicCellKernelError::invalid_cell_shape(
3961                    "internal: degenerate_sextic_branch returned Sextic as a lowered branch",
3962                )
3963                .into()),
3964            };
3965        }
3966    }
3967    non_affine(cell, branch, max_degree)
3968}
3969
3970/// Evaluate a de-nested cubic cell through a fit-lifetime byte-limited LRU cache.
3971///
3972/// The fingerprint is an exact bit-cast of `(c0, c1, c2, c3, left, right)`, so
3973/// eviction and reuse cannot alias nearby-but-different cells.  A cached entry
3974/// computed to a higher degree may satisfy a lower-degree request by truncating
3975/// the moment vector, preserving the public [`evaluate_cell_moments`] contract.
3976pub fn evaluate_cell_moments_cached(
3977    cell: DenestedCubicCell,
3978    max_degree: usize,
3979    cache: &CellMomentLruCache,
3980    stats: Option<&CellMomentCacheStats>,
3981) -> Result<CellMomentState, String> {
3982    // Affine cells (every rigid-path cell and every tail cell) evaluate
3983    // through the closed-form anchor — cheaper than a single LRU probe. The
3984    // LRU exists only to amortize the EXPENSIVE non-affine transport across
3985    // recurring cells; at large n the row scalars `(a, b)` are unique per
3986    // row, so affine cells never recur and routing them through the sharded
3987    // mutex was pure cost (320k lock+insert+evict ops per gradient eval, ~0%
3988    // hit — the dominant cost of the rigid n=320k fit, #979). Bypass the
3989    // cache entirely for them.
3990    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
3991        if let Some(stats) = stats {
3992            stats.misses.fetch_add(1, Ordering::Relaxed);
3993        }
3994        return evaluate_cell_moments_uncached(cell, max_degree);
3995    }
3996    let key = CellFingerprint::new(cell);
3997    let existing_derivative = match cache.get(&key) {
3998        Some(cached) => {
3999            if let Some(state) = cached.state_for_degree(max_degree) {
4000                if let Some(stats) = stats {
4001                    stats.hits.fetch_add(1, Ordering::Relaxed);
4002                }
4003                return Ok(state);
4004            }
4005            // `cached.derivative_state` is `Option<Arc<_>>`; `.clone()` here
4006            // is the cheap refcount bump the audit-39 fix targets, not a
4007            // full moment-vector deep clone.
4008            cached.derivative_state.clone()
4009        }
4010        None => None,
4011    };
4012    if let Some(stats) = stats {
4013        stats.misses.fetch_add(1, Ordering::Relaxed);
4014    }
4015    let state = evaluate_cell_moments(cell, max_degree)?;
4016    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4017    // through `Arc::clone`, and return the underlying value by unwrapping the
4018    // unique-reference (caller-side) `Arc`. This replaces the prior
4019    // `state.clone()` deep copy at the insert site.
4020    let shared = Arc::new(state);
4021    let mut entry = CachedCellMoments::new(Arc::clone(&shared));
4022    if let Some(derivative) = existing_derivative {
4023        entry = entry.with_derivative(derivative);
4024    }
4025    cache.insert(key, entry);
4026    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4027}
4028
4029/// Derivative-moment counterpart to [`evaluate_cell_moments_cached`]. Shares
4030/// the value-moment LRU by storing both moment kinds in a single
4031/// [`CachedCellMoments`] entry keyed on the cell fingerprint — derivative
4032/// insertions preserve any pre-existing value state and vice versa, so the
4033/// two callers never evict each other's work.
4034pub fn evaluate_cell_derivative_moments_cached(
4035    cell: DenestedCubicCell,
4036    max_degree: usize,
4037    cache: &CellMomentLruCache,
4038    stats: Option<&CellMomentCacheStats>,
4039) -> Result<CellDerivativeMomentState, String> {
4040    // Affine cells bypass the LRU — see `evaluate_cell_moments_cached` for
4041    // why the sharded-mutex memo is pure overhead on the closed-form affine
4042    // path at large n (#979).
4043    if matches!(branch_cell(cell), Ok(ExactCellBranch::Affine)) {
4044        if let Some(stats) = stats {
4045            stats.misses.fetch_add(1, Ordering::Relaxed);
4046        }
4047        return evaluate_cell_derivative_moments_uncached(cell, max_degree);
4048    }
4049    let key = CellFingerprint::new(cell);
4050    let existing_value = match cache.get(&key) {
4051        Some(cached) => {
4052            if let Some(state) = cached.derivative_state_for_degree(max_degree) {
4053                if let Some(stats) = stats {
4054                    stats.hits.fetch_add(1, Ordering::Relaxed);
4055                }
4056                return Ok(state);
4057            }
4058            // `cached.state` is `Option<Arc<_>>`; `.clone()` here is the cheap
4059            // refcount bump the audit-39 fix targets, not a full moment-vector
4060            // deep clone.
4061            cached.state.clone()
4062        }
4063        None => None,
4064    };
4065    if let Some(stats) = stats {
4066        stats.misses.fetch_add(1, Ordering::Relaxed);
4067    }
4068    let state = evaluate_cell_derivative_moments_uncached(cell, max_degree)?;
4069    // Wrap the freshly-computed state in `Arc` once, share it with the cache
4070    // through `Arc::clone`, and return the underlying value by unwrapping the
4071    // unique-reference (caller-side) `Arc`. This replaces the prior
4072    // `state.clone()` deep copy at the insert site.
4073    let shared = Arc::new(state);
4074    let mut entry = CachedCellMoments::new_derivative(Arc::clone(&shared));
4075    if let Some(value) = existing_value {
4076        entry = entry.with_value(value);
4077    }
4078    cache.insert(key, entry);
4079    Ok(Arc::try_unwrap(shared).unwrap_or_else(|a| (*a).clone()))
4080}
4081
4082/// Scratch-backed variant of [`evaluate_cell_moments`].
4083///
4084/// Reuses the supplied [`CellMomentScratch`] for the returned moments slice,
4085/// so repeated calls with the same scratch (and a sufficient initial capacity)
4086/// avoid per-call `Vec` allocations on the hot inner-PIRLS row-intercept
4087/// solver path. Internal transport allocations are unchanged.
4088pub fn evaluate_cell_moments_with_scratch<'a>(
4089    cell: DenestedCubicCell,
4090    max_degree: usize,
4091    scratch: &'a mut CellMomentScratch,
4092) -> Result<CellMomentStateRef<'a>, String> {
4093    let state = evaluate_cell_moments(cell, max_degree)?;
4094    let out = scratch.prepare_moments(max_degree + 1);
4095    out.copy_from_slice(&state.moments);
4096    Ok(CellMomentStateRef {
4097        branch: state.branch,
4098        value: state.value,
4099        moments: out,
4100    })
4101}
4102
4103#[cfg(test)]
4104mod tests {
4105    use super::*;
4106    use gam_math::probability::normal_pdf;
4107
4108    /// Pointwise value of the cell THIRD-derivative integrand
4109    /// `(d3/dr ds dt) exp(-q(z))/2pi` at a single `z`, evaluated from the SAME
4110    /// `(r, s, t, rs, rt, st, rst)` coefficient polynomials the moment reduction
4111    /// `cell_third_derivative_from_moments` integrates. Unlike the
4112    /// second-derivative integrand this one does NOT cancel across an interior
4113    /// C2-link knot crossing (the `c_rst` third coefficient jumps), so it backs
4114    /// the C2-telescoping regression below. Test-only; no production consumer.
4115    #[inline]
4116    fn cell_third_derivative_boundary_integrand(
4117        cell: DenestedCubicCell,
4118        first_coefficients_r: &[f64],
4119        first_coefficients_s: &[f64],
4120        first_coefficients_t: &[f64],
4121        second_coefficients_rs: &[f64],
4122        second_coefficients_rt: &[f64],
4123        second_coefficients_st: &[f64],
4124        third_coefficients_rst: &[f64],
4125        z: f64,
4126    ) -> f64 {
4127        let eta = cell.eta(z);
4128        let c_r = poly_eval_at(first_coefficients_r, z);
4129        let c_s = poly_eval_at(first_coefficients_s, z);
4130        let c_t = poly_eval_at(first_coefficients_t, z);
4131        let c_rs = poly_eval_at(second_coefficients_rs, z);
4132        let c_rt = poly_eval_at(second_coefficients_rt, z);
4133        let c_st = poly_eval_at(second_coefficients_st, z);
4134        let c_rst = poly_eval_at(third_coefficients_rst, z);
4135        let amplitude = c_rst - eta * (c_rs * c_t + c_rt * c_s + c_st * c_r)
4136            + (eta * eta - 1.0) * c_r * c_s * c_t;
4137        amplitude * (-cell.q(z)).exp() * INV_TWO_PI
4138    }
4139
4140    #[inline]
4141    pub(super) fn polynomial_value(coefficients: &[f64], z: f64) -> f64 {
4142        coefficients
4143            .iter()
4144            .rev()
4145            .fold(0.0, |acc, &coeff| acc * z + coeff)
4146    }
4147
4148    fn reset_cell_moment_test_reallocs() {
4149        super::CELL_MOMENT_REALLOCS.store(0, std::sync::atomic::Ordering::Relaxed);
4150    }
4151
4152    fn cell_moment_test_reallocs() -> usize {
4153        super::CELL_MOMENT_REALLOCS.load(std::sync::atomic::Ordering::Relaxed)
4154    }
4155
4156    fn assert_close_rel(label: &str, actual: f64, expected: f64, tol: f64) {
4157        let denom = expected.abs().max(1.0);
4158        let rel = (actual - expected).abs() / denom;
4159        assert!(
4160            rel <= tol,
4161            "{label}: actual={actual:.17e} expected={expected:.17e} rel={rel:.3e} tol={tol:.3e}"
4162        );
4163    }
4164
4165    // The link-basis cell coefficient `transformed_link_cubic(span, a, b)` is, in
4166    // each of its four output components, a polynomial of TOTAL degree exactly 3 in
4167    // (a, b):
4168    //   d0 = c0 + c1·s + c2·s² + c3·s³            (s = a − left; deg 3 in a)
4169    //   d1 = b·(c1 + 2c2·s + 3c3·s²)              (a²·b → total deg 3)
4170    //   d2 = b²·(c2 + 3c3·s)                       (a·b² → total deg 3)
4171    //   d3 = c3·b³                                 (b³  → total deg 3)
4172    // Therefore EVERY 4th-order total (a,b)-partial (∂⁴/∂aⁱ∂b^{4−i}) is identically
4173    // zero, while the 3rd-order partials (∂³/∂aⁱ∂b^{3−i}) are the highest nonzero
4174    // ones. This is the exact algebraic fact the bidirectional flex jet relies on:
4175    // a "second mixed derivative of a third-a-partial" slot, etc., demands a 4th
4176    // total (a,b)-partial and must be hard-zero — substituting a (nonzero) 3rd
4177    // partial there is a bug. This test certifies BOTH facts by central FD so the
4178    // hard-coded `0.0` fixes are provably correct and provably necessary.
4179    #[test]
4180    fn link_basis_cell_fourth_ab_partials_vanish_third_are_nonzero() {
4181        let span = LocalSpanCubic {
4182            left: -0.4,
4183            right: 1.6,
4184            c0: 0.37,
4185            c1: -0.81,
4186            c2: 0.53,
4187            c3: -0.29,
4188        };
4189        let a0 = 0.23_f64;
4190        let b0 = 0.61_f64;
4191        let h = 1e-2_f64;
4192
4193        // Generic central-difference stencils per derivative order.
4194        let stencil = |order: usize| -> &'static [(i64, f64)] {
4195            match order {
4196                0 => &[(0, 1.0)],
4197                1 => &[(-1, -0.5), (1, 0.5)],
4198                2 => &[(-1, 1.0), (0, -2.0), (1, 1.0)],
4199                3 => &[(-2, -0.5), (-1, 1.0), (1, -1.0), (2, 0.5)],
4200                4 => &[(-2, 1.0), (-1, -4.0), (0, 6.0), (1, -4.0), (2, 1.0)],
4201                _ => &[(0, 1.0)],
4202            }
4203        };
4204        // FD of component `k` of the cell coefficient: ∂^{na+nb}/∂a^{na}∂b^{nb}.
4205        let fd = |k: usize, na: usize, nb: usize| -> f64 {
4206            let mut acc = 0.0;
4207            for &(ia, wa) in stencil(na) {
4208                for &(ib, wb) in stencil(nb) {
4209                    let a = a0 + (ia as f64) * h;
4210                    let b = b0 + (ib as f64) * h;
4211                    acc += wa * wb * link_basis_cell_coefficients(span, a, b)[k];
4212                }
4213            }
4214            acc / h.powi((na + nb) as i32)
4215        };
4216
4217        let (p3_aaa, p3_aab, p3_abb, p3_bbb) = link_basis_cell_third_partials(span);
4218
4219        // (1) The analytic 3rd partials match FD (within FD truncation) — and at
4220        // least one is appreciably nonzero, so these are real signal that a wrong
4221        // slot would inject.
4222        let mut max_third = 0.0_f64;
4223        for k in 0..4 {
4224            for (label, (na, nb), analytic) in [
4225                ("aaa", (3usize, 0usize), p3_aaa[k]),
4226                ("aab", (2, 1), p3_aab[k]),
4227                ("abb", (1, 2), p3_abb[k]),
4228                ("bbb", (0, 3), p3_bbb[k]),
4229            ] {
4230                let got = fd(k, na, nb);
4231                assert!(
4232                    (got - analytic).abs() <= 1e-4 + 1e-3 * analytic.abs(),
4233                    "3rd partial {label}[{k}] analytic {analytic:+.6e} vs FD {got:+.6e}"
4234                );
4235                max_third = max_third.max(analytic.abs());
4236            }
4237        }
4238        assert!(
4239            max_third > 1e-1,
4240            "expected an appreciable nonzero 3rd (a,b)-partial; max |analytic| = {max_third:.3e}"
4241        );
4242
4243        // (2) EVERY 4th-order total (a,b)-partial vanishes (degree-3 polynomial),
4244        // certifying that the hard-coded `0.0` in the bidirectional d12 slots is the
4245        // mathematically required value, not an approximation.
4246        for k in 0..4 {
4247            for (na, nb) in [(4usize, 0usize), (3, 1), (2, 2), (1, 3), (0, 4)] {
4248                let got = fd(k, na, nb);
4249                assert!(
4250                    got.abs() <= 1e-2,
4251                    "4th (a,b)-partial ∂^{na}_a∂^{nb}_b of cell coeff[{k}] must vanish, FD = {got:+.6e}"
4252                );
4253            }
4254        }
4255    }
4256
4257    #[test]
4258    fn non_affine_cell_state_grid_matches_public_cell_moments_reference() {
4259        let cells = [
4260            DenestedCubicCell {
4261                left: -1.25,
4262                right: -0.2,
4263                c0: -0.35,
4264                c1: 0.85,
4265                c2: 0.04,
4266                c3: -0.015,
4267            },
4268            DenestedCubicCell {
4269                left: -0.2,
4270                right: 0.55,
4271                c0: 0.12,
4272                c1: -0.65,
4273                c2: -0.025,
4274                c3: 0.02,
4275            },
4276            DenestedCubicCell {
4277                left: 0.55,
4278                right: 1.6,
4279                c0: 0.42,
4280                c1: 0.35,
4281                c2: 0.018,
4282                c3: 0.012,
4283            },
4284        ];
4285        for cell in cells {
4286            let branch = branch_cell(cell).expect("branch");
4287            assert_ne!(branch, ExactCellBranch::Affine);
4288            for max_degree in [0usize, 2, 4, 9, 16] {
4289                let direct = evaluate_non_affine_cell_state(cell, branch, max_degree)
4290                    .expect("direct non-affine transport");
4291                let public = evaluate_cell_moments(cell, max_degree).expect("public evaluator");
4292                assert_eq!(direct.branch, public.branch);
4293                assert_eq!(direct.moments.len(), public.moments.len());
4294                let value_scale = direct.value.abs().max(public.value.abs()).max(1.0);
4295                assert!(
4296                    (direct.value - public.value).abs() <= 1e-10 * value_scale,
4297                    "value mismatch for {cell:?} degree {max_degree}: direct={} public={}",
4298                    direct.value,
4299                    public.value
4300                );
4301                for (degree, (lhs, rhs)) in
4302                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4303                {
4304                    let scale = lhs.abs().max(rhs.abs()).max(1.0);
4305                    assert!(
4306                        (lhs - rhs).abs() <= 1e-10 * scale,
4307                        "moment {degree} mismatch for {cell:?} degree {max_degree}: {lhs} vs {rhs}"
4308                    );
4309                }
4310            }
4311        }
4312    }
4313
4314    #[test]
4315    fn affine_tail_cell_memo_matches_uncached_grid_and_records_hits() {
4316        // Use a dedicated local cache so the test's hit/miss/entry counters
4317        // are not perturbed by concurrent tests that drive the shared
4318        // global memo through `evaluate_cell_moments`. Asserting on the
4319        // global counters made this test race-flaky when the suite ran in
4320        // parallel.
4321        let cache = TailCellMomentCache::new();
4322        let c0s = [-2.0, -0.25, 0.0, 1.5];
4323        let c1s = [-1.2, -0.05, 0.0, 0.8];
4324        let endpoints = [-4.0, -1.0, 0.0, 2.5, 6.0];
4325        let degrees = [0_usize, 4, 9, 16, 24];
4326
4327        for &c0 in &c0s {
4328            for &c1 in &c1s {
4329                for &endpoint in &endpoints {
4330                    for &max_degree in &degrees {
4331                        for &(left, right) in
4332                            &[(f64::NEG_INFINITY, endpoint), (endpoint, f64::INFINITY)]
4333                        {
4334                            let cell = DenestedCubicCell {
4335                                left,
4336                                right,
4337                                c0,
4338                                c1,
4339                                c2: 0.0,
4340                                c3: 0.0,
4341                            };
4342                            let expected = evaluate_cell_moments_uncached(cell, max_degree)
4343                                .expect("uncached affine tail moments");
4344                            let actual = cache
4345                                .evaluate(cell, max_degree)
4346                                .expect("cached affine tail moments miss");
4347                            let repeat = cache
4348                                .evaluate(cell, max_degree)
4349                                .expect("cached affine tail moments hit");
4350                            assert_eq!(actual.branch, expected.branch);
4351                            assert_eq!(repeat.branch, expected.branch);
4352                            assert_close_rel(
4353                                "tail value miss",
4354                                actual.value,
4355                                expected.value,
4356                                1e-14,
4357                            );
4358                            assert_close_rel("tail value hit", repeat.value, expected.value, 1e-14);
4359                            assert_eq!(actual.moments.len(), expected.moments.len());
4360                            assert_eq!(repeat.moments.len(), expected.moments.len());
4361                            for (idx, ((a, r), e)) in actual
4362                                .moments
4363                                .iter()
4364                                .zip(repeat.moments.iter())
4365                                .zip(expected.moments.iter())
4366                                .enumerate()
4367                            {
4368                                assert_close_rel(
4369                                    &format!("tail moment miss[{idx}]"),
4370                                    *a,
4371                                    *e,
4372                                    1e-14,
4373                                );
4374                                assert_close_rel(&format!("tail moment hit[{idx}]"), *r, *e, 1e-14);
4375                            }
4376                        }
4377                    }
4378                }
4379            }
4380        }
4381
4382        let stats = cache.stats();
4383        assert_eq!(stats.misses, stats.entries);
4384        assert!(
4385            stats.hits >= stats.misses,
4386            "expected repeat hits: {stats:?}"
4387        );
4388        assert!(
4389            stats.hit_rate() >= 0.5,
4390            "unexpected low hit rate: {stats:?}"
4391        );
4392    }
4393
4394    fn reference_bivariate_normal_cdf_20(h: f64, k: f64, rho: f64) -> f64 {
4395        if h == f64::NEG_INFINITY || k == f64::NEG_INFINITY {
4396            return 0.0;
4397        }
4398        if h == f64::INFINITY {
4399            return normal_cdf(k);
4400        }
4401        if k == f64::INFINITY {
4402            return normal_cdf(h);
4403        }
4404        let rho_clamped = rho.clamp(-1.0, 1.0);
4405        if rho_clamped >= 1.0 - 1e-12 {
4406            return normal_cdf(h.min(k));
4407        }
4408        if rho_clamped <= -1.0 + 1e-12 {
4409            return (normal_cdf(h) - normal_cdf(-k)).clamp(0.0, 1.0);
4410        }
4411
4412        let hs = 0.5 * (h * h + k * k);
4413        let asr = rho_clamped.asin();
4414        let mut sum = 0.0;
4415        for (&node, &weight) in GL20_NODES.iter().zip(GL20_WEIGHTS.iter()) {
4416            let sn = (0.5 * asr * (node + 1.0)).sin();
4417            let one_minus = 1.0 - sn * sn;
4418            let expo = ((sn * h * k) - hs) / one_minus;
4419            sum += weight * expo.exp();
4420        }
4421        (normal_cdf(h) * normal_cdf(k) + asr * sum / (4.0 * std::f64::consts::PI)).clamp(0.0, 1.0)
4422    }
4423
4424    #[test]
4425    fn non_affine_cell_state_reference_grid_matches_public_moments() {
4426        let c0s = [-0.4, 0.0, 0.35];
4427        let c1s = [-0.8, 0.25, 1.1];
4428        let c2s = [-0.12, 0.08];
4429        let c3s = [-0.04, 0.03];
4430        let intervals = [(-1.25, -0.2), (-0.5, 0.75), (0.1, 1.4)];
4431        let degrees = [3usize, 6, 9, 12];
4432
4433        for &c0 in &c0s {
4434            for &c1 in &c1s {
4435                for &c2 in &c2s {
4436                    for &c3 in &c3s {
4437                        for &(left, right) in &intervals {
4438                            let cell = DenestedCubicCell {
4439                                left,
4440                                right,
4441                                c0,
4442                                c1,
4443                                c2,
4444                                c3,
4445                            };
4446                            let branch = branch_cell(cell).expect("branch");
4447                            assert_ne!(branch, ExactCellBranch::Affine);
4448                            for &degree in &degrees {
4449                                let direct = evaluate_non_affine_cell_state(cell, branch, degree)
4450                                    .expect("direct non-affine state");
4451                                let public = evaluate_cell_moments(cell, degree)
4452                                    .expect("public non-affine state");
4453                                assert_eq!(direct.branch, public.branch);
4454                                let value_scale =
4455                                    direct.value.abs().max(public.value.abs()).max(1.0);
4456                                assert!(
4457                                    (direct.value - public.value).abs() / value_scale <= 1.0e-15,
4458                                    "value mismatch for {cell:?}, degree {degree}: direct={:.17e}, public={:.17e}",
4459                                    direct.value,
4460                                    public.value
4461                                );
4462                                assert_eq!(direct.moments.len(), public.moments.len());
4463                                for (idx, (&a, &b)) in
4464                                    direct.moments.iter().zip(public.moments.iter()).enumerate()
4465                                {
4466                                    let scale = a.abs().max(b.abs()).max(1.0);
4467                                    assert!(
4468                                        (a - b).abs() / scale <= 1.0e-15,
4469                                        "moment {idx} mismatch for {cell:?}, degree {degree}: direct={a:.17e}, public={b:.17e}"
4470                                    );
4471                                }
4472                            }
4473                        }
4474                    }
4475                }
4476            }
4477        }
4478    }
4479
4480    #[test]
4481    fn bivariate_normal_cdf_matches_reference_grid_to_1e_minus_10() {
4482        let hs = [-8.0, -5.0, -3.0, -1.5, -0.5, 0.0, 0.25, 1.0, 2.5, 5.0, 8.0];
4483        let ks = [-8.0, -4.0, -2.0, -0.75, 0.0, 0.4, 1.25, 3.0, 6.0, 8.0];
4484        let rhos = [
4485            -0.999_999_999_999,
4486            -0.999,
4487            -0.95,
4488            -0.7,
4489            -0.3,
4490            -1.0e-12,
4491            0.0,
4492            1.0e-12,
4493            0.3,
4494            0.7,
4495            0.95,
4496            0.999,
4497            0.999_999_999_999,
4498        ];
4499        for &h in &hs {
4500            for &k in &ks {
4501                for &rho in &rhos {
4502                    let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4503                    let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4504                    let scale = expected.abs().max(1.0e-300);
4505                    let rel = (actual - expected).abs() / scale;
4506                    assert!(
4507                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4508                        "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4509                    );
4510                }
4511            }
4512        }
4513    }
4514
4515    #[test]
4516    fn bivariate_normal_cdf_matches_reference_lcg_property_samples() {
4517        let mut seed = 0x5eed_cafe_f00d_u64;
4518        let mut next_unit = || {
4519            seed = seed.wrapping_mul(6_364_136_223_846_793_005).wrapping_add(1);
4520            ((seed >> 11) as f64) * (1.0 / ((1_u64 << 53) as f64))
4521        };
4522        for _ in 0..4096 {
4523            let h = -8.0 + 16.0 * next_unit();
4524            let k = -8.0 + 16.0 * next_unit();
4525            let rho = -0.999 + 1.998 * next_unit();
4526            let actual = bivariate_normal_cdf(h, k, rho).expect("bvn");
4527            let expected = reference_bivariate_normal_cdf_20(h, k, rho);
4528            let scale = expected.abs().max(1.0e-300);
4529            let rel = (actual - expected).abs() / scale;
4530            assert!(
4531                rel < 1.0e-10 || (actual - expected).abs() < 1.0e-14,
4532                "h={h} k={k} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4533            );
4534        }
4535    }
4536
4537    #[test]
4538    fn affine_bvn_interval_primitive_matches_two_cdf_difference() {
4539        let hs = [-6.0, -2.0, -0.25, 0.0, 0.8, 3.0, 6.0];
4540        let bounds = [
4541            (-5.0, -2.0),
4542            (-3.0, -0.1),
4543            (-1.0, 0.0),
4544            (-0.25, 0.75),
4545            (0.2, 3.5),
4546            (2.0, 7.0),
4547        ];
4548        let rhos = [-0.98, -0.8, -0.25, 0.0, 0.25, 0.8, 0.98];
4549        for &h in &hs {
4550            for &(left, right) in &bounds {
4551                for &rho in &rhos {
4552                    let actual =
4553                        bivariate_normal_cdf_interval(h, left, right, rho).expect("interval");
4554                    let expected = (reference_bivariate_normal_cdf_20(h, right, rho)
4555                        - reference_bivariate_normal_cdf_20(h, left, rho))
4556                    .clamp(0.0, 1.0);
4557                    let scale = expected.abs().max(1.0e-300);
4558                    let rel = (actual - expected).abs() / scale;
4559                    assert!(
4560                        rel < 1.0e-10 || (actual - expected).abs() < 1.0e-12,
4561                        "h={h} left={left} right={right} rho={rho} actual={actual:.17e} expected={expected:.17e} rel={rel:.3e}"
4562                    );
4563                }
4564            }
4565        }
4566    }
4567
4568    fn simpson_integral<F>(left: f64, right: f64, steps: usize, f: F) -> f64
4569    where
4570        F: Fn(f64) -> f64,
4571    {
4572        let n = if steps.is_multiple_of(2) {
4573            steps
4574        } else {
4575            steps + 1
4576        };
4577        let h = (right - left) / n as f64;
4578        let mut acc = f(left) + f(right);
4579        for k in 1..n {
4580            let x = left + h * k as f64;
4581            let w = if k % 2 == 0 { 2.0 } else { 4.0 };
4582            acc += w * f(x);
4583        }
4584        acc * h / 3.0
4585    }
4586
4587    #[test]
4588    fn global_transform_preserves_local_span_polynomial() {
4589        let span = LocalSpanCubic {
4590            left: -1.2,
4591            right: 0.8,
4592            c0: 0.3,
4593            c1: -0.25,
4594            c2: 0.11,
4595            c3: -0.04,
4596        };
4597        let (g0, g1, g2, g3) = global_cubic_from_local(span);
4598        for &x in &[-1.2, -0.7, -0.1, 0.4, 0.8] {
4599            let local = span.evaluate(x);
4600            let global = g0 + g1 * x + g2 * x * x + g3 * x * x * x;
4601            assert!((local - global).abs() < 1e-12);
4602        }
4603    }
4604
4605    #[test]
4606    fn bivariate_normal_cdf_independent_factorizes() {
4607        let h = -0.35;
4608        let k = 0.8;
4609        let out = bivariate_normal_cdf(h, k, 0.0).expect("bvn");
4610        let target = normal_cdf(h) * normal_cdf(k);
4611        assert!((out - target).abs() < 1e-12);
4612    }
4613
4614    #[test]
4615    fn evaluate_affine_cell_state_matches_numeric_integrals() {
4616        let cell = DenestedCubicCell {
4617            left: -0.9,
4618            right: 0.8,
4619            c0: 0.15,
4620            c1: -0.35,
4621            c2: 0.0,
4622            c3: 0.0,
4623        };
4624        let state = evaluate_affine_cell_state(cell, 6).expect("affine cell");
4625        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
4626            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
4627        });
4628        assert_eq!(state.branch, ExactCellBranch::Affine);
4629        assert!((state.value - value_numeric).abs() < 1e-9);
4630        for degree in 0..=6 {
4631            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
4632                z.powi(degree as i32) * (-cell.q(z)).exp()
4633            });
4634            assert!((state.moments[degree] - target).abs() < 1e-9);
4635        }
4636    }
4637
4638    #[test]
4639    fn affine_cell_value_matches_zero_moment_derivative() {
4640        let cell = DenestedCubicCell {
4641            left: -1.1,
4642            right: 0.7,
4643            c0: 0.23,
4644            c1: -0.41,
4645            c2: 0.0,
4646            c3: 0.0,
4647        };
4648        let h = 1e-6;
4649        let plus = evaluate_affine_cell_state(
4650            DenestedCubicCell {
4651                c0: cell.c0 + h,
4652                ..cell
4653            },
4654            0,
4655        )
4656        .expect("affine plus");
4657        let minus = evaluate_affine_cell_state(
4658            DenestedCubicCell {
4659                c0: cell.c0 - h,
4660                ..cell
4661            },
4662            0,
4663        )
4664        .expect("affine minus");
4665        let center = evaluate_affine_cell_state(cell, 0).expect("affine center");
4666        let d_value = (plus.value - minus.value) / (2.0 * h);
4667        let target = INV_TWO_PI * center.moments[0];
4668        assert!((d_value - target).abs() < 1e-8);
4669    }
4670
4671    #[test]
4672    fn coefficient_partials_match_exact_span_derivatives() {
4673        let score_span = LocalSpanCubic {
4674            left: -0.75,
4675            right: 0.25,
4676            c0: 0.08,
4677            c1: -0.03,
4678            c2: 0.02,
4679            c3: -0.01,
4680        };
4681        let link_span = LocalSpanCubic {
4682            left: -0.6,
4683            right: 0.9,
4684            c0: -0.05,
4685            c1: 0.04,
4686            c2: -0.02,
4687            c3: 0.015,
4688        };
4689        let a = 0.3;
4690        let b = -0.7;
4691        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
4692        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4693            let u = a + b * z;
4694            let eta_a = 1.0 + link_span.first_derivative(u);
4695            let eta_b = z + score_span.evaluate(z) + z * link_span.first_derivative(u);
4696            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4697            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4698        }
4699    }
4700
4701    #[test]
4702    fn second_coefficient_partials_match_exact_span_derivatives() {
4703        let score_span = LocalSpanCubic {
4704            left: -0.75,
4705            right: 0.25,
4706            c0: 0.08,
4707            c1: -0.03,
4708            c2: 0.02,
4709            c3: -0.01,
4710        };
4711        let link_span = LocalSpanCubic {
4712            left: -0.6,
4713            right: 0.9,
4714            c0: -0.05,
4715            c1: 0.04,
4716            c2: -0.02,
4717            c3: 0.015,
4718        };
4719        let a = 0.3;
4720        let b = -0.7;
4721        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
4722        let dc_daa = second_partials.0;
4723        let dc_dab = second_partials.1;
4724        let dc_dbb = second_partials.2;
4725        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4726            let u = a + b * z;
4727            let eta_aa = link_span.second_derivative(u);
4728            let eta_ab = z * link_span.second_derivative(u);
4729            let eta_bb = z * z * link_span.second_derivative(u);
4730            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4731            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4732            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4733        }
4734    }
4735
4736    #[test]
4737    fn higher_derivative_moment_helpers_reject_empty_first_coefficients() {
4738        let cell = DenestedCubicCell {
4739            left: -1.0,
4740            right: 1.0,
4741            c0: 0.0,
4742            c1: 1.0,
4743            c2: 0.0,
4744            c3: 0.0,
4745        };
4746        let moments = [1.0; 16];
4747
4748        let third_err = cell_third_derivative_from_moments(
4749            cell,
4750            &[],
4751            &[1.0],
4752            &[1.0],
4753            &[],
4754            &[],
4755            &[],
4756            &[],
4757            &moments,
4758        )
4759        .expect_err("empty first coefficients should be rejected");
4760        assert!(third_err.contains("r first-derivative coefficients must be non-empty"));
4761
4762        let fourth_err = cell_fourth_derivative_from_moments(
4763            cell,
4764            &[1.0],
4765            &[],
4766            &[1.0],
4767            &[1.0],
4768            &[],
4769            &[],
4770            &[],
4771            &[],
4772            &[],
4773            &[],
4774            &[],
4775            &[],
4776            &[],
4777            &[],
4778            &[],
4779            &moments,
4780        )
4781        .expect_err("empty first coefficients should be rejected");
4782        assert!(fourth_err.contains("s first-derivative coefficients must be non-empty"));
4783    }
4784
4785    #[test]
4786    fn fourth_derivative_rejects_overlong_scratch_convolutions() {
4787        let cell = DenestedCubicCell {
4788            left: -1.0,
4789            right: 1.0,
4790            c0: 0.0,
4791            c1: 1.0,
4792            c2: 0.0,
4793            c3: 0.0,
4794        };
4795        let long_first = [1.0; 10];
4796        let zero = [0.0; 1];
4797        let moments = [1.0; 64];
4798
4799        let err = cell_fourth_derivative_from_moments(
4800            cell,
4801            &long_first,
4802            &long_first,
4803            &long_first,
4804            &long_first,
4805            &zero,
4806            &zero,
4807            &zero,
4808            &zero,
4809            &zero,
4810            &zero,
4811            &zero,
4812            &zero,
4813            &zero,
4814            &zero,
4815            &zero,
4816            &moments,
4817        )
4818        .expect_err("oversized convolution should be rejected before writing scratch");
4819        assert!(err.contains("fourth derivative polynomial convolution scratch too small"));
4820    }
4821
4822    #[test]
4823    fn score_and_link_basis_cell_coefficients_match_direct_construction() {
4824        let score_basis_span = LocalSpanCubic {
4825            left: -0.7,
4826            right: 0.4,
4827            c0: 0.2,
4828            c1: -0.04,
4829            c2: 0.03,
4830            c3: -0.01,
4831        };
4832        let link_basis_span = LocalSpanCubic {
4833            left: -0.5,
4834            right: 1.1,
4835            c0: -0.03,
4836            c1: 0.05,
4837            c2: -0.02,
4838            c3: 0.01,
4839        };
4840        let a = 0.25;
4841        let b = -0.8;
4842        let score_coeffs = score_basis_cell_coefficients(score_basis_span, b);
4843        let link_coeffs = link_basis_cell_coefficients(link_basis_span, a, b);
4844        for &z in &[-0.7, -0.1, 0.2, 0.4] {
4845            let score_poly = polynomial_value(&score_coeffs, z);
4846            let link_poly = polynomial_value(&link_coeffs, z);
4847            assert!((score_poly - b * score_basis_span.evaluate(z)).abs() < 1e-12);
4848            assert!((link_poly - link_basis_span.evaluate(a + b * z)).abs() < 1e-12);
4849        }
4850    }
4851
4852    #[test]
4853    fn link_basis_partials_match_exact_span_derivatives() {
4854        let link_basis_span = LocalSpanCubic {
4855            left: -0.5,
4856            right: 1.1,
4857            c0: -0.03,
4858            c1: 0.05,
4859            c2: -0.02,
4860            c3: 0.01,
4861        };
4862        let a = 0.25;
4863        let b = -0.8;
4864        let (dc_da, dc_db) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
4865        let (dc_daa, dc_dab, dc_dbb) = link_basis_cell_second_partials(link_basis_span, a, b);
4866        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4867            let u = a + b * z;
4868            let eta_a = link_basis_span.first_derivative(u);
4869            let eta_b = z * link_basis_span.first_derivative(u);
4870            let eta_aa = link_basis_span.second_derivative(u);
4871            let eta_ab = z * link_basis_span.second_derivative(u);
4872            let eta_bb = z * z * link_basis_span.second_derivative(u);
4873            assert!((polynomial_value(&dc_da, z) - eta_a).abs() < 1e-12);
4874            assert!((polynomial_value(&dc_db, z) - eta_b).abs() < 1e-12);
4875            assert!((polynomial_value(&dc_daa, z) - eta_aa).abs() < 1e-12);
4876            assert!((polynomial_value(&dc_dab, z) - eta_ab).abs() < 1e-12);
4877            assert!((polynomial_value(&dc_dbb, z) - eta_bb).abs() < 1e-12);
4878        }
4879    }
4880
4881    #[test]
4882    fn denested_third_partials_match_exact_span_derivatives() {
4883        let link_span = LocalSpanCubic {
4884            left: -0.6,
4885            right: 0.9,
4886            c0: -0.05,
4887            c1: 0.04,
4888            c2: -0.02,
4889            c3: 0.015,
4890        };
4891        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
4892        let link_third = 6.0 * link_span.c3;
4893        for &z in &[-0.75, -0.4, -0.1, 0.2] {
4894            let eta_aaa = link_third;
4895            let eta_aab = z * link_third;
4896            let eta_abb = z * z * link_third;
4897            let eta_bbb = z * z * z * link_third;
4898            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4899            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4900            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4901            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4902        }
4903    }
4904
4905    #[test]
4906    fn link_basis_third_partials_match_exact_span_derivatives() {
4907        let link_basis_span = LocalSpanCubic {
4908            left: -0.5,
4909            right: 1.1,
4910            c0: -0.03,
4911            c1: 0.05,
4912            c2: -0.02,
4913            c3: 0.01,
4914        };
4915        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = link_basis_cell_third_partials(link_basis_span);
4916        let link_third = 6.0 * link_basis_span.c3;
4917        for &z in &[-0.6, -0.2, 0.15, 0.5] {
4918            let eta_aaa = link_third;
4919            let eta_aab = z * link_third;
4920            let eta_abb = z * z * link_third;
4921            let eta_bbb = z * z * z * link_third;
4922            assert!((polynomial_value(&dc_daaa, z) - eta_aaa).abs() < 1e-12);
4923            assert!((polynomial_value(&dc_daab, z) - eta_aab).abs() < 1e-12);
4924            assert!((polynomial_value(&dc_dabb, z) - eta_abb).abs() < 1e-12);
4925            assert!((polynomial_value(&dc_dbbb, z) - eta_bbb).abs() < 1e-12);
4926        }
4927    }
4928
4929    #[test]
4930    fn branch_selection_uses_normalized_non_affine_coefficients() {
4931        let affine = DenestedCubicCell {
4932            left: -1.0,
4933            right: 1.0,
4934            c0: 0.1,
4935            c1: -0.4,
4936            c2: 1e-13,
4937            c3: -1e-13,
4938        };
4939        let quartic = DenestedCubicCell {
4940            c2: 2e-4,
4941            c3: 1e-13,
4942            ..affine
4943        };
4944        let sextic = DenestedCubicCell {
4945            c2: 2e-4,
4946            c3: 5e-3,
4947            ..affine
4948        };
4949        assert_eq!(branch_cell(affine).unwrap(), ExactCellBranch::Affine);
4950        assert_eq!(branch_cell(quartic).unwrap(), ExactCellBranch::Quartic);
4951        assert_eq!(branch_cell(sextic).unwrap(), ExactCellBranch::Sextic);
4952    }
4953
4954    #[test]
4955    fn affine_anchor_moments_match_whole_line_closed_forms() {
4956        let out = affine_anchor_moment_vector(0.0, 0.0, f64::NEG_INFINITY, f64::INFINITY, 4);
4957        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
4958        assert!((out[0] - sqrt_2pi).abs() < 1e-12);
4959        assert!(out[1].abs() < 1e-12);
4960        assert!((out[2] - sqrt_2pi).abs() < 1e-12);
4961    }
4962
4963    #[test]
4964    fn affine_anchor_moments_match_shifted_gaussian_whole_line() {
4965        let alpha = 0.7;
4966        let beta = -0.4;
4967        let out = affine_anchor_moment_vector(alpha, beta, f64::NEG_INFINITY, f64::INFINITY, 4);
4968        let s = (1.0 + beta * beta).sqrt();
4969        let mu = -alpha * beta / (1.0 + beta * beta);
4970        let scale = (-alpha * alpha / (2.0 * s * s)).exp() / s;
4971        let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();
4972        assert!((out[0] - scale * sqrt_2pi).abs() < 1e-12);
4973        assert!((out[1] - scale * sqrt_2pi * mu).abs() < 1e-12);
4974        assert!((out[2] - scale * sqrt_2pi * (mu * mu + 1.0 / (s * s))).abs() < 1e-10);
4975    }
4976
4977    #[test]
4978    fn quartic_recurrence_reduces_higher_moments() {
4979        let cell = DenestedCubicCell {
4980            left: -1.0,
4981            right: 0.9,
4982            c0: 0.2,
4983            c1: -0.3,
4984            c2: 0.18,
4985            c3: 0.0,
4986        };
4987        let exact = |k: usize| {
4988            simpson_integral(cell.left, cell.right, 2000, |z| {
4989                z.powi(k as i32) * (-cell.q(z)).exp()
4990            })
4991        };
4992        let reduced = reduce_quartic_moments(cell, [exact(0), exact(1), exact(2)], 6)
4993            .expect("quartic reduction");
4994        for k in 0..=6 {
4995            let target = exact(k);
4996            assert!(
4997                (reduced[k] - target).abs() < 1e-7,
4998                "quartic reduced moment M{k} mismatch: {} vs {}",
4999                reduced[k],
5000                target
5001            );
5002        }
5003    }
5004
5005    #[test]
5006    fn sextic_recurrence_reduces_higher_moments() {
5007        let cell = DenestedCubicCell {
5008            left: -0.8,
5009            right: 0.7,
5010            c0: -0.1,
5011            c1: 0.25,
5012            c2: -0.14,
5013            c3: 0.22,
5014        };
5015        let exact = |k: usize| {
5016            simpson_integral(cell.left, cell.right, 3000, |z| {
5017                z.powi(k as i32) * (-cell.q(z)).exp()
5018            })
5019        };
5020        let reduced =
5021            reduce_sextic_moments(cell, [exact(0), exact(1), exact(2), exact(3), exact(4)], 9)
5022                .expect("sextic reduction");
5023        for k in 0..=9 {
5024            let target = exact(k);
5025            assert!(
5026                (reduced[k] - target).abs() < 1e-7,
5027                "sextic reduced moment M{k} mismatch: {} vs {}",
5028                reduced[k],
5029                target
5030            );
5031        }
5032    }
5033
5034    #[test]
5035    fn degenerate_sextic_branch_preserves_quadratic_coefficient() {
5036        let cell = DenestedCubicCell {
5037            left: -1.0,
5038            right: 1.0,
5039            c0: 0.0,
5040            c1: 0.0,
5041            c2: 0.1,
5042            c3: 2.0e-10,
5043        };
5044        assert_eq!(branch_cell(cell).unwrap(), ExactCellBranch::Sextic);
5045
5046        let state = evaluate_cell_moments(cell, 9).expect("degenerate sextic cell");
5047        let quartic_cell = DenestedCubicCell { c3: 0.0, ..cell };
5048        let quartic = evaluate_cell_moments(quartic_cell, 9).expect("quartic cell");
5049        let affine = evaluate_affine_cell_state(
5050            DenestedCubicCell {
5051                c2: 0.0,
5052                c3: 0.0,
5053                ..cell
5054            },
5055            9,
5056        )
5057        .expect("affine cell");
5058
5059        assert_eq!(state.branch, ExactCellBranch::Quartic);
5060        for k in 0..=9 {
5061            assert!(
5062                (state.moments[k] - quartic.moments[k]).abs() < 1e-12,
5063                "lowered moment M{k} should match the quartic cell: {} vs {}",
5064                state.moments[k],
5065                quartic.moments[k]
5066            );
5067        }
5068        assert!(
5069            (state.moments[0] - affine.moments[0]).abs() > 1e-4,
5070            "degenerate sextic handling must not drop the nonzero c2 term"
5071        );
5072    }
5073
5074    #[test]
5075    fn moment_reduced_first_and_second_derivatives_match_numeric_integrals() {
5076        let cell = DenestedCubicCell {
5077            left: -0.9,
5078            right: 0.6,
5079            c0: 0.15,
5080            c1: -0.2,
5081            c2: 0.08,
5082            c3: 0.17,
5083        };
5084        let moments = reduce_sextic_moments(
5085            cell,
5086            [
5087                simpson_integral(cell.left, cell.right, 3000, |z| (-cell.q(z)).exp()),
5088                simpson_integral(cell.left, cell.right, 3000, |z| z * (-cell.q(z)).exp()),
5089                simpson_integral(cell.left, cell.right, 3000, |z| z * z * (-cell.q(z)).exp()),
5090                simpson_integral(cell.left, cell.right, 3000, |z| {
5091                    z.powi(3) * (-cell.q(z)).exp()
5092                }),
5093                simpson_integral(cell.left, cell.right, 3000, |z| {
5094                    z.powi(4) * (-cell.q(z)).exp()
5095                }),
5096            ],
5097            9,
5098        )
5099        .expect("reduced moments");
5100
5101        let r = [0.7, -0.1, 0.3];
5102        let s = [0.2, 0.5];
5103        let second = [0.4, -0.2, 0.1];
5104        let exact_first = cell_first_derivative_from_moments(&r, &moments).expect("first");
5105        let exact_second =
5106            cell_second_derivative_from_moments(cell, &r, &s, &second, &moments).expect("second");
5107
5108        let numeric_first = simpson_integral(cell.left, cell.right, 3000, |z| {
5109            polynomial_value(&r, z) * (-cell.q(z)).exp() / (2.0 * std::f64::consts::PI)
5110        });
5111        let numeric_second = simpson_integral(cell.left, cell.right, 3000, |z| {
5112            let eta = cell.eta(z);
5113            (polynomial_value(&second, z) - eta * polynomial_value(&r, z) * polynomial_value(&s, z))
5114                * (-cell.q(z)).exp()
5115                / (2.0 * std::f64::consts::PI)
5116        });
5117
5118        assert!((exact_first - numeric_first).abs() < 1e-7);
5119        assert!((exact_second - numeric_second).abs() < 1e-7);
5120    }
5121
5122    #[test]
5123    fn moment_reduced_third_derivative_matches_numeric_integral() {
5124        let cell = DenestedCubicCell {
5125            left: -0.85,
5126            right: 0.7,
5127            c0: -0.12,
5128            c1: 0.18,
5129            c2: 0.09,
5130            c3: -0.11,
5131        };
5132        let moments = evaluate_cell_moments(cell, 12).expect("cell moments");
5133        let r = [0.35, -0.12, 0.08];
5134        let s = [0.17, 0.09];
5135        let t = [-0.21, 0.14, -0.04];
5136        let rs = [0.11, -0.07, 0.05];
5137        let rt = [-0.06, 0.03];
5138        let st = [0.08, -0.02, 0.01];
5139        let rst = [0.04, -0.05, 0.02];
5140
5141        let exact_third = cell_third_derivative_from_moments(
5142            cell,
5143            &r,
5144            &s,
5145            &t,
5146            &rs,
5147            &rt,
5148            &st,
5149            &rst,
5150            &moments.moments,
5151        )
5152        .expect("third derivative");
5153        let numeric_third = simpson_integral(cell.left, cell.right, 4000, |z| {
5154            let eta = cell.eta(z);
5155            let rz = polynomial_value(&r, z);
5156            let sz = polynomial_value(&s, z);
5157            let tz = polynomial_value(&t, z);
5158            let rsz = polynomial_value(&rs, z);
5159            let rtz = polynomial_value(&rt, z);
5160            let stz = polynomial_value(&st, z);
5161            let rstz = polynomial_value(&rst, z);
5162            (rstz - eta * (rsz * tz + rtz * sz + stz * rz) + (eta * eta - 1.0) * rz * sz * tz)
5163                * (-cell.q(z)).exp()
5164                / (2.0 * std::f64::consts::PI)
5165        });
5166
5167        assert!((exact_third - numeric_third).abs() < 1e-7);
5168    }
5169
5170    #[test]
5171    fn moment_reduced_fourth_derivative_matches_numeric_integral() {
5172        let cell = DenestedCubicCell {
5173            left: -0.8,
5174            right: 0.65,
5175            c0: 0.11,
5176            c1: -0.22,
5177            c2: 0.07,
5178            c3: 0.13,
5179        };
5180        let moments = evaluate_cell_moments(cell, 16).expect("cell moments");
5181        let r = [0.21, -0.13, 0.06];
5182        let s = [-0.18, 0.04];
5183        let t = [0.09, 0.07, -0.03];
5184        let u = [-0.14, 0.05];
5185        let rs = [0.08, -0.03, 0.02];
5186        let rt = [-0.05, 0.01];
5187        let ru = [0.04, -0.02, 0.01];
5188        let st = [0.03, 0.02];
5189        let su = [-0.02, 0.05, -0.01];
5190        let tu = [0.07, -0.04];
5191        let rst = [0.03, -0.01, 0.02];
5192        let rsu = [-0.02, 0.04];
5193        let rtu = [0.01, 0.02, -0.01];
5194        let stu = [-0.03, 0.02];
5195        let rstu = [0.02, -0.01, 0.01];
5196
5197        let exact_fourth = cell_fourth_derivative_from_moments(
5198            cell,
5199            &r,
5200            &s,
5201            &t,
5202            &u,
5203            &rs,
5204            &rt,
5205            &ru,
5206            &st,
5207            &su,
5208            &tu,
5209            &rst,
5210            &rsu,
5211            &rtu,
5212            &stu,
5213            &rstu,
5214            &moments.moments,
5215        )
5216        .expect("fourth derivative");
5217        let numeric_fourth = simpson_integral(cell.left, cell.right, 5000, |z| {
5218            let eta = cell.eta(z);
5219            let rz = polynomial_value(&r, z);
5220            let sz = polynomial_value(&s, z);
5221            let tz = polynomial_value(&t, z);
5222            let uz = polynomial_value(&u, z);
5223            let rsz = polynomial_value(&rs, z);
5224            let rtz = polynomial_value(&rt, z);
5225            let ruz = polynomial_value(&ru, z);
5226            let stz = polynomial_value(&st, z);
5227            let suz = polynomial_value(&su, z);
5228            let tuz = polynomial_value(&tu, z);
5229            let rstz = polynomial_value(&rst, z);
5230            let rsuz = polynomial_value(&rsu, z);
5231            let rtuz = polynomial_value(&rtu, z);
5232            let stuz = polynomial_value(&stu, z);
5233            let rstuz = polynomial_value(&rstu, z);
5234            let linear =
5235                rstz * uz + rsuz * tz + rtuz * sz + stuz * rz + rsz * tuz + rtz * suz + ruz * stz;
5236            let quadratic = rsz * tz * uz
5237                + rtz * sz * uz
5238                + ruz * sz * tz
5239                + stz * rz * uz
5240                + suz * rz * tz
5241                + tuz * rz * sz;
5242            let quartic = rz * sz * tz * uz;
5243            (rstuz - eta * linear
5244                + (eta * eta - 1.0) * quadratic
5245                + (-eta * eta * eta + 3.0 * eta) * quartic)
5246                * (-cell.q(z)).exp()
5247                / (2.0 * std::f64::consts::PI)
5248        });
5249
5250        assert!((exact_fourth - numeric_fourth).abs() < 2e-7);
5251    }
5252
5253    #[test]
5254    fn denested_cell_parameter_derivatives_match_exact_integrands() {
5255        let score_span = LocalSpanCubic {
5256            left: -0.75,
5257            right: 0.25,
5258            c0: 0.08,
5259            c1: -0.03,
5260            c2: 0.02,
5261            c3: -0.01,
5262        };
5263        let link_span = LocalSpanCubic {
5264            left: -0.6,
5265            right: 0.9,
5266            c0: -0.05,
5267            c1: 0.04,
5268            c2: -0.02,
5269            c3: 0.015,
5270        };
5271        let a = 0.3;
5272        let b = -0.7;
5273        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5274        let cell = DenestedCubicCell {
5275            left: score_span.left,
5276            right: score_span.right,
5277            c0: coeffs[0],
5278            c1: coeffs[1],
5279            c2: coeffs[2],
5280            c3: coeffs[3],
5281        };
5282        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5283        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5284        let (dc_daa, dc_dab, dc_dbb) = denested_cell_second_partials(score_span, link_span, a, b);
5285        let (dc_daaa, dc_daab, dc_dabb, dc_dbbb) = denested_cell_third_partials(link_span);
5286        let zero = [0.0; 4];
5287        let link_third = 6.0 * link_span.c3;
5288
5289        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5290        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5291        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
5292        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5293        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5294        let eta_aaa = |z: f64| link_third + 0.0 * z;
5295        let eta_aab = |z: f64| z * link_third;
5296        let eta_abb = |z: f64| z * z * link_third;
5297        let eta_bbb = |z: f64| z * z * z * link_third;
5298
5299        let exact_a = cell_first_derivative_from_moments(&dc_da, &state.moments).expect("a");
5300        let exact_b = cell_first_derivative_from_moments(&dc_db, &state.moments).expect("b");
5301        let exact_aa =
5302            cell_second_derivative_from_moments(cell, &dc_da, &dc_da, &dc_daa, &state.moments)
5303                .expect("aa");
5304        let exact_ab =
5305            cell_second_derivative_from_moments(cell, &dc_da, &dc_db, &dc_dab, &state.moments)
5306                .expect("ab");
5307        let exact_bb =
5308            cell_second_derivative_from_moments(cell, &dc_db, &dc_db, &dc_dbb, &state.moments)
5309                .expect("bb");
5310        let exact_aaa = cell_third_derivative_from_moments(
5311            cell,
5312            &dc_da,
5313            &dc_da,
5314            &dc_da,
5315            &dc_daa,
5316            &dc_daa,
5317            &dc_daa,
5318            &dc_daaa,
5319            &state.moments,
5320        )
5321        .expect("aaa");
5322        let exact_aab = cell_third_derivative_from_moments(
5323            cell,
5324            &dc_da,
5325            &dc_da,
5326            &dc_db,
5327            &dc_daa,
5328            &dc_dab,
5329            &dc_dab,
5330            &dc_daab,
5331            &state.moments,
5332        )
5333        .expect("aab");
5334        let exact_abb = cell_third_derivative_from_moments(
5335            cell,
5336            &dc_da,
5337            &dc_db,
5338            &dc_db,
5339            &dc_dab,
5340            &dc_dab,
5341            &dc_dbb,
5342            &dc_dabb,
5343            &state.moments,
5344        )
5345        .expect("abb");
5346        let exact_bbb = cell_third_derivative_from_moments(
5347            cell,
5348            &dc_db,
5349            &dc_db,
5350            &dc_db,
5351            &dc_dbb,
5352            &dc_dbb,
5353            &dc_dbb,
5354            &dc_dbbb,
5355            &state.moments,
5356        )
5357        .expect("bbb");
5358        let exact_aaaa = cell_fourth_derivative_from_moments(
5359            cell,
5360            &dc_da,
5361            &dc_da,
5362            &dc_da,
5363            &dc_da,
5364            &dc_daa,
5365            &dc_daa,
5366            &dc_daa,
5367            &dc_daa,
5368            &dc_daa,
5369            &dc_daa,
5370            &dc_daaa,
5371            &dc_daaa,
5372            &dc_daaa,
5373            &dc_daaa,
5374            &zero,
5375            &state.moments,
5376        )
5377        .expect("aaaa");
5378        let exact_aaab = cell_fourth_derivative_from_moments(
5379            cell,
5380            &dc_da,
5381            &dc_da,
5382            &dc_da,
5383            &dc_db,
5384            &dc_daa,
5385            &dc_daa,
5386            &dc_dab,
5387            &dc_daa,
5388            &dc_dab,
5389            &dc_dab,
5390            &dc_daaa,
5391            &dc_daab,
5392            &dc_daab,
5393            &dc_daab,
5394            &zero,
5395            &state.moments,
5396        )
5397        .expect("aaab");
5398        let exact_aabb = cell_fourth_derivative_from_moments(
5399            cell,
5400            &dc_da,
5401            &dc_da,
5402            &dc_db,
5403            &dc_db,
5404            &dc_daa,
5405            &dc_dab,
5406            &dc_dab,
5407            &dc_dab,
5408            &dc_dab,
5409            &dc_dbb,
5410            &dc_daab,
5411            &dc_daab,
5412            &dc_dabb,
5413            &dc_dabb,
5414            &zero,
5415            &state.moments,
5416        )
5417        .expect("aabb");
5418        let exact_abbb = cell_fourth_derivative_from_moments(
5419            cell,
5420            &dc_da,
5421            &dc_db,
5422            &dc_db,
5423            &dc_db,
5424            &dc_dab,
5425            &dc_dab,
5426            &dc_dab,
5427            &dc_dbb,
5428            &dc_dbb,
5429            &dc_dbb,
5430            &dc_dabb,
5431            &dc_dabb,
5432            &dc_dabb,
5433            &dc_dbbb,
5434            &zero,
5435            &state.moments,
5436        )
5437        .expect("abbb");
5438        let exact_bbbb = cell_fourth_derivative_from_moments(
5439            cell,
5440            &dc_db,
5441            &dc_db,
5442            &dc_db,
5443            &dc_db,
5444            &dc_dbb,
5445            &dc_dbb,
5446            &dc_dbb,
5447            &dc_dbb,
5448            &dc_dbb,
5449            &dc_dbb,
5450            &dc_dbbb,
5451            &dc_dbbb,
5452            &dc_dbbb,
5453            &dc_dbbb,
5454            &zero,
5455            &state.moments,
5456        )
5457        .expect("bbbb");
5458
5459        let numeric_a = simpson_integral(cell.left, cell.right, 5000, |z| {
5460            eta_a(z) * (-cell.q(z)).exp() * INV_TWO_PI
5461        });
5462        let numeric_b = simpson_integral(cell.left, cell.right, 5000, |z| {
5463            eta_b(z) * (-cell.q(z)).exp() * INV_TWO_PI
5464        });
5465        let numeric_aa = simpson_integral(cell.left, cell.right, 5000, |z| {
5466            (eta_aa(z) - cell.eta(z) * eta_a(z) * eta_a(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5467        });
5468        let numeric_ab = simpson_integral(cell.left, cell.right, 5000, |z| {
5469            (eta_ab(z) - cell.eta(z) * eta_a(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5470        });
5471        let numeric_bb = simpson_integral(cell.left, cell.right, 5000, |z| {
5472            (eta_bb(z) - cell.eta(z) * eta_b(z) * eta_b(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5473        });
5474        let numeric_aaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5475            let eta = cell.eta(z);
5476            (eta_aaa(z) - 3.0 * eta * eta_aa(z) * eta_a(z) + (eta * eta - 1.0) * eta_a(z).powi(3))
5477                * (-cell.q(z)).exp()
5478                * INV_TWO_PI
5479        });
5480        let numeric_aab = simpson_integral(cell.left, cell.right, 5000, |z| {
5481            let eta = cell.eta(z);
5482            let a_z = eta_a(z);
5483            let b_z = eta_b(z);
5484            (eta_aab(z) - eta * (eta_aa(z) * b_z + 2.0 * eta_ab(z) * a_z)
5485                + (eta * eta - 1.0) * a_z * a_z * b_z)
5486                * (-cell.q(z)).exp()
5487                * INV_TWO_PI
5488        });
5489        let numeric_abb = simpson_integral(cell.left, cell.right, 5000, |z| {
5490            let eta = cell.eta(z);
5491            let a_z = eta_a(z);
5492            let b_z = eta_b(z);
5493            (eta_abb(z) - eta * (2.0 * eta_ab(z) * b_z + eta_bb(z) * a_z)
5494                + (eta * eta - 1.0) * a_z * b_z * b_z)
5495                * (-cell.q(z)).exp()
5496                * INV_TWO_PI
5497        });
5498        let numeric_bbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5499            let eta = cell.eta(z);
5500            (eta_bbb(z) - 3.0 * eta * eta_bb(z) * eta_b(z) + (eta * eta - 1.0) * eta_b(z).powi(3))
5501                * (-cell.q(z)).exp()
5502                * INV_TWO_PI
5503        });
5504        let numeric_aaaa = simpson_integral(cell.left, cell.right, 5000, |z| {
5505            let eta = cell.eta(z);
5506            let eta_a_z = eta_a(z);
5507            let eta_aa_z = eta_aa(z);
5508            let eta_aaa_z = eta_aaa(z);
5509            (-eta * (4.0 * eta_aaa_z * eta_a_z + 3.0 * eta_aa_z * eta_aa_z)
5510                + (eta * eta - 1.0) * (6.0 * eta_aa_z * eta_a_z * eta_a_z)
5511                + (-eta * eta * eta + 3.0 * eta) * eta_a_z.powi(4))
5512                * (-cell.q(z)).exp()
5513                * INV_TWO_PI
5514        });
5515        let numeric_aaab = simpson_integral(cell.left, cell.right, 5000, |z| {
5516            let eta = cell.eta(z);
5517            let a_z = eta_a(z);
5518            let b_z = eta_b(z);
5519            let aa_z = eta_aa(z);
5520            let ab_z = eta_ab(z);
5521            let aaa_z = eta_aaa(z);
5522            let aab_z = eta_aab(z);
5523            (-eta * (aaa_z * b_z + 3.0 * aab_z * a_z + 3.0 * aa_z * ab_z)
5524                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * b_z + 3.0 * ab_z * a_z * a_z)
5525                + (-eta * eta * eta + 3.0 * eta) * a_z.powi(3) * b_z)
5526                * (-cell.q(z)).exp()
5527                * INV_TWO_PI
5528        });
5529        let numeric_aabb = simpson_integral(cell.left, cell.right, 5000, |z| {
5530            let eta = cell.eta(z);
5531            let a_z = eta_a(z);
5532            let b_z = eta_b(z);
5533            let aa_z = eta_aa(z);
5534            let ab_z = eta_ab(z);
5535            let bb_z = eta_bb(z);
5536            let aab_z = eta_aab(z);
5537            let abb_z = eta_abb(z);
5538            (-eta * (2.0 * aab_z * b_z + 2.0 * abb_z * a_z + aa_z * bb_z + 2.0 * ab_z * ab_z)
5539                + (eta * eta - 1.0)
5540                    * (aa_z * b_z * b_z + 4.0 * ab_z * a_z * b_z + bb_z * a_z * a_z)
5541                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * b_z * b_z)
5542                * (-cell.q(z)).exp()
5543                * INV_TWO_PI
5544        });
5545        let numeric_abbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5546            let eta = cell.eta(z);
5547            let a_z = eta_a(z);
5548            let b_z = eta_b(z);
5549            let ab_z = eta_ab(z);
5550            let bb_z = eta_bb(z);
5551            let abb_z = eta_abb(z);
5552            let bbb_z = eta_bbb(z);
5553            (-eta * (3.0 * abb_z * b_z + bbb_z * a_z + 3.0 * ab_z * bb_z)
5554                + (eta * eta - 1.0) * (3.0 * ab_z * b_z * b_z + 3.0 * bb_z * a_z * b_z)
5555                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z.powi(3))
5556                * (-cell.q(z)).exp()
5557                * INV_TWO_PI
5558        });
5559        let numeric_bbbb = simpson_integral(cell.left, cell.right, 5000, |z| {
5560            let eta = cell.eta(z);
5561            let eta_b_z = eta_b(z);
5562            let eta_bb_z = eta_bb(z);
5563            let eta_bbb_z = eta_bbb(z);
5564            (-eta * (4.0 * eta_bbb_z * eta_b_z + 3.0 * eta_bb_z * eta_bb_z)
5565                + (eta * eta - 1.0) * (6.0 * eta_bb_z * eta_b_z * eta_b_z)
5566                + (-eta * eta * eta + 3.0 * eta) * eta_b_z.powi(4))
5567                * (-cell.q(z)).exp()
5568                * INV_TWO_PI
5569        });
5570
5571        assert!((exact_a - numeric_a).abs() < 1e-8);
5572        assert!((exact_b - numeric_b).abs() < 1e-8);
5573        assert!((exact_aa - numeric_aa).abs() < 1e-8);
5574        assert!((exact_ab - numeric_ab).abs() < 1e-8);
5575        assert!((exact_bb - numeric_bb).abs() < 1e-8);
5576        assert!((exact_aaa - numeric_aaa).abs() < 2e-7);
5577        assert!((exact_aab - numeric_aab).abs() < 2e-7);
5578        assert!((exact_abb - numeric_abb).abs() < 2e-7);
5579        assert!((exact_bbb - numeric_bbb).abs() < 2e-7);
5580        assert!((exact_aaaa - numeric_aaaa).abs() < 2e-6);
5581        assert!((exact_aaab - numeric_aaab).abs() < 2e-6);
5582        assert!((exact_aabb - numeric_aabb).abs() < 2e-6);
5583        assert!((exact_abbb - numeric_abbb).abs() < 2e-6);
5584        assert!((exact_bbbb - numeric_bbbb).abs() < 2e-6);
5585    }
5586
5587    #[test]
5588    fn link_basis_cell_derivatives_match_exact_integrands() {
5589        let score_span = LocalSpanCubic {
5590            left: -0.75,
5591            right: 0.25,
5592            c0: 0.08,
5593            c1: -0.03,
5594            c2: 0.02,
5595            c3: -0.01,
5596        };
5597        let link_span = LocalSpanCubic {
5598            left: -0.6,
5599            right: 0.9,
5600            c0: -0.05,
5601            c1: 0.04,
5602            c2: -0.02,
5603            c3: 0.015,
5604        };
5605        let link_basis_span = LocalSpanCubic {
5606            left: -0.6,
5607            right: 0.9,
5608            c0: 0.02,
5609            c1: -0.01,
5610            c2: 0.03,
5611            c3: -0.02,
5612        };
5613        let a = 0.3;
5614        let b = -0.7;
5615        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5616        let cell = DenestedCubicCell {
5617            left: score_span.left,
5618            right: score_span.right,
5619            c0: coeffs[0],
5620            c1: coeffs[1],
5621            c2: coeffs[2],
5622            c3: coeffs[3],
5623        };
5624        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
5625        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
5626        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
5627        let dc_daa = second_partials.0;
5628        let dc_dab = second_partials.1;
5629        let dc_dbb = second_partials.2;
5630        let denested_third = denested_cell_third_partials(link_span);
5631        let dc_daaa = denested_third.0;
5632        let dc_dbbb = denested_third.3;
5633
5634        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
5635        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
5636        let (coeff_aaw, coeff_abw, coeff_bbw) =
5637            link_basis_cell_second_partials(link_basis_span, a, b);
5638        let link_basis_third = link_basis_cell_third_partials(link_basis_span);
5639        let coeff_aaaw = link_basis_third.0;
5640        let coeff_bbbw = link_basis_third.3;
5641        let zero = [0.0; 4];
5642        let basis_third = 6.0 * link_basis_span.c3;
5643
5644        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
5645        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
5646        let eta_aa = |z: f64| link_span.second_derivative(a + b * z);
5647        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
5648        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
5649        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
5650        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
5651        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
5652        let eta_aaw = |z: f64| link_basis_span.second_derivative(a + b * z);
5653        let eta_abw = |z: f64| z * link_basis_span.second_derivative(a + b * z);
5654        let eta_bbw = |z: f64| z * z * link_basis_span.second_derivative(a + b * z);
5655        let eta_aaaw = |z: f64| basis_third + 0.0 * z;
5656        let eta_bbbw = |z: f64| z * z * z * basis_third;
5657
5658        let exact_w = cell_first_derivative_from_moments(&coeff_w, &state.moments).expect("w");
5659        let exact_aw =
5660            cell_second_derivative_from_moments(cell, &dc_da, &coeff_w, &coeff_aw, &state.moments)
5661                .expect("aw");
5662        let exact_bw =
5663            cell_second_derivative_from_moments(cell, &dc_db, &coeff_w, &coeff_bw, &state.moments)
5664                .expect("bw");
5665        let exact_ww =
5666            cell_second_derivative_from_moments(cell, &coeff_w, &coeff_w, &zero, &state.moments)
5667                .expect("ww");
5668        let exact_aaw = cell_third_derivative_from_moments(
5669            cell,
5670            &dc_da,
5671            &dc_da,
5672            &coeff_w,
5673            &dc_daa,
5674            &coeff_aw,
5675            &coeff_aw,
5676            &coeff_aaw,
5677            &state.moments,
5678        )
5679        .expect("aaw");
5680        let exact_abw = cell_third_derivative_from_moments(
5681            cell,
5682            &dc_da,
5683            &dc_db,
5684            &coeff_w,
5685            &dc_dab,
5686            &coeff_aw,
5687            &coeff_bw,
5688            &coeff_abw,
5689            &state.moments,
5690        )
5691        .expect("abw");
5692        let exact_bbw = cell_third_derivative_from_moments(
5693            cell,
5694            &dc_db,
5695            &dc_db,
5696            &coeff_w,
5697            &dc_dbb,
5698            &coeff_bw,
5699            &coeff_bw,
5700            &coeff_bbw,
5701            &state.moments,
5702        )
5703        .expect("bbw");
5704        let exact_www = cell_third_derivative_from_moments(
5705            cell,
5706            &coeff_w,
5707            &coeff_w,
5708            &coeff_w,
5709            &zero,
5710            &zero,
5711            &zero,
5712            &zero,
5713            &state.moments,
5714        )
5715        .expect("www");
5716        let exact_aaaw = cell_fourth_derivative_from_moments(
5717            cell,
5718            &dc_da,
5719            &dc_da,
5720            &dc_da,
5721            &coeff_w,
5722            &dc_daa,
5723            &dc_daa,
5724            &coeff_aw,
5725            &dc_daa,
5726            &coeff_aw,
5727            &coeff_aw,
5728            &dc_daaa,
5729            &coeff_aaw,
5730            &coeff_aaw,
5731            &coeff_aaw,
5732            &coeff_aaaw,
5733            &state.moments,
5734        )
5735        .expect("aaaw");
5736        let exact_aaww = cell_fourth_derivative_from_moments(
5737            cell,
5738            &dc_da,
5739            &dc_da,
5740            &coeff_w,
5741            &coeff_w,
5742            &dc_daa,
5743            &coeff_aw,
5744            &coeff_aw,
5745            &coeff_aw,
5746            &coeff_aw,
5747            &zero,
5748            &coeff_aaw,
5749            &coeff_aaw,
5750            &zero,
5751            &zero,
5752            &zero,
5753            &state.moments,
5754        )
5755        .expect("aaww");
5756        let exact_abww = cell_fourth_derivative_from_moments(
5757            cell,
5758            &dc_da,
5759            &dc_db,
5760            &coeff_w,
5761            &coeff_w,
5762            &dc_dab,
5763            &coeff_aw,
5764            &coeff_aw,
5765            &coeff_bw,
5766            &coeff_bw,
5767            &zero,
5768            &coeff_abw,
5769            &coeff_abw,
5770            &zero,
5771            &zero,
5772            &zero,
5773            &state.moments,
5774        )
5775        .expect("abww");
5776        let exact_bbww = cell_fourth_derivative_from_moments(
5777            cell,
5778            &dc_db,
5779            &dc_db,
5780            &coeff_w,
5781            &coeff_w,
5782            &dc_dbb,
5783            &coeff_bw,
5784            &coeff_bw,
5785            &coeff_bw,
5786            &coeff_bw,
5787            &zero,
5788            &coeff_bbw,
5789            &coeff_bbw,
5790            &zero,
5791            &zero,
5792            &zero,
5793            &state.moments,
5794        )
5795        .expect("bbww");
5796        let exact_bbbw = cell_fourth_derivative_from_moments(
5797            cell,
5798            &dc_db,
5799            &dc_db,
5800            &dc_db,
5801            &coeff_w,
5802            &dc_dbb,
5803            &dc_dbb,
5804            &coeff_bw,
5805            &dc_dbb,
5806            &coeff_bw,
5807            &coeff_bw,
5808            &dc_dbbb,
5809            &coeff_bbw,
5810            &coeff_bbw,
5811            &coeff_bbw,
5812            &coeff_bbbw,
5813            &state.moments,
5814        )
5815        .expect("bbbw");
5816        let exact_wwww = cell_fourth_derivative_from_moments(
5817            cell,
5818            &coeff_w,
5819            &coeff_w,
5820            &coeff_w,
5821            &coeff_w,
5822            &zero,
5823            &zero,
5824            &zero,
5825            &zero,
5826            &zero,
5827            &zero,
5828            &zero,
5829            &zero,
5830            &zero,
5831            &zero,
5832            &zero,
5833            &state.moments,
5834        )
5835        .expect("wwww");
5836
5837        let numeric_w = simpson_integral(cell.left, cell.right, 5000, |z| {
5838            eta_w(z) * (-cell.q(z)).exp() * INV_TWO_PI
5839        });
5840        let numeric_aw = simpson_integral(cell.left, cell.right, 5000, |z| {
5841            (eta_aw(z) - cell.eta(z) * eta_a(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5842        });
5843        let numeric_bw = simpson_integral(cell.left, cell.right, 5000, |z| {
5844            (eta_bw(z) - cell.eta(z) * eta_b(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5845        });
5846        let numeric_ww = simpson_integral(cell.left, cell.right, 5000, |z| {
5847            (-cell.eta(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
5848        });
5849        let numeric_aaw = simpson_integral(cell.left, cell.right, 5000, |z| {
5850            let eta = cell.eta(z);
5851            let w_z = eta_w(z);
5852            let a_z = eta_a(z);
5853            (eta_aaw(z) - eta * (eta_aa(z) * w_z + 2.0 * eta_aw(z) * a_z)
5854                + (eta * eta - 1.0) * a_z * a_z * w_z)
5855                * (-cell.q(z)).exp()
5856                * INV_TWO_PI
5857        });
5858        let numeric_abw = simpson_integral(cell.left, cell.right, 5000, |z| {
5859            let eta = cell.eta(z);
5860            let w_z = eta_w(z);
5861            let a_z = eta_a(z);
5862            let b_z = eta_b(z);
5863            (eta_abw(z) - eta * (eta_ab(z) * w_z + eta_aw(z) * b_z + eta_bw(z) * a_z)
5864                + (eta * eta - 1.0) * a_z * b_z * w_z)
5865                * (-cell.q(z)).exp()
5866                * INV_TWO_PI
5867        });
5868        let numeric_bbw = simpson_integral(cell.left, cell.right, 5000, |z| {
5869            let eta = cell.eta(z);
5870            let w_z = eta_w(z);
5871            let b_z = eta_b(z);
5872            (eta_bbw(z) - eta * (eta_bb(z) * w_z + 2.0 * eta_bw(z) * b_z)
5873                + (eta * eta - 1.0) * b_z * b_z * w_z)
5874                * (-cell.q(z)).exp()
5875                * INV_TWO_PI
5876        });
5877        let numeric_www = simpson_integral(cell.left, cell.right, 5000, |z| {
5878            let eta = cell.eta(z);
5879            let w_z = eta_w(z);
5880            ((eta * eta - 1.0) * w_z * w_z * w_z) * (-cell.q(z)).exp() * INV_TWO_PI
5881        });
5882        let numeric_aaaw = simpson_integral(cell.left, cell.right, 5000, |z| {
5883            let eta = cell.eta(z);
5884            let a_z = eta_a(z);
5885            let w_z = eta_w(z);
5886            let aa_z = eta_aa(z);
5887            let aw_z = eta_aw(z);
5888            (eta_aaaw(z)
5889                - eta * ((dc_daaa[0] + 0.0 * z) * w_z + 3.0 * eta_aaw(z) * a_z + 3.0 * aa_z * aw_z)
5890                + (eta * eta - 1.0) * (3.0 * aa_z * a_z * w_z + 3.0 * aw_z * a_z * a_z)
5891                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * a_z * w_z)
5892                * (-cell.q(z)).exp()
5893                * INV_TWO_PI
5894        });
5895        let numeric_aaww = simpson_integral(cell.left, cell.right, 5000, |z| {
5896            let eta = cell.eta(z);
5897            let a_z = eta_a(z);
5898            let w_z = eta_w(z);
5899            let aw_z = eta_aw(z);
5900            (-(2.0 * eta * (eta_aaw(z) * w_z + aw_z * aw_z))
5901                + (eta * eta - 1.0) * (eta_aa(z) * w_z * w_z + 4.0 * aw_z * a_z * w_z)
5902                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * w_z * w_z)
5903                * (-cell.q(z)).exp()
5904                * INV_TWO_PI
5905        });
5906        let numeric_abww = simpson_integral(cell.left, cell.right, 5000, |z| {
5907            let eta = cell.eta(z);
5908            let a_z = eta_a(z);
5909            let b_z = eta_b(z);
5910            let w_z = eta_w(z);
5911            let aw_z = eta_aw(z);
5912            let bw_z = eta_bw(z);
5913            (-(2.0 * eta * (eta_abw(z) * w_z + aw_z * bw_z))
5914                + (eta * eta - 1.0)
5915                    * (eta_ab(z) * w_z * w_z + 2.0 * aw_z * b_z * w_z + 2.0 * bw_z * a_z * w_z)
5916                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * w_z * w_z)
5917                * (-cell.q(z)).exp()
5918                * INV_TWO_PI
5919        });
5920        let numeric_bbww = simpson_integral(cell.left, cell.right, 5000, |z| {
5921            let eta = cell.eta(z);
5922            let b_z = eta_b(z);
5923            let w_z = eta_w(z);
5924            let bw_z = eta_bw(z);
5925            (-(2.0 * eta * (eta_bbw(z) * w_z + bw_z * bw_z))
5926                + (eta * eta - 1.0) * (eta_bb(z) * w_z * w_z + 4.0 * bw_z * b_z * w_z)
5927                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * w_z * w_z)
5928                * (-cell.q(z)).exp()
5929                * INV_TWO_PI
5930        });
5931        let numeric_bbbw = simpson_integral(cell.left, cell.right, 5000, |z| {
5932            let eta = cell.eta(z);
5933            let b_z = eta_b(z);
5934            let w_z = eta_w(z);
5935            let bb_z = eta_bb(z);
5936            let bw_z = eta_bw(z);
5937            (eta_bbbw(z)
5938                - eta
5939                    * ((dc_dbbb[3] * z * z * z) * w_z + 3.0 * eta_bbw(z) * b_z + 3.0 * bb_z * bw_z)
5940                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * w_z + 3.0 * bw_z * b_z * b_z)
5941                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * w_z)
5942                * (-cell.q(z)).exp()
5943                * INV_TWO_PI
5944        });
5945        let numeric_wwww = simpson_integral(cell.left, cell.right, 5000, |z| {
5946            let eta = cell.eta(z);
5947            let w_z = eta_w(z);
5948            ((-eta * eta * eta + 3.0 * eta) * w_z * w_z * w_z * w_z)
5949                * (-cell.q(z)).exp()
5950                * INV_TWO_PI
5951        });
5952
5953        assert!((exact_w - numeric_w).abs() < 1e-8);
5954        assert!((exact_aw - numeric_aw).abs() < 1e-7);
5955        assert!((exact_bw - numeric_bw).abs() < 1e-7);
5956        assert!((exact_ww - numeric_ww).abs() < 1e-7);
5957        assert!((exact_aaw - numeric_aaw).abs() < 2e-6);
5958        assert!((exact_abw - numeric_abw).abs() < 2e-6);
5959        assert!((exact_bbw - numeric_bbw).abs() < 2e-6);
5960        assert!((exact_www - numeric_www).abs() < 2e-6);
5961        assert!((exact_aaaw - numeric_aaaw).abs() < 3e-6);
5962        assert!((exact_aaww - numeric_aaww).abs() < 3e-6);
5963        assert!((exact_abww - numeric_abww).abs() < 3e-6);
5964        assert!((exact_bbww - numeric_bbww).abs() < 3e-6);
5965        assert!((exact_bbbw - numeric_bbbw).abs() < 3e-6);
5966        assert!((exact_wwww - numeric_wwww).abs() < 3e-6);
5967    }
5968
5969    #[test]
5970    fn score_basis_cell_derivatives_match_exact_integrands() {
5971        let score_span = LocalSpanCubic {
5972            left: -0.75,
5973            right: 0.25,
5974            c0: 0.08,
5975            c1: -0.03,
5976            c2: 0.02,
5977            c3: -0.01,
5978        };
5979        let score_basis_span = LocalSpanCubic {
5980            left: -0.75,
5981            right: 0.25,
5982            c0: -0.04,
5983            c1: 0.06,
5984            c2: -0.01,
5985            c3: 0.02,
5986        };
5987        let link_span = LocalSpanCubic {
5988            left: -0.6,
5989            right: 0.9,
5990            c0: -0.05,
5991            c1: 0.04,
5992            c2: -0.02,
5993            c3: 0.015,
5994        };
5995        let a = 0.3;
5996        let b = -0.7;
5997        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
5998        let cell = DenestedCubicCell {
5999            left: score_span.left,
6000            right: score_span.right,
6001            c0: coeffs[0],
6002            c1: coeffs[1],
6003            c2: coeffs[2],
6004            c3: coeffs[3],
6005        };
6006        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6007        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6008        let second_partials = denested_cell_second_partials(score_span, link_span, a, b);
6009        let dc_daa = second_partials.0;
6010        let dc_dab = second_partials.1;
6011        let dc_dbb = second_partials.2;
6012        let denested_third = denested_cell_third_partials(link_span);
6013        let dc_dbbb = denested_third.3;
6014
6015        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6016        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6017        let zero = [0.0; 4];
6018
6019        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6020        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6021        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6022        let eta_bb = |z: f64| z * z * link_span.second_derivative(a + b * z);
6023        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6024        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6025
6026        let exact_h = cell_first_derivative_from_moments(&coeff_h, &state.moments).expect("h");
6027        let exact_ah =
6028            cell_second_derivative_from_moments(cell, &dc_da, &coeff_h, &zero, &state.moments)
6029                .expect("ah");
6030        let exact_bh =
6031            cell_second_derivative_from_moments(cell, &dc_db, &coeff_h, &coeff_bh, &state.moments)
6032                .expect("bh");
6033        let exact_hh =
6034            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_h, &zero, &state.moments)
6035                .expect("hh");
6036        let exact_abh = cell_third_derivative_from_moments(
6037            cell,
6038            &dc_da,
6039            &dc_db,
6040            &coeff_h,
6041            &dc_dab,
6042            &zero,
6043            &coeff_bh,
6044            &zero,
6045            &state.moments,
6046        )
6047        .expect("abh");
6048        let exact_bbh = cell_third_derivative_from_moments(
6049            cell,
6050            &dc_db,
6051            &dc_db,
6052            &coeff_h,
6053            &dc_dbb,
6054            &coeff_bh,
6055            &coeff_bh,
6056            &zero,
6057            &state.moments,
6058        )
6059        .expect("bbh");
6060        let exact_bhh = cell_third_derivative_from_moments(
6061            cell,
6062            &dc_db,
6063            &coeff_h,
6064            &coeff_h,
6065            &coeff_bh,
6066            &coeff_bh,
6067            &zero,
6068            &zero,
6069            &state.moments,
6070        )
6071        .expect("bhh");
6072        let exact_hhh = cell_third_derivative_from_moments(
6073            cell,
6074            &coeff_h,
6075            &coeff_h,
6076            &coeff_h,
6077            &zero,
6078            &zero,
6079            &zero,
6080            &zero,
6081            &state.moments,
6082        )
6083        .expect("hhh");
6084        let exact_bbbh = cell_fourth_derivative_from_moments(
6085            cell,
6086            &dc_db,
6087            &dc_db,
6088            &dc_db,
6089            &coeff_h,
6090            &dc_dbb,
6091            &dc_dbb,
6092            &coeff_bh,
6093            &dc_dbb,
6094            &coeff_bh,
6095            &coeff_bh,
6096            &dc_dbbb,
6097            &zero,
6098            &zero,
6099            &zero,
6100            &zero,
6101            &state.moments,
6102        )
6103        .expect("bbbh");
6104        let exact_aahh = cell_fourth_derivative_from_moments(
6105            cell,
6106            &dc_da,
6107            &dc_da,
6108            &coeff_h,
6109            &coeff_h,
6110            &dc_daa,
6111            &zero,
6112            &zero,
6113            &zero,
6114            &zero,
6115            &zero,
6116            &zero,
6117            &zero,
6118            &zero,
6119            &zero,
6120            &zero,
6121            &state.moments,
6122        )
6123        .expect("aahh");
6124        let exact_abhh = cell_fourth_derivative_from_moments(
6125            cell,
6126            &dc_da,
6127            &dc_db,
6128            &coeff_h,
6129            &coeff_h,
6130            &dc_dab,
6131            &zero,
6132            &zero,
6133            &coeff_bh,
6134            &coeff_bh,
6135            &zero,
6136            &zero,
6137            &zero,
6138            &zero,
6139            &zero,
6140            &zero,
6141            &state.moments,
6142        )
6143        .expect("abhh");
6144        let exact_bbhh = cell_fourth_derivative_from_moments(
6145            cell,
6146            &dc_db,
6147            &dc_db,
6148            &coeff_h,
6149            &coeff_h,
6150            &dc_dbb,
6151            &coeff_bh,
6152            &coeff_bh,
6153            &coeff_bh,
6154            &coeff_bh,
6155            &zero,
6156            &zero,
6157            &zero,
6158            &zero,
6159            &zero,
6160            &zero,
6161            &state.moments,
6162        )
6163        .expect("bbhh");
6164        let exact_bhhh = cell_fourth_derivative_from_moments(
6165            cell,
6166            &dc_db,
6167            &coeff_h,
6168            &coeff_h,
6169            &coeff_h,
6170            &coeff_bh,
6171            &coeff_bh,
6172            &coeff_bh,
6173            &zero,
6174            &zero,
6175            &zero,
6176            &zero,
6177            &zero,
6178            &zero,
6179            &zero,
6180            &zero,
6181            &state.moments,
6182        )
6183        .expect("bhhh");
6184        let exact_hhhh = cell_fourth_derivative_from_moments(
6185            cell,
6186            &coeff_h,
6187            &coeff_h,
6188            &coeff_h,
6189            &coeff_h,
6190            &zero,
6191            &zero,
6192            &zero,
6193            &zero,
6194            &zero,
6195            &zero,
6196            &zero,
6197            &zero,
6198            &zero,
6199            &zero,
6200            &zero,
6201            &state.moments,
6202        )
6203        .expect("hhhh");
6204
6205        let numeric_h = simpson_integral(cell.left, cell.right, 5000, |z| {
6206            eta_h(z) * (-cell.q(z)).exp() * INV_TWO_PI
6207        });
6208        let numeric_ah = simpson_integral(cell.left, cell.right, 5000, |z| {
6209            (-cell.eta(z) * eta_a(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6210        });
6211        let numeric_bh = simpson_integral(cell.left, cell.right, 5000, |z| {
6212            (eta_bh(z) - cell.eta(z) * eta_b(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6213        });
6214        let numeric_hh = simpson_integral(cell.left, cell.right, 5000, |z| {
6215            (-cell.eta(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6216        });
6217        let numeric_abh = simpson_integral(cell.left, cell.right, 5000, |z| {
6218            let eta = cell.eta(z);
6219            (-(eta * (eta_ab(z) * eta_h(z) + eta_bh(z) * eta_a(z)))
6220                + (eta * eta - 1.0) * eta_a(z) * eta_b(z) * eta_h(z))
6221                * (-cell.q(z)).exp()
6222                * INV_TWO_PI
6223        });
6224        let numeric_bbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6225            let eta = cell.eta(z);
6226            (-(eta * (eta_bb(z) * eta_h(z) + 2.0 * eta_bh(z) * eta_b(z)))
6227                + (eta * eta - 1.0) * eta_b(z) * eta_b(z) * eta_h(z))
6228                * (-cell.q(z)).exp()
6229                * INV_TWO_PI
6230        });
6231        let numeric_bhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6232            let eta = cell.eta(z);
6233            (-(2.0 * eta * eta_bh(z) * eta_h(z))
6234                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_h(z))
6235                * (-cell.q(z)).exp()
6236                * INV_TWO_PI
6237        });
6238        let numeric_hhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6239            let eta = cell.eta(z);
6240            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_h(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6241        });
6242        let numeric_bbbh = simpson_integral(cell.left, cell.right, 5000, |z| {
6243            let eta = cell.eta(z);
6244            let b_z = eta_b(z);
6245            let h_z = eta_h(z);
6246            let bb_z = eta_bb(z);
6247            let bh_z = eta_bh(z);
6248            (-(eta * ((dc_dbbb[3] * z * z * z) * h_z + 3.0 * bb_z * bh_z))
6249                + (eta * eta - 1.0) * (3.0 * bb_z * b_z * h_z + 3.0 * bh_z * b_z * b_z)
6250                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * b_z * h_z)
6251                * (-cell.q(z)).exp()
6252                * INV_TWO_PI
6253        });
6254        let numeric_aahh = simpson_integral(cell.left, cell.right, 5000, |z| {
6255            let eta = cell.eta(z);
6256            let a_z = eta_a(z);
6257            let h_z = eta_h(z);
6258            ((eta * eta - 1.0) * polynomial_value(&dc_daa, z) * h_z * h_z
6259                + (-eta * eta * eta + 3.0 * eta) * a_z * a_z * h_z * h_z)
6260                * (-cell.q(z)).exp()
6261                * INV_TWO_PI
6262        });
6263        let numeric_abhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6264            let eta = cell.eta(z);
6265            let a_z = eta_a(z);
6266            let b_z = eta_b(z);
6267            let h_z = eta_h(z);
6268            ((eta * eta - 1.0) * (eta_ab(z) * h_z * h_z + 2.0 * eta_bh(z) * a_z * h_z)
6269                + (-eta * eta * eta + 3.0 * eta) * a_z * b_z * h_z * h_z)
6270                * (-cell.q(z)).exp()
6271                * INV_TWO_PI
6272        });
6273        let numeric_bbhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6274            let eta = cell.eta(z);
6275            let b_z = eta_b(z);
6276            let h_z = eta_h(z);
6277            let bh_z = eta_bh(z);
6278            (-(2.0 * eta * bh_z * bh_z)
6279                + (eta * eta - 1.0) * (eta_bb(z) * h_z * h_z + 4.0 * bh_z * b_z * h_z)
6280                + (-eta * eta * eta + 3.0 * eta) * b_z * b_z * h_z * h_z)
6281                * (-cell.q(z)).exp()
6282                * INV_TWO_PI
6283        });
6284        let numeric_bhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6285            let eta = cell.eta(z);
6286            let h_z = eta_h(z);
6287            (-(eta * (3.0 * eta_bh(z) * h_z * h_z))
6288                + (eta * eta - 1.0) * (3.0 * eta_bh(z) * h_z * h_z)
6289                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * h_z * h_z)
6290                * (-cell.q(z)).exp()
6291                * INV_TWO_PI
6292        });
6293        let numeric_hhhh = simpson_integral(cell.left, cell.right, 5000, |z| {
6294            let eta = cell.eta(z);
6295            let h_z = eta_h(z);
6296            ((-eta * eta * eta + 3.0 * eta) * h_z * h_z * h_z * h_z)
6297                * (-cell.q(z)).exp()
6298                * INV_TWO_PI
6299        });
6300
6301        assert!((exact_h - numeric_h).abs() < 1e-8);
6302        assert!((exact_ah - numeric_ah).abs() < 1e-7);
6303        assert!((exact_bh - numeric_bh).abs() < 1e-7);
6304        assert!((exact_hh - numeric_hh).abs() < 1e-7);
6305        assert!((exact_abh - numeric_abh).abs() < 2e-6);
6306        assert!((exact_bbh - numeric_bbh).abs() < 2e-6);
6307        assert!((exact_bhh - numeric_bhh).abs() < 2e-6);
6308        assert!((exact_hhh - numeric_hhh).abs() < 2e-6);
6309        assert!((exact_bbbh - numeric_bbbh).abs() < 3e-6);
6310        assert!((exact_aahh - numeric_aahh).abs() < 3e-6);
6311        assert!((exact_abhh - numeric_abhh).abs() < 3e-6);
6312        assert!((exact_bbhh - numeric_bbhh).abs() < 3e-6);
6313        assert!((exact_bhhh - numeric_bhhh).abs() < 3e-6);
6314        assert!((exact_hhhh - numeric_hhhh).abs() < 3e-6);
6315    }
6316
6317    #[test]
6318    fn cross_basis_cell_derivatives_match_exact_integrands() {
6319        let score_span = LocalSpanCubic {
6320            left: -0.75,
6321            right: 0.25,
6322            c0: 0.08,
6323            c1: -0.03,
6324            c2: 0.02,
6325            c3: -0.01,
6326        };
6327        let score_basis_span = LocalSpanCubic {
6328            left: -0.75,
6329            right: 0.25,
6330            c0: -0.04,
6331            c1: 0.06,
6332            c2: -0.01,
6333            c3: 0.02,
6334        };
6335        let link_span = LocalSpanCubic {
6336            left: -0.6,
6337            right: 0.9,
6338            c0: -0.05,
6339            c1: 0.04,
6340            c2: -0.02,
6341            c3: 0.015,
6342        };
6343        let link_basis_span = LocalSpanCubic {
6344            left: -0.6,
6345            right: 0.9,
6346            c0: 0.02,
6347            c1: -0.01,
6348            c2: 0.03,
6349            c3: -0.02,
6350        };
6351        let a = 0.3;
6352        let b = -0.7;
6353        let coeffs = denested_cell_coefficients(score_span, link_span, a, b);
6354        let cell = DenestedCubicCell {
6355            left: score_span.left,
6356            right: score_span.right,
6357            c0: coeffs[0],
6358            c1: coeffs[1],
6359            c2: coeffs[2],
6360            c3: coeffs[3],
6361        };
6362        let state = evaluate_cell_moments(cell, 24).expect("cell moments");
6363        let (dc_da, dc_db) = denested_cell_coefficient_partials(score_span, link_span, a, b);
6364        let (dc_daa, dc_dab, _) = denested_cell_second_partials(score_span, link_span, a, b);
6365
6366        let coeff_h = score_basis_cell_coefficients(score_basis_span, b);
6367        let coeff_bh = score_basis_cell_coefficients(score_basis_span, 1.0);
6368        let coeff_w = link_basis_cell_coefficients(link_basis_span, a, b);
6369        let (coeff_aw, coeff_bw) = link_basis_cell_coefficient_partials(link_basis_span, a, b);
6370        let (coeff_aaw, coeff_abw, _) = link_basis_cell_second_partials(link_basis_span, a, b);
6371        let zero = [0.0; 4];
6372
6373        let eta_a = |z: f64| 1.0 + link_span.first_derivative(a + b * z);
6374        let eta_b = |z: f64| z + score_span.evaluate(z) + z * link_span.first_derivative(a + b * z);
6375        let eta_h = |z: f64| b * score_basis_span.evaluate(z);
6376        let eta_bh = |z: f64| score_basis_span.evaluate(z);
6377        let eta_w = |z: f64| link_basis_span.evaluate(a + b * z);
6378        let eta_ab = |z: f64| z * link_span.second_derivative(a + b * z);
6379        let eta_aw = |z: f64| link_basis_span.first_derivative(a + b * z);
6380        let eta_bw = |z: f64| z * link_basis_span.first_derivative(a + b * z);
6381
6382        let exact_hw =
6383            cell_second_derivative_from_moments(cell, &coeff_h, &coeff_w, &zero, &state.moments)
6384                .expect("hw");
6385        let exact_ahw = cell_third_derivative_from_moments(
6386            cell,
6387            &dc_da,
6388            &coeff_h,
6389            &coeff_w,
6390            &zero,
6391            &coeff_aw,
6392            &zero,
6393            &zero,
6394            &state.moments,
6395        )
6396        .expect("ahw");
6397        let exact_bhw = cell_third_derivative_from_moments(
6398            cell,
6399            &dc_db,
6400            &coeff_h,
6401            &coeff_w,
6402            &coeff_bh,
6403            &coeff_bw,
6404            &zero,
6405            &zero,
6406            &state.moments,
6407        )
6408        .expect("bhw");
6409        let exact_hhw = cell_third_derivative_from_moments(
6410            cell,
6411            &coeff_h,
6412            &coeff_h,
6413            &coeff_w,
6414            &zero,
6415            &zero,
6416            &zero,
6417            &zero,
6418            &state.moments,
6419        )
6420        .expect("hhw");
6421        let exact_hww = cell_third_derivative_from_moments(
6422            cell,
6423            &coeff_h,
6424            &coeff_w,
6425            &coeff_w,
6426            &zero,
6427            &zero,
6428            &zero,
6429            &zero,
6430            &state.moments,
6431        )
6432        .expect("hww");
6433        let exact_aahw = cell_fourth_derivative_from_moments(
6434            cell,
6435            &dc_da,
6436            &dc_da,
6437            &coeff_h,
6438            &coeff_w,
6439            &dc_daa,
6440            &zero,
6441            &coeff_aw,
6442            &zero,
6443            &coeff_aw,
6444            &zero,
6445            &zero,
6446            &coeff_aaw,
6447            &zero,
6448            &zero,
6449            &zero,
6450            &state.moments,
6451        )
6452        .expect("aahw");
6453        let exact_hhww = cell_fourth_derivative_from_moments(
6454            cell,
6455            &coeff_h,
6456            &coeff_h,
6457            &coeff_w,
6458            &coeff_w,
6459            &zero,
6460            &zero,
6461            &zero,
6462            &zero,
6463            &zero,
6464            &zero,
6465            &zero,
6466            &zero,
6467            &zero,
6468            &zero,
6469            &zero,
6470            &state.moments,
6471        )
6472        .expect("hhww");
6473        let exact_hhhw = cell_fourth_derivative_from_moments(
6474            cell,
6475            &coeff_h,
6476            &coeff_h,
6477            &coeff_h,
6478            &coeff_w,
6479            &zero,
6480            &zero,
6481            &zero,
6482            &zero,
6483            &zero,
6484            &zero,
6485            &zero,
6486            &zero,
6487            &zero,
6488            &zero,
6489            &zero,
6490            &state.moments,
6491        )
6492        .expect("hhhw");
6493        let exact_abhw = cell_fourth_derivative_from_moments(
6494            cell,
6495            &dc_da,
6496            &dc_db,
6497            &coeff_h,
6498            &coeff_w,
6499            &dc_dab,
6500            &zero,
6501            &coeff_aw,
6502            &coeff_bh,
6503            &coeff_bw,
6504            &zero,
6505            &zero,
6506            &coeff_abw,
6507            &zero,
6508            &zero,
6509            &zero,
6510            &state.moments,
6511        )
6512        .expect("abhw");
6513        let exact_ahww = cell_fourth_derivative_from_moments(
6514            cell,
6515            &dc_da,
6516            &coeff_h,
6517            &coeff_w,
6518            &coeff_w,
6519            &zero,
6520            &coeff_aw,
6521            &coeff_aw,
6522            &zero,
6523            &zero,
6524            &zero,
6525            &zero,
6526            &zero,
6527            &zero,
6528            &zero,
6529            &zero,
6530            &state.moments,
6531        )
6532        .expect("ahww");
6533        let exact_bhww = cell_fourth_derivative_from_moments(
6534            cell,
6535            &dc_db,
6536            &coeff_h,
6537            &coeff_w,
6538            &coeff_w,
6539            &coeff_bh,
6540            &coeff_bw,
6541            &coeff_bw,
6542            &zero,
6543            &zero,
6544            &zero,
6545            &zero,
6546            &zero,
6547            &zero,
6548            &zero,
6549            &zero,
6550            &state.moments,
6551        )
6552        .expect("bhww");
6553        let exact_hwww = cell_fourth_derivative_from_moments(
6554            cell,
6555            &coeff_h,
6556            &coeff_w,
6557            &coeff_w,
6558            &coeff_w,
6559            &zero,
6560            &zero,
6561            &zero,
6562            &zero,
6563            &zero,
6564            &zero,
6565            &zero,
6566            &zero,
6567            &zero,
6568            &zero,
6569            &zero,
6570            &state.moments,
6571        )
6572        .expect("hwww");
6573
6574        let numeric_hw = simpson_integral(cell.left, cell.right, 5000, |z| {
6575            (-cell.eta(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6576        });
6577        let numeric_ahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6578            let eta = cell.eta(z);
6579            (-(eta * eta_aw(z) * eta_h(z)) + (eta * eta - 1.0) * eta_a(z) * eta_h(z) * eta_w(z))
6580                * (-cell.q(z)).exp()
6581                * INV_TWO_PI
6582        });
6583        let numeric_bhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6584            let eta = cell.eta(z);
6585            (-(eta * (eta_bh(z) * eta_w(z) + eta_bw(z) * eta_h(z)))
6586                + (eta * eta - 1.0) * eta_b(z) * eta_h(z) * eta_w(z))
6587                * (-cell.q(z)).exp()
6588                * INV_TWO_PI
6589        });
6590        let numeric_hhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6591            let eta = cell.eta(z);
6592            ((eta * eta - 1.0) * eta_h(z) * eta_h(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6593        });
6594        let numeric_hww = simpson_integral(cell.left, cell.right, 5000, |z| {
6595            let eta = cell.eta(z);
6596            ((eta * eta - 1.0) * eta_h(z) * eta_w(z) * eta_w(z)) * (-cell.q(z)).exp() * INV_TWO_PI
6597        });
6598        let numeric_aahw = simpson_integral(cell.left, cell.right, 5000, |z| {
6599            let eta = cell.eta(z);
6600            (-(eta * polynomial_value(&coeff_aaw, z) * eta_h(z))
6601                + (eta * eta - 1.0)
6602                    * (polynomial_value(&dc_daa, z) * eta_h(z) * eta_w(z)
6603                        + 2.0 * eta_aw(z) * eta_a(z) * eta_h(z))
6604                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_a(z) * eta_h(z) * eta_w(z))
6605                * (-cell.q(z)).exp()
6606                * INV_TWO_PI
6607        });
6608        let numeric_hhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6609            let eta = cell.eta(z);
6610            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_w(z) * eta_w(z))
6611                * (-cell.q(z)).exp()
6612                * INV_TWO_PI
6613        });
6614        let numeric_hhhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6615            let eta = cell.eta(z);
6616            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_h(z) * eta_h(z) * eta_w(z))
6617                * (-cell.q(z)).exp()
6618                * INV_TWO_PI
6619        });
6620        let numeric_abhw = simpson_integral(cell.left, cell.right, 5000, |z| {
6621            let eta = cell.eta(z);
6622            (-(eta * polynomial_value(&coeff_abw, z) * eta_h(z) + eta * eta_aw(z) * eta_bh(z))
6623                + (eta * eta - 1.0)
6624                    * (eta_ab(z) * eta_h(z) * eta_w(z)
6625                        + eta_aw(z) * eta_b(z) * eta_h(z)
6626                        + eta_bh(z) * eta_a(z) * eta_w(z)
6627                        + eta_bw(z) * eta_a(z) * eta_h(z))
6628                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_b(z) * eta_h(z) * eta_w(z))
6629                * (-cell.q(z)).exp()
6630                * INV_TWO_PI
6631        });
6632        let numeric_ahww = simpson_integral(cell.left, cell.right, 5000, |z| {
6633            let eta = cell.eta(z);
6634            (2.0 * (eta * eta - 1.0) * eta_aw(z) * eta_h(z) * eta_w(z)
6635                + (-eta * eta * eta + 3.0 * eta) * eta_a(z) * eta_h(z) * eta_w(z) * eta_w(z))
6636                * (-cell.q(z)).exp()
6637                * INV_TWO_PI
6638        });
6639        let numeric_bhww = simpson_integral(cell.left, cell.right, 5000, |z| {
6640            let eta = cell.eta(z);
6641            let h_z = eta_h(z);
6642            let w_z = eta_w(z);
6643            ((eta * eta - 1.0) * (eta_bh(z) * w_z * w_z + 2.0 * eta_bw(z) * h_z * w_z)
6644                + (-eta * eta * eta + 3.0 * eta) * eta_b(z) * h_z * w_z * w_z)
6645                * (-cell.q(z)).exp()
6646                * INV_TWO_PI
6647        });
6648        let numeric_hwww = simpson_integral(cell.left, cell.right, 5000, |z| {
6649            let eta = cell.eta(z);
6650            ((-eta * eta * eta + 3.0 * eta) * eta_h(z) * eta_w(z) * eta_w(z) * eta_w(z))
6651                * (-cell.q(z)).exp()
6652                * INV_TWO_PI
6653        });
6654
6655        assert!((exact_hw - numeric_hw).abs() < 1e-7);
6656        assert!((exact_ahw - numeric_ahw).abs() < 2e-6);
6657        assert!((exact_bhw - numeric_bhw).abs() < 2e-6);
6658        assert!((exact_hhw - numeric_hhw).abs() < 2e-6);
6659        assert!((exact_hww - numeric_hww).abs() < 2e-6);
6660        assert!((exact_aahw - numeric_aahw).abs() < 3e-6);
6661        assert!((exact_hhww - numeric_hhww).abs() < 3e-6);
6662        assert!((exact_hhhw - numeric_hhhw).abs() < 3e-6);
6663        assert!((exact_abhw - numeric_abhw).abs() < 3e-6);
6664        assert!((exact_ahww - numeric_ahww).abs() < 3e-6);
6665        assert!((exact_bhww - numeric_bhww).abs() < 3e-6);
6666        assert!((exact_hwww - numeric_hwww).abs() < 3e-6);
6667    }
6668
6669    #[test]
6670    fn cell_moment_scratch_reuses_buffers_under_margslope_like_pressure() {
6671        let cells = [
6672            DenestedCubicCell {
6673                left: -1.2,
6674                right: -0.35,
6675                c0: 0.18,
6676                c1: 0.72,
6677                c2: -0.045,
6678                c3: 0.018,
6679            },
6680            DenestedCubicCell {
6681                left: -0.35,
6682                right: 0.48,
6683                c0: -0.08,
6684                c1: 0.91,
6685                c2: 0.038,
6686                c3: -0.014,
6687            },
6688            DenestedCubicCell {
6689                left: 0.48,
6690                right: 1.4,
6691                c0: 0.11,
6692                c1: 0.83,
6693                c2: 0.022,
6694                c3: 0.012,
6695            },
6696        ];
6697        let mut scratch = CellMomentScratch::with_capacity(MAX_AFFINE_ANCHOR_DEGREE);
6698        for cell in cells {
6699            let baseline = evaluate_cell_moments(cell, 9).expect("baseline moments");
6700            let scratch_state =
6701                evaluate_cell_moments_with_scratch(cell, 9, &mut scratch).expect("scratch moments");
6702            assert_eq!(baseline.branch, scratch_state.branch);
6703            assert!((baseline.value - scratch_state.value).abs() <= 1e-10);
6704            assert_eq!(baseline.moments.len(), scratch_state.moments.len());
6705            for (lhs, rhs) in baseline.moments.iter().zip(scratch_state.moments.iter()) {
6706                assert!((lhs - rhs).abs() <= 1e-10, "{lhs} vs {rhs}");
6707            }
6708        }
6709
6710        reset_cell_moment_test_reallocs();
6711        let mut checksum = 0.0;
6712        for i in 0..5_000 {
6713            let cell = cells[i % cells.len()];
6714            let state = evaluate_cell_moments_with_scratch(cell, 9, &mut scratch)
6715                .expect("scratch moments under repeated pressure");
6716            checksum += state.value + state.moments[0] * 1e-12;
6717        }
6718        assert!(checksum.is_finite());
6719        assert_eq!(
6720            cell_moment_test_reallocs(),
6721            0,
6722            "scratch-backed inner cell-moment calls should not grow Vec buffers"
6723        );
6724    }
6725
6726    #[test]
6727    fn evaluate_cell_moments_matches_numeric_integrals() {
6728        let cell = DenestedCubicCell {
6729            left: -0.9,
6730            right: 0.8,
6731            c0: 0.15,
6732            c1: -0.35,
6733            c2: 0.11,
6734            c3: -0.07,
6735        };
6736        let state = evaluate_cell_moments(cell, 6).expect("cell moments");
6737        let value_numeric = simpson_integral(cell.left, cell.right, 4000, |z| {
6738            super::normal_cdf(cell.eta(z)) * normal_pdf(z)
6739        });
6740        assert!((state.value - value_numeric).abs() < 1e-9);
6741        for degree in 0..=6 {
6742            let target = simpson_integral(cell.left, cell.right, 4000, |z| {
6743                z.powi(degree as i32) * (-cell.q(z)).exp()
6744            });
6745            assert!((state.moments[degree] - target).abs() < 1e-9);
6746        }
6747    }
6748
6749    #[test]
6750    fn partition_builder_moves_link_preimages_with_intercept() {
6751        let score_breaks = [-2.0, -1.0, 0.0, 1.0, 2.0];
6752        let link_breaks = [-1.5, -0.5, 0.5, 1.5];
6753        let score_span = |z: f64| {
6754            let left = if z < -1.0 {
6755                -2.0
6756            } else if z < 0.0 {
6757                -1.0
6758            } else if z < 1.0 {
6759                0.0
6760            } else {
6761                1.0
6762            };
6763            Ok(LocalSpanCubic {
6764                left,
6765                right: left + 1.0,
6766                c0: 0.1,
6767                c1: 0.2,
6768                c2: 0.0,
6769                c3: 0.0,
6770            })
6771        };
6772        let link_span = |u: f64| {
6773            let left = if u < -0.5 {
6774                -1.5
6775            } else if u < 0.5 {
6776                -0.5
6777            } else {
6778                0.5
6779            };
6780            Ok(LocalSpanCubic {
6781                left,
6782                right: left + 1.0,
6783                c0: -0.05,
6784                c1: 0.1,
6785                c2: 0.0,
6786                c3: 0.0,
6787            })
6788        };
6789        let cells_a0 = build_denested_partition_cells(
6790            0.25,
6791            0.9,
6792            &score_breaks,
6793            &link_breaks,
6794            score_span,
6795            link_span,
6796        )
6797        .expect("cells a0");
6798        let cells_a1 = build_denested_partition_cells(
6799            0.55,
6800            0.9,
6801            &score_breaks,
6802            &link_breaks,
6803            score_span,
6804            link_span,
6805        )
6806        .expect("cells a1");
6807        assert!(cells_a0.len() >= score_breaks.len() - 1);
6808        assert!(
6809            cells_a0
6810                .windows(2)
6811                .all(|w| (w[0].cell.right - w[1].cell.left).abs() <= 1e-12)
6812        );
6813        assert!(
6814            cells_a0
6815                .iter()
6816                .zip(cells_a1.iter())
6817                .any(|(lhs, rhs)| (lhs.cell.left - rhs.cell.left).abs() > 1e-10)
6818        );
6819        assert!(cells_a0.first().unwrap().cell.left.is_infinite());
6820        assert!(cells_a0.last().unwrap().cell.right.is_infinite());
6821    }
6822
6823    #[test]
6824    fn partition_builder_without_breaks_returns_single_global_cell() {
6825        let cells = build_denested_partition_cells_with_tails(
6826            0.3,
6827            -0.4,
6828            &[],
6829            &[],
6830            |z| {
6831                if z.is_nan() {
6832                    return Err("probe z is NaN".to_string());
6833                }
6834                Ok(LocalSpanCubic {
6835                    left: 0.0,
6836                    right: 1.0,
6837                    c0: 0.0,
6838                    c1: 0.0,
6839                    c2: 0.0,
6840                    c3: 0.0,
6841                })
6842            },
6843            |u| {
6844                if u.is_nan() {
6845                    return Err("probe u is NaN".to_string());
6846                }
6847                Ok(LocalSpanCubic {
6848                    left: 0.0,
6849                    right: 1.0,
6850                    c0: 0.0,
6851                    c1: 0.0,
6852                    c2: 0.0,
6853                    c3: 0.0,
6854                })
6855            },
6856        )
6857        .expect("global cell");
6858        assert_eq!(cells.len(), 1);
6859        assert_eq!(cells[0].cell.left, f64::NEG_INFINITY);
6860        assert_eq!(cells[0].cell.right, f64::INFINITY);
6861        assert!(cells[0].cell.c2.abs() < 1e-12);
6862        assert!(cells[0].cell.c3.abs() < 1e-12);
6863    }
6864
6865    #[test]
6866    fn polynomial_integral_helper_matches_moment_sum() {
6867        let cell = DenestedCubicCell {
6868            left: -1.5,
6869            right: 1.25,
6870            c0: 0.2,
6871            c1: -0.4,
6872            c2: 0.15,
6873            c3: 0.03,
6874        };
6875        let state = evaluate_cell_moments(cell, 8).expect("cell moments");
6876        let coeffs = [1.5, -0.25, 0.75, 0.1];
6877        let expected = INV_TWO_PI
6878            * coeffs
6879                .iter()
6880                .enumerate()
6881                .map(|(idx, coeff)| coeff * state.moments[idx])
6882                .sum::<f64>();
6883        let got = cell_polynomial_integral_from_moments(&coeffs, &state.moments, "test poly")
6884            .expect("poly integral");
6885        assert!((got - expected).abs() < 1e-14);
6886    }
6887
6888    #[test]
6889    fn batched_cell_moment_max_degree_matches_direct_non_affine_grid() {
6890        let cells = [
6891            DenestedCubicCell {
6892                left: -2.0,
6893                right: -0.25,
6894                c0: -0.7,
6895                c1: 0.8,
6896                c2: 0.015,
6897                c3: -0.004,
6898            },
6899            DenestedCubicCell {
6900                left: -0.5,
6901                right: 0.75,
6902                c0: 0.2,
6903                c1: -0.35,
6904                c2: -0.025,
6905                c3: 0.0,
6906            },
6907            DenestedCubicCell {
6908                left: 0.1,
6909                right: 1.6,
6910                c0: 0.4,
6911                c1: 0.25,
6912                c2: 0.01,
6913                c3: 0.006,
6914            },
6915            DenestedCubicCell {
6916                left: -1.25,
6917                right: 2.25,
6918                c0: -0.1,
6919                c1: 0.55,
6920                c2: -0.012,
6921                c3: 0.003,
6922            },
6923        ];
6924        for cell in cells {
6925            let branch = branch_cell(cell).expect("branch");
6926            if branch == ExactCellBranch::Affine {
6927                continue;
6928            }
6929            let batched =
6930                evaluate_non_affine_cell_state(cell, branch, 21).expect("degree-21 state");
6931            for degree in [9usize, 15, 21] {
6932                let direct =
6933                    evaluate_non_affine_cell_state(cell, branch, degree).expect("direct state");
6934                assert_eq!(batched.branch, direct.branch);
6935                let denom = direct.value.abs().max(1.0);
6936                assert!(((batched.value - direct.value).abs() / denom) < 1e-10);
6937                for k in 0..=degree {
6938                    let denom = direct.moments[k].abs().max(1.0);
6939                    let rel = (batched.moments[k] - direct.moments[k]).abs() / denom;
6940                    assert!(
6941                        rel < 1e-10,
6942                        "cell={cell:?} degree={degree} moment={k} rel={rel:e}"
6943                    );
6944                }
6945            }
6946        }
6947    }
6948
6949    #[test]
6950    fn derivative_moment_evaluator_matches_value_evaluator_moments() {
6951        let cells = [
6952            DenestedCubicCell {
6953                left: -2.0,
6954                right: -0.4,
6955                c0: 0.15,
6956                c1: -0.8,
6957                c2: 0.0,
6958                c3: 0.0,
6959            },
6960            DenestedCubicCell {
6961                left: -0.75,
6962                right: 1.4,
6963                c0: -0.25,
6964                c1: 0.6,
6965                c2: 0.12,
6966                c3: 0.0,
6967            },
6968            DenestedCubicCell {
6969                left: -1.1,
6970                right: 0.9,
6971                c0: 0.35,
6972                c1: -0.3,
6973                c2: 0.05,
6974                c3: -0.015,
6975            },
6976        ];
6977        for cell in cells {
6978            for degree in [4usize, 9, 15, 21] {
6979                let full = evaluate_cell_moments_uncached(cell, degree).expect("full moments");
6980                let derivative = evaluate_cell_derivative_moments_uncached(cell, degree)
6981                    .expect("derivative moments");
6982                assert_eq!(full.branch, derivative.branch);
6983                assert_eq!(full.moments.len(), derivative.moments.len());
6984                for k in 0..full.moments.len() {
6985                    assert_eq!(full.moments[k].to_bits(), derivative.moments[k].to_bits());
6986                }
6987            }
6988        }
6989    }
6990
6991    #[test]
6992    fn cell_moment_lru_matches_uncached_non_affine_grid() {
6993        let cache = CellMomentLruCache::new(16 * 1024 * 1024);
6994        let stats = CellMomentCacheStats::default();
6995        let c0s = [-0.75, 0.0, 0.5];
6996        let c1s = [-1.2, 0.25, 1.1];
6997        let c2s = [-0.18, 0.07];
6998        let c3s = [0.0, 0.025];
6999        let bounds = [(-2.0, -0.5), (-0.25, 1.5)];
7000        let degrees = [4usize, 9, 15, 21];
7001        for &c0 in &c0s {
7002            for &c1 in &c1s {
7003                for &c2 in &c2s {
7004                    for &c3 in &c3s {
7005                        for &(left, right) in &bounds {
7006                            for &max_degree in &degrees {
7007                                let cell = DenestedCubicCell {
7008                                    left,
7009                                    right,
7010                                    c0,
7011                                    c1,
7012                                    c2,
7013                                    c3,
7014                                };
7015                                let branch = branch_cell(cell).expect("branch");
7016                                if branch == ExactCellBranch::Affine {
7017                                    continue;
7018                                }
7019                                let expected =
7020                                    evaluate_non_affine_cell_state(cell, branch, max_degree)
7021                                        .expect("uncached non-affine moments");
7022                                let got = evaluate_cell_moments_cached(
7023                                    cell,
7024                                    max_degree,
7025                                    &cache,
7026                                    Some(&stats),
7027                                )
7028                                .expect("cached moments");
7029                                assert_eq!(got.branch, expected.branch);
7030                                assert_eq!(got.moments.len(), max_degree + 1);
7031                                let denom = expected.value.abs().max(1.0);
7032                                assert!(
7033                                    ((got.value - expected.value).abs() / denom) < 1e-10,
7034                                    "value mismatch for {cell:?} degree {max_degree}: got {} expected {}",
7035                                    got.value,
7036                                    expected.value
7037                                );
7038                                for (idx, (&lhs, &rhs)) in
7039                                    got.moments.iter().zip(expected.moments.iter()).enumerate()
7040                                {
7041                                    let denom = rhs.abs().max(1.0);
7042                                    assert!(
7043                                        ((lhs - rhs).abs() / denom) < 1e-10,
7044                                        "moment {idx} mismatch for {cell:?} degree {max_degree}: got {lhs} expected {rhs}"
7045                                    );
7046                                }
7047                                let warm = evaluate_cell_moments_cached(
7048                                    cell,
7049                                    max_degree,
7050                                    &cache,
7051                                    Some(&stats),
7052                                )
7053                                .expect("warm cached moments");
7054                                assert_eq!(warm, got);
7055                            }
7056                        }
7057                    }
7058                }
7059            }
7060        }
7061        let (hits, misses) = stats.snapshot();
7062        assert!(hits > 0, "expected warm LRU hits");
7063        assert!(misses > 0, "expected cold LRU misses");
7064    }
7065
7066    #[test]
7067    fn cell_moment_fingerprint_exact_cache_matches_current_evaluator() {
7068        let cells = [
7069            DenestedCubicCell {
7070                left: -1.75,
7071                right: -0.25,
7072                c0: 0.15,
7073                c1: -0.35,
7074                c2: 0.08,
7075                c3: -0.015,
7076            },
7077            DenestedCubicCell {
7078                left: -0.5,
7079                right: 0.8,
7080                c0: -0.2,
7081                c1: 0.45,
7082                c2: -0.12,
7083                c3: 0.025,
7084            },
7085            DenestedCubicCell {
7086                left: 0.1,
7087                right: 1.6,
7088                c0: 0.05,
7089                c1: 0.2,
7090                c2: 0.03,
7091                c3: 0.004,
7092            },
7093        ];
7094        let mut cache = std::collections::HashMap::new();
7095        for max_degree in [0usize, 3, 4, 9, 16] {
7096            for cell in cells {
7097                let baseline = evaluate_cell_moments(cell, max_degree).expect("baseline moments");
7098                let key = cell_moment_cache_key(cell, max_degree, 0.0);
7099                let cached = cache.entry(key).or_insert_with(|| {
7100                    evaluate_cell_moments(cell, max_degree).expect("cached moments")
7101                });
7102                assert_eq!(baseline.branch, cached.branch);
7103                assert_eq!(baseline.value.to_bits(), cached.value.to_bits());
7104                assert_eq!(baseline.moments.len(), cached.moments.len());
7105                for (lhs, rhs) in baseline.moments.iter().zip(cached.moments.iter()) {
7106                    assert_eq!(lhs.to_bits(), rhs.to_bits());
7107                }
7108            }
7109        }
7110    }
7111
7112    #[test]
7113    fn fuzzy_cell_moment_fingerprint_error_scales_with_epsilon() {
7114        for epsilon in [1e-8, 1e-6] {
7115            let base = DenestedCubicCell {
7116                left: -1.25,
7117                right: 1.1,
7118                c0: 0.1,
7119                c1: -0.25,
7120                c2: 0.04,
7121                c3: -0.006,
7122            };
7123            let perturbed = DenestedCubicCell {
7124                left: base.left + 0.001 * epsilon,
7125                right: base.right - 0.001 * epsilon,
7126                c0: base.c0 + 0.001 * epsilon,
7127                c1: base.c1 - 0.001 * epsilon,
7128                c2: base.c2 + 0.001 * epsilon,
7129                c3: base.c3 - 0.001 * epsilon,
7130            };
7131            assert_eq!(
7132                cell_moment_cache_key(base, 9, epsilon),
7133                cell_moment_cache_key(perturbed, 9, epsilon)
7134            );
7135            let lhs = evaluate_cell_moments(base, 9).expect("base moments");
7136            let rhs = evaluate_cell_moments(perturbed, 9).expect("perturbed moments");
7137            let max_rel = lhs
7138                .moments
7139                .iter()
7140                .zip(rhs.moments.iter())
7141                .map(|(a, b)| (a - b).abs() / a.abs().max(b.abs()).max(1.0))
7142                .fold(0.0_f64, f64::max);
7143            assert!(
7144                max_rel <= 10.0 * epsilon,
7145                "epsilon={epsilon:.1e} max_rel={max_rel:.3e}"
7146            );
7147        }
7148    }
7149
    /// Locks in numerical equivalence of the optimized
    /// `evaluate_non_affine_cell_state` against an inline reference
    /// implementation that mirrors the prior pre-fold structure
    /// (separate `cell.eta(z)` / `cell.q(z)` calls; post-loop
    /// `* half_width`; trailing `value_integral * half_width / sqrt(TAU)`).
    /// Any drift larger than 1e-13 relative would indicate the hot-path
    /// rewrite changed the math.
    ///
    /// The moments returned by `evaluate_non_affine_cell_derivative_state`
    /// are compared against the same reference moments with the same
    /// tolerance. Relative errors use a denominator floored at 1.
    #[test]
    fn non_affine_cell_state_matches_prefold_reference_to_1e_minus_13() {
        // Reference: byte-for-byte the structure of the previous
        // implementation. Kept local to this test to avoid leaking a second
        // public surface. Do not "tidy" the arithmetic below: the 1e-13
        // tolerance is only meaningful against this exact operation order.
        fn reference(
            cell: DenestedCubicCell,
            branch: ExactCellBranch,
            max_degree: usize,
        ) -> CellMomentState {
            let mut moments: CellMomentVec = smallvec![0.0_f64; max_degree + 1];
            let mut value_integral = 0.0_f64;
            // Affine map from the quadrature nodes onto [cell.left, cell.right]
            // (GL_NODES / GL_WEIGHTS are presumably a Gauss-Legendre rule on
            // [-1, 1] — confirm against their definition).
            let center = 0.5 * (cell.left + cell.right);
            let half_width = 0.5 * (cell.right - cell.left);
            for (&node, &weight) in GL_NODES.iter().zip(GL_WEIGHTS.iter()) {
                let z = center + half_width * node;
                let eta = cell.eta(z);
                let moment_weight = weight * (-cell.q(z)).exp();
                // Accumulate weight * exp(-q(z)) * z^k for k = 0..=max_degree,
                // carrying the running power of z.
                let mut z_pow = 1.0_f64;
                for moment in &mut moments {
                    *moment = moment_weight.mul_add(z_pow, *moment);
                    z_pow *= z;
                }
                value_integral += weight * (-0.5 * z * z).exp() * normal_cdf(eta);
            }
            // Interval scaling is applied once, after the node loop.
            for moment in &mut moments {
                *moment *= half_width;
            }
            CellMomentState {
                branch,
                value: value_integral * half_width / (std::f64::consts::TAU).sqrt(),
                moments,
            }
        }

        // Hand-rolled inputs that cross both Quartic and Sextic branches and
        // exercise positive/negative coefficients, asymmetric intervals, and
        // a wide degree range (matches survival_marginal_slope's degree=9
        // production call as well as the bernoulli outer-step degree=24).
        let cells = [
            DenestedCubicCell {
                left: -1.25,
                right: -0.2,
                c0: -0.35,
                c1: 0.85,
                c2: 0.04,
                c3: -0.015,
            },
            DenestedCubicCell {
                left: -0.2,
                right: 0.55,
                c0: 0.12,
                c1: -0.65,
                c2: -0.025,
                c3: 0.02,
            },
            DenestedCubicCell {
                left: 0.55,
                right: 1.6,
                c0: 0.42,
                c1: 0.35,
                c2: 0.018,
                c3: 0.012,
            },
            DenestedCubicCell {
                left: -3.0,
                right: -1.0,
                c0: 1.7,
                c1: -0.4,
                c2: 0.11,
                c3: -0.07,
            },
        ];
        let degrees = [0_usize, 4, 9, 16, 24];
        for cell in cells {
            let branch = branch_cell(cell).expect("branch");
            // Guard: every cell above is meant to hit the non-affine evaluators.
            assert_ne!(branch, ExactCellBranch::Affine);
            for max_degree in degrees {
                let actual = evaluate_non_affine_cell_state(cell, branch, max_degree)
                    .expect("optimized non-affine");
                let expected = reference(cell, branch, max_degree);
                assert_eq!(actual.branch, expected.branch);
                assert_eq!(actual.moments.len(), expected.moments.len());
                let denom_v = expected.value.abs().max(1.0);
                let rel_v = (actual.value - expected.value).abs() / denom_v;
                // Copied into locals so the message below can use inline
                // format captures (which only accept plain identifiers).
                let actual_v = actual.value;
                let expected_v = expected.value;
                assert!(
                    rel_v <= 1e-13,
                    "value rel mismatch for {cell:?} degree {max_degree}: \
                     actual={actual_v:.17e} expected={expected_v:.17e} rel={rel_v:.3e}"
                );
                for (k, (lhs, rhs)) in actual
                    .moments
                    .iter()
                    .zip(expected.moments.iter())
                    .enumerate()
                {
                    let denom = rhs.abs().max(1.0);
                    let rel = (lhs - rhs).abs() / denom;
                    assert!(
                        rel <= 1e-13,
                        "moment {k} rel mismatch for {cell:?} degree {max_degree}: \
                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
                    );
                }

                // Also lock in the derivative-state path on the same
                // inputs so the (parallel) edit there can't drift.
                // NOTE(review): there is no length assertion for the
                // derivative moments; `zip` stops at the shorter sequence.
                let actual_deriv =
                    evaluate_non_affine_cell_derivative_state(cell, branch, max_degree)
                        .expect("optimized derivative");
                for (k, (lhs, rhs)) in actual_deriv
                    .moments
                    .iter()
                    .zip(expected.moments.iter())
                    .enumerate()
                {
                    let denom = rhs.abs().max(1.0);
                    let rel = (lhs - rhs).abs() / denom;
                    assert!(
                        rel <= 1e-13,
                        "deriv moment {k} rel mismatch for {cell:?} degree {max_degree}: \
                         actual={lhs:.17e} expected={rhs:.17e} rel={rel:.3e}"
                    );
                }
            }
        }
    }
7286
7287    /// DECISIVE: the third-derivative kernel must equal the FD of the
7288    /// second-derivative kernel w.r.t. a parameter that perturbs `eta`,
7289    /// RE-EVALUATING the moments at each step (the moments depend on `eta`
7290    /// via the `exp(-q)` weight). This isolates the kernel from all survival
7291    /// partition/cross machinery (gam#979 f_uv_dir localization).
7292    #[test]
7293    fn third_derivative_kernel_matches_fd_of_second_with_eta_perturbation() {
7294        // A finite, non-affine cell.
7295        let base = DenestedCubicCell {
7296            left: -0.6,
7297            right: 0.9,
7298            c0: 0.30,
7299            c1: 0.45,
7300            c2: -0.20,
7301            c3: 0.12,
7302        };
7303        // Synthetic parameter directions as cubic-in-z perturbations of eta:
7304        //   eta_u = ∂eta/∂u, eta_v = ∂eta/∂v, eta_t = ∂eta/∂t (the dir).
7305        let eta_u = [0.11_f64, -0.07, 0.05, 0.02];
7306        let eta_v = [-0.09_f64, 0.13, -0.04, 0.03];
7307        let eta_t = [0.17_f64, 0.06, -0.10, 0.04]; // the "b-like" direction
7308        // Second crosses ∂²eta/∂{·}{·} (pick small non-zero cubics).
7309        let eta_uv = [0.02_f64, 0.01, -0.015, 0.005];
7310        let eta_ut = [-0.01_f64, 0.02, 0.007, -0.003];
7311        let eta_vt = [0.015_f64, -0.008, 0.01, 0.004];
7312        // Third cross ∂³eta/∂u∂v∂t.
7313        let eta_uvt = [0.003_f64, -0.002, 0.001, 0.0005];
7314
7315        let neg = |a: &[f64; 4]| a.map(|v| -v);
7316        let max_degree = 15usize;
7317
7318        // f_uv(s) where param s shifts eta by s·(eta_t + ½ s²... ) — here we
7319        // build the cell at eta + s·eta_t + s²·eta_vt-style is NOT needed; we
7320        // only need the t-direction to first order for ∂/∂t. To FD ∂(f_uv)/∂t
7321        // we perturb eta along eta_t AND carry the s-dependence of the u,v
7322        // crosses: eta_u(s)=eta_u + s·eta_ut, eta_v(s)=eta_v + s·eta_vt,
7323        // eta_uv(s)=eta_uv + s·eta_uvt. The cell cubic shifts by s·eta_t.
7324        let f_uv_at = |s: f64| -> f64 {
7325            let cell_s = DenestedCubicCell {
7326                c0: base.c0 + s * eta_t[0],
7327                c1: base.c1 + s * eta_t[1],
7328                c2: base.c2 + s * eta_t[2],
7329                c3: base.c3 + s * eta_t[3],
7330                ..base
7331            };
7332            // Moments MUST be recomputed at the perturbed eta.
7333            let st = evaluate_cell_moments(cell_s, max_degree).unwrap();
7334            let neg_cell = DenestedCubicCell {
7335                c0: -cell_s.c0,
7336                c1: -cell_s.c1,
7337                c2: -cell_s.c2,
7338                c3: -cell_s.c3,
7339                ..cell_s
7340            };
7341            let u_s = [
7342                eta_u[0] + s * eta_ut[0],
7343                eta_u[1] + s * eta_ut[1],
7344                eta_u[2] + s * eta_ut[2],
7345                eta_u[3] + s * eta_ut[3],
7346            ];
7347            let v_s = [
7348                eta_v[0] + s * eta_vt[0],
7349                eta_v[1] + s * eta_vt[1],
7350                eta_v[2] + s * eta_vt[2],
7351                eta_v[3] + s * eta_vt[3],
7352            ];
7353            let uv_s = [
7354                eta_uv[0] + s * eta_uvt[0],
7355                eta_uv[1] + s * eta_uvt[1],
7356                eta_uv[2] + s * eta_uvt[2],
7357                eta_uv[3] + s * eta_uvt[3],
7358            ];
7359            cell_second_derivative_from_moments(
7360                neg_cell,
7361                &neg(&u_s),
7362                &neg(&v_s),
7363                &neg(&uv_s),
7364                &st.moments,
7365            )
7366            .unwrap()
7367        };
7368
7369        let h = 1e-5;
7370        let fd = (f_uv_at(h) - f_uv_at(-h)) / (2.0 * h);
7371
7372        // Analytic third via the kernel (negated cell + negated crosses, as the
7373        // survival path does).
7374        let st0 = evaluate_cell_moments(base, max_degree).unwrap();
7375        let neg_cell0 = DenestedCubicCell {
7376            c0: -base.c0,
7377            c1: -base.c1,
7378            c2: -base.c2,
7379            c3: -base.c3,
7380            ..base
7381        };
7382        let analytic = cell_third_derivative_from_moments(
7383            neg_cell0,
7384            &neg(&eta_u),
7385            &neg(&eta_v),
7386            &neg(&eta_t),
7387            &neg(&eta_uv),
7388            &neg(&eta_ut),
7389            &neg(&eta_vt),
7390            &neg(&eta_uvt),
7391            &st0.moments,
7392        )
7393        .unwrap();
7394
7395        let denom = fd.abs().max(1e-3);
7396        let rel = (analytic - fd).abs() / denom;
7397        assert!(
7398            rel <= 1e-5,
7399            "third kernel vs FD-of-second mismatch: analytic={analytic:.12e} fd={fd:.12e} rel={rel:.3e}"
7400        );
7401    }
7402
7403    #[test]
7404    fn moving_shared_edge_second_integral_derivative_has_leibniz_jump_sign() {
7405        let edge0 = 0.2_f64;
7406        let edge_velocity = -0.37_f64;
7407
7408        let left_eta = [0.22_f64, -0.18, 0.09, 0.03];
7409        let right_eta = [-0.11_f64, 0.26, -0.04, 0.02];
7410        let left_r = [0.08_f64, -0.05, 0.03, 0.01];
7411        let left_s = [-0.06_f64, 0.04, 0.02, -0.015];
7412        let left_rs = [0.025_f64, -0.012, 0.006, 0.004];
7413        let right_r = [-0.03_f64, 0.07, -0.02, 0.012];
7414        let right_s = [0.05_f64, -0.025, 0.018, 0.007];
7415        let right_rs = [-0.018_f64, 0.014, -0.005, 0.003];
7416
7417        let integral_at = |shift: f64| -> f64 {
7418            let edge = edge0 + edge_velocity * shift;
7419            let left = DenestedCubicCell {
7420                left: -0.7,
7421                right: edge,
7422                c0: left_eta[0],
7423                c1: left_eta[1],
7424                c2: left_eta[2],
7425                c3: left_eta[3],
7426            };
7427            let right = DenestedCubicCell {
7428                left: edge,
7429                right: 1.1,
7430                c0: right_eta[0],
7431                c1: right_eta[1],
7432                c2: right_eta[2],
7433                c3: right_eta[3],
7434            };
7435            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7436            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7437            cell_second_derivative_from_moments(
7438                left,
7439                &left_r,
7440                &left_s,
7441                &left_rs,
7442                &left_state.moments,
7443            )
7444            .expect("left second")
7445                + cell_second_derivative_from_moments(
7446                    right,
7447                    &right_r,
7448                    &right_s,
7449                    &right_rs,
7450                    &right_state.moments,
7451                )
7452                .expect("right second")
7453        };
7454
7455        let h = 1e-5;
7456        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7457
7458        let left = DenestedCubicCell {
7459            left: -0.7,
7460            right: edge0,
7461            c0: left_eta[0],
7462            c1: left_eta[1],
7463            c2: left_eta[2],
7464            c3: left_eta[3],
7465        };
7466        let right = DenestedCubicCell {
7467            left: edge0,
7468            right: 1.1,
7469            c0: right_eta[0],
7470            c1: right_eta[1],
7471            c2: right_eta[2],
7472            c3: right_eta[3],
7473        };
7474        let f_left =
7475            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7476        let f_right =
7477            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7478        let analytic = edge_velocity * (f_left - f_right);
7479
7480        let denom = analytic.abs().max(1e-8);
7481        let rel = (fd - analytic).abs() / denom;
7482        assert!(
7483            rel <= 5e-8,
7484            "moving edge sign mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7485        );
7486    }
7487
7488    #[test]
7489    fn moving_shared_edge_second_integral_mixed_derivative_has_full_leibniz_terms() {
7490        let edge0 = -0.15_f64;
7491        let edge_d1 = 0.31_f64;
7492        let edge_d2 = -0.27_f64;
7493        let edge_d12 = 0.19_f64;
7494
7495        let left_eta = [0.16_f64, -0.21, 0.07, -0.025];
7496        let right_eta = [-0.09_f64, 0.18, -0.055, 0.018];
7497        let left_r = [0.075_f64, -0.045, 0.018, 0.009];
7498        let left_s = [-0.052_f64, 0.033, 0.014, -0.011];
7499        let left_rs = [0.021_f64, -0.009, 0.005, 0.0025];
7500        let right_r = [-0.028_f64, 0.063, -0.017, 0.010];
7501        let right_s = [0.047_f64, -0.023, 0.016, 0.006];
7502        let right_rs = [-0.015_f64, 0.012, -0.004, 0.002];
7503
7504        let integral_at = |s1: f64, s2: f64| -> f64 {
7505            let edge = edge0 + edge_d1 * s1 + edge_d2 * s2 + edge_d12 * s1 * s2;
7506            let left = DenestedCubicCell {
7507                left: -0.8,
7508                right: edge,
7509                c0: left_eta[0],
7510                c1: left_eta[1],
7511                c2: left_eta[2],
7512                c3: left_eta[3],
7513            };
7514            let right = DenestedCubicCell {
7515                left: edge,
7516                right: 0.9,
7517                c0: right_eta[0],
7518                c1: right_eta[1],
7519                c2: right_eta[2],
7520                c3: right_eta[3],
7521            };
7522            let left_state = evaluate_cell_moments(left, 12).expect("left moments");
7523            let right_state = evaluate_cell_moments(right, 12).expect("right moments");
7524            cell_second_derivative_from_moments(
7525                left,
7526                &left_r,
7527                &left_s,
7528                &left_rs,
7529                &left_state.moments,
7530            )
7531            .expect("left second")
7532                + cell_second_derivative_from_moments(
7533                    right,
7534                    &right_r,
7535                    &right_s,
7536                    &right_rs,
7537                    &right_state.moments,
7538                )
7539                .expect("right second")
7540        };
7541
7542        let h = 2e-4;
7543        let fd = (integral_at(h, h) - integral_at(h, -h) - integral_at(-h, h)
7544            + integral_at(-h, -h))
7545            / (4.0 * h * h);
7546
7547        let left = DenestedCubicCell {
7548            left: -0.8,
7549            right: edge0,
7550            c0: left_eta[0],
7551            c1: left_eta[1],
7552            c2: left_eta[2],
7553            c3: left_eta[3],
7554        };
7555        let right = DenestedCubicCell {
7556            left: edge0,
7557            right: 0.9,
7558            c0: right_eta[0],
7559            c1: right_eta[1],
7560            c2: right_eta[2],
7561            c3: right_eta[3],
7562        };
7563
7564        let boundary_z_derivative =
7565            |cell: DenestedCubicCell, r: &[f64], s: &[f64], rs: &[f64]| -> f64 {
7566                let eta = cell.eta(edge0);
7567                let eta_z = cell.c1 + 2.0 * cell.c2 * edge0 + 3.0 * cell.c3 * edge0 * edge0;
7568                let cr = poly_eval_at(r, edge0);
7569                let cs = poly_eval_at(s, edge0);
7570                let crs = poly_eval_at(rs, edge0);
7571                let cr_z = r.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7572                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7573                });
7574                let cs_z = s.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7575                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7576                });
7577                let crs_z = rs.iter().enumerate().skip(1).fold(0.0, |acc, (k, val)| {
7578                    acc + (k as f64) * val * edge0.powi(k as i32 - 1)
7579                });
7580                let amp = crs - eta * cr * cs;
7581                let amp_z = crs_z - eta_z * cr * cs - eta * cr_z * cs - eta * cr * cs_z;
7582                let q_z = edge0 + eta * eta_z;
7583                (amp_z - amp * q_z) * (-cell.q(edge0)).exp() * INV_TWO_PI
7584            };
7585
7586        let f_left =
7587            cell_second_derivative_boundary_integrand(left, &left_r, &left_s, &left_rs, edge0);
7588        let f_right =
7589            cell_second_derivative_boundary_integrand(right, &right_r, &right_s, &right_rs, edge0);
7590        let fz_left = boundary_z_derivative(left, &left_r, &left_s, &left_rs);
7591        let fz_right = boundary_z_derivative(right, &right_r, &right_s, &right_rs);
7592        let analytic = edge_d12 * (f_left - f_right) + edge_d1 * edge_d2 * (fz_left - fz_right);
7593
7594        let denom = analytic.abs().max(1e-8);
7595        let rel = (fd - analytic).abs() / denom;
7596        assert!(
7597            rel <= 2e-7,
7598            "moving edge mixed term mismatch: fd={fd:.12e} analytic={analytic:.12e} rel={rel:.3e}"
7599        );
7600    }
7601
7602    // gam#1454 resolution. The reported defect ("survival flex directional
7603    // third[g,w0] wrong: candidate f_au_dir/f_aa_dir missing self-flux") posited
7604    // a MISSING third-order Leibniz self-flux at the moving link-knot crossings.
7605    // This regression establishes the two facts that, together, prove the
7606    // implicit-intercept third-order tower
7607    // (`row_primary_third_contracted_recompute*`) is CORRECT to add no such flux:
7608    //
7609    //   (1) The third-derivative integrand `F_rst` genuinely DOES jump across a
7610    //       C²-link knot — its third coefficient slice carries `c_rst ∝ 6·α₃`,
7611    //       and `α₃` (the spline's third `z`-derivative) is the one piece a C²
7612    //       cubic spline leaves discontinuous. So the jump is real and the
7613    //       `cell_third_derivative_boundary_integrand` flux formula is exact
7614    //       (verified by FD of a direct ∂/∂edge of the third-integral sum —
7615    //       a FOURTH-order scenario that pins the integrand, not the tower).
7616    //
7617    //   (2) Every boundary term in the Leibniz expansion of a THIRD derivative,
7618    //       however, evaluates an integrand of order ≤ 2 at the moving edge
7619    //       (one of the three differentiations is spent moving the boundary).
7620    //       The second-derivative integrand `F_rs` is CONTINUOUS across the same
7621    //       C² knot (its slices reach at most `α₂ + 3α₃·shift`, i.e. ½·η''(u*),
7622    //       which a C² spline keeps continuous). Hence the shared-edge flux
7623    //       `velocity·(F_rs^L − F_rs^R)` telescopes to ZERO, and the tower's
7624    //       third-order self-flux is a genuine no-op. The real residual lives in
7625    //       the interior implicit-intercept assembly, not at the boundary.
7626    #[test]
7627    fn third_order_self_flux_telescopes_but_third_integrand_jumps_at_c2_knot_1454() {
7628        let edge0 = 0.13_f64;
7629        let edge_velocity = -0.41_f64;
7630
7631        // Build η continuous to C² at edge0 but with a jump in the cubic (3rd
7632        // derivative) coefficient. Pick the left cubic freely; choose the right
7633        // cubic to match value+1st+2nd derivative at edge0, then perturb its c3.
7634        let left_eta = [0.18_f64, -0.12, 0.07, 0.04];
7635        let right_c3 = 0.04_f64 + 0.09; // α₃ jump across the knot.
7636        // Match η, η', η'' at edge0 for the right piece given its c3:
7637        //   η(z)  = c0 + c1 z + c2 z² + c3 z³
7638        //   η'(z) = c1 + 2 c2 z + 3 c3 z²
7639        //   η''(z)= 2 c2 + 6 c3 z
7640        // Solve right (c0,c1,c2) so the three values equal the left ones at edge0.
7641        let l0 = left_eta[0];
7642        let l1 = left_eta[1];
7643        let l2 = left_eta[2];
7644        let l3 = left_eta[3];
7645        let e = edge0;
7646        let eta_val = l0 + l1 * e + l2 * e * e + l3 * e * e * e;
7647        let eta_d1 = l1 + 2.0 * l2 * e + 3.0 * l3 * e * e;
7648        let eta_d2 = 2.0 * l2 + 6.0 * l3 * e;
7649        let rc2 = (eta_d2 - 6.0 * right_c3 * e) / 2.0;
7650        let rc1 = eta_d1 - 2.0 * rc2 * e - 3.0 * right_c3 * e * e;
7651        let rc0 = eta_val - rc1 * e - rc2 * e * e - right_c3 * e * e * e;
7652        let right_eta = [rc0, rc1, rc2, right_c3];
7653
7654        // Coefficient slices. The first/second slices we keep continuous at the
7655        // edge (mimicking c_r=1+η', c_rs∝η'' which a C² spline matches), so the
7656        // 2nd-order flux would cancel. The third-order slice `rst` carries the
7657        // jumping α₃ and is DIFFERENT across the edge — this is the term that
7658        // breaks cancellation.
7659        let common_r = [0.06_f64, -0.04, 0.02, 0.0];
7660        let common_s = [-0.05_f64, 0.03, 0.015, 0.0];
7661        let common_t = [0.08_f64, 0.05, -0.03, 0.0];
7662        let common_rs = [0.02_f64, -0.01, 0.005, 0.0];
7663        let common_rt = [-0.012_f64, 0.008, 0.004, 0.0];
7664        let common_st = [0.015_f64, -0.006, 0.003, 0.0];
7665        // rst ∝ 6·α₃ in the real path: left and right differ by the α₃ jump.
7666        let left_rst = [6.0 * l3, 0.0, 0.0, 0.0];
7667        let right_rst = [6.0 * right_c3, 0.0, 0.0, 0.0];
7668
7669        let max_degree = 15usize;
7670        let neg = |a: &[f64; 4]| a.map(|v| -v);
7671
7672        // The integral sum over the two cells sharing the moving edge, computed
7673        // via the fixed-domain moment reduction with the SURVIVAL/probit sign
7674        // convention (negated cell + negated coefficient slices), exactly as the
7675        // production `row_primary_third_contracted_recompute` path does.
7676        let integral_at = |shift: f64| -> f64 {
7677            let edge = edge0 + edge_velocity * shift;
7678            let left = DenestedCubicCell {
7679                left: -0.7,
7680                right: edge,
7681                c0: left_eta[0],
7682                c1: left_eta[1],
7683                c2: left_eta[2],
7684                c3: left_eta[3],
7685            };
7686            let right = DenestedCubicCell {
7687                left: edge,
7688                right: 1.0,
7689                c0: right_eta[0],
7690                c1: right_eta[1],
7691                c2: right_eta[2],
7692                c3: right_eta[3],
7693            };
7694            let lst = evaluate_cell_moments(left, max_degree).unwrap();
7695            let rst_m = evaluate_cell_moments(right, max_degree).unwrap();
7696            let neg_left = DenestedCubicCell {
7697                c0: -left.c0,
7698                c1: -left.c1,
7699                c2: -left.c2,
7700                c3: -left.c3,
7701                ..left
7702            };
7703            let neg_right = DenestedCubicCell {
7704                c0: -right.c0,
7705                c1: -right.c1,
7706                c2: -right.c2,
7707                c3: -right.c3,
7708                ..right
7709            };
7710            let li = cell_third_derivative_from_moments(
7711                neg_left,
7712                &neg(&common_r),
7713                &neg(&common_s),
7714                &neg(&common_t),
7715                &neg(&common_rs),
7716                &neg(&common_rt),
7717                &neg(&common_st),
7718                &neg(&left_rst),
7719                &lst.moments,
7720            )
7721            .unwrap();
7722            let ri = cell_third_derivative_from_moments(
7723                neg_right,
7724                &neg(&common_r),
7725                &neg(&common_s),
7726                &neg(&common_t),
7727                &neg(&common_rs),
7728                &neg(&common_rt),
7729                &neg(&common_st),
7730                &neg(&right_rst),
7731                &rst_m.moments,
7732            )
7733            .unwrap();
7734            li + ri
7735        };
7736
7737        let h = 1e-5;
7738        let fd = (integral_at(h) - integral_at(-h)) / (2.0 * h);
7739
7740        // Fixed-domain part: differentiate ONLY the integrands (domain frozen at
7741        // edge0). Its directional derivative is the analytic Leibniz flux alone,
7742        // since the integrand coefficients here are edge-independent:
7743        //   flux = velocity · ( F_rst^L(edge0) − F_rst^R(edge0) ).
7744        //
7745        // CONVENTION: the finite-difference `integral_at` above integrates the
7746        // SURVIVAL/probit sign convention — negated cell (η→−η) AND negated
7747        // coefficient slices — exactly as the production
7748        // `row_primary_third_contracted_recompute` path does. The Leibniz
7749        // boundary integrand must therefore be evaluated in that SAME negated
7750        // convention: the third-derivative integrand is ODD under the joint
7751        // (η→−η, coeff→−coeff) negation (its `rst`, `η·rs·t`, and `(η²−1)·r·s·t`
7752        // terms each flip sign an odd number of times), so evaluating the flux
7753        // with un-negated cells/coeffs yields exactly the opposite sign and the
7754        // Leibniz identity `fd = flux` fails as `fd = −flux`. (The
7755        // second-derivative sibling test `moving_shared_edge_second_integral_
7756        // derivative_has_leibniz_jump_sign` keeps BOTH sides un-negated and so
7757        // stays self-consistent; this test keeps BOTH sides negated.)
7758        let neg_eta = |eta: &[f64; 4]| [-eta[0], -eta[1], -eta[2], -eta[3]];
7759        let left_eta_neg = neg_eta(&left_eta);
7760        let right_eta_neg = neg_eta(&right_eta);
7761        let left0 = DenestedCubicCell {
7762            left: -0.7,
7763            right: edge0,
7764            c0: left_eta_neg[0],
7765            c1: left_eta_neg[1],
7766            c2: left_eta_neg[2],
7767            c3: left_eta_neg[3],
7768        };
7769        let right0 = DenestedCubicCell {
7770            left: edge0,
7771            right: 1.0,
7772            c0: right_eta_neg[0],
7773            c1: right_eta_neg[1],
7774            c2: right_eta_neg[2],
7775            c3: right_eta_neg[3],
7776        };
7777        let f_left = cell_third_derivative_boundary_integrand(
7778            left0,
7779            &neg(&common_r),
7780            &neg(&common_s),
7781            &neg(&common_t),
7782            &neg(&common_rs),
7783            &neg(&common_rt),
7784            &neg(&common_st),
7785            &neg(&left_rst),
7786            edge0,
7787        );
7788        let f_right = cell_third_derivative_boundary_integrand(
7789            right0,
7790            &neg(&common_r),
7791            &neg(&common_s),
7792            &neg(&common_t),
7793            &neg(&common_rs),
7794            &neg(&common_rt),
7795            &neg(&common_st),
7796            &neg(&right_rst),
7797            edge0,
7798        );
7799
7800        // The integrand DOES jump across this C² knot (the α₃ third-coefficient
7801        // term is the only discontinuous piece). Confirm the jump is genuine —
7802        // if it were zero the flux would be a no-op and #1454 would not exist.
7803        let jump = f_left - f_right;
7804        assert!(
7805            jump.abs() > 1e-4,
7806            "third-derivative integrand must jump across the C² knot (α₃ discontinuity); \
7807             got jump={jump:.3e}"
7808        );
7809
7810        let analytic_flux = edge_velocity * jump;
7811        let denom = fd.abs().max(1e-6);
7812        let rel = (fd - analytic_flux).abs() / denom;
7813        assert!(
7814            rel <= 1e-5,
7815            "moving-edge third-derivative flux mismatch (#1454): fd={fd:.12e} \
7816             analytic_flux={analytic_flux:.12e} rel={rel:.3e}"
7817        );
7818
7819        // ---- Fact (2): the SECOND-derivative integrand telescopes to zero. ----
7820        // A 3rd-derivative Leibniz boundary term spends one differentiation on
7821        // the moving edge and evaluates a ≤2nd-order integrand there. The
7822        // hardest such term is the slope-slope Hessian integrand `F_bb`, whose
7823        // coefficient slice is the link cubic's b-b partial
7824        //   dc_dbb(z) = [0, 0, 2(α₂ + 3 α₃·shift), 6 α₃·b]·(z⁰..z³)
7825        //             = z²·η''(u),  with u = a + b·z, shift = a − knot.
7826        // Across a C² knot α₂, α₃, and `shift` all jump, yet η''(u*) is
7827        // continuous — so the EVALUATED slice `c_bb(z*) = z*²·η''(u*)` matches on
7828        // both sides and `F_bb` is continuous. Build the two pieces' raw dc_dbb
7829        // decompositions from `link_cubic_second_partials` and confirm the
7830        // second-derivative integrand carries no jump (flux telescopes to 0).
7831        let a_row = 0.21_f64;
7832        let b_row = 1.37_f64;
7833        let knot = a_row + b_row * edge0; // u-location of the crossing.
7834        // Left/right link pieces: choose α₂,α₃ freely on the left; pick the
7835        // right piece's α₂ so η''(knot) is continuous given a jumped α₃.
7836        let left_link = LocalSpanCubic {
7837            left: knot - 0.6,
7838            right: knot + 0.6,
7839            c0: 0.0,
7840            c1: 0.0,
7841            c2: 0.08,
7842            c3: -0.05,
7843        };
7844        let right_alpha3 = -0.05_f64 + 0.11; // α₃ jump.
7845        // η''(knot) continuity:  2α₂ᴸ + 6α₃ᴸ·(knot−leftᴸ) = 2α₂ᴿ + 6α₃ᴿ·(knot−leftᴿ).
7846        let right_left_coord = knot - 0.4;
7847        let lhs = 2.0 * left_link.c2 + 6.0 * left_link.c3 * (knot - left_link.left);
7848        let right_alpha2 = (lhs - 6.0 * right_alpha3 * (knot - right_left_coord)) / 2.0;
7849        let right_link = LocalSpanCubic {
7850            left: right_left_coord,
7851            right: right_left_coord + 0.8,
7852            c0: 0.0,
7853            c1: 0.0,
7854            c2: right_alpha2,
7855            c3: right_alpha3,
7856        };
7857        let (_, _, dc_dbb_left) = link_cubic_second_partials(left_link, a_row, b_row);
7858        let (_, _, dc_dbb_right) = link_cubic_second_partials(right_link, a_row, b_row);
7859        // The per-coefficient arrays differ (α₃ jumped)...
7860        assert!(
7861            (dc_dbb_left[3] - dc_dbb_right[3]).abs() > 1e-3,
7862            "α₃ jump must make the raw dc_dbb coefficient arrays differ"
7863        );
7864        // ...but the EVALUATED second-order slice at the crossing matches, so the
7865        // F_bb boundary integrand carries no jump and the flux telescopes to 0.
7866        let c_bb_left = poly_eval_at(&dc_dbb_left, edge0);
7867        let c_bb_right = poly_eval_at(&dc_dbb_right, edge0);
7868        assert!(
7869            (c_bb_left - c_bb_right).abs() <= 1e-12,
7870            "second-derivative slope-slope integrand must be CONTINUOUS across the \
7871             C² knot (telescoping self-flux): left={c_bb_left:.15e} right={c_bb_right:.15e}"
7872        );
7873    }
7874}
gam_model_kernels/cubic_cell_kernel.rs

gam_model_kernels/
cubic_cell_kernel.rs