gam_math/
jet_scalar.rs

1//! Order-specific Taylor-jet SCALAR algebras (#932 cutover, doc §A).
2//!
3//! [`crate::jet_tower::Tower4`] carries the full value/gradient/Hessian/`t3`/`t4`
4//! tensor stack: it answers EVERY channel a [`super::row_kernel::RowKernel`]
5//! consumer can ask for, but at `K = 9` that is a ~50 KiB per-row object whose
6//! by-value copies overflowed the stack and timed out the location-scale fit —
7//! which is exactly why `row_kernel_directional_supported()` /
8//! `row_kernel_joint_hessian_supported()` still `return false`. The cutover does
9//! NOT need the dense `Tower4<9>` per row; it needs, per consumer, only the one
10//! channel that consumer serves:
11//!
12//! | consumer | channel | scalar here | K=9 size |
13//! |---|---|---|---|
14//! | inner Newton / `row_kernel` | `(v, g, H)` | [`Order2`] | 728 B |
15//! | `row_third_contracted(dir)` | `Σ_c ℓ_{abc} dir_c` | [`OneSeed`] | 1.46 KiB |
16//! | `row_fourth_contracted(u, v)` | `Σ_{cd} ℓ_{abcd} u_c v_d` | [`TwoSeed`] | 2.8 KiB |
17//!
18//! Each is built on [`Order2`] (value/grad/Hessian), which is the production
19//! [`crate::jet_tower::Tower2`] re-expressed behind a generic interface: a row
20//! loss written ONCE against [`JetScalar`] re-instantiates at whatever order /
21//! representation a consumer needs, with the contraction folded INTO the
22//! differentiation (the nilpotent ε / δ directions), so `t3` / `t4` are never
23//! materialised. The single source of truth is the same one expression — the
24//! genus of #736 cross-block drift cannot reappear because there is no separate
25//! channel to forget.
26//!
27//! # Why each scalar is exact (doc §A.1–A.3)
28//!
29//! * [`Order2`] is the order-≤2 truncation of the Leibniz / Faà di Bruno rules.
30//!   Those order-2 terms read ONLY the order-≤2 channels of their inputs (see
31//!   [`crate::jet_tower::Tower4::mul`]: `out.h[i][j]` never touches `t3`/`t4`),
32//!   so its `(v, g, H)` is BIT-IDENTICAL to a full `Tower4<K>` — and identical
33//!   to [`crate::jet_tower::Tower2`], over which it is a thin newtype.
34//! * [`OneSeed`] carries an [`Order2`] base plus one nilpotent ε (`ε² = 0`)
35//!   holding another [`Order2`]. Seeding ε with the fixed direction `u` makes the
36//!   ε-component of the Hessian channel the contracted third `Σ_c ℓ_{abc} u_c`
37//!   (the nilpotent implements `d/dτ|₀` of `ℓ_{ab}(p + τu)` exactly).
38//! * [`TwoSeed`] carries an [`Order2`] base plus ε, δ (`ε² = δ² = 0`, `εδ`
39//!   retained) — four [`Order2`] parts. Seeding ε, δ with `u, v` makes the
40//!   εδ-component of the Hessian channel the contracted fourth
41//!   `Σ_{cd} ℓ_{abcd} u_c v_d` (the single mixed `∂_σ∂_ρ|₀` term, no `σ²`/`ρ²`
42//!   contamination).
43//!
44//! # Stability discipline
45//!
46//! As in [`crate::jet_tower`], humans own primitive stability and the algebra
47//! owns combinatorics: tail-critical special functions enter ONLY as
48//! hand-certified `[f64; 5]` derivative stacks through [`JetScalar::compose_unary`]
49//! (each scalar consumes the leading entries its order needs), never by
50//! differentiating an unstable primal.
51//!
52//! # Production scalars and the test-only all-channels oracle
53//!
54//! The `JetScalar` trait below is production: it is the bound on
55//! [`crate::jet_tower::RowNllProgramGeneric::row_nll_generic`], the seam a family
56//! row loss is written against. The order-specific scalars that *consume* it —
57//! [`Order2`] (value/grad/Hessian), [`OneSeed`] (contracted third) and
58//! [`TwoSeed`] (contracted fourth) — are production: the survival location-scale
59//! `RowKernel<9>` builds its joint Hessian / directional derivatives through them
60//! (`survival::location_scale::row_kernel`), paying only the small packed scalar
61//! per row instead of the ~50 KiB dense [`crate::jet_tower::Tower4`].
62//!
63//! The [`crate::jet_tower::Tower4`] all-channels `JetScalar` impl is test-only: it
64//! is the oracle that pins the contracted scalars against the dense
65//! value/grad/Hessian/`t3`/`t4` truth, so it lives in the `#[cfg(test)]` module.
66
67/// A truncated-Taylor scalar carrying derivatives in `K` primaries.
68///
69/// All concrete scalars here ([`Order2`], [`OneSeed`], [`TwoSeed`]) and the full
70/// [`crate::jet_tower::Tower4`] implement the SAME algebra; only the carried
71/// channel set differs. A row loss written once against this interface yields a
72/// different channel set per instantiation, all exact for the channel they serve
73/// (doc §A.0).
74pub trait JetScalar<const K: usize>: Copy {
75    /// A constant: value `c`, every derivative channel zero.
76    fn constant(c: f64) -> Self;
77
78    /// The seeded variable `p_axis` at value `x`: unit first derivative in slot
79    /// `axis`, all higher channels zero. (The nilpotent / cross channels of the
80    /// directional scalars are seeded zero — callers set ε/δ directions through
81    /// the scalar-specific [`OneSeed::seed_direction`] / [`TwoSeed::seed`].)
82    fn variable(x: f64, axis: usize) -> Self;
83
84    /// The value channel `ℓ(p)`.
85    fn value(&self) -> f64;
86
87    /// Exact truncated Leibniz sum `self + o`.
88    fn add(&self, o: &Self) -> Self;
89    /// Exact truncated Leibniz difference `self − o`.
90    fn sub(&self, o: &Self) -> Self;
91    /// Exact truncated Leibniz product `self · o`.
92    fn mul(&self, o: &Self) -> Self;
93    /// Negate every channel.
94    fn neg(&self) -> Self;
95    /// Multiply every channel by a plain scalar `s`.
96    fn scale(&self, s: f64) -> Self;
97
98    /// Exact multivariate Faà di Bruno composition `f ∘ self`, given the outer
99    /// derivative stack `d = [f(u), f′(u), f″(u), f‴(u), f⁗(u)]` at
100    /// `u = self.value()`.
101    ///
102    /// This is the SAME `[f64; 5]` stack shape [`crate::jet_tower::Tower4`] and
103    /// the families' `unary_derivatives_*` helpers (built on erfcx / log_ndtr)
104    /// already produce, so those stacks plug in directly. Each scalar consumes
105    /// only the leading entries its order needs (order-2 reads `d[0..=2]`; the
106    /// directional scalars read one / two beyond their base) — the fixed-length
107    /// array makes that windowing total, no length guard required.
108    fn compose_unary(&self, d: [f64; 5]) -> Self;
109
110    /// Compose with a unary special-function whose derivative STACK is built
111    /// from the scalar base value through `stack_fn` — the generic-over-`Lane`
112    /// seam that lets a single-sourced row program instantiate at BOTH the scalar
113    /// `f64` jets and the SIMD `f64x4` batch towers from ONE expression.
114    ///
115    /// On a scalar jet this evaluates `stack_fn(self.value())` ONCE and forwards
116    /// to [`compose_unary`](Self::compose_unary), so it is BIT-IDENTICAL to the
117    /// hand-written `self.compose_unary(stack_fn(self.value()))` (default body
118    /// below). The lever is that the SAME call shape exists on
119    /// [`crate::jet_tower::Tower3Lane`] / [`crate::jet_tower::Tower4Lane`], where
120    /// the four lanes carry FOUR DISTINCT base values, so the batch
121    /// implementation re-runs `stack_fn` per lane — a thing the old
122    /// `compose_unary(stack_from(self.value()))` shape could not express on a
123    /// batch type (it has no single scalar `.value()`). Writing a row program
124    /// against this method instead of the explicit two-step is what makes it
125    /// instantiate, unchanged, at `f64x4` for the 4-rows-per-pass batch path.
126    fn compose_unary_with(&self, stack_fn: impl Fn(f64) -> [f64; 5]) -> Self {
127        self.compose_unary(stack_fn(self.value()))
128    }
129
130    /// `e^self`. Convenience for tame arguments (see module stability note).
131    fn exp(&self) -> Self {
132        let e = self.value().exp();
133        self.compose_unary([e, e, e, e, e])
134    }
135
136    /// `√self`. Caller guarantees positivity.
137    fn sqrt(&self) -> Self {
138        let u = self.value();
139        let s = u.sqrt();
140        self.compose_unary([
141            s,
142            0.5 / s,
143            -0.25 / (u * s),
144            0.375 / (u * u * s),
145            -0.9375 / (u * u * u * s),
146        ])
147    }
148
149    /// `ln(self)`. Caller guarantees positivity. Same derivative stack
150    /// [`crate::jet_tower::Tower4::ln`] uses, so any program written over both
151    /// matches term-for-term.
152    fn ln(&self) -> Self {
153        let u = self.value();
154        let r = 1.0 / u;
155        self.compose_unary([u.ln(), r, -r * r, 2.0 * r * r * r, -6.0 * r * r * r * r])
156    }
157
158    /// `1/self`.
159    fn recip(&self) -> Self {
160        let r = 1.0 / self.value();
161        let r2 = r * r;
162        self.compose_unary([r, -r2, 2.0 * r2 * r, -6.0 * r2 * r2, 24.0 * r2 * r2 * r])
163    }
164
165    /// `self^a` for real exponent `a`. Caller guarantees a positive base.
166    /// Mirrors [`crate::jet_tower::Tower4::powf`] (falling-factorial stack).
167    fn powf(&self, a: f64) -> Self {
168        let u = self.value();
169        self.compose_unary([
170            u.powf(a),
171            a * u.powf(a - 1.0),
172            a * (a - 1.0) * u.powf(a - 2.0),
173            a * (a - 1.0) * (a - 2.0) * u.powf(a - 3.0),
174            a * (a - 1.0) * (a - 2.0) * (a - 3.0) * u.powf(a - 4.0),
175        ])
176    }
177
178    /// `ln Γ(self)`. Caller guarantees a positive argument. Uses the SAME
179    /// hand-certified derivative stack [`crate::jet_tower::Tower4::ln_gamma`]
180    /// consumes ([`crate::jet_tower::ln_gamma_derivative_stack`]), so any
181    /// program written over both matches term-for-term.
182    fn ln_gamma(&self) -> Self {
183        self.compose_unary(crate::jet_tower::ln_gamma_derivative_stack(self.value()))
184    }
185
186    /// `ψ(self) = d/dx ln Γ(x)` (digamma). Caller guarantees a positive
187    /// argument. Same hand-certified stack
188    /// [`crate::jet_tower::digamma_derivative_stack`].
189    fn digamma(&self) -> Self {
190        self.compose_unary(crate::jet_tower::digamma_derivative_stack(self.value()))
191    }
192}
193
194// ── Order2<K> ergonomic operator overloads (doc §A.1) ───────────────────
195//
196// The dispersion-family row NLLs are written with `+`/`-`/`*` operators over
197// the primaries (mirroring how they read as `Tower4` expressions). These
198// delegate channel-for-channel to the inner `Tower2` arithmetic (which has
199// `Add`/`Mul`; `Sub`/`Neg` are expressed as `+ (-1)·rhs` exactly as the
200// `JetScalar::sub` / `JetScalar::neg` impls do), so an `Order2` expression is
201// bit-identical to the same `Tower4` expression's order-≤2 channels.
202
203impl<const K: usize> std::ops::Add for Order2<K> {
204    type Output = Self;
205    #[inline]
206    fn add(self, o: Self) -> Self {
207        Order2(self.0 + o.0)
208    }
209}
210
211impl<const K: usize> std::ops::Add<f64> for Order2<K> {
212    type Output = Self;
213    #[inline]
214    fn add(self, c: f64) -> Self {
215        Order2(self.0 + c)
216    }
217}
218
219impl<const K: usize> std::ops::Sub for Order2<K> {
220    type Output = Self;
221    #[inline]
222    fn sub(self, o: Self) -> Self {
223        Order2(self.0 + o.0.scale(-1.0))
224    }
225}
226
227impl<const K: usize> std::ops::Sub<f64> for Order2<K> {
228    type Output = Self;
229    #[inline]
230    fn sub(self, c: f64) -> Self {
231        Order2(self.0 + (-c))
232    }
233}
234
235impl<const K: usize> std::ops::Mul for Order2<K> {
236    type Output = Self;
237    #[inline]
238    fn mul(self, o: Self) -> Self {
239        Order2(crate::jet_tower::Tower2::mul(&self.0, &o.0))
240    }
241}
242
243impl<const K: usize> std::ops::Mul<f64> for Order2<K> {
244    type Output = Self;
245    #[inline]
246    fn mul(self, c: f64) -> Self {
247        Order2(self.0.scale(c))
248    }
249}
250
251impl<const K: usize> std::ops::Neg for Order2<K> {
252    type Output = Self;
253    #[inline]
254    fn neg(self) -> Self {
255        Order2(self.0.scale(-1.0))
256    }
257}
258
259/// Filtered Hensel lift of a SCALAR implicit state `a(θ)` defined by the
260/// constraint `F(a, θ) = 0`, evaluated in ANY [`JetScalar`] algebra `S` (doc
261/// §11, "A generic implicit-lift operator for every production scalar").
262///
263/// This is the perf-respecting alternative to lifting through a dense
264/// `Tower4<K+1>` (which carries the implicit variable as an extra dense axis):
265/// the state `a` lives directly in the consumer's own `K`-primary algebra
266/// `S` — `Order2<K>` for value/gradient/Hessian, `Tower4<K>` for the full
267/// `t3`/`t4` — never paying for an extra variable.
268///
269/// **Method.** Fixed-Jacobian Newton in the nilpotent algebra. By the
270/// filtered-lift theorem (doc §11.1), if `F_a := ∂F/∂a(a₀, θ₀)` is the primal
271/// Jacobian at the base point and `inv_fa = 1/F_a`, then the iteration
272/// `A ← A − inv_fa · F(A, θ)` raises the filtration degree of the residual by
273/// at least one per step: each step kills exactly one graded layer. Starting
274/// from `A = const(a₀)` (whose residual lies in `F¹` because `θ − θ₀ ∈ 𝔫`),
275/// `iters` equal to the algebra's nilpotency order returns the *exact* lifted
276/// jet (`Order2`: 2, `OneSeed`: 3, `Tower4`/`TwoSeed`: 4). The value channel of
277/// `A` never moves — `F(A, θ).value() = F(a₀, θ₀) = 0` at the certified root —
278/// so a caller may precompute every primitive's derivative stack at the fixed
279/// base index once and let the cheap polynomial composition repeat per step.
280///
281/// `f` evaluates the constraint `F(a, θ)` in `S` (capturing the seeded
282/// parameter jets `θ`); `a0` is the certified scalar root `F(a₀, θ₀) ≈ 0`.
283pub fn filtered_implicit_solve_scalar<const K: usize, S: JetScalar<K>>(
284    a0: f64,
285    inv_fa: f64,
286    iters: usize,
287    f: impl Fn(&S) -> S,
288) -> S {
289    let mut a = S::constant(a0);
290    for _ in 0..iters {
291        let residual = f(&a);
292        a = a.sub(&residual.scale(inv_fa));
293    }
294    a
295}
296
297// ── Order2<K>: value / gradient / Hessian (doc §A.1) ────────────────────
298
299/// Truncated SECOND-order scalar: value `v`, gradient `g_a`, Hessian `H_{ab}`.
300///
301/// This is a thin newtype over the production [`crate::jet_tower::Tower2`], so
302/// its `(v, g, H)` channels are obtained by the SAME formulas — and are
303/// therefore bit-identical to both [`crate::jet_tower::Tower2`] and the order-≤2
304/// channels of a full [`crate::jet_tower::Tower4`] (doc §A.1, "Bit-identity with
305/// the full tower"). The wrapper exists only to satisfy the generic
306/// [`JetScalar`] interface (the `compose_unary` / `add` / `sub` / `neg` /
307/// `recip` the trait demands, which `Tower2` does not expose by that shape) —
308/// every channel is delegated to `Tower2` arithmetic unchanged.
309#[derive(Clone, Copy, Debug)]
310pub struct Order2<const K: usize>(pub crate::jet_tower::Tower2<K>);
311
312impl<const K: usize> Order2<K> {
313    /// Read the gradient channel `g_a = ∂ℓ/∂p_a`.
314    #[inline]
315    pub fn g(&self) -> [f64; K] {
316        self.0.g
317    }
318
319    /// Read the Hessian channel.
320    #[inline]
321    pub fn h(&self) -> [[f64; K]; K] {
322        self.0.h
323    }
324}
325
326impl<const K: usize> JetScalar<K> for Order2<K> {
327    fn constant(c: f64) -> Self {
328        Order2(crate::jet_tower::Tower2::constant(c))
329    }
330    fn variable(x: f64, axis: usize) -> Self {
331        Order2(crate::jet_tower::Tower2::variable(x, axis))
332    }
333    fn value(&self) -> f64 {
334        self.0.v
335    }
336    fn add(&self, o: &Self) -> Self {
337        Order2(self.0 + o.0)
338    }
339    fn sub(&self, o: &Self) -> Self {
340        // Tower2 has no Sub op; subtract by adding the negation, matching
341        // Tower4::sub (self + o.scale(-1.0)).
342        Order2(self.0 + o.0.scale(-1.0))
343    }
344    fn mul(&self, o: &Self) -> Self {
345        Order2(crate::jet_tower::Tower2::mul(&self.0, &o.0))
346    }
347    fn neg(&self) -> Self {
348        Order2(self.0.scale(-1.0))
349    }
350    fn scale(&self, s: f64) -> Self {
351        Order2(self.0.scale(s))
352    }
353    fn compose_unary(&self, d: [f64; 5]) -> Self {
354        // Order-≤2 reads only [f, f', f''] of the stack.
355        Order2(self.0.compose_unary([d[0], d[1], d[2]]))
356    }
357}
358
359// ── Lane-batched Order-2 scalar: 4 rows per pass in SIMD lanes (perf) ────
360//
361// The hot per-row jet kernels evaluate ONE row's `(v, g, H)` tower at a time in
362// scalar `f64`. A hand-written scalar derivative does the same. The throughput
363// lever a jet has that scalar hand-code cannot is **row batching in SIMD
364// lanes**: the order-≤2 Leibniz product `Order2::mul` is `O(K²)` independent
365// per-channel float ops, and EVERY row runs the identical op graph on different
366// data — the textbook SPMD shape. Packing `LANES = 4` rows into a `wide::f64x4`
367// and running the algebra once per 4 rows replaces 4 scalar passes with one
368// vector pass: the `K²` Hessian channel updates become `K²` NEON `.2d` / SSE2
369// `pd` instructions covering 4 rows each, ~4× fewer FP instructions per row.
370//
371// The carried scalar field is abstracted by [`Lane`] so the SAME algebra body
372// instantiates at `f64` (1 row, used as the bit-identity oracle) or
373// [`wide::f64x4`] (4 rows). Bit-identity is structural, not approximate:
374//
375//   * Every arithmetic op is a plain lane-wise `+` / `-` / `*` (NEVER a fused
376//     `mul_add`), and IEEE-754 double `+`/`-`/`*`/`/` are correctly rounded and
377//     deterministic, so lane `i` of an `f64x4` op equals the scalar `f64` op on
378//     that lane's inputs bit-for-bit.
379//   * The transcendental derivative STACKS (`exp`/`ln`/`sqrt`/…) are produced
380//     **per lane by the identical scalar code** ([`Lane::unary3`] unpacks, runs
381//     the same `[f64; 3]` stack closure the scalar path runs, repacks), so the
382//     only thing vectorised is the cheap rational tensor composition — the
383//     library transcendental itself is the exact same `f64::exp` call per lane.
384//   * The op order mirrors [`crate::jet_tower::Tower2`] term-for-term, so
385//     [`Order2Lane<f64, K>`] is `to_bits`-identical to the production
386//     [`Order2<K>`] (= `Tower2<K>`), and [`Order2Lane<f64x4, K>`] lane `i` is
387//     `to_bits`-identical to that — proven by the `batch_tests` oracle below
388//     (≥2000 random 4-row batches across `K ∈ {2,3,4,9}`).
389
/// The number field an [`Order2Lane`] is built over: a single `f64` (one row,
/// the oracle) or a [`wide::f64x4`] (four rows evaluated side by side in SIMD
/// lanes). Every operation is plain lane-wise IEEE arithmetic, so a vector op
/// agrees bit-for-bit with the scalar op applied to each lane.
pub trait Lane: Copy {
    /// Put the same scalar `x` in every lane.
    fn splat(x: f64) -> Self;
    /// Lane-wise sum `self + o`.
    fn add(self, o: Self) -> Self;
    /// Lane-wise difference `self - o`.
    fn sub(self, o: Self) -> Self;
    /// Lane-wise product `self * o`.
    fn mul(self, o: Self) -> Self;
    /// Read the `f64` held in lane `i` (`i < LANES`; the `f64` impl ignores
    /// `i`).
    fn lane(self, i: usize) -> f64;
    /// Produce the order-≤2 derivative stack `[f(u), f′(u), f″(u)]` **per
    /// lane**, where `u` is that lane's value and `stack` is the SAME scalar
    /// closure the per-row path runs. The transcendental/rational stack is
    /// therefore bit-identical to the scalar evaluation; only the tensor
    /// composition that follows is vectorised.
    fn unary3(self, stack: impl Fn(f64) -> [f64; 3]) -> [Self; 3];
    /// Produce the order-≤4 derivative stack `[f, f′, f″, f‴, f⁗]` **per
    /// lane** from the lane value, again through the SAME scalar `stack`
    /// closure the per-row path runs. The one-/two-seed scalars
    /// ([`OneSeedLane`] / [`TwoSeedLane`]) need outer derivatives one / two
    /// orders past their order-2 base, so their composition stack is built
    /// with this five-entry variant. As with [`unary3`](Lane::unary3), only
    /// the stack is evaluated per lane (bit-identically to the scalar path);
    /// the subsequent tensor composition is vectorised.
    fn unary5(self, stack: impl Fn(f64) -> [f64; 5]) -> [Self; 5];
    /// General-`N` sibling of [`unary3`](Lane::unary3) /
    /// [`unary5`](Lane::unary5): evaluate the scalar `stack` closure once per
    /// lane at that lane's OWN base value (the four rows of an `f64x4` each
    /// carry a distinct base) and pack the `N` resulting columns lane-wise.
    /// Lane `i` of the packed result equals the scalar `stack(value_i)`
    /// bit-for-bit — the closure body is the identical scalar code and only
    /// the cheap pack is vectorised. This is the lane primitive the
    /// compose-with-stack seam
    /// ([`crate::jet_tower::Tower4Lane::compose_unary_with`] and its `Tower3`
    /// sibling) routes through. At `N = 3` / `N = 5` it is `to_bits`-identical
    /// to [`unary3`](Lane::unary3) / [`unary5`](Lane::unary5).
    fn unary_with<const N: usize>(self, stack: impl Fn(f64) -> [f64; N]) -> [Self; N];
}
433
434impl Lane for f64 {
435    #[inline]
436    fn splat(x: f64) -> Self {
437        x
438    }
439    #[inline]
440    fn add(self, o: Self) -> Self {
441        self + o
442    }
443    #[inline]
444    fn sub(self, o: Self) -> Self {
445        self - o
446    }
447    #[inline]
448    fn mul(self, o: Self) -> Self {
449        self * o
450    }
451    #[inline]
452    fn lane(self, _: usize) -> f64 {
453        self
454    }
455    #[inline]
456    fn unary3(self, stack: impl Fn(f64) -> [f64; 3]) -> [Self; 3] {
457        stack(self)
458    }
459    #[inline]
460    fn unary5(self, stack: impl Fn(f64) -> [f64; 5]) -> [Self; 5] {
461        stack(self)
462    }
463    #[inline]
464    fn unary_with<const N: usize>(self, stack: impl Fn(f64) -> [f64; N]) -> [Self; N] {
465        // One row: the packed result IS the scalar stack ([Self; N] = [f64; N]).
466        stack(self)
467    }
468}
469
470impl Lane for wide::f64x4 {
471    #[inline]
472    fn splat(x: f64) -> Self {
473        wide::f64x4::splat(x)
474    }
475    #[inline]
476    fn add(self, o: Self) -> Self {
477        self + o
478    }
479    #[inline]
480    fn sub(self, o: Self) -> Self {
481        self - o
482    }
483    #[inline]
484    fn mul(self, o: Self) -> Self {
485        self * o
486    }
487    #[inline]
488    fn lane(self, i: usize) -> f64 {
489        self.to_array()[i]
490    }
491    #[inline]
492    fn unary3(self, stack: impl Fn(f64) -> [f64; 3]) -> [Self; 3] {
493        let a = self.to_array();
494        let mut d0 = [0.0_f64; 4];
495        let mut d1 = [0.0_f64; 4];
496        let mut d2 = [0.0_f64; 4];
497        for i in 0..4 {
498            let s = stack(a[i]);
499            d0[i] = s[0];
500            d1[i] = s[1];
501            d2[i] = s[2];
502        }
503        [
504            wide::f64x4::new(d0),
505            wide::f64x4::new(d1),
506            wide::f64x4::new(d2),
507        ]
508    }
509    #[inline]
510    fn unary5(self, stack: impl Fn(f64) -> [f64; 5]) -> [Self; 5] {
511        let a = self.to_array();
512        let mut d = [[0.0_f64; 4]; 5];
513        for i in 0..4 {
514            let s = stack(a[i]);
515            for (k, dk) in d.iter_mut().enumerate() {
516                dk[i] = s[k];
517            }
518        }
519        [
520            wide::f64x4::new(d[0]),
521            wide::f64x4::new(d[1]),
522            wide::f64x4::new(d[2]),
523            wide::f64x4::new(d[3]),
524            wide::f64x4::new(d[4]),
525        ]
526    }
527    #[inline]
528    fn unary_with<const N: usize>(self, stack: impl Fn(f64) -> [f64; N]) -> [Self; N] {
529        // Evaluate the scalar stack PER LANE at that lane's own base value, then
530        // pack the N derivative columns lane-wise (the same shape `unary5` uses,
531        // generalised to N). Lane `i` of column `k` is `stack(base_i)[k]`.
532        let a = self.to_array();
533        let mut cols = [[0.0_f64; 4]; N];
534        for (i, &base) in a.iter().enumerate() {
535            let s = stack(base);
536            for (k, sk) in s.iter().enumerate() {
537                cols[k][i] = *sk;
538            }
539        }
540        std::array::from_fn(|k| wide::f64x4::new(cols[k]))
541    }
542}
543
544/// A lane-batched order-≤2 Taylor scalar: value / gradient / Hessian carried in
545/// a SIMD field [`L: Lane`](Lane). With `L = f64x4` one instance carries FOUR
546/// rows at once, so the row loop processes 4 rows per vector pass instead of one
547/// per scalar pass.
548///
549/// The channel layout and every float op mirror [`crate::jet_tower::Tower2`]
550/// term-for-term, so `Order2Lane<f64, K>` is `to_bits`-identical to the
551/// production [`Order2<K>`] and `Order2Lane<f64x4, K>` lane `i` is
552/// `to_bits`-identical to that (see the module note and `batch_tests`).
553#[derive(Clone, Copy, Debug)]
554pub struct Order2Lane<L: Lane, const K: usize> {
555    /// Value channel `ℓ` (one entry per lane/row).
556    pub v: L,
557    /// Gradient channel `∂ℓ/∂p_a`.
558    pub g: [L; K],
559    /// Hessian channel `∂²ℓ/∂p_a∂p_b` (symmetric).
560    pub h: [[L; K]; K],
561}
562
563/// The 4-rows-per-pass batched order-≤2 scalar (`wide::f64x4` lanes).
564pub type Order2Batch<const K: usize> = Order2Lane<wide::f64x4, K>;
565
566impl<L: Lane, const K: usize> Order2Lane<L, K> {
567    /// A constant: value `c` in every channel-zero slot.
568    #[inline]
569    pub fn constant(c: L) -> Self {
570        Order2Lane {
571            v: c,
572            g: [L::splat(0.0); K],
573            h: [[L::splat(0.0); K]; K],
574        }
575    }
576
577    /// The seeded variable `p_axis` at (per-lane) value `value`: unit first
578    /// derivative in slot `axis`. With `L = f64x4`, `value` packs the four
579    /// rows' values of primary `axis`.
580    #[inline]
581    pub fn variable(value: L, axis: usize) -> Self {
582        let mut out = Self::constant(value);
583        out.g[axis] = L::splat(1.0);
584        out
585    }
586
587    /// Lane-wise `self + o` (mirrors `Tower2` Add: per-channel add).
588    #[inline]
589    pub fn add(&self, o: &Self) -> Self {
590        let mut out = *self;
591        out.v = self.v.add(o.v);
592        for i in 0..K {
593            out.g[i] = self.g[i].add(o.g[i]);
594            for j in 0..K {
595                out.h[i][j] = self.h[i][j].add(o.h[i][j]);
596            }
597        }
598        out
599    }
600
601    /// Multiply every channel by the plain scalar `s` (mirrors `Tower2::scale`).
602    #[inline]
603    pub fn scale(&self, s: f64) -> Self {
604        let sl = L::splat(s);
605        let mut out = *self;
606        out.v = self.v.mul(sl);
607        for i in 0..K {
608            out.g[i] = self.g[i].mul(sl);
609            for j in 0..K {
610                out.h[i][j] = self.h[i][j].mul(sl);
611            }
612        }
613        out
614    }
615
616    /// Lane-wise `self - o`, expressed as `self + o·(-1)` exactly as
617    /// [`Order2::sub`] / `Tower4::sub` do, so signed-zero handling matches.
618    #[inline]
619    pub fn sub(&self, o: &Self) -> Self {
620        self.add(&o.scale(-1.0))
621    }
622
623    /// Negate every channel (= `scale(-1.0)`, matching [`Order2::neg`]).
624    #[inline]
625    pub fn neg(&self) -> Self {
626        self.scale(-1.0)
627    }
628
629    /// Exact order-≤2 Leibniz product, term-for-term identical to
630    /// [`crate::jet_tower::Tower2::mul`] (same factor order, no `mul_add`).
631    #[inline]
632    pub fn mul(&self, o: &Self) -> Self {
633        let a = self;
634        let b = o;
635        let mut out = Self::constant(a.v.mul(b.v));
636        for i in 0..K {
637            // a.v*b.g[i] + a.g[i]*b.v
638            out.g[i] = a.v.mul(b.g[i]).add(a.g[i].mul(b.v));
639        }
640        for i in 0..K {
641            for j in 0..K {
642                // a.v*b.h + a.g[i]*b.g[j] + a.g[j]*b.g[i] + a.h*b.v
643                out.h[i][j] = a
644                    .v
645                    .mul(b.h[i][j])
646                    .add(a.g[i].mul(b.g[j]))
647                    .add(a.g[j].mul(b.g[i]))
648                    .add(a.h[i][j].mul(b.v));
649            }
650        }
651        out
652    }
653
654    /// Exact order-≤2 Faà di Bruno composition `f ∘ self`, given the per-lane
655    /// derivative stack `d = [f(u), f′(u), f″(u)]`. Mirrors
656    /// [`crate::jet_tower::Tower2::compose_unary`] term-for-term (`acc` starts at
657    /// `0` then accumulates, so signed-zero collapses identically).
658    #[inline]
659    pub fn compose_unary(&self, d: [L; 3]) -> Self {
660        let mut out = Self::constant(d[0]);
661        for i in 0..K {
662            let mut acc = L::splat(0.0);
663            acc = acc.add(d[1].mul(self.g[i]));
664            out.g[i] = acc;
665        }
666        for i in 0..K {
667            for j in 0..K {
668                let mut acc = L::splat(0.0);
669                acc = acc.add(d[1].mul(self.h[i][j]));
670                acc = acc.add(d[2].mul(self.g[i]).mul(self.g[j]));
671                out.h[i][j] = acc;
672            }
673        }
674        out
675    }
676
677    /// `e^self`, per-lane stack `[e, e, e]` (matches the [`JetScalar::exp`]
678    /// default forwarded through `Order2`).
679    #[inline]
680    pub fn exp(&self) -> Self {
681        let d = self.v.unary3(|u| {
682            let e = u.exp();
683            [e, e, e]
684        });
685        self.compose_unary(d)
686    }
687
688    /// `ln(self)`; caller guarantees positivity. Per-lane stack
689    /// `[ln u, 1/u, -1/u²]` (matches [`JetScalar::ln`] truncated to order 2).
690    #[inline]
691    pub fn ln(&self) -> Self {
692        let d = self.v.unary3(|u| {
693            let r = 1.0 / u;
694            [u.ln(), r, -r * r]
695        });
696        self.compose_unary(d)
697    }
698
699    /// `√self`; caller guarantees positivity. Per-lane stack
700    /// `[s, 0.5/s, -0.25/(u·s)]` (matches [`JetScalar::sqrt`]).
701    #[inline]
702    pub fn sqrt(&self) -> Self {
703        let d = self.v.unary3(|u| {
704            let s = u.sqrt();
705            [s, 0.5 / s, -0.25 / (u * s)]
706        });
707        self.compose_unary(d)
708    }
709
710    /// `1/self`. Per-lane stack `[r, -r², 2r³]` (matches [`JetScalar::recip`]).
711    #[inline]
712    pub fn recip(&self) -> Self {
713        let d = self.v.unary3(|u| {
714            let r = 1.0 / u;
715            let r2 = r * r;
716            [r, -r2, 2.0 * r2 * r]
717        });
718        self.compose_unary(d)
719    }
720
721    /// `self^a` for real `a`; caller guarantees a positive base. Per-lane
722    /// falling-factorial stack (matches [`JetScalar::powf`]).
723    #[inline]
724    pub fn powf(&self, a: f64) -> Self {
725        let d = self.v.unary3(|u| {
726            [
727                u.powf(a),
728                a * u.powf(a - 1.0),
729                a * (a - 1.0) * u.powf(a - 2.0),
730            ]
731        });
732        self.compose_unary(d)
733    }
734}
735
736impl<const K: usize> Order2Batch<K> {
737    /// Extract lane `i`'s `(v, g, H)` as a production [`Order2<K>`] scalar.
738    /// Lane `i` is `to_bits`-identical to evaluating the same program at
739    /// [`Order2<K>`] on row `i` (see `batch_tests`).
740    #[inline]
741    #[must_use]
742    pub fn lane(&self, i: usize) -> Order2<K> {
743        let mut t = crate::jet_tower::Tower2::<K>::constant(self.v.lane(i));
744        for a in 0..K {
745            t.g[a] = self.g[a].lane(i);
746            for b in 0..K {
747                t.h[a][b] = self.h[a][b].lane(i);
748            }
749        }
750        Order2(t)
751    }
752}
753
754// ── Order1<K>: value / gradient only (doc §A.1, first-order prune) ──────
755
/// First-order truncated scalar: it stores the value `v` and the gradient
/// `g_a` and nothing else — there is NO Hessian channel.
///
/// Think of it as [`Order2`] with the K×K Hessian removed. Value and gradient
/// follow the SAME order-≤1 truncation of the Leibniz / Faà di Bruno rules that
/// [`Order2`] applies to those two channels, with the float operations in the
/// identical order, so `(v, g)` is BIT-IDENTICAL to the order-≤1 channels of
/// both [`Order2`] and a full [`crate::jet_tower::Tower4`].
///
/// It is meant for a consumer that reads ONLY value + gradient (the SAE
/// β-border channel: the reconstruction is linear in β, so the Hessian-in-β
/// vanishes and the dense K×K Hessian product that `Tower2::mul` would build
/// is pure discarded work). Because order-≤1 value/gradient never read any
/// input's Hessian, dropping the channel changes neither the result nor the
/// float-op order; it only removes the `K²` arithmetic behind an unread tensor.
#[derive(Debug, Clone, Copy)]
pub struct Order1<const K: usize> {
    /// Value ℓ.
    pub v: f64,
    /// Gradient ∂ℓ/∂p_a.
    pub g: [f64; K],
}
776
777impl<const K: usize> Order1<K> {
778    /// Read the gradient channel `g_a = ∂ℓ/∂p_a`.
779    #[inline]
780    pub fn g(&self) -> [f64; K] {
781        self.g
782    }
783}
784
785impl<const K: usize> JetScalar<K> for Order1<K> {
786    fn constant(c: f64) -> Self {
787        // Order2::constant -> Tower2::constant: value c, all derivatives zero.
788        Order1 { v: c, g: [0.0; K] }
789    }
790    fn variable(x: f64, axis: usize) -> Self {
791        // Order2::variable -> Tower2::variable: unit first derivative in `axis`.
792        let mut g = [0.0; K];
793        g[axis] = 1.0;
794        Order1 { v: x, g }
795    }
796    fn value(&self) -> f64 {
797        self.v
798    }
799    fn add(&self, o: &Self) -> Self {
800        // Tower2 Add: out.v += o.v; out.g[i] += o.g[i] (same float order).
801        let mut g = self.g;
802        for i in 0..K {
803            g[i] += o.g[i];
804        }
805        Order1 { v: self.v + o.v, g }
806    }
807    fn sub(&self, o: &Self) -> Self {
808        // Mirror Order2::sub == self + o.scale(-1.0) exactly: scale then add.
809        self.add(&o.scale(-1.0))
810    }
811    fn mul(&self, o: &Self) -> Self {
812        // Tower2::mul value/grad terms, identical float order:
813        //   v = a.v*b.v;  g[i] = a.v*b.g[i] + a.g[i]*b.v.
814        // (The Hessian loop `a.v*b.h + a.g*b.g + ... + a.h*b.v` is the discarded
815        //  work this type exists to skip; it never feeds v or g.)
816        let a = self;
817        let b = o;
818        let mut g = [0.0; K];
819        for i in 0..K {
820            g[i] = a.v * b.g[i] + a.g[i] * b.v;
821        }
822        Order1 { v: a.v * b.v, g }
823    }
824    fn neg(&self) -> Self {
825        // Order2::neg == self.0.scale(-1.0).
826        self.scale(-1.0)
827    }
828    fn scale(&self, s: f64) -> Self {
829        // Tower2::scale: out.v *= s; out.g[i] *= s (same float order).
830        let mut g = self.g;
831        for i in 0..K {
832            g[i] *= s;
833        }
834        Order1 { v: self.v * s, g }
835    }
836    fn compose_unary(&self, d: [f64; 5]) -> Self {
837        // Faà di Bruno truncated to order ≤ 1 (matches `faa_di_bruno` /
838        // `Tower2::compose_unary` for the value and gradient channels):
839        //   value channel (m=0): d[0].
840        //   grad channel (positions=[i], single partition {{0}}): d[1]·g[i].
841        // Order-≤1 reads only d[0], d[1]; trailing stack entries are unused.
842        let mut g = [0.0; K];
843        for i in 0..K {
844            g[i] = d[1] * self.g[i];
845        }
846        Order1 { v: d[0], g }
847    }
848}
849
850// ── OneSeed<K>: one-seed directional, contracted third (doc §A.2) ───────
851
852/// One-seed directional scalar: an [`Order2`] base plus ONE nilpotent ε
853/// (`ε² = 0`) whose coefficient is itself an [`Order2`].
854///
855/// A scalar is `s = base + ε·eps`. Arithmetic is the `ε² = 0` truncation of the
856/// product (doc §A.2): the base parts multiply as ordinary [`Order2`] products,
857/// and the ε-coefficient picks up `a.base·b.eps + a.eps·b.base`. Composition
858/// pushes ε through one extra outer derivative.
859///
860/// Seed each primary with [`seed_direction`](Self::seed_direction): the base is
861/// the usual seeded variable (carrying `e_a` for the Hessian channel) and the
862/// ε-coefficient is the FIXED contraction direction `u_a` (a constant). Then the
863/// ε-component of the evaluated Hessian channel is the contracted third
864/// `[eps.h][a][b] = Σ_c ℓ_{abc} u_c` — exactly `row_third_contracted(dir = u)`,
865/// without materialising `t3`.
866#[derive(Clone, Copy, Debug)]
867pub struct OneSeed<const K: usize> {
868    /// The `ε⁰` part: value / gradient / Hessian of `ℓ`.
869    pub base: Order2<K>,
870    /// The `ε¹` part: value / gradient / Hessian of the ε-coefficient. After a
871    /// `seed_direction(u)` evaluation, `eps.h[a][b] = Σ_c ℓ_{abc} u_c`.
872    pub eps: Order2<K>,
873}
874
875impl<const K: usize> OneSeed<K> {
876    /// Seed primary `axis` at value `x` with ε-direction component `u_axis`:
877    /// `p_axis = p_axis⁰ + x-seed + ε·u_axis`, i.e. base = `variable(x, axis)`
878    /// and eps = `constant(u_axis)` (doc §A.2 "Seeding").
879    pub fn seed_direction(x: f64, axis: usize, u_axis: f64) -> Self {
880        OneSeed {
881            base: Order2::variable(x, axis),
882            eps: Order2::constant(u_axis),
883        }
884    }
885
886    /// The contracted-third channel after a `seed_direction(u)` evaluation:
887    /// `out[a][b] = Σ_c ℓ_{abc} u_c`, i.e. the ε-coefficient's Hessian (doc §A.2).
888    pub fn contracted_third(&self) -> [[f64; K]; K] {
889        self.eps.h()
890    }
891}
892
893impl<const K: usize> JetScalar<K> for OneSeed<K> {
894    fn constant(c: f64) -> Self {
895        OneSeed {
896            base: Order2::constant(c),
897            eps: Order2::constant(0.0),
898        }
899    }
900    fn variable(x: f64, axis: usize) -> Self {
901        // No ε-direction unless seeded via `seed_direction`.
902        OneSeed {
903            base: Order2::variable(x, axis),
904            eps: Order2::constant(0.0),
905        }
906    }
907    fn value(&self) -> f64 {
908        self.base.value()
909    }
910    fn add(&self, o: &Self) -> Self {
911        OneSeed {
912            base: self.base.add(&o.base),
913            eps: self.eps.add(&o.eps),
914        }
915    }
916    fn sub(&self, o: &Self) -> Self {
917        OneSeed {
918            base: self.base.sub(&o.base),
919            eps: self.eps.sub(&o.eps),
920        }
921    }
922    fn mul(&self, o: &Self) -> Self {
923        // (a.base + ε a.eps)(b.base + ε b.eps), dropping ε².
924        OneSeed {
925            base: self.base.mul(&o.base),
926            eps: self.base.mul(&o.eps).add(&self.eps.mul(&o.base)),
927        }
928    }
929    fn neg(&self) -> Self {
930        OneSeed {
931            base: self.base.neg(),
932            eps: self.eps.neg(),
933        }
934    }
935    fn scale(&self, s: f64) -> Self {
936        OneSeed {
937            base: self.base.scale(s),
938            eps: self.eps.scale(s),
939        }
940    }
941    fn compose_unary(&self, d: [f64; 5]) -> Self {
942        // f(base + ε eps) = f(base) + ε · f'(base)·eps  (ε² = 0). Each factor is
943        // an Order2 composition: the base composes with the f-stack, and the
944        // ε-coefficient is the Order2 of the SHIFTED stack (the chain rule
945        // `f'(base)` as an Order2) times eps. Order2 reads only the leading
946        // three entries of whatever stack it is handed, so the trailing slots
947        // are unused padding (the fixed-length array makes the windowing total).
948        let base = self.base.compose_unary([d[0], d[1], d[2], d[3], d[4]]);
949        // f'(base) as an Order2 (consumes [f', f'', f''']).
950        let fprime = self.base.compose_unary([d[1], d[2], d[3], d[4], d[4]]);
951        let eps = fprime.mul(&self.eps);
952        OneSeed { base, eps }
953    }
954}
955
956// ── OneSeedLane<L, K>: lane-batched one-seed directional (doc §A.2) ──────
957
958/// Lane-batched [`OneSeed`]: the same one-seed directional scalar with its two
959/// [`Order2`] parts re-typed to [`Order2Lane<L, K>`], so one `L = f64x4`
960/// instance carries FOUR rows' contracted-third evaluations per vector pass.
961///
962/// Every operation (`add`/`sub`/`mul`/`neg`/`scale`/`compose_unary` and the
963/// transcendentals) is a term-for-term structural re-type of the scalar
964/// [`OneSeed`] ops onto the lane-implemented [`Order2Lane`] algebra. With
965/// `L = f64`, `OneSeedLane<f64, K>` is `to_bits`-identical to [`OneSeed<K>`];
966/// with `L = f64x4`, lane `i` is `to_bits`-identical to that (see `batch_tests`).
967#[derive(Clone, Copy, Debug)]
968pub struct OneSeedLane<L: Lane, const K: usize> {
969    /// The `ε⁰` part (lane-batched value / gradient / Hessian of `ℓ`).
970    pub base: Order2Lane<L, K>,
971    /// The `ε¹` part. After a `seed_direction(u)` evaluation,
972    /// `eps.h[a][b]` lane `i` is row `i`'s `Σ_c ℓ_{abc} u_c`.
973    pub eps: Order2Lane<L, K>,
974}
975
976/// The 4-rows-per-pass batched one-seed scalar (`wide::f64x4` lanes).
977pub type OneSeedBatch<const K: usize> = OneSeedLane<wide::f64x4, K>;
978
979impl<L: Lane, const K: usize> OneSeedLane<L, K> {
980    /// A constant: base = `constant(c)`, ε-part zero (mirrors [`OneSeed::constant`]).
981    #[inline]
982    pub fn constant(c: L) -> Self {
983        OneSeedLane {
984            base: Order2Lane::constant(c),
985            eps: Order2Lane::constant(L::splat(0.0)),
986        }
987    }
988
989    /// The seeded variable `p_axis` at (per-lane) value `value`, no ε-direction
990    /// (mirrors [`OneSeed::variable`]).
991    #[inline]
992    pub fn variable(value: L, axis: usize) -> Self {
993        OneSeedLane {
994            base: Order2Lane::variable(value, axis),
995            eps: Order2Lane::constant(L::splat(0.0)),
996        }
997    }
998
999    /// Seed primary `axis` at (per-lane) value `value` with ε-direction
1000    /// component `u_axis`: base = `variable(value, axis)`, eps = `constant(u_axis)`
1001    /// (mirrors [`OneSeed::seed_direction`]). With `L = f64x4`, `value` / `u_axis`
1002    /// pack the four rows' values / directions of primary `axis`.
1003    #[inline]
1004    pub fn seed_direction(value: L, axis: usize, u_axis: L) -> Self {
1005        OneSeedLane {
1006            base: Order2Lane::variable(value, axis),
1007            eps: Order2Lane::constant(u_axis),
1008        }
1009    }
1010
1011    /// The contracted-third channel after a `seed_direction(u)` evaluation:
1012    /// `out[a][b]` lane `i` is row `i`'s `Σ_c ℓ_{abc} u_c` (the ε-part Hessian).
1013    #[inline]
1014    #[must_use]
1015    pub fn contracted_third(&self) -> [[L; K]; K] {
1016        self.eps.h
1017    }
1018
1019    /// Lane-wise `self + o` (mirrors [`OneSeed::add`]).
1020    #[inline]
1021    pub fn add(&self, o: &Self) -> Self {
1022        OneSeedLane {
1023            base: self.base.add(&o.base),
1024            eps: self.eps.add(&o.eps),
1025        }
1026    }
1027
1028    /// Lane-wise `self - o` (mirrors [`OneSeed::sub`]).
1029    #[inline]
1030    pub fn sub(&self, o: &Self) -> Self {
1031        OneSeedLane {
1032            base: self.base.sub(&o.base),
1033            eps: self.eps.sub(&o.eps),
1034        }
1035    }
1036
1037    /// Lane-wise `self · o`, ε² = 0 truncation (mirrors [`OneSeed::mul`]).
1038    #[inline]
1039    pub fn mul(&self, o: &Self) -> Self {
1040        OneSeedLane {
1041            base: self.base.mul(&o.base),
1042            eps: self.base.mul(&o.eps).add(&self.eps.mul(&o.base)),
1043        }
1044    }
1045
1046    /// Negate every part (mirrors [`OneSeed::neg`]).
1047    #[inline]
1048    pub fn neg(&self) -> Self {
1049        OneSeedLane {
1050            base: self.base.neg(),
1051            eps: self.eps.neg(),
1052        }
1053    }
1054
1055    /// Multiply every part by the plain scalar `s` (mirrors [`OneSeed::scale`]).
1056    #[inline]
1057    pub fn scale(&self, s: f64) -> Self {
1058        OneSeedLane {
1059            base: self.base.scale(s),
1060            eps: self.eps.scale(s),
1061        }
1062    }
1063
1064    /// Exact order-≤2-per-part Faà di Bruno composition `f ∘ self`, given the
1065    /// per-lane outer-derivative stack `d = [f, f′, f″, f‴, f⁗]`. Term-for-term
1066    /// identical to [`OneSeed::compose_unary`]: the base reads `d[0..=2]` and the
1067    /// ε-coefficient is `f′(base)` (reads `d[1..=3]`) times `eps`.
1068    #[inline]
1069    pub fn compose_unary(&self, d: [L; 5]) -> Self {
1070        let base = self.base.compose_unary([d[0], d[1], d[2]]);
1071        let fprime = self.base.compose_unary([d[1], d[2], d[3]]);
1072        let eps = fprime.mul(&self.eps);
1073        OneSeedLane { base, eps }
1074    }
1075
1076    /// `e^self`, per-lane stack `[e, e, e, e, e]` (matches [`JetScalar::exp`]).
1077    #[inline]
1078    pub fn exp(&self) -> Self {
1079        let d = self.base.v.unary5(|u| {
1080            let e = u.exp();
1081            [e, e, e, e, e]
1082        });
1083        self.compose_unary(d)
1084    }
1085
1086    /// `ln(self)`; caller guarantees positivity (matches [`JetScalar::ln`]).
1087    #[inline]
1088    pub fn ln(&self) -> Self {
1089        let d = self.base.v.unary5(|u| {
1090            let r = 1.0 / u;
1091            [u.ln(), r, -r * r, 2.0 * r * r * r, -6.0 * r * r * r * r]
1092        });
1093        self.compose_unary(d)
1094    }
1095
1096    /// `√self`; caller guarantees positivity (matches [`JetScalar::sqrt`]).
1097    #[inline]
1098    pub fn sqrt(&self) -> Self {
1099        let d = self.base.v.unary5(|u| {
1100            let s = u.sqrt();
1101            [
1102                s,
1103                0.5 / s,
1104                -0.25 / (u * s),
1105                0.375 / (u * u * s),
1106                -0.9375 / (u * u * u * s),
1107            ]
1108        });
1109        self.compose_unary(d)
1110    }
1111
1112    /// `1/self` (matches [`JetScalar::recip`]).
1113    #[inline]
1114    pub fn recip(&self) -> Self {
1115        let d = self.base.v.unary5(|u| {
1116            let r = 1.0 / u;
1117            let r2 = r * r;
1118            [r, -r2, 2.0 * r2 * r, -6.0 * r2 * r2, 24.0 * r2 * r2 * r]
1119        });
1120        self.compose_unary(d)
1121    }
1122
1123    /// `self^a` for real `a`; caller guarantees a positive base (matches
1124    /// [`JetScalar::powf`]).
1125    #[inline]
1126    pub fn powf(&self, a: f64) -> Self {
1127        let d = self.base.v.unary5(|u| {
1128            [
1129                u.powf(a),
1130                a * u.powf(a - 1.0),
1131                a * (a - 1.0) * u.powf(a - 2.0),
1132                a * (a - 1.0) * (a - 2.0) * u.powf(a - 3.0),
1133                a * (a - 1.0) * (a - 2.0) * (a - 3.0) * u.powf(a - 4.0),
1134            ]
1135        });
1136        self.compose_unary(d)
1137    }
1138
1139    /// `ln Γ(self)`; caller guarantees positivity (matches [`JetScalar::ln_gamma`],
1140    /// same hand-certified stack).
1141    #[inline]
1142    pub fn ln_gamma(&self) -> Self {
1143        let d = self
1144            .base
1145            .v
1146            .unary5(crate::jet_tower::ln_gamma_derivative_stack);
1147        self.compose_unary(d)
1148    }
1149
1150    /// `ψ(self)` digamma; caller guarantees positivity (matches
1151    /// [`JetScalar::digamma`], same hand-certified stack).
1152    #[inline]
1153    pub fn digamma(&self) -> Self {
1154        let d = self
1155            .base
1156            .v
1157            .unary5(crate::jet_tower::digamma_derivative_stack);
1158        self.compose_unary(d)
1159    }
1160}
1161
1162impl<const K: usize> OneSeedBatch<K> {
1163    /// Extract lane `i`'s parts as a production [`OneSeed<K>`]. Lane `i` is
1164    /// `to_bits`-identical to evaluating the same program at [`OneSeed<K>`] on
1165    /// row `i` (see `batch_tests`).
1166    #[inline]
1167    #[must_use]
1168    pub fn lane(&self, i: usize) -> OneSeed<K> {
1169        OneSeed {
1170            base: self.base.lane(i),
1171            eps: self.eps.lane(i),
1172        }
1173    }
1174}
1175
1176// ── TwoSeed<K>: two-seed, contracted fourth (doc §A.3) ──────────────────
1177
1178/// Two-seed scalar: an [`Order2`] base plus TWO nilpotents ε, δ
1179/// (`ε² = δ² = 0`, `εδ` retained) — four [`Order2`] parts
1180/// `s = base + ε·eps + δ·del + εδ·eps_del`.
1181///
1182/// Product truncates `ε² = δ² = 0` (doc §A.3): each part is built from
1183/// [`Order2`] products of the four input parts. Composition picks up
1184/// successively higher outer derivatives, the cross part carrying the second
1185/// Faà di Bruno term `f''·eps·del + f'·eps_del`.
1186///
1187/// Seed each primary with [`seed`](Self::seed): base = `variable(x, axis)`,
1188/// eps = `constant(u_axis)`, del = `constant(v_axis)`, eps_del = `constant(0)`.
1189/// Then the εδ-component of the evaluated Hessian channel is the contracted
1190/// fourth `[eps_del.h][a][b] = Σ_{cd} ℓ_{abcd} u_c v_d` — exactly
1191/// `row_fourth_contracted(u, v)`, without materialising `t4`.
1192#[derive(Clone, Copy, Debug)]
1193pub struct TwoSeed<const K: usize> {
1194    /// The `ε⁰δ⁰` part: value / grad / Hessian of `ℓ`.
1195    pub base: Order2<K>,
1196    /// The `ε¹δ⁰` part.
1197    pub eps: Order2<K>,
1198    /// The `ε⁰δ¹` part.
1199    pub del: Order2<K>,
1200    /// The `ε¹δ¹` part. After a `seed(u, v)` evaluation,
1201    /// `eps_del.h[a][b] = Σ_{cd} ℓ_{abcd} u_c v_d`.
1202    pub eps_del: Order2<K>,
1203}
1204
1205impl<const K: usize> TwoSeed<K> {
1206    /// Seed primary `axis` at value `x` with ε-direction `u_axis` and
1207    /// δ-direction `v_axis`:
1208    /// `p_axis = p_axis⁰ + x-seed + ε·u_axis + δ·v_axis` (doc §A.3 "Seeding").
1209    pub fn seed(x: f64, axis: usize, u_axis: f64, v_axis: f64) -> Self {
1210        TwoSeed {
1211            base: Order2::variable(x, axis),
1212            eps: Order2::constant(u_axis),
1213            del: Order2::constant(v_axis),
1214            eps_del: Order2::constant(0.0),
1215        }
1216    }
1217
1218    /// The contracted-fourth channel after a `seed(u, v)` evaluation:
1219    /// `out[a][b] = Σ_{cd} ℓ_{abcd} u_c v_d`, i.e. the εδ-coefficient's Hessian.
1220    pub fn contracted_fourth(&self) -> [[f64; K]; K] {
1221        self.eps_del.h()
1222    }
1223}
1224
1225impl<const K: usize> JetScalar<K> for TwoSeed<K> {
1226    fn constant(c: f64) -> Self {
1227        TwoSeed {
1228            base: Order2::constant(c),
1229            eps: Order2::constant(0.0),
1230            del: Order2::constant(0.0),
1231            eps_del: Order2::constant(0.0),
1232        }
1233    }
1234    fn variable(x: f64, axis: usize) -> Self {
1235        TwoSeed {
1236            base: Order2::variable(x, axis),
1237            eps: Order2::constant(0.0),
1238            del: Order2::constant(0.0),
1239            eps_del: Order2::constant(0.0),
1240        }
1241    }
1242    fn value(&self) -> f64 {
1243        self.base.value()
1244    }
1245    fn add(&self, o: &Self) -> Self {
1246        TwoSeed {
1247            base: self.base.add(&o.base),
1248            eps: self.eps.add(&o.eps),
1249            del: self.del.add(&o.del),
1250            eps_del: self.eps_del.add(&o.eps_del),
1251        }
1252    }
1253    fn sub(&self, o: &Self) -> Self {
1254        TwoSeed {
1255            base: self.base.sub(&o.base),
1256            eps: self.eps.sub(&o.eps),
1257            del: self.del.sub(&o.del),
1258            eps_del: self.eps_del.sub(&o.eps_del),
1259        }
1260    }
1261    fn mul(&self, o: &Self) -> Self {
1262        let a = self;
1263        let b = o;
1264        // Truncate ε² = δ² = 0 (doc §A.3 product table).
1265        let base = a.base.mul(&b.base);
1266        let eps = a.base.mul(&b.eps).add(&a.eps.mul(&b.base));
1267        let del = a.base.mul(&b.del).add(&a.del.mul(&b.base));
1268        let eps_del = a
1269            .base
1270            .mul(&b.eps_del)
1271            .add(&a.eps.mul(&b.del))
1272            .add(&a.del.mul(&b.eps))
1273            .add(&a.eps_del.mul(&b.base));
1274        TwoSeed {
1275            base,
1276            eps,
1277            del,
1278            eps_del,
1279        }
1280    }
1281    fn neg(&self) -> Self {
1282        TwoSeed {
1283            base: self.base.neg(),
1284            eps: self.eps.neg(),
1285            del: self.del.neg(),
1286            eps_del: self.eps_del.neg(),
1287        }
1288    }
1289    fn scale(&self, s: f64) -> Self {
1290        TwoSeed {
1291            base: self.base.scale(s),
1292            eps: self.eps.scale(s),
1293            del: self.del.scale(s),
1294            eps_del: self.eps_del.scale(s),
1295        }
1296    }
1297    fn compose_unary(&self, d: [f64; 5]) -> Self {
1298        // f(s) with s = base + ε eps + δ del + εδ eps_del, ε²=δ²=0:
1299        //   f(s) = f(base)
1300        //        + ε · f'(base)·eps
1301        //        + δ · f'(base)·del
1302        //        + εδ · ( f''(base)·eps·del + f'(base)·eps_del ).
1303        // Each f^{(r)}(base) is the Order2 composition of base with the stack
1304        // shifted r entries (doc §A.3 composition). Order2 reads only the
1305        // leading three entries of whatever stack it is handed, so the trailing
1306        // padding slots are unused (the fixed-length array makes this total).
1307        let base = self.base.compose_unary([d[0], d[1], d[2], d[3], d[4]]);
1308        let fprime = self.base.compose_unary([d[1], d[2], d[3], d[4], d[4]]); // f'(base) as Order2
1309        let fsecond = self.base.compose_unary([d[2], d[3], d[4], d[4], d[4]]); // f''(base) as Order2
1310        let eps = fprime.mul(&self.eps);
1311        let del = fprime.mul(&self.del);
1312        let eps_del = fsecond
1313            .mul(&self.eps)
1314            .mul(&self.del)
1315            .add(&fprime.mul(&self.eps_del));
1316        TwoSeed {
1317            base,
1318            eps,
1319            del,
1320            eps_del,
1321        }
1322    }
1323}
1324
1325// ── TwoSeedLane<L, K>: lane-batched two-seed, contracted fourth (doc §A.3) ─
1326
1327/// Lane-batched [`TwoSeed`]: the same two-seed scalar with its four [`Order2`]
1328/// parts re-typed to [`Order2Lane<L, K>`], so one `L = f64x4` instance carries
1329/// FOUR rows' contracted-fourth evaluations per vector pass.
1330///
1331/// Every operation is a term-for-term structural re-type of the scalar
1332/// [`TwoSeed`] ops onto the lane-implemented [`Order2Lane`] algebra. With
1333/// `L = f64`, `TwoSeedLane<f64, K>` is `to_bits`-identical to [`TwoSeed<K>`];
1334/// with `L = f64x4`, lane `i` is `to_bits`-identical to that (see `batch_tests`).
1335#[derive(Clone, Copy, Debug)]
1336pub struct TwoSeedLane<L: Lane, const K: usize> {
1337    /// The `ε⁰δ⁰` part.
1338    pub base: Order2Lane<L, K>,
1339    /// The `ε¹δ⁰` part.
1340    pub eps: Order2Lane<L, K>,
1341    /// The `ε⁰δ¹` part.
1342    pub del: Order2Lane<L, K>,
1343    /// The `ε¹δ¹` part. After a `seed(u, v)` evaluation, `eps_del.h[a][b]`
1344    /// lane `i` is row `i`'s `Σ_{cd} ℓ_{abcd} u_c v_d`.
1345    pub eps_del: Order2Lane<L, K>,
1346}
1347
1348/// The 4-rows-per-pass batched two-seed scalar (`wide::f64x4` lanes).
1349pub type TwoSeedBatch<const K: usize> = TwoSeedLane<wide::f64x4, K>;
1350
1351impl<L: Lane, const K: usize> TwoSeedLane<L, K> {
1352    /// A constant: base = `constant(c)`, all seed parts zero (mirrors
1353    /// [`TwoSeed::constant`]).
1354    #[inline]
1355    pub fn constant(c: L) -> Self {
1356        let z = Order2Lane::constant(L::splat(0.0));
1357        TwoSeedLane {
1358            base: Order2Lane::constant(c),
1359            eps: z,
1360            del: z,
1361            eps_del: z,
1362        }
1363    }
1364
1365    /// The seeded variable `p_axis` at (per-lane) value `value`, no ε/δ direction
1366    /// (mirrors [`TwoSeed::variable`]).
1367    #[inline]
1368    pub fn variable(value: L, axis: usize) -> Self {
1369        let z = Order2Lane::constant(L::splat(0.0));
1370        TwoSeedLane {
1371            base: Order2Lane::variable(value, axis),
1372            eps: z,
1373            del: z,
1374            eps_del: z,
1375        }
1376    }
1377
1378    /// Seed primary `axis` at (per-lane) value `value` with ε-direction `u_axis`
1379    /// and δ-direction `v_axis` (mirrors [`TwoSeed::seed`]). With `L = f64x4`,
1380    /// each argument packs the four rows' values for primary `axis`.
1381    #[inline]
1382    pub fn seed(value: L, axis: usize, u_axis: L, v_axis: L) -> Self {
1383        TwoSeedLane {
1384            base: Order2Lane::variable(value, axis),
1385            eps: Order2Lane::constant(u_axis),
1386            del: Order2Lane::constant(v_axis),
1387            eps_del: Order2Lane::constant(L::splat(0.0)),
1388        }
1389    }
1390
1391    /// The contracted-fourth channel after a `seed(u, v)` evaluation:
1392    /// `out[a][b]` lane `i` is row `i`'s `Σ_{cd} ℓ_{abcd} u_c v_d`
1393    /// (the εδ-part Hessian).
1394    #[inline]
1395    #[must_use]
1396    pub fn contracted_fourth(&self) -> [[L; K]; K] {
1397        self.eps_del.h
1398    }
1399
1400    /// Lane-wise `self + o` (mirrors [`TwoSeed::add`]).
1401    #[inline]
1402    pub fn add(&self, o: &Self) -> Self {
1403        TwoSeedLane {
1404            base: self.base.add(&o.base),
1405            eps: self.eps.add(&o.eps),
1406            del: self.del.add(&o.del),
1407            eps_del: self.eps_del.add(&o.eps_del),
1408        }
1409    }
1410
1411    /// Lane-wise `self - o` (mirrors [`TwoSeed::sub`]).
1412    #[inline]
1413    pub fn sub(&self, o: &Self) -> Self {
1414        TwoSeedLane {
1415            base: self.base.sub(&o.base),
1416            eps: self.eps.sub(&o.eps),
1417            del: self.del.sub(&o.del),
1418            eps_del: self.eps_del.sub(&o.eps_del),
1419        }
1420    }
1421
1422    /// Lane-wise `self · o`, ε² = δ² = 0 truncation (mirrors [`TwoSeed::mul`]).
1423    #[inline]
1424    pub fn mul(&self, o: &Self) -> Self {
1425        let a = self;
1426        let b = o;
1427        let base = a.base.mul(&b.base);
1428        let eps = a.base.mul(&b.eps).add(&a.eps.mul(&b.base));
1429        let del = a.base.mul(&b.del).add(&a.del.mul(&b.base));
1430        let eps_del = a
1431            .base
1432            .mul(&b.eps_del)
1433            .add(&a.eps.mul(&b.del))
1434            .add(&a.del.mul(&b.eps))
1435            .add(&a.eps_del.mul(&b.base));
1436        TwoSeedLane {
1437            base,
1438            eps,
1439            del,
1440            eps_del,
1441        }
1442    }
1443
1444    /// Negate every part (mirrors [`TwoSeed::neg`]).
1445    #[inline]
1446    pub fn neg(&self) -> Self {
1447        TwoSeedLane {
1448            base: self.base.neg(),
1449            eps: self.eps.neg(),
1450            del: self.del.neg(),
1451            eps_del: self.eps_del.neg(),
1452        }
1453    }
1454
1455    /// Multiply every part by the plain scalar `s` (mirrors [`TwoSeed::scale`]).
1456    #[inline]
1457    pub fn scale(&self, s: f64) -> Self {
1458        TwoSeedLane {
1459            base: self.base.scale(s),
1460            eps: self.eps.scale(s),
1461            del: self.del.scale(s),
1462            eps_del: self.eps_del.scale(s),
1463        }
1464    }
1465
1466    /// Exact composition `f ∘ self`, given the per-lane outer-derivative stack
1467    /// `d = [f, f′, f″, f‴, f⁗]`. Term-for-term identical to
1468    /// [`TwoSeed::compose_unary`]: base reads `d[0..=2]`, `f′(base)` reads
1469    /// `d[1..=3]`, `f″(base)` reads `d[2..=4]`, and the cross part carries
1470    /// `f″·eps·del + f′·eps_del`.
1471    #[inline]
1472    pub fn compose_unary(&self, d: [L; 5]) -> Self {
1473        let base = self.base.compose_unary([d[0], d[1], d[2]]);
1474        let fprime = self.base.compose_unary([d[1], d[2], d[3]]);
1475        let fsecond = self.base.compose_unary([d[2], d[3], d[4]]);
1476        let eps = fprime.mul(&self.eps);
1477        let del = fprime.mul(&self.del);
1478        let eps_del = fsecond
1479            .mul(&self.eps)
1480            .mul(&self.del)
1481            .add(&fprime.mul(&self.eps_del));
1482        TwoSeedLane {
1483            base,
1484            eps,
1485            del,
1486            eps_del,
1487        }
1488    }
1489
1490    /// `e^self`, per-lane stack `[e; 5]` (matches [`JetScalar::exp`]).
1491    #[inline]
1492    pub fn exp(&self) -> Self {
1493        let d = self.base.v.unary5(|u| {
1494            let e = u.exp();
1495            [e, e, e, e, e]
1496        });
1497        self.compose_unary(d)
1498    }
1499
1500    /// `ln(self)`; caller guarantees positivity (matches [`JetScalar::ln`]).
1501    #[inline]
1502    pub fn ln(&self) -> Self {
1503        let d = self.base.v.unary5(|u| {
1504            let r = 1.0 / u;
1505            [u.ln(), r, -r * r, 2.0 * r * r * r, -6.0 * r * r * r * r]
1506        });
1507        self.compose_unary(d)
1508    }
1509
1510    /// `√self`; caller guarantees positivity (matches [`JetScalar::sqrt`]).
1511    #[inline]
1512    pub fn sqrt(&self) -> Self {
1513        let d = self.base.v.unary5(|u| {
1514            let s = u.sqrt();
1515            [
1516                s,
1517                0.5 / s,
1518                -0.25 / (u * s),
1519                0.375 / (u * u * s),
1520                -0.9375 / (u * u * u * s),
1521            ]
1522        });
1523        self.compose_unary(d)
1524    }
1525
1526    /// `1/self` (matches [`JetScalar::recip`]).
1527    #[inline]
1528    pub fn recip(&self) -> Self {
1529        let d = self.base.v.unary5(|u| {
1530            let r = 1.0 / u;
1531            let r2 = r * r;
1532            [r, -r2, 2.0 * r2 * r, -6.0 * r2 * r2, 24.0 * r2 * r2 * r]
1533        });
1534        self.compose_unary(d)
1535    }
1536
1537    /// `self^a` for real `a`; caller guarantees a positive base (matches
1538    /// [`JetScalar::powf`]).
1539    #[inline]
1540    pub fn powf(&self, a: f64) -> Self {
1541        let d = self.base.v.unary5(|u| {
1542            [
1543                u.powf(a),
1544                a * u.powf(a - 1.0),
1545                a * (a - 1.0) * u.powf(a - 2.0),
1546                a * (a - 1.0) * (a - 2.0) * u.powf(a - 3.0),
1547                a * (a - 1.0) * (a - 2.0) * (a - 3.0) * u.powf(a - 4.0),
1548            ]
1549        });
1550        self.compose_unary(d)
1551    }
1552
1553    /// `ln Γ(self)`; caller guarantees positivity (matches [`JetScalar::ln_gamma`]).
1554    #[inline]
1555    pub fn ln_gamma(&self) -> Self {
1556        let d = self
1557            .base
1558            .v
1559            .unary5(crate::jet_tower::ln_gamma_derivative_stack);
1560        self.compose_unary(d)
1561    }
1562
1563    /// `ψ(self)` digamma; caller guarantees positivity (matches
1564    /// [`JetScalar::digamma`]).
1565    #[inline]
1566    pub fn digamma(&self) -> Self {
1567        let d = self
1568            .base
1569            .v
1570            .unary5(crate::jet_tower::digamma_derivative_stack);
1571        self.compose_unary(d)
1572    }
1573}
1574
1575impl<const K: usize> TwoSeedBatch<K> {
1576    /// Extract lane `i`'s parts as a production [`TwoSeed<K>`]. Lane `i` is
1577    /// `to_bits`-identical to evaluating the same program at [`TwoSeed<K>`] on
1578    /// row `i` (see `batch_tests`).
1579    #[inline]
1580    #[must_use]
1581    pub fn lane(&self, i: usize) -> TwoSeed<K> {
1582        TwoSeed {
1583            base: self.base.lane(i),
1584            eps: self.eps.lane(i),
1585            del: self.del.lane(i),
1586            eps_del: self.eps_del.lane(i),
1587        }
1588    }
1589}
1590
1591// ── Tower3<K>: value / gradient / Hessian / third tensor ────────────────
1592
1593/// The order-≤3 [`crate::jet_tower::Tower3`] is also a [`JetScalar`]. It serves
1594/// consumers that read `.t3` but never `.t4`, avoiding the fourth-tensor
1595/// product/composition work while preserving the lower channels
1596/// bit-for-bit against [`crate::jet_tower::Tower4`].
1597impl<const K: usize> JetScalar<K> for crate::jet_tower::Tower3<K> {
1598    fn constant(c: f64) -> Self {
1599        crate::jet_tower::Tower3::constant(c)
1600    }
1601    fn variable(x: f64, axis: usize) -> Self {
1602        crate::jet_tower::Tower3::variable(x, axis)
1603    }
1604    fn value(&self) -> f64 {
1605        self.v
1606    }
1607    fn add(&self, o: &Self) -> Self {
1608        *self + *o
1609    }
1610    fn sub(&self, o: &Self) -> Self {
1611        *self + o.scale(-1.0)
1612    }
1613    fn mul(&self, o: &Self) -> Self {
1614        crate::jet_tower::Tower3::mul(self, o)
1615    }
1616    fn neg(&self) -> Self {
1617        self.scale(-1.0)
1618    }
1619    fn scale(&self, s: f64) -> Self {
1620        crate::jet_tower::Tower3::scale(self, s)
1621    }
1622    fn compose_unary(&self, d: [f64; 5]) -> Self {
1623        crate::jet_tower::Tower3::compose_unary(self, [d[0], d[1], d[2], d[3]])
1624    }
1625}
1626
1627// ── Tower4<K>: full dense tower as a JetScalar (the all-channels scalar) ─
1628
1629/// The full dense [`crate::jet_tower::Tower4`] is itself a [`JetScalar`]: it
1630/// carries EVERY channel, so a row expression written ONCE against [`JetScalar`]
1631/// can be evaluated at `Tower4` to obtain the full `(v, g, H, t3, t4)` in one
1632/// pass. This is BOTH the #932 oracle ground truth the packed [`Order2`] /
1633/// [`OneSeed`] / [`TwoSeed`] scalars are pinned against, AND a production scalar:
1634/// a family whose uncontracted third / fourth derivative tensors are needed
1635/// (the BMS rigid `third_full` / `fourth_full` caches) evaluates the SAME
1636/// generic row-NLL expression at `Tower4` and reads `.t3` / `.t4` off the
1637/// result — so the dense tensors come from the single source of truth, not a
1638/// separately hand-written jet. The packed scalars serve the consumers that
1639/// need only `(v, g, H)` (`Order2`) or one / two contractions
1640/// (`OneSeed` / `TwoSeed`) without paying for the dense tensors.
1641impl<const K: usize> JetScalar<K> for crate::jet_tower::Tower4<K> {
1642    fn constant(c: f64) -> Self {
1643        crate::jet_tower::Tower4::constant(c)
1644    }
1645    fn variable(x: f64, axis: usize) -> Self {
1646        crate::jet_tower::Tower4::variable(x, axis)
1647    }
1648    fn value(&self) -> f64 {
1649        self.v
1650    }
1651    fn add(&self, o: &Self) -> Self {
1652        *self + *o
1653    }
1654    fn sub(&self, o: &Self) -> Self {
1655        *self - *o
1656    }
1657    fn mul(&self, o: &Self) -> Self {
1658        crate::jet_tower::Tower4::mul(self, o)
1659    }
1660    fn neg(&self) -> Self {
1661        self.scale(-1.0)
1662    }
1663    fn scale(&self, s: f64) -> Self {
1664        crate::jet_tower::Tower4::scale(self, s)
1665    }
1666    fn compose_unary(&self, d: [f64; 5]) -> Self {
1667        crate::jet_tower::Tower4::compose_unary(self, d)
1668    }
1669}
1670
1671#[cfg(test)]
1672mod tests {
1673    use super::*;
1674    use crate::jet_tower::{RowNllProgram, Tower4, evaluate_program};
1675
1676    /// A small polynomial-plus-unary row expression written ONCE, generically
1677    /// over `S: JetScalar<2>`, so it can be evaluated against every scalar:
1678    /// `ℓ = (e^{p0·p1} + 2) · √(p0·p0 + 1) − p1·p1·0.5`.
1679    /// Exercises mul, add/sub, scale, exp, sqrt — every algebra op.
1680    fn row_expr<S: JetScalar<2>>(p: &[S; 2]) -> S {
1681        let g = p[0].mul(&p[1]).exp();
1682        let inner = g.add(&S::constant(2.0));
1683        let radic = p[0].mul(&p[0]).add(&S::constant(1.0)).sqrt();
1684        inner.mul(&radic).sub(&p[1].mul(&p[1]).scale(0.5))
1685    }
1686
1687    /// The same expression as a Tower4 `RowNllProgram`, the ground-truth tower.
1688    struct ExprProgram {
1689        p: [f64; 2],
1690    }
1691    impl RowNllProgram<2> for ExprProgram {
1692        fn n_rows(&self) -> usize {
1693            1
1694        }
1695        fn primaries(&self, row: usize) -> Result<[f64; 2], String> {
1696            if row >= self.n_rows() {
1697                return Err(format!("ExprProgram: row {row} out of range"));
1698            }
1699            Ok(self.p)
1700        }
1701        fn row_nll(&self, row: usize, p: &[Tower4<2>; 2]) -> Result<Tower4<2>, String> {
1702            if row >= self.n_rows() {
1703                return Err(format!("ExprProgram: row {row} out of range"));
1704            }
1705            Ok(row_expr(p))
1706        }
1707    }
1708
1709    const SEED: [f64; 2] = [0.37, -0.81];
1710    const U: [f64; 2] = [0.6, -0.2];
1711    const V: [f64; 2] = [-0.4, 1.1];
1712    const TOL: f64 = 1e-10;
1713
1714    fn close(a: f64, b: f64, label: &str) {
1715        let band = TOL + TOL * a.abs().max(b.abs());
1716        assert!(
1717            (a - b).abs() <= band,
1718            "{label}: {a:+.15e} vs {b:+.15e} (band {band:.3e})"
1719        );
1720    }
1721
1722    fn tower() -> Tower4<2> {
1723        evaluate_program(&ExprProgram { p: SEED }, 0).expect("tower")
1724    }
1725
1726    /// Order2 reproduces Tower4's value/grad/Hessian channels exactly.
1727    #[test]
1728    fn order2_matches_tower_value_grad_hessian() {
1729        let t = tower();
1730        let vars: [Order2<2>; 2] = std::array::from_fn(|a| Order2::variable(SEED[a], a));
1731        let s = row_expr(&vars);
1732        close(s.value(), t.v, "value");
1733        for a in 0..2 {
1734            close(s.0.g[a], t.g[a], &format!("grad[{a}]"));
1735            for b in 0..2 {
1736                close(s.h()[a][b], t.h[a][b], &format!("hess[{a}][{b}]"));
1737            }
1738        }
1739    }
1740
1741    /// The `compose_unary_with` seam on a scalar jet is `to_bits`-identical to
1742    /// the explicit `compose_unary(stack_fn(value))` — the contract the batch
1743    /// arm (`Tower{3,4}Lane::compose_unary_with`) lane-matches. Exercised on
1744    /// [`Order2`] across `K ∈ {2,3,4,9}`, ≥ 4000 random seeded inputs.
1745    #[test]
1746    fn compose_unary_with_scalar_seam_bit_identical() {
1747        fn rand_unit(state: &mut u64) -> f64 {
1748            let mut x = *state;
1749            x ^= x << 13;
1750            x ^= x >> 7;
1751            x ^= x << 17;
1752            *state = x;
1753            2.0 * ((x >> 11) as f64 / ((1u64 << 53) as f64)) - 1.0
1754        }
1755        // A base-value-dependent finite stack standing in for a family stack.
1756        fn stack(u: f64) -> [f64; 5] {
1757            [u.sin(), u.cos(), (2.0 * u).sin(), (0.5 * u).cos(), u * u - 0.3]
1758        }
1759        fn run<const K: usize>(state: &mut u64, n: usize) -> usize {
1760            for _ in 0..n {
1761                // A non-trivial Order2<K> jet: a seeded variable pushed through a
1762                // couple of algebra ops so g/h are dense, then exercise the seam.
1763                let base = rand_unit(state);
1764                let mut s = Order2::<K>::variable(base, 0);
1765                for a in 1..K {
1766                    s = JetScalar::mul(&s, &Order2::<K>::variable(rand_unit(state), a));
1767                }
1768                let with = s.compose_unary_with(stack);
1769                let explicit = s.compose_unary(stack(s.value()));
1770                assert_eq!(with.value().to_bits(), explicit.value().to_bits(), "value");
1771                for a in 0..K {
1772                    assert_eq!(with.g()[a].to_bits(), explicit.g()[a].to_bits(), "g[{a}]");
1773                    for b in 0..K {
1774                        assert_eq!(
1775                            with.h()[a][b].to_bits(),
1776                            explicit.h()[a][b].to_bits(),
1777                            "h[{a}][{b}]"
1778                        );
1779                    }
1780                }
1781            }
1782            n
1783        }
1784        let mut st = 0x9e37_79b9_7f4a_7c15u64;
1785        let total =
1786            run::<2>(&mut st, 1100) + run::<3>(&mut st, 1100) + run::<4>(&mut st, 1100) + run::<9>(&mut st, 1100);
1787        assert_eq!(total, 4400);
1788    }
1789
1790    /// OneSeed's ε-Hessian is the contracted third Σ_c ℓ_{abc} u_c, matching
1791    /// `Tower4::third_contracted(u)`. Base channels also match the tower.
1792    #[test]
1793    fn one_seed_matches_tower_third_contracted() {
1794        let t = tower();
1795        let truth = t.third_contracted(&U);
1796        let vars: [OneSeed<2>; 2] =
1797            std::array::from_fn(|a| OneSeed::seed_direction(SEED[a], a, U[a]));
1798        let s = row_expr(&vars);
1799        // Base channels are the plain (v, g, H).
1800        close(s.value(), t.v, "value");
1801        for a in 0..2 {
1802            for b in 0..2 {
1803                close(s.base.h()[a][b], t.h[a][b], &format!("base hess[{a}][{b}]"));
1804            }
1805        }
1806        let third = s.contracted_third();
1807        for a in 0..2 {
1808            for b in 0..2 {
1809                close(third[a][b], truth[a][b], &format!("third[{a}][{b}]"));
1810            }
1811        }
1812    }
1813
1814    /// TwoSeed's εδ-Hessian is the contracted fourth Σ_{cd} ℓ_{abcd} u_c v_d,
1815    /// matching `Tower4::fourth_contracted(u, v)`. The ε / δ single-seed parts
1816    /// reproduce the two third contractions Σ_c ℓ_{abc} u_c and …v_d.
1817    #[test]
1818    fn two_seed_matches_tower_fourth_contracted() {
1819        let t = tower();
1820        let truth4 = t.fourth_contracted(&U, &V);
1821        let truth3_u = t.third_contracted(&U);
1822        let truth3_v = t.third_contracted(&V);
1823        let vars: [TwoSeed<2>; 2] = std::array::from_fn(|a| TwoSeed::seed(SEED[a], a, U[a], V[a]));
1824        let s = row_expr(&vars);
1825        close(s.value(), t.v, "value");
1826        for a in 0..2 {
1827            close(s.base.0.g[a], t.g[a], &format!("grad[{a}]"));
1828            for b in 0..2 {
1829                close(s.base.h()[a][b], t.h[a][b], &format!("base hess[{a}][{b}]"));
1830                close(
1831                    s.eps.h()[a][b],
1832                    truth3_u[a][b],
1833                    &format!("eps third_u[{a}][{b}]"),
1834                );
1835                close(
1836                    s.del.h()[a][b],
1837                    truth3_v[a][b],
1838                    &format!("del third_v[{a}][{b}]"),
1839                );
1840            }
1841        }
1842        let fourth = s.contracted_fourth();
1843        for a in 0..2 {
1844            for b in 0..2 {
1845                close(fourth[a][b], truth4[a][b], &format!("fourth[{a}][{b}]"));
1846            }
1847        }
1848    }
1849
1850    /// The generic `row_nll_generic` seam (added to Tower4's program trait
1851    /// surface) evaluates the SAME expression on each scalar and extracts the
1852    /// channel a consumer asks for, agreeing with the direct Tower4 contraction.
1853    #[test]
1854    fn generic_program_seam_matches_tower_for_every_channel() {
1855        let t = tower();
1856        // Order2 via generic seam.
1857        let o2: [Order2<2>; 2] = std::array::from_fn(|a| Order2::variable(SEED[a], a));
1858        let so2 = row_expr(&o2);
1859        close(so2.value(), t.v, "seam order2 value");
1860        // OneSeed third.
1861        let os: [OneSeed<2>; 2] =
1862            std::array::from_fn(|a| OneSeed::seed_direction(SEED[a], a, U[a]));
1863        let third = row_expr(&os).contracted_third();
1864        let truth3 = t.third_contracted(&U);
1865        for a in 0..2 {
1866            for b in 0..2 {
1867                close(third[a][b], truth3[a][b], &format!("seam third[{a}][{b}]"));
1868            }
1869        }
1870        // TwoSeed fourth.
1871        let ts: [TwoSeed<2>; 2] = std::array::from_fn(|a| TwoSeed::seed(SEED[a], a, U[a], V[a]));
1872        let fourth = row_expr(&ts).contracted_fourth();
1873        let truth4 = t.fourth_contracted(&U, &V);
1874        for a in 0..2 {
1875            for b in 0..2 {
1876                close(
1877                    fourth[a][b],
1878                    truth4[a][b],
1879                    &format!("seam fourth[{a}][{b}]"),
1880                );
1881            }
1882        }
1883    }
1884
1885    /// The (test-only) `Tower4: JetScalar` impl is the all-channels oracle scalar:
1886    /// evaluating the SAME generic `row_expr` at `S = Tower4` (through the
1887    /// `JetScalar` trait ops) must reproduce, channel-for-channel, the `Tower4`
1888    /// obtained from the `RowNllProgram` / inherent-operator path
1889    /// (`evaluate_program`). This pins that the trait impl delegates faithfully to
1890    /// the inherent `Tower4` arithmetic (so the contracted-scalar oracles above,
1891    /// which compare against `evaluate_program`'s tower, are comparing against the
1892    /// same algebra the `JetScalar` interface exposes).
1893    #[test]
1894    fn tower4_as_jetscalar_matches_program_tower_all_channels() {
1895        let t = tower();
1896        let vars: [Tower4<2>; 2] = std::array::from_fn(|a| Tower4::variable(SEED[a], a));
1897        let s = row_expr(&vars);
1898        close(s.v, t.v, "tower-jetscalar value");
1899        for a in 0..2 {
1900            close(s.g[a], t.g[a], &format!("tower-jetscalar grad[{a}]"));
1901            for b in 0..2 {
1902                close(
1903                    s.h[a][b],
1904                    t.h[a][b],
1905                    &format!("tower-jetscalar hess[{a}][{b}]"),
1906                );
1907                for c in 0..2 {
1908                    close(
1909                        s.t3[a][b][c],
1910                        t.t3[a][b][c],
1911                        &format!("tower-jetscalar t3[{a}][{b}][{c}]"),
1912                    );
1913                    for d in 0..2 {
1914                        close(
1915                            s.t4[a][b][c][d],
1916                            t.t4[a][b][c][d],
1917                            &format!("tower-jetscalar t4[{a}][{b}][{c}][{d}]"),
1918                        );
1919                    }
1920                }
1921            }
1922        }
1923    }
1924}
1925
1926#[cfg(test)]
1927mod batch_tests {
1928    //! SIMD row-batching oracle: prove [`Order2Batch<K>`] (4 rows in
1929    //! `wide::f64x4` lanes) is `to_bits`-identical, on every value/gradient/
1930    //! Hessian channel, to the production [`Order2<K>`] evaluated per row — and
1931    //! that the new scalar field [`Order2Lane<f64, K>`] is too. Composing the two
1932    //! claims, batch lane `i` reproduces the production scalar for row `i` bit
1933    //! for bit, so the 4× throughput is a free lunch (no result change).
1934
1935    use super::{
1936        JetScalar, Lane, OneSeed, OneSeedBatch, OneSeedLane, Order2, Order2Batch, Order2Lane,
1937        TwoSeed, TwoSeedBatch, TwoSeedLane,
1938    };
1939
1940    /// The ops the witness row expression needs, so ONE generic body evaluates
1941    /// at the production [`Order2<K>`], the new scalar [`Order2Lane<f64, K>`],
1942    /// and the batched [`Order2Batch<K>`].
1943    trait RowAlg<const K: usize>: Copy {
1944        fn constant(c: f64) -> Self;
1945        fn add(&self, o: &Self) -> Self;
1946        fn sub(&self, o: &Self) -> Self;
1947        fn mul(&self, o: &Self) -> Self;
1948        fn scale(&self, s: f64) -> Self;
1949        fn exp(&self) -> Self;
1950        fn sqrt(&self) -> Self;
1951        fn recip(&self) -> Self;
1952    }
1953
1954    impl<const K: usize> RowAlg<K> for Order2<K> {
1955        fn constant(c: f64) -> Self {
1956            <Self as JetScalar<K>>::constant(c)
1957        }
1958        fn add(&self, o: &Self) -> Self {
1959            JetScalar::add(self, o)
1960        }
1961        fn sub(&self, o: &Self) -> Self {
1962            JetScalar::sub(self, o)
1963        }
1964        fn mul(&self, o: &Self) -> Self {
1965            JetScalar::mul(self, o)
1966        }
1967        fn scale(&self, s: f64) -> Self {
1968            JetScalar::scale(self, s)
1969        }
1970        fn exp(&self) -> Self {
1971            JetScalar::exp(self)
1972        }
1973        fn sqrt(&self) -> Self {
1974            JetScalar::sqrt(self)
1975        }
1976        fn recip(&self) -> Self {
1977            JetScalar::recip(self)
1978        }
1979    }
1980
1981    impl<L: Lane, const K: usize> RowAlg<K> for Order2Lane<L, K> {
1982        fn constant(c: f64) -> Self {
1983            Order2Lane::constant(L::splat(c))
1984        }
1985        fn add(&self, o: &Self) -> Self {
1986            Order2Lane::add(self, o)
1987        }
1988        fn sub(&self, o: &Self) -> Self {
1989            Order2Lane::sub(self, o)
1990        }
1991        fn mul(&self, o: &Self) -> Self {
1992            Order2Lane::mul(self, o)
1993        }
1994        fn scale(&self, s: f64) -> Self {
1995            Order2Lane::scale(self, s)
1996        }
1997        fn exp(&self) -> Self {
1998            Order2Lane::exp(self)
1999        }
2000        fn sqrt(&self) -> Self {
2001            Order2Lane::sqrt(self)
2002        }
2003        fn recip(&self) -> Self {
2004            Order2Lane::recip(self)
2005        }
2006    }
2007
2008    /// A dense witness row expression touching every algebra op (mul, add, sub,
2009    /// scale, exp, sqrt, recip) over ALL `K` primaries, so the gradient and the
2010    /// full `K×K` Hessian are dense (no trivially-zero channel). All transcend.
2011    /// arguments are kept finite/positive: `sqrt(s²+1) > 0`, `recip(exp+2) > 0`.
2012    fn row_expr<const K: usize, A: RowAlg<K>>(p: &[A; K]) -> A {
2013        let mut s = A::constant(0.3);
2014        for a in 0..K {
2015            let b = (a + 1) % K;
2016            s = s.add(&p[a].mul(&p[b]).scale(0.1 + 0.05 * a as f64));
2017        }
2018        let e = s.exp();
2019        let r = s.mul(&s).add(&A::constant(1.0)).sqrt();
2020        let denom = e.add(&A::constant(2.0));
2021        e.mul(&r).sub(&s.scale(0.5)).mul(&denom.recip())
2022    }
2023
2024    /// xorshift64 → `f64` in `[-1, 1)`.
2025    fn rand_unit(state: &mut u64) -> f64 {
2026        let mut x = *state;
2027        x ^= x << 13;
2028        x ^= x >> 7;
2029        x ^= x << 17;
2030        *state = x;
2031        let u = (x >> 11) as f64 / ((1u64 << 53) as f64); // [0, 1)
2032        2.0 * u - 1.0
2033    }
2034
2035    /// Returns the number of (batch, row) pairs whose every channel was
2036    /// verified bit-identical, so the caller can assert the expected total ran.
2037    fn check_k<const K: usize>(state: &mut u64, batches: usize) -> usize {
2038        let mut verified_rows = 0usize;
2039        for _ in 0..batches {
2040            // Four independent rows of K primary values.
2041            let rows: [[f64; K]; 4] =
2042                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2043
2044            // Production ground truth, evaluated per row at Order2<K>.
2045            let prod: [Order2<K>; 4] = std::array::from_fn(|r| {
2046                let p: [Order2<K>; K] = std::array::from_fn(|a| Order2::variable(rows[r][a], a));
2047                row_expr(&p)
2048            });
2049
2050            // New scalar field (Order2Lane<f64>), per row.
2051            let scal: [Order2Lane<f64, K>; 4] = std::array::from_fn(|r| {
2052                let p: [Order2Lane<f64, K>; K] =
2053                    std::array::from_fn(|a| Order2Lane::variable(rows[r][a], a));
2054                row_expr(&p)
2055            });
2056
2057            // Batched: 4 rows packed into f64x4 lanes, ONE vector pass.
2058            let pbatch: [Order2Batch<K>; K] = std::array::from_fn(|a| {
2059                let packed =
2060                    wide::f64x4::new([rows[0][a], rows[1][a], rows[2][a], rows[3][a]]);
2061                Order2Batch::variable(packed, a)
2062            });
2063            let batch = row_expr(&pbatch);
2064
2065            for r in 0..4 {
2066                let g = prod[r].0;
2067                // Order2Lane<f64> == Order2<K> (bit-identical scalar field).
2068                assert_eq!(scal[r].v.to_bits(), g.v.to_bits(), "K={K} scalar v");
2069                // Batch lane r == Order2<K> for row r.
2070                let lr = batch.lane(r).0;
2071                assert_eq!(lr.v.to_bits(), g.v.to_bits(), "K={K} batch lane {r} v");
2072                for a in 0..K {
2073                    assert_eq!(
2074                        scal[r].g[a].to_bits(),
2075                        g.g[a].to_bits(),
2076                        "K={K} scalar g[{a}]"
2077                    );
2078                    assert_eq!(
2079                        lr.g[a].to_bits(),
2080                        g.g[a].to_bits(),
2081                        "K={K} batch lane {r} g[{a}]"
2082                    );
2083                    for b in 0..K {
2084                        assert_eq!(
2085                            scal[r].h[a][b].to_bits(),
2086                            g.h[a][b].to_bits(),
2087                            "K={K} scalar h[{a}][{b}]"
2088                        );
2089                        assert_eq!(
2090                            lr.h[a][b].to_bits(),
2091                            g.h[a][b].to_bits(),
2092                            "K={K} batch lane {r} h[{a}][{b}]"
2093                        );
2094                    }
2095                }
2096                verified_rows += 1;
2097            }
2098        }
2099        verified_rows
2100    }
2101
2102    /// ≥2000 random 4-row batches per K, across K ∈ {2,3,4,9}: every channel of
2103    /// every lane is `to_bits`-identical to the production scalar per row.
2104    #[test]
2105    fn batch_lanes_bit_identical_to_scalar_per_row() {
2106        let mut state = 0x9E37_79B9_7F4A_7C15_u64;
2107        let mut verified = 0usize;
2108        verified += check_k::<2>(&mut state, 2000);
2109        verified += check_k::<3>(&mut state, 2000);
2110        verified += check_k::<4>(&mut state, 2000);
2111        verified += check_k::<9>(&mut state, 2000);
2112        // 4 K-values × 2000 batches × 4 packed rows each, all bit-identical.
2113        assert_eq!(verified, 4 * 2000 * 4, "every batch row must be verified");
2114    }
2115
2116    // ── One-/two-seed lane oracles ──────────────────────────────────────────
2117    //
2118    // The same dense `row_expr` witness program runs over the SEEDED directional
2119    // scalars: the scalar `OneSeed`/`TwoSeed` per row, the `f64`-lane re-type
2120    // (`*SeedLane<f64>`), and the 4-rows-per-pass batch (`*SeedBatch`). The
2121    // headline claim is that the contracted-third / contracted-fourth channel of
2122    // every lane is `to_bits`-identical to the production scalar's per row.
2123
2124    impl<const K: usize> RowAlg<K> for OneSeed<K> {
2125        fn constant(c: f64) -> Self {
2126            <Self as JetScalar<K>>::constant(c)
2127        }
2128        fn add(&self, o: &Self) -> Self {
2129            JetScalar::add(self, o)
2130        }
2131        fn sub(&self, o: &Self) -> Self {
2132            JetScalar::sub(self, o)
2133        }
2134        fn mul(&self, o: &Self) -> Self {
2135            JetScalar::mul(self, o)
2136        }
2137        fn scale(&self, s: f64) -> Self {
2138            JetScalar::scale(self, s)
2139        }
2140        fn exp(&self) -> Self {
2141            JetScalar::exp(self)
2142        }
2143        fn sqrt(&self) -> Self {
2144            JetScalar::sqrt(self)
2145        }
2146        fn recip(&self) -> Self {
2147            JetScalar::recip(self)
2148        }
2149    }
2150
2151    impl<L: Lane, const K: usize> RowAlg<K> for OneSeedLane<L, K> {
2152        fn constant(c: f64) -> Self {
2153            OneSeedLane::constant(L::splat(c))
2154        }
2155        fn add(&self, o: &Self) -> Self {
2156            OneSeedLane::add(self, o)
2157        }
2158        fn sub(&self, o: &Self) -> Self {
2159            OneSeedLane::sub(self, o)
2160        }
2161        fn mul(&self, o: &Self) -> Self {
2162            OneSeedLane::mul(self, o)
2163        }
2164        fn scale(&self, s: f64) -> Self {
2165            OneSeedLane::scale(self, s)
2166        }
2167        fn exp(&self) -> Self {
2168            OneSeedLane::exp(self)
2169        }
2170        fn sqrt(&self) -> Self {
2171            OneSeedLane::sqrt(self)
2172        }
2173        fn recip(&self) -> Self {
2174            OneSeedLane::recip(self)
2175        }
2176    }
2177
2178    impl<const K: usize> RowAlg<K> for TwoSeed<K> {
2179        fn constant(c: f64) -> Self {
2180            <Self as JetScalar<K>>::constant(c)
2181        }
2182        fn add(&self, o: &Self) -> Self {
2183            JetScalar::add(self, o)
2184        }
2185        fn sub(&self, o: &Self) -> Self {
2186            JetScalar::sub(self, o)
2187        }
2188        fn mul(&self, o: &Self) -> Self {
2189            JetScalar::mul(self, o)
2190        }
2191        fn scale(&self, s: f64) -> Self {
2192            JetScalar::scale(self, s)
2193        }
2194        fn exp(&self) -> Self {
2195            JetScalar::exp(self)
2196        }
2197        fn sqrt(&self) -> Self {
2198            JetScalar::sqrt(self)
2199        }
2200        fn recip(&self) -> Self {
2201            JetScalar::recip(self)
2202        }
2203    }
2204
2205    impl<L: Lane, const K: usize> RowAlg<K> for TwoSeedLane<L, K> {
2206        fn constant(c: f64) -> Self {
2207            TwoSeedLane::constant(L::splat(c))
2208        }
2209        fn add(&self, o: &Self) -> Self {
2210            TwoSeedLane::add(self, o)
2211        }
2212        fn sub(&self, o: &Self) -> Self {
2213            TwoSeedLane::sub(self, o)
2214        }
2215        fn mul(&self, o: &Self) -> Self {
2216            TwoSeedLane::mul(self, o)
2217        }
2218        fn scale(&self, s: f64) -> Self {
2219            TwoSeedLane::scale(self, s)
2220        }
2221        fn exp(&self) -> Self {
2222            TwoSeedLane::exp(self)
2223        }
2224        fn sqrt(&self) -> Self {
2225            TwoSeedLane::sqrt(self)
2226        }
2227        fn recip(&self) -> Self {
2228            TwoSeedLane::recip(self)
2229        }
2230    }
2231
2232    fn check_oneseed<const K: usize>(state: &mut u64, batches: usize) -> usize {
2233        let mut rows_checked = 0;
2234        for _ in 0..batches {
2235            let rows: [[f64; K]; 4] =
2236                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2237            // Per-row ε-direction.
2238            let u: [[f64; K]; 4] =
2239                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2240
2241            // Production ground truth (scalar OneSeed per row).
2242            let prod: [OneSeed<K>; 4] = std::array::from_fn(|r| {
2243                let p: [OneSeed<K>; K] =
2244                    std::array::from_fn(|a| OneSeed::seed_direction(rows[r][a], a, u[r][a]));
2245                row_expr(&p)
2246            });
2247
2248            // f64-lane re-type per row.
2249            let scal: [OneSeedLane<f64, K>; 4] = std::array::from_fn(|r| {
2250                let p: [OneSeedLane<f64, K>; K] =
2251                    std::array::from_fn(|a| OneSeedLane::seed_direction(rows[r][a], a, u[r][a]));
2252                row_expr(&p)
2253            });
2254
2255            // 4-rows-per-pass batch.
2256            let pbatch: [OneSeedBatch<K>; K] = std::array::from_fn(|a| {
2257                let val = wide::f64x4::new([rows[0][a], rows[1][a], rows[2][a], rows[3][a]]);
2258                let uu = wide::f64x4::new([u[0][a], u[1][a], u[2][a], u[3][a]]);
2259                OneSeedBatch::seed_direction(val, a, uu)
2260            });
2261            let batch = row_expr(&pbatch);
2262
2263            for r in 0..4 {
2264                let want = prod[r].contracted_third();
2265                let got_scal = scal[r].contracted_third();
2266                let got_batch = batch.lane(r).contracted_third();
2267                // Value channel too (sanity that the base program agrees).
2268                assert_eq!(
2269                    scal[r].base.v.to_bits(),
2270                    prod[r].base.value().to_bits(),
2271                    "OneSeed K={K} scalar value"
2272                );
2273                assert_eq!(
2274                    batch.lane(r).base.value().to_bits(),
2275                    prod[r].base.value().to_bits(),
2276                    "OneSeed K={K} batch lane {r} value"
2277                );
2278                for a in 0..K {
2279                    for b in 0..K {
2280                        assert_eq!(
2281                            got_scal[a][b].to_bits(),
2282                            want[a][b].to_bits(),
2283                            "OneSeed K={K} scalar third[{a}][{b}]"
2284                        );
2285                        assert_eq!(
2286                            got_batch[a][b].to_bits(),
2287                            want[a][b].to_bits(),
2288                            "OneSeed K={K} batch lane {r} third[{a}][{b}]"
2289                        );
2290                    }
2291                }
2292                rows_checked += 1;
2293            }
2294        }
2295        rows_checked
2296    }
2297
2298    fn check_twoseed<const K: usize>(state: &mut u64, batches: usize) -> usize {
2299        let mut rows_checked = 0;
2300        for _ in 0..batches {
2301            let rows: [[f64; K]; 4] =
2302                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2303            let u: [[f64; K]; 4] =
2304                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2305            let v: [[f64; K]; 4] =
2306                std::array::from_fn(|_| std::array::from_fn(|_| rand_unit(state)));
2307
2308            let prod: [TwoSeed<K>; 4] = std::array::from_fn(|r| {
2309                let p: [TwoSeed<K>; K] =
2310                    std::array::from_fn(|a| TwoSeed::seed(rows[r][a], a, u[r][a], v[r][a]));
2311                row_expr(&p)
2312            });
2313
2314            let scal: [TwoSeedLane<f64, K>; 4] = std::array::from_fn(|r| {
2315                let p: [TwoSeedLane<f64, K>; K] =
2316                    std::array::from_fn(|a| TwoSeedLane::seed(rows[r][a], a, u[r][a], v[r][a]));
2317                row_expr(&p)
2318            });
2319
2320            let pbatch: [TwoSeedBatch<K>; K] = std::array::from_fn(|a| {
2321                let val = wide::f64x4::new([rows[0][a], rows[1][a], rows[2][a], rows[3][a]]);
2322                let uu = wide::f64x4::new([u[0][a], u[1][a], u[2][a], u[3][a]]);
2323                let vv = wide::f64x4::new([v[0][a], v[1][a], v[2][a], v[3][a]]);
2324                TwoSeedBatch::seed(val, a, uu, vv)
2325            });
2326            let batch = row_expr(&pbatch);
2327
2328            for r in 0..4 {
2329                let want = prod[r].contracted_fourth();
2330                let got_scal = scal[r].contracted_fourth();
2331                let got_batch = batch.lane(r).contracted_fourth();
2332                assert_eq!(
2333                    scal[r].base.v.to_bits(),
2334                    prod[r].base.value().to_bits(),
2335                    "TwoSeed K={K} scalar value"
2336                );
2337                assert_eq!(
2338                    batch.lane(r).base.value().to_bits(),
2339                    prod[r].base.value().to_bits(),
2340                    "TwoSeed K={K} batch lane {r} value"
2341                );
2342                for a in 0..K {
2343                    for b in 0..K {
2344                        assert_eq!(
2345                            got_scal[a][b].to_bits(),
2346                            want[a][b].to_bits(),
2347                            "TwoSeed K={K} scalar fourth[{a}][{b}]"
2348                        );
2349                        assert_eq!(
2350                            got_batch[a][b].to_bits(),
2351                            want[a][b].to_bits(),
2352                            "TwoSeed K={K} batch lane {r} fourth[{a}][{b}]"
2353                        );
2354                    }
2355                }
2356                rows_checked += 1;
2357            }
2358        }
2359        rows_checked
2360    }
2361
2362    /// ≥2000 random 4-row batches per K, across K ∈ {2,3,4,9}: the
2363    /// contracted-third channel of every `OneSeedLane` lane is `to_bits`-identical
2364    /// to the production [`OneSeed`] per row.
2365    #[test]
2366    fn oneseed_lanes_contracted_third_bit_identical() {
2367        let mut state = 0x1234_5678_9ABC_DEF0_u64;
2368        let batches = 2000;
2369        let rows_checked = check_oneseed::<2>(&mut state, batches)
2370            + check_oneseed::<3>(&mut state, batches)
2371            + check_oneseed::<4>(&mut state, batches)
2372            + check_oneseed::<9>(&mut state, batches);
2373        // 4 widths × `batches` batches × 4 rows each: a silently empty inner
2374        // loop would leave this at zero instead of passing as a no-op.
2375        assert_eq!(rows_checked, 4 * batches * 4);
2376    }
2377
2378    /// ≥2000 random 4-row batches per K, across K ∈ {2,3,4,9}: the
2379    /// contracted-fourth channel of every `TwoSeedLane` lane is `to_bits`-identical
2380    /// to the production [`TwoSeed`] per row.
2381    #[test]
2382    fn twoseed_lanes_contracted_fourth_bit_identical() {
2383        let mut state = 0x0FED_CBA9_8765_4321_u64;
2384        let batches = 2000;
2385        let rows_checked = check_twoseed::<2>(&mut state, batches)
2386            + check_twoseed::<3>(&mut state, batches)
2387            + check_twoseed::<4>(&mut state, batches)
2388            + check_twoseed::<9>(&mut state, batches);
2389        // 4 widths × `batches` batches × 4 rows each: a silently empty inner
2390        // loop would leave this at zero instead of passing as a no-op.
2391        assert_eq!(rows_checked, 4 * batches * 4);
2392    }
2393}
gam_math/jet_scalar.rs

gam_math/
jet_scalar.rs