gam 0.2.3

Generalized penalized likelihood engine
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
//! Joint `(t, β)` inner Newton driver for [`crate::terms::latent_coord::LatentCoordValues`]
//! blocks.
//!
//! See `proposals/latent_coord.md` (full design, gauge story) and
//! `proposals/composition_engine.md` §7 (audit-revised arrow-Schur
//! complexity story: inner step is `O(N d³ + K³)`; the REML outer
//! gradient w.r.t. `t` carries a shared `Schur⁻¹` factor and is handled
//! at the REML driver level, not here).
//!
//! ## Scope
//!
//! This module wires together:
//!
//! * [`crate::terms::latent_coord::LatentCoordValues`] — the per-row
//!   latent field;
//! * [`crate::solver::arrow_schur::ArrowSchurSystem`] — the bordered
//!   `(t, β)` Newton system with arrow structure;
//! * [`crate::solver::arrow_schur::ArrowFactorCache`] — the per-row
//!   Cholesky factors saved between inner iterations and reused by the
//!   IFT warm-start predictor in
//!   [`crate::solver::persistent_warm_start::ift_warm_start_latent`].
//!
//! The driver is a thin coordinator: it expects the caller to supply a
//! closure that, given the current `(β, t)`, assembles a fresh
//! `ArrowSchurSystem` (per-row `H_tt^(i)`, `H_tβ^(i)`, `g_t^(i)`, and
//! the β-block `H_ββ`, `g_β`). The closure is the natural home for
//! "evaluate Φ(t), form Gauss–Newton blocks from the radial jet, fold
//! in analytic penalties" — all of which depend on the basis and the
//! analytic-penalty registry the outer-fit configuration owns.
//!
//! ## Convergence criterion
//!
//! The inner loop terminates when the relative joint gradient norm
//! drops below `tol`:
//!
//! ```text
//!   ‖[g_t; g_β]‖₂ / (1 + ‖[t; β]‖₂)  <  tol
//! ```
//!
//! or when the LM damping cannot be lowered any further (the latter is
//! interpreted as "we are at the steepest-descent floor; declare
//! convergence and let the outer loop tighten ρ"). Failure to factor
//! the per-row block or the Schur complement at the current ridge
//! escalates `ridge_t` / `ridge_beta` by `lm_grow` and retries.

use ndarray::{Array1, ArrayView1};

use crate::solver::arrow_schur::{
    ArrowFactorCache, ArrowSchurError, ArrowSchurSystem, ArrowSolveOptions, ArrowSolverMode,
    arrow_quadratic_model_reduction, solve_arrow_newton_step_with_options,
};
use crate::terms::latent_coord::LatentCoordValues;

/// Configuration for [`LatentInnerSolver::solve`].
#[derive(Debug, Clone)]
pub struct LatentInnerOptions {
    /// Maximum joint `(t, β)` Newton iterations.
    pub max_iterations: usize,
    /// Relative-gradient convergence tolerance (see module docs).
    pub convergence_tolerance: f64,
    /// Initial LM-style ridge on the per-row latent blocks.
    pub initial_ridge_t: f64,
    /// Initial LM-style ridge on the β block.
    pub initial_ridge_beta: f64,
    /// Multiplicative growth factor for the LM ridges on a rejected step.
    pub lm_grow: f64,
    /// Multiplicative shrink factor for the LM ridges on an accepted step.
    pub lm_shrink: f64,
    /// Maximum ridge value before declaring failure.
    pub max_ridge: f64,
    /// BA Schur mode for the reduced shared system. `None` selects Direct for
    /// `K <= 2000` and InexactPCG above, matching large-scale BA practice.
    pub solver_mode: Option<ArrowSolverMode>,
    /// Reduced-system trust-region radius for Steihaug-CG. This is the
    /// Ceres/BA trust-region bound layered on top of the existing LM ridges.
    pub trust_region_radius: f64,
}

impl Default for LatentInnerOptions {
    fn default() -> Self {
        Self {
            max_iterations: 50,
            convergence_tolerance: 1e-8,
            initial_ridge_t: 0.0,
            initial_ridge_beta: 0.0,
            lm_grow: 10.0,
            lm_shrink: 0.5,
            max_ridge: 1e12,
            solver_mode: None,
            trust_region_radius: 1.0,
        }
    }
}

/// Outcome of a [`LatentInnerSolver::solve`] call.
#[derive(Debug, Clone)]
pub struct LatentInnerOutcome {
    /// Final β coefficient vector.
    pub beta: Array1<f64>,
    /// Per-row Cholesky factor cache from the *last* accepted Newton
    /// step. Consumed by the IFT warm-start predictor
    /// ([`crate::solver::persistent_warm_start::ift_warm_start_latent`]).
    pub factor_cache: Option<ArrowFactorCache>,
    /// Number of iterations executed.
    pub iterations: usize,
    /// Whether the convergence test was satisfied.
    pub converged: bool,
    /// Final ridge values (for warm-starting subsequent solves).
    pub final_ridge_t: f64,
    pub final_ridge_beta: f64,
}

/// Driver trait the caller implements to assemble the arrow system at
/// the current iterate.
///
/// The driver owns the basis evaluation (`Φ(t)`), the radial jet
/// (`∂Φ/∂t` via
/// [`crate::terms::latent_coord::LatentCoordValues::design_gradient_wrt_t`]),
/// the Gauss–Newton block assembly
/// (`H_tt^(i) ← (g_i β)(g_i β)^T`, `H_tβ^(i) ← (g_i β) ⊗ Φ_i`,
/// `H_ββ ← Φ^T W Φ + Σ_k λ_k S_k`), and the analytic-penalty fold-in via
/// [`crate::solver::arrow_schur::ArrowSchurSystem::add_analytic_penalty_contributions`].
pub trait ArrowSystemAssembler {
    /// Build a freshly-populated arrow system at the current `(β, t)`.
    ///
    /// `latent` is the *current* latent field; the assembler should read
    /// its values via `latent.as_flat()` / `latent.row(i)` and form Φ at
    /// those coordinates. β is supplied as a view.
    fn assemble(
        &mut self,
        beta: ArrayView1<'_, f64>,
        latent: &LatentCoordValues,
    ) -> Result<ArrowSchurSystem, String>;

    /// Evaluate the true joint merit/objective at the current `(β, t)`.
    ///
    /// This is deliberately separate from [`Self::assemble`]: the Schur system
    /// is a local quadratic model, but nonlinear latent retractions must be
    /// accepted against the objective they actually change.
    fn objective(
        &mut self,
        beta: ArrayView1<'_, f64>,
        latent: &LatentCoordValues,
    ) -> Result<f64, String>;
}

/// Joint `(t, β)` inner Newton solver exploiting arrow structure.
///
/// ## Usage
///
/// 1. Call [`LatentInnerSolver::new`] with the initial `β`, a mutable
///    [`LatentCoordValues`], an [`ArrowSystemAssembler`], and options.
/// 2. Call [`LatentInnerSolver::solve`] to run the inner Newton loop.
///    Both `β` and the latent field are updated in place.
/// 3. The returned [`LatentInnerOutcome::factor_cache`] is the
///    artifact Piece 3's IFT warm-start consumes when the outer loop
///    next perturbs `(β, ρ)`.
pub struct LatentInnerSolver<'a, A: ArrowSystemAssembler> {
    pub beta: Array1<f64>,
    pub latent: &'a mut LatentCoordValues,
    pub assembler: A,
    pub options: LatentInnerOptions,
}

impl<'a, A: ArrowSystemAssembler> LatentInnerSolver<'a, A> {
    #[must_use]
    pub fn new(
        beta: Array1<f64>,
        latent: &'a mut LatentCoordValues,
        assembler: A,
        options: LatentInnerOptions,
    ) -> Self {
        Self {
            beta,
            latent,
            assembler,
            options,
        }
    }

    /// Run the joint Newton loop.
    ///
    /// Numerical-stability invariants:
    ///   * `initial_ridge_t`, `initial_ridge_beta` are clamped to `≥ 0`.
    ///   * On a per-row or Schur PD failure, both ridges are escalated
    ///     by `lm_grow` (or seeded at `1e-6` when starting from `0`).
    ///   * `max_ridge` is the cold-restart trigger: if we exhaust the
    ///     ramp and the Hessian is still non-PSD, the loop bails with a
    ///     clear diagnostic citing the iteration index and both ridge
    ///     levels reached — the outer driver should treat this as an
    ///     identifiability failure (missing gauge-fixing penalty,
    ///     collinear basis, etc.).
    pub fn solve(&mut self) -> Result<LatentInnerOutcome, String> {
        let opts = self.options.clone();
        assert!(opts.lm_grow > 1.0, "LM ridge grow factor must exceed 1");
        assert!(
            opts.lm_shrink > 0.0 && opts.lm_shrink < 1.0,
            "LM ridge shrink factor must lie in (0, 1)"
        );
        let mut ridge_t = opts.initial_ridge_t.max(0.0);
        let mut ridge_beta = opts.initial_ridge_beta.max(0.0);
        let mut last_cache: Option<ArrowFactorCache> = None;
        let mut converged = false;
        let mut iter = 0_usize;
        let mut current_objective = self
            .assembler
            .objective(self.beta.view(), self.latent)
            .map_err(|e| format!("LatentInnerSolver: objective failed at start: {e}"))?;
        if !current_objective.is_finite() {
            return Err("LatentInnerSolver: non-finite objective at start".to_string());
        }

        while iter < opts.max_iterations {
            let mut system = self
                .assembler
                .assemble(self.beta.view(), self.latent)
                .map_err(|e| format!("LatentInnerSolver: assembler failed at iter {iter}: {e}"))?;
            system.apply_riemannian_latent_geometry(self.latent);

            // Convergence test: relative joint gradient norm.
            let g_norm_sq = system_gradient_norm_sq(&system);
            let scale = 1.0 + iterate_norm(self.beta.view(), self.latent.as_flat().view());
            let rel = (g_norm_sq.sqrt()) / scale;
            if rel < opts.convergence_tolerance {
                converged = true;
                // Build a final factor cache for the warm-start IFT
                // predictor even though we didn't take a step. Best-
                // effort — if the factorization fails (e.g. ill-
                // conditioned at the very converged point), skip the
                // cache; the predictor will then no-op.
                let solve_options = latent_arrow_solve_options(
                    &system,
                    &opts,
                    !self.latent.effective_is_all_euclidean(),
                );
                if let Ok((_, _, cache)) = solve_arrow_newton_step_with_options(
                    &system,
                    ridge_t.max(1e-12),
                    ridge_beta.max(1e-12),
                    &solve_options,
                ) {
                    last_cache = Some(cache);
                }
                break;
            }

            // Attempt the LM-damped arrow-Schur step. On failure (per-
            // row PD violation or Schur PD violation), grow the ridge
            // and retry without consuming an outer iteration.
            let solve_options = latent_arrow_solve_options(
                &system,
                &opts,
                !self.latent.effective_is_all_euclidean(),
            );
            let step_result =
                solve_arrow_newton_step_with_options(&system, ridge_t, ridge_beta, &solve_options);
            match step_result {
                Ok((delta_t, delta_beta, cache)) => {
                    let delta_t = limit_delta_to_riemannian_trust_region(
                        delta_t,
                        self.latent,
                        solve_options.riemannian_trust_region,
                        solve_options.trust_region.radius,
                    );
                    let predicted_reduction = arrow_quadratic_model_reduction(
                        &system,
                        delta_t.view(),
                        delta_beta.view(),
                        ridge_t,
                        ridge_beta,
                    )
                    .map_err(|e| {
                        format!("LatentInnerSolver: predicted reduction failed at iter {iter}: {e}")
                    })?;
                    let beta_before = self.beta.clone();
                    let t_before = self.latent.as_flat().clone();
                    for (b, db) in self.beta.iter_mut().zip(delta_beta.iter()) {
                        *b += *db;
                    }
                    self.latent.retract_flat_delta(delta_t.view());
                    let trial_objective = self
                        .assembler
                        .objective(self.beta.view(), self.latent)
                        .map_err(|e| {
                        format!("LatentInnerSolver: objective failed at trial iter {iter}: {e}")
                    })?;
                    let objective_scale = current_objective.abs().max(1.0);
                    let noise_floor = objective_scale * 1e-14;
                    let actual_reduction = current_objective - trial_objective;
                    let rho = if predicted_reduction > noise_floor {
                        actual_reduction / predicted_reduction
                    } else if actual_reduction >= -noise_floor {
                        1.0
                    } else {
                        -1.0
                    };
                    if rho > 0.0 && trial_objective.is_finite() {
                        current_objective = trial_objective;
                        ridge_t = (ridge_t * opts.lm_shrink).max(0.0);
                        ridge_beta = (ridge_beta * opts.lm_shrink).max(0.0);
                        last_cache = Some(cache);
                        iter += 1;
                    } else {
                        self.beta = beta_before;
                        self.latent.set_flat(t_before.view());
                        ridge_t = if ridge_t == 0.0 {
                            1e-6
                        } else {
                            ridge_t * opts.lm_grow
                        };
                        ridge_beta = if ridge_beta == 0.0 {
                            1e-6
                        } else {
                            ridge_beta * opts.lm_grow
                        };
                        if ridge_t > opts.max_ridge || ridge_beta > opts.max_ridge {
                            return Err(format!(
                                "LatentInnerSolver: LM rejected nonlinear step until ridge \
                                 exceeded max ({}) at iter {} \
                                 (ridge_t={ridge_t:.3e}, ridge_beta={ridge_beta:.3e}, \
                                 rho={rho:.3e}, predicted_reduction={predicted_reduction:.3e}, \
                                 actual_reduction={actual_reduction:.3e})",
                                opts.max_ridge, iter,
                            ));
                        }
                    }
                }
                Err(err @ ArrowSchurError::PerRowFactorFailed { .. })
                | Err(err @ ArrowSchurError::PerRowFactorIllConditioned { .. })
                | Err(err @ ArrowSchurError::SchurFactorFailed { .. })
                | Err(err @ ArrowSchurError::PcgFailed { .. })
                | Err(err @ ArrowSchurError::AdaptiveCorrectionFailed { .. }) => {
                    // Grow ridges; retry without burning an iteration.
                    // The per-row `factor_blocks` already ran an internal
                    // ridge-ramp before surfacing the error here — if we
                    // see it at this layer, the row block is
                    // genuinely under-regularized (gauge issue or
                    // collinear basis under U_i). Escalate the LM ridge
                    // and let the outer Newton step damp.
                    ridge_t = if ridge_t == 0.0 {
                        1e-6
                    } else {
                        ridge_t * opts.lm_grow
                    };
                    ridge_beta = if ridge_beta == 0.0 {
                        1e-6
                    } else {
                        ridge_beta * opts.lm_grow
                    };
                    if ridge_t > opts.max_ridge || ridge_beta > opts.max_ridge {
                        return Err(format!(
                            "LatentInnerSolver: cold-restart condition — LM ridge \
                             exceeded max ({}) at iter {} \
                             (ridge_t={ridge_t:.3e}, ridge_beta={ridge_beta:.3e}); \
                             root-cause arrow-Schur error: {err}",
                            opts.max_ridge, iter,
                        ));
                    }
                }
            }
        }

        Ok(LatentInnerOutcome {
            beta: self.beta.clone(),
            factor_cache: last_cache,
            iterations: iter,
            converged,
            final_ridge_t: ridge_t,
            final_ridge_beta: ridge_beta,
        })
    }
}

fn latent_arrow_solve_options(
    system: &ArrowSchurSystem,
    opts: &LatentInnerOptions,
    riemannian_trust_region: bool,
) -> ArrowSolveOptions {
    let mut solve_options = ArrowSolveOptions::automatic(system.k);
    if let Some(mode) = opts.solver_mode {
        solve_options.mode = mode;
    }
    solve_options.trust_region.radius = opts.trust_region_radius;
    solve_options.riemannian_trust_region = riemannian_trust_region;
    solve_options
}

fn limit_delta_to_riemannian_trust_region(
    mut delta_t: Array1<f64>,
    latent: &LatentCoordValues,
    enabled: bool,
    radius: f64,
) -> Array1<f64> {
    if !enabled || !radius.is_finite() || radius <= 0.0 {
        return delta_t;
    }
    let row_weights = latent.effective_metric_weights();
    assert_eq!(row_weights.len(), latent.latent_dim());
    let mut norm_sq = 0.0_f64;
    for n in 0..latent.n_obs() {
        let start = n * latent.latent_dim();
        for a in 0..latent.latent_dim() {
            let value = delta_t[start + a];
            norm_sq += row_weights[a] * value * value;
        }
    }
    let norm = norm_sq.sqrt();
    if norm <= radius || norm == 0.0 {
        return delta_t;
    }
    let shrink = radius / norm;
    for value in delta_t.iter_mut() {
        *value *= shrink;
    }
    delta_t
}

fn system_gradient_norm_sq(sys: &ArrowSchurSystem) -> f64 {
    let mut acc = 0.0_f64;
    for j in 0..sys.k {
        acc += sys.gb[j] * sys.gb[j];
    }
    for row in &sys.rows {
        for c in 0..sys.d {
            acc += row.gt[c] * row.gt[c];
        }
    }
    acc
}

fn iterate_norm(beta: ArrayView1<'_, f64>, t: ArrayView1<'_, f64>) -> f64 {
    let mut acc = 0.0_f64;
    for v in beta.iter() {
        acc += v * v;
    }
    for v in t.iter() {
        acc += v * v;
    }
    acc.sqrt()
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::terms::latent_coord::{LatentCoordValues, LatentIdMode};
    use ndarray::array;

    struct ZeroAssembler {
        n: usize,
        d: usize,
        k: usize,
    }

    impl ArrowSystemAssembler for ZeroAssembler {
        fn assemble(
            &mut self,
            arr: ArrayView1<'_, f64>,
            latent_coords: &LatentCoordValues,
        ) -> Result<ArrowSchurSystem, String> {
            assert!(arr.iter().all(|v| !v.is_nan()));
            assert!(std::mem::size_of_val(latent_coords) > 0);
            let mut sys = ArrowSchurSystem::new(self.n, self.d, self.k);
            for j in 0..self.k {
                sys.hbb[[j, j]] = 1.0;
            }
            for row in sys.rows.iter_mut() {
                for c in 0..self.d {
                    row.htt[[c, c]] = 1.0;
                }
            }
            Ok(sys)
        }

        fn objective(
            &mut self,
            arr: ArrayView1<'_, f64>,
            latent_coords: &LatentCoordValues,
        ) -> Result<f64, String> {
            assert!(arr.iter().all(|v| !v.is_nan()));
            assert!(std::mem::size_of_val(latent_coords) > 0);
            Ok(0.0)
        }
    }

    #[test]
    fn zero_assembler_converges_immediately() {
        let m = array![[0.0_f64, 0.0], [0.0, 0.0]];
        let mut latent = LatentCoordValues::from_matrix(m.view(), LatentIdMode::None);
        let beta = Array1::<f64>::zeros(3);
        let assembler = ZeroAssembler { n: 2, d: 2, k: 3 };
        let mut solver =
            LatentInnerSolver::new(beta, &mut latent, assembler, LatentInnerOptions::default());
        let outcome = solver.solve().expect("zero assembler always succeeds");
        assert!(outcome.converged);
        assert_eq!(outcome.iterations, 0);
    }
}