Skip to main content

gam_model_api/families/custom_family/
options.rs

1//! Fit-time configuration and cost accounting: `BlockwiseFitOptions`, the
2//! outer-derivative policy + order selection, coefficient cost models, and the
3//! argument-validation asserts shared by the solver entry points.
4
5use crate::families::custom_family::family_trait::{CustomFamily, OuterEvalContext};
6use crate::families::custom_family::psi_design::{
7    CustomFamilyBlockPsiDerivative, ExactNewtonJointHessianWorkspace,
8};
9use gam_linalg::RidgePolicy;
10use gam_problem::{ParameterBlockSpec, ParameterBlockState};
11use ndarray::Array1;
12use std::ops::Range;
13use std::sync::Arc;
14use std::sync::atomic::AtomicUsize;
15
16// Moved to `gam-problem` (#1521 CustomFamily-cone inversion): the neutral,
17// dependency-free outer-objective + exact-derivative-order capability enums now
18// live in `gam_problem::family_options` and are re-exported here so every
19// `custom_family::ExactNewtonOuterObjective` / `ExactOuterDerivativeOrder` path
20// keeps resolving byte-for-byte.
21pub use gam_problem::{ExactNewtonOuterObjective, ExactOuterDerivativeOrder};
22
23// The IRLS weight / ridge floors and the block-spec consistency validator are
24// neutral (no `CustomFamily` dependency); they live in `gam-problem` and are
25// re-exported here so every `custom_family::{CUSTOM_FAMILY_WEIGHT_FLOOR,
26// CUSTOM_FAMILY_RIDGE_FLOOR, validate_blockspec_consistency}` path keeps
27// resolving and `CUSTOM_FAMILY_WEIGHT_FLOOR` stays sourced from the canonical
28// `MIN_WEIGHT` floor rather than a hardcoded literal.
29pub use gam_problem::{
30    CUSTOM_FAMILY_RIDGE_FLOOR, CUSTOM_FAMILY_WEIGHT_FLOOR, validate_blockspec_consistency,
31};
32
33/// Exact outer derivative order for families that expose second-order
34/// coefficient geometry.
35///
36/// This used to be a cost gate that demoted large large-scale problems to
37/// first-order BFGS. That was a policy leak into the math layer: if the family
38/// supplies analytic dense Hessian blocks or an analytic profiled-Hessian HVP,
39/// the outer optimizer should see the exact second-order objective. Runtime
40/// representation choices (dense vs operator) belong below this declaration,
41/// not in a first-order downgrade.
42/// Precondition check for the family capability / operator hooks (e.g.
43/// `batched_outer_hessian_terms`, `outer_hyper_hessian_operator`).
44///
45/// These hooks operate on whatever block geometry the caller has assembled and
46/// must validate the *consistency* of the specs they are handed — never the
47/// fit-level "at least one block" precondition, which belongs to the fit entry
48/// points (`validate_blockspecs`). An empty, self-consistent argument set is a
49/// valid no-op probe of the operator path (the operator may ignore the specs
50/// entirely), so it must not panic here.
51pub(crate) fn assert_valid_blockspecs(specs: &[ParameterBlockSpec], context: &str) {
52    assert!(
53        validate_blockspec_consistency(specs).is_ok(),
54        "{context}: inconsistent parameter block specs"
55    );
56}
57
58pub(crate) fn assert_valid_options(options: &BlockwiseFitOptions, context: &str) {
59    assert!(
60        options.inner_tol.is_finite() && options.inner_tol >= 0.0,
61        "{context}: inner_tol must be finite and non-negative"
62    );
63    assert!(
64        options.outer_tol.is_finite() && options.outer_tol >= 0.0,
65        "{context}: outer_tol must be finite and non-negative"
66    );
67    assert!(
68        options.minweight.is_finite() && options.minweight >= 0.0,
69        "{context}: minweight must be finite and non-negative"
70    );
71    assert!(
72        options.ridge_floor.is_finite() && options.ridge_floor >= 0.0,
73        "{context}: ridge_floor must be finite and non-negative"
74    );
75    if let Some(threshold) = options.early_exit_threshold {
76        assert!(
77            threshold.is_finite(),
78            "{context}: early_exit_threshold must be finite"
79        );
80    }
81}
82
83pub(crate) fn assert_states_match_specs(
84    states: &[ParameterBlockState],
85    specs: &[ParameterBlockSpec],
86    context: &str,
87) {
88    assert_eq!(
89        states.len(),
90        specs.len(),
91        "{context}: state/spec block count mismatch"
92    );
93    for (block, (state, spec)) in states.iter().zip(specs).enumerate() {
94        assert_eq!(
95            state.beta.len(),
96            spec.design.ncols(),
97            "{context}: beta length mismatch in block {block}"
98        );
99        // `state.eta` is produced from `solver_design()` (see
100        // `refresh_all_block_etas`), which is `stacked_design` when set
101        // (3·n_obs rows for survival LS time-varying blocks) and `design`
102        // (n_obs rows) otherwise. Use the same accessor here.
103        assert_eq!(
104            state.eta.len(),
105            spec.solver_design().nrows(),
106            "{context}: eta length mismatch in block {block}"
107        );
108    }
109}
110
111pub(crate) fn assert_derivative_blocks_match_specs(
112    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
113    specs: &[ParameterBlockSpec],
114    context: &str,
115) {
116    assert_eq!(
117        derivative_blocks.len(),
118        specs.len(),
119        "{context}: derivative/spec block count mismatch"
120    );
121}
122
123pub(crate) fn assert_rho_matches_specs(
124    rho: &Array1<f64>,
125    specs: &[ParameterBlockSpec],
126    context: &str,
127) {
128    let expected = specs.iter().map(|spec| spec.penalties.len()).sum::<usize>();
129    assert_eq!(
130        rho.len(),
131        expected,
132        "{context}: rho length does not match penalty count"
133    );
134}
135
136pub(crate) fn validate_hessian_workspace_ready(
137    hessian_workspace: &Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
138    context: &str,
139    eval_mode: gam_problem::EvalMode,
140) -> Result<(), String> {
141    if let Some(workspace) = hessian_workspace.as_ref() {
142        workspace
143            .warm_up_outer_caches_for_mode(eval_mode)
144            .map_err(|err| format!("{context}: failed to warm Hessian workspace caches: {err}"))?;
145    }
146    Ok(())
147}
148
149pub fn exact_outer_order_from_capability(
150    specs: &[ParameterBlockSpec],
151    coefficient_cost: u64,
152) -> ExactOuterDerivativeOrder {
153    assert_valid_blockspecs(specs, "exact outer derivative order");
154    match coefficient_cost {
155        0 => ExactOuterDerivativeOrder::Second,
156        _ => ExactOuterDerivativeOrder::Second,
157    }
158}
159
160/// Capability-aware variant of [`exact_outer_order_from_capability`].
161///
162/// Kept as the public declaration helper for existing family impls, but it no
163/// longer gates by cost. Once a caller has established dense or HVP analytic
164/// second-order support, the correct derivative order is `Second`.
165pub fn exact_outer_order_with_outer_hvp(
166    specs: &[ParameterBlockSpec],
167    coefficient_cost: u64,
168    outer_hyper_hessian_hvp_available: bool,
169) -> ExactOuterDerivativeOrder {
170    if outer_hyper_hessian_hvp_available {
171        assert_valid_blockspecs(specs, "exact outer derivative order with HVP");
172        match coefficient_cost {
173            0 => ExactOuterDerivativeOrder::Second,
174            _ => ExactOuterDerivativeOrder::Second,
175        }
176    } else {
177        exact_outer_order_from_capability(specs, coefficient_cost)
178    }
179}
180
181/// Realized outer-derivative policy at the current problem size.
182///
183/// Capability (the family can produce exact second-order calculus) controls
184/// whether the Hessian is declared. Runtime cost controls only representation
185/// and staging choices below this layer. Large problems must stay on the exact
186/// analytic Hessian path and use an operator representation when dense assembly
187/// is too expensive; they are not demoted to first-order BFGS here.
188///
189/// `OuterDerivativePolicy` records the family's *capability*, the *predicted
190/// per-eval cost* for both gradient-only and Hessian paths, and exposes the
191/// two policy queries the outer optimizer actually needs:
192///
193/// * [`order_for_evaluation`](Self::order_for_evaluation) — clamp a requested
194///   evaluation order against the policy gate.
195/// * [`declared_hessian_form`](Self::declared_hessian_form) — what shape the
196///   outer-strategy planner should declare to its plan ladder.
197/// * [`should_use_staged_kappa`](Self::should_use_staged_kappa) — auto-route
198///   the κ optimizer through the pilot/polish schedule at large `n`.
199///
200/// All thresholds are *const* — no env vars, no CLI flags. The cost model is
201/// the family's own `coefficient_gradient_cost` / `coefficient_hessian_cost`
202/// scaled by the joint outer-coordinate dimension, with `saturating_mul` so
203/// overflow rounds up to the budget ceiling rather than wrapping silently.
204#[derive(Clone, Copy, Debug)]
205pub struct OuterDerivativePolicy {
206    /// What exact calculus the family advertises in principle.
207    pub capability: ExactOuterDerivativeOrder,
208    /// Predicted per-eval work for one `ValueGradientHessian` evaluation.
209    /// Rounded conservatively *up* via `saturating_mul`. Informational for
210    /// representation and diagnostics; it does not disable Hessian capability.
211    pub predicted_hessian_work: u128,
212    /// Predicted per-eval work for one `ValueAndGradient` evaluation.
213    /// Rounded conservatively *up* via `saturating_mul`.
214    pub predicted_gradient_work: u128,
215    /// True when the family's outer-only paths consume
216    /// [`BlockwiseFitOptions::outer_score_subsample`] and produce
217    /// Horvitz-Thompson-weighted partial sums (i.e. the family overrides
218    /// `log_likelihood_only_with_options`,
219    /// `exact_newton_joint_psi_workspace_with_options`, and any other
220    /// outer-only hooks reached by `evaluate_custom_family_joint_hyper`).
221    ///
222    /// Determines whether the κ optimizer's pilot/polish staging schedule
223    /// engages: when this is `false`, [`Self::should_use_staged_kappa`]
224    /// returns `false` regardless of `n`. Engaging the schedule on a
225    /// family that ignores the subsample is strictly worse than not
226    /// engaging it — the schedule builds a `RowSet::Subsample` and the
227    /// boundary plumbing installs an `OuterScoreSubsample` on options,
228    /// but the family's default outer-only paths fall back to full-data
229    /// sums, so the pilot evaluation costs the same as the polish but
230    /// adds a Vec allocation per eval.
231    ///
232    /// Families that do **not** consume the subsample (default for new
233    /// implementations, including the GAMLSS location-scale families
234    /// today) leave this `false`. Families that do consume (today:
235    /// `BernoulliMarginalSlopeFamily`) override `outer_derivative_policy`
236    /// to set this `true`.
237    pub subsample_capable: bool,
238}
239
240impl OuterDerivativePolicy {
241    /// Per-eval gradient work ceiling above which the κ schedule switches
242    /// to the staged pilot/polish path. At large scale (n ≳ 100 k) even
243    /// the gradient sweep takes minutes per outer iter; subsampling the
244    /// pilot stage cuts that to seconds and leaves the final polish on
245    /// full data to recover the MLE.
246    pub const OUTER_GRADIENT_WORK_BUDGET: u128 = 50_000_000_000;
247
248    /// Pilot subsample auto-engages when full-data `n` exceeds this. Below
249    /// this the κ schedule collapses to a single full-data stage —
250    /// behaviour identical to the pre-P7 path.
251    pub const STAGED_KAPPA_TRIGGER_N: usize = 30_000;
252
253    /// Clamp a requested evaluation order against the policy gate.
254    ///
255    /// Returns the highest order this policy permits for the requested order:
256    /// * `ValueGradientHessian` requested → keep only if `declared_hessian_form`
257    ///   is something other than `Unavailable`.
258    /// * `ValueAndGradient` requested → always permitted (gradient-only is
259    ///   universal).
260    pub fn order_for_evaluation(&self, requested: crate::OuterEvalOrder) -> crate::OuterEvalOrder {
261        use crate::OuterEvalOrder;
262        match requested {
263            // Value-only is universal: every policy can evaluate the bare
264            // objective, so the request passes through unclamped.
265            OuterEvalOrder::Value => OuterEvalOrder::Value,
266            OuterEvalOrder::ValueAndGradient => OuterEvalOrder::ValueAndGradient,
267            OuterEvalOrder::ValueGradientHessian => {
268                if matches!(
269                    self.declared_hessian_form(),
270                    gam_problem::DeclaredHessianForm::Unavailable
271                ) {
272                    OuterEvalOrder::ValueAndGradient
273                } else {
274                    OuterEvalOrder::ValueGradientHessian
275                }
276            }
277        }
278    }
279
280    /// Outer Hessian declaration for the outer-strategy planner.
281    ///
282    /// `Either` ⇔ capability has Hessian. Work estimates select dense vs
283    /// operator assembly later; they must not erase analytic second-order
284    /// capability from the planner.
285    pub fn declared_hessian_form(&self) -> gam_problem::DeclaredHessianForm {
286        use gam_problem::DeclaredHessianForm;
287        if !self.capability.has_hessian() {
288            return DeclaredHessianForm::Unavailable;
289        }
290        DeclaredHessianForm::Either
291    }
292
293    /// True when the κ optimizer should auto-route through the staged
294    /// pilot/polish schedule. Triggers when **either** the data is big
295    /// (`n ≥ STAGED_KAPPA_TRIGGER_N`) **or** the per-eval gradient work
296    /// exceeds `OUTER_GRADIENT_WORK_BUDGET`. The second clause catches
297    /// problems with moderate `n` but very wide design (large `p_total`
298    /// or `psi_dim`) where a single full-data gradient sweep still
299    /// dominates the κ trajectory.
300    pub fn should_use_staged_kappa(&self, n: usize) -> bool {
301        if !self.subsample_capable {
302            // Family does not consume `outer_score_subsample` on its
303            // outer-only paths. Engaging the schedule would build a
304            // pilot `RowSet::Subsample` whose only effect is per-eval
305            // Vec/Arc bookkeeping — the underlying coefficient gradient
306            // would still sum every row. Gate the schedule off until
307            // the family override declares consumption.
308            return false;
309        }
310        n >= Self::STAGED_KAPPA_TRIGGER_N
311            || self.predicted_gradient_work > Self::OUTER_GRADIENT_WORK_BUDGET
312    }
313}
314
315/// Total outer-coordinate dimensionality used by the default policy work
316/// model: `rho_dim + psi_dim`. Each outer evaluation propagates one
317/// directional derivative per outer coordinate through the inner solve.
318#[inline]
319pub(crate) fn outer_coord_dim_for_policy(specs: &[ParameterBlockSpec], psi_dim: usize) -> u128 {
320    let rho_total: u128 = specs
321        .iter()
322        .map(|s| s.penalties.len() as u128)
323        .fold(0u128, |acc, k| acc.saturating_add(k));
324    rho_total.saturating_add(psi_dim as u128)
325}
326
327/// Default predicted-cost model for [`OuterDerivativePolicy`]:
328///
329/// * gradient work ≈ `coefficient_gradient_cost · (rho_dim + psi_dim)`
330/// * Hessian work  ≈ `coefficient_hessian_cost  · (rho_dim + psi_dim)`
331///
332/// Each outer coordinate triggers one analytic directional derivative
333/// through the inner solve; the dense Hessian assembly carries the extra
334/// `O(p_total)` factor already captured by `coefficient_hessian_cost`.
335///
336/// All multiplications saturate so an overflow rounds *up* to the gate
337/// ceiling: we'd rather drop one Hessian evaluation that we could have
338/// afforded than crash on a 600 s eval.
339pub fn default_outer_derivative_policy_costs(
340    specs: &[ParameterBlockSpec],
341    psi_dim: usize,
342    grad_cost: u64,
343    hess_cost: u64,
344) -> (u128, u128) {
345    let k = outer_coord_dim_for_policy(specs, psi_dim);
346    let grad = (grad_cost as u128).saturating_mul(k.max(1));
347    let hess = (hess_cost as u128).saturating_mul(k.max(1));
348    (grad, hess)
349}
350
351/// Default coefficient-space Hessian cost: `Σ_b n_b · p_b²`, summed across
352/// blocks. Represents the work to assemble or apply the dense block-diagonal
353/// inner Hessian once.
354pub fn default_coefficient_hessian_cost(specs: &[ParameterBlockSpec]) -> u64 {
355    specs
356        .iter()
357        .map(|s| {
358            let n = s.design.nrows() as u64;
359            let p = s.design.ncols() as u64;
360            n.saturating_mul(p.saturating_mul(p))
361        })
362        .fold(0u64, |acc, c| acc.saturating_add(c))
363}
364
365/// Joint-coupled coefficient-space Hessian cost: `n · (Σ_b p_b)²`. The honest
366/// per-evaluation work for any family whose row likelihood couples every block
367/// (every observation contributes a rank-`m` outer-product update to the full
368/// joint Hessian over `Σ p_b` coefficients), as opposed to the block-diagonal
369/// `default_coefficient_hessian_cost` which assumes each `X_b' W_b X_b` is
370/// assembled independently.
371///
372/// Used by all GAMLSS, marginal-slope, and joint-latent families. CTN does
373/// not delegate here — it uses its Khatri–Rao factor dimensions internally.
374pub fn joint_coupled_coefficient_hessian_cost(n: u64, specs: &[ParameterBlockSpec]) -> u64 {
375    let p_total: u64 = specs
376        .iter()
377        .map(|s| s.design.ncols() as u64)
378        .fold(0u64, |acc, p| acc.saturating_add(p));
379    n.saturating_mul(p_total.saturating_mul(p_total))
380}
381
382/// Default coefficient-space gradient cost: half the Hessian cost.
383///
384/// The first-order analytic gradient in the unified evaluator runs the same
385/// inner Newton solve as the second-order path but skips the `K`-fold
386/// pairwise Hessian assembly (`B_{j,k}` blocks) and the `K`-fold inner
387/// derivative solves; what remains is the inner solve plus a single
388/// gradient-only sweep through the data. Empirically this is roughly half
389/// the per-evaluation arithmetic of forming the dense Hessian, hence the
390/// `/2` default. Families whose gradient assembly differs structurally
391/// (e.g. matrix-free Hv operators with no dense Hessian assembly to halve)
392/// should override [`CustomFamily::coefficient_gradient_cost`] explicitly.
393pub fn default_coefficient_gradient_cost(specs: &[ParameterBlockSpec]) -> u64 {
394    default_coefficient_hessian_cost(specs) / 2
395}
396
397/// Compute β-block column ranges from a slice of `ParameterBlockSpec`s.
398///
399/// Returns one `Range<usize>` per spec, covering the spec's columns in the
400/// concatenated β vector (i.e. `offset .. offset + p_block` where `p_block =
401/// spec.design.ncols()`). The ranges are non-overlapping, sorted, and their
402/// union covers `0..Σ p_block`.
403///
404/// This is the canonical source of `block_offsets` for every
405/// [`crate::solver::arrow_schur::ArrowSchurSystem`] built for a custom family
406/// (survival, GAMLSS, transformation-normal, latent-survival, marginal-slope,
407/// …). Pass the result to
408/// [`crate::solver::arrow_schur::ArrowSchurSystem::set_block_offsets`] before
409/// calling `solve` or `solve_with_options` whenever the system will use
410/// [`crate::solver::arrow_schur::ArrowSolverMode::InexactPCG`].
411///
412/// Specs with zero columns produce a zero-width range; callers that want to
413/// skip trivial blocks may filter on `r.start < r.end` after calling this
414/// function.
415pub fn block_offsets_from_specs(specs: &[ParameterBlockSpec]) -> Arc<[Range<usize>]> {
416    let mut ranges: Vec<Range<usize>> = Vec::with_capacity(specs.len());
417    let mut cursor = 0usize;
418    for spec in specs {
419        let p = spec.design.ncols();
420        ranges.push(cursor..cursor + p);
421        cursor += p;
422    }
423    Arc::from(ranges.into_boxed_slice())
424}
425
426/// Bound first-order outer iterations when each analytic-gradient evaluation is
427/// already large-scale work. This is only applied after the planner has
428/// selected a gradient-only route; second-order/ARC plans keep their requested
429/// iteration budget.
430pub fn cost_gated_first_order_max_iter(
431    requested: usize,
432    coefficient_gradient_cost: u64,
433    has_outer_hessian: bool,
434) -> usize {
435    const FIRST_ORDER_OUTER_WORK_BUDGET: u64 = 80_000_000_000;
436    const MIN_FIRST_ORDER_ITERS: usize = 4;
437
438    if has_outer_hessian || requested <= 1 || coefficient_gradient_cost == 0 {
439        return requested;
440    }
441
442    let affordable = (FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost) as usize;
443    requested.min(affordable.max(MIN_FIRST_ORDER_ITERS))
444}
445
446/// Local trust budget for first-order outer BFGS on log-smoothing parameters.
447///
448/// One unit in `rho = log(lambda)` is an `e`-fold smoothing-parameter change.
449/// Previously this cap was `1.0`, which throttled BFGS to ~1/5 of its
450/// quasi-Newton step on flat REML surfaces (the natural BFGS direction has
451/// `|d|_inf` of ~5 in log-λ for large-scale survival fits). Probes whose
452/// `step_inf > cap` are rejected for free in `OuterFirstOrderBridge::eval_cost`
453/// (returning `BFGS_LINE_SEARCH_REJECT_COST` without running an inner solve),
454/// so a larger cap costs nothing on rejection — it only lets Strong-Wolfe
455/// accept bigger steps that the inner-PIRLS divergence guard can already
456/// validate. `5.0` allows up to `e^5 ≈ 148`-fold smoothing-parameter change
457/// per accepted outer iter, which matches the typical quasi-Newton direction
458/// magnitude while still bounding pathological probes.
459pub const fn first_order_bfgs_loglambda_step_cap(has_outer_hessian: bool) -> Option<f64> {
460    if has_outer_hessian { None } else { Some(5.0) }
461}
462
463pub fn exact_newton_outer_geometry_supports_second_order_solver<F: CustomFamily + ?Sized>(
464    family: &F,
465) -> bool {
466    family.exact_newton_outerobjective() == ExactNewtonOuterObjective::StrictPseudoLaplace
467}
468
469/// Stable public API for installing outer-score subsampling.
470#[derive(Clone)]
471pub struct BlockwiseFitOptions {
472    pub inner_max_cycles: usize,
473    pub inner_tol: f64,
474    pub outer_max_iter: usize,
475    pub outer_tol: f64,
476    /// Optional override for the OUTER smoothing optimizer's
477    /// *relative-cost-decrease* convergence stop, decoupled from `outer_tol`.
478    ///
479    /// The outer convergence test derives BOTH the absolute projected-gradient
480    /// floor (`max(outer_tol, n·1e-9)`) AND the relative-cost stop
481    /// (`rel_cost = outer_tol`) from the single `outer_tol`. A caller that needs
482    /// a *tight absolute floor* to resolve λ to the genuine REML optimum at
483    /// large `n` (where the floor is `n·1e-9`) is then forced to also accept a
484    /// *tight rel-cost stop*, which on a flat REML ridge never trips and grinds
485    /// the optimizer to `outer_max_iter` — dozens of surplus O(D·p³)
486    /// Laplace-derivative outer iterations (the #1082 multinomial
487    /// smooth-by-factor wall-clock blow-up). When `Some(r)`, the rel-cost stop
488    /// uses `r` while the absolute floor keeps using `outer_tol`, so accuracy
489    /// (absolute floor) and perf (loose rel-cost) are selected independently.
490    /// `None` preserves the legacy coupling (`rel_cost = outer_tol`) for every
491    /// existing caller byte-for-byte.
492    pub outer_rel_cost_tol: Option<f64>,
493    /// Lower box bound for smoothing coordinates ρ = log λ.
494    ///
495    /// The default preserves the historical custom-family domain
496    /// `λ >= exp(-10)`. Families with known calibration failures at the
497    /// near-zero penalty boundary can raise this lower bound without changing
498    /// the upper effective-df cap or adding family-specific branches inside the
499    /// optimizer.
500    pub rho_lower_bound: f64,
501    pub minweight: f64,
502    pub ridge_floor: f64,
503    /// Shared ridge semantics used by solve/quadratic/logdet terms.
504    pub ridge_policy: RidgePolicy,
505    /// If true, outer smoothing optimization uses a Laplace/REML-style objective:
506    ///   -loglik + penalty + 0.5(log|H| - log|S|_+)
507    /// where H is blockwise working curvature and S is blockwise penalty.
508    pub use_remlobjective: bool,
509    /// If false, the outer smoothing optimizer uses exact gradients but does
510    /// not request an analytic outer Hessian from the family.
511    pub use_outer_hessian: bool,
512    /// If false, skip post-fit joint covariance assembly.
513    pub compute_covariance: bool,
514    /// Shared cap engaged during seed screening so cost-only evaluations can
515    /// stop inner iterations early without affecting the full solve.
516    pub screening_max_inner_iterations: Option<Arc<AtomicUsize>>,
517    /// Shared cap engaged during regular outer iterations. Unlike screening,
518    /// this is only a budget: capped solves still have to earn the ordinary
519    /// KKT certificate before derivatives may be exposed.
520    pub outer_inner_max_iterations: Option<Arc<AtomicUsize>>,
521    /// Optional line-search objective ceiling for lazy log-likelihood-only
522    /// evaluations. Families whose per-row log-likelihood contributions are
523    /// non-positive may stop once the partial negative log-likelihood is already
524    /// above this ceiling, because the unvisited rows cannot improve the trial
525    /// objective enough to be accepted. Default `None` preserves exact full-sum
526    /// behavior and is the only mode used outside backtracking rejection tests.
527    pub early_exit_threshold: Option<f64>,
528    /// Stable public API for installing outer-score subsampling.
529    ///
530    /// Optional stratified row subsample used by outer-only score/gradient
531    /// passes. When `Some(s)`, outer score/gradient hot loops should iterate
532    /// only over `s.rows` and multiply each contribution by that row's
533    /// Horvitz-Thompson inverse-inclusion weight. Inner-PIRLS and final
534    /// covariance passes always run on the full data, so this field is
535    /// consulted only by outer-only call sites. Default `None` preserves the
536    /// full-data behavior. Wrapping in `Arc` keeps `Clone` cheap across the
537    /// many places `BlockwiseFitOptions` is duplicated per-eval.
538    pub outer_score_subsample: Option<Arc<crate::OuterScoreSubsample>>,
539    /// Gate for marginal-slope families to auto-derive a stratified
540    /// outer-score subsample at large scale (see
541    /// [`crate::families::marginal_slope_shared::auto_outer_score_subsample`]).
542    ///
543    /// **Default `true`.** Auto-subsampling makes the early rho-gradient
544    /// evaluations unbiased stochastic estimators with bounded relative
545    /// variance (≈ 1 % at the conservative defaults), then the family switches
546    /// back to full-data gradients for the remaining outer iterations. That
547    /// keeps large marginal-slope fits fast during the high-motion part of the
548    /// trajectory while preserving the default tight `outer_tol` polish on
549    /// exact gradients. For small datasets the auto path declines to install a
550    /// mask and the fit remains full-data throughout.
551    ///
552    /// When `outer_score_subsample` is already `Some(...)` the auto
553    /// path is bypassed entirely (caller-provided masks always win).
554    pub auto_outer_subsample: bool,
555    /// Outer-evaluation context populated by the smoothing optimizer at
556    /// the top of each real outer derivative evaluation. Used by
557    /// auto-subsample install paths to key the stratified mask on the
558    /// outer ρ rather than the inner β proxy: during the inner trust-
559    /// region / coefficient line search β changes on every trial step,
560    /// so keying on β re-fires phase prints (and re-shuffles the mask)
561    /// inside a single outer eval. Keying on (rho, eval_id) instead
562    /// keeps the mask stable across the inner Newton at one ρ, and
563    /// suppresses auto-subsample entirely on inner trial evaluations via
564    /// the [`EvalScope::InnerCoefficient`] tag set by
565    /// [`coefficient_line_search_options`].
566    ///
567    /// `None` preserves legacy behavior (no context — install paths fall
568    /// back to "no auto-subsample"). Default `None`.
569    pub outer_eval_context: Option<OuterEvalContext>,
570    /// Optional persistent warm-start cache session. When `Some`, the
571    /// outer smoothing optimizer consults the on-disk cache before
572    /// starting (to seed θ from the last accepted iterate) and writes
573    /// checkpoints + a final entry on completion. When `None`, the fit
574    /// runs cold and writes nothing — the default for unit tests and
575    /// any caller that pinned a deterministic optimum.
576    ///
577    /// Callers that need cross-process reuse must supply the session
578    /// explicitly; ordinary workflow fits leave this empty so refit-heavy
579    /// loops do not touch the shared on-disk store.
580    pub cache_session: Option<Arc<gam_runtime::warm_start::Session>>,
581    /// Optional mirror sessions that receive a copy of the final-result
582    /// finalize() write. Callers can use this to broadcast a converged ρ to
583    /// additional keyspace(s) so future fits with related structure can
584    /// warm-start from this run. Writes still pass through the session rate
585    /// limiter, so mirroring checkpoints does not add unbounded I/O.
586    pub cache_mirror_sessions: Vec<Arc<gam_runtime::warm_start::Session>>,
587    /// Optional bundle of cross-block (full-width) penalties, paired with
588    /// their current `log λ` values from the outer ρ vector. When `Some`,
589    /// the inner joint-Newton primitives add the contributions
590    ///
591    /// * objective: `½ Σ_j exp(ρ_j) βᵀ S_j β`
592    /// * gradient:  `Σ_j exp(ρ_j) S_j β`
593    /// * Hessian:   `Σ_j exp(ρ_j) S_j`
594    ///
595    /// in addition to the per-block penalty stack assembled from
596    /// `ParameterBlockSpec.penalties`. The per-block path is unchanged.
597    /// `None` preserves legacy behaviour for every existing caller.
598    pub joint_penalties: Option<Arc<crate::JointPenaltyBundle>>,
599    /// Whether the outer smoothing optimizer screens the explicit
600    /// `initial_rho` seed through the seed-screening cascade before the
601    /// solver starts.
602    ///
603    /// **Default `true`** — the general path benefits from ranking the
604    /// initial seed against the generated exploration seeds via cheap
605    /// capped proxy fits.
606    ///
607    /// A caller sets this `false` when `initial_rho` is already the correct,
608    /// identified optimum for its regime so that re-screening it adds only
609    /// cost. The survival location-scale constant-scale (parametric-AFT)
610    /// path uses this: its time-warp ρ seed is pinned AT the inner ρ box
611    /// bound (the affine-baseline limit), where the REML/LAML profile is a
612    /// dead-flat unidentified ridge. Running the screening cascade there
613    /// drives each proxy fit (and, when every capped stage collapses to
614    /// non-finite cost, the uncapped final stage) into a full inner solve on
615    /// the near-singular flat Hessian — the source of the multi-minute
616    /// no-iteration-log stall (#736, #735, #721). Skipping screening lets the
617    /// already-correct seed flow straight to the outer solver, which certifies
618    /// box-constraint stationarity at iteration 0. Genuinely flexible regimes
619    /// (smooth scale / spatial) leave this `true` and keep full screening.
620    pub screen_initial_rho: bool,
621    /// Set ONLY while the inner solve is invoked from the seed-screening proxy
622    /// (`custom_family_seed_screening_proxy_labeled`), which RANKS candidate
623    /// seeds by their penalized objective and never produces the final fit.
624    ///
625    /// When `true`, the inner joint-Newton skips the full per-axis
626    /// Jeffreys/Firth curvature (`custom_family_joint_jeffreys_term`'s
627    /// `for k in 0..p` directional-derivative loop, O(p · per-axis-Hdot) per
628    /// cycle), keeping ONLY the cheap value-only Jeffreys term
629    /// (`custom_family_joint_jeffreys_value`, one reduced-info eigendecomposition)
630    /// in the screening score. The per-axis gradient/curvature is what the inner
631    /// Newton step needs to *converge* a near-separating fit; the screening proxy
632    /// is capped and only ranks, so it does not need step convergence — it needs
633    /// a finite, separation-aware score cheaply. For a K-block coupled family
634    /// (Dirichlet/multinomial) each per-axis directional derivative is itself
635    /// O(K²·n·p), so running the full term for every cascade candidate over the
636    /// joint width `p` is the wrong cost class and made the coupled fit
637    /// non-completing during screening alone (gam#729/#808). The actual fit
638    /// (after a seed is selected) runs with this `false`, so the load-bearing
639    /// Firth curvature is fully present where it matters.
640    ///
641    /// **Default `false`** — only the screening proxy sets it `true`.
642    pub seed_screening: bool,
643}
644
645pub const DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES: usize = 1200;
646
647impl Default for BlockwiseFitOptions {
648    fn default() -> Self {
649        Self {
650            // Large-scale custom-family marginal-slope fits can have a
651            // long, monotone joint-Newton tail: objective and step size keep
652            // shrinking, but the exact KKT residual may need several hundred
653            // additional cycles after the old 300-cycle cap. The outer
654            // REML/LAML derivative path is correct only at a stationary inner
655            // mode, so a merely descended iterate must not be accepted as
656            // converged. Use a production-sized cap by default and rely on the
657            // KKT/objective certificates to exit early for well-conditioned
658            // Gaussian, logistic, and small-n fits.
659            inner_max_cycles: DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES,
660            inner_tol: 1e-6,
661            outer_max_iter: 60,
662            outer_tol: 1e-5,
663            outer_rel_cost_tol: None,
664            rho_lower_bound: -10.0,
665            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
666            // `ridge_floor` is an ExplicitPrior in the canonical
667            // stabilization ledger taxonomy (`StabilizationKind::ExplicitPrior`):
668            // its δ enters the quadratic term, the Laplace Hessian, and the
669            // penalty log-determinant — `ridge_policy` below is the live
670            // policy that confirms which terms it lands in. The default
671            // pos-part policy enables every inclusion flag, so callers
672            // wanting solver-only damping should construct a custom policy
673            // (or, preferably, a `StabilizationLedger::numerical_perturbation`)
674            // rather than reusing this field.
675            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
676            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
677            use_remlobjective: true,
678            // Default ON: families expose exact outer Hessians whenever their
679            // analytic dense or operator representation is implemented.
680            use_outer_hessian: true,
681            compute_covariance: false,
682            screening_max_inner_iterations: None,
683            outer_inner_max_iterations: None,
684            seed_screening: false,
685            early_exit_threshold: None,
686            outer_score_subsample: None,
687            auto_outer_subsample: true,
688            outer_eval_context: None,
689            cache_session: None,
690            cache_mirror_sessions: Vec::new(),
691            joint_penalties: None,
692            screen_initial_rho: true,
693        }
694    }
695}
696
697#[cfg(test)]
698mod tests {
699    use super::*;
700    use gam_linalg::matrix::DesignMatrix;
701    use ndarray::Array2;
702
703    fn make_spec(nrows: usize, ncols: usize) -> ParameterBlockSpec {
704        ParameterBlockSpec {
705            design: DesignMatrix::from(Array2::<f64>::zeros((nrows, ncols))),
706            ..ParameterBlockSpec::defaults()
707        }
708    }
709
710    // -----------------------------------------------------------------------
711    // default_coefficient_hessian_cost
712    // -----------------------------------------------------------------------
713
714    #[test]
715    fn hessian_cost_empty_specs_is_zero() {
716        assert_eq!(default_coefficient_hessian_cost(&[]), 0);
717    }
718
719    #[test]
720    fn hessian_cost_single_block() {
721        // n=10, p=3 → 10 * 3^2 = 90
722        let spec = make_spec(10, 3);
723        assert_eq!(default_coefficient_hessian_cost(&[spec]), 90);
724    }
725
726    #[test]
727    fn hessian_cost_two_blocks_sum() {
728        // n=10, p=3 → 90; n=5, p=4 → 5*16=80; total=170
729        let specs = [make_spec(10, 3), make_spec(5, 4)];
730        assert_eq!(default_coefficient_hessian_cost(&specs), 170);
731    }
732
733    // -----------------------------------------------------------------------
734    // default_coefficient_gradient_cost
735    // -----------------------------------------------------------------------
736
737    #[test]
738    fn gradient_cost_is_half_hessian_cost() {
739        let specs = [make_spec(10, 3)];
740        let hess = default_coefficient_hessian_cost(&specs);
741        assert_eq!(default_coefficient_gradient_cost(&specs), hess / 2);
742    }
743
744    // -----------------------------------------------------------------------
745    // joint_coupled_coefficient_hessian_cost
746    // -----------------------------------------------------------------------
747
748    #[test]
749    fn joint_coupled_cost_empty_specs_is_zero() {
750        assert_eq!(joint_coupled_coefficient_hessian_cost(100, &[]), 0);
751    }
752
753    #[test]
754    fn joint_coupled_cost_two_blocks() {
755        // n=10, p_total = 3+4=7 → 10 * 49 = 490
756        let specs = [make_spec(99, 3), make_spec(99, 4)];
757        assert_eq!(joint_coupled_coefficient_hessian_cost(10, &specs), 490);
758    }
759
760    // -----------------------------------------------------------------------
761    // block_offsets_from_specs
762    // -----------------------------------------------------------------------
763
764    #[test]
765    fn block_offsets_empty_is_empty() {
766        let offsets = block_offsets_from_specs(&[]);
767        assert_eq!(offsets.len(), 0);
768    }
769
770    #[test]
771    fn block_offsets_three_blocks() {
772        // p = [2, 3, 1] → [0..2, 2..5, 5..6]
773        let specs = [make_spec(1, 2), make_spec(1, 3), make_spec(1, 1)];
774        let offsets = block_offsets_from_specs(&specs);
775        assert_eq!(&offsets[0], &(0..2));
776        assert_eq!(&offsets[1], &(2..5));
777        assert_eq!(&offsets[2], &(5..6));
778    }
779
780    #[test]
781    fn block_offsets_zero_width_block() {
782        // p = [2, 0, 1] → [0..2, 2..2, 2..3]
783        let specs = [make_spec(1, 2), make_spec(1, 0), make_spec(1, 1)];
784        let offsets = block_offsets_from_specs(&specs);
785        assert_eq!(&offsets[0], &(0..2));
786        assert_eq!(&offsets[1], &(2..2));
787        assert_eq!(&offsets[2], &(2..3));
788    }
789
790    // -----------------------------------------------------------------------
791    // first_order_bfgs_loglambda_step_cap
792    // -----------------------------------------------------------------------
793
794    #[test]
795    fn step_cap_without_outer_hessian_is_some_five() {
796        assert_eq!(first_order_bfgs_loglambda_step_cap(false), Some(5.0));
797    }
798
799    #[test]
800    fn step_cap_with_outer_hessian_is_none() {
801        assert_eq!(first_order_bfgs_loglambda_step_cap(true), None);
802    }
803
804    // -----------------------------------------------------------------------
805    // cost_gated_first_order_max_iter
806    // -----------------------------------------------------------------------
807
808    #[test]
809    fn cost_gated_iter_passes_through_when_has_outer_hessian() {
810        assert_eq!(cost_gated_first_order_max_iter(100, 1_000_000_000, true), 100);
811    }
812
813    #[test]
814    fn cost_gated_iter_passes_through_at_one_iteration() {
815        assert_eq!(cost_gated_first_order_max_iter(1, 1_000_000_000_000, false), 1);
816    }
817
818    #[test]
819    fn cost_gated_iter_passes_through_zero_cost() {
820        assert_eq!(cost_gated_first_order_max_iter(200, 0, false), 200);
821    }
822
823    #[test]
824    fn cost_gated_iter_clamps_expensive_gradient() {
825        // Budget = 80_000_000_000. With cost = 1_000_000_000 the affordable
826        // count is 80. requested=200 should be clamped to 80.
827        let result = cost_gated_first_order_max_iter(200, 1_000_000_000, false);
828        assert!(result <= 80, "got {result}");
829        assert!(result >= 4, "got {result}");
830    }
831
832    #[test]
833    fn cost_gated_iter_enforces_minimum_four() {
834        // Cost so high only 1 iteration is affordable, but minimum is 4.
835        let result = cost_gated_first_order_max_iter(100, u64::MAX / 2, false);
836        assert_eq!(result, 4);
837    }
838}