Skip to main content

gam_model_api/families/custom_family/
options.rs

1//! Fit-time configuration and cost accounting: `BlockwiseFitOptions`, the
2//! outer-derivative policy + order selection, coefficient cost models, and the
3//! argument-validation asserts shared by the solver entry points.
4
5use crate::families::custom_family::family_trait::{CustomFamily, OuterEvalContext};
6use crate::families::custom_family::psi_design::{
7    CustomFamilyBlockPsiDerivative, ExactNewtonJointHessianWorkspace,
8};
9use gam_linalg::RidgePolicy;
10use gam_problem::{ParameterBlockSpec, ParameterBlockState};
11use ndarray::Array1;
12use std::ops::Range;
13use std::sync::Arc;
14use std::sync::atomic::AtomicUsize;
15
16// Moved to `gam-problem` (#1521 CustomFamily-cone inversion): the neutral,
17// dependency-free outer-objective + exact-derivative-order capability enums now
18// live in `gam_problem::family_options` and are re-exported here so every
19// `custom_family::ExactNewtonOuterObjective` / `ExactOuterDerivativeOrder` path
20// keeps resolving byte-for-byte.
21pub use gam_problem::{ExactNewtonOuterObjective, ExactOuterDerivativeOrder};
22
23// The IRLS weight / ridge floors and the block-spec consistency validator are
24// neutral (no `CustomFamily` dependency); they live in `gam-problem` and are
25// re-exported here so every `custom_family::{CUSTOM_FAMILY_WEIGHT_FLOOR,
26// CUSTOM_FAMILY_RIDGE_FLOOR, validate_blockspec_consistency}` path keeps
27// resolving and `CUSTOM_FAMILY_WEIGHT_FLOOR` stays sourced from the canonical
28// `MIN_WEIGHT` floor rather than a hardcoded literal.
29pub use gam_problem::{
30    CUSTOM_FAMILY_RIDGE_FLOOR, CUSTOM_FAMILY_WEIGHT_FLOOR, validate_blockspec_consistency,
31};
32
33/// Exact outer derivative order for families that expose second-order
34/// coefficient geometry.
35///
36/// This used to be a cost gate that demoted large large-scale problems to
37/// first-order BFGS. That was a policy leak into the math layer: if the family
38/// supplies analytic dense Hessian blocks or an analytic profiled-Hessian HVP,
39/// the outer optimizer should see the exact second-order objective. Runtime
40/// representation choices (dense vs operator) belong below this declaration,
41/// not in a first-order downgrade.
42/// Precondition check for the family capability / operator hooks (e.g.
43/// `batched_outer_hessian_terms`, `outer_hyper_hessian_operator`).
44///
45/// These hooks operate on whatever block geometry the caller has assembled and
46/// must validate the *consistency* of the specs they are handed — never the
47/// fit-level "at least one block" precondition, which belongs to the fit entry
48/// points (`validate_blockspecs`). An empty, self-consistent argument set is a
49/// valid no-op probe of the operator path (the operator may ignore the specs
50/// entirely), so it must not panic here.
51pub(crate) fn assert_valid_blockspecs(specs: &[ParameterBlockSpec], context: &str) {
52    assert!(
53        validate_blockspec_consistency(specs).is_ok(),
54        "{context}: inconsistent parameter block specs"
55    );
56}
57
58pub(crate) fn assert_valid_options(options: &BlockwiseFitOptions, context: &str) {
59    assert!(
60        options.inner_tol.is_finite() && options.inner_tol >= 0.0,
61        "{context}: inner_tol must be finite and non-negative"
62    );
63    assert!(
64        options.outer_tol.is_finite() && options.outer_tol >= 0.0,
65        "{context}: outer_tol must be finite and non-negative"
66    );
67    assert!(
68        options.minweight.is_finite() && options.minweight >= 0.0,
69        "{context}: minweight must be finite and non-negative"
70    );
71    assert!(
72        options.ridge_floor.is_finite() && options.ridge_floor >= 0.0,
73        "{context}: ridge_floor must be finite and non-negative"
74    );
75    if let Some(threshold) = options.early_exit_threshold {
76        assert!(
77            threshold.is_finite(),
78            "{context}: early_exit_threshold must be finite"
79        );
80    }
81}
82
83pub(crate) fn assert_states_match_specs(
84    states: &[ParameterBlockState],
85    specs: &[ParameterBlockSpec],
86    context: &str,
87) {
88    assert_eq!(
89        states.len(),
90        specs.len(),
91        "{context}: state/spec block count mismatch"
92    );
93    for (block, (state, spec)) in states.iter().zip(specs).enumerate() {
94        assert_eq!(
95            state.beta.len(),
96            spec.design.ncols(),
97            "{context}: beta length mismatch in block {block}"
98        );
99        // `state.eta` is produced from `solver_design()` (see
100        // `refresh_all_block_etas`), which is `stacked_design` when set
101        // (3·n_obs rows for survival LS time-varying blocks) and `design`
102        // (n_obs rows) otherwise. Use the same accessor here.
103        assert_eq!(
104            state.eta.len(),
105            spec.solver_design().nrows(),
106            "{context}: eta length mismatch in block {block}"
107        );
108    }
109}
110
111pub(crate) fn assert_derivative_blocks_match_specs(
112    derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
113    specs: &[ParameterBlockSpec],
114    context: &str,
115) {
116    assert_eq!(
117        derivative_blocks.len(),
118        specs.len(),
119        "{context}: derivative/spec block count mismatch"
120    );
121}
122
123pub(crate) fn assert_rho_matches_specs(
124    rho: &Array1<f64>,
125    specs: &[ParameterBlockSpec],
126    context: &str,
127) {
128    let expected = specs.iter().map(|spec| spec.penalties.len()).sum::<usize>();
129    assert_eq!(
130        rho.len(),
131        expected,
132        "{context}: rho length does not match penalty count"
133    );
134}
135
136pub(crate) fn validate_hessian_workspace_ready(
137    hessian_workspace: &Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
138    context: &str,
139) -> Result<(), String> {
140    if let Some(workspace) = hessian_workspace.as_ref() {
141        workspace
142            .warm_up_outer_caches()
143            .map_err(|err| format!("{context}: failed to warm Hessian workspace caches: {err}"))?;
144    }
145    Ok(())
146}
147
148pub fn exact_outer_order_from_capability(
149    specs: &[ParameterBlockSpec],
150    coefficient_cost: u64,
151) -> ExactOuterDerivativeOrder {
152    assert_valid_blockspecs(specs, "exact outer derivative order");
153    match coefficient_cost {
154        0 => ExactOuterDerivativeOrder::Second,
155        _ => ExactOuterDerivativeOrder::Second,
156    }
157}
158
159/// Capability-aware variant of [`exact_outer_order_from_capability`].
160///
161/// Kept as the public declaration helper for existing family impls, but it no
162/// longer gates by cost. Once a caller has established dense or HVP analytic
163/// second-order support, the correct derivative order is `Second`.
164pub fn exact_outer_order_with_outer_hvp(
165    specs: &[ParameterBlockSpec],
166    coefficient_cost: u64,
167    outer_hyper_hessian_hvp_available: bool,
168) -> ExactOuterDerivativeOrder {
169    if outer_hyper_hessian_hvp_available {
170        assert_valid_blockspecs(specs, "exact outer derivative order with HVP");
171        match coefficient_cost {
172            0 => ExactOuterDerivativeOrder::Second,
173            _ => ExactOuterDerivativeOrder::Second,
174        }
175    } else {
176        exact_outer_order_from_capability(specs, coefficient_cost)
177    }
178}
179
180/// Realized outer-derivative policy at the current problem size.
181///
182/// Capability (the family can produce exact second-order calculus) controls
183/// whether the Hessian is declared. Runtime cost controls only representation
184/// and staging choices below this layer. Large problems must stay on the exact
185/// analytic Hessian path and use an operator representation when dense assembly
186/// is too expensive; they are not demoted to first-order BFGS here.
187///
188/// `OuterDerivativePolicy` records the family's *capability*, the *predicted
189/// per-eval cost* for both gradient-only and Hessian paths, and exposes the
190/// two policy queries the outer optimizer actually needs:
191///
192/// * [`order_for_evaluation`](Self::order_for_evaluation) — clamp a requested
193///   evaluation order against the policy gate.
194/// * [`declared_hessian_form`](Self::declared_hessian_form) — what shape the
195///   outer-strategy planner should declare to its plan ladder.
196/// * [`should_use_staged_kappa`](Self::should_use_staged_kappa) — auto-route
197///   the κ optimizer through the pilot/polish schedule at large `n`.
198///
199/// All thresholds are *const* — no env vars, no CLI flags. The cost model is
200/// the family's own `coefficient_gradient_cost` / `coefficient_hessian_cost`
201/// scaled by the joint outer-coordinate dimension, with `saturating_mul` so
202/// overflow rounds up to the budget ceiling rather than wrapping silently.
203#[derive(Clone, Copy, Debug)]
204pub struct OuterDerivativePolicy {
205    /// What exact calculus the family advertises in principle.
206    pub capability: ExactOuterDerivativeOrder,
207    /// Predicted per-eval work for one `ValueGradientHessian` evaluation.
208    /// Rounded conservatively *up* via `saturating_mul`. Informational for
209    /// representation and diagnostics; it does not disable Hessian capability.
210    pub predicted_hessian_work: u128,
211    /// Predicted per-eval work for one `ValueAndGradient` evaluation.
212    /// Rounded conservatively *up* via `saturating_mul`.
213    pub predicted_gradient_work: u128,
214    /// True when the family's outer-only paths consume
215    /// [`BlockwiseFitOptions::outer_score_subsample`] and produce
216    /// Horvitz-Thompson-weighted partial sums (i.e. the family overrides
217    /// `log_likelihood_only_with_options`,
218    /// `exact_newton_joint_psi_workspace_with_options`, and any other
219    /// outer-only hooks reached by `evaluate_custom_family_joint_hyper`).
220    ///
221    /// Determines whether the κ optimizer's pilot/polish staging schedule
222    /// engages: when this is `false`, [`Self::should_use_staged_kappa`]
223    /// returns `false` regardless of `n`. Engaging the schedule on a
224    /// family that ignores the subsample is strictly worse than not
225    /// engaging it — the schedule builds a `RowSet::Subsample` and the
226    /// boundary plumbing installs an `OuterScoreSubsample` on options,
227    /// but the family's default outer-only paths fall back to full-data
228    /// sums, so the pilot evaluation costs the same as the polish but
229    /// adds a Vec allocation per eval.
230    ///
231    /// Families that do **not** consume the subsample (default for new
232    /// implementations, including the GAMLSS location-scale families
233    /// today) leave this `false`. Families that do consume (today:
234    /// `BernoulliMarginalSlopeFamily`) override `outer_derivative_policy`
235    /// to set this `true`.
236    pub subsample_capable: bool,
237}
238
239impl OuterDerivativePolicy {
240    /// Per-eval gradient work ceiling above which the κ schedule switches
241    /// to the staged pilot/polish path. At large scale (n ≳ 100 k) even
242    /// the gradient sweep takes minutes per outer iter; subsampling the
243    /// pilot stage cuts that to seconds and leaves the final polish on
244    /// full data to recover the MLE.
245    pub const OUTER_GRADIENT_WORK_BUDGET: u128 = 50_000_000_000;
246
247    /// Pilot subsample auto-engages when full-data `n` exceeds this. Below
248    /// this the κ schedule collapses to a single full-data stage —
249    /// behaviour identical to the pre-P7 path.
250    pub const STAGED_KAPPA_TRIGGER_N: usize = 30_000;
251
252    /// Clamp a requested evaluation order against the policy gate.
253    ///
254    /// Returns the highest order this policy permits for the requested order:
255    /// * `ValueGradientHessian` requested → keep only if `declared_hessian_form`
256    ///   is something other than `Unavailable`.
257    /// * `ValueAndGradient` requested → always permitted (gradient-only is
258    ///   universal).
259    pub fn order_for_evaluation(&self, requested: crate::OuterEvalOrder) -> crate::OuterEvalOrder {
260        use crate::OuterEvalOrder;
261        match requested {
262            // Value-only is universal: every policy can evaluate the bare
263            // objective, so the request passes through unclamped.
264            OuterEvalOrder::Value => OuterEvalOrder::Value,
265            OuterEvalOrder::ValueAndGradient => OuterEvalOrder::ValueAndGradient,
266            OuterEvalOrder::ValueGradientHessian => {
267                if matches!(
268                    self.declared_hessian_form(),
269                    gam_problem::DeclaredHessianForm::Unavailable
270                ) {
271                    OuterEvalOrder::ValueAndGradient
272                } else {
273                    OuterEvalOrder::ValueGradientHessian
274                }
275            }
276        }
277    }
278
279    /// Outer Hessian declaration for the outer-strategy planner.
280    ///
281    /// `Either` ⇔ capability has Hessian. Work estimates select dense vs
282    /// operator assembly later; they must not erase analytic second-order
283    /// capability from the planner.
284    pub fn declared_hessian_form(&self) -> gam_problem::DeclaredHessianForm {
285        use gam_problem::DeclaredHessianForm;
286        if !self.capability.has_hessian() {
287            return DeclaredHessianForm::Unavailable;
288        }
289        DeclaredHessianForm::Either
290    }
291
292    /// True when the κ optimizer should auto-route through the staged
293    /// pilot/polish schedule. Triggers when **either** the data is big
294    /// (`n ≥ STAGED_KAPPA_TRIGGER_N`) **or** the per-eval gradient work
295    /// exceeds `OUTER_GRADIENT_WORK_BUDGET`. The second clause catches
296    /// problems with moderate `n` but very wide design (large `p_total`
297    /// or `psi_dim`) where a single full-data gradient sweep still
298    /// dominates the κ trajectory.
299    pub fn should_use_staged_kappa(&self, n: usize) -> bool {
300        if !self.subsample_capable {
301            // Family does not consume `outer_score_subsample` on its
302            // outer-only paths. Engaging the schedule would build a
303            // pilot `RowSet::Subsample` whose only effect is per-eval
304            // Vec/Arc bookkeeping — the underlying coefficient gradient
305            // would still sum every row. Gate the schedule off until
306            // the family override declares consumption.
307            return false;
308        }
309        n >= Self::STAGED_KAPPA_TRIGGER_N
310            || self.predicted_gradient_work > Self::OUTER_GRADIENT_WORK_BUDGET
311    }
312}
313
314/// Total outer-coordinate dimensionality used by the default policy work
315/// model: `rho_dim + psi_dim`. Each outer evaluation propagates one
316/// directional derivative per outer coordinate through the inner solve.
317#[inline]
318pub(crate) fn outer_coord_dim_for_policy(specs: &[ParameterBlockSpec], psi_dim: usize) -> u128 {
319    let rho_total: u128 = specs
320        .iter()
321        .map(|s| s.penalties.len() as u128)
322        .fold(0u128, |acc, k| acc.saturating_add(k));
323    rho_total.saturating_add(psi_dim as u128)
324}
325
326/// Default predicted-cost model for [`OuterDerivativePolicy`]:
327///
328/// * gradient work ≈ `coefficient_gradient_cost · (rho_dim + psi_dim)`
329/// * Hessian work  ≈ `coefficient_hessian_cost  · (rho_dim + psi_dim)`
330///
331/// Each outer coordinate triggers one analytic directional derivative
332/// through the inner solve; the dense Hessian assembly carries the extra
333/// `O(p_total)` factor already captured by `coefficient_hessian_cost`.
334///
335/// All multiplications saturate so an overflow rounds *up* to the gate
336/// ceiling: we'd rather drop one Hessian evaluation that we could have
337/// afforded than crash on a 600 s eval.
338pub fn default_outer_derivative_policy_costs(
339    specs: &[ParameterBlockSpec],
340    psi_dim: usize,
341    grad_cost: u64,
342    hess_cost: u64,
343) -> (u128, u128) {
344    let k = outer_coord_dim_for_policy(specs, psi_dim);
345    let grad = (grad_cost as u128).saturating_mul(k.max(1));
346    let hess = (hess_cost as u128).saturating_mul(k.max(1));
347    (grad, hess)
348}
349
350/// Default coefficient-space Hessian cost: `Σ_b n_b · p_b²`, summed across
351/// blocks. Represents the work to assemble or apply the dense block-diagonal
352/// inner Hessian once.
353pub fn default_coefficient_hessian_cost(specs: &[ParameterBlockSpec]) -> u64 {
354    specs
355        .iter()
356        .map(|s| {
357            let n = s.design.nrows() as u64;
358            let p = s.design.ncols() as u64;
359            n.saturating_mul(p.saturating_mul(p))
360        })
361        .fold(0u64, |acc, c| acc.saturating_add(c))
362}
363
364/// Joint-coupled coefficient-space Hessian cost: `n · (Σ_b p_b)²`. The honest
365/// per-evaluation work for any family whose row likelihood couples every block
366/// (every observation contributes a rank-`m` outer-product update to the full
367/// joint Hessian over `Σ p_b` coefficients), as opposed to the block-diagonal
368/// `default_coefficient_hessian_cost` which assumes each `X_b' W_b X_b` is
369/// assembled independently.
370///
371/// Used by all GAMLSS, marginal-slope, and joint-latent families. CTN does
372/// not delegate here — it uses its Khatri–Rao factor dimensions internally.
373pub fn joint_coupled_coefficient_hessian_cost(n: u64, specs: &[ParameterBlockSpec]) -> u64 {
374    let p_total: u64 = specs
375        .iter()
376        .map(|s| s.design.ncols() as u64)
377        .fold(0u64, |acc, p| acc.saturating_add(p));
378    n.saturating_mul(p_total.saturating_mul(p_total))
379}
380
381/// Default coefficient-space gradient cost: half the Hessian cost.
382///
383/// The first-order analytic gradient in the unified evaluator runs the same
384/// inner Newton solve as the second-order path but skips the `K`-fold
385/// pairwise Hessian assembly (`B_{j,k}` blocks) and the `K`-fold inner
386/// derivative solves; what remains is the inner solve plus a single
387/// gradient-only sweep through the data. Empirically this is roughly half
388/// the per-evaluation arithmetic of forming the dense Hessian, hence the
389/// `/2` default. Families whose gradient assembly differs structurally
390/// (e.g. matrix-free Hv operators with no dense Hessian assembly to halve)
391/// should override [`CustomFamily::coefficient_gradient_cost`] explicitly.
392pub fn default_coefficient_gradient_cost(specs: &[ParameterBlockSpec]) -> u64 {
393    default_coefficient_hessian_cost(specs) / 2
394}
395
396/// Compute β-block column ranges from a slice of `ParameterBlockSpec`s.
397///
398/// Returns one `Range<usize>` per spec, covering the spec's columns in the
399/// concatenated β vector (i.e. `offset .. offset + p_block` where `p_block =
400/// spec.design.ncols()`). The ranges are non-overlapping, sorted, and their
401/// union covers `0..Σ p_block`.
402///
403/// This is the canonical source of `block_offsets` for every
404/// [`crate::solver::arrow_schur::ArrowSchurSystem`] built for a custom family
405/// (survival, GAMLSS, transformation-normal, latent-survival, marginal-slope,
406/// …). Pass the result to
407/// [`crate::solver::arrow_schur::ArrowSchurSystem::set_block_offsets`] before
408/// calling `solve` or `solve_with_options` whenever the system will use
409/// [`crate::solver::arrow_schur::ArrowSolverMode::InexactPCG`].
410///
411/// Specs with zero columns produce a zero-width range; callers that want to
412/// skip trivial blocks may filter on `r.start < r.end` after calling this
413/// function.
414pub fn block_offsets_from_specs(specs: &[ParameterBlockSpec]) -> Arc<[Range<usize>]> {
415    let mut ranges: Vec<Range<usize>> = Vec::with_capacity(specs.len());
416    let mut cursor = 0usize;
417    for spec in specs {
418        let p = spec.design.ncols();
419        ranges.push(cursor..cursor + p);
420        cursor += p;
421    }
422    Arc::from(ranges.into_boxed_slice())
423}
424
425/// Bound first-order outer iterations when each analytic-gradient evaluation is
426/// already large-scale work. This is only applied after the planner has
427/// selected a gradient-only route; second-order/ARC plans keep their requested
428/// iteration budget.
429pub fn cost_gated_first_order_max_iter(
430    requested: usize,
431    coefficient_gradient_cost: u64,
432    has_outer_hessian: bool,
433) -> usize {
434    const FIRST_ORDER_OUTER_WORK_BUDGET: u64 = 80_000_000_000;
435    const MIN_FIRST_ORDER_ITERS: usize = 4;
436
437    if has_outer_hessian || requested <= 1 || coefficient_gradient_cost == 0 {
438        return requested;
439    }
440
441    let affordable = (FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost) as usize;
442    requested.min(affordable.max(MIN_FIRST_ORDER_ITERS))
443}
444
445/// Local trust budget for first-order outer BFGS on log-smoothing parameters.
446///
447/// One unit in `rho = log(lambda)` is an `e`-fold smoothing-parameter change.
448/// Previously this cap was `1.0`, which throttled BFGS to ~1/5 of its
449/// quasi-Newton step on flat REML surfaces (the natural BFGS direction has
450/// `|d|_inf` of ~5 in log-λ for large-scale survival fits). Probes whose
451/// `step_inf > cap` are rejected for free in `OuterFirstOrderBridge::eval_cost`
452/// (returning `BFGS_LINE_SEARCH_REJECT_COST` without running an inner solve),
453/// so a larger cap costs nothing on rejection — it only lets Strong-Wolfe
454/// accept bigger steps that the inner-PIRLS divergence guard can already
455/// validate. `5.0` allows up to `e^5 ≈ 148`-fold smoothing-parameter change
456/// per accepted outer iter, which matches the typical quasi-Newton direction
457/// magnitude while still bounding pathological probes.
458pub const fn first_order_bfgs_loglambda_step_cap(has_outer_hessian: bool) -> Option<f64> {
459    if has_outer_hessian { None } else { Some(5.0) }
460}
461
462pub fn exact_newton_outer_geometry_supports_second_order_solver<F: CustomFamily + ?Sized>(
463    family: &F,
464) -> bool {
465    family.exact_newton_outerobjective() == ExactNewtonOuterObjective::StrictPseudoLaplace
466}
467
468/// Stable public API for installing outer-score subsampling.
469#[derive(Clone)]
470pub struct BlockwiseFitOptions {
471    pub inner_max_cycles: usize,
472    pub inner_tol: f64,
473    pub outer_max_iter: usize,
474    pub outer_tol: f64,
475    /// Optional override for the OUTER smoothing optimizer's
476    /// *relative-cost-decrease* convergence stop, decoupled from `outer_tol`.
477    ///
478    /// The outer convergence test derives BOTH the absolute projected-gradient
479    /// floor (`max(outer_tol, n·1e-9)`) AND the relative-cost stop
480    /// (`rel_cost = outer_tol`) from the single `outer_tol`. A caller that needs
481    /// a *tight absolute floor* to resolve λ to the genuine REML optimum at
482    /// large `n` (where the floor is `n·1e-9`) is then forced to also accept a
483    /// *tight rel-cost stop*, which on a flat REML ridge never trips and grinds
484    /// the optimizer to `outer_max_iter` — dozens of surplus O(D·p³)
485    /// Laplace-derivative outer iterations (the #1082 multinomial
486    /// smooth-by-factor wall-clock blow-up). When `Some(r)`, the rel-cost stop
487    /// uses `r` while the absolute floor keeps using `outer_tol`, so accuracy
488    /// (absolute floor) and perf (loose rel-cost) are selected independently.
489    /// `None` preserves the legacy coupling (`rel_cost = outer_tol`) for every
490    /// existing caller byte-for-byte.
491    pub outer_rel_cost_tol: Option<f64>,
492    /// Lower box bound for smoothing coordinates ρ = log λ.
493    ///
494    /// The default preserves the historical custom-family domain
495    /// `λ >= exp(-10)`. Families with known calibration failures at the
496    /// near-zero penalty boundary can raise this lower bound without changing
497    /// the upper effective-df cap or adding family-specific branches inside the
498    /// optimizer.
499    pub rho_lower_bound: f64,
500    pub minweight: f64,
501    pub ridge_floor: f64,
502    /// Shared ridge semantics used by solve/quadratic/logdet terms.
503    pub ridge_policy: RidgePolicy,
504    /// If true, outer smoothing optimization uses a Laplace/REML-style objective:
505    ///   -loglik + penalty + 0.5(log|H| - log|S|_+)
506    /// where H is blockwise working curvature and S is blockwise penalty.
507    pub use_remlobjective: bool,
508    /// If false, the outer smoothing optimizer uses exact gradients but does
509    /// not request an analytic outer Hessian from the family.
510    pub use_outer_hessian: bool,
511    /// If false, skip post-fit joint covariance assembly.
512    pub compute_covariance: bool,
513    /// Shared cap engaged during seed screening so cost-only evaluations can
514    /// stop inner iterations early without affecting the full solve.
515    pub screening_max_inner_iterations: Option<Arc<AtomicUsize>>,
516    /// Shared cap engaged during regular outer iterations. Unlike screening,
517    /// this is only a budget: capped solves still have to earn the ordinary
518    /// KKT certificate before derivatives may be exposed.
519    pub outer_inner_max_iterations: Option<Arc<AtomicUsize>>,
520    /// Optional line-search objective ceiling for lazy log-likelihood-only
521    /// evaluations. Families whose per-row log-likelihood contributions are
522    /// non-positive may stop once the partial negative log-likelihood is already
523    /// above this ceiling, because the unvisited rows cannot improve the trial
524    /// objective enough to be accepted. Default `None` preserves exact full-sum
525    /// behavior and is the only mode used outside backtracking rejection tests.
526    pub early_exit_threshold: Option<f64>,
527    /// Stable public API for installing outer-score subsampling.
528    ///
529    /// Optional stratified row subsample used by outer-only score/gradient
530    /// passes. When `Some(s)`, outer score/gradient hot loops should iterate
531    /// only over `s.rows` and multiply each contribution by that row's
532    /// Horvitz-Thompson inverse-inclusion weight. Inner-PIRLS and final
533    /// covariance passes always run on the full data, so this field is
534    /// consulted only by outer-only call sites. Default `None` preserves the
535    /// full-data behavior. Wrapping in `Arc` keeps `Clone` cheap across the
536    /// many places `BlockwiseFitOptions` is duplicated per-eval.
537    pub outer_score_subsample: Option<Arc<crate::OuterScoreSubsample>>,
538    /// Gate for marginal-slope families to auto-derive a stratified
539    /// outer-score subsample at large scale (see
540    /// [`crate::families::marginal_slope_shared::auto_outer_score_subsample`]).
541    ///
542    /// **Default `true`.** Auto-subsampling makes the early rho-gradient
543    /// evaluations unbiased stochastic estimators with bounded relative
544    /// variance (≈ 1 % at the conservative defaults), then the family switches
545    /// back to full-data gradients for the remaining outer iterations. That
546    /// keeps large marginal-slope fits fast during the high-motion part of the
547    /// trajectory while preserving the default tight `outer_tol` polish on
548    /// exact gradients. For small datasets the auto path declines to install a
549    /// mask and the fit remains full-data throughout.
550    ///
551    /// When `outer_score_subsample` is already `Some(...)` the auto
552    /// path is bypassed entirely (caller-provided masks always win).
553    pub auto_outer_subsample: bool,
554    /// Outer-evaluation context populated by the smoothing optimizer at
555    /// the top of each real outer derivative evaluation. Used by
556    /// auto-subsample install paths to key the stratified mask on the
557    /// outer ρ rather than the inner β proxy: during the inner trust-
558    /// region / coefficient line search β changes on every trial step,
559    /// so keying on β re-fires phase prints (and re-shuffles the mask)
560    /// inside a single outer eval. Keying on (rho, eval_id) instead
561    /// keeps the mask stable across the inner Newton at one ρ, and
562    /// suppresses auto-subsample entirely on inner trial evaluations via
563    /// the [`EvalScope::InnerCoefficient`] tag set by
564    /// [`coefficient_line_search_options`].
565    ///
566    /// `None` preserves legacy behavior (no context — install paths fall
567    /// back to "no auto-subsample"). Default `None`.
568    pub outer_eval_context: Option<OuterEvalContext>,
569    /// Optional persistent warm-start cache session. When `Some`, the
570    /// outer smoothing optimizer consults the on-disk cache before
571    /// starting (to seed θ from the last accepted iterate) and writes
572    /// checkpoints + a final entry on completion. When `None`, the fit
573    /// runs cold and writes nothing — the default for unit tests and
574    /// any caller that pinned a deterministic optimum.
575    ///
576    /// Callers that need cross-process reuse must supply the session
577    /// explicitly; ordinary workflow fits leave this empty so refit-heavy
578    /// loops do not touch the shared on-disk store.
579    pub cache_session: Option<Arc<gam_runtime::warm_start::Session>>,
580    /// Optional mirror sessions that receive a copy of the final-result
581    /// finalize() write. Callers can use this to broadcast a converged ρ to
582    /// additional keyspace(s) so future fits with related structure can
583    /// warm-start from this run. Writes still pass through the session rate
584    /// limiter, so mirroring checkpoints does not add unbounded I/O.
585    pub cache_mirror_sessions: Vec<Arc<gam_runtime::warm_start::Session>>,
586    /// Optional bundle of cross-block (full-width) penalties, paired with
587    /// their current `log λ` values from the outer ρ vector. When `Some`,
588    /// the inner joint-Newton primitives add the contributions
589    ///
590    /// * objective: `½ Σ_j exp(ρ_j) βᵀ S_j β`
591    /// * gradient:  `Σ_j exp(ρ_j) S_j β`
592    /// * Hessian:   `Σ_j exp(ρ_j) S_j`
593    ///
594    /// in addition to the per-block penalty stack assembled from
595    /// `ParameterBlockSpec.penalties`. The per-block path is unchanged.
596    /// `None` preserves legacy behaviour for every existing caller.
597    pub joint_penalties: Option<Arc<crate::JointPenaltyBundle>>,
598    /// Whether the outer smoothing optimizer screens the explicit
599    /// `initial_rho` seed through the seed-screening cascade before the
600    /// solver starts.
601    ///
602    /// **Default `true`** — the general path benefits from ranking the
603    /// initial seed against the generated exploration seeds via cheap
604    /// capped proxy fits.
605    ///
606    /// A caller sets this `false` when `initial_rho` is already the correct,
607    /// identified optimum for its regime so that re-screening it adds only
608    /// cost. The survival location-scale constant-scale (parametric-AFT)
609    /// path uses this: its time-warp ρ seed is pinned AT the inner ρ box
610    /// bound (the affine-baseline limit), where the REML/LAML profile is a
611    /// dead-flat unidentified ridge. Running the screening cascade there
612    /// drives each proxy fit (and, when every capped stage collapses to
613    /// non-finite cost, the uncapped final stage) into a full inner solve on
614    /// the near-singular flat Hessian — the source of the multi-minute
615    /// no-iteration-log stall (#736, #735, #721). Skipping screening lets the
616    /// already-correct seed flow straight to the outer solver, which certifies
617    /// box-constraint stationarity at iteration 0. Genuinely flexible regimes
618    /// (smooth scale / spatial) leave this `true` and keep full screening.
619    pub screen_initial_rho: bool,
620    /// Set ONLY while the inner solve is invoked from the seed-screening proxy
621    /// (`custom_family_seed_screening_proxy_labeled`), which RANKS candidate
622    /// seeds by their penalized objective and never produces the final fit.
623    ///
624    /// When `true`, the inner joint-Newton skips the full per-axis
625    /// Jeffreys/Firth curvature (`custom_family_joint_jeffreys_term`'s
626    /// `for k in 0..p` directional-derivative loop, O(p · per-axis-Hdot) per
627    /// cycle), keeping ONLY the cheap value-only Jeffreys term
628    /// (`custom_family_joint_jeffreys_value`, one reduced-info eigendecomposition)
629    /// in the screening score. The per-axis gradient/curvature is what the inner
630    /// Newton step needs to *converge* a near-separating fit; the screening proxy
631    /// is capped and only ranks, so it does not need step convergence — it needs
632    /// a finite, separation-aware score cheaply. For a K-block coupled family
633    /// (Dirichlet/multinomial) each per-axis directional derivative is itself
634    /// O(K²·n·p), so running the full term for every cascade candidate over the
635    /// joint width `p` is the wrong cost class and made the coupled fit
636    /// non-completing during screening alone (gam#729/#808). The actual fit
637    /// (after a seed is selected) runs with this `false`, so the load-bearing
638    /// Firth curvature is fully present where it matters.
639    ///
640    /// **Default `false`** — only the screening proxy sets it `true`.
641    pub seed_screening: bool,
642}
643
644pub const DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES: usize = 1200;
645
646impl Default for BlockwiseFitOptions {
647    fn default() -> Self {
648        Self {
649            // Large-scale custom-family marginal-slope fits can have a
650            // long, monotone joint-Newton tail: objective and step size keep
651            // shrinking, but the exact KKT residual may need several hundred
652            // additional cycles after the old 300-cycle cap. The outer
653            // REML/LAML derivative path is correct only at a stationary inner
654            // mode, so a merely descended iterate must not be accepted as
655            // converged. Use a production-sized cap by default and rely on the
656            // KKT/objective certificates to exit early for well-conditioned
657            // Gaussian, logistic, and small-n fits.
658            inner_max_cycles: DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES,
659            inner_tol: 1e-6,
660            outer_max_iter: 60,
661            outer_tol: 1e-5,
662            outer_rel_cost_tol: None,
663            rho_lower_bound: -10.0,
664            minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
665            // `ridge_floor` is an ExplicitPrior in the canonical
666            // stabilization ledger taxonomy (`StabilizationKind::ExplicitPrior`):
667            // its δ enters the quadratic term, the Laplace Hessian, and the
668            // penalty log-determinant — `ridge_policy` below is the live
669            // policy that confirms which terms it lands in. The default
670            // pos-part policy enables every inclusion flag, so callers
671            // wanting solver-only damping should construct a custom policy
672            // (or, preferably, a `StabilizationLedger::numerical_perturbation`)
673            // rather than reusing this field.
674            ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
675            ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
676            use_remlobjective: true,
677            // Default ON: families expose exact outer Hessians whenever their
678            // analytic dense or operator representation is implemented.
679            use_outer_hessian: true,
680            compute_covariance: false,
681            screening_max_inner_iterations: None,
682            outer_inner_max_iterations: None,
683            seed_screening: false,
684            early_exit_threshold: None,
685            outer_score_subsample: None,
686            auto_outer_subsample: true,
687            outer_eval_context: None,
688            cache_session: None,
689            cache_mirror_sessions: Vec::new(),
690            joint_penalties: None,
691            screen_initial_rho: true,
692        }
693    }
694}
695
696#[cfg(test)]
697mod tests {
698    use super::*;
699    use gam_linalg::matrix::DesignMatrix;
700    use ndarray::Array2;
701
702    fn make_spec(nrows: usize, ncols: usize) -> ParameterBlockSpec {
703        ParameterBlockSpec {
704            design: DesignMatrix::from(Array2::<f64>::zeros((nrows, ncols))),
705            ..ParameterBlockSpec::defaults()
706        }
707    }
708
709    // -----------------------------------------------------------------------
710    // default_coefficient_hessian_cost
711    // -----------------------------------------------------------------------
712
713    #[test]
714    fn hessian_cost_empty_specs_is_zero() {
715        assert_eq!(default_coefficient_hessian_cost(&[]), 0);
716    }
717
718    #[test]
719    fn hessian_cost_single_block() {
720        // n=10, p=3 → 10 * 3^2 = 90
721        let spec = make_spec(10, 3);
722        assert_eq!(default_coefficient_hessian_cost(&[spec]), 90);
723    }
724
725    #[test]
726    fn hessian_cost_two_blocks_sum() {
727        // n=10, p=3 → 90; n=5, p=4 → 5*16=80; total=170
728        let specs = [make_spec(10, 3), make_spec(5, 4)];
729        assert_eq!(default_coefficient_hessian_cost(&specs), 170);
730    }
731
732    // -----------------------------------------------------------------------
733    // default_coefficient_gradient_cost
734    // -----------------------------------------------------------------------
735
736    #[test]
737    fn gradient_cost_is_half_hessian_cost() {
738        let specs = [make_spec(10, 3)];
739        let hess = default_coefficient_hessian_cost(&specs);
740        assert_eq!(default_coefficient_gradient_cost(&specs), hess / 2);
741    }
742
743    // -----------------------------------------------------------------------
744    // joint_coupled_coefficient_hessian_cost
745    // -----------------------------------------------------------------------
746
747    #[test]
748    fn joint_coupled_cost_empty_specs_is_zero() {
749        assert_eq!(joint_coupled_coefficient_hessian_cost(100, &[]), 0);
750    }
751
752    #[test]
753    fn joint_coupled_cost_two_blocks() {
754        // n=10, p_total = 3+4=7 → 10 * 49 = 490
755        let specs = [make_spec(99, 3), make_spec(99, 4)];
756        assert_eq!(joint_coupled_coefficient_hessian_cost(10, &specs), 490);
757    }
758
759    // -----------------------------------------------------------------------
760    // block_offsets_from_specs
761    // -----------------------------------------------------------------------
762
763    #[test]
764    fn block_offsets_empty_is_empty() {
765        let offsets = block_offsets_from_specs(&[]);
766        assert_eq!(offsets.len(), 0);
767    }
768
769    #[test]
770    fn block_offsets_three_blocks() {
771        // p = [2, 3, 1] → [0..2, 2..5, 5..6]
772        let specs = [make_spec(1, 2), make_spec(1, 3), make_spec(1, 1)];
773        let offsets = block_offsets_from_specs(&specs);
774        assert_eq!(&offsets[0], &(0..2));
775        assert_eq!(&offsets[1], &(2..5));
776        assert_eq!(&offsets[2], &(5..6));
777    }
778
779    #[test]
780    fn block_offsets_zero_width_block() {
781        // p = [2, 0, 1] → [0..2, 2..2, 2..3]
782        let specs = [make_spec(1, 2), make_spec(1, 0), make_spec(1, 1)];
783        let offsets = block_offsets_from_specs(&specs);
784        assert_eq!(&offsets[0], &(0..2));
785        assert_eq!(&offsets[1], &(2..2));
786        assert_eq!(&offsets[2], &(2..3));
787    }
788
789    // -----------------------------------------------------------------------
790    // first_order_bfgs_loglambda_step_cap
791    // -----------------------------------------------------------------------
792
793    #[test]
794    fn step_cap_without_outer_hessian_is_some_five() {
795        assert_eq!(first_order_bfgs_loglambda_step_cap(false), Some(5.0));
796    }
797
798    #[test]
799    fn step_cap_with_outer_hessian_is_none() {
800        assert_eq!(first_order_bfgs_loglambda_step_cap(true), None);
801    }
802
803    // -----------------------------------------------------------------------
804    // cost_gated_first_order_max_iter
805    // -----------------------------------------------------------------------
806
807    #[test]
808    fn cost_gated_iter_passes_through_when_has_outer_hessian() {
809        assert_eq!(cost_gated_first_order_max_iter(100, 1_000_000_000, true), 100);
810    }
811
812    #[test]
813    fn cost_gated_iter_passes_through_at_one_iteration() {
814        assert_eq!(cost_gated_first_order_max_iter(1, 1_000_000_000_000, false), 1);
815    }
816
817    #[test]
818    fn cost_gated_iter_passes_through_zero_cost() {
819        assert_eq!(cost_gated_first_order_max_iter(200, 0, false), 200);
820    }
821
822    #[test]
823    fn cost_gated_iter_clamps_expensive_gradient() {
824        // Budget = 80_000_000_000. With cost = 1_000_000_000 the affordable
825        // count is 80. requested=200 should be clamped to 80.
826        let result = cost_gated_first_order_max_iter(200, 1_000_000_000, false);
827        assert!(result <= 80, "got {result}");
828        assert!(result >= 4, "got {result}");
829    }
830
831    #[test]
832    fn cost_gated_iter_enforces_minimum_four() {
833        // Cost so high only 1 iteration is affordable, but minimum is 4.
834        let result = cost_gated_first_order_max_iter(100, u64::MAX / 2, false);
835        assert_eq!(result, 4);
836    }
837}