gam_model_api/families/custom_family/options.rs
1//! Fit-time configuration and cost accounting: `BlockwiseFitOptions`, the
2//! outer-derivative policy + order selection, coefficient cost models, and the
3//! argument-validation asserts shared by the solver entry points.
4
5use crate::families::custom_family::family_trait::{CustomFamily, OuterEvalContext};
6use crate::families::custom_family::psi_design::{
7 CustomFamilyBlockPsiDerivative, ExactNewtonJointHessianWorkspace,
8};
9use gam_linalg::RidgePolicy;
10use gam_problem::{ParameterBlockSpec, ParameterBlockState};
11use ndarray::Array1;
12use std::ops::Range;
13use std::sync::Arc;
14use std::sync::atomic::AtomicUsize;
15
16// Moved to `gam-problem` (#1521 CustomFamily-cone inversion): the neutral,
17// dependency-free outer-objective + exact-derivative-order capability enums now
18// live in `gam_problem::family_options` and are re-exported here so every
19// `custom_family::ExactNewtonOuterObjective` / `ExactOuterDerivativeOrder` path
20// keeps resolving byte-for-byte.
21pub use gam_problem::{ExactNewtonOuterObjective, ExactOuterDerivativeOrder};
22
23// The IRLS weight / ridge floors and the block-spec consistency validator are
24// neutral (no `CustomFamily` dependency); they live in `gam-problem` and are
25// re-exported here so every `custom_family::{CUSTOM_FAMILY_WEIGHT_FLOOR,
26// CUSTOM_FAMILY_RIDGE_FLOOR, validate_blockspec_consistency}` path keeps
27// resolving and `CUSTOM_FAMILY_WEIGHT_FLOOR` stays sourced from the canonical
28// `MIN_WEIGHT` floor rather than a hardcoded literal.
29pub use gam_problem::{
30 CUSTOM_FAMILY_RIDGE_FLOOR, CUSTOM_FAMILY_WEIGHT_FLOOR, validate_blockspec_consistency,
31};
32
33/// Exact outer derivative order for families that expose second-order
34/// coefficient geometry.
35///
36/// This used to be a cost gate that demoted large large-scale problems to
37/// first-order BFGS. That was a policy leak into the math layer: if the family
38/// supplies analytic dense Hessian blocks or an analytic profiled-Hessian HVP,
39/// the outer optimizer should see the exact second-order objective. Runtime
40/// representation choices (dense vs operator) belong below this declaration,
41/// not in a first-order downgrade.
42/// Precondition check for the family capability / operator hooks (e.g.
43/// `batched_outer_hessian_terms`, `outer_hyper_hessian_operator`).
44///
45/// These hooks operate on whatever block geometry the caller has assembled and
46/// must validate the *consistency* of the specs they are handed — never the
47/// fit-level "at least one block" precondition, which belongs to the fit entry
48/// points (`validate_blockspecs`). An empty, self-consistent argument set is a
49/// valid no-op probe of the operator path (the operator may ignore the specs
50/// entirely), so it must not panic here.
51pub(crate) fn assert_valid_blockspecs(specs: &[ParameterBlockSpec], context: &str) {
52 assert!(
53 validate_blockspec_consistency(specs).is_ok(),
54 "{context}: inconsistent parameter block specs"
55 );
56}
57
58pub(crate) fn assert_valid_options(options: &BlockwiseFitOptions, context: &str) {
59 assert!(
60 options.inner_tol.is_finite() && options.inner_tol >= 0.0,
61 "{context}: inner_tol must be finite and non-negative"
62 );
63 assert!(
64 options.outer_tol.is_finite() && options.outer_tol >= 0.0,
65 "{context}: outer_tol must be finite and non-negative"
66 );
67 assert!(
68 options.minweight.is_finite() && options.minweight >= 0.0,
69 "{context}: minweight must be finite and non-negative"
70 );
71 assert!(
72 options.ridge_floor.is_finite() && options.ridge_floor >= 0.0,
73 "{context}: ridge_floor must be finite and non-negative"
74 );
75 if let Some(threshold) = options.early_exit_threshold {
76 assert!(
77 threshold.is_finite(),
78 "{context}: early_exit_threshold must be finite"
79 );
80 }
81}
82
83pub(crate) fn assert_states_match_specs(
84 states: &[ParameterBlockState],
85 specs: &[ParameterBlockSpec],
86 context: &str,
87) {
88 assert_eq!(
89 states.len(),
90 specs.len(),
91 "{context}: state/spec block count mismatch"
92 );
93 for (block, (state, spec)) in states.iter().zip(specs).enumerate() {
94 assert_eq!(
95 state.beta.len(),
96 spec.design.ncols(),
97 "{context}: beta length mismatch in block {block}"
98 );
99 // `state.eta` is produced from `solver_design()` (see
100 // `refresh_all_block_etas`), which is `stacked_design` when set
101 // (3·n_obs rows for survival LS time-varying blocks) and `design`
102 // (n_obs rows) otherwise. Use the same accessor here.
103 assert_eq!(
104 state.eta.len(),
105 spec.solver_design().nrows(),
106 "{context}: eta length mismatch in block {block}"
107 );
108 }
109}
110
111pub(crate) fn assert_derivative_blocks_match_specs(
112 derivative_blocks: &[Vec<CustomFamilyBlockPsiDerivative>],
113 specs: &[ParameterBlockSpec],
114 context: &str,
115) {
116 assert_eq!(
117 derivative_blocks.len(),
118 specs.len(),
119 "{context}: derivative/spec block count mismatch"
120 );
121}
122
123pub(crate) fn assert_rho_matches_specs(
124 rho: &Array1<f64>,
125 specs: &[ParameterBlockSpec],
126 context: &str,
127) {
128 let expected = specs.iter().map(|spec| spec.penalties.len()).sum::<usize>();
129 assert_eq!(
130 rho.len(),
131 expected,
132 "{context}: rho length does not match penalty count"
133 );
134}
135
136pub(crate) fn validate_hessian_workspace_ready(
137 hessian_workspace: &Option<Arc<dyn ExactNewtonJointHessianWorkspace>>,
138 context: &str,
139) -> Result<(), String> {
140 if let Some(workspace) = hessian_workspace.as_ref() {
141 workspace
142 .warm_up_outer_caches()
143 .map_err(|err| format!("{context}: failed to warm Hessian workspace caches: {err}"))?;
144 }
145 Ok(())
146}
147
148pub fn exact_outer_order_from_capability(
149 specs: &[ParameterBlockSpec],
150 coefficient_cost: u64,
151) -> ExactOuterDerivativeOrder {
152 assert_valid_blockspecs(specs, "exact outer derivative order");
153 match coefficient_cost {
154 0 => ExactOuterDerivativeOrder::Second,
155 _ => ExactOuterDerivativeOrder::Second,
156 }
157}
158
159/// Capability-aware variant of [`exact_outer_order_from_capability`].
160///
161/// Kept as the public declaration helper for existing family impls, but it no
162/// longer gates by cost. Once a caller has established dense or HVP analytic
163/// second-order support, the correct derivative order is `Second`.
164pub fn exact_outer_order_with_outer_hvp(
165 specs: &[ParameterBlockSpec],
166 coefficient_cost: u64,
167 outer_hyper_hessian_hvp_available: bool,
168) -> ExactOuterDerivativeOrder {
169 if outer_hyper_hessian_hvp_available {
170 assert_valid_blockspecs(specs, "exact outer derivative order with HVP");
171 match coefficient_cost {
172 0 => ExactOuterDerivativeOrder::Second,
173 _ => ExactOuterDerivativeOrder::Second,
174 }
175 } else {
176 exact_outer_order_from_capability(specs, coefficient_cost)
177 }
178}
179
180/// Realized outer-derivative policy at the current problem size.
181///
182/// Capability (the family can produce exact second-order calculus) controls
183/// whether the Hessian is declared. Runtime cost controls only representation
184/// and staging choices below this layer. Large problems must stay on the exact
185/// analytic Hessian path and use an operator representation when dense assembly
186/// is too expensive; they are not demoted to first-order BFGS here.
187///
188/// `OuterDerivativePolicy` records the family's *capability*, the *predicted
189/// per-eval cost* for both gradient-only and Hessian paths, and exposes the
190/// two policy queries the outer optimizer actually needs:
191///
192/// * [`order_for_evaluation`](Self::order_for_evaluation) — clamp a requested
193/// evaluation order against the policy gate.
194/// * [`declared_hessian_form`](Self::declared_hessian_form) — what shape the
195/// outer-strategy planner should declare to its plan ladder.
196/// * [`should_use_staged_kappa`](Self::should_use_staged_kappa) — auto-route
197/// the κ optimizer through the pilot/polish schedule at large `n`.
198///
199/// All thresholds are *const* — no env vars, no CLI flags. The cost model is
200/// the family's own `coefficient_gradient_cost` / `coefficient_hessian_cost`
201/// scaled by the joint outer-coordinate dimension, with `saturating_mul` so
202/// overflow rounds up to the budget ceiling rather than wrapping silently.
203#[derive(Clone, Copy, Debug)]
204pub struct OuterDerivativePolicy {
205 /// What exact calculus the family advertises in principle.
206 pub capability: ExactOuterDerivativeOrder,
207 /// Predicted per-eval work for one `ValueGradientHessian` evaluation.
208 /// Rounded conservatively *up* via `saturating_mul`. Informational for
209 /// representation and diagnostics; it does not disable Hessian capability.
210 pub predicted_hessian_work: u128,
211 /// Predicted per-eval work for one `ValueAndGradient` evaluation.
212 /// Rounded conservatively *up* via `saturating_mul`.
213 pub predicted_gradient_work: u128,
214 /// True when the family's outer-only paths consume
215 /// [`BlockwiseFitOptions::outer_score_subsample`] and produce
216 /// Horvitz-Thompson-weighted partial sums (i.e. the family overrides
217 /// `log_likelihood_only_with_options`,
218 /// `exact_newton_joint_psi_workspace_with_options`, and any other
219 /// outer-only hooks reached by `evaluate_custom_family_joint_hyper`).
220 ///
221 /// Determines whether the κ optimizer's pilot/polish staging schedule
222 /// engages: when this is `false`, [`Self::should_use_staged_kappa`]
223 /// returns `false` regardless of `n`. Engaging the schedule on a
224 /// family that ignores the subsample is strictly worse than not
225 /// engaging it — the schedule builds a `RowSet::Subsample` and the
226 /// boundary plumbing installs an `OuterScoreSubsample` on options,
227 /// but the family's default outer-only paths fall back to full-data
228 /// sums, so the pilot evaluation costs the same as the polish but
229 /// adds a Vec allocation per eval.
230 ///
231 /// Families that do **not** consume the subsample (default for new
232 /// implementations, including the GAMLSS location-scale families
233 /// today) leave this `false`. Families that do consume (today:
234 /// `BernoulliMarginalSlopeFamily`) override `outer_derivative_policy`
235 /// to set this `true`.
236 pub subsample_capable: bool,
237}
238
239impl OuterDerivativePolicy {
240 /// Per-eval gradient work ceiling above which the κ schedule switches
241 /// to the staged pilot/polish path. At large scale (n ≳ 100 k) even
242 /// the gradient sweep takes minutes per outer iter; subsampling the
243 /// pilot stage cuts that to seconds and leaves the final polish on
244 /// full data to recover the MLE.
245 pub const OUTER_GRADIENT_WORK_BUDGET: u128 = 50_000_000_000;
246
247 /// Pilot subsample auto-engages when full-data `n` exceeds this. Below
248 /// this the κ schedule collapses to a single full-data stage —
249 /// behaviour identical to the pre-P7 path.
250 pub const STAGED_KAPPA_TRIGGER_N: usize = 30_000;
251
252 /// Clamp a requested evaluation order against the policy gate.
253 ///
254 /// Returns the highest order this policy permits for the requested order:
255 /// * `ValueGradientHessian` requested → keep only if `declared_hessian_form`
256 /// is something other than `Unavailable`.
257 /// * `ValueAndGradient` requested → always permitted (gradient-only is
258 /// universal).
259 pub fn order_for_evaluation(&self, requested: crate::OuterEvalOrder) -> crate::OuterEvalOrder {
260 use crate::OuterEvalOrder;
261 match requested {
262 // Value-only is universal: every policy can evaluate the bare
263 // objective, so the request passes through unclamped.
264 OuterEvalOrder::Value => OuterEvalOrder::Value,
265 OuterEvalOrder::ValueAndGradient => OuterEvalOrder::ValueAndGradient,
266 OuterEvalOrder::ValueGradientHessian => {
267 if matches!(
268 self.declared_hessian_form(),
269 gam_problem::DeclaredHessianForm::Unavailable
270 ) {
271 OuterEvalOrder::ValueAndGradient
272 } else {
273 OuterEvalOrder::ValueGradientHessian
274 }
275 }
276 }
277 }
278
279 /// Outer Hessian declaration for the outer-strategy planner.
280 ///
281 /// `Either` ⇔ capability has Hessian. Work estimates select dense vs
282 /// operator assembly later; they must not erase analytic second-order
283 /// capability from the planner.
284 pub fn declared_hessian_form(&self) -> gam_problem::DeclaredHessianForm {
285 use gam_problem::DeclaredHessianForm;
286 if !self.capability.has_hessian() {
287 return DeclaredHessianForm::Unavailable;
288 }
289 DeclaredHessianForm::Either
290 }
291
292 /// True when the κ optimizer should auto-route through the staged
293 /// pilot/polish schedule. Triggers when **either** the data is big
294 /// (`n ≥ STAGED_KAPPA_TRIGGER_N`) **or** the per-eval gradient work
295 /// exceeds `OUTER_GRADIENT_WORK_BUDGET`. The second clause catches
296 /// problems with moderate `n` but very wide design (large `p_total`
297 /// or `psi_dim`) where a single full-data gradient sweep still
298 /// dominates the κ trajectory.
299 pub fn should_use_staged_kappa(&self, n: usize) -> bool {
300 if !self.subsample_capable {
301 // Family does not consume `outer_score_subsample` on its
302 // outer-only paths. Engaging the schedule would build a
303 // pilot `RowSet::Subsample` whose only effect is per-eval
304 // Vec/Arc bookkeeping — the underlying coefficient gradient
305 // would still sum every row. Gate the schedule off until
306 // the family override declares consumption.
307 return false;
308 }
309 n >= Self::STAGED_KAPPA_TRIGGER_N
310 || self.predicted_gradient_work > Self::OUTER_GRADIENT_WORK_BUDGET
311 }
312}
313
314/// Total outer-coordinate dimensionality used by the default policy work
315/// model: `rho_dim + psi_dim`. Each outer evaluation propagates one
316/// directional derivative per outer coordinate through the inner solve.
317#[inline]
318pub(crate) fn outer_coord_dim_for_policy(specs: &[ParameterBlockSpec], psi_dim: usize) -> u128 {
319 let rho_total: u128 = specs
320 .iter()
321 .map(|s| s.penalties.len() as u128)
322 .fold(0u128, |acc, k| acc.saturating_add(k));
323 rho_total.saturating_add(psi_dim as u128)
324}
325
326/// Default predicted-cost model for [`OuterDerivativePolicy`]:
327///
328/// * gradient work ≈ `coefficient_gradient_cost · (rho_dim + psi_dim)`
329/// * Hessian work ≈ `coefficient_hessian_cost · (rho_dim + psi_dim)`
330///
331/// Each outer coordinate triggers one analytic directional derivative
332/// through the inner solve; the dense Hessian assembly carries the extra
333/// `O(p_total)` factor already captured by `coefficient_hessian_cost`.
334///
335/// All multiplications saturate so an overflow rounds *up* to the gate
336/// ceiling: we'd rather drop one Hessian evaluation that we could have
337/// afforded than crash on a 600 s eval.
338pub fn default_outer_derivative_policy_costs(
339 specs: &[ParameterBlockSpec],
340 psi_dim: usize,
341 grad_cost: u64,
342 hess_cost: u64,
343) -> (u128, u128) {
344 let k = outer_coord_dim_for_policy(specs, psi_dim);
345 let grad = (grad_cost as u128).saturating_mul(k.max(1));
346 let hess = (hess_cost as u128).saturating_mul(k.max(1));
347 (grad, hess)
348}
349
350/// Default coefficient-space Hessian cost: `Σ_b n_b · p_b²`, summed across
351/// blocks. Represents the work to assemble or apply the dense block-diagonal
352/// inner Hessian once.
353pub fn default_coefficient_hessian_cost(specs: &[ParameterBlockSpec]) -> u64 {
354 specs
355 .iter()
356 .map(|s| {
357 let n = s.design.nrows() as u64;
358 let p = s.design.ncols() as u64;
359 n.saturating_mul(p.saturating_mul(p))
360 })
361 .fold(0u64, |acc, c| acc.saturating_add(c))
362}
363
364/// Joint-coupled coefficient-space Hessian cost: `n · (Σ_b p_b)²`. The honest
365/// per-evaluation work for any family whose row likelihood couples every block
366/// (every observation contributes a rank-`m` outer-product update to the full
367/// joint Hessian over `Σ p_b` coefficients), as opposed to the block-diagonal
368/// `default_coefficient_hessian_cost` which assumes each `X_b' W_b X_b` is
369/// assembled independently.
370///
371/// Used by all GAMLSS, marginal-slope, and joint-latent families. CTN does
372/// not delegate here — it uses its Khatri–Rao factor dimensions internally.
373pub fn joint_coupled_coefficient_hessian_cost(n: u64, specs: &[ParameterBlockSpec]) -> u64 {
374 let p_total: u64 = specs
375 .iter()
376 .map(|s| s.design.ncols() as u64)
377 .fold(0u64, |acc, p| acc.saturating_add(p));
378 n.saturating_mul(p_total.saturating_mul(p_total))
379}
380
381/// Default coefficient-space gradient cost: half the Hessian cost.
382///
383/// The first-order analytic gradient in the unified evaluator runs the same
384/// inner Newton solve as the second-order path but skips the `K`-fold
385/// pairwise Hessian assembly (`B_{j,k}` blocks) and the `K`-fold inner
386/// derivative solves; what remains is the inner solve plus a single
387/// gradient-only sweep through the data. Empirically this is roughly half
388/// the per-evaluation arithmetic of forming the dense Hessian, hence the
389/// `/2` default. Families whose gradient assembly differs structurally
390/// (e.g. matrix-free Hv operators with no dense Hessian assembly to halve)
391/// should override [`CustomFamily::coefficient_gradient_cost`] explicitly.
392pub fn default_coefficient_gradient_cost(specs: &[ParameterBlockSpec]) -> u64 {
393 default_coefficient_hessian_cost(specs) / 2
394}
395
396/// Compute β-block column ranges from a slice of `ParameterBlockSpec`s.
397///
398/// Returns one `Range<usize>` per spec, covering the spec's columns in the
399/// concatenated β vector (i.e. `offset .. offset + p_block` where `p_block =
400/// spec.design.ncols()`). The ranges are non-overlapping, sorted, and their
401/// union covers `0..Σ p_block`.
402///
403/// This is the canonical source of `block_offsets` for every
404/// [`crate::solver::arrow_schur::ArrowSchurSystem`] built for a custom family
405/// (survival, GAMLSS, transformation-normal, latent-survival, marginal-slope,
406/// …). Pass the result to
407/// [`crate::solver::arrow_schur::ArrowSchurSystem::set_block_offsets`] before
408/// calling `solve` or `solve_with_options` whenever the system will use
409/// [`crate::solver::arrow_schur::ArrowSolverMode::InexactPCG`].
410///
411/// Specs with zero columns produce a zero-width range; callers that want to
412/// skip trivial blocks may filter on `r.start < r.end` after calling this
413/// function.
414pub fn block_offsets_from_specs(specs: &[ParameterBlockSpec]) -> Arc<[Range<usize>]> {
415 let mut ranges: Vec<Range<usize>> = Vec::with_capacity(specs.len());
416 let mut cursor = 0usize;
417 for spec in specs {
418 let p = spec.design.ncols();
419 ranges.push(cursor..cursor + p);
420 cursor += p;
421 }
422 Arc::from(ranges.into_boxed_slice())
423}
424
425/// Bound first-order outer iterations when each analytic-gradient evaluation is
426/// already large-scale work. This is only applied after the planner has
427/// selected a gradient-only route; second-order/ARC plans keep their requested
428/// iteration budget.
429pub fn cost_gated_first_order_max_iter(
430 requested: usize,
431 coefficient_gradient_cost: u64,
432 has_outer_hessian: bool,
433) -> usize {
434 const FIRST_ORDER_OUTER_WORK_BUDGET: u64 = 80_000_000_000;
435 const MIN_FIRST_ORDER_ITERS: usize = 4;
436
437 if has_outer_hessian || requested <= 1 || coefficient_gradient_cost == 0 {
438 return requested;
439 }
440
441 let affordable = (FIRST_ORDER_OUTER_WORK_BUDGET / coefficient_gradient_cost) as usize;
442 requested.min(affordable.max(MIN_FIRST_ORDER_ITERS))
443}
444
445/// Local trust budget for first-order outer BFGS on log-smoothing parameters.
446///
447/// One unit in `rho = log(lambda)` is an `e`-fold smoothing-parameter change.
448/// Previously this cap was `1.0`, which throttled BFGS to ~1/5 of its
449/// quasi-Newton step on flat REML surfaces (the natural BFGS direction has
450/// `|d|_inf` of ~5 in log-λ for large-scale survival fits). Probes whose
451/// `step_inf > cap` are rejected for free in `OuterFirstOrderBridge::eval_cost`
452/// (returning `BFGS_LINE_SEARCH_REJECT_COST` without running an inner solve),
453/// so a larger cap costs nothing on rejection — it only lets Strong-Wolfe
454/// accept bigger steps that the inner-PIRLS divergence guard can already
455/// validate. `5.0` allows up to `e^5 ≈ 148`-fold smoothing-parameter change
456/// per accepted outer iter, which matches the typical quasi-Newton direction
457/// magnitude while still bounding pathological probes.
458pub const fn first_order_bfgs_loglambda_step_cap(has_outer_hessian: bool) -> Option<f64> {
459 if has_outer_hessian { None } else { Some(5.0) }
460}
461
462pub fn exact_newton_outer_geometry_supports_second_order_solver<F: CustomFamily + ?Sized>(
463 family: &F,
464) -> bool {
465 family.exact_newton_outerobjective() == ExactNewtonOuterObjective::StrictPseudoLaplace
466}
467
468/// Stable public API for installing outer-score subsampling.
469#[derive(Clone)]
470pub struct BlockwiseFitOptions {
471 pub inner_max_cycles: usize,
472 pub inner_tol: f64,
473 pub outer_max_iter: usize,
474 pub outer_tol: f64,
475 /// Optional override for the OUTER smoothing optimizer's
476 /// *relative-cost-decrease* convergence stop, decoupled from `outer_tol`.
477 ///
478 /// The outer convergence test derives BOTH the absolute projected-gradient
479 /// floor (`max(outer_tol, n·1e-9)`) AND the relative-cost stop
480 /// (`rel_cost = outer_tol`) from the single `outer_tol`. A caller that needs
481 /// a *tight absolute floor* to resolve λ to the genuine REML optimum at
482 /// large `n` (where the floor is `n·1e-9`) is then forced to also accept a
483 /// *tight rel-cost stop*, which on a flat REML ridge never trips and grinds
484 /// the optimizer to `outer_max_iter` — dozens of surplus O(D·p³)
485 /// Laplace-derivative outer iterations (the #1082 multinomial
486 /// smooth-by-factor wall-clock blow-up). When `Some(r)`, the rel-cost stop
487 /// uses `r` while the absolute floor keeps using `outer_tol`, so accuracy
488 /// (absolute floor) and perf (loose rel-cost) are selected independently.
489 /// `None` preserves the legacy coupling (`rel_cost = outer_tol`) for every
490 /// existing caller byte-for-byte.
491 pub outer_rel_cost_tol: Option<f64>,
492 /// Lower box bound for smoothing coordinates ρ = log λ.
493 ///
494 /// The default preserves the historical custom-family domain
495 /// `λ >= exp(-10)`. Families with known calibration failures at the
496 /// near-zero penalty boundary can raise this lower bound without changing
497 /// the upper effective-df cap or adding family-specific branches inside the
498 /// optimizer.
499 pub rho_lower_bound: f64,
500 pub minweight: f64,
501 pub ridge_floor: f64,
502 /// Shared ridge semantics used by solve/quadratic/logdet terms.
503 pub ridge_policy: RidgePolicy,
504 /// If true, outer smoothing optimization uses a Laplace/REML-style objective:
505 /// -loglik + penalty + 0.5(log|H| - log|S|_+)
506 /// where H is blockwise working curvature and S is blockwise penalty.
507 pub use_remlobjective: bool,
508 /// If false, the outer smoothing optimizer uses exact gradients but does
509 /// not request an analytic outer Hessian from the family.
510 pub use_outer_hessian: bool,
511 /// If false, skip post-fit joint covariance assembly.
512 pub compute_covariance: bool,
513 /// Shared cap engaged during seed screening so cost-only evaluations can
514 /// stop inner iterations early without affecting the full solve.
515 pub screening_max_inner_iterations: Option<Arc<AtomicUsize>>,
516 /// Shared cap engaged during regular outer iterations. Unlike screening,
517 /// this is only a budget: capped solves still have to earn the ordinary
518 /// KKT certificate before derivatives may be exposed.
519 pub outer_inner_max_iterations: Option<Arc<AtomicUsize>>,
520 /// Optional line-search objective ceiling for lazy log-likelihood-only
521 /// evaluations. Families whose per-row log-likelihood contributions are
522 /// non-positive may stop once the partial negative log-likelihood is already
523 /// above this ceiling, because the unvisited rows cannot improve the trial
524 /// objective enough to be accepted. Default `None` preserves exact full-sum
525 /// behavior and is the only mode used outside backtracking rejection tests.
526 pub early_exit_threshold: Option<f64>,
527 /// Stable public API for installing outer-score subsampling.
528 ///
529 /// Optional stratified row subsample used by outer-only score/gradient
530 /// passes. When `Some(s)`, outer score/gradient hot loops should iterate
531 /// only over `s.rows` and multiply each contribution by that row's
532 /// Horvitz-Thompson inverse-inclusion weight. Inner-PIRLS and final
533 /// covariance passes always run on the full data, so this field is
534 /// consulted only by outer-only call sites. Default `None` preserves the
535 /// full-data behavior. Wrapping in `Arc` keeps `Clone` cheap across the
536 /// many places `BlockwiseFitOptions` is duplicated per-eval.
537 pub outer_score_subsample: Option<Arc<crate::OuterScoreSubsample>>,
538 /// Gate for marginal-slope families to auto-derive a stratified
539 /// outer-score subsample at large scale (see
540 /// [`crate::families::marginal_slope_shared::auto_outer_score_subsample`]).
541 ///
542 /// **Default `true`.** Auto-subsampling makes the early rho-gradient
543 /// evaluations unbiased stochastic estimators with bounded relative
544 /// variance (≈ 1 % at the conservative defaults), then the family switches
545 /// back to full-data gradients for the remaining outer iterations. That
546 /// keeps large marginal-slope fits fast during the high-motion part of the
547 /// trajectory while preserving the default tight `outer_tol` polish on
548 /// exact gradients. For small datasets the auto path declines to install a
549 /// mask and the fit remains full-data throughout.
550 ///
551 /// When `outer_score_subsample` is already `Some(...)` the auto
552 /// path is bypassed entirely (caller-provided masks always win).
553 pub auto_outer_subsample: bool,
554 /// Outer-evaluation context populated by the smoothing optimizer at
555 /// the top of each real outer derivative evaluation. Used by
556 /// auto-subsample install paths to key the stratified mask on the
557 /// outer ρ rather than the inner β proxy: during the inner trust-
558 /// region / coefficient line search β changes on every trial step,
559 /// so keying on β re-fires phase prints (and re-shuffles the mask)
560 /// inside a single outer eval. Keying on (rho, eval_id) instead
561 /// keeps the mask stable across the inner Newton at one ρ, and
562 /// suppresses auto-subsample entirely on inner trial evaluations via
563 /// the [`EvalScope::InnerCoefficient`] tag set by
564 /// [`coefficient_line_search_options`].
565 ///
566 /// `None` preserves legacy behavior (no context — install paths fall
567 /// back to "no auto-subsample"). Default `None`.
568 pub outer_eval_context: Option<OuterEvalContext>,
569 /// Optional persistent warm-start cache session. When `Some`, the
570 /// outer smoothing optimizer consults the on-disk cache before
571 /// starting (to seed θ from the last accepted iterate) and writes
572 /// checkpoints + a final entry on completion. When `None`, the fit
573 /// runs cold and writes nothing — the default for unit tests and
574 /// any caller that pinned a deterministic optimum.
575 ///
576 /// Callers that need cross-process reuse must supply the session
577 /// explicitly; ordinary workflow fits leave this empty so refit-heavy
578 /// loops do not touch the shared on-disk store.
579 pub cache_session: Option<Arc<gam_runtime::warm_start::Session>>,
580 /// Optional mirror sessions that receive a copy of the final-result
581 /// finalize() write. Callers can use this to broadcast a converged ρ to
582 /// additional keyspace(s) so future fits with related structure can
583 /// warm-start from this run. Writes still pass through the session rate
584 /// limiter, so mirroring checkpoints does not add unbounded I/O.
585 pub cache_mirror_sessions: Vec<Arc<gam_runtime::warm_start::Session>>,
586 /// Optional bundle of cross-block (full-width) penalties, paired with
587 /// their current `log λ` values from the outer ρ vector. When `Some`,
588 /// the inner joint-Newton primitives add the contributions
589 ///
590 /// * objective: `½ Σ_j exp(ρ_j) βᵀ S_j β`
591 /// * gradient: `Σ_j exp(ρ_j) S_j β`
592 /// * Hessian: `Σ_j exp(ρ_j) S_j`
593 ///
594 /// in addition to the per-block penalty stack assembled from
595 /// `ParameterBlockSpec.penalties`. The per-block path is unchanged.
596 /// `None` preserves legacy behaviour for every existing caller.
597 pub joint_penalties: Option<Arc<crate::JointPenaltyBundle>>,
598 /// Whether the outer smoothing optimizer screens the explicit
599 /// `initial_rho` seed through the seed-screening cascade before the
600 /// solver starts.
601 ///
602 /// **Default `true`** — the general path benefits from ranking the
603 /// initial seed against the generated exploration seeds via cheap
604 /// capped proxy fits.
605 ///
606 /// A caller sets this `false` when `initial_rho` is already the correct,
607 /// identified optimum for its regime so that re-screening it adds only
608 /// cost. The survival location-scale constant-scale (parametric-AFT)
609 /// path uses this: its time-warp ρ seed is pinned AT the inner ρ box
610 /// bound (the affine-baseline limit), where the REML/LAML profile is a
611 /// dead-flat unidentified ridge. Running the screening cascade there
612 /// drives each proxy fit (and, when every capped stage collapses to
613 /// non-finite cost, the uncapped final stage) into a full inner solve on
614 /// the near-singular flat Hessian — the source of the multi-minute
615 /// no-iteration-log stall (#736, #735, #721). Skipping screening lets the
616 /// already-correct seed flow straight to the outer solver, which certifies
617 /// box-constraint stationarity at iteration 0. Genuinely flexible regimes
618 /// (smooth scale / spatial) leave this `true` and keep full screening.
619 pub screen_initial_rho: bool,
620 /// Set ONLY while the inner solve is invoked from the seed-screening proxy
621 /// (`custom_family_seed_screening_proxy_labeled`), which RANKS candidate
622 /// seeds by their penalized objective and never produces the final fit.
623 ///
624 /// When `true`, the inner joint-Newton skips the full per-axis
625 /// Jeffreys/Firth curvature (`custom_family_joint_jeffreys_term`'s
626 /// `for k in 0..p` directional-derivative loop, O(p · per-axis-Hdot) per
627 /// cycle), keeping ONLY the cheap value-only Jeffreys term
628 /// (`custom_family_joint_jeffreys_value`, one reduced-info eigendecomposition)
629 /// in the screening score. The per-axis gradient/curvature is what the inner
630 /// Newton step needs to *converge* a near-separating fit; the screening proxy
631 /// is capped and only ranks, so it does not need step convergence — it needs
632 /// a finite, separation-aware score cheaply. For a K-block coupled family
633 /// (Dirichlet/multinomial) each per-axis directional derivative is itself
634 /// O(K²·n·p), so running the full term for every cascade candidate over the
635 /// joint width `p` is the wrong cost class and made the coupled fit
636 /// non-completing during screening alone (gam#729/#808). The actual fit
637 /// (after a seed is selected) runs with this `false`, so the load-bearing
638 /// Firth curvature is fully present where it matters.
639 ///
640 /// **Default `false`** — only the screening proxy sets it `true`.
641 pub seed_screening: bool,
642}
643
644pub const DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES: usize = 1200;
645
646impl Default for BlockwiseFitOptions {
647 fn default() -> Self {
648 Self {
649 // Large-scale custom-family marginal-slope fits can have a
650 // long, monotone joint-Newton tail: objective and step size keep
651 // shrinking, but the exact KKT residual may need several hundred
652 // additional cycles after the old 300-cycle cap. The outer
653 // REML/LAML derivative path is correct only at a stationary inner
654 // mode, so a merely descended iterate must not be accepted as
655 // converged. Use a production-sized cap by default and rely on the
656 // KKT/objective certificates to exit early for well-conditioned
657 // Gaussian, logistic, and small-n fits.
658 inner_max_cycles: DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES,
659 inner_tol: 1e-6,
660 outer_max_iter: 60,
661 outer_tol: 1e-5,
662 outer_rel_cost_tol: None,
663 rho_lower_bound: -10.0,
664 minweight: CUSTOM_FAMILY_WEIGHT_FLOOR,
665 // `ridge_floor` is an ExplicitPrior in the canonical
666 // stabilization ledger taxonomy (`StabilizationKind::ExplicitPrior`):
667 // its δ enters the quadratic term, the Laplace Hessian, and the
668 // penalty log-determinant — `ridge_policy` below is the live
669 // policy that confirms which terms it lands in. The default
670 // pos-part policy enables every inclusion flag, so callers
671 // wanting solver-only damping should construct a custom policy
672 // (or, preferably, a `StabilizationLedger::numerical_perturbation`)
673 // rather than reusing this field.
674 ridge_floor: CUSTOM_FAMILY_RIDGE_FLOOR,
675 ridge_policy: RidgePolicy::explicit_stabilization_pospart(),
676 use_remlobjective: true,
677 // Default ON: families expose exact outer Hessians whenever their
678 // analytic dense or operator representation is implemented.
679 use_outer_hessian: true,
680 compute_covariance: false,
681 screening_max_inner_iterations: None,
682 outer_inner_max_iterations: None,
683 seed_screening: false,
684 early_exit_threshold: None,
685 outer_score_subsample: None,
686 auto_outer_subsample: true,
687 outer_eval_context: None,
688 cache_session: None,
689 cache_mirror_sessions: Vec::new(),
690 joint_penalties: None,
691 screen_initial_rho: true,
692 }
693 }
694}
695
696#[cfg(test)]
697mod tests {
698 use super::*;
699 use gam_linalg::matrix::DesignMatrix;
700 use ndarray::Array2;
701
702 fn make_spec(nrows: usize, ncols: usize) -> ParameterBlockSpec {
703 ParameterBlockSpec {
704 design: DesignMatrix::from(Array2::<f64>::zeros((nrows, ncols))),
705 ..ParameterBlockSpec::defaults()
706 }
707 }
708
709 // -----------------------------------------------------------------------
710 // default_coefficient_hessian_cost
711 // -----------------------------------------------------------------------
712
713 #[test]
714 fn hessian_cost_empty_specs_is_zero() {
715 assert_eq!(default_coefficient_hessian_cost(&[]), 0);
716 }
717
718 #[test]
719 fn hessian_cost_single_block() {
720 // n=10, p=3 → 10 * 3^2 = 90
721 let spec = make_spec(10, 3);
722 assert_eq!(default_coefficient_hessian_cost(&[spec]), 90);
723 }
724
725 #[test]
726 fn hessian_cost_two_blocks_sum() {
727 // n=10, p=3 → 90; n=5, p=4 → 5*16=80; total=170
728 let specs = [make_spec(10, 3), make_spec(5, 4)];
729 assert_eq!(default_coefficient_hessian_cost(&specs), 170);
730 }
731
732 // -----------------------------------------------------------------------
733 // default_coefficient_gradient_cost
734 // -----------------------------------------------------------------------
735
736 #[test]
737 fn gradient_cost_is_half_hessian_cost() {
738 let specs = [make_spec(10, 3)];
739 let hess = default_coefficient_hessian_cost(&specs);
740 assert_eq!(default_coefficient_gradient_cost(&specs), hess / 2);
741 }
742
743 // -----------------------------------------------------------------------
744 // joint_coupled_coefficient_hessian_cost
745 // -----------------------------------------------------------------------
746
747 #[test]
748 fn joint_coupled_cost_empty_specs_is_zero() {
749 assert_eq!(joint_coupled_coefficient_hessian_cost(100, &[]), 0);
750 }
751
752 #[test]
753 fn joint_coupled_cost_two_blocks() {
754 // n=10, p_total = 3+4=7 → 10 * 49 = 490
755 let specs = [make_spec(99, 3), make_spec(99, 4)];
756 assert_eq!(joint_coupled_coefficient_hessian_cost(10, &specs), 490);
757 }
758
759 // -----------------------------------------------------------------------
760 // block_offsets_from_specs
761 // -----------------------------------------------------------------------
762
763 #[test]
764 fn block_offsets_empty_is_empty() {
765 let offsets = block_offsets_from_specs(&[]);
766 assert_eq!(offsets.len(), 0);
767 }
768
769 #[test]
770 fn block_offsets_three_blocks() {
771 // p = [2, 3, 1] → [0..2, 2..5, 5..6]
772 let specs = [make_spec(1, 2), make_spec(1, 3), make_spec(1, 1)];
773 let offsets = block_offsets_from_specs(&specs);
774 assert_eq!(&offsets[0], &(0..2));
775 assert_eq!(&offsets[1], &(2..5));
776 assert_eq!(&offsets[2], &(5..6));
777 }
778
779 #[test]
780 fn block_offsets_zero_width_block() {
781 // p = [2, 0, 1] → [0..2, 2..2, 2..3]
782 let specs = [make_spec(1, 2), make_spec(1, 0), make_spec(1, 1)];
783 let offsets = block_offsets_from_specs(&specs);
784 assert_eq!(&offsets[0], &(0..2));
785 assert_eq!(&offsets[1], &(2..2));
786 assert_eq!(&offsets[2], &(2..3));
787 }
788
789 // -----------------------------------------------------------------------
790 // first_order_bfgs_loglambda_step_cap
791 // -----------------------------------------------------------------------
792
793 #[test]
794 fn step_cap_without_outer_hessian_is_some_five() {
795 assert_eq!(first_order_bfgs_loglambda_step_cap(false), Some(5.0));
796 }
797
798 #[test]
799 fn step_cap_with_outer_hessian_is_none() {
800 assert_eq!(first_order_bfgs_loglambda_step_cap(true), None);
801 }
802
803 // -----------------------------------------------------------------------
804 // cost_gated_first_order_max_iter
805 // -----------------------------------------------------------------------
806
807 #[test]
808 fn cost_gated_iter_passes_through_when_has_outer_hessian() {
809 assert_eq!(cost_gated_first_order_max_iter(100, 1_000_000_000, true), 100);
810 }
811
812 #[test]
813 fn cost_gated_iter_passes_through_at_one_iteration() {
814 assert_eq!(cost_gated_first_order_max_iter(1, 1_000_000_000_000, false), 1);
815 }
816
817 #[test]
818 fn cost_gated_iter_passes_through_zero_cost() {
819 assert_eq!(cost_gated_first_order_max_iter(200, 0, false), 200);
820 }
821
822 #[test]
823 fn cost_gated_iter_clamps_expensive_gradient() {
824 // Budget = 80_000_000_000. With cost = 1_000_000_000 the affordable
825 // count is 80. requested=200 should be clamped to 80.
826 let result = cost_gated_first_order_max_iter(200, 1_000_000_000, false);
827 assert!(result <= 80, "got {result}");
828 assert!(result >= 4, "got {result}");
829 }
830
831 #[test]
832 fn cost_gated_iter_enforces_minimum_four() {
833 // Cost so high only 1 iteration is affordable, but minimum is 4.
834 let result = cost_gated_first_order_max_iter(100, u64::MAX / 2, false);
835 assert_eq!(result, 4);
836 }
837}