gam_models/transformation_normal/config.rs
/// Settings for the transformation-normal fit (`fit_transformation_normal`):
/// the response-direction spline basis and the penalties placed on it.
#[derive(Debug, Clone)]
pub struct TransformationNormalConfig {
    /// Degree of the B-spline basis spanning the response-direction deviation
    /// (default 3).
    pub response_degree: usize,
    /// How many interior knots the response-direction deviation basis uses
    /// (default 10).
    pub response_num_internal_knots: usize,
    /// Order of the difference penalty that measures roughness along the
    /// response direction (default 2).
    pub response_penalty_order: usize,
    /// Further penalty orders applied along the response direction
    /// (default `[1]`).
    pub response_extra_penalty_orders: Vec<usize>,
    /// Enables a global identity (ridge) penalty when `true` (default `true`).
    pub double_penalty: bool,
    /// Marks `response_num_internal_knots` as an already-resolved effective
    /// value. When set, `fit_transformation_normal` takes the count verbatim
    /// and does not re-run `effective_response_num_internal_knots`.
    ///
    /// The cross-fit Stage-1 calibration depends on this: it fixes the knot
    /// count once, at the smallest fold complement, so that `p_resp` (and with
    /// it `p₁ = p_resp · p_cov`) does not vary across folds. Without the pin the
    /// data-driven complexity cap would round to a different count on each
    /// fold's response subsample (workflow.rs §3).
    pub response_num_internal_knots_pinned: bool,
}
22
23impl Default for TransformationNormalConfig {
24 fn default() -> Self {
25 Self {
26 response_degree: 3,
27 response_num_internal_knots: 10,
28 response_penalty_order: 2,
29 response_extra_penalty_orders: vec![1],
30 double_penalty: true,
31 response_num_internal_knots_pinned: false,
32 }
33 }
34}
35
/// Default ceiling on the tensor-product width of the transformation-normal
/// response basis. The fit factorizes dense penalized Hessians over and over,
/// so small datasets are held to a compact basis.
pub(crate) const BASE_TRANSFORMATION_TENSOR_WIDTH: usize = 160;
40
/// Raised tensor-width ceiling for large samples. Such samples can carry a
/// richer response basis and should not suffer the aggressive underfitting
/// that the small-sample cap (`BASE_TRANSFORMATION_TENSOR_WIDTH`) would force.
/// The tensor width stays bounded by this value even when the covariate side
/// is narrow.
pub(crate) const LARGE_SAMPLE_TRANSFORMATION_TENSOR_WIDTH: usize = 320;
45
/// The expectation `E[log |Z|]` under `Z ~ N(0, 1)`. It is used to place local
/// log-absolute residual projections on the standard-normal scale.
pub(crate) const STANDARD_NORMAL_MEAN_LOG_ABS: f64 = -0.635_181_422_730_739_1_f64;
49
/// Margin that enforces strict feasibility of `h' > 0` on the monotonicity
/// grid. Two code paths share it:
///
/// - the fit-time fraction-to-boundary line search, so that every accepted β
///   keeps `h'(grid) ≥ EPS`;
/// - the predict-time monotonicity check in `inference::predict_input`, which
///   rejects predictions whose minimum `h'` on the response grid falls below
///   this threshold.
///
/// Sharing one value stops the predict path from rejecting a fit the optimizer
/// accepted as feasible — and the reverse.
pub const TRANSFORMATION_MONOTONICITY_EPS: f64 = 1e-8;
58
/// Largest absolute transformation score, on the standard-normal scale, that is
/// still treated as feasible. The CTN likelihood targets `h(Y|x) ~ N(0,1)`. If
/// exact-Newton iterates with a finite positive `h'` but an astronomical `|h|`
/// were accepted, curvature diagnostics would overflow into meaningless values.
///
/// This guards against numerical runaway; it is not a statistical plausibility
/// filter. Startup seeds may sit outside practically observable normal
/// quantiles for a while, until the line search pulls them back into the
/// likelihood's high-density region.
pub const TRANSFORMATION_NORMAL_H_ABS_MAX: f64 = 1e6;
67
/// How many dense-spectral factor columns one exact ψψ HVP row pass handles.
/// At large-scale CTN dimensions (p≈800) this keeps the per-worker accumulator
/// well below 1 MiB, and it cuts repeated SCOP row-invariant work by 32×
/// compared with dispatching one HVP column at a time.
pub(crate) const SCOP_PSI_PSI_HVP_TILE_COLS: usize = 32;
73
/// Largest coefficient dimension for which the inner `H·v` path caches the
/// exact dense SCOP coefficient Hessian.
///
/// The large-scale CTN calibration fit combines many rows with a moderate
/// coefficient dimension (for example n=20k, p=264). There, repeated PCG
/// products against one Hessian should pay for the row-streaming chain rule a
/// single time and then serve later products as dense BLAS matvecs. The cache
/// is limited to genuinely moderate p so that wide CTN fits stay row-streamed.
pub(crate) const SCOP_HESSIAN_HVP_DENSE_CACHE_MAX_DIM: usize = 384;

/// Byte limit paired with `SCOP_HESSIAN_HVP_DENSE_CACHE_MAX_DIM`: 64 MiB.
/// NOTE(review): presumably the dense Hessian cache is skipped when its storage
/// would exceed this size — confirm at the use site.
pub(crate) const SCOP_HESSIAN_HVP_DENSE_CACHE_MAX_BYTES: usize = 64 << 20;
84
/// Base term of the CTN-scoped ceiling on the custom-family inner exact-Newton
/// cycle budget.
///
/// The global `DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES = 1200` serves the
/// large-scale survival marginal-slope path. Its inner mode has a long,
/// rank-deficient KKT tail that genuinely needs hundreds of cycles.
///
/// CTN is a different regime. Its coefficient block is a *bounded-dimension*
/// Khatri–Rao tensor (capped by `BASE/LARGE_SAMPLE_TRANSFORMATION_TENSOR_WIDTH`),
/// and the objective is strictly convex by construction: the `double_penalty`
/// ridge together with the order-2/order-1 roughness penalties keeps the
/// penalized Hessian positive definite even where the likelihood is flat on
/// weakly-identified shape×covariate directions.
///
/// Exact Newton on a strictly convex, bounded-dimension block converges in a
/// handful of cycles. A fit only reaches 1200 inner cycles by polishing
/// weakly-identified directions that add nothing to the likelihood (the #720
/// timeout). Scaling the cap with the realized coefficient dimension leaves a
/// generous margin for a genuinely nonlinear, high-dimensional transformation,
/// while refusing to grind through the production large-scale cap on an easy
/// near-Gaussian shift.
pub(crate) const CTN_INNER_MAX_CYCLES_BASE: usize = 64;

/// Per-dimension term of the CTN inner cycle budget, which scales with the
/// realized coefficient dimension (rationale on `CTN_INNER_MAX_CYCLES_BASE`).
/// NOTE(review): presumably combined as `BASE + PER_DIM · p` — confirm at the
/// use site.
pub(crate) const CTN_INNER_MAX_CYCLES_PER_DIM: usize = 2;

/// Upper limit on the scaled CTN inner cycle budget.
/// NOTE(review): presumably the dimension-scaled budget is clamped to this
/// value — confirm at the use site.
pub(crate) const CTN_INNER_MAX_CYCLES_CEILING: usize = 400;
107
/// Smallest value a Gram/penalty diagonal scale may take before it is used in
/// the `likelihood_scale / penalty_scale` ratio that seeds the outer log-λ
/// search. A truly zero diagonal (an all-zero penalty block, or a degenerate
/// likelihood Gram) would turn the seed into `0/0` or `x/0`. Flooring both
/// scales far below any meaningful curvature keeps the ratio finite and leaves
/// well-posed problems untouched.
pub(crate) const CTN_SEED_SCALE_FLOOR: f64 = 1e-8;
115
/// Smallest cold-start seed log-λ, which means the seed satisfies λ ≥ 1. This
/// keeps the outer optimizer away from the under-regularized regime, where the
/// CTN inner solve is structurally rank-deficient (small-n / p > n). The
/// optimizer may still move below this value once the data support it. See
/// `ctn_penalty_scale_log_lambdas`.
pub(crate) const CTN_SEED_LOG_LAMBDA_MIN: f64 = 0.0;
121
/// Largest cold-start seed log-λ. It matches the outer ρ-bound used across the
/// location-scale families: capping the seed at λ ≈ e¹², in the strongly
/// over-smoothed regime, stops a tiny penalty scale from seeding an absurd λ.
pub(crate) const CTN_SEED_LOG_LAMBDA_MAX: f64 = 12.0;
126
/// Lower limit on the warm-start global residual scale
/// `sqrt(weighted_ss / Σw)`. It covers the degenerate near-perfect-fit case, in
/// which residuals collapse to numerical zero, so that the per-residual
/// `residual_floor` and the later `ln(|y−μ|)` log-scale target both remain
/// finite. The value lies well below any real response spread and therefore
/// never perturbs a genuine fit.
pub(crate) const WARMSTART_GLOBAL_SCALE_FLOOR: f64 = 1.0e-6;
133
/// Relative part of the per-residual floor applied when forming the log-scale
/// warm-start target `ln(|y−μ|) − E[ln|N(0,1)|]`. The floor is assembled as
/// `global_scale · WARMSTART_RESIDUAL_REL_FLOOR + WARMSTART_RESIDUAL_ABS_FLOOR`.
/// This relative term floors an exactly-fit point (|y−μ| = 0) at 1/1000 of the
/// data scale, so it cannot send `ln(0) → −∞`.
pub(crate) const WARMSTART_RESIDUAL_REL_FLOOR: f64 = 1.0e-3;

/// Absolute part of the same per-residual floor
/// (`global_scale · WARMSTART_RESIDUAL_REL_FLOOR + WARMSTART_RESIDUAL_ABS_FLOOR`).
/// It is the backstop for the case where `global_scale` itself sits at its
/// floor.
pub(crate) const WARMSTART_RESIDUAL_ABS_FLOOR: f64 = 1.0e-12;
142
/// Lower limit applied to a per-row warm-start scale τ before `1/τ` is formed
/// while building the affine transformation seed targets. A degenerate τ = 0
/// (a collapsed warm-start scale block) would give a non-finite reciprocal. The
/// floor lies far below any meaningful scale, so it takes effect only on that
/// degenerate path.
pub(crate) const WARMSTART_INV_SCALE_FLOOR: f64 = 1.0e-12;
148
/// Minimum ridge stabilization for the penalized least-squares projections that
/// yield the default warm-start location and log-scale coefficients. Those
/// seeds only have to land in the right basin, because the outer solver refines
/// them. A mild ridge that keeps the projection well-posed under a
/// near-rank-deficient covariate design is therefore preferred over the tighter
/// floor used for the production inner solve.
pub(crate) const WARMSTART_PROJECTION_RIDGE_FLOOR: f64 = 1.0e-8;