1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/// Default upper limit on the tensor-product width of the
/// transformation-normal response basis.
///
/// Small datasets are held to this compact width because the fit repeatedly
/// factorizes dense penalized Hessians.
pub const BASE_TRANSFORMATION_TENSOR_WIDTH: usize = 160;
/// Upper limit on the tensor-product width for large samples.
///
/// With enough data a richer response basis is supportable, and holding such
/// fits to the small-sample cap (`BASE_TRANSFORMATION_TENSOR_WIDTH`) would
/// force aggressive underfitting. The width nevertheless stays bounded by this
/// value, even when the covariate side is narrow.
pub const LARGE_SAMPLE_TRANSFORMATION_TENSOR_WIDTH: usize = 320;
/// Mean of `ln |Z|` for a standard normal `Z ~ N(0, 1)`.
///
/// In closed form this is `-(γ + ln 2) / 2`, with `γ` the Euler–Mascheroni
/// constant. It is used to put local log-absolute residual projections on the
/// standard-normal scale.
pub const STANDARD_NORMAL_MEAN_LOG_ABS: f64 = -0.635_181_422_730_739_1;
/// Smallest `h'` that still counts as strictly feasible (`h' > 0`) on the
/// monotonicity grid.
///
/// Two consumers share this margin and must agree on it:
///
/// - the fit-time fraction-to-boundary line search, so that every accepted β
///   keeps `h'(grid) ≥ EPS`;
/// - the predict-time monotonicity check in `inference::predict_input`, which
///   rejects predictions whose minimum `h'` on the response grid drops below
///   this threshold.
///
/// Keeping both on one constant prevents the predict path from rejecting fits
/// that the optimizer accepted as feasible, and the reverse.
pub const TRANSFORMATION_MONOTONICITY_EPS: f64 = 1e-8;
/// Largest `|h|` accepted for a feasible transformation score on the
/// standard-normal scale.
///
/// The CTN likelihood targets `h(Y|x) ~ N(0,1)`. Accepting exact-Newton
/// iterates that have finite positive `h'` but astronomical `|h|` would let
/// curvature diagnostics overflow into meaningless values. The bound is a
/// guard against numerical runaway and deliberately not a statistical
/// plausibility filter: startup seeds can temporarily land outside practically
/// observable normal quantiles before the line search moves them back into the
/// likelihood's high-density region.
pub const TRANSFORMATION_NORMAL_H_ABS_MAX: f64 = 1e6;
/// Tile width, in dense-spectral factor columns, of one exact ψψ HVP row pass.
///
/// Processing 32 columns per pass reduces the repeated SCOP row-invariant work
/// by 32× relative to one-column HVP dispatch, while the per-worker accumulator
/// stays well under 1 MiB at large-scale CTN dimensions (p≈800).
pub const SCOP_PSI_PSI_HVP_TILE_COLS: usize = 32;
/// Largest coefficient dimension for which the inner `H·v` path caches the
/// exact dense SCOP coefficient Hessian.
///
/// The large-scale CTN calibration fit combines many rows with a moderate
/// coefficient dimension (for example n=20k, p=264). In that regime, repeated
/// PCG products against the same Hessian should pay for the row-streaming
/// chain rule a single time and serve every later product as a dense BLAS
/// matvec. The cache is restricted to genuinely moderate p so that wide CTN
/// fits remain row-streamed.
pub const SCOP_HESSIAN_HVP_DENSE_CACHE_MAX_DIM: usize = 384;
/// Byte budget (64 MiB) that accompanies the dimension limit above.
///
/// NOTE(review): how this budget is combined with the dimension limit is not
/// visible from here — confirm at the use site.
pub const SCOP_HESSIAN_HVP_DENSE_CACHE_MAX_BYTES: usize = 64 << 20;
/// Base term of the CTN-scoped ceiling on the custom-family inner
/// exact-Newton cycle budget.
///
/// The global `DEFAULT_CUSTOM_FAMILY_INNER_MAX_CYCLES = 1200` exists for the
/// large-scale survival marginal-slope path: its inner mode has a long,
/// rank-deficient KKT tail that genuinely needs hundreds of cycles. CTN is a
/// different regime:
///
/// - its coefficient block is a *bounded-dimension* Khatri–Rao tensor (capped
///   by `BASE/LARGE_SAMPLE_TRANSFORMATION_TENSOR_WIDTH`);
/// - its objective is strictly convex by construction, because the
///   `double_penalty` ridge plus the order-2/order-1 roughness penalties make
///   the penalized Hessian positive definite even where the likelihood is flat
///   on weakly-identified shape×covariate directions.
///
/// An exact-Newton iteration on a strictly convex, bounded-dimension block
/// converges in a handful of cycles, so the only way a CTN fit reaches 1200
/// inner cycles is by polishing weakly-identified directions that contribute
/// nothing to the likelihood (the #720 timeout). Scaling the cap with the
/// realized coefficient dimension keeps a generous margin for a genuinely
/// nonlinear, high-dimensional transformation while refusing to grind the
/// production large-scale cap on an easy near-Gaussian shift.
pub const CTN_INNER_MAX_CYCLES_BASE: usize = 64;
/// Per-coefficient-dimension increment of the CTN inner cycle cap.
///
/// NOTE(review): presumably combined as `BASE + PER_DIM · p` — confirm against
/// the code that assembles the budget.
pub const CTN_INNER_MAX_CYCLES_PER_DIM: usize = 2;
/// Hard upper limit on the CTN inner cycle cap.
///
/// NOTE(review): presumably clamps the dimension-scaled value — confirm against
/// the code that assembles the budget.
pub const CTN_INNER_MAX_CYCLES_CEILING: usize = 400;
/// Smallest value a Gram/penalty diagonal scale may take before it enters the
/// `likelihood_scale / penalty_scale` ratio that seeds the outer log-λ search.
///
/// Without the floor, a genuinely zero diagonal — an all-zero penalty block or
/// a degenerate likelihood Gram — would produce a `0/0` or `x/0` seed. Both
/// scales are floored at this value, which lies far below any meaningful
/// curvature, so the ratio stays finite and well-posed problems are not
/// perturbed.
pub const CTN_SEED_SCALE_FLOOR: f64 = 1e-8;
/// Smallest cold-start seed log-λ; a log-λ of `0.0` means λ ≥ 1.
///
/// The bound keeps the outer optimizer out of the under-regularized regime in
/// which the CTN inner solve is structurally rank-deficient (small-n / p > n).
/// The optimizer remains free to step below it once the data support that.
/// See `ctn_penalty_scale_log_lambdas`.
pub const CTN_SEED_LOG_LAMBDA_MIN: f64 = 0.0;
/// Largest cold-start seed log-λ.
///
/// Matches the outer ρ-bound used across the location-scale families. With
/// λ ≈ e¹² the seed is capped in the strongly over-smoothed regime, so a tiny
/// penalty scale cannot seed an absurd λ.
pub const CTN_SEED_LOG_LAMBDA_MAX: f64 = 12.0;
/// Smallest allowed warm-start global residual scale `sqrt(weighted_ss / Σw)`.
///
/// Covers the degenerate near-perfect-fit case in which residuals collapse to
/// numerical zero: with the scale floored, the per-residual `residual_floor`
/// derived from it and the subsequent `ln(|y−μ|)` log-scale target both stay
/// finite. The value is well below any real response spread, so it never
/// perturbs a genuine fit.
pub const WARMSTART_GLOBAL_SCALE_FLOOR: f64 = 1e-6;
/// Relative part of the per-residual floor behind the log-scale warm-start
/// target `ln(|y−μ|) − E[ln|N(0,1)|]`.
///
/// The floor is built as
/// `global_scale · WARMSTART_RESIDUAL_REL_FLOOR + WARMSTART_RESIDUAL_ABS_FLOOR`.
/// The relative term pins an exactly-fit point (|y−μ| = 0) at 1/1000 of the
/// data scale, so it cannot send `ln(0) → −∞`.
pub const WARMSTART_RESIDUAL_REL_FLOOR: f64 = 1e-3;
/// Absolute part of the per-residual floor: the backstop for the case where
/// `global_scale` itself sits at its floor.
pub const WARMSTART_RESIDUAL_ABS_FLOOR: f64 = 1e-12;
/// Smallest per-row warm-start scale τ allowed before `1/τ` is formed while
/// building the affine transformation seed targets.
///
/// A degenerate τ = 0 (a collapsed warm-start scale block) would otherwise
/// produce a non-finite reciprocal. The floor sits far below any meaningful
/// scale, so it only fires on that degenerate path.
pub const WARMSTART_INV_SCALE_FLOOR: f64 = 1e-12;
/// Ridge stabilization floor of the penalized least-squares projections that
/// produce the default warm-start location and log-scale coefficients.
///
/// Those seeds only have to land in the right basin, because the outer solver
/// refines them afterwards. A mild ridge that keeps the projection well-posed
/// under a near-rank-deficient covariate design is therefore preferable to the
/// tighter floor used for the production inner solve.
pub const WARMSTART_PROJECTION_RIDGE_FLOOR: f64 = 1e-8;