Skip to main content

gam_solve/rho_optimizer/
run.rs

1use super::*;
2
3pub(crate) const OPERATOR_TRUST_RESTART_RADIUS_FLOOR: f64 = 1.0e-6;
4
5/// Configuration for the outer optimization runner.
6#[derive(Clone, Debug)]
7pub(crate) struct OuterConfig {
8    pub(crate) tolerance: f64,
9    /// Optional override for the *relative-cost-decrease* convergence stop,
10    /// decoupled from `tolerance`. `outer_gradient_tolerance` normally derives
11    /// BOTH the absolute projected-gradient floor (`max(tolerance, scale·1e-9)`)
12    /// AND the relative-cost stop (`rel_cost = tolerance`) from the single
13    /// `tolerance`. That conflation forces a caller who needs a *tight absolute
14    /// floor* (to resolve λ to the genuine REML optimum at large `n`, where the
15    /// floor is `scale·1e-9`) to also accept a *tight rel-cost stop*, which on a
16    /// flat REML ridge never trips and grinds the optimizer to `max_iter` —
17    /// dozens of surplus O(D·p³) Laplace-derivative outer iterations (the #1082
18    /// multinomial smooth-by-factor wall-clock blow-up). When `Some(r)`, the
19    /// rel-cost stop uses `r` while the absolute floor keeps using `tolerance`
20    /// via `objective_scale`, so accuracy (absolute floor) and perf (loose
21    /// rel-cost) are selected independently. `None` preserves the legacy coupling
22    /// (`rel_cost = tolerance`) for every existing path byte-for-byte.
23    pub(crate) rel_cost_tolerance: Option<f64>,
24    pub(crate) max_iter: usize,
25    pub(crate) bounds: Option<(Array1<f64>, Array1<f64>)>,
26    pub(crate) seed_config: gam_problem::SeedConfig,
27    pub(crate) rho_bound: f64,
28    pub(crate) heuristic_lambdas: Option<Vec<f64>>,
29    pub(crate) initial_rho: Option<Array1<f64>>,
30    pub(crate) fallback_policy: FallbackPolicy,
31    pub(crate) screening_cap: Option<Arc<AtomicUsize>>,
32    pub(crate) screen_initial_rho: bool,
33    /// Outer-aware inner-PIRLS iteration cap (sibling of `screening_cap`).
34    /// When set, the BFGS bridge drives this atomic on every accepted
35    /// gradient eval to coarsen the inner Newton solve at early outer iters
36    /// (when ρ is far from converged) and lift it back to full as
37    /// convergence approaches. Distinct from `screening_cap` in that it
38    /// does NOT suppress cache writes / warm-start updates / KKT
39    /// enforcement; it is purely a budget. See
40    /// `RemlObjectiveState::outer_inner_cap` for dual-cap semantics.
41    pub(crate) outer_inner_cap: Option<InnerProgressFeedback>,
42    pub(crate) operator_initial_trust_radius: Option<f64>,
43    pub(crate) arc_initial_regularization: Option<f64>,
44    /// Optional scale factor for the objective's natural magnitude.
45    /// Used to widen the absolute gradient-norm floor on objectives whose
46    /// gradient lives on a non-unit scale (e.g. Gaussian-identity REML at
47    /// large `n`, whose ∂/∂logλ inherits the O(n) likelihood constant).
48    /// `None` falls back to the bare `tolerance` floor.
49    pub(crate) objective_scale: Option<f64>,
50    /// BFGS line-search infinity-norm cap applied to the leading `rho_dim`
51    /// outer parameters (log-λ axes). Documented natural step for
52    /// `log(lambda)` is ≈ 5 (`e^5 ≈ 148`-fold smoothing-parameter change
53    /// per accepted outer iter — matches typical quasi-Newton direction
54    /// magnitude on flat REML surfaces). Setting this `None` disables the
55    /// rho-axis cap entirely.
56    pub(crate) bfgs_step_cap: Option<f64>,
57    /// BFGS line-search infinity-norm cap applied to the trailing `psi_dim`
58    /// outer parameters (kappa / aniso-log-scale axes). Required because
59    /// the kernel scale axes need much tighter control (`e^1 ≈ 2.7`-fold
60    /// per iter is plenty) — using the rho-axis cap here lets the optimizer
61    /// jump kappa by orders of magnitude per step and oscillate. Setting
62    /// this `None` disables the psi-axis cap.
63    pub(crate) bfgs_step_cap_psi: Option<f64>,
64    /// Optional persistent-cache session. When `Some`, every finite objective
65    /// evaluation is written through to disk (rate-limited, atomic-rename)
66    /// and the best on-disk rho is prepended as a seed at the start of each
67    /// plan attempt. Defaulted off so test-only paths skip filesystem I/O.
68    pub(crate) cache_session: Option<Arc<CacheSession>>,
69    /// Optional mirror cache sessions. Checkpoints and successful finalize
70    /// writes are also written to each of these sessions (different keys,
71    /// shared store). Used for hierarchical broadcast: the current best ρ is
72    /// written to the exact-key (primary) AND the data-independent
73    /// seed-prefix key so the next fit with related structure can warm-start
74    /// from this one, even after an interrupted run.
75    pub(crate) cache_mirror_sessions: Vec<Arc<CacheSession>>,
76    pub(crate) rho_uncertainty_problem_size: crate::rho_uncertainty::RhoUncertaintyProblemSize,
77    /// Set by the persistent-cache resume path (`run`) when the outer seed
78    /// originates from a warm-start cache *hit* — i.e. `config.initial_rho`
79    /// (and, since 0.1.204, the inner β) was populated from a prior fit's
80    /// persisted near-optimal iterate (`CacheSeedDecision::Seed`). On a hit
81    /// the continuation pre-warm — which exists to anneal a COLD seed toward
82    /// the optimum — is redundant work (the seed is already near-optimal), so
83    /// the pre-warm step budget is dropped to zero and the run proceeds
84    /// straight to the BFGS/Newton certificate from the cached iterate. The
85    /// converged optimum is unchanged: warm start only sets the STARTING
86    /// point. Defaulted `false`, so every cold-start / no-cache path keeps its
87    /// existing continuation pre-warm budget byte-for-byte.
88    pub(crate) warm_start_cache_hit: bool,
89    /// Converged exact outer Hessian `H(θ̂)` transferred from a prior
90    /// structurally-matching fit via the persistent cache (a warm-start *hit*),
91    /// in the full θ layout. When present and SPD, the BFGS host path seeds its
92    /// iter-0 metric with `InitialMetric::DenseInverseHessian(H⁻¹)` so the first
93    /// outer step is quasi-Newton instead of unscaled steepest descent — the
94    /// dominant LOSO line-search-bracketing cost (each bracketing probe is a
95    /// full inner joint-Newton re-solve). Strictly stronger than the scalar
96    /// `1/‖g₀‖` metric: it carries the full anisotropic curvature, which across
97    /// folds (one held-out point) is nearly identical to this fold's. Never
98    /// changes the converged optimum — BFGS reaches `∇V=0` under any SPD initial
99    /// metric. `None` on every cold-start / no-cache / pre-Hessian-schema path,
100    /// which falls back to the scalar warm metric byte-for-byte.
101    pub(crate) warm_start_outer_hessian: Option<Array2<f64>>,
102    /// Per-ρ-coordinate structural keys, in the objective's NATIVE (formula)
103    /// coordinate order, used to make the outer smoothing-parameter search
104    /// invariant to the order the user wrote the smooth terms / tensor margins
105    /// (#1538/#1539).
106    ///
107    /// When `Some` and the keys induce a non-identity canonical permutation,
108    /// [`run_outer`] reorders the coordinate layout the optimizer sees into a
109    /// stable canonical order (derived purely from the keys, never from the
110    /// native position) before seeding/optimizing, and inverts the permutation
111    /// on the returned ρ / gradient / Hessian so the caller still receives the
112    /// native layout. Seeding, multistart and tie-breaking then all operate on
113    /// the identical canonical layout for every term order, so both orders
114    /// reach the same λ̂ and the same fitted surface. `None` (or an identity
115    /// permutation) leaves the legacy native-order path byte-for-byte unchanged.
116    pub(crate) rho_canonical_keys: Option<Vec<u64>>,
117}
118
119impl Default for OuterConfig {
120    fn default() -> Self {
121        Self {
122            tolerance: 1e-5,
123            rel_cost_tolerance: None,
124            max_iter: 200,
125            bounds: None,
126            seed_config: gam_problem::SeedConfig::default(),
127            rho_bound: 30.0,
128            heuristic_lambdas: None,
129            initial_rho: None,
130            fallback_policy: FallbackPolicy::Automatic,
131            screening_cap: None,
132            screen_initial_rho: false,
133            outer_inner_cap: None,
134            operator_initial_trust_radius: None,
135            arc_initial_regularization: None,
136            objective_scale: None,
137            bfgs_step_cap: None,
138            bfgs_step_cap_psi: None,
139            cache_session: None,
140            cache_mirror_sessions: Vec::new(),
141            rho_uncertainty_problem_size:
142                crate::rho_uncertainty::RhoUncertaintyProblemSize::default(),
143            warm_start_cache_hit: false,
144            warm_start_outer_hessian: None,
145            rho_canonical_keys: None,
146        }
147    }
148}
149
150// ─── OuterProblem builder ─────────────────────────────────────────────
151//
152// Declarative builder for outer optimization problems.  Derives
153// OuterCapability flags from high-level inputs (gradient/hessian
154// availability, psi dimension, EFS eligibility) so call sites never
155// hand-copy capability flags.
156
157/// Declarative outer-problem builder.  Produces both the
158/// [`OuterCapability`] (what the objective can provide) and the
159/// [`OuterConfig`] (how the runner should behave) from a small set
160/// of high-level declarations.
161pub struct OuterProblem {
162    n_params: usize,
163    gradient: Derivative,
164    hessian: DeclaredHessianForm,
165    prefer_gradient_only: bool,
166    disable_fixed_point: bool,
167    psi_dim: usize,
168    barrier_config: Option<BarrierConfig>,
169    tolerance: f64,
170    rel_cost_tolerance: Option<f64>,
171    max_iter: usize,
172    bounds: Option<(Array1<f64>, Array1<f64>)>,
173    rho_bound: f64,
174    seed_config: gam_problem::SeedConfig,
175    heuristic_lambdas: Option<Vec<f64>>,
176    initial_rho: Option<Array1<f64>>,
177    fallback_policy: FallbackPolicy,
178    screening_cap: Option<Arc<AtomicUsize>>,
179    screen_initial_rho: bool,
180    outer_inner_cap: Option<InnerProgressFeedback>,
181    operator_initial_trust_radius: Option<f64>,
182    arc_initial_regularization: Option<f64>,
183    objective_scale: Option<f64>,
184    bfgs_step_cap: Option<f64>,
185    bfgs_step_cap_psi: Option<f64>,
186    cache_session: Option<Arc<CacheSession>>,
187    cache_mirror_sessions: Vec<Arc<CacheSession>>,
188    rho_uncertainty_problem_size: crate::rho_uncertainty::RhoUncertaintyProblemSize,
189    continuation_prewarm: bool,
190    rho_canonical_keys: Option<Vec<u64>>,
191}
192
193impl OuterProblem {
194    pub fn new(n_params: usize) -> Self {
195        Self {
196            n_params,
197            gradient: Derivative::Unavailable,
198            hessian: DeclaredHessianForm::Unavailable,
199            prefer_gradient_only: false,
200            disable_fixed_point: false,
201            psi_dim: 0,
202            barrier_config: None,
203            tolerance: 1e-5,
204            rel_cost_tolerance: None,
205            max_iter: 200,
206            bounds: None,
207            rho_bound: 30.0,
208            seed_config: gam_problem::SeedConfig::default(),
209            heuristic_lambdas: None,
210            initial_rho: None,
211            fallback_policy: FallbackPolicy::Automatic,
212            screening_cap: None,
213            screen_initial_rho: false,
214            outer_inner_cap: None,
215            operator_initial_trust_radius: None,
216            arc_initial_regularization: None,
217            objective_scale: None,
218            bfgs_step_cap: None,
219            bfgs_step_cap_psi: None,
220            cache_session: None,
221            cache_mirror_sessions: Vec::new(),
222            rho_uncertainty_problem_size:
223                crate::rho_uncertainty::RhoUncertaintyProblemSize::default(),
224            continuation_prewarm: true,
225            rho_canonical_keys: None,
226        }
227    }
228
229    /// Supply per-ρ-coordinate structural keys (native/formula order) so the
230    /// outer search is canonicalized to be invariant to the order the smooth
231    /// terms / tensor margins were written (#1538/#1539). See
232    /// [`OuterConfig::rho_canonical_keys`].
233    pub fn with_rho_canonical_keys(mut self, keys: Option<Vec<u64>>) -> Self {
234        self.rho_canonical_keys = keys;
235        self
236    }
237
238    pub fn with_gradient(mut self, d: Derivative) -> Self {
239        self.gradient = d;
240        self
241    }
242    pub fn with_hessian(mut self, form: DeclaredHessianForm) -> Self {
243        self.hessian = form;
244        self
245    }
246    pub fn with_prefer_gradient_only(mut self, prefer_gradient_only: bool) -> Self {
247        self.prefer_gradient_only = prefer_gradient_only;
248        self
249    }
250    /// Forbid the planner from selecting EFS/HybridEfs, even when the
251    /// objective implements `eval_efs()` and the coordinate structure would
252    /// otherwise make pure/hybrid EFS eligible.
253    ///
254    /// Callers use this for families where the Wood-Fasiolo structural
255    /// property is known not to hold (e.g. GAMLSS/location-scale with
256    /// β-dependent joint Hessian), so EFS would stagnate and burn budget
257    /// before the automatic cascade falls back to gradient-based BFGS.
258    pub fn with_disable_fixed_point(mut self, disable: bool) -> Self {
259        self.disable_fixed_point = disable;
260        self
261    }
262    // MEASURE-JET ψ REGISTRATION: the engine below is already complete for a
263    // 3-coordinate measure-jet ψ group (s, α, ln τ) — `psi_dim` is generic,
264    // `with_bounds` carries the s ∈ (0, 2) box (the same convention matern κ
265    // uses for its log-κ window; no logistic reparameterization exists or is
266    // needed in-house), `with_bfgs_step_cap_psi` caps per-iteration ψ moves,
267    // and `DirectionalHyperParam::new_compact` (solver/reml/mod.rs) carries
268    // penalty-only first/second/cross jets with `is_penalty_like`
269    // auto-derived from the identically-zero design drift (∂X/∂ψ ≡ 0).
270    // Every remaining registration arm is formula-layer dispatch in
271    // src/terms/smooth.rs (eligibility in
272    // `spatial_term_supports_hyper_optimization`, dims in
273    // `spatial_dims_per_term`, seed/bounds/write-back on
274    // `SpatialLogKappaCoords`, the per-trial rebuild in
275    // `apply_log_kappa_to_term`, and the derivative bundle in
276    // `try_build_spatial_term_log_kappa_derivative`, which currently returns
277    // `Ok(None)` for `SmoothBasisSpec::MeasureJet`) plus the
278    // `build_measure_jet_basis_psi_derivatives` producer in
279    // src/terms/basis/measure_jet_smooth.rs; both are owned by the
280    // measure-jet terms actor. Registration stays gated on those arms — do
281    // NOT add measure-jet-specific branches to this engine.
282    pub fn with_psi_dim(mut self, dim: usize) -> Self {
283        self.psi_dim = dim;
284        self
285    }
286    pub fn with_barrier(mut self, cfg: Option<BarrierConfig>) -> Self {
287        self.barrier_config = cfg;
288        self
289    }
290    pub fn with_tolerance(mut self, tol: f64) -> Self {
291        self.tolerance = tol;
292        self
293    }
294    pub fn with_max_iter(mut self, n: usize) -> Self {
295        self.max_iter = n;
296        self
297    }
298    pub fn with_bounds(mut self, lo: Array1<f64>, hi: Array1<f64>) -> Self {
299        self.bounds = Some((lo, hi));
300        self
301    }
302    pub fn with_rho_bound(mut self, b: f64) -> Self {
303        self.rho_bound = b;
304        self
305    }
306    pub fn with_seed_config(mut self, sc: gam_problem::SeedConfig) -> Self {
307        self.seed_config = sc;
308        self
309    }
310    pub fn with_heuristic_lambdas(mut self, h: Vec<f64>) -> Self {
311        self.heuristic_lambdas = Some(h);
312        self
313    }
314    pub fn with_initial_rho(mut self, rho: Array1<f64>) -> Self {
315        self.initial_rho = Some(rho);
316        self
317    }
318    /// Toggle the generic rho-continuation seed pre-warm. This does not affect
319    /// objectives that require an explicit continuation path; it only controls
320    /// the cheap-by-default pre-pass gated by `allow_continuation_prewarm()`.
321    pub fn with_continuation_prewarm(mut self, enabled: bool) -> Self {
322        self.continuation_prewarm = enabled;
323        self
324    }
325    pub fn with_screening_cap(mut self, screening_cap: Arc<AtomicUsize>) -> Self {
326        self.screening_cap = Some(screening_cap);
327        self
328    }
329    /// Allow seed screening to rank the explicit initial rho against generated
330    /// candidates even when the effective seed budget is one. The default keeps
331    /// a user-provided initial point authoritative and avoids a separate
332    /// screening pass.
333    pub fn with_screen_initial_rho(mut self, screen_initial_rho: bool) -> Self {
334        self.screen_initial_rho = screen_initial_rho;
335        self
336    }
337    /// Wire the bidirectional inner-PIRLS feedback channel.
338    ///
339    /// The outer bridge writes a coarsened iteration cap into
340    /// `feedback.cap` on every accepted gradient/Hessian eval; the inner
341    /// solver writes back into `feedback.last_iters` /
342    /// `feedback.last_converged` after each non-screening solve so the
343    /// next outer iter's schedule can adapt to the inner solver's
344    /// actual convergence behavior. Typical caller passes
345    /// `InnerProgressFeedback {
346    ///     cap: Arc::clone(&reml_state.outer_inner_cap),
347    ///     last_iters: Arc::clone(&reml_state.last_inner_iters),
348    ///     last_converged: Arc::clone(&reml_state.last_inner_converged),
349    /// }` so the inner and outer observe the same atomics.
350    pub fn with_outer_inner_cap(mut self, feedback: InnerProgressFeedback) -> Self {
351        self.outer_inner_cap = Some(feedback);
352        self
353    }
354    pub fn with_operator_initial_trust_radius(mut self, radius: Option<f64>) -> Self {
355        self.operator_initial_trust_radius = sanitized_operator_trust_restart_radius(radius);
356        self
357    }
358
359    /// Override the ARC initial cubic-regularization parameter sigma
360    /// (default in `opt`: 1.0). Smaller sigma → less cubic penalty on the
361    /// first step → larger first move on benign objectives. The matrix-
362    /// free Newton-TR analog is `with_operator_initial_trust_radius`.
363    ///
364    /// Used by Gaussian-identity REML at large-scale n: the objective is
365    /// quadratic-like in log-λ near the optimum (sigma is the right
366    /// scale), and log-λ moves of 2–4 units in the early iters
367    /// otherwise burn 4–8 iters of trust-region expansion before the
368    /// model trusts the analytic Hessian.
369    pub fn with_arc_initial_regularization(mut self, sigma: Option<f64>) -> Self {
370        self.arc_initial_regularization = sigma.filter(|v| v.is_finite() && *v > 0.0);
371        self
372    }
373
374    /// Set the objective's natural magnitude scale, used to derive an
375    /// `n`-aware absolute gradient-norm floor. When set to `Some(s)`,
376    /// the runner uses `abs_floor = max(tol, s * 1e-9)` for the
377    /// projected-gradient convergence check.
378    ///
379    /// Rationale: a fixed `abs = tol` (e.g. 1e-6) is appropriate when the
380    /// objective and its gradient live on a unit scale, but Gaussian-
381    /// identity REML carries an O(n) likelihood constant that flows into
382    /// ∂/∂logλ. At large-scale n the floor becomes binding even when the
383    /// relative-from-seed component (`rel_initial_grad * ‖g0‖`) declared
384    /// convergence iters earlier — chasing sub-ULP changes in log-λ at
385    /// the cost of repeated k²·n·p² analytic-Hessian assemblies.
386    pub fn with_objective_scale(mut self, scale: Option<f64>) -> Self {
387        self.objective_scale = scale.filter(|v| v.is_finite() && *v > 0.0);
388        self
389    }
390
391    /// Decouple the *relative-cost-decrease* convergence stop from the
392    /// absolute projected-gradient floor. By default both are derived from the
393    /// single `with_tolerance` value (`abs = max(tol, scale·1e-9)`,
394    /// `rel_cost = tol`). Supplying `Some(r)` here makes the rel-cost stop use
395    /// `r` while the absolute floor keeps using `tolerance` (so a caller can
396    /// keep a tight absolute floor for accuracy at large `n` AND a loose
397    /// rel-cost stop for perf on a flat REML ridge — see #1082). `None` keeps
398    /// the legacy coupling.
399    pub fn with_rel_cost_tolerance(mut self, rel_cost: Option<f64>) -> Self {
400        self.rel_cost_tolerance = rel_cost.filter(|v| v.is_finite() && *v > 0.0);
401        self
402    }
403
404    /// Cap the infinity-norm displacement of BFGS cost-only line-search probes
405    /// on the **rho axes** (the first `n_params - psi_dim` outer parameters,
406    /// = log-λ). Also scales the initial inverse metric so the first trial
407    /// direction respects the same local budget coordinate-wise. Documented
408    /// natural step on log-λ is ≈ 5; tighter values throttle BFGS and starve
409    /// convergence on flat REML valleys.
410    pub fn with_bfgs_step_cap(mut self, cap: Option<f64>) -> Self {
411        self.bfgs_step_cap = cap.filter(|v| v.is_finite() && *v > 0.0);
412        self
413    }
414
415    /// Cap the infinity-norm displacement of BFGS cost-only line-search probes
416    /// on the **psi axes** (the trailing `psi_dim` outer parameters, = kappa
417    /// or anisotropic log-scales). Mirrors [`Self::with_bfgs_step_cap`] but
418    /// scoped to kernel-scale parameters whose natural step is much smaller
419    /// than log-λ (≈ ln 2 per iter keeps kappa from oscillating). Without
420    /// this split, a uniform rho-scale cap lets psi explode while a uniform
421    /// psi-scale cap throttles rho — both fail the survival-marginal-slope
422    /// path at large scale, where rho needs |d|≈5 while psi wants |d|≤1.
423    pub fn with_bfgs_step_cap_psi(mut self, cap: Option<f64>) -> Self {
424        self.bfgs_step_cap_psi = cap.filter(|v| v.is_finite() && *v > 0.0);
425        self
426    }
427
428    pub fn with_cache_session(mut self, session: Arc<CacheSession>) -> Self {
429        self.cache_session = Some(session);
430        self
431    }
432
433    /// Attach mirror cache sessions that receive a broadcast copy of
434    /// the final-result finalize write. See
435    /// [`OuterConfig::cache_mirror_sessions`].
436    pub fn with_cache_mirror_sessions(mut self, sessions: Vec<Arc<CacheSession>>) -> Self {
437        self.cache_mirror_sessions = sessions;
438        self
439    }
440
441    pub fn with_problem_size(mut self, n_obs: usize, p_coefficients: usize) -> Self {
442        self.rho_uncertainty_problem_size = crate::rho_uncertainty::RhoUncertaintyProblemSize {
443            n_obs: Some(n_obs),
444            p_coefficients: Some(p_coefficients),
445        };
446        self
447    }
448
449    /// Override the fallback policy. Default is [`FallbackPolicy::Automatic`].
450    ///
451    /// Set [`FallbackPolicy::Disabled`] when the caller requires the primary
452    /// plan to stand on its own. Exact-Hessian objectives use this to ensure
453    /// failures surface on the analytic geometry instead of being reinterpreted
454    /// by a different optimizer class.
455    pub fn with_fallback_policy(mut self, policy: FallbackPolicy) -> Self {
456        self.fallback_policy = policy;
457        self
458    }
459
460    /// Derive the capability flags from the builder state.
461    /// `fixed_point_available` is set to `false` here; `build_objective`
462    /// overrides it based on whether an EFS closure is actually provided.
463    fn capability(&self) -> OuterCapability {
464        OuterCapability {
465            gradient: self.gradient,
466            hessian: self.hessian,
467            prefer_gradient_only: self.prefer_gradient_only,
468            disable_fixed_point: self.disable_fixed_point,
469            n_params: self.n_params,
470            psi_dim: self.psi_dim,
471            fixed_point_available: false,
472            barrier_config: self.barrier_config.clone(),
473        }
474    }
475
476    /// Derive the runner configuration from the builder state.
477    pub(crate) fn config(&self) -> OuterConfig {
478        OuterConfig {
479            tolerance: self.tolerance,
480            rel_cost_tolerance: self.rel_cost_tolerance,
481            max_iter: self.max_iter,
482            bounds: self.bounds.clone(),
483            seed_config: self.seed_config,
484            rho_bound: self.rho_bound,
485            heuristic_lambdas: self.heuristic_lambdas.clone(),
486            initial_rho: self.initial_rho.clone(),
487            fallback_policy: self.fallback_policy,
488            screening_cap: self.screening_cap.clone(),
489            screen_initial_rho: self.screen_initial_rho,
490            outer_inner_cap: self.outer_inner_cap.clone(),
491            operator_initial_trust_radius: self.operator_initial_trust_radius,
492            arc_initial_regularization: self.arc_initial_regularization,
493            objective_scale: self.objective_scale,
494            bfgs_step_cap: self.bfgs_step_cap,
495            bfgs_step_cap_psi: self.bfgs_step_cap_psi,
496            cache_session: self.cache_session.clone(),
497            cache_mirror_sessions: self.cache_mirror_sessions.clone(),
498            rho_uncertainty_problem_size: self.rho_uncertainty_problem_size,
499            // Cold by construction. The persistent-cache resume path in `run`
500            // flips this to `true` only after a warm-start cache *hit* installs
501            // a near-optimal seed; every other entry point keeps the cold-start
502            // continuation pre-warm budget byte-for-byte.
503            warm_start_cache_hit: false,
504            // Populated only by the persistent-cache resume path in `run` after
505            // a warm-start hit decodes a converged outer Hessian; cold by
506            // construction here, like `warm_start_cache_hit`.
507            warm_start_outer_hessian: None,
508            rho_canonical_keys: self.rho_canonical_keys.clone(),
509        }
510    }
511
512    /// Construct a [`ClosureObjective`] with capability flags derived from the
513    /// builder state **and** the closures actually provided.
514    ///
515    /// `fixed_point_available` is set to `true` when `efs_fn` is `Some`,
516    /// regardless of whether `.with_efs()` was called.  This is the canonical
517    /// way to create production objectives — it eliminates the drift risk of
518    /// manually entering capability flags.
519    pub fn build_objective<S, Fc, Fe, Fr, Fefs>(
520        &self,
521        state: S,
522        cost_fn: Fc,
523        eval_fn: Fe,
524        reset_fn: Option<Fr>,
525        efs_fn: Option<Fefs>,
526    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs>
527    where
528        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
529        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
530        Fr: FnMut(&mut S),
531        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
532    {
533        let mut cap = self.capability();
534        // Derive fixed_point_available from whether the caller actually
535        // provided an EFS hook, rather than relying on manual flags.
536        cap.fixed_point_available = efs_fn.is_some();
537        ClosureObjective {
538            state,
539            cap,
540            cost_fn,
541            eval_fn,
542            eval_order_fn: None,
543            reset_fn,
544            efs_fn,
545            screening_proxy_fn: None::<fn(&mut S, &Array1<f64>) -> Result<f64, EstimationError>>,
546            seed_fn: None::<fn(&mut S, &Array1<f64>) -> Result<SeedOutcome, EstimationError>>,
547            continuation_prewarm: self.continuation_prewarm,
548        }
549    }
550
551    /// Construct a [`ClosureObjective`] with an order-aware evaluation hook.
552    ///
553    /// This lets the runner request first-order vs second-order work based on
554    /// the active outer plan while preserving the legacy eager `eval_fn`.
555    pub fn build_objective_with_eval_order<S, Fc, Fe, Feo, Fr, Fefs>(
556        &self,
557        state: S,
558        cost_fn: Fc,
559        eval_fn: Fe,
560        eval_order_fn: Feo,
561        reset_fn: Option<Fr>,
562        efs_fn: Option<Fefs>,
563    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs, Feo>
564    where
565        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
566        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
567        Feo: FnMut(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
568        Fr: FnMut(&mut S),
569        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
570    {
571        let mut cap = self.capability();
572        cap.fixed_point_available = efs_fn.is_some();
573        ClosureObjective {
574            state,
575            cap,
576            cost_fn,
577            eval_fn,
578            eval_order_fn: Some(eval_order_fn),
579            reset_fn,
580            efs_fn,
581            screening_proxy_fn: None::<fn(&mut S, &Array1<f64>) -> Result<f64, EstimationError>>,
582            seed_fn: None::<fn(&mut S, &Array1<f64>) -> Result<SeedOutcome, EstimationError>>,
583            continuation_prewarm: self.continuation_prewarm,
584        }
585    }
586
587    /// Construct a [`ClosureObjective`] with both an order-aware evaluation
588    /// hook and a custom seed-screening ranking proxy. The proxy fires only
589    /// when the cascade in `rank_seeds_with_screening` calls it; outside
590    /// screening the regular cost path is unaffected.
591    pub fn build_objective_with_screening_proxy<S, Fc, Fe, Feo, Fr, Fefs, Fsp>(
592        &self,
593        state: S,
594        cost_fn: Fc,
595        eval_fn: Fe,
596        eval_order_fn: Feo,
597        reset_fn: Option<Fr>,
598        efs_fn: Option<Fefs>,
599        screening_proxy_fn: Fsp,
600    ) -> ClosureObjective<S, Fc, Fe, Fr, Fefs, Feo, Fsp>
601    where
602        Fc: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
603        Fe: FnMut(&mut S, &Array1<f64>) -> Result<OuterEval, EstimationError>,
604        Feo: FnMut(&mut S, &Array1<f64>, OuterEvalOrder) -> Result<OuterEval, EstimationError>,
605        Fr: FnMut(&mut S),
606        Fefs: FnMut(&mut S, &Array1<f64>) -> Result<EfsEval, EstimationError>,
607        Fsp: FnMut(&mut S, &Array1<f64>) -> Result<f64, EstimationError>,
608    {
609        let mut cap = self.capability();
610        cap.fixed_point_available = efs_fn.is_some();
611        ClosureObjective {
612            state,
613            cap,
614            cost_fn,
615            eval_fn,
616            eval_order_fn: Some(eval_order_fn),
617            reset_fn,
618            efs_fn,
619            screening_proxy_fn: Some(screening_proxy_fn),
620            seed_fn: None::<fn(&mut S, &Array1<f64>) -> Result<SeedOutcome, EstimationError>>,
621            continuation_prewarm: self.continuation_prewarm,
622        }
623    }
624
625    /// Run the outer optimization with a given objective.
626    pub fn run(
627        &self,
628        obj: &mut dyn OuterObjective,
629        context: &str,
630    ) -> Result<OuterResult, EstimationError> {
631        let mut config = self.config();
632        let Some(session) = config.cache_session.clone() else {
633            return run_outer(obj, &config, context);
634        };
635        let key_hex = session.key().to_hex();
636        let short_key = &key_hex[..8.min(key_hex.len())];
637        let mut had_hit = false;
638        let mut cached_inner_beta: Option<Array1<f64>> = None;
639        if let Some(loaded) = session.try_load_with_source() {
640            match classify_cache_entry_for_outer(&loaded, self.n_params) {
641                CacheSeedDecision::ExactFinal {
642                    rho,
643                    beta: _beta_final,
644                    final_value,
645                    iterations,
646                    prior_obj_display,
647                } => {
648                    let cap = primary_capability_for_config(obj.capability(), &config, context);
649                    let plan_used = plan(&cap);
650                    log::info!(
651                        "[CACHE] final-hit key={}.. context={} rho_dim={} prior_obj={:.6e} iter={} action=skip-outer-validation",
652                        short_key,
653                        context,
654                        rho.len(),
655                        prior_obj_display,
656                        iterations,
657                    );
658                    let mut result =
659                        OuterResult::new(rho, final_value, iterations, true, plan_used);
660                    result.rho_uncertainty_diagnostic = Some(compute_rho_uncertainty_diagnostic(
661                        obj,
662                        &config,
663                        context,
664                        &mut result,
665                    ));
666                    return Ok(result);
667                }
668                CacheSeedDecision::Seed {
669                    rho,
670                    beta,
671                    hessian,
672                    prior_obj_display,
673                    iteration,
674                } => {
675                    let beta_len = beta.len();
676                    let beta_arr = if beta.is_empty() {
677                        None
678                    } else {
679                        Some(Array1::from_vec(beta))
680                    };
681                    // Adopt the transferred converged outer Hessian only when it
682                    // matches this fit's full-θ dimension; a dimension drift
683                    // (structural change the cache key did not capture) falls
684                    // back to the scalar warm metric in run_plan.
685                    config.warm_start_outer_hessian = hessian.and_then(|(dim, flat)| {
686                        if dim == self.n_params && flat.len() == dim * dim {
687                            Array2::from_shape_vec((dim, dim), flat).ok()
688                        } else {
689                            None
690                        }
691                    });
692                    if config
693                        .initial_rho
694                        .as_ref()
695                        .is_none_or(|initial| initial != rho)
696                    {
697                        log::info!(
698                            "[CACHE] hit  key={}.. context={} rho_dim={} beta_dim={} prior_obj={:.6e} iter={}",
699                            short_key,
700                            context,
701                            rho.len(),
702                            beta_len,
703                            prior_obj_display,
704                            iteration,
705                        );
706                        config.initial_rho = Some(rho);
707                        config.screen_initial_rho = false;
708                        had_hit = true;
709                    } else {
710                        log::info!(
711                            "[CACHE] hit  key={}.. context={} rho_dim={} beta_dim={} already-aligned prior_obj={:.6e}",
712                            short_key,
713                            context,
714                            rho.len(),
715                            beta_len,
716                            prior_obj_display,
717                        );
718                        had_hit = true;
719                    }
720                    cached_inner_beta = beta_arr;
721                }
722                CacheSeedDecision::Discard {
723                    reason: "payload-shape-mismatch",
724                    ..
725                } => {
726                    log::info!(
727                        "[CACHE] skip key={}.. context={} reason=payload-shape-mismatch n_params={}",
728                        short_key,
729                        context,
730                        self.n_params,
731                    );
732                }
733                CacheSeedDecision::Discard {
734                    reason,
735                    prior_obj_display,
736                    all_rho_finite,
737                } => {
738                    log::info!(
739                        "[CACHE] skip key={}.. context={} reason={} prior_obj={:.6e} all_rho_finite={}",
740                        short_key,
741                        context,
742                        reason,
743                        prior_obj_display,
744                        all_rho_finite.unwrap_or(false),
745                    );
746                }
747            }
748        } else {
749            log::info!(
750                "[CACHE] miss key={}.. context={} reason=fresh-fingerprint n_params={}",
751                short_key,
752                context,
753                self.n_params,
754            );
755        }
756        // Propagate the warm-start cache-hit signal into the config the runner
757        // sees. On a hit the installed seed (ρ, and since 0.1.204 the inner β)
758        // is already near-optimal, so the continuation pre-warm — which exists
759        // purely to anneal a COLD seed — is redundant and is skipped downstream
760        // (`run_plan::continuation_prewarm_step_budget`). The outer BFGS/Newton
761        // still runs to its REML/KKT certificate, so the optimum is identical.
762        config.warm_start_cache_hit = had_hit;
763        let mut checkpointing = CheckpointingObjective::new(
764            obj,
765            Arc::clone(&session),
766            config.cache_mirror_sessions.clone(),
767        );
768        // Inject the cached inner β (when present) so the family's PIRLS
769        // opens at the prior converged iterate. Families that don't expose
770        // a β slot inherit the trait's no-op default and silently ignore
771        // the hint — that's a ρ-only resume, identical to the pre-β-cache
772        // behavior, but never a regression. Families that DO expose β
773        // (PIRLS-based GAMs, custom-family marginal slope, …) override
774        // `seed_inner_state` to install β before the first eval.
775        if let Some(beta) = cached_inner_beta.as_ref() {
776            match checkpointing.seed_inner_state(beta) {
777                Ok(SeedOutcome::Installed) => log::info!(
778                    "[CACHE] beta-warm key={}.. context={} beta_dim={} action=installed",
779                    short_key,
780                    context,
781                    beta.len(),
782                ),
783                Ok(SeedOutcome::NoSlot) => log::warn!(
784                    "[CACHE] beta-warm key={}.. context={} beta_dim={} action=skip \
785                     reason=objective_has_no_inner_beta_slot",
786                    short_key,
787                    context,
788                    beta.len(),
789                ),
790                Ok(SeedOutcome::Incompatible) => log::info!(
791                    // Not a warning: a row-relaxed cross-fit prefix seed
792                    // (`cache_seed_key`) legitimately carries a β whose length
793                    // reflects the PARENT fold's realized basis rank, which
794                    // differs from this fold's per-block widths. The ρ seed is
795                    // kept (already installed above); cross-length β transfer is
796                    // the gauge-projected FitArtifact channel's job. This is a
797                    // clean ρ-only resume, NOT a regression to full cold start.
798                    "[CACHE] beta-warm key={}.. context={} beta_dim={} action=rho-only \
799                     reason=seed_beta_length_incompatible_with_inner_blocks",
800                    short_key,
801                    context,
802                    beta.len(),
803                ),
804                Err(err) => log::warn!(
805                    "[CACHE] beta-warm key={}.. context={} beta_dim={} action=skip err={}",
806                    short_key,
807                    context,
808                    beta.len(),
809                    err,
810                ),
811            }
812        }
813        let result = run_outer(&mut checkpointing, &config, context);
814        // Pull the most-recent inner β surfaced by the inner solver so the
815        // finalize write encodes the (ρ, β) pair the BFGS optimum was
816        // actually fitted at, not a ρ-only seed that resumes at cold β.
817        let final_beta = checkpointing.last_inner_beta();
818        if let Ok(result) = result.as_ref()
819            && result.final_value.is_finite()
820            && result.converged
821            && let Some(bytes) = encode_iterate(
822                &result.rho,
823                final_beta.as_ref(),
824                result.final_hessian.as_ref(),
825                result.final_value,
826                result.iterations as u64,
827            )
828        {
829            let saved = session.finalize(
830                &bytes,
831                Some(result.final_value),
832                Some(result.iterations as u64),
833            );
834            if saved {
835                log::info!(
836                    "[CACHE] save key={}.. context={} final_obj={:.6e} iter={} resumed={}",
837                    short_key,
838                    context,
839                    result.final_value,
840                    result.iterations,
841                    had_hit,
842                );
843            }
844            // Broadcast finalize to mirror keys. The seed-prefix mirror
845            // exists so future fits with related-but-not-identical
846            // structure can warm-start from this run via the dispatcher's
847            // prefix lookup.
848            for mirror in &config.cache_mirror_sessions {
849                let mirror_saved = mirror.finalize(
850                    &bytes,
851                    Some(result.final_value),
852                    Some(result.iterations as u64),
853                );
854                if mirror_saved {
855                    let mirror_hex = mirror.key().to_hex();
856                    log::info!(
857                        "[CACHE] save key={}.. context={} mirror final_obj={:.6e} iter={}",
858                        &mirror_hex[..8.min(mirror_hex.len())],
859                        context,
860                        result.final_value,
861                        result.iterations,
862                    );
863                }
864            }
865        }
866        result
867    }
868}
869
870/// Result of a completed outer optimization.
871#[derive(Clone, Debug)]
872pub struct OuterResult {
873    /// Optimized log-smoothing parameters.
874    pub rho: Array1<f64>,
875    /// Final objective value.
876    pub final_value: f64,
877    /// Total outer iterations across all solver restarts.
878    pub iterations: usize,
879    /// Final gradient norm, when the solver computed an actual gradient.
880    pub final_grad_norm: Option<f64>,
881    /// Final gradient when the solver is gradient-based.
882    pub final_gradient: Option<Array1<f64>>,
883    /// Final Hessian when the solver tracks one.
884    pub final_hessian: Option<Array2<f64>>,
885    /// Whether the optimizer converged to a stationary point.
886    pub converged: bool,
887    /// Which plan was actually used (may differ from initial if fallback fired).
888    pub plan_used: OuterPlan,
889    /// Final trust radius for the internal operator trust-region solver.
890    ///
891    /// A non-converged operator-ARC attempt may be restarted by the budget
892    /// ladder. Restarting only from the last θ but resetting the trust radius
893    /// is not a warm start: it replays the same rejected large trial steps.
894    /// Carry this globalization state so retries resume from the scale the
895    /// previous attempt already learned.
896    pub operator_trust_radius: Option<f64>,
897    /// Why the internal operator trust-region solver stopped.
898    pub operator_stop_reason: Option<OperatorTrustRegionStopReason>,
899    /// First-order optimality self-audit at the returned point (#934).
900    ///
901    /// `None` when no analytic gradient was measured at termination
902    /// (gradient-free solvers, cache-hit short-circuits, per-atom EFS) or
903    /// when an audit probe failed to evaluate. Populated once by
904    /// [`run_outer`] after the solver ladder returns, outside all hot loops.
905    pub criterion_certificate: Option<CriterionCertificate>,
906    /// Post-fit PSIS diagnostic for whether sampled smoothing-parameter weights
907    /// show evidence that plug-in REML/LAML intervals are unreliable. Populated
908    /// once by [`run_outer`] when the exact rho Hessian is cheap enough to use.
909    pub rho_uncertainty_diagnostic: Option<crate::rho_uncertainty::RhoUncertaintyDiagnostic>,
910}
911
912impl OuterResult {
913    pub fn new(
914        rho: Array1<f64>,
915        final_value: f64,
916        iterations: usize,
917        converged: bool,
918        plan_used: OuterPlan,
919    ) -> Self {
920        Self {
921            rho,
922            final_value,
923            iterations,
924            final_grad_norm: None,
925            final_gradient: None,
926            final_hessian: None,
927            converged,
928            plan_used,
929            operator_trust_radius: None,
930            operator_stop_reason: None,
931            criterion_certificate: None,
932            rho_uncertainty_diagnostic: None,
933        }
934    }
935
936    /// Human-readable rendering of `final_grad_norm` for diagnostics. Returns
937    /// `"n/a"` when no gradient was measured (gradient-free / cache-hit paths).
938    pub fn final_grad_norm_report(&self) -> String {
939        match self.final_grad_norm {
940            Some(g) => format!("{g:.3e}"),
941            None => "n/a".to_string(),
942        }
943    }
944}
945
946// ─── First-order optimality certificate (#934) ────────────────────────
947//
948// The objective↔gradient desync bug genus (#748, #752, #808, #901, …) has a
949// universal signature: at the returned "optimum" the analytic gradient says
950// converged while a finite difference of the ACTUAL criterion value says
951// otherwise (or the optimizer stalls and rails λ). Every such bug was
952// diagnosed by a human running exactly that FD comparison by hand. The
953// certificate makes the engine run it on itself, once, at θ̂, on every fit:
954// two central-difference pairs of the VALUE path along one deterministic
955// random direction, compared against ∇F(θ̂)·v from the analytic path, plus
956// the two ancillary facts every desync postmortem asks for (is the outer
957// curvature PD here; did any λ rail to a bound). It is the runtime
958// enforcement layer for the criterion-atom architecture (#931): atoms make
959// desync structurally hard, the certificate makes any residue observable.
960//
961// Cost discipline: at most four value-path evaluations at the single final
962// point, outside every hot loop. The value path is evaluated through
963// `eval_cost` at θ̂±hv — points the gradient path never visited, so the
964// existing ρ-keyed caches naturally miss and the true value code runs.
965// Disagreement does not fail the fit: it names the broken criterion loudly
966// in the result, the log, and the report.
967
968/// Deterministic unit direction on the θ sphere for the certificate audit.
969///
970/// Seeded from the problem fingerprint (context string + θ̂ bits) via FNV-1a
971/// and expanded with SplitMix64 + Box–Muller — no clock, no global RNG, so
972/// the audit direction is reproducible across runs of the same fit.
973pub(crate) fn certificate_audit_direction(theta: &Array1<f64>, context: &str) -> Array1<f64> {
974    let mut seed: u64 = 0xcbf2_9ce4_8422_2325;
975    let mut fnv = |byte: u8| {
976        seed ^= u64::from(byte);
977        seed = seed.wrapping_mul(0x0000_0100_0000_01b3);
978    };
979    for byte in context.bytes() {
980        fnv(byte);
981    }
982    for &x in theta.iter() {
983        for byte in x.to_bits().to_le_bytes() {
984            fnv(byte);
985        }
986    }
987    let mut state = seed;
988    let mut next_unit = move || {
989        state = state.wrapping_add(0x9e37_79b9_7f4a_7c15);
990        let mut z = state;
991        z = (z ^ (z >> 30)).wrapping_mul(0xbf58_476d_1ce4_e5b9);
992        z = (z ^ (z >> 27)).wrapping_mul(0x94d0_49bb_1331_11eb);
993        z ^= z >> 31;
994        // Uniform in (0, 1): 53 mantissa bits, nudged off zero for the log.
995        ((z >> 11) as f64 + 0.5) / (1u64 << 53) as f64
996    };
997    let mut direction = Array1::<f64>::zeros(theta.len());
998    let mut i = 0;
999    while i < direction.len() {
1000        let (u1, u2) = (next_unit(), next_unit());
1001        let radius = (-2.0 * u1.ln()).sqrt();
1002        let angle = 2.0 * std::f64::consts::PI * u2;
1003        direction[i] = radius * angle.cos();
1004        if i + 1 < direction.len() {
1005            direction[i + 1] = radius * angle.sin();
1006        }
1007        i += 2;
1008    }
1009    let norm = direction.dot(&direction).sqrt();
1010    if norm.is_finite() && norm > f64::EPSILON {
1011        direction.mapv_inplace(|v| v / norm);
1012        direction
1013    } else {
1014        // Degenerate draw (probability ~0): fall back to the first axis.
1015        let mut fallback = Array1::<f64>::zeros(theta.len());
1016        fallback[0] = 1.0;
1017        fallback
1018    }
1019}
1020
1021/// Plain Cholesky positive-definiteness probe for the (small, outer-dim)
1022/// final Hessian. Returns `None` when the matrix is empty, non-square, or
1023/// non-finite; `Some(false)` on any non-positive pivot.
1024pub(crate) fn certificate_hessian_is_pd(hessian: &Array2<f64>) -> Option<bool> {
1025    let n = hessian.nrows();
1026    if n == 0 || hessian.ncols() != n || hessian.iter().any(|v| !v.is_finite()) {
1027        return None;
1028    }
1029    let mut chol = hessian.clone();
1030    for j in 0..n {
1031        for k in 0..j {
1032            let l_jk = chol[[j, k]];
1033            for i in j..n {
1034                chol[[i, j]] -= chol[[i, k]] * l_jk;
1035            }
1036        }
1037        let pivot = chol[[j, j]];
1038        if !(pivot > 0.0) || !pivot.is_finite() {
1039            return Some(false);
1040        }
1041        let inv_sqrt = 1.0 / pivot.sqrt();
1042        for i in j..n {
1043            chol[[i, j]] *= inv_sqrt;
1044        }
1045    }
1046    Some(true)
1047}
1048
1049/// Smoothing coordinates (leading ρ block) railed against the outer box.
1050pub(crate) fn certificate_railed_lambdas(
1051    rho: &Array1<f64>,
1052    rho_dim: usize,
1053    config: &OuterConfig,
1054) -> Vec<usize> {
1055    (0..rho_dim.min(rho.len()))
1056        .filter(|&k| {
1057            let (lo, hi) = match config.bounds.as_ref() {
1058                Some((lo, hi)) if k < lo.len() && k < hi.len() => (lo[k], hi[k]),
1059                Some(_) => return false,
1060                None => (-config.rho_bound, config.rho_bound),
1061            };
1062            (rho[k] - lo).abs() <= CERTIFICATE_RAIL_MARGIN
1063                || (hi - rho[k]).abs() <= CERTIFICATE_RAIL_MARGIN
1064        })
1065        .collect()
1066}
1067
1068/// Perform the randomized first-order self-audit at the returned optimum.
1069///
1070/// Requires an analytic final gradient (the thing being audited); returns
1071/// `None` — never an error — when the gradient is absent/non-finite or when
1072/// any of the four value probes fails to evaluate, so the audit can never
1073/// fail a fit that the optimizer accepted.
1074pub(crate) fn audit_first_order_optimality(
1075    obj: &mut dyn OuterObjective,
1076    config: &OuterConfig,
1077    context: &str,
1078    result: &OuterResult,
1079) -> Option<CriterionCertificate> {
1080    let gradient = result.final_gradient.as_ref()?;
1081    if gradient.is_empty()
1082        || gradient.len() != result.rho.len()
1083        || gradient.iter().any(|g| !g.is_finite())
1084        || result.rho.iter().any(|r| !r.is_finite())
1085    {
1086        return None;
1087    }
1088
1089    let theta = &result.rho;
1090    let rho_dim = obj.capability().theta_layout().rho_dim();
1091    let railed = certificate_railed_lambdas(theta, rho_dim, config);
1092
1093    // The full-space audit direction is unit-norm over all θ coordinates.
1094    let full_direction = certificate_audit_direction(theta, context);
1095    // At an active box bound the constrained first-order optimality condition
1096    // is KKT: ∇F·e_k need NOT vanish along a railed coordinate k (the bound
1097    // multiplier balances it), AND the central FD steps ρ_k across the bound
1098    // into the infeasible/clamped region, corrupting the value path. An
1099    // unconstrained FD-vs-analytic directional check that spans a railed
1100    // coordinate is therefore ill-posed and produces a spurious disagreement
1101    // (the railed-coordinate audit artifact). Restrict the comparison to the
1102    // free (non-railed, box-interior) subspace: zero the railed components of
1103    // the audit direction and re-normalize. When nothing is railed this is the
1104    // exact original unit direction (byte-identical), so the interior desync
1105    // check the certificate exists for (#748/#752/#808) is unchanged.
1106    let direction = if railed.is_empty() {
1107        full_direction
1108    } else {
1109        let mut projected = full_direction;
1110        for &k in &railed {
1111            if k < projected.len() {
1112                projected[k] = 0.0;
1113            }
1114        }
1115        let norm = projected.dot(&projected).sqrt();
1116        if norm.is_finite() && norm > f64::EPSILON {
1117            projected.mapv_inplace(|v| v / norm);
1118            projected
1119        } else {
1120            // Every audited coordinate is railed (free subspace empty): there
1121            // is no interior direction to audit, so the directional check is
1122            // vacuous. Skip the certificate rather than divide by ~0.
1123            log::debug!(
1124                "[CERTIFICATE] {context}: every audited coordinate railed \
1125                 (railed={railed:?}); no free direction to audit, certificate skipped"
1126            );
1127            return None;
1128        }
1129    };
1130    // Central-difference step on the optimal ε^(1/3) scale, sized to the
1131    // iterate so saturated ρ (|ρ| up to rho_bound) keeps θ̂±2hv resolvable.
1132    let theta_scale = theta.iter().fold(0.0_f64, |acc, &v| acc.max(v.abs()));
1133    let step = f64::EPSILON.cbrt() * (1.0 + theta_scale);
1134
1135    let mut probe = |scale: f64| -> Option<f64> {
1136        let point = theta + &(scale * &direction);
1137        match obj.eval_cost(&point) {
1138            Ok(value) if value.is_finite() => Some(value),
1139            Ok(value) => {
1140                log::debug!(
1141                    "[CERTIFICATE] {context}: audit probe at θ̂{scale:+.3e}·v returned \
1142                     non-finite criterion value {value}; certificate skipped"
1143                );
1144                None
1145            }
1146            Err(err) => {
1147                log::debug!(
1148                    "[CERTIFICATE] {context}: audit probe at θ̂{scale:+.3e}·v failed ({err}); \
1149                     certificate skipped"
1150                );
1151                None
1152            }
1153        }
1154    };
1155    let f_plus_h = probe(step)?;
1156    let f_minus_h = probe(-step)?;
1157    let f_plus_2h = probe(2.0 * step)?;
1158    let f_minus_2h = probe(-2.0 * step)?;
1159
1160    let d_h = (f_plus_h - f_minus_h) / (2.0 * step);
1161    let d_2h = (f_plus_2h - f_minus_2h) / (4.0 * step);
1162    // FD-OK: FD-audit certificate construction (Richardson FD oracle auditing the analytic gradient, never feeds the optimizer)
1163    let fd_directional = (4.0 * d_h - d_2h) / 3.0; // fd-ok: FD-audit certificate, not in math path
1164    // Error bar: the Richardson residual measures truncation + value-path
1165    // noise (inner-solve tolerance) empirically; the roundoff bound floors
1166    // it when the residual is accidentally tiny.
1167    let value_scale = f_plus_h
1168        .abs()
1169        .max(f_minus_h.abs())
1170        .max(f_plus_2h.abs())
1171        .max(f_minus_2h.abs());
1172    let roundoff = f64::EPSILON * (1.0 + value_scale) / step;
1173    let fd_error = (d_h - d_2h).abs().max(roundoff); // fd-ok: FD-audit certificate, not in math path
1174
1175    let analytic_directional = gradient.dot(&direction);
1176    let grad_norm = gradient.dot(gradient).sqrt();
1177    let agreement_z = (analytic_directional - fd_directional).abs() / fd_error; // fd-ok: FD-audit certificate, not in math path
1178
1179    let certificate = CriterionCertificate {
1180        grad_norm,
1181        analytic_directional,
1182        fd_directional, // fd-ok: FD-audit certificate, not in math path
1183        fd_error,       // fd-ok: FD-audit certificate, not in math path
1184        agreement_z,
1185        fd_step: step, // fd-ok: FD-audit certificate, not in math path
1186        // END-FD-OK
1187        hessian_pd: result
1188            .final_hessian
1189            .as_ref()
1190            .and_then(certificate_hessian_is_pd),
1191        lambdas_railed: railed,
1192    };
1193    if certificate.is_clean() {
1194        log::info!("[CERTIFICATE] {context}: {}", certificate.summary());
1195    } else {
1196        log::warn!(
1197            "[CERTIFICATE warning] {context}: optimality self-audit flagged the returned \
1198             optimum — {}",
1199            certificate.summary(),
1200        );
1201    }
1202    Some(certificate)
1203}
1204
1205pub(crate) fn compute_rho_uncertainty_diagnostic(
1206    obj: &mut dyn OuterObjective,
1207    config: &OuterConfig,
1208    context: &str,
1209    result: &mut OuterResult,
1210) -> crate::rho_uncertainty::RhoUncertaintyDiagnostic {
1211    let cap = obj.capability();
1212    let layout = cap.theta_layout();
1213    let rho_dim = layout.rho_dim();
1214    let gate = crate::rho_uncertainty::RhoUncertaintyCostGate {
1215        sample_count: 32,
1216        problem_size: config.rho_uncertainty_problem_size,
1217    };
1218    if let Err(reason) = crate::rho_uncertainty::cost_gate_allows(rho_dim, gate) {
1219        return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(reason, 0);
1220    }
1221    if result.rho.len() != layout.n_params {
1222        return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(
1223            format!(
1224                "final outer point length {} does not match objective dimension {}",
1225                result.rho.len(),
1226                layout.n_params
1227            ),
1228            0,
1229        );
1230    }
1231
1232    let final_eval = match obj.eval_with_order(&result.rho, OuterEvalOrder::ValueGradientHessian) {
1233        Ok(eval) => eval,
1234        Err(err) => {
1235            return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(
1236                format!("final exact Hessian evaluation failed: {err}"),
1237                1,
1238            );
1239        }
1240    };
1241    let hessian = match final_eval.hessian.materialize_dense() {
1242        Ok(Some(hessian)) => hessian,
1243        Ok(None) => {
1244            return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(
1245                "exact outer Hessian unavailable at fitted rho",
1246                1,
1247            );
1248        }
1249        Err(message) => {
1250            return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(
1251                format!("exact outer Hessian materialization failed: {message}"),
1252                1,
1253            );
1254        }
1255    };
1256    if hessian.nrows() != layout.n_params || hessian.ncols() != layout.n_params {
1257        return crate::rho_uncertainty::RhoUncertaintyDiagnostic::skipped(
1258            format!(
1259                "exact outer Hessian shape {}x{} does not match objective dimension {}",
1260                hessian.nrows(),
1261                hessian.ncols(),
1262                layout.n_params
1263            ),
1264            1,
1265        );
1266    }
1267    // Persist the exact outer curvature at θ̂ when the solver did not already
1268    // track one. A gradient-based BFGS solve keeps its inverse-Hessian
1269    // internally and `opt` does not surface it, so `result.final_hessian` is
1270    // `None` on the BFGS path — yet the exact analytic `H(θ̂)` was just
1271    // materialized here for the rho-uncertainty diagnostic and is otherwise
1272    // discarded. Stashing it lets the persistent-cache finalize write carry the
1273    // converged curvature, so the NEXT structurally-matching fit (e.g. the next
1274    // LOSO fold, whose θ̂ and curvature are nearly identical) can seed BFGS with
1275    // `InitialMetric::DenseInverseHessian` and take a quasi-Newton first step
1276    // instead of rediscovering curvature through line-search bracketing. This
1277    // never changes a converged optimum (BFGS converges to ∇V=0 under any SPD
1278    // initial metric); it only reshapes the starting line-search path. Guarded
1279    // on finiteness and on the solver not already owning a Hessian, so the
1280    // exact-Newton / ARC paths (which DO populate `final_hessian`) are untouched.
1281    if result.final_hessian.is_none() && hessian.iter().all(|v| v.is_finite()) {
1282        result.final_hessian = Some(hessian.clone());
1283    }
1284    let mut hessian_rho = Array2::<f64>::zeros((rho_dim, rho_dim));
1285    for row in 0..rho_dim {
1286        for col in 0..rho_dim {
1287            hessian_rho[[row, col]] = hessian[[row, col]];
1288        }
1289    }
1290    let rho_hat = result.rho.slice(ndarray::s![..rho_dim]).to_owned();
1291    let theta_hat = result.rho.clone();
1292    let cost_hat = final_eval.cost;
1293    let final_beta_hint = final_eval.inner_beta_hint.clone();
1294    let diagnostic = {
1295        let mut served_hat_cost = false;
1296        let mut criterion = |rho: &Array1<f64>| -> Option<f64> {
1297            let is_hat = rho.len() == rho_hat.len()
1298                && rho
1299                    .iter()
1300                    .zip(rho_hat.iter())
1301                    .all(|(&left, &right)| left.to_bits() == right.to_bits());
1302            if is_hat && !served_hat_cost {
1303                served_hat_cost = true;
1304                return Some(cost_hat);
1305            }
1306            let mut theta = theta_hat.clone();
1307            for idx in 0..rho_dim {
1308                theta[idx] = rho[idx];
1309            }
1310            if let Some(beta) = final_beta_hint.as_ref()
1311                && obj.seed_inner_state(beta).is_err()
1312            {
1313                return None;
1314            }
1315            obj.eval_cost(&theta).ok()
1316        };
1317        crate::rho_uncertainty::rho_uncertainty_diagnostic(
1318            &rho_hat,
1319            &hessian_rho,
1320            gate,
1321            &mut criterion,
1322        )
1323    };
1324    if let Some(beta) = final_beta_hint.as_ref()
1325        && let Err(err) = obj.seed_inner_state(beta)
1326    {
1327        log::debug!(
1328            "[RHO uncertainty] {context}: final inner-state restore skipped after diagnostic ({err})"
1329        );
1330    }
1331    match &diagnostic.status {
1332        crate::rho_uncertainty::RhoUncertaintyStatus::NoEvidenceOfHeavyTails => {
1333            log::info!(
1334                "[RHO uncertainty] {context}: no heavy-tail evidence at sampled rho proposals k_hat={:.3} evals={}",
1335                diagnostic.k_hat.unwrap_or(f64::NAN),
1336                diagnostic.n_evaluations,
1337            );
1338        }
1339        crate::rho_uncertainty::RhoUncertaintyStatus::HeavyTailsDetected { k_hat } => {
1340            log::warn!(
1341                "[RHO uncertainty] {context}: heavy rho-importance tail detected k_hat={:.3} evals={}",
1342                k_hat,
1343                diagnostic.n_evaluations,
1344            );
1345        }
1346        crate::rho_uncertainty::RhoUncertaintyStatus::Skipped { reason } => {
1347            log::info!("[RHO uncertainty] {context}: skipped ({reason})");
1348        }
1349    }
1350    diagnostic
1351}
1352
1353#[derive(Clone, Copy, Debug, PartialEq, Eq)]
1354pub enum OperatorTrustRegionStopReason {
1355    Converged,
1356    RejectFloor,
1357    IterationBudget,
1358    CostStallFlatValley,
1359    /// Family returned a non-operator Hessian mid-flight after routing into
1360    /// the operator path. Best-effort `x_k` returned with this reason; the
1361    /// caller should consider re-fitting under a different solver class
1362    /// (e.g. BFGS gradient-only) instead of trusting the partial result.
1363    RoutingMismatch,
1364}
1365
1366/// Run the outer smoothing-parameter optimization.
1367///
1368/// This is the single entry point that replaces the scattered optimizer wiring
1369/// across estimate.rs, joint.rs, and custom_family.rs. It:
1370///
1371/// 1. Queries and canonicalizes the objective's capability declaration.
1372/// 2. Calls `plan()` to select solver + hessian source.
1373/// 3. Logs the plan and the analytic derivative capabilities it will consume.
1374/// 4. Generates seed candidates.
1375/// 5. Runs the chosen solver on candidates in heuristic order up to budget.
1376/// 6. If the configured fallback policy allows it, re-plans with degraded
1377///    capabilities chosen centrally inside outer_strategy and retries.
1378/// 7. Returns the best result (including which plan was actually used).
1379///
1380/// Do not wrap `run_outer` calls in try/catch with ad-hoc solver recovery.
1381/// Callers should declare only the primary capability and, at most, whether
1382/// automatic fallback is enabled at all.
1383pub(crate) fn run_outer(
1384    obj: &mut dyn OuterObjective,
1385    config: &OuterConfig,
1386    context: &str,
1387) -> Result<OuterResult, EstimationError> {
1388    // Permutation-invariant outer search (#1538/#1539). When the caller has
1389    // supplied per-coordinate structural keys that induce a non-identity
1390    // canonical order, run the ENTIRE outer pipeline (seeding, multistart,
1391    // optimization, and the #934 certificate / uncertainty audits) in that
1392    // canonical layout against a permuting wrapper, then map the result back to
1393    // the native layout. Seeding/tie-breaking then see byte-identical
1394    // coordinates for every term order, so both orders select the same λ̂.
1395    if let Some(keys) = config.rho_canonical_keys.as_ref()
1396        && let Some(perm) = canonical_permutation(keys)
1397    {
1398        let canonical_config = canonicalize_outer_config(config, &perm);
1399        let mut canonical_obj = CanonicalizedObjective::new(obj, perm.clone());
1400        let result = run_outer(&mut canonical_obj, &canonical_config, context)?;
1401        return Ok(outer_result_to_native(result, &perm));
1402    }
1403    let mut result = run_outer_uncertified(obj, config, context)?;
1404    if config.max_iter <= 1 {
1405        return Ok(result);
1406    }
1407    // First-order optimality self-audit (#934): once, at the returned θ̂,
1408    // outside all hot loops, for every entry point of the solver ladder
1409    // (dense, device, per-atom EFS, fallback plans). Probes evaluate the
1410    // value path at θ̂±hv AFTER the solve, so the only state they perturb
1411    // is warm-start residue O(h) from the optimum — every caller recovers
1412    // its fitted state from `result.rho`, not from last-eval residue.
1413    result.criterion_certificate = audit_first_order_optimality(obj, config, context, &result);
1414    result.rho_uncertainty_diagnostic = Some(compute_rho_uncertainty_diagnostic(
1415        obj,
1416        config,
1417        context,
1418        &mut result,
1419    ));
1420    Ok(result)
1421}
1422
1423/// Build a CANONICAL-order copy of an [`OuterConfig`] for the
1424/// permutation-invariant outer search (#1538/#1539).
1425///
1426/// `perm[c]` is the native coordinate at canonical slot `c`. Every
1427/// per-coordinate config field (initial ρ seed, heuristic-λ seed, per-axis
1428/// bounds, transferred warm Hessian) is reordered native→canonical so the
1429/// optimizer's seeding and multistart operate entirely in canonical space;
1430/// scalar fields are copied verbatim. `rho_canonical_keys` is cleared so the
1431/// recursive [`run_outer`] frame runs the normal (identity-order) pipeline on
1432/// the already-canonical objective.
1433fn canonicalize_outer_config(config: &OuterConfig, perm: &[usize]) -> OuterConfig {
1434    // Permute a per-coordinate slice native→canonical; pass through any length
1435    // that does not match the permutation (defensive — should not occur).
1436    let permute_vec = |v: &[f64]| -> Vec<f64> {
1437        if v.len() == perm.len() {
1438            perm.iter().map(|&i| v[i]).collect()
1439        } else {
1440            v.to_vec()
1441        }
1442    };
1443    let permute_arr = |a: &Array1<f64>| -> Array1<f64> {
1444        if a.len() == perm.len() {
1445            Array1::from_iter(perm.iter().map(|&i| a[i]))
1446        } else {
1447            a.clone()
1448        }
1449    };
1450    let mut canonical = config.clone();
1451    canonical.rho_canonical_keys = None;
1452    if let Some(initial) = config.initial_rho.as_ref() {
1453        canonical.initial_rho = Some(permute_arr(initial));
1454    }
1455    if let Some(h) = config.heuristic_lambdas.as_ref() {
1456        canonical.heuristic_lambdas = Some(permute_vec(h));
1457    }
1458    if let Some((lower, upper)) = config.bounds.as_ref() {
1459        canonical.bounds = Some((permute_arr(lower), permute_arr(upper)));
1460    }
1461    // A transferred dense outer Hessian is in native coordinate order; permute
1462    // it into canonical order so the BFGS warm metric stays aligned. (None on
1463    // the cold-start canonicalized path, so this is usually a no-op.)
1464    if let Some(h) = config.warm_start_outer_hessian.as_ref()
1465        && h.nrows() == perm.len()
1466        && h.ncols() == perm.len()
1467    {
1468        let mut hc = Array2::<f64>::zeros((perm.len(), perm.len()));
1469        for (a, &ia) in perm.iter().enumerate() {
1470            for (b, &ib) in perm.iter().enumerate() {
1471                hc[[a, b]] = h[[ia, ib]];
1472            }
1473        }
1474        canonical.warm_start_outer_hessian = Some(hc);
1475    }
1476    canonical
1477}
1478
1479/// The solver ladder behind [`run_outer`], without the #934 self-audit.
1480pub(crate) fn run_outer_uncertified(
1481    obj: &mut dyn OuterObjective,
1482    config: &OuterConfig,
1483    context: &str,
1484) -> Result<OuterResult, EstimationError> {
1485    let cap = primary_capability_for_config(obj.capability(), config, context);
1486    cap.validate_layout(context)?;
1487    if let Some(initial_rho) = config.initial_rho.as_ref() {
1488        cap.theta_layout()
1489            .validate_point_len(initial_rho, "initial outer seed")
1490            .map_err(|err| match err {
1491                ObjectiveEvalError::Recoverable { message }
1492                | ObjectiveEvalError::Fatal { message } => {
1493                    EstimationError::RemlOptimizationFailed(format!("{context}: {message}"))
1494                }
1495            })?;
1496    }
1497    crate::estimate::reml::outer_eval::clear_outer_ift_residual_energy_for_fit();
1498
1499    // Frontier ρ-scaling auto-switch (#986): at per-atom-EFS-eligible frontier
1500    // rho dimension the decoupled per-atom fixed point is the primary outer
1501    // iteration; everything else falls through to the dense / standard path
1502    // below. Routed here so every entry point inherits it (magic by default).
1503    if let Some(result) = run_per_atom_efs_if_frontier(obj, config, context)? {
1504        return Ok(result);
1505    }
1506
1507    if cap.n_params == 0 {
1508        let cost = obj.eval_cost(&Array1::zeros(0))?;
1509        let the_plan = plan(&cap);
1510        return Ok(outer_result_with_gradient_norm(
1511            Array1::zeros(0),
1512            cost,
1513            0,
1514            Some(0.0),
1515            true,
1516            the_plan,
1517        ));
1518    }
1519
1520    // Build the ordered list of capabilities to attempt: primary first, then
1521    // any centrally-derived degraded capabilities. Aux direct-search has no
1522    // degraded ladder — a single attempt either succeeds or the failure is
1523    // surfaced to the caller.
1524    let fallback_attempts = match config.fallback_policy {
1525        FallbackPolicy::Automatic => automatic_fallback_attempts(&cap),
1526        FallbackPolicy::Disabled => Vec::new(),
1527    };
1528    let mut attempts: Vec<OuterCapability> = Vec::with_capacity(1 + fallback_attempts.len());
1529    attempts.push(cap.clone());
1530    for degraded in fallback_attempts {
1531        attempts.push(degraded);
1532    }
1533
1534    let mut last_error: Option<EstimationError> = None;
1535
1536    for (attempt_idx, attempt_cap) in attempts.iter().enumerate() {
1537        let the_plan = plan(attempt_cap);
1538        if attempt_idx > 0 {
1539            log::debug!("[OUTER] {context}: primary plan failed; falling back to {the_plan}");
1540        }
1541        log_plan(context, attempt_cap, &the_plan);
1542
1543        obj.reset();
1544
1545        // ARC budget-exhaustion retry: when an Arc attempt runs out of
1546        // outer iterations, reseed a fresh Arc run from the previous
1547        // attempt's last ρ and trust radius. Inner caches (PIRLS LRU,
1548        // eval bundle, warm-start predictor, adaptive signals) are wiped
1549        // by `obj.reset()`; the operator-TR's Cauchy/Newton/CG state has
1550        // no resume API and is not preserved. The lever that changes for
1551        // the resumed run is the inner-PIRLS cap (uncapped via the
1552        // feedback handle), not `max_iter` — empirically the prior stall
1553        // was an inner-tolerance / model-fidelity issue, not an outer
1554        // budget shortfall, and doubling `max_iter` only replays the
1555        // same trajectory byte-for-byte. The retry is gated on observed
1556        // `‖g‖` progress so trajectories that made no headway fall
1557        // through to the degraded plan instead of replaying.
1558        let mut arc_retries_left: u32 = if matches!(the_plan.solver, Solver::Arc) {
1559            2
1560        } else {
1561            0
1562        };
1563        let mut retry_config: Option<OuterConfig> = None;
1564        // Tracks the previous ARC attempt's terminal `‖g‖`. The retry
1565        // gate compares attempt-over-attempt: if a retry didn't move
1566        // the gradient norm, the trajectory replayed (same seed, same
1567        // trust radius, cold caches, deterministic optimizer) and
1568        // further retries cannot help. First retry is unconditional
1569        // (no prior attempt to compare against).
1570        let mut prev_attempt_grad_norm: Option<f64> = None;
1571
1572        let outcome = loop {
1573            // Bind the active config by cloning into a local owned value so
1574            // subsequent retry-config assignment does not collide with the
1575            // borrow used inside this iteration body.
1576            let active_config_owned: OuterConfig =
1577                retry_config.clone().unwrap_or_else(|| config.clone());
1578            let active_config: &OuterConfig = &active_config_owned;
1579            match run_outer_with_plan(obj, active_config, context, attempt_cap, &the_plan) {
1580                Ok(result) => {
1581                    if result.converged
1582                        || arc_retries_left == 0
1583                        || matches!(
1584                            result.operator_stop_reason,
1585                            Some(OperatorTrustRegionStopReason::RejectFloor)
1586                        )
1587                    {
1588                        break Ok(result);
1589                    }
1590                    // Gate the retry on attempt-over-attempt `‖g‖`
1591                    // progress. The first retry is unconditional (no
1592                    // prior attempt). Subsequent retries fall through
1593                    // to the degraded plan when the gradient norm did
1594                    // not materially shrink — the deterministic
1595                    // optimizer with the same seed and trust radius
1596                    // would replay the same trajectory.
1597                    let Some(cur_grad_norm) = result.final_grad_norm else {
1598                        log::info!(
1599                            "[OUTER] {context}: ARC attempt exhausted budget at \
1600                             iter={} cost={:.6e} without a final gradient norm; \
1601                             falling through to degraded plan",
1602                            result.iterations,
1603                            result.final_value,
1604                        );
1605                        break Ok(result);
1606                    };
1607                    if let Some(prev_g) = prev_attempt_grad_norm {
1608                        let progressed = cur_grad_norm.is_finite()
1609                            && prev_g.is_finite()
1610                            && cur_grad_norm < 0.5 * prev_g;
1611                        if !progressed {
1612                            log::info!(
1613                                "[OUTER] {context}: ARC retry stalled at \
1614                                 iter={} cost={:.6e} |g|={:.6e} (prev |g|={:.6e}); \
1615                                 deterministic replay suspected, falling through \
1616                                 to degraded plan",
1617                                result.iterations,
1618                                result.final_value,
1619                                cur_grad_norm,
1620                                prev_g,
1621                            );
1622                            break Ok(result);
1623                        }
1624                    }
1625                    let next_trust_radius =
1626                        sanitized_operator_trust_restart_radius(result.operator_trust_radius);
1627                    log::info!(
1628                        "[OUTER] {context}: ARC attempt exhausted budget at \
1629                         iter={} cost={:.6e} |g|={:.6e}; resuming from last \
1630                         rho + trust_radius={:?}, inner-PIRLS uncapped \
1631                         (objective caches wiped; operator-TR Cauchy/Newton \
1632                         state is not resumable)",
1633                        result.iterations,
1634                        result.final_value,
1635                        cur_grad_norm,
1636                        next_trust_radius,
1637                    );
1638                    // Snapshot the cap-feedback handle before we
1639                    // reassign `retry_config` (which currently backs
1640                    // `active_config`'s borrow). `InnerProgressFeedback`
1641                    // is an Arc-wrapper bundle, so the clone is cheap.
1642                    let cap_feedback = active_config.outer_inner_cap.clone();
1643                    let mut next = active_config.clone();
1644                    prev_attempt_grad_norm = Some(cur_grad_norm);
1645                    next.initial_rho = Some(result.rho.clone());
1646                    next.operator_initial_trust_radius = next_trust_radius;
1647                    retry_config = Some(next);
1648                    arc_retries_left -= 1;
1649                    obj.reset();
1650                    // Lift any inner-PIRLS cap for the resumed run. The
1651                    // schedule's cold-start ladder (3/5/10) would
1652                    // re-coarsen exactly the inner solves whose tolerance
1653                    // is suspected to have starved the prior trajectory.
1654                    // The next outer iter consumes ρ near a near-stationary
1655                    // point where exact β / gradient / Hessian is the
1656                    // load-bearing input to the operator-TR geometry.
1657                    if let Some(feedback) = cap_feedback.as_ref() {
1658                        feedback.cap.store(0, Ordering::Relaxed);
1659                    }
1660                }
1661                Err(e) => break Err(e),
1662            }
1663        };
1664
1665        match outcome {
1666            Ok(result) => {
1667                if result.converged || attempt_idx + 1 == attempts.len() {
1668                    if !result.converged {
1669                        log::warn!(
1670                            "[OUTER warning] {context}: final outer attempt returned without convergence \
1671                             (plan={the_plan}, iterations={}, final_value={:.6e}, |g|={})",
1672                            result.iterations,
1673                            result.final_value,
1674                            result.final_grad_norm_report(),
1675                        );
1676                    }
1677                    return Ok(result);
1678                }
1679
1680                let message = format!(
1681                    "{context}: attempt {} (plan={the_plan}) exhausted without convergence",
1682                    attempt_idx + 1
1683                );
1684                log::debug!("[OUTER] {message}; trying degraded fallback plan");
1685                last_error = Some(EstimationError::RemlOptimizationFailed(message));
1686            }
1687            Err(e) => {
1688                log::debug!(
1689                    "[OUTER] {context}: attempt {} (plan={the_plan}) failed: {e}",
1690                    attempt_idx + 1
1691                );
1692                last_error = Some(e);
1693            }
1694        }
1695    }
1696
1697    Err(last_error.unwrap_or_else(|| {
1698        EstimationError::RemlOptimizationFailed(format!("all plan attempts exhausted ({context})"))
1699    }))
1700}
1701
1702// ─── Frontier ρ-scaling auto-switch (issue #986) ─────────────────────────
1703//
1704// ARD-per-atom assigns one smoothing coordinate per dictionary atom, so the
1705// ρ-vector reaches 10^4–10^5 coordinates. A dense outer quasi-Newton over that
1706// materializes an O(K²) Hessian and is impossible at scale. When the ρ-dimension
1707// is frontier-scale AND every coordinate is penalty-like with a working
1708// fixed-point hook, route the PRIMARY outer iteration to the per-atom decoupled
1709// EFS path (`crate::estimate::reml::per_atom_efs`) instead of the dense
1710// ARC/BFGS lane. The decision is auto-derived from the coordinate count alone —
1711// there is no flag — and it is additive: the dense path is unchanged for small K
1712// and for any objective that is not per-atom-EFS-eligible.
1713
1714/// Whether this capability is in the frontier ρ-scaling regime where the
1715/// per-atom decoupled EFS primary should take over from the dense outer.
1716///
1717/// Delegates the eligibility decision to
1718/// [`crate::estimate::reml::per_atom_efs::per_atom_efs_eligible`], which
1719/// requires all-penalty-like coordinates, a working `eval_efs` hook,
1720/// fixed-point not disabled, and a frontier-scale ρ-dimension. This is the
1721/// single auto-switch predicate; `plan` keeps selecting the
1722/// dense or standard-EFS solver for everything below the frontier threshold.
1723pub fn is_per_atom_efs_frontier(cap: &OuterCapability) -> bool {
1724    crate::estimate::reml::per_atom_efs::per_atom_efs_eligible(cap)
1725}
1726
1727/// Auto-switch entry point: when `cap` is frontier-scale per-atom-EFS-eligible,
1728/// run the per-atom decoupled EFS primary and return its [`OuterResult`];
1729/// otherwise return `Ok(None)` so the caller falls through to the existing dense
1730/// / standard-EFS path via [`OuterProblem::run`] / [`run_outer`].
1731///
1732/// Builds the same bounded seed and tolerance/budget the standard plan path
1733/// uses, picks the seed (initial-ρ if supplied, else the first generated
1734/// candidate — the per-atom fixed point is a contraction near the optimum and
1735/// does not need the multi-seed cascade the dense path runs for its non-convex
1736/// quasi-Newton surface), then drives the per-atom EFS loop. The shared-border
1737/// topology defaults to disjoint (every atom owns a private penalty block — the
1738/// common ARD-per-atom case); callers with a known arrow-border overlap can run
1739/// the module's `run_per_atom_efs` directly with a populated
1740/// `SharedBorderTopology`.
1741///
1742/// Additive: this function neither mutates nor bypasses the dense path; it is
1743/// the pre-dispatch shortcut [`run_outer`] calls before the dense ladder.
1744pub(crate) fn run_per_atom_efs_if_frontier(
1745    obj: &mut dyn OuterObjective,
1746    config: &OuterConfig,
1747    context: &str,
1748) -> Result<Option<OuterResult>, EstimationError> {
1749    let cap = primary_capability_for_config(obj.capability(), config, context);
1750    cap.validate_layout(context)?;
1751    if !is_per_atom_efs_frontier(&cap) {
1752        return Ok(None);
1753    }
1754
1755    let the_plan = plan(&cap);
1756    let rho_dim = cap.theta_layout().rho_dim();
1757
1758    let (lower, upper) = outer_bounds_template(config, cap.n_params);
1759
1760    // Seed: cache/explicit initial ρ if present, otherwise the first generated
1761    // candidate. The per-atom multiplicative fixed point is locally
1762    // contractive, so a single seed suffices; the heavy multi-seed cascade
1763    // exists for the dense quasi-Newton's non-convex surface, not for EFS.
1764    let seed = match config.initial_rho.as_ref() {
1765        Some(initial) if initial.len() == cap.n_params => initial.clone(),
1766        _ => {
1767            let generated = crate::seeding::generate_rho_candidates(
1768                cap.n_params,
1769                config.heuristic_lambdas.as_deref(),
1770                &config.seed_config,
1771            );
1772            match generated.into_iter().next() {
1773                Some(first) => first,
1774                None => Array1::<f64>::zeros(cap.n_params),
1775            }
1776        }
1777    };
1778
1779    log::info!(
1780        "[OUTER] {context}: frontier ρ-scaling (rho_dim={rho_dim}) → per-atom decoupled EFS primary"
1781    );
1782
1783    let pa_cfg = crate::estimate::reml::per_atom_efs::PerAtomEfsConfig::new(
1784        config.tolerance,
1785        config.max_iter,
1786        lower,
1787        upper,
1788    );
1789    let topology =
1790        crate::estimate::reml::per_atom_efs::SharedBorderTopology::disjoint(rho_dim);
1791
1792    obj.reset();
1793    let result = crate::estimate::reml::per_atom_efs::run_per_atom_efs(
1794        obj, &seed, &pa_cfg, &topology,
1795    )?;
1796    Ok(Some(result.into_outer_result(the_plan)))
1797}
1798
1799pub(crate) fn outer_bounds(lo: &Array1<f64>, hi: &Array1<f64>) -> Result<Bounds, EstimationError> {
1800    Bounds::new(lo.clone(), hi.clone(), 1e-6).map_err(|err| {
1801        EstimationError::InvalidInput(format!("outer rho bounds are invalid: {err}"))
1802    })
1803}
1804
1805pub(crate) fn outer_bounds_template(config: &OuterConfig, n: usize) -> (Array1<f64>, Array1<f64>) {
1806    config.bounds.clone().unwrap_or_else(|| {
1807        (
1808            Array1::<f64>::from_elem(n, -config.rho_bound),
1809            Array1::<f64>::from_elem(n, config.rho_bound),
1810        )
1811    })
1812}
1813
1814pub(crate) fn outer_tolerance(value: f64) -> Result<Tolerance, EstimationError> {
1815    Tolerance::new(value)
1816        .map_err(|err| EstimationError::InvalidInput(format!("outer tolerance is invalid: {err}")))
1817}
1818
1819pub(crate) fn outer_gradient_tolerance(config: &OuterConfig) -> GradientTolerance {
1820    let abs = config
1821        .objective_scale
1822        .map(|scale| config.tolerance.max(scale * 1.0e-9))
1823        .unwrap_or(config.tolerance);
1824    GradientTolerance {
1825        abs,
1826        rel_initial_grad: None,
1827        rel_cost: Some(config.rel_cost_tolerance.unwrap_or(config.tolerance)),
1828        projected: true,
1829    }
1830}
1831
1832pub(crate) fn outer_max_iterations(value: usize) -> Result<MaxIterations, EstimationError> {
1833    MaxIterations::new(value)
1834        .map_err(|err| EstimationError::InvalidInput(format!("outer max_iter is invalid: {err}")))
1835}
1836
1837pub(crate) fn sanitized_operator_trust_restart_radius(radius: Option<f64>) -> Option<f64> {
1838    radius
1839        .filter(|value| value.is_finite() && *value > 0.0)
1840        .map(|value| value.max(OPERATOR_TRUST_RESTART_RADIUS_FLOOR))
1841}
1842
1843pub(crate) fn bfgs_axis_step_caps(
1844    config: &OuterConfig,
1845    layout: OuterThetaLayout,
1846) -> Option<Array1<f64>> {
1847    if config.bfgs_step_cap.is_none() && config.bfgs_step_cap_psi.is_none() {
1848        return None;
1849    }
1850    let mut caps = Array1::from_elem(layout.n_params, f64::INFINITY);
1851    if let Some(cap) = config.bfgs_step_cap {
1852        for i in 0..layout.rho_dim() {
1853            caps[i] = cap;
1854        }
1855    }
1856    if let Some(cap) = config.bfgs_step_cap_psi {
1857        for i in layout.rho_dim()..layout.n_params {
1858            caps[i] = cap;
1859        }
1860    }
1861    Some(caps)
1862}
1863
1864pub(crate) enum FixedPointOuterRunError {
1865    SeedRejected(EstimationError),
1866    ImmediateFallback(EstimationError),
1867    Failed(EstimationError),
1868}
1869
1870pub(crate) fn run_fixed_point_outer_solver(
1871    obj: &mut dyn OuterObjective,
1872    layout: OuterThetaLayout,
1873    barrier_config: Option<BarrierConfig>,
1874    config: &OuterConfig,
1875    context: &str,
1876    seed: &Array1<f64>,
1877    the_plan: OuterPlan,
1878    label: &str,
1879    failure_prefix: &str,
1880) -> Result<OuterResult, FixedPointOuterRunError> {
1881    let mut objective = OuterFixedPointBridge {
1882        obj,
1883        layout,
1884        barrier_config,
1885        fixed_point_tolerance: config.tolerance,
1886        consecutive_psi_zero_iters: 0,
1887    };
1888    match objective.eval_step(seed) {
1889        Ok(_) => {}
1890        Err(err) => {
1891            let err = match err {
1892                ObjectiveEvalError::Recoverable { message }
1893                | ObjectiveEvalError::Fatal { message } => {
1894                    EstimationError::RemlOptimizationFailed(message)
1895                }
1896            };
1897            if requests_immediate_first_order_fallback(&err.to_string()) {
1898                return Err(FixedPointOuterRunError::ImmediateFallback(err));
1899            }
1900            return Err(FixedPointOuterRunError::SeedRejected(err));
1901        }
1902    };
1903    let (lo, hi) = outer_bounds_template(config, layout.n_params);
1904    let bounds = outer_bounds(&lo, &hi).map_err(FixedPointOuterRunError::Failed)?;
1905    let tol = outer_tolerance(config.tolerance).map_err(FixedPointOuterRunError::Failed)?;
1906    let max_iter =
1907        outer_max_iterations(config.max_iter).map_err(FixedPointOuterRunError::Failed)?;
1908    let mut optimizer = FixedPoint::new(seed.clone(), objective)
1909        .with_bounds(bounds)
1910        .with_tolerance(tol)
1911        .with_max_iterations(max_iter);
1912    match optimizer.run() {
1913        Ok(sol) => Ok(solution_into_outer_result(sol, true, the_plan)),
1914        Err(FixedPointError::MaxIterationsReached { last_solution }) => {
1915            log::warn!(
1916                "[OUTER warning] {context}: {label} hit max_iter={} at final_value={:.6e} step_norm={:.3e}",
1917                config.max_iter,
1918                last_solution.final_value,
1919                last_solution.final_gradient_norm.unwrap_or(f64::NAN),
1920            );
1921            Ok(solution_into_outer_result(*last_solution, false, the_plan))
1922        }
1923        Err(e) => Err(FixedPointOuterRunError::Failed(
1924            EstimationError::RemlOptimizationFailed(format!("{failure_prefix}: {e:?}")),
1925        )),
1926    }
1927}