cobre-sddp 0.8.2

Stochastic Dual Dynamic Programming (SDDP) for hydrothermal dispatch and energy planning
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
//! Per-trial-point opening-solve dispatch and the deterministic trial-point kernel.
//!
//! [`StageOpeningSolver`] is the closed two-variant per-opening solve strategy
//! (baked all-cuts vs lazy resident-set DCS), and `process_trial_point_backward`
//! is the deterministic hot-path kernel that drives it: it solves a trial point's
//! openings in a run-constant, rank-invariant `solve_order` permutation but writes
//! and aggregates outcomes by canonical ω, so the generated cut is bit-identical
//! regardless of solve order. The two co-locate because the kernel calls
//! `opening_solver.solve_opening(...)` and [`StageOpeningSolver::prepare`]; both
//! are hot-path behaviour-sensitive and move verbatim.

use cobre_solver::SolverInterface;

use crate::{
    SddpError,
    context::{StageContext, TrainingContext},
    dcs::{DcsParams, DcsSolveContext, build_initial_resident_set, lazy_solve_preloaded},
    risk_measure::RiskMeasure,
    state_exchange::ExchangeBuffers,
    workspace::{BasisStoreSliceMut, SolverWorkspace},
};

use super::{
    StagedCut, SuccessorSpec,
    duals_extraction::{extract_duals_from_view, extract_state_duals_only},
    lp_setup::{load_backward_lp, patch_opening_bounds, resolve_backward_basis},
    outcome_aggregation::{
        accumulate_dcs_binding_counts, accumulate_opening_outcome, save_basis_at_omega_zero,
        write_opening_outcome,
    },
};

/// Per-trial-point opening-solve strategy for the backward pass.
///
/// Constructed once per trial point from the Dynamic Cut Selection (DCS)
/// activation decision and dispatches each opening's solve through one uniform
/// call site ([`StageOpeningSolver::solve_opening`]), with a per-trial-point LP
/// load step ([`StageOpeningSolver::prepare`]) issued once before the openings
/// run. This is a closed two-variant sum type dispatched by `match` — never a
/// trait object (CLAUDE.md closed-variant-set rule), mirroring the
/// [`crate::cut_selection::CutSelectionStrategy`] / [`DcsParams::from_strategy`]
/// pattern already used in the crate.
///
/// The two variants are the two pre-existing, drifted per-opening solve paths:
///
/// - [`StageOpeningSolver::Baked`]: the baked all-cuts LP. Cross-iteration warm
///   basis on the first-solved opening, full state+cut dual extraction,
///   baked-order slot bumps, and first-solved basis capture.
/// - [`StageOpeningSolver::Lazy`]: the resident-set LP (DCS). A cut-free core is
///   loaded once per trial point and reused across the openings; the first-solved
///   opening solves fresh, the rest warm-carry; state-dual-only extraction;
///   row-map-correct slot bumps; no basis capture.
pub(crate) enum StageOpeningSolver {
    /// Baked all-cuts LP path (no DCS). See the type-level docs for the
    /// five behavioral axes.
    Baked,
    /// Lazy resident-set LP path (Dynamic Cut Selection). Carries the active
    /// [`DcsParams`] so the per-opening call needs no separate `Option` test.
    Lazy(DcsParams),
}

impl StageOpeningSolver {
    /// Choose the per-trial-point strategy from the already-`is_active`-filtered
    /// `dcs_params`: `Some(params)` selects [`StageOpeningSolver::Lazy`], `None`
    /// selects [`StageOpeningSolver::Baked`]. The decision is constant across a
    /// stage's trial points and openings for a given iteration.
    pub(crate) fn from_dcs_params(dcs_params: Option<DcsParams>) -> Self {
        match dcs_params {
            Some(params) => StageOpeningSolver::Lazy(params),
            None => StageOpeningSolver::Baked,
        }
    }

    /// Per-trial-point LP load. Issued once by the driver after
    /// `reset_solver_state()` and before any opening solve, replacing the former
    /// `dcs_active`-gated load skips: the driver no longer asks "is DCS active,
    /// so should I skip my normal load?" — both variants own their own load.
    ///
    /// - [`StageOpeningSolver::Baked`]: load the baked all-cuts LP via
    ///   [`load_backward_lp`] (the former per-trial-point load that paired with
    ///   the per-opening solver-state reset).
    /// - [`StageOpeningSolver::Lazy`]: load the cut-free core and build the
    ///   metadata seed ONCE here, then reuse the loaded LP across this trial
    ///   point's openings — the first-solved opening reuses the loaded cut-free
    ///   core with the appended seed and solves with no carried per-opening basis
    ///   (not a cold SDDP solve, it merely lacks a warm per-opening basis to
    ///   carry); the rest warm-carry. The core load also serves as the
    ///   per-trial-point `HiGHS` reset that preserves rank-invariance: state is
    ///   reset at every trial-point boundary and never carried across trial
    ///   points.
    pub(crate) fn prepare<S: SolverInterface + Send>(
        &self,
        ws: &mut SolverWorkspace<S>,
        ctx: &StageContext<'_>,
        succ: &SuccessorSpec<'_>,
        iteration: u64,
    ) {
        match self {
            StageOpeningSolver::Baked => {
                load_backward_lp(ws, succ);
            }
            StageOpeningSolver::Lazy(params) => {
                ws.solver.load_model(&ctx.templates[succ.successor]);
                build_initial_resident_set(
                    succ.successor_pool,
                    iteration,
                    params.k2,
                    &mut ws.backward_accum.dcs_initial_resident,
                );
            }
        }
    }

    /// Solve one backward opening and accumulate its outcome, dispatching to the
    /// active variant's path. The variant arms are the two pre-existing
    /// per-opening solve functions, re-homed verbatim.
    ///
    /// `is_first` is `true` for the trial point's **first-solved** opening — ω=0
    /// in canonical order, or the first entry of the solve order when reordering
    /// is enabled. The baked path uses it to load the per-(m, s) stored basis and
    /// capture the post-solve basis back into the store (only on the first-solved
    /// opening; the rest warm re-solve on the retained LU). The lazy path uses
    /// `continue_carry == !is_first`: the first-solved opening starts with no
    /// carried per-opening basis — it reuses the loaded cut-free core with the
    /// appended seed (not a cold SDDP solve, it merely lacks a warm per-opening
    /// basis to carry) — while the rest warm-carry the LP. Decoupling basis
    /// identity from the literal ω=0 is
    /// what lets the openings be solved in any order while the per-(m, s) basis
    /// store stays consistent with the actual first solve.
    // Rationale: the args are disjoint borrows (ws, ctx, training_ctx, succ,
    // basis_slice) and per-opening scalars (raw_noise, x_hat, s, scenario,
    // iteration, m, omega, is_first); no natural grouping reduces caller-side
    // borrows.
    #[allow(clippy::too_many_arguments)]
    fn solve_opening<S: SolverInterface + Send>(
        &self,
        ws: &mut SolverWorkspace<S>,
        ctx: &StageContext<'_>,
        training_ctx: &TrainingContext<'_>,
        succ: &SuccessorSpec<'_>,
        basis_slice: &mut BasisStoreSliceMut<'_>,
        raw_noise: &[f64],
        x_hat: &[f64],
        s: usize,
        scenario: usize,
        iteration: u64,
        m: usize,
        omega: usize,
        is_first: bool,
    ) -> Result<(), SddpError> {
        match self {
            StageOpeningSolver::Baked => Self::solve_baked(
                ws,
                ctx,
                training_ctx,
                succ,
                basis_slice,
                raw_noise,
                x_hat,
                s,
                scenario,
                iteration,
                m,
                omega,
                is_first,
            ),
            StageOpeningSolver::Lazy(params) => Self::solve_lazy(
                ws,
                ctx,
                training_ctx,
                succ,
                *params,
                raw_noise,
                x_hat,
                s,
                scenario,
                iteration,
                omega,
                // The first-solved opening starts with no carried per-opening
                // basis (it reuses the loaded cut-free core with the appended
                // seed — not a cold SDDP solve); subsequent openings warm-carry
                // the LP.
                !is_first,
            ),
        }
    }

    /// Baked all-cuts per-opening solve. Patch the opening bounds on the
    /// already-loaded baked/delta LP, reconstruct + solve via `run_stage_solve`,
    /// extract both state and cut duals, accumulate the outcome (including the
    /// binding-count `slot_increments` update), and capture the first-solved
    /// opening's basis. Body re-homed verbatim from the former free function.
    // Rationale: see [`StageOpeningSolver::solve_opening`]; the disjoint-borrow
    // argument list is unchanged from the former free function.
    #[allow(clippy::too_many_arguments)]
    fn solve_baked<S: SolverInterface + Send>(
        ws: &mut SolverWorkspace<S>,
        ctx: &StageContext<'_>,
        training_ctx: &TrainingContext<'_>,
        succ: &SuccessorSpec<'_>,
        basis_slice: &mut BasisStoreSliceMut<'_>,
        raw_noise: &[f64],
        x_hat: &[f64],
        s: usize,
        scenario: usize,
        iteration: u64,
        m: usize,
        omega: usize,
        is_first: bool,
    ) -> Result<(), SddpError> {
        let indexer = training_ctx.indexer;
        patch_opening_bounds(ws, ctx, training_ctx, raw_noise, x_hat, s);

        // Scratch buffers are moved out before the solve to avoid borrow conflicts
        // with `view`'s lifetime. Pre-warmed capacity is reused across openings.
        let mut state_duals = std::mem::take(&mut ws.backward_accum.state_duals_buf);
        let mut cut_duals = std::mem::take(&mut ws.backward_accum.cut_duals_buf);

        // Reuse the per-worker snapshot buffer (histogram allocation is retained
        // across openings) instead of cloning via `statistics()`.
        let mut stats_before_omega = std::mem::take(&mut ws.backward_accum.stats_before_buf);
        ws.solver.statistics_into(&mut stats_before_omega);

        // The per-(m, s) stored basis is loaded for, and captured from, the
        // FIRST-SOLVED opening of this trial point — not canonical ω=0. When the
        // openings are solved in canonical order these coincide; when solved in a
        // warm-start order they differ, and basis identity must follow the actual
        // first solve. See `process_trial_point_backward`.
        let stored_basis = if is_first {
            resolve_backward_basis(basis_slice, m, s)
        } else {
            None
        };
        let inputs = crate::stage_solve::StageInputs {
            stage_context: ctx,
            pool: succ.successor_pool,
            stored_basis,
            stage_index: s,
            scenario_index: scenario,
            iteration: Some(iteration),
        };

        let view = crate::stage_solve::run_stage_solve(ws, &inputs)?;

        // Extract duals from view (which borrows ws for 'ws).
        // Statistics must be captured after view is dropped.
        let objective = extract_duals_from_view(
            &view,
            indexer.n_state,
            indexer,
            &ctx.templates[s].col_scale,
            succ,
            &mut state_duals,
            &mut cut_duals,
        );
        let _ = view;

        ws.backward_accum.state_duals_buf = state_duals;
        ws.backward_accum.cut_duals_buf = cut_duals;

        let mut stats_after_omega = std::mem::take(&mut ws.backward_accum.stats_after_buf);
        ws.solver.statistics_into(&mut stats_after_omega);

        accumulate_opening_outcome(
            ws,
            succ,
            omega,
            objective,
            x_hat,
            &stats_before_omega,
            &stats_after_omega,
        );

        // Restore the snapshot buffers (with their grown histogram capacity) for
        // the next opening.
        ws.backward_accum.stats_before_buf = stats_before_omega;
        ws.backward_accum.stats_after_buf = stats_after_omega;

        if is_first {
            save_basis_at_omega_zero(ws, succ, basis_slice, m, x_hat);
        }

        Ok(())
    }

    /// Lazy resident-set per-opening solve under Dynamic Cut Selection.
    ///
    /// All openings of a trial point are processed by one worker, sharing the same
    /// pinned incoming state `x̂` and differing only in their noise RHS, so the
    /// cut-free core and the metadata seed are loaded/built ONCE per trial point
    /// by [`StageOpeningSolver::prepare`] and the LP is reused across the
    /// openings. This routine therefore never reloads or re-seeds; it only patches
    /// the opening's bounds and runs the lazy loop via [`lazy_solve_preloaded`]:
    ///
    /// - `continue_carry == false` (the first-solved opening): the lazy loop
    ///   resets the carried row map, appends the seed, and solves with no carried
    ///   per-opening basis (`stored_basis = None`) — it still reuses the loaded
    ///   cut-free core, so this is not a cold SDDP solve, it merely lacks a warm
    ///   per-opening basis to carry. The cut produced is identical to the (former)
    ///   per-opening path.
    /// - `continue_carry == true` (subsequent openings): warm-carry the prior
    ///   opening's LP, basis, and (monotonically grown) resident cut set; re-solve
    ///   warm under the new noise and add only the cuts this opening additionally
    ///   violates. This is what the paper's §3.4 "base recovery" buys, extended
    ///   across the trial point's openings.
    ///
    /// The Benders cut gradient and intercept are read from the final
    /// all-satisfied LP via [`extract_state_duals_only`] in both cases.
    ///
    /// **Binding-count metadata** (`slot_increments`) IS maintained here, but
    /// slot-correct under the lazy layout: the baked path bumps by baked cut-row
    /// order, whereas [`accumulate_dcs_binding_counts`] maps each resident cut row
    /// back to its pool slot via the final [`CutRowMap`] and bumps
    /// `slot_increments[slot]` for residents whose cut-row dual exceeds
    /// `cut_activity_tolerance` (the same criterion the baked path uses). This
    /// feeds the existing per-stage `metadata_sync_contribution` allreduce, which
    /// advances `last_active_iter` rank-invariantly and so restores the §3.1
    /// clause-1 seed (cuts **binding** in the last `k2` iterations) — without
    /// altering the extracted gradient/intercept (those come solely from the
    /// structural state-column reduced costs and are identical to the all-cuts cut
    /// by exactness).
    ///
    /// One piece of the baked path is intentionally NOT performed here:
    ///
    /// - **first-solved basis capture**: a captured basis would describe the baked
    ///   layout, not the DCS resident subset, so it is skipped; the next DCS solve
    ///   cold-starts its initial solve, which [`lazy_solve_preloaded`] supports
    ///   with `stored_basis = None`.
    ///
    /// Body re-homed verbatim from the former free function.
    // Rationale: the args are disjoint borrows (ws, ctx, training_ctx, succ) and
    // per-opening scalars (params, raw_noise, x_hat, s, scenario, iteration,
    // omega); no natural grouping reduces caller-side borrows.
    #[allow(clippy::too_many_arguments)]
    fn solve_lazy<S: SolverInterface + Send>(
        ws: &mut SolverWorkspace<S>,
        ctx: &StageContext<'_>,
        training_ctx: &TrainingContext<'_>,
        succ: &SuccessorSpec<'_>,
        params: DcsParams,
        raw_noise: &[f64],
        x_hat: &[f64],
        s: usize,
        scenario: usize,
        iteration: u64,
        omega: usize,
        continue_carry: bool,
    ) -> Result<(), SddpError> {
        let indexer = training_ctx.indexer;
        // The DCS LP must start from the cut-free base structural template, NOT
        // `succ.baked_template`: when baking is active the baked template already
        // carries the active cut rows, and loading it would make the lazy loop's
        // fresh CutRowMap treat those baked slots as non-resident and append them a
        // second time (duplicate rows, broken laziness). `ctx.templates[s]` is the
        // cut-free base (its `num_rows` equals `succ.template_num_rows`), so the
        // CutRowMap's base offset matches and the lazy loop exclusively owns the
        // cut rows.
        let core = &ctx.templates[s];
        let col_scale = &ctx.templates[s].col_scale;

        // [`StageOpeningSolver::prepare`] has already loaded the cut-free core and
        // built the metadata seed once for this trial point; here we only apply
        // this opening's state-pin + noise patch. `lazy_solve_preloaded` appends
        // cut rows to the loaded, patched LP and never reloads, so the patch — and,
        // on continued openings, the carried cut rows + warm basis — survive the
        // whole lazy loop.
        patch_opening_bounds(ws, ctx, training_ctx, raw_noise, x_hat, s);

        // Reuse the per-worker snapshot buffer (histogram allocation is retained
        // across openings) instead of cloning via `statistics()`.
        let mut stats_before_omega = std::mem::take(&mut ws.backward_accum.stats_before_buf);
        ws.solver.statistics_into(&mut stats_before_omega);

        // Move the state-duals buffer out of `ws.backward_accum` so it can be
        // filled by `extract_state_duals_only` (a `&mut Vec` sink) while `view`
        // holds an immutable borrow of the sibling `dcs_solve` field; restored at
        // the end.
        let mut state_duals = std::mem::take(&mut ws.backward_accum.state_duals_buf);

        let dcs_ctx = DcsSolveContext {
            stage_index: s,
            scenario_index: scenario,
            iteration: Some(iteration),
            // The first-solved opening of the trial point starts without a
            // carried per-opening basis (`stored_basis = None` below): the lazy
            // loop resets the carried row map and appends the metadata seed
            // before solving. Every subsequent opening warm-carries the prior
            // opening's LP, basis, and resident cut set.
            continue_carry,
        };
        // Disjoint borrows of `ws`: `solver`, `backward_accum.dcs_initial_resident`
        // (shared), and `backward_accum.dcs_solve` (mut) are distinct fields.
        // `lazy_solve_preloaded` copies the final solve into `dcs_solve`'s result
        // buffers and returns `Result<()>`; the zero-cost view is rebuilt below.
        lazy_solve_preloaded(
            &mut ws.solver,
            core,
            succ.successor_pool,
            indexer,
            col_scale,
            None,
            &ws.backward_accum.dcs_initial_resident,
            &params,
            &mut ws.backward_accum.dcs_solve,
            dcs_ctx,
        )?;
        // View over the result buffers (borrows `dcs_solve` immutably; coexists
        // with the immutable `dcs_solve.row_map` read in
        // `accumulate_dcs_binding_counts`).
        let view = ws.backward_accum.dcs_solve.result_view();

        // Cut gradient + intercept from the final all-satisfied LP only. The
        // gradient/intercept come solely from the structural state-column reduced
        // costs, identical in the all-cuts and lazy-solve LPs — exactness is
        // unaffected by the binding-count bookkeeping below.
        let objective =
            extract_state_duals_only(&view, indexer.n_state, indexer, col_scale, &mut state_duals);

        // Binding-count metadata, slot-correct under the resident `CutRowMap`. The
        // final all-satisfied solve's cut-row duals (`view.dual`, the full dual
        // copied into `dcs_solve.res_dual`) are read here and mapped slot→row via
        // the persistent row map that `lazy_solve_preloaded` just finalized.
        // Borrows: `view` and `dcs_solve.row_map` both borrow
        // `ws.backward_accum.dcs_solve` immutably (so they coexist);
        // `slot_increments` is a distinct field of `ws.backward_accum` borrowed
        // mutably.
        accumulate_dcs_binding_counts(
            view.dual,
            &ws.backward_accum.dcs_solve.row_map,
            succ.successor_pool,
            succ.cut_activity_tolerance,
            &mut ws.backward_accum.slot_increments,
        );
        let _ = view;

        ws.backward_accum.state_duals_buf = state_duals;

        let mut stats_after_omega = std::mem::take(&mut ws.backward_accum.stats_after_buf);
        ws.solver.statistics_into(&mut stats_after_omega);

        // Outcome only — no first-solved basis capture (that captured basis would
        // describe the baked layout, not the DCS resident subset). The
        // binding-count update is done above, slot-correct under the lazy layout.
        write_opening_outcome(
            ws,
            omega,
            objective,
            x_hat,
            &stats_before_omega,
            &stats_after_omega,
        );

        // Restore the snapshot buffers (with their grown histogram capacity) for
        // the next opening.
        ws.backward_accum.stats_before_buf = stats_before_omega;
        ws.backward_accum.stats_after_buf = stats_after_omega;

        Ok(())
    }
}

/// Process one trial point `m` in the backward pass, iterating over all openings.
///
/// Solves at each (scenario, opening) and accumulates duals into `per_opening_stats`.
/// On the **baked path** only, at the first-solved opening (`solve_order[0]`, which
/// equals canonical ω=0 only under the identity order), writes the post-solve basis
/// into `basis_slice`; later writes are forbidden (retained-LU corruption risk), and
/// infeasibility at the first-solved opening leaves the slot unchanged. The DCS arm
/// skips first-solved basis capture by design — a captured basis would describe the
/// baked layout, not the DCS resident subset (see [`StageOpeningSolver::solve_lazy`])
// RATIONALE: 12 args required — each is a disjoint borrow (ws, ctx, training_ctx, exchange,
// succ, basis_slice, opening_solver) or a plain scalar (fwd_offset, iteration, m, arena_offset)
// or a risk slice. Merging into a struct would add indirection without reducing the caller's
// borrow count.
#[allow(clippy::too_many_arguments)]
pub(crate) fn process_trial_point_backward<S: SolverInterface + Send>(
    ws: &mut SolverWorkspace<S>,
    ctx: &StageContext<'_>,
    training_ctx: &TrainingContext<'_>,
    exchange: &ExchangeBuffers,
    fwd_offset: usize,
    iteration: u64,
    risk_measures: &[RiskMeasure],
    succ: &SuccessorSpec<'_>,
    basis_slice: &mut BasisStoreSliceMut<'_>,
    opening_solver: &StageOpeningSolver,
    m: usize,
    arena_offset: usize,
) -> Result<StagedCut, SddpError> {
    let tree_view = training_ctx.stochastic.tree_view();
    let x_hat = exchange.state_at(succ.my_rank, m);
    let scenario = fwd_offset + m;
    let s = succ.successor;

    debug_assert_eq!(
        ws.backward_accum.per_opening_stats.len(),
        succ.probabilities.len(),
        "per_opening_stats must be initialised to n_openings before each stage's trial-point loop"
    );

    // Throwaway, env-gated backward `noise_key` diagnostic. `None` on the
    // default path (single hoisted boolean check; no key compute, no
    // allocation). When `Some`, after each opening solve we pair the precomputed
    // per-(stage, ω) noise key with that opening's just-consumed
    // `simplex_iterations` and emit one record to stderr.
    let noise_key_diag = training_ctx.noise_key_diag;

    // Solve order for this stage's openings. Openings are SOLVED in
    // `solve_order(s)` — a run-constant, rank-invariant permutation precomputed at
    // setup (always installed descending; see `StudySetup::from_broadcast_params`).
    // Per-opening outcomes are written and aggregated by CANONICAL ω below, so the
    // generated cut is bit-identical regardless of the solve order: the reorder
    // changes only the warm-start chain, never which cuts are produced.
    //
    // `solve_order(s)` defaults to the identity permutation when no order is
    // installed (e.g. in tests that build the context directly), so the loop
    // degrades to canonical `0..n` order in that case.
    let solve_order = tree_view.solve_order_data(s);
    debug_assert_eq!(
        solve_order.len(),
        succ.probabilities.len(),
        "solve_order(s) must be a permutation of 0..n_openings"
    );
    // The FIRST-SOLVED opening of the trial point: it owns the per-(m, s) basis
    // load/capture (baked path) and the fresh (non-warm-carry) solve (DCS path).
    // It is the first entry of the solve-order permutation (ω=0 under the identity
    // default).
    // `u32 -> usize` is a lossless widening on every supported target.
    let first = solve_order[0] as usize;

    let mut omega_position = 0usize;
    while omega_position < succ.probabilities.len() {
        // Canonical ω for this iteration: index the precomputed solve-order
        // permutation (the identity `0..n` when no order is installed).
        let omega = solve_order[omega_position] as usize;
        omega_position += 1;

        let raw_noise = tree_view.opening(s, omega);
        let is_first = omega == first;

        opening_solver.solve_opening(
            ws,
            ctx,
            training_ctx,
            succ,
            basis_slice,
            raw_noise,
            x_hat,
            s,
            scenario,
            iteration,
            m,
            omega,
            is_first,
        )?;

        // Env-gated throwaway emit: one record per backward opening pairing the
        // precomputed σ-weighted noise key with the opening's just-consumed
        // simplex iterations (the only live value pulled from the solve). The
        // outer `if let Some` is the single hoisted boolean check; the default
        // (`None`) path does nothing.
        if let Some(diag) = noise_key_diag {
            let simplex_iterations = ws.backward_accum.per_opening_stats[omega].simplex_iterations;
            let noise_key = diag.key(s, omega).unwrap_or(f64::NAN);
            eprintln!(
                "COBRE_W1_DIAG\tstage={s}\ttrial={scenario}\tomega={omega}\t\
                 noise_key={noise_key:.17e}\tsimplex_iterations={simplex_iterations}"
            );
        }
    }

    // Aggregate into the per-worker `agg_coefficients` scratch (unchanged
    // order/values), then copy the result verbatim into this trial point's slot
    // of the per-worker arena so the bytes outlive the parallel closure without
    // a per-cut heap allocation (see module-level docs and `agg_arena`).
    let n_openings = succ.probabilities.len();
    let mut agg_intercept = 0.0_f64;
    risk_measures[succ.t].aggregate_cut_into(
        &ws.backward_accum.outcomes[..n_openings],
        succ.probabilities,
        &mut agg_intercept,
        &mut ws.backward_accum.agg_coefficients,
        &mut ws.backward_accum.risk_scratch,
    );
    let n_state = ws.backward_accum.agg_coefficients.len();
    let coefficients_range = arena_offset..arena_offset + n_state;
    debug_assert!(
        coefficients_range.end <= ws.backward_accum.agg_arena.len(),
        "agg_arena must be sized to cover this trial point's slot before the solve"
    );
    // `copy_from_slice` writes exactly the bytes the former `.clone()` produced;
    // the arena is storage only — no value transform, no reordering.
    ws.backward_accum.agg_arena[coefficients_range.clone()]
        .copy_from_slice(&ws.backward_accum.agg_coefficients[..n_state]);
    debug_assert!(
        u32::try_from(scenario).is_ok(),
        "global scenario index overflows u32"
    );
    #[allow(clippy::cast_possible_truncation)]
    let forward_pass_index = scenario as u32;
    // Accumulate binding counts into the metadata buffer for later merge.
    let pop = ws.backward_accum.slot_increments.len();
    for slot in 0..pop {
        let count = ws.backward_accum.slot_increments[slot];
        if count > 0 {
            ws.backward_accum.metadata_sync_contribution[slot] += count;
        }
    }
    Ok(StagedCut {
        trial_point_idx: m,
        intercept: agg_intercept,
        coefficients_range,
        forward_pass_index,
    })
}