dsfb-debug 0.1.0

DSFB-Debug — Structural Semiotics Engine for Software Debugging. A deterministic, read-only, observer-only augmentation layer for execution-trace residual interpretation. Does NOT replace existing observability tools — augments them with typed structural interpretation.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
//! Leave-one-fixture-out cross-validation aggregation (Phase ζ.2).
//!
//! Operates on `LooCvFixtureRecord` values captured per-fixture by
//! the LO-CV test runner (`tests/loo_cross_validation.rs`); produces
//! cross-fixture mean / median / stddev aggregates plus per-fixture
//! deltas relative to the leave-one-out training mean.
//!
//! The LO-CV protocol (panel discipline P3 / P9 / P16):
//!
//! 1. For each fixture i ∈ [0..N], compute per-fixture metrics
//!    (RSCR, FP rate, fault recall, replay-holds count, episode
//!    count) under a fixed `FusionConfig`.
//! 2. The "training mean for held-out i" is the mean over the other
//!    N-1 fixtures.
//! 3. The "test value at i" is the per-fixture metric of fixture i.
//! 4. The cross-fixture aggregate stddev (over all N fixtures) is
//!    the LO-1 reliability proxy.
//!
//! No bank mutation, no synthetic-data generation. Theorem 9
//! preservation: every fixture's per-fixture metrics include
//! `deterministic_replay_holds`; the aggregate counts the number of
//! fixtures where replay was verified.

extern crate std;

use std::collections::BTreeMap;
use std::format;
use std::string::String;
use std::vec::Vec;

/// Per-fixture record captured during a LO-CV pass.
///
/// One record is produced per fixture by the LO-CV test runner
/// (`tests/loo_cross_validation.rs`); the aggregation functions in
/// this module consume slices of these.
#[derive(Debug, Clone)]
pub struct LooCvFixtureRecord {
    /// Name of the fixture this record was captured from.
    pub fixture_name: &'static str,
    /// RSCR metric for this fixture (higher is better — see
    /// `refinement_passes_gate`).
    pub rscr: f64,
    /// Clean-window false-positive rate (lower is better).
    pub clean_window_fp_rate: f64,
    /// Fault recall (higher is better).
    pub fault_recall: f64,
    /// Raw alert count before fusion; summed into
    /// `LooCvAggregate::total_raw_alerts`.
    pub raw_alert_count: u64,
    /// Fusion episode count; summed into `total_episodes`.
    pub fusion_episode_count: u64,
    /// Consensus-confirmed typed-episode count; summed into both
    /// `total_typed_episodes` and `total_consensus_confirmed`.
    pub consensus_confirmed_typed_episodes: u64,
    /// Whether deterministic replay (Theorem 9) was verified for this
    /// fixture; the aggregate counts fixtures where this is `true`.
    pub deterministic_replay_holds: bool,
}

/// Cross-fixture aggregate over LO-CV records.
///
/// Produced by `aggregate_loo_cv`; consumed by the refinement gate
/// (`refinement_passes_gate`) and the markdown renderers.
#[derive(Debug, Clone)]
pub struct LooCvAggregate {
    /// Number of per-fixture records aggregated (N).
    pub fixtures_observed: usize,
    /// Number of fixtures where `deterministic_replay_holds` was true.
    pub fixtures_with_replay_holds: usize,
    /// Mean RSCR over all N fixtures.
    pub mean_rscr: f64,
    /// Population stddev of RSCR (divides by N, not N-1).
    pub stddev_rscr: f64,
    /// Mean clean-window FP rate over all N fixtures.
    pub mean_clean_window_fp_rate: f64,
    /// Population stddev of the clean-window FP rate.
    pub stddev_clean_window_fp_rate: f64,
    /// Mean fault recall over all N fixtures.
    pub mean_fault_recall: f64,
    /// Population stddev of fault recall.
    pub stddev_fault_recall: f64,
    /// Sum of `raw_alert_count` over all fixtures.
    pub total_raw_alerts: u64,
    /// Sum of `fusion_episode_count` over all fixtures.
    pub total_episodes: u64,
    /// Sum of `consensus_confirmed_typed_episodes` over all fixtures.
    pub total_typed_episodes: u64,
    /// Phase ζ.9 — informational delta for bank-aware Layer-3 typing.
    /// Captured per-fixture; the aggregate sums them. Useful for
    /// detecting refinement effects that don't show up in L-2 metrics
    /// (e.g. family-tier detector filtering changes window_tier_mask
    /// → bank affinity scoring → typed-confirmed count, but not
    /// cell_consensus → L-2 FP rate).
    /// NOTE: currently set to the same sum as `total_typed_episodes`
    /// (see the alias in `aggregate_loo_cv`).
    pub total_consensus_confirmed: u64,
    /// LO-CV per-fixture deltas: for each fixture i, the value of
    /// the metric at i minus the mean over the other N-1 fixtures.
    /// Keyed by metric name → Vec<(fixture_name, delta)>.
    /// Empty when N < 2 (no "other fixtures" to average over).
    pub per_fixture_deltas: BTreeMap<&'static str, Vec<(&'static str, f64)>>,
}

/// Convenience entry point: fold a slice of per-fixture LO-CV records
/// into a single cross-fixture aggregate.
///
/// The LO-CV iteration itself (one `run_fusion_evaluation` call per
/// fixture) lives in `tests/loo_cross_validation.rs`; this function is
/// a thin alias for [`aggregate_loo_cv`] kept for call-site
/// readability.
pub fn run_loo_cv(records: &[LooCvFixtureRecord]) -> LooCvAggregate {
    aggregate_loo_cv(records)
}

/// Compute the LO-CV aggregate from per-fixture records.
///
/// Returns an all-zero aggregate for an empty slice. Stddevs are
/// population stddevs (divide by N). Per-fixture deltas are only
/// populated when N > 1.
pub fn aggregate_loo_cv(records: &[LooCvFixtureRecord]) -> LooCvAggregate {
    // Mean of a projected metric over all records.
    fn mean_of<F: Fn(&LooCvFixtureRecord) -> f64>(rs: &[LooCvFixtureRecord], f: F) -> f64 {
        rs.iter().map(|r| f(r)).sum::<f64>() / rs.len() as f64
    }
    // Population standard deviation of a projected metric about `mean`.
    fn stddev_of<F: Fn(&LooCvFixtureRecord) -> f64>(
        rs: &[LooCvFixtureRecord],
        mean: f64,
        f: F,
    ) -> f64 {
        (rs.iter().map(|r| (f(r) - mean).powi(2)).sum::<f64>() / rs.len() as f64).sqrt()
    }

    let n = records.len();
    if n == 0 {
        // Empty input: report zeros rather than NaN from 0/0.
        return LooCvAggregate {
            fixtures_observed: 0,
            fixtures_with_replay_holds: 0,
            mean_rscr: 0.0,
            stddev_rscr: 0.0,
            mean_clean_window_fp_rate: 0.0,
            stddev_clean_window_fp_rate: 0.0,
            mean_fault_recall: 0.0,
            stddev_fault_recall: 0.0,
            total_raw_alerts: 0,
            total_episodes: 0,
            total_typed_episodes: 0,
            total_consensus_confirmed: 0,
            per_fixture_deltas: BTreeMap::new(),
        };
    }
    let nf = n as f64;

    let mean_rscr = mean_of(records, |r| r.rscr);
    let mean_fp = mean_of(records, |r| r.clean_window_fp_rate);
    let mean_recall = mean_of(records, |r| r.fault_recall);

    let stddev_rscr = stddev_of(records, mean_rscr, |r| r.rscr);
    let stddev_fp = stddev_of(records, mean_fp, |r| r.clean_window_fp_rate);
    let stddev_recall = stddev_of(records, mean_recall, |r| r.fault_recall);

    let total_raw: u64 = records.iter().map(|r| r.raw_alert_count).sum();
    let total_eps: u64 = records.iter().map(|r| r.fusion_episode_count).sum();
    let total_typed: u64 = records
        .iter()
        .map(|r| r.consensus_confirmed_typed_episodes)
        .sum();
    let replay_holds = records
        .iter()
        .filter(|r| r.deterministic_replay_holds)
        .count();

    // Per-fixture deltas: for each fixture i, value(i) - mean(others).
    // Closed form: delta_i = (value(i) - mean) * n / (n-1), since
    // mean(others) = (n*mean - value(i)) / (n-1).
    let mut per_fixture_deltas: BTreeMap<&'static str, Vec<(&'static str, f64)>> =
        BTreeMap::new();
    if n > 1 {
        let scale = nf / (nf - 1.0);
        for rec in records {
            let metric_deltas = [
                ("rscr", (rec.rscr - mean_rscr) * scale),
                ("clean_window_fp_rate", (rec.clean_window_fp_rate - mean_fp) * scale),
                ("fault_recall", (rec.fault_recall - mean_recall) * scale),
            ];
            for (metric, delta) in metric_deltas {
                per_fixture_deltas
                    .entry(metric)
                    .or_default()
                    .push((rec.fixture_name, delta));
            }
        }
    }

    LooCvAggregate {
        fixtures_observed: n,
        fixtures_with_replay_holds: replay_holds,
        mean_rscr,
        stddev_rscr,
        mean_clean_window_fp_rate: mean_fp,
        stddev_clean_window_fp_rate: stddev_fp,
        mean_fault_recall: mean_recall,
        stddev_fault_recall: stddev_recall,
        total_raw_alerts: total_raw,
        total_episodes: total_eps,
        total_typed_episodes: total_typed,
        total_consensus_confirmed: total_typed, // alias; refined-vs-baseline delta visible in render
        per_fixture_deltas,
    }
}

/// LO-CV refinement-acceptance gate.
///
/// Returns `true` iff every metric's post-refinement LO-CV mean is
/// within the user-locked tolerance of the baseline LO-CV mean.
///
/// Tolerance per Session-17 user choice: `ε = 0.5 × baseline-stddev`,
/// no regression on any of {rscr, clean_window_fp_rate, fault_recall,
/// replay_holds_count}.
pub fn refinement_passes_gate(
    baseline: &LooCvAggregate,
    refined: &LooCvAggregate,
) -> RefinementGateVerdict {
    let mut failures: Vec<String> = Vec::new();

    // RSCR — higher is better; must stay at or above the floor.
    let floor_rscr = baseline.mean_rscr - 0.5 * baseline.stddev_rscr;
    if refined.mean_rscr < floor_rscr {
        failures.push(format!(
            "RSCR regressed: refined mean {:.4} < baseline mean {:.4} − 0.5·stddev = {:.4}",
            refined.mean_rscr, baseline.mean_rscr, floor_rscr));
    }

    // FP rate — lower is better; must stay at or below the ceiling.
    let ceiling_fp = baseline.mean_clean_window_fp_rate
        + 0.5 * baseline.stddev_clean_window_fp_rate;
    if refined.mean_clean_window_fp_rate > ceiling_fp {
        failures.push(format!(
            "FP rate regressed: refined mean {:.4} > baseline mean {:.4} + 0.5·stddev = {:.4}",
            refined.mean_clean_window_fp_rate, baseline.mean_clean_window_fp_rate, ceiling_fp));
    }

    // Fault recall — higher is better; must stay at or above the floor.
    let floor_recall = baseline.mean_fault_recall - 0.5 * baseline.stddev_fault_recall;
    if refined.mean_fault_recall < floor_recall {
        failures.push(format!(
            "Fault recall regressed: refined mean {:.4} < baseline mean {:.4} − 0.5·stddev = {:.4}",
            refined.mean_fault_recall, baseline.mean_fault_recall, floor_recall));
    }

    // Theorem 9 replay-holds count must not decrease (exact, no tolerance).
    if refined.fixtures_with_replay_holds < baseline.fixtures_with_replay_holds {
        failures.push(format!(
            "Replay-holds regressed: refined {} < baseline {}",
            refined.fixtures_with_replay_holds, baseline.fixtures_with_replay_holds));
    }

    match failures.is_empty() {
        true => RefinementGateVerdict::Accept,
        false => RefinementGateVerdict::Reject { reasons: failures },
    }
}

/// Outcome of the LO-CV refinement-acceptance gate
/// (`refinement_passes_gate`).
#[derive(Debug, Clone)]
pub enum RefinementGateVerdict {
    /// No metric regressed beyond the locked tolerance.
    Accept,
    /// At least one metric regressed; `reasons` carries one
    /// human-readable explanation per failed check.
    Reject { reasons: Vec<String> },
}

// ===== Phase η.2 — K-fold cross-validation aggregate =====

/// K-fold CV record: one entry per fold.
///
/// For each fold, the test set is the held-out subset of fixtures
/// and the train set is the rest. The harness records the aggregate
/// over the test set for that fold.
#[derive(Debug, Clone)]
pub struct KFoldFoldRecord {
    /// Zero-based index of this fold within the K-fold pass.
    pub fold_index: usize,
    /// Names of the fixtures held out as this fold's test set.
    pub test_fixtures: Vec<&'static str>,
    /// LO-CV-style aggregate computed over this fold's test set only.
    pub test_aggregate: LooCvAggregate,
}

/// K-fold cross-validation aggregate. Captures per-fold test
/// metrics + the cross-fold mean / stddev.
#[derive(Debug, Clone)]
pub struct KFoldCvAggregate {
    /// Requested number of folds (K).
    pub k: usize,
    /// Maximum records per fold; 0 when the K/n constraints were not met.
    pub fixtures_per_fold: usize,
    /// Per-fold records, in fold-index order.
    pub folds: Vec<KFoldFoldRecord>,
    /// Mean of the per-fold mean RSCR values.
    pub cross_fold_mean_rscr: f64,
    /// Population stddev of the per-fold mean RSCR values.
    pub cross_fold_stddev_rscr: f64,
    /// Mean of the per-fold mean clean-window FP rates.
    pub cross_fold_mean_fp: f64,
    /// Population stddev of the per-fold mean FP rates.
    pub cross_fold_stddev_fp: f64,
    /// Mean of the per-fold mean fault recalls.
    pub cross_fold_mean_recall: f64,
    /// Population stddev of the per-fold mean fault recalls.
    pub cross_fold_stddev_recall: f64,
    /// True iff every fold's every fixture verified deterministic
    /// replay (Theorem 9). False for the degenerate empty aggregate.
    pub all_folds_replay_holds: bool,
}

/// Run K-fold cross-validation over a slice of records.
///
/// Records are partitioned into K contiguous folds in their declared
/// order. Each fold's aggregate is computed independently. The
/// cross-fold mean / stddev is reported as the K-fold CV result,
/// complementing the LO-1 CV from `aggregate_loo_cv`.
///
/// Constraints: K >= 2, n >= K (at least one record per fold).
///
/// Partitioning is balanced: fold i covers records
/// `[i*n/k .. (i+1)*n/k)`, so exactly K folds are produced, every
/// fold is non-empty, and fold sizes differ by at most one. (The
/// previous ceiling-division chunking could emit fewer than K folds —
/// e.g. n = 4, k = 3 yielded two folds of two records and silently
/// skipped the third — violating the documented "at least one record
/// per fold" constraint. For n divisible by K the two schemes agree.)
pub fn aggregate_kfold_cv(records: &[LooCvFixtureRecord], k: usize) -> KFoldCvAggregate {
    let n = records.len();
    // Degenerate input: return an empty all-zero aggregate rather
    // than panic, mirroring `aggregate_loo_cv`'s empty-slice path.
    if k < 2 || n < k {
        return KFoldCvAggregate {
            k, fixtures_per_fold: 0, folds: Vec::new(),
            cross_fold_mean_rscr: 0.0, cross_fold_stddev_rscr: 0.0,
            cross_fold_mean_fp: 0.0, cross_fold_stddev_fp: 0.0,
            cross_fold_mean_recall: 0.0, cross_fold_stddev_recall: 0.0,
            all_folds_replay_holds: false,
        };
    }

    // Maximum fold size (ceiling of n/k); with the balanced split
    // below, every fold has either this many records or one fewer.
    let fpf = (n + k - 1) / k;

    let mut folds: Vec<KFoldFoldRecord> = Vec::with_capacity(k);
    let mut all_replay = true;

    for fold_idx in 0..k {
        // Balanced contiguous partition: guarantees exactly k
        // non-empty folds whose sizes differ by at most one.
        let start = fold_idx * n / k;
        let end = (fold_idx + 1) * n / k;
        let test_set: Vec<LooCvFixtureRecord> = records[start..end].to_vec();
        let test_names: Vec<&'static str> = test_set.iter()
            .map(|r| r.fixture_name).collect();
        let agg = aggregate_loo_cv(&test_set);
        // A fold upholds Theorem 9 only if every fixture in its test
        // set verified deterministic replay.
        if agg.fixtures_with_replay_holds != agg.fixtures_observed {
            all_replay = false;
        }
        folds.push(KFoldFoldRecord {
            fold_index: fold_idx,
            test_fixtures: test_names,
            test_aggregate: agg,
        });
    }

    // n >= k >= 2 guarantees folds is non-empty, so kf > 0 here.
    let kf = folds.len() as f64;
    let mean_rscr = folds.iter().map(|f| f.test_aggregate.mean_rscr).sum::<f64>() / kf;
    let mean_fp = folds.iter().map(|f| f.test_aggregate.mean_clean_window_fp_rate).sum::<f64>() / kf;
    let mean_recall = folds.iter().map(|f| f.test_aggregate.mean_fault_recall).sum::<f64>() / kf;
    let stddev_rscr = (folds.iter()
        .map(|f| (f.test_aggregate.mean_rscr - mean_rscr).powi(2))
        .sum::<f64>() / kf).sqrt();
    let stddev_fp = (folds.iter()
        .map(|f| (f.test_aggregate.mean_clean_window_fp_rate - mean_fp).powi(2))
        .sum::<f64>() / kf).sqrt();
    let stddev_recall = (folds.iter()
        .map(|f| (f.test_aggregate.mean_fault_recall - mean_recall).powi(2))
        .sum::<f64>() / kf).sqrt();

    KFoldCvAggregate {
        k, fixtures_per_fold: fpf,
        folds,
        cross_fold_mean_rscr: mean_rscr,
        cross_fold_stddev_rscr: stddev_rscr,
        cross_fold_mean_fp: mean_fp,
        cross_fold_stddev_fp: stddev_fp,
        cross_fold_mean_recall: mean_recall,
        cross_fold_stddev_recall: stddev_recall,
        all_folds_replay_holds: all_replay,
    }
}

/// Render the K-fold CV aggregate as markdown.
///
/// Emits run parameters, a per-fold table, the cross-fold summary
/// table, and a short interpretation note.
pub fn render_kfold_cv_md(agg: &KFoldCvAggregate) -> String {
    let mut md = String::new();

    // Header + run parameters.
    md.push_str(&format!("# K-fold cross-validation (K = {}) — Phase η.2\n\n", agg.k));
    md.push_str("Source: Phase η.2 K-fold harness (`src/audit/loo_cv.rs::aggregate_kfold_cv`).\n\n");
    md.push_str(&format!("**Folds:** {}\n", agg.folds.len()));
    md.push_str(&format!("**Fixtures per fold:** {} (last fold may be smaller)\n", agg.fixtures_per_fold));
    md.push_str(&format!("**All folds Theorem 9 replay holds:** {}\n\n", agg.all_folds_replay_holds));

    // Per-fold table: one row per held-out test set.
    md.push_str("## Per-fold test-set aggregates\n\n");
    md.push_str("| Fold | Test fixtures | RSCR | FP rate | Fault recall | Replay |\n");
    md.push_str("|-----:|---------------|-----:|--------:|-------------:|:------:|\n");
    for fold in &agg.folds {
        let fixture_list = fold.test_fixtures.iter()
            .map(|name| format!("`{}`", name))
            .collect::<Vec<_>>()
            .join(", ");
        let t = &fold.test_aggregate;
        md.push_str(&format!(
            "| {} | {} | {:.4} | {:.4} | {:.4} | {} / {} |\n",
            fold.fold_index,
            fixture_list,
            t.mean_rscr,
            t.mean_clean_window_fp_rate,
            t.mean_fault_recall,
            t.fixtures_with_replay_holds,
            t.fixtures_observed,
        ));
    }

    // Cross-fold summary table, one row per metric.
    md.push_str("\n## Cross-fold aggregate\n\n");
    md.push_str("| Metric | Cross-fold mean | Cross-fold stddev |\n");
    md.push_str("|--------|----------------:|------------------:|\n");
    let summary_rows = [
        ("RSCR", agg.cross_fold_mean_rscr, agg.cross_fold_stddev_rscr),
        ("Clean-window FP rate", agg.cross_fold_mean_fp, agg.cross_fold_stddev_fp),
        ("Fault recall", agg.cross_fold_mean_recall, agg.cross_fold_stddev_recall),
    ];
    for (label, mean, stddev) in summary_rows {
        md.push_str(&format!("| {} | {:.4} | {:.4} |\n", label, mean, stddev));
    }

    // Interpretation note; N is recovered by summing per-fold sizes.
    let total_fixtures: usize = agg.folds.iter()
        .map(|fold| fold.test_aggregate.fixtures_observed)
        .sum();
    md.push_str("\n## Honest empirical reading\n\n");
    md.push_str("K-fold CV averages over multiple test-set configurations\n");
    md.push_str("(in contrast to LO-1 which holds out one fixture at a time).\n");
    md.push_str(&format!("With N = {} fixtures and K = {}, ", total_fixtures, agg.k));
    md.push_str("each fold tests on a multi-fixture set (lower variance per\n");
    md.push_str("fold than LO-1, but fewer folds than LO-1 — the two views\n");
    md.push_str("triangulate the cross-validation noise floor).\n");
    md
}

/// Render the LO-CV baseline aggregate as markdown.
///
/// Emits the cross-fixture aggregate table (with the gate floors /
/// ceiling used by `refinement_passes_gate`), the episode totals, and
/// the per-fixture delta tables.
pub fn render_loo_cv_baseline_md(agg: &LooCvAggregate, label: &str) -> String {
    let mut md = String::new();

    // Header + replay summary.
    md.push_str(&format!("# Leave-one-fixture-out cross-validation: {}\n\n", label));
    md.push_str("Source: Phase ζ.2 LO-CV harness (`src/audit/loo_cv.rs`).\n\n");
    md.push_str(&format!("**Fixtures observed:** {}\n", agg.fixtures_observed));
    md.push_str(&format!("**Fixtures with deterministic replay verified:** {} / {}\n\n",
        agg.fixtures_with_replay_holds, agg.fixtures_observed));

    // Aggregate table; the last column is the gate bound for each
    // metric (floor for RSCR/recall, ceiling for FP rate).
    md.push_str("## Cross-fixture aggregate\n\n");
    md.push_str("| Metric | Mean | Stddev | LO-CV gate floor (mean−0.5·stddev) |\n");
    md.push_str("|--------|-----:|-------:|----------------------------------:|\n");
    md.push_str(&format!("| RSCR | {:.4} | {:.4} | {:.4} |\n",
        agg.mean_rscr, agg.stddev_rscr,
        agg.mean_rscr - 0.5 * agg.stddev_rscr));
    md.push_str(&format!("| Clean-window FP rate | {:.4} | {:.4} | (ceiling: {:.4}) |\n",
        agg.mean_clean_window_fp_rate, agg.stddev_clean_window_fp_rate,
        agg.mean_clean_window_fp_rate + 0.5 * agg.stddev_clean_window_fp_rate));
    md.push_str(&format!("| Fault recall | {:.4} | {:.4} | {:.4} |\n",
        agg.mean_fault_recall, agg.stddev_fault_recall,
        agg.mean_fault_recall - 0.5 * agg.stddev_fault_recall));

    // Episode totals.
    md.push_str(&format!("\n**Total raw alerts:** {}\n", agg.total_raw_alerts));
    md.push_str(&format!("**Total episodes:** {}\n", agg.total_episodes));
    md.push_str(&format!("**Total typed-confirmed episodes:** {}\n\n",
        agg.total_typed_episodes));

    // Per-fixture delta tables, one section per metric (in the fixed
    // metric order used by `aggregate_loo_cv`).
    md.push_str("## Per-fixture LO-CV deltas\n\n");
    md.push_str("Delta = (fixture metric) − (mean over other N-1 fixtures).\n");
    md.push_str("Large positive deltas indicate the fixture pulls the mean upward;\n");
    md.push_str("large negative deltas indicate the fixture pulls it downward.\n\n");
    for metric in ["rscr", "clean_window_fp_rate", "fault_recall"] {
        let deltas = match agg.per_fixture_deltas.get(metric) {
            Some(d) => d,
            None => continue,
        };
        md.push_str(&format!("### {}\n\n", metric));
        md.push_str("| Fixture | Delta |\n");
        md.push_str("|---------|------:|\n");
        for (fixture, delta) in deltas {
            md.push_str(&format!("| `{}` | {:+.4} |\n", fixture, delta));
        }
        md.push('\n');
    }
    md
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::vec;

    /// Build a minimal fixture record with the given L-2 metrics;
    /// counts are fixed and replay is marked as verified.
    fn record(name: &'static str, rscr: f64, fp: f64, recall: f64) -> LooCvFixtureRecord {
        LooCvFixtureRecord {
            fixture_name: name,
            rscr,
            clean_window_fp_rate: fp,
            fault_recall: recall,
            raw_alert_count: 1,
            fusion_episode_count: 1,
            consensus_confirmed_typed_episodes: 0,
            deterministic_replay_holds: true,
        }
    }

    #[test]
    fn empty_records_produce_zero_aggregate() {
        // An empty slice must yield zeros, not NaN.
        let empty: &[LooCvFixtureRecord] = &[];
        let agg = aggregate_loo_cv(empty);
        assert_eq!(agg.fixtures_observed, 0);
        assert_eq!(agg.mean_rscr, 0.0);
    }

    #[test]
    fn aggregate_computes_mean_and_stddev() {
        let fixtures = vec![
            record("a", 1.0, 0.10, 0.5),
            record("b", 2.0, 0.20, 0.7),
            record("c", 3.0, 0.30, 0.9),
        ];
        let agg = aggregate_loo_cv(&fixtures);
        assert_eq!(agg.fixtures_observed, 3);
        assert!((agg.mean_rscr - 2.0).abs() < 1e-9);
        // Population stddev of [1, 2, 3] = sqrt(2/3) ≈ 0.8165.
        assert!((agg.stddev_rscr - 0.8164965809277260).abs() < 1e-9);
    }

    #[test]
    fn refinement_gate_accepts_within_tolerance() {
        let fixtures = vec![
            record("a", 1.0, 0.10, 0.5),
            record("b", 2.0, 0.20, 0.7),
            record("c", 3.0, 0.30, 0.9),
        ];
        let base = aggregate_loo_cv(&fixtures);
        // Refinement: same fixtures, same numbers — no regression.
        let refined = base.clone();
        if let RefinementGateVerdict::Reject { reasons } =
            refinement_passes_gate(&base, &refined)
        {
            panic!("should accept: {:?}", reasons);
        }
    }

    #[test]
    fn refinement_gate_rejects_recall_regression() {
        // mean_recall = 0.7, stddev = 0 ⇒ floor = 0.7; dropping to
        // 0.6 must fail the recall check.
        let base = aggregate_loo_cv(&[
            record("a", 1.0, 0.10, 0.7),
            record("b", 2.0, 0.20, 0.7),
            record("c", 3.0, 0.30, 0.7),
        ]);
        let refined = aggregate_loo_cv(&[
            record("a", 1.0, 0.10, 0.6),
            record("b", 2.0, 0.20, 0.6),
            record("c", 3.0, 0.30, 0.6),
        ]);
        match refinement_passes_gate(&base, &refined) {
            RefinementGateVerdict::Reject { reasons } => {
                assert!(reasons.iter().any(|r| r.contains("Fault recall")));
            }
            RefinementGateVerdict::Accept => panic!("should reject"),
        }
    }
}