noos 0.4.1

Reliability layer for Rust LLM agents: scope drift, cost circuit breaks, and procedural correction memory as event-driven Decisions.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
//! Token statistics accumulator — rolling logprob window over an LLM's
//! token stream with an entropy-based confidence readout.
//!
//! **Scope note (P1 / P9b)**: this is an I/O sub-module of
//! [`regulator`](super), not a cognitive module. All computations here
//! operate on numeric tokens + logprobs (primary path) or on
//! language-neutral structural signals (length, question-mark density —
//! fallback path). No sentiment lexicons, no topic regex. P1 applies to
//! the wrapped [`CognitiveSession`](crate::session::CognitiveSession);
//! here we satisfy P9b by construction.
//!
//! ## R2 decision (Session 17)
//!
//! Per-token logprobs are not universally available:
//!
//! - **Available**: OpenAI (`logprobs: true`), vLLM, local candle
//!   inference, most open-model runtimes.
//! - **Not available by default**: Anthropic API (as of 2026-04).
//!
//! This module follows a hybrid strategy (recommended in the Path 2
//! architecture plan R2 row):
//!
//! 1. **Primary path** — when the provider exposes logprobs, compute
//!    confidence from mean negative-log-likelihood over a rolling
//!    window. This is the high-fidelity signal that detects gibberish
//!    (uniformly high NLL on OOD text) and local uncertainty runs.
//! 2. **Fallback path** — when logprobs are absent (callers pass
//!    `LOGPROB_UNAVAILABLE` = `0.0` per the `LLMEvent::Token` contract),
//!    fall back to language-neutral structural heuristics on the final
//!    response text: very short replies or high question-mark density
//!    cap confidence below neutral.
//!
//! The fallback is deliberately conservative: it cannot distinguish a
//! confident short answer from a refusal, so it errs low. Callers that
//! need stronger calibration should prefer a provider that exposes
//! logprobs.
//!
//! ## Scope for Session 17
//!
//! This module lands the scalar confidence readout. Per-fragment
//! `ConfidenceSpan` analysis (from [`Decision::LowConfidenceSpans`]) is
//! Session 18+ work — it needs span metadata this MVP doesn't track.
//!
//! ## Gating (P10)
//!
//! This module does NOT produce a [`Decision`] variant in 0.3.0. The
//! scalar it exposes via [`Regulator::confidence`] is an observability
//! readout — callers can use it to build their own heuristics or to
//! feed external logging.
//!
//! - **Suppresses**: nothing (pure observability).
//! - **Suppressed by**: nothing.
//! - **Inactive when**: no tokens have been observed this turn AND no
//!   [`TurnComplete`](super::LLMEvent::TurnComplete) has landed yet.
//!   In that state [`Regulator::confidence`] returns
//!   [`NEUTRAL_CONFIDENCE`] (0.5) rather than claiming false certainty.
//!
//! The reserved [`Decision::LowConfidenceSpans`] variant will consume
//! the same rolling-window data once span-level tracking lands; its
//! own gating section will be added with that change.
//!
//! [`CognitiveSession`]: crate::session::CognitiveSession
//! [`Decision`]: super::Decision
//! [`Decision::LowConfidenceSpans`]: super::Decision::LowConfidenceSpans
//! [`Regulator::confidence`]: super::Regulator::confidence

use std::collections::VecDeque;

use serde::{Deserialize, Serialize};

use crate::math::clamp;

// ── Constants ──────────────────────────────────────────────────────────

/// Default rolling window size for per-token logprobs.
///
/// 128 tokens ≈ one or two short paragraphs at typical tokenisation rates.
/// Large enough to smooth per-token noise, small enough that a local
/// uncertainty run (a confused span in the middle of a response) is not
/// drowned out by surrounding confident tokens. Derivation: at an average
/// of ~1 token / 4 chars and ~600-char confident chunks surrounding a
/// 200-char confused span, 128 tokens captures the confused span
/// proportionally.
///
/// Used by [`TokenStatsAccumulator::new`]; pass a different size to
/// [`TokenStatsAccumulator::with_window`] to override it.
pub const DEFAULT_WINDOW_SIZE: usize = 128;

/// Sentinel value for "logprob unavailable", per the
/// [`LLMEvent::Token`](super::LLMEvent::Token) contract documented in
/// `docs/regulator-design.md` §3.2.
///
/// Real natural-log probabilities are always ≤ 0 (ln of a value in
/// `(0, 1]` is non-positive); a logprob of exactly `0.0` would mean
/// p = 1.0 which effectively never happens on LLM output. Using `0.0`
/// as the sentinel avoids a type-level `Option<f64>` while keeping the
/// collision risk negligible in practice.
///
/// The sentinel is not the only rejected value:
/// [`TokenStatsAccumulator::on_token`] treats every logprob `>= 0.0`
/// (a comparison that `-0.0` also satisfies) and every non-finite
/// logprob (NaN, ±∞) as "unavailable" (fail-open per P5).
pub const LOGPROB_UNAVAILABLE: f64 = 0.0;

/// Mean negative-log-likelihood (nats) that maps to confidence = 0.5.
///
/// Derivation: modern LLMs on in-distribution English text show mean
/// per-token NLL around 2.5–3.5 nats (perplexity 12–33). OOD / gibberish
/// text pushes NLL to 5–7 nats (perplexity 150–1100). 4.0 sits between
/// the two regimes — confident output pulls above 0.5, uncertain /
/// gibberish pulls below 0.5. Calibration is linear in MVP; a logistic
/// fit is tracked for Session 24 real-LLM eval.
pub const MEAN_NLL_CONFIDENCE_MIDPOINT: f64 = 4.0;

/// Half-range of the linear NLL→confidence mapping (nats).
///
/// The mapping applied by [`TokenStatsAccumulator::logprob_confidence`] is
/// `0.5 + 0.5 * (MIDPOINT - mean_nll) / HALF_RANGE`, clamped to `[0, 1]`.
/// With midpoint 4.0 and half-range 3.0, NLL = 1.0 (very confident) maps
/// to confidence 1.0, NLL = 7.0 (very uncertain / gibberish) maps to
/// confidence 0.0. Symmetric around the midpoint; anything outside
/// `[1.0, 7.0]` nats saturates at the corresponding end.
pub const MEAN_NLL_CONFIDENCE_HALF_RANGE: f64 = 3.0;

/// Structural fallback: minimum response length in characters below
/// which confidence is capped low.
///
/// Very short replies are disproportionately likely to be refusals
/// ("I can't help with that") or failed attempts ("Sorry, I don't know")
/// rather than decisive short answers. The cap is conservative by design.
///
/// [`structural_confidence`] measures length in `char`s (Unicode scalar
/// values, not bytes) and compares strictly: a response of exactly this
/// many characters is NOT considered short.
pub const STRUCT_MIN_LENGTH_CHARS: usize = 40;

/// Structural fallback: question-mark-to-character ratio at or above
/// which the response is treated as clarification-seeking rather than
/// answer-giving.
///
/// 0.02 = 2 `?` per 100 chars. A normal declarative response is 0%. A
/// response asking back ("Which file did you mean? And which function?")
/// typically exceeds 5% on any non-trivial clarification.
///
/// The threshold is inclusive: [`structural_confidence`] raises the flag
/// when the ratio is `>=` this value.
pub const STRUCT_HIGH_QUESTION_RATIO: f64 = 0.02;

/// Structural fallback: confidence returned when the response text looks
/// unremarkable (not too short, not question-heavy).
///
/// Set at 0.7 rather than neutral 0.5 because absence of red flags is
/// mildly positive evidence — but we cap below 1.0 to reflect that the
/// fallback path lacks the signal a logprob window provides.
pub const STRUCT_FALLBACK_DEFAULT: f64 = 0.7;

/// Structural fallback: confidence returned when one red flag is present
/// (short response OR question-heavy, but not both).
///
/// Set at 0.4 — below `NEUTRAL_CONFIDENCE` (0.5) so callers that use
/// `confidence < 0.5` as an abstention trigger fire, but not so far
/// below that it overrides the "clearly uncertain" band reserved for
/// both-flags. It sits 0.1 under `NEUTRAL_CONFIDENCE`, 0.3 under
/// `STRUCT_FALLBACK_DEFAULT` (0.7) and 0.2 above
/// `STRUCT_FALLBACK_STRONG` (0.2), so each of the three fallback bands
/// keeps a clearly separated regime label.
pub const STRUCT_FALLBACK_WEAK: f64 = 0.4;

/// Structural fallback: confidence returned when multiple red flags are
/// present (short AND question-heavy).
///
/// Set at 0.2 — 0.2 below `STRUCT_FALLBACK_WEAK` (0.4), keeping the two
/// low bands well apart. Low enough that `confidence < 0.3` thresholds
/// (used by stricter abstention callers) fire reliably, while staying
/// non-zero so downstream code doesn't treat the signal as "missing"
/// when it's really "clearly uncertain".
pub const STRUCT_FALLBACK_STRONG: f64 = 0.2;

/// Confidence returned when no signal is available (empty accumulator,
/// no response text). Matches the legacy `CognitiveSignals::confidence`
/// base so Path 1 ↔ Path 2 users see the same default.
pub const NEUTRAL_CONFIDENCE: f64 = 0.5;

// ── Accumulator ────────────────────────────────────────────────────────

/// Rolling statistics over the current turn's LLM output.
///
/// Produces a turn-level confidence estimate from per-token logprobs via
/// [`logprob_confidence`](Self::logprob_confidence). The accumulator is
/// per-turn: callers drive lifecycle via
/// [`begin_turn`](Self::begin_turn) / [`on_token`](Self::on_token) and
/// read state via [`has_logprobs`](Self::has_logprobs) +
/// [`logprob_confidence`](Self::logprob_confidence).
///
/// Structural fallback is a free function ([`structural_confidence`])
/// rather than a method — it needs the response text, which the
/// accumulator doesn't store (the wrapping [`Regulator`](super::Regulator)
/// already buffers it in `pending_response`). Use
/// [`confidence_with_fallback`] to compose both paths.
///
/// All fields are private; the only ways to obtain a value are the
/// constructors ([`new`](Self::new), [`with_window`](Self::with_window),
/// `Default`) and the derived `Deserialize` impl.
/// NOTE(review): the derived `Deserialize` fills the fields directly, so
/// the `window_size >= 1` clamp applied in `with_window` is not re-applied
/// to restored state — confirm persisted snapshots are trusted.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TokenStatsAccumulator {
    /// Rolling window of recent token logprobs, oldest at the front.
    /// Only finite, strictly negative logprobs enter the window (the
    /// `0.0` sentinel, positive values and NaN/±∞ are rejected by
    /// `on_token`). Oldest values are evicted when the window fills past
    /// `window_size`.
    logprobs: VecDeque<f64>,
    /// Count of tokens seen in the current turn, including those whose
    /// logprob was unavailable. Not capped by the window: evicted tokens
    /// still count.
    total_tokens: usize,
    /// Count of tokens in the current turn whose logprob was rejected as
    /// unavailable — `LOGPROB_UNAVAILABLE` itself, any other value
    /// `>= 0.0`, or a non-finite value. Used by
    /// [`logprob_coverage`](Self::logprob_coverage) to surface how much of
    /// the turn is logprob-backed.
    unavailable_count: usize,
    /// Maximum window size. Constructor clamps to >= 1.
    window_size: usize,
}

impl TokenStatsAccumulator {
    /// Construct an empty accumulator with [`DEFAULT_WINDOW_SIZE`].
    pub fn new() -> Self {
        Self::with_window(DEFAULT_WINDOW_SIZE)
    }

    /// Construct an empty accumulator with a custom window size.
    ///
    /// `window_size` is clamped to ≥ 1 (fail-open P5 behaviour), so
    /// `with_window(0)` behaves like `with_window(1)`.
    pub fn with_window(window_size: usize) -> Self {
        Self {
            logprobs: VecDeque::new(),
            total_tokens: 0,
            unavailable_count: 0,
            window_size: window_size.max(1),
        }
    }

    /// Mutable: reset all per-turn state. Called at each `TurnStart`
    /// boundary. Requires mutation because the accumulator is a rolling
    /// statistic that must not leak across turn boundaries.
    ///
    /// The configured window size is kept; only the window contents and
    /// the two counters are cleared.
    pub fn begin_turn(&mut self) {
        self.logprobs.clear();
        self.total_tokens = 0;
        self.unavailable_count = 0;
    }

    /// Mutable: record one token from the current turn. Requires
    /// mutation because every token updates the rolling window and
    /// coverage counters.
    ///
    /// Logprobs ≥ 0.0 (including `-0.0`, which compares equal to zero) or
    /// non-finite are treated as "unavailable" and bypass the window
    /// (fail-open per P5): usable natural-log probabilities are strictly
    /// < 0, and the [`LOGPROB_UNAVAILABLE`] sentinel is exactly `0.0`.
    /// Such tokens still count towards [`token_count`](Self::token_count).
    pub fn on_token(&mut self, logprob: f64) {
        self.total_tokens += 1;
        if logprob >= 0.0 || !logprob.is_finite() {
            self.unavailable_count += 1;
            return;
        }
        self.logprobs.push_back(logprob);

        // The constructor guarantees `window_size >= 1` and a window no
        // longer than that, but state restored through the derived
        // `Deserialize` impl never passes through the constructor. Re-apply
        // the lower bound and evict in a loop so that a restored
        // `window_size` of 0 does not discard every logprob and an
        // oversized restored window shrinks back to the configured size.
        // For constructor-built state this pops at most one element,
        // exactly as a single `if` would.
        let capacity = self.window_size.max(1);
        while self.logprobs.len() > capacity {
            self.logprobs.pop_front();
        }
    }

    /// Whether the rolling window contains any logprobs (primary
    /// confidence path is available).
    pub fn has_logprobs(&self) -> bool {
        !self.logprobs.is_empty()
    }

    /// Confidence from the rolling logprob window, in `[0, 1]`.
    ///
    /// Returns [`NEUTRAL_CONFIDENCE`] when the window is empty. Otherwise
    /// maps mean negative-log-likelihood linearly through
    /// `[MIDPOINT - HALF_RANGE, MIDPOINT + HALF_RANGE]` → `[1, 0]` and
    /// clamps to `[0, 1]`.
    ///
    /// The mean is recomputed over the window on every call, so the
    /// result always reflects exactly the tokens currently retained.
    pub fn logprob_confidence(&self) -> f64 {
        if self.logprobs.is_empty() {
            return NEUTRAL_CONFIDENCE;
        }
        // NLL is the negated logprob; the window is non-empty here, so the
        // division is well-defined.
        let mean_nll: f64 =
            self.logprobs.iter().map(|lp| -lp).sum::<f64>() / self.logprobs.len() as f64;
        // Positive offset = more confident than the midpoint.
        let offset = MEAN_NLL_CONFIDENCE_MIDPOINT - mean_nll;
        let confidence = 0.5 + 0.5 * (offset / MEAN_NLL_CONFIDENCE_HALF_RANGE);
        clamp(confidence, 0.0, 1.0)
    }

    /// Total tokens observed in the current turn (including unavailable
    /// logprobs and tokens already evicted from the window).
    pub fn token_count(&self) -> usize {
        self.total_tokens
    }

    /// Fraction of tokens in the current turn whose logprobs were
    /// available, in `[0, 1]`. 0.0 when no tokens have been observed.
    ///
    /// Callers can use this to decide between the primary and fallback
    /// paths: coverage near 1.0 means the logprob confidence is
    /// well-supported; coverage near 0.0 means the structural fallback
    /// should be preferred.
    pub fn logprob_coverage(&self) -> f64 {
        if self.total_tokens == 0 {
            return 0.0;
        }
        // `on_token` keeps `unavailable_count <= total_tokens`, but
        // deserialized counters are not validated; saturate instead of
        // underflowing so inconsistent restored state reads as zero
        // coverage rather than panicking (debug) or wrapping (release).
        let available = self.total_tokens.saturating_sub(self.unavailable_count);
        available as f64 / self.total_tokens as f64
    }
}

impl Default for TokenStatsAccumulator {
    fn default() -> Self {
        Self::new()
    }
}

// ── Structural fallback ────────────────────────────────────────────────

/// Language-neutral structural confidence heuristic.
///
/// Used when the provider does not expose per-token logprobs. Signals:
///
/// - **Short response** (under [`STRUCT_MIN_LENGTH_CHARS`] chars): red
///   flag — disproportionately indicates refusal or failed attempt.
/// - **High question-mark density** (ratio ≥
///   [`STRUCT_HIGH_QUESTION_RATIO`]): red flag — response is
///   clarification-seeking rather than answering.
///
/// Both flags → [`STRUCT_FALLBACK_STRONG`] (0.2). One flag →
/// [`STRUCT_FALLBACK_WEAK`] (0.4). Neither flag →
/// [`STRUCT_FALLBACK_DEFAULT`] (0.7). Empty string →
/// [`NEUTRAL_CONFIDENCE`] (caller has no signal to work with).
///
/// Length and density are measured in `char`s (Unicode scalar values),
/// not bytes, so multi-byte scripts are not over-counted. To keep the
/// question signal script-independent, the following code points count
/// as a question mark:
///
/// - `?` (U+003F QUESTION MARK)
/// - `？` (U+FF1F FULLWIDTH QUESTION MARK, used in CJK text)
/// - `؟` (U+061F ARABIC QUESTION MARK)
///
/// The inverted `¿` is deliberately not counted: it always accompanies a
/// closing `?`, so counting it would double the density of Spanish text.
///
/// This is the P9b-compliant fallback: no sentiment lexicon, no language
/// assumption. Works on any language the LLM produces — but is less
/// discriminating than logprob-based confidence.
pub fn structural_confidence(response_text: &str) -> f64 {
    // One pass over the text yields both the length and the number of
    // question marks.
    let (len, q_count) = response_text
        .chars()
        .fold((0usize, 0usize), |(len, questions), c| {
            let is_question = matches!(c, '?' | '\u{FF1F}' | '\u{061F}');
            (len + 1, questions + usize::from(is_question))
        });
    if len == 0 {
        return NEUTRAL_CONFIDENCE;
    }
    let q_ratio = q_count as f64 / len as f64;

    let short = len < STRUCT_MIN_LENGTH_CHARS;
    let question_heavy = q_ratio >= STRUCT_HIGH_QUESTION_RATIO;

    match (short, question_heavy) {
        (true, true) => STRUCT_FALLBACK_STRONG,
        (true, false) | (false, true) => STRUCT_FALLBACK_WEAK,
        (false, false) => STRUCT_FALLBACK_DEFAULT,
    }
}

/// Compose the primary logprob-based path with the structural fallback.
///
/// - `stats` has any logprobs → use [`logprob_confidence`](TokenStatsAccumulator::logprob_confidence).
/// - Otherwise, `response_text` is `Some(text)` → use
///   [`structural_confidence`].
/// - Otherwise → [`NEUTRAL_CONFIDENCE`].
///
/// This is the function [`Regulator::confidence`](super::Regulator::confidence)
/// delegates to; it's exposed so external callers that assemble their own
/// regulation pipelines (bypassing `Regulator`) can reuse the same
/// composition rule.
pub fn confidence_with_fallback(
    stats: &TokenStatsAccumulator,
    response_text: Option<&str>,
) -> f64 {
    if stats.has_logprobs() {
        return stats.logprob_confidence();
    }
    if let Some(text) = response_text {
        return structural_confidence(text);
    }
    NEUTRAL_CONFIDENCE
}

#[cfg(test)]
mod tests {
    use super::*;

    // ── Helpers ────────────────────────────────────────────────────────

    /// Absolute tolerance used for every floating-point comparison below.
    const TOLERANCE: f64 = 1e-9;

    /// True when `actual` is within [`TOLERANCE`] of `expected`.
    fn close(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < TOLERANCE
    }

    /// Feed `logprobs` into `stats` in order and hand the accumulator back.
    fn feed(mut stats: TokenStatsAccumulator, logprobs: &[f64]) -> TokenStatsAccumulator {
        for &logprob in logprobs {
            stats.on_token(logprob);
        }
        stats
    }

    // ── TokenStatsAccumulator ──────────────────────────────────────────

    #[test]
    fn empty_accumulator_is_neutral() {
        let stats = TokenStatsAccumulator::new();
        assert!(!stats.has_logprobs());
        assert_eq!(stats.token_count(), 0);
        assert_eq!(stats.logprob_coverage(), 0.0);
        assert!(
            close(stats.logprob_confidence(), NEUTRAL_CONFIDENCE),
            "empty window must return neutral confidence"
        );
    }

    #[test]
    fn high_logprob_tokens_raise_confidence() {
        // Twenty tokens at -0.1 nats: mean NLL 0.1 is far under the 4.0
        // midpoint, so the linear mapping saturates at 1.0.
        let stats = feed(TokenStatsAccumulator::new(), &[-0.1; 20]);
        let c = stats.logprob_confidence();
        assert!(c > 0.9, "high-probability tokens should push confidence >0.9 (got {c})");
    }

    #[test]
    fn low_logprob_tokens_lower_confidence() {
        // Twenty tokens at -7.0 nats: mean NLL 7.0 equals
        // MIDPOINT + HALF_RANGE, the point where confidence reaches 0.0.
        let stats = feed(TokenStatsAccumulator::new(), &[-7.0; 20]);
        let c = stats.logprob_confidence();
        assert!(
            c < 0.1,
            "high-NLL (gibberish-like) tokens should pull confidence <0.1 (got {c})"
        );
    }

    #[test]
    fn mid_range_logprobs_produce_mid_confidence() {
        // Mean NLL exactly at the midpoint (4.0) must give exactly 0.5.
        let stats = feed(TokenStatsAccumulator::new(), &[-4.0; 10]);
        let c = stats.logprob_confidence();
        assert!(
            close(c, 0.5),
            "mean NLL at midpoint must map to confidence 0.5 (got {c})"
        );
    }

    #[test]
    fn begin_turn_resets_accumulator() {
        let mut stats = feed(TokenStatsAccumulator::new(), &[-1.0, -2.0]);
        assert!(stats.has_logprobs());

        stats.begin_turn();
        assert!(!stats.has_logprobs());
        assert_eq!(stats.token_count(), 0);
        assert_eq!(stats.logprob_coverage(), 0.0);
    }

    #[test]
    fn rolling_window_evicts_oldest() {
        // Contract: a full window drops its oldest entries, so confidence
        // depends only on the latest `window_size` tokens. Six tokens go
        // into a window of four; with working eviction the survivors are
        // [-3, -4, -5, -6] (mean NLL 4.5 → confidence
        // 0.5 + 0.5 * (4.0 - 4.5) / 3.0 ≈ 0.4167). Were all six retained,
        // mean NLL 3.5 would yield ≈ 0.5833 — a visible difference.
        let survivors = [-3.0, -4.0, -5.0, -6.0];
        let reference = feed(TokenStatsAccumulator::with_window(4), &survivors);
        let expected = reference.logprob_confidence();

        // The first two entries are the ones that must be evicted.
        let stats = feed(
            TokenStatsAccumulator::with_window(4),
            &[-1.0, -2.0, -3.0, -4.0, -5.0, -6.0],
        );

        assert!(
            close(stats.logprob_confidence(), expected),
            "eviction must match a reference built only from surviving tokens \
             (got {}, expected {})",
            stats.logprob_confidence(),
            expected
        );
        // `token_count` reports every token seen, not just those still
        // held in the window.
        assert_eq!(stats.token_count(), 6);
    }

    #[test]
    fn unavailable_logprob_does_not_enter_window() {
        let stats = feed(TokenStatsAccumulator::new(), &[LOGPROB_UNAVAILABLE]);
        assert!(!stats.has_logprobs());
        assert_eq!(stats.token_count(), 1);
        assert_eq!(stats.logprob_coverage(), 0.0);
    }

    #[test]
    fn non_finite_or_positive_logprob_treated_as_unavailable() {
        // Fail-open: a spurious positive value, NaN and both infinities
        // must all skip the window instead of corrupting the mean.
        let rejected = [1.5, f64::NAN, f64::INFINITY, f64::NEG_INFINITY];
        let stats = feed(TokenStatsAccumulator::new(), &rejected);
        assert!(!stats.has_logprobs());
        assert_eq!(stats.token_count(), 4);
        assert_eq!(stats.logprob_coverage(), 0.0);
    }

    #[test]
    fn logprob_coverage_tracks_available_fraction() {
        // Two real logprobs interleaved with two sentinels → coverage 0.5.
        let stats = feed(
            TokenStatsAccumulator::new(),
            &[-1.0, LOGPROB_UNAVAILABLE, -2.0, LOGPROB_UNAVAILABLE],
        );
        assert!(close(stats.logprob_coverage(), 0.5));
    }

    #[test]
    fn zero_window_size_clamped_to_one() {
        // Contract: `with_window(0)` neither panics nor divides by zero;
        // it fails open to a window of one. Observable effect: after two
        // pushes only the latest logprob contributes to confidence.
        let reference = feed(TokenStatsAccumulator::with_window(1), &[-2.0]);
        let expected = reference.logprob_confidence();

        let stats = feed(TokenStatsAccumulator::with_window(0), &[-1.0, -2.0]);
        assert!(
            close(stats.logprob_confidence(), expected),
            "size-0 window should clamp to 1 and evict oldest \
             (got {}, expected {})",
            stats.logprob_confidence(),
            expected
        );
    }

    // ── structural_confidence ──────────────────────────────────────────

    #[test]
    fn structural_empty_text_is_neutral() {
        assert!(close(structural_confidence(""), NEUTRAL_CONFIDENCE));
    }

    #[test]
    fn structural_short_response_is_low() {
        // Shorter than STRUCT_MIN_LENGTH_CHARS (40) and free of questions.
        let c = structural_confidence("I don't know.");
        assert!(
            close(c, STRUCT_FALLBACK_WEAK),
            "short response should return weak fallback (got {c})"
        );
    }

    #[test]
    fn structural_question_heavy_short_is_strongest_low() {
        // Both red flags at once: short and dense with question marks.
        let c = structural_confidence("What? How? When?");
        assert!(
            close(c, STRUCT_FALLBACK_STRONG),
            "short + question-heavy should return strong-low fallback (got {c})"
        );
    }

    #[test]
    fn structural_question_heavy_long_is_weak() {
        // Passes the length check but is full of `?` — a single red flag.
        let text = "Which file did you mean? And which function inside it? \
             Also, should the refactor preserve the existing signature?";
        let c = structural_confidence(text);
        assert!(
            close(c, STRUCT_FALLBACK_WEAK),
            "question-heavy long response should return weak fallback (got {c})"
        );
    }

    #[test]
    fn structural_normal_response_is_default() {
        // Long enough and purely declarative — no red flags.
        let text = "Here is the refactored function. It preserves the original \
             signature and moves the body into an async block returning \
             a Future. No behaviour changes for synchronous callers.";
        let c = structural_confidence(text);
        assert!(
            close(c, STRUCT_FALLBACK_DEFAULT),
            "unremarkable response should return default fallback (got {c})"
        );
    }

    // ── confidence_with_fallback ───────────────────────────────────────

    #[test]
    fn fallback_prefers_logprobs_when_available() {
        // Ten very confident tokens.
        let stats = feed(TokenStatsAccumulator::new(), &[-0.5; 10]);
        // The structural text is deliberately unfavourable; the logprob
        // path must still win.
        let c = confidence_with_fallback(&stats, Some("???"));
        assert!(c > 0.8, "logprob path should override structural (got {c})");
    }

    #[test]
    fn fallback_uses_structural_when_no_logprobs() {
        let stats = TokenStatsAccumulator::new();
        let text = "Here is a clear answer with enough length to pass the minimum.";
        let c = confidence_with_fallback(&stats, Some(text));
        assert!(
            close(c, STRUCT_FALLBACK_DEFAULT),
            "empty-logprobs + clean text should use structural default (got {c})"
        );
    }

    #[test]
    fn fallback_neutral_when_no_signal() {
        let stats = TokenStatsAccumulator::new();
        let c = confidence_with_fallback(&stats, None);
        assert!(close(c, NEUTRAL_CONFIDENCE));
    }

    #[test]
    fn fallback_gibberish_path_yields_low_confidence() {
        // End-to-end at the sub-module level: a gibberish-like token
        // stream (high NLL) must drive the Regulator-facing confidence
        // into the low band. This is the Session 17 test-target scenario
        // ("gibberish text produces low confidence").
        // -6.5 nats ≈ perplexity 670 — very uncertain.
        let stats = feed(TokenStatsAccumulator::new(), &[-6.5; 30]);
        let c = confidence_with_fallback(&stats, None);
        assert!(
            c < 0.2,
            "gibberish-level mean NLL should land confidence in the low band (got {c})"
        );
    }
}