marque-core 0.2.1

Format-agnostic text scanner and attribute parser — the marque engine's front end
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
// SPDX-FileCopyrightText: 2026 Knitli Inc.
//
// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0

//! Vocabulary-aware fuzzy correction for CAPCO tokens.
//!
//! # Design
//!
//! CAPCO markings are built from a **closed vocabulary** of ~52 CVE tokens
//! (classification levels, SCI controls, dissemination controls, and a handful
//! of structural keywords). OCR and manual transcription errors produce
//! near-miss variants — `SERCET`, `NOFRON`, `CONFIDETIAL` — that no rule ever
//! fires on because the scanner never detects them as marking candidates.
//!
//! The approach here mirrors what makes [`typos`](https://github.com/crate-ci/typos)
//! so effective at eliminating false positives, adapted for the closed-world
//! property of CAPCO vocabulary:
//!
//! 1. **Closed-world validation first.** If the input token is already in the
//!    vocabulary, return `None` immediately — no correction needed, no false
//!    positive possible.
//!
//! 2. **Exhaustive near-miss search.** Because the vocabulary is tiny (~52
//!    tokens), computing Levenshtein edit distance to every known token is
//!    fast (microseconds).
//!
//! 3. **Ambiguity rejection.** If two or more vocabulary entries are equally
//!    close, the correction is ambiguous. Return `None` and let the engine
//!    surface it as a human-review item — exactly what `typos` does for words
//!    that could correct to multiple targets.
//!
//! 4. **Minimum-length guard.** Very short tokens (1-2 characters) are excluded
//!    from fuzzy matching because edit distance is semantically unreliable at
//!    that length. `C`, `S`, `U` are valid in context but look similar enough to
//!    dozens of other possibilities that any fuzzy suggestion would be noise.
//!    See [`MIN_FUZZY_LEN`] for the 2-char rationale (PR 7 SAR
//!    sub-compartment false-positives).
//!
//! 5. **Confidence scores.** Each `FuzzyCorrection` carries a base confidence
//!    derived from edit distance and token length. The calling engine multiplies
//!    this by a **context factor** (+0.10–0.15 when the token is inside a
//!    detected marking region) before comparing against the configured threshold.
//!
//! # Integration Points
//!
//! The [`FuzzyVocabMatcher`] is injected into the engine's pre-scanner step.
//! In the default configuration it operates after the AhoCorasick corrections
//! map pass — user-configured exact corrections take priority; the fuzzy matcher
//! handles residual OCR noise the exact map doesn't cover.
//!
//! **WASM-safe**: no I/O, no platform-specific code, and only small transient
//! heap allocations during edit-distance computation.

/// A correction candidate produced by [`FuzzyVocabMatcher::correct`].
///
/// Callers should multiply `confidence` by a context factor before comparing
/// against the engine threshold:
/// - Inside a detected marking region (between `//` boundaries or `(...)`):
///   multiply by `1.0 + 0.15 * context_strength` (practical range 1.10–1.15).
/// - In open prose with no structural signal: use `confidence` as-is.
#[derive(Debug, Clone, PartialEq)]
pub struct FuzzyCorrection {
    /// The suggested canonical token from the vocabulary.
    pub token: &'static str,
    /// Levenshtein edit distance between the input and `token`.
    pub distance: u8,
    /// Base confidence score in `[0.0, 1.0]`.
    ///
    /// Derived from `distance` and token length. Does **not** include a
    /// context factor — callers should apply one before thresholding.
    pub confidence: f32,
}

/// Vocabulary-aware fuzzy corrector for a closed token set.
///
/// Construct once per engine session from the vocabulary slice exposed by
/// `marque_ism::TokenSet::correction_vocab`. The vocab must be sorted and
/// deduplicated: the "is already valid" fast path uses [`slice::binary_search`],
/// and the ambiguity check assumes each candidate appears at most once. For
/// [`marque_ism::CapcoTokenSet`] the invariant is enforced at the source —
/// `ALL_CVE_TOKENS` is emitted sorted and deduplicated by
/// `marque-ism/build.rs` and verified by `token_set::tests`.
///
/// # Example
/// ```
/// use marque_core::fuzzy::FuzzyVocabMatcher;
/// use marque_ism::CapcoTokenSet;
/// use marque_ism::token_set::TokenSet as _;
///
/// let vocab = CapcoTokenSet.correction_vocab();
/// let matcher = FuzzyVocabMatcher::new(vocab);
///
/// // "SERCET" is one transpose away from "SECRET"
/// let result = matcher.correct("SERCET");
/// assert_eq!(result.map(|c| c.token), Some("SECRET"));
///
/// // Known tokens → no correction
/// assert!(matcher.correct("SECRET").is_none());
///
/// // Too ambiguous → no correction
/// // (tokens equidistant from the input → ambiguous, return None)
/// ```
pub struct FuzzyVocabMatcher<'v> {
    vocab: &'v [&'static str],
}

impl<'v> FuzzyVocabMatcher<'v> {
    /// Create a new matcher over `vocab`.
    ///
    /// `vocab` must be the sorted, deduplicated CVE token slice returned by
    /// [`TokenSet::correction_vocab`]. Construction is `O(1)` — the slice is
    /// not copied or indexed. The slice itself may live on the caller's
    /// `TokenSet` implementation (e.g., a `Vec<&'static str>` field), but each
    /// entry must be `&'static str` so that [`FuzzyCorrection::token`] — which
    /// borrows directly from the vocabulary — outlives the matcher.
    pub fn new(vocab: &'v [&'static str]) -> Self {
        Self { vocab }
    }

    /// Attempt to find a fuzzy correction for an unknown `token`.
    ///
    /// Returns `None` when:
    /// - `token` is already a known vocabulary entry (no correction needed).
    /// - `token` is too short (< `MIN_FUZZY_LEN` bytes).
    /// - No vocabulary entry is within [`MAX_EDIT_DISTANCE`] edits.
    /// - Multiple vocabulary entries tie at the closest distance (ambiguous).
    ///
    /// Returns `Some(FuzzyCorrection)` only when the correction is unambiguous.
    ///
    /// # ASCII invariant
    ///
    /// Length checks and the underlying edit-distance computation both operate
    /// on byte counts. The CAPCO vocabulary is pure ASCII (classification
    /// levels, SCI/dissem/SAR tokens, etc.), so byte count and character count
    /// coincide for every expected input. Non-ASCII input is compared byte-wise
    /// and will not produce meaningful corrections — which is the intended
    /// behavior, since no non-ASCII candidate exists in the closed vocab.
    pub fn correct(&self, token: &str) -> Option<FuzzyCorrection> {
        // Fast path: token is already valid → nothing to correct.
        if self.vocab.binary_search(&token).is_ok() {
            return None;
        }

        let token_len = token.len();

        // Very short tokens are too noisy for edit-distance correction.
        if token_len < MIN_FUZZY_LEN {
            return None;
        }

        // Scratch buffers reused across every candidate in this call. The
        // rolling two-row array needs `shorter + 1` slots, where
        // `shorter = min(token.len(), candidate.len())`. The length-diff
        // filter below guarantees `|token.len() - candidate.len()| <=
        // MAX_EDIT_DISTANCE`, so `shorter <= token_len`, and a single
        // allocation of `token_len + 1` fits every candidate. This avoids
        // the ~2 heap allocations per candidate (~100 per correction attempt)
        // that allocating inside the levenshtein helper would incur.
        let scratch_len = token_len + 1;
        let mut prev = vec![0u8; scratch_len];
        let mut curr = vec![0u8; scratch_len];

        let mut best_dist = u8::MAX;
        let mut best_token: Option<&'static str> = None;
        let mut ambiguous = false;

        for &candidate in self.vocab {
            let cand_len = candidate.len();

            // Skip candidates whose length difference alone exceeds our bound.
            // This is a fast filter before the full Levenshtein computation.
            let len_diff = token_len.abs_diff(cand_len);
            if len_diff > MAX_EDIT_DISTANCE as usize {
                continue;
            }

            let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);

            if d < best_dist {
                best_dist = d;
                best_token = Some(candidate);
                ambiguous = false;
            } else if d == best_dist {
                // Another candidate at the same distance — correction is
                // ambiguous; do not auto-suggest.
                ambiguous = true;
            }
        }

        if ambiguous || best_dist > MAX_EDIT_DISTANCE {
            return None;
        }

        let token = best_token?;
        let confidence = correction_confidence(best_dist, token_len);

        // Confidence too low to be meaningful — treat as no match.
        if confidence < MIN_USEFUL_CONFIDENCE {
            return None;
        }

        Some(FuzzyCorrection {
            token,
            distance: best_dist,
            confidence,
        })
    }

    /// Return every vocabulary entry within
    /// [`MAX_EDIT_DISTANCE`] of `token`, paired with its distance.
    ///
    /// Behaves like [`Self::correct`] but does NOT collapse ambiguous
    /// matches to `None`. The decoder uses this when the caller needs
    /// to score multiple candidates against a downstream prior — for
    /// REL TO trigraph fuzzy recovery, the corpus-weighted log-prior
    /// breaks ties that the matcher itself cannot (issue #233).
    /// Fast-paths the same as [`Self::correct`]:
    ///
    /// - `token` is already in vocab → returns an empty vec.
    /// - `token.len() < MIN_FUZZY_LEN` → returns an empty vec.
    ///
    /// Output is ordered by ascending distance, then by the
    /// vocabulary's lexicographic order (because the iteration walks
    /// the sorted vocab slice). Capped by [`MAX_EDIT_DISTANCE`] so a
    /// single call cannot run away on a tiny vocab; the priors-bake
    /// vocabulary stays well bounded in practice.
    pub fn correct_all(&self, token: &str) -> Vec<FuzzyCorrection> {
        self.correct_all_with_floor(token, MIN_USEFUL_CONFIDENCE)
    }

    /// Like [`Self::correct_all`] but with a caller-controlled
    /// confidence floor.
    ///
    /// `confidence_floor` MUST lie in `[0.0, 1.0]` — `correction_confidence`
    /// returns values in that range, so a negative floor would silently
    /// disable filtering (the comparison `confidence >= negative_floor`
    /// is always true) and a floor `> 1.0` would silently drop every
    /// match. A debug build panics on a misuse instead of producing a
    /// release binary that returns counterintuitive empty / unfiltered
    /// results.
    ///
    /// The default floor (`MIN_USEFUL_CONFIDENCE` = 0.45) excludes
    /// distance-2 corrections of 3-char inputs, which is the right
    /// safety policy for the standard fuzzy path because those
    /// corrections are too speculative without surrounding context.
    /// The decoder's REL TO trigraph expansion (issue #233) supplies
    /// surrounding context — the candidate goes through the strict
    /// REL TO parser, the resulting marking has a corpus-weighted
    /// trigraph prior, and the decoder's `UNAMBIGUOUS_LOG_MARGIN`
    /// breaks ties at score time. Lowering the floor for that
    /// specific call site is what lets a typo like `ASU → AUS`
    /// (distance 2 in plain Levenshtein) reach the scorer.
    ///
    /// Callers passing a floor of `0.0` get every match within
    /// [`MAX_EDIT_DISTANCE`].
    pub fn correct_all_with_floor(
        &self,
        token: &str,
        confidence_floor: f32,
    ) -> Vec<FuzzyCorrection> {
        debug_assert!(
            (0.0..=1.0).contains(&confidence_floor),
            "confidence_floor must be in [0.0, 1.0], got {confidence_floor}"
        );
        if self.vocab.binary_search(&token).is_ok() {
            return Vec::new();
        }

        let token_len = token.len();
        if token_len < MIN_FUZZY_LEN {
            return Vec::new();
        }

        let scratch_len = token_len + 1;
        let mut prev = vec![0u8; scratch_len];
        let mut curr = vec![0u8; scratch_len];

        let mut hits: Vec<FuzzyCorrection> = Vec::new();
        for &candidate in self.vocab {
            let cand_len = candidate.len();
            let len_diff = token_len.abs_diff(cand_len);
            if len_diff > MAX_EDIT_DISTANCE as usize {
                continue;
            }
            let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);
            if d > MAX_EDIT_DISTANCE {
                continue;
            }
            let confidence = correction_confidence(d, token_len);
            if confidence < confidence_floor {
                continue;
            }
            hits.push(FuzzyCorrection {
                token: candidate,
                distance: d,
                confidence,
            });
        }
        // Sort ascending by distance; preserve vocab order within a
        // distance band (vocab is already sorted, so the secondary
        // ordering falls out of the iteration without a re-sort).
        hits.sort_by_key(|h| h.distance);
        hits
    }
}

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/// Minimum input token length for fuzzy matching.
///
/// Tokens shorter than this are excluded. Single- and two-char tokens
/// are too noisy for context-free edit-distance correction:
///
/// - **Single-char**: `S`, `C`, `U`, `R` are valid classification CAPCO
///   abbreviations and at edit distance 1 from too many other
///   single-char tokens to produce reliable corrections.
/// - **Two-char**: tried briefly during issue #133 PR 7 to recover
///   `UK → TK`-style typos, but reverted because of false-positive
///   collisions with SAR identifier sub-compartment letters. Most
///   visibly, the canonical Enron-corpus SAR fixture
///   `SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN`
///   has `RB` as a standalone 2-char token (sub-compartment of XRA),
///   and `RB` is at edit distance 1 from `RS` (the IC dissem portion
///   form for RSEN) — so 2-char fuzzy silently converted SAR
///   sub-compartment letters into dissem controls. The fuzzy
///   matcher has no context awareness to know "we're inside a SAR
///   block, skip identifier-shaped tokens", so the safer choice is
///   to keep 2-char tokens out of fuzzy and address the
///   `UK→TK`-style cases via a context-aware structural pass in
///   the decoder.
pub const MIN_FUZZY_LEN: usize = 3;

/// Maximum Levenshtein edit distance considered for a correction.
///
/// Edit distance 2 covers:
/// - Transpositions (`SERCET` → `SECRET`, distance 2).
/// - Single-char substitutions + insertions (`CONFIDETIAL` → `CONFIDENTIAL`,
///   distance 1).
/// - Two-step corrections for heavily OCR'd text (`SECRECT` → `SECRET`,
///   distance 2).
///
/// Distance 3+ produces too many false positives against short CAPCO tokens
/// and is therefore excluded.
pub const MAX_EDIT_DISTANCE: u8 = 2;

/// Minimum confidence score below which a correction is suppressed.
///
/// This threshold prevents low-quality guesses from entering the pipeline
/// even when they satisfy the edit-distance bound — for example, a
/// distance-2 correction of a 4-character input has so many possible sources
/// that the prior is too weak to act on.
const MIN_USEFUL_CONFIDENCE: f32 = 0.45;

// ---------------------------------------------------------------------------
// Confidence scoring
// ---------------------------------------------------------------------------

/// Derive a base confidence score from edit distance and token length.
///
/// # Rationale
///
/// The prior that an edit-distance-1 deviation from a known CAPCO token is a
/// typo is much higher for long tokens than for short ones:
/// - `NOFRON` → `NOFORN` (distance 1, length 6): almost certainly a typo.
/// - `SB` → `SI` (distance 1, length 2): could be many things.
///
/// The formula is:
/// ```text
/// base = match distance {
///     1 => 0.55 + 0.05 * min(token_len, 6).saturating_sub(3)  →  [0.55, 0.70]
///     2 => 0.40 + 0.05 * min(token_len, 8).saturating_sub(5)  →  [0.40, 0.55]
///     _ => 0.0
/// }
/// ```
/// These are intentionally conservative. The engine's context factor
/// (marking-region signal) raises the effective confidence at call time.
fn correction_confidence(distance: u8, token_len: usize) -> f32 {
    match distance {
        0 => 1.0, // exact match — should not reach this path
        1 => {
            // 0.55 base, +0.05 per char over 3 (capped at 6 chars → 0.70).
            let bonus = (token_len.min(6).saturating_sub(3)) as f32 * 0.05;
            0.55 + bonus
        }
        2 => {
            // 0.40 base, +0.05 per char over 5 (capped at 8 chars → 0.55).
            let bonus = (token_len.min(8).saturating_sub(5)) as f32 * 0.05;
            0.40 + bonus
        }
        _ => 0.0,
    }
}

// ---------------------------------------------------------------------------
// Levenshtein edit distance
// ---------------------------------------------------------------------------

/// Compute the Levenshtein edit distance between `a` and `b`.
///
/// Uses a two-row rolling-array approach — O(min(|a|, |b|)) space, O(|a|·|b|)
/// time. Input strings are short CAPCO tokens (2–20 chars), so this runs in
/// microseconds.
///
/// Returns the edit distance clamped to `u8::MAX` (any distance > 255 is
/// already far beyond the useful range for CAPCO corrections).
///
/// Operates on bytes: two characters are "equal" iff they have the same byte
/// representation. The CAPCO vocabulary is pure ASCII, so this matches a
/// character-level edit distance for every expected input.
///
/// This wrapper allocates two `Vec<u8>` buffers per call. On hot paths that
/// call Levenshtein for each of many candidates (e.g.,
/// [`FuzzyVocabMatcher::correct`]), prefer [`levenshtein_with_scratch`] and
/// reuse the scratch buffers across candidates.
#[cfg(test)]
pub(crate) fn levenshtein(a: &str, b: &str) -> u8 {
    let shorter = a.len().min(b.len());
    let mut prev = vec![0u8; shorter + 1];
    let mut curr = vec![0u8; shorter + 1];
    levenshtein_with_scratch(a, b, &mut prev, &mut curr)
}

/// Same as [`levenshtein`] but reuses caller-owned scratch buffers.
///
/// `prev_buf` and `curr_buf` must each have length strictly greater than
/// `min(a.len(), b.len())`; their contents are overwritten. In debug builds
/// this is enforced by a `debug_assert!`.
pub(crate) fn levenshtein_with_scratch(
    a: &str,
    b: &str,
    prev_buf: &mut [u8],
    curr_buf: &mut [u8],
) -> u8 {
    // Ensure `a` is the shorter string (minimizes inner-loop iterations).
    let (a, b) = if a.len() <= b.len() { (a, b) } else { (b, a) };

    let a_bytes = a.as_bytes();
    let b_bytes = b.as_bytes();
    let n = a_bytes.len();
    let m = b_bytes.len();

    if n == 0 {
        return m.min(u8::MAX as usize) as u8;
    }

    debug_assert!(
        prev_buf.len() > n && curr_buf.len() > n,
        "scratch buffers must have len > min(a.len(), b.len())"
    );

    // Local mutable bindings so `mem::swap` can rotate which buffer is `prev`
    // and which is `curr` without moving bytes.
    let (mut prev, mut curr) = (prev_buf, curr_buf);

    // prev[i] holds the distance between a[0..i] and the previous b prefix
    // (before the loop: b[0..0], and at outer iteration j: b[0..j-1]).
    for (i, slot) in prev.iter_mut().enumerate().take(n + 1) {
        *slot = i.min(u8::MAX as usize) as u8;
    }

    for j in 1..=m {
        curr[0] = j.min(u8::MAX as usize) as u8;

        for i in 1..=n {
            let cost = if a_bytes[i - 1] == b_bytes[j - 1] {
                0u8
            } else {
                1u8
            };

            let del = prev[i].saturating_add(1);
            let ins = curr[i - 1].saturating_add(1);
            let sub = prev[i - 1].saturating_add(cost);
            curr[i] = del.min(ins).min(sub);
        }

        std::mem::swap(&mut prev, &mut curr);
    }

    prev[n]
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    // ----- levenshtein -----

    #[test]
    fn lev_identical_strings() {
        assert_eq!(levenshtein("SECRET", "SECRET"), 0);
    }

    #[test]
    fn lev_single_transpose() {
        // SERCET: R and C swapped — that's a substitution at position 3 and 4,
        // which is distance 2 in standard Levenshtein (no transposition op).
        // But the user's intuition is "one swap", which is distance 2.
        assert_eq!(levenshtein("SERCET", "SECRET"), 2);
    }

    #[test]
    fn lev_insertion() {
        // CONFIDETIAL is missing the second N → distance 1.
        assert_eq!(levenshtein("CONFIDETIAL", "CONFIDENTIAL"), 1);
    }

    #[test]
    fn lev_transposition() {
        // Adjacent transposition (O and R swapped) → distance 2 in standard
        // Levenshtein (two substitutions: position 3 O→R, position 4 R→O).
        assert_eq!(levenshtein("NOFORN", "NOFRON"), 2);
    }

    #[test]
    fn lev_empty_vs_nonempty() {
        assert_eq!(levenshtein("", "SECRET"), 6);
        assert_eq!(levenshtein("SECRET", ""), 6);
    }

    #[test]
    fn lev_symmetry() {
        assert_eq!(
            levenshtein("NOFORN", "NOFRON"),
            levenshtein("NOFRON", "NOFORN")
        );
    }

    // ----- FuzzyVocabMatcher -----

    /// A minimal offline vocabulary for tests — does not require a Cargo build.
    static TEST_VOCAB: &[&str] = &[
        "CONFIDENTIAL",
        "FOUO",
        "NOFORN",
        "SECRET",
        "TOP SECRET",
        "UNCLASSIFIED",
    ];

    fn matcher() -> FuzzyVocabMatcher<'static> {
        FuzzyVocabMatcher::new(TEST_VOCAB)
    }

    #[test]
    fn known_token_returns_none() {
        // "SECRET" is in vocab — no correction needed.
        assert!(matcher().correct("SECRET").is_none());
    }

    #[test]
    fn short_token_returns_none() {
        // Single-char inputs are below MIN_FUZZY_LEN.
        assert!(matcher().correct("S").is_none());
        // 2-char inputs are also below MIN_FUZZY_LEN — issue #133 PR 7
        // experimented with lowering this to 2 but reverted because
        // 2-char SAR identifier sub-compartments (most visibly `RB`
        // in the canonical Enron-corpus SAR fixture) collide with
        // `RS` (the RSEN portion form) at edit distance 1, silently
        // corrupting SAR sub-compartment text into dissem controls.
        // See `MIN_FUZZY_LEN` doc for the full PR-7 rationale.
        assert!(matcher().correct("NF").is_none());
    }

    #[test]
    fn confidetial_corrects_to_confidential() {
        // One missing character — should correct.
        let result = matcher().correct("CONFIDETIAL");
        assert_eq!(result.as_ref().map(|c| c.token), Some("CONFIDENTIAL"));
        // Distance should be 1.
        assert_eq!(result.map(|c| c.distance), Some(1));
    }

    #[test]
    fn nofron_corrects_to_noforn() {
        // Adjacent transposition (O and R swapped) → distance 2 in Levenshtein.
        let result = matcher().correct("NOFRON");
        assert_eq!(result.map(|c| c.token), Some("NOFORN"));
    }

    #[test]
    fn sercet_corrects_to_secret() {
        // Common transposition typo: R and C transposed in "SERCET" vs "SECRET".
        // Standard Levenshtein counts this as distance 2 (two substitutions).
        let result = matcher().correct("SERCET");
        assert_eq!(result.as_ref().map(|c| c.token), Some("SECRET"));
        let c = result.unwrap();
        // Distance-2, length-6 → confidence = 0.40 + 1*0.05 = 0.45.
        assert!(
            c.confidence > 0.44,
            "confidence should be non-trivial: {}",
            c.confidence
        );
    }

    #[test]
    fn confidence_increases_with_token_length() {
        // Longer tokens at distance 1 should have higher confidence.
        let short_conf = correction_confidence(1, 4); // e.g., FOUO-like
        let long_conf = correction_confidence(1, 12); // e.g., CONFIDENTIAL-like
        assert!(
            long_conf > short_conf,
            "expected long_conf {long_conf} > short_conf {short_conf}"
        );
    }

    #[test]
    fn completely_unrelated_string_returns_none() {
        // "BANANA" is distance > 2 from every test vocab entry.
        assert!(matcher().correct("BANANA").is_none());
    }

    #[test]
    fn ambiguous_corrections_return_none() {
        // Use a tiny local vocabulary where the input is tied between two
        // candidates within MAX_EDIT_DISTANCE, so None is returned because the
        // best correction is ambiguous rather than because all candidates are
        // filtered out for being too far away.
        let vocab = &["BOOK", "COOK"];
        let matcher = FuzzyVocabMatcher::new(vocab);
        assert!(matcher.correct("NOOK").is_none());
    }

    #[test]
    fn distance_2_edit_returns_correction_for_long_tokens() {
        // "UNCLASSIFEID" — two character errors in a long token.
        let result = matcher().correct("UNCLASSIFEID");
        assert_eq!(result.map(|c| c.token), Some("UNCLASSIFIED"));
    }

    #[test]
    fn correction_confidence_distance1_scales_with_length() {
        // Sanity-check the confidence formula directly.
        // Use approximate comparison to avoid f32 precision noise.
        let eps = 1e-5_f32;
        assert!((correction_confidence(1, 3) - 0.55).abs() < eps); // 0.55 + 0 bonus
        assert!((correction_confidence(1, 4) - 0.60).abs() < eps); // 0.55 + 1*0.05
        assert!((correction_confidence(1, 6) - 0.70).abs() < eps); // 0.55 + 3*0.05 (capped)
        assert!((correction_confidence(1, 12) - 0.70).abs() < eps); // capped at 6
    }

    // ----- Real-vocab regression tests (issue #133 root cause #1) -----
    //
    // These tests use `CapcoTokenSet::correction_vocab()` directly so a
    // future change that removed the banner long-form additions from
    // the extended vocab (or unintentionally narrowed it back to
    // `ALL_CVE_TOKENS`) would fail here, not silently regress the
    // SC-004 accuracy harness. The unit tests above use a minimal
    // local TEST_VOCAB and would not catch the regression.

    #[test]
    fn real_vocab_corrects_noforon_to_noforn() {
        // Issue #133 root cause #1, primary example from PR #136
        // diagnostic: `NOFORON` is one insertion away from `NOFORN`.
        // Before the long-form vocab fix this returned None because
        // `NOFORN` was not in the matcher's vocabulary at all.
        use marque_ism::CapcoTokenSet;
        use marque_ism::token_set::TokenSet as _;
        let vocab = CapcoTokenSet.correction_vocab();
        let matcher = FuzzyVocabMatcher::new(vocab);
        let result = matcher.correct("NOFORON");
        assert_eq!(result.as_ref().map(|c| c.token), Some("NOFORN"));
        assert_eq!(result.map(|c| c.distance), Some(1));
    }

    #[test]
    fn real_vocab_corrects_nofron_to_noforn() {
        // Adjacent transposition (R↔O): standard Levenshtein distance 2.
        use marque_ism::CapcoTokenSet;
        use marque_ism::token_set::TokenSet as _;
        let vocab = CapcoTokenSet.correction_vocab();
        let matcher = FuzzyVocabMatcher::new(vocab);
        let result = matcher.correct("NOFRON");
        assert_eq!(result.map(|c| c.token), Some("NOFORN"));
    }

    #[test]
    fn real_vocab_corrects_orcon_typo() {
        // Coverage for the §G.1 Table 4 ORCON / OC pair beyond NOFORN.
        use marque_ism::CapcoTokenSet;
        use marque_ism::token_set::TokenSet as _;
        let vocab = CapcoTokenSet.correction_vocab();
        let matcher = FuzzyVocabMatcher::new(vocab);
        let result = matcher.correct("ORCN");
        assert_eq!(result.as_ref().map(|c| c.token), Some("ORCON"));
    }

    #[test]
    fn real_vocab_emits_multi_word_banner_for_whitespace_free_typo() {
        // Pin the documented behavior of multi-word entries in
        // `EXTENDED_CORRECTION_VOCAB`. The fuzzy matcher CAN emit a
        // multi-word vocab entry as the correction for a
        // whitespace-free typo (here: `SBUNOFORN` → `SBU NOFORN` at
        // distance 1, single-character insertion of the space).
        // The strict parser then accepts the corrected form via
        // `parse_non_ic_full_form`, so the round-trip lands as the
        // expected `NonIcDissem::SbuNf`.
        //
        // Pinning this lets us word the doc comment on
        // `EXTENDED_CORRECTION_VOCAB` accurately — multi-word
        // entries are reachable, not "inert".
        use marque_ism::CapcoTokenSet;
        use marque_ism::token_set::TokenSet as _;
        let vocab = CapcoTokenSet.correction_vocab();
        let matcher = FuzzyVocabMatcher::new(vocab);
        let result = matcher.correct("SBUNOFORN");
        assert_eq!(result.as_ref().map(|c| c.token), Some("SBU NOFORN"));
        assert_eq!(result.map(|c| c.distance), Some(1));
    }

    #[test]
    fn correction_confidence_distance2_scales_with_length() {
        let eps = 1e-5_f32;
        assert!((correction_confidence(2, 5) - 0.40).abs() < eps); // 0.40 + 0 bonus
        assert!((correction_confidence(2, 6) - 0.45).abs() < eps); // 0.40 + 1*0.05
        assert!((correction_confidence(2, 8) - 0.55).abs() < eps); // 0.40 + 3*0.05 (capped)
        assert!((correction_confidence(2, 15) - 0.55).abs() < eps); // capped at 8
    }

    // ----- correct_all / correct_all_with_floor -----

    #[test]
    fn correct_all_returns_empty_for_known_token() {
        // Fast-path: token is already in vocab → no alternates.
        let result = matcher().correct_all("SECRET");
        assert!(
            result.is_empty(),
            "known token must return empty vec from correct_all, got {result:?}"
        );
    }

    #[test]
    fn correct_all_returns_empty_for_short_token() {
        // Tokens shorter than MIN_FUZZY_LEN → no alternates.
        assert!(matcher().correct_all("S").is_empty());
        assert!(matcher().correct_all("NF").is_empty());
    }

    #[test]
    fn correct_all_returns_multiple_for_ambiguous_input() {
        // `correct_all` differs from `correct`: it does NOT collapse tied
        // candidates to None. Both BOOK and COOK are at distance 1 from NOOK,
        // so `correct` returns None but `correct_all` returns both.
        let vocab = &["BOOK", "COOK"];
        let m = FuzzyVocabMatcher::new(vocab);
        let result = m.correct_all("NOOK");
        assert_eq!(
            result.len(),
            2,
            "correct_all must return all tied candidates; got {result:?}"
        );
        let tokens: Vec<&str> = result.iter().map(|c| c.token).collect();
        assert!(tokens.contains(&"BOOK"), "BOOK must be among alternates");
        assert!(tokens.contains(&"COOK"), "COOK must be among alternates");
    }

    #[test]
    fn correct_all_with_floor_filters_confidence() {
        // A high floor should keep only distance-1 matches; a low floor also
        // includes distance-2 matches. Use a vocab and token where both
        // distance classes exist.
        let vocab = &["NOFORN", "UNCLASSIFIED"];
        let m = FuzzyVocabMatcher::new(vocab);

        // "NOFORON" is distance 1 from "NOFORN". confidence_floor = MIN_USEFUL_CONFIDENCE
        // keeps it; a floor of 0.80 would drop it (distance-1 on a 7-char token gives
        // 0.55 + 2*0.05 = 0.65, below 0.80).
        let with_default_floor = m.correct_all("NOFORON");
        assert_eq!(
            with_default_floor.len(),
            1,
            "default floor should keep the distance-1 match; got {with_default_floor:?}"
        );
        assert_eq!(with_default_floor[0].token, "NOFORN");

        let with_high_floor = m.correct_all_with_floor("NOFORON", 0.80);
        assert!(
            with_high_floor.is_empty(),
            "floor 0.80 must exclude NOFORON→NOFORN (confidence 0.65); got {with_high_floor:?}"
        );
    }

    #[test]
    fn correct_all_with_zero_floor_includes_distance2_on_short_token() {
        // With floor=0.0, distance-2 corrections for 3-char inputs are included
        // even though correction_confidence(2, 3) = 0.40 < MIN_USEFUL_CONFIDENCE.
        // This is the REL TO trigraph expansion path from issue #233.
        let vocab = &["AUS", "AUT"];
        let m = FuzzyVocabMatcher::new(vocab);

        // "ASU" is distance 2 from "AUS" (swap + sub). confidence = 0.40 (3-char, dist-2).
        // Default floor (0.45) filters it out; zero floor includes it.
        let default_floor = m.correct_all("ASU");
        assert!(
            default_floor.is_empty(),
            "default floor must exclude distance-2 3-char correction; got {default_floor:?}"
        );

        let zero_floor = m.correct_all_with_floor("ASU", 0.0);
        assert!(
            !zero_floor.is_empty(),
            "zero floor must include distance-2 3-char corrections; got {zero_floor:?}"
        );
        let tokens: Vec<&str> = zero_floor.iter().map(|c| c.token).collect();
        assert!(
            tokens.contains(&"AUS") || tokens.contains(&"AUT"),
            "at least one of AUS/AUT must be a distance-2 candidate of ASU; got {zero_floor:?}"
        );
    }

    #[test]
    fn correct_all_result_is_sorted_by_distance_ascending() {
        // The contract says output is ordered by ascending distance.
        let vocab = &["NOFORN", "NORORN", "NONORN"];
        let m = FuzzyVocabMatcher::new(vocab);
        // "NOFORN" is distance 0 — but that's in vocab, so it's the token being corrected.
        // Use "NOFRRN": distance 1 from NOFORN, distance 2 from NORORN/NONORN.
        let result = m.correct_all("NOFRRN");
        if result.len() > 1 {
            for pair in result.windows(2) {
                assert!(
                    pair[0].distance <= pair[1].distance,
                    "correct_all result not sorted by distance: {:?}",
                    result
                );
            }
        }
    }
}