marque_core/
fuzzy.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Vocabulary-aware fuzzy correction for CAPCO tokens.
6//!
7//! # Design
8//!
9//! CAPCO markings are built from a **closed vocabulary** of ~52 CVE tokens
10//! (classification levels, SCI controls, dissemination controls, and a handful
11//! of structural keywords). OCR and manual transcription errors produce
12//! near-miss variants — `SERCET`, `NOFRON`, `CONFIDETIAL` — that no rule ever
13//! fires on because the scanner never detects them as marking candidates.
14//!
15//! The approach here mirrors what makes [`typos`](https://github.com/crate-ci/typos)
16//! so effective at eliminating false positives, adapted for the closed-world
17//! property of CAPCO vocabulary:
18//!
19//! 1. **Closed-world validation first.** If the input token is already in the
20//!    vocabulary, return `None` immediately — no correction needed, no false
21//!    positive possible.
22//!
23//! 2. **Exhaustive near-miss search.** Because the vocabulary is tiny (~52
24//!    tokens), computing Levenshtein edit distance to every known token is
25//!    fast (microseconds).
26//!
27//! 3. **Ambiguity rejection.** If two or more vocabulary entries are equally
28//!    close, the correction is ambiguous. Return `None` and let the engine
29//!    surface it as a human-review item — exactly what `typos` does for words
30//!    that could correct to multiple targets.
31//!
32//! 4. **Minimum-length guard.** Very short tokens (1-2 characters) are excluded
33//!    from fuzzy matching because edit distance is semantically unreliable at
34//!    that length. `C`, `S`, `U` are valid in context but look similar enough to
35//!    dozens of other possibilities that any fuzzy suggestion would be noise.
36//!    See [`MIN_FUZZY_LEN`] for the 2-char rationale (PR 7 SAR
37//!    sub-compartment false-positives).
38//!
39//! 5. **Confidence scores.** Each `FuzzyCorrection` carries a base confidence
40//!    derived from edit distance and token length. The calling engine multiplies
41//!    this by a **context factor** (+0.10–0.15 when the token is inside a
42//!    detected marking region) before comparing against the configured threshold.
43//!
44//! # Integration Points
45//!
46//! The [`FuzzyVocabMatcher`] is injected into the engine's pre-scanner step.
47//! In the default configuration it operates after the AhoCorasick corrections
48//! map pass — user-configured exact corrections take priority; the fuzzy matcher
49//! handles residual OCR noise the exact map doesn't cover.
50//!
51//! **WASM-safe**: no I/O, no platform-specific code, and only small transient
52//! heap allocations during edit-distance computation.
53
54/// A correction candidate produced by [`FuzzyVocabMatcher::correct`].
55///
56/// Callers should multiply `confidence` by a context factor before comparing
57/// against the engine threshold:
58/// - Inside a detected marking region (between `//` boundaries or `(...)`):
59///   multiply by `1.0 + 0.15 * context_strength` (practical range 1.10–1.15).
60/// - In open prose with no structural signal: use `confidence` as-is.
61#[derive(Debug, Clone, PartialEq)]
62pub struct FuzzyCorrection {
63    /// The suggested canonical token from the vocabulary.
64    pub token: &'static str,
65    /// Levenshtein edit distance between the input and `token`.
66    pub distance: u8,
67    /// Base confidence score in `[0.0, 1.0]`.
68    ///
69    /// Derived from `distance` and token length. Does **not** include a
70    /// context factor — callers should apply one before thresholding.
71    pub confidence: f32,
72}
73
74/// Vocabulary-aware fuzzy corrector for a closed token set.
75///
76/// Construct once per engine session from the vocabulary slice exposed by
77/// `marque_ism::TokenSet::correction_vocab`. The vocab must be sorted and
78/// deduplicated: the "is already valid" fast path uses [`slice::binary_search`],
79/// and the ambiguity check assumes each candidate appears at most once. For
80/// [`marque_ism::CapcoTokenSet`] the invariant is enforced at the source —
81/// `ALL_CVE_TOKENS` is emitted sorted and deduplicated by
82/// `marque-ism/build.rs` and verified by `token_set::tests`.
83///
84/// # Example
85/// ```
86/// use marque_core::fuzzy::FuzzyVocabMatcher;
87/// use marque_ism::CapcoTokenSet;
88/// use marque_ism::token_set::TokenSet as _;
89///
90/// let vocab = CapcoTokenSet.correction_vocab();
91/// let matcher = FuzzyVocabMatcher::new(vocab);
92///
93/// // "SERCET" is one transpose away from "SECRET"
94/// let result = matcher.correct("SERCET");
95/// assert_eq!(result.map(|c| c.token), Some("SECRET"));
96///
97/// // Known tokens → no correction
98/// assert!(matcher.correct("SECRET").is_none());
99///
100/// // Too ambiguous → no correction
101/// // (tokens equidistant from the input → ambiguous, return None)
102/// ```
103pub struct FuzzyVocabMatcher<'v> {
104    vocab: &'v [&'static str],
105}
106
107impl<'v> FuzzyVocabMatcher<'v> {
108    /// Create a new matcher over `vocab`.
109    ///
110    /// `vocab` must be the sorted, deduplicated CVE token slice returned by
111    /// [`TokenSet::correction_vocab`]. Construction is `O(1)` — the slice is
112    /// not copied or indexed. The slice itself may live on the caller's
113    /// `TokenSet` implementation (e.g., a `Vec<&'static str>` field), but each
114    /// entry must be `&'static str` so that [`FuzzyCorrection::token`] — which
115    /// borrows directly from the vocabulary — outlives the matcher.
116    pub fn new(vocab: &'v [&'static str]) -> Self {
117        Self { vocab }
118    }
119
120    /// Attempt to find a fuzzy correction for an unknown `token`.
121    ///
122    /// Returns `None` when:
123    /// - `token` is already a known vocabulary entry (no correction needed).
124    /// - `token` is too short (< `MIN_FUZZY_LEN` bytes).
125    /// - No vocabulary entry is within [`MAX_EDIT_DISTANCE`] edits.
126    /// - Multiple vocabulary entries tie at the closest distance (ambiguous).
127    ///
128    /// Returns `Some(FuzzyCorrection)` only when the correction is unambiguous.
129    ///
130    /// # ASCII invariant
131    ///
132    /// Length checks and the underlying edit-distance computation both operate
133    /// on byte counts. The CAPCO vocabulary is pure ASCII (classification
134    /// levels, SCI/dissem/SAR tokens, etc.), so byte count and character count
135    /// coincide for every expected input. Non-ASCII input is compared byte-wise
136    /// and will not produce meaningful corrections — which is the intended
137    /// behavior, since no non-ASCII candidate exists in the closed vocab.
138    pub fn correct(&self, token: &str) -> Option<FuzzyCorrection> {
139        // Fast path: token is already valid → nothing to correct.
140        if self.vocab.binary_search(&token).is_ok() {
141            return None;
142        }
143
144        let token_len = token.len();
145
146        // Very short tokens are too noisy for edit-distance correction.
147        if token_len < MIN_FUZZY_LEN {
148            return None;
149        }
150
151        // Scratch buffers reused across every candidate in this call. The
152        // rolling two-row array needs `shorter + 1` slots, where
153        // `shorter = min(token.len(), candidate.len())`. The length-diff
154        // filter below guarantees `|token.len() - candidate.len()| <=
155        // MAX_EDIT_DISTANCE`, so `shorter <= token_len`, and a single
156        // allocation of `token_len + 1` fits every candidate. This avoids
157        // the ~2 heap allocations per candidate (~100 per correction attempt)
158        // that allocating inside the levenshtein helper would incur.
159        let scratch_len = token_len + 1;
160        let mut prev = vec![0u8; scratch_len];
161        let mut curr = vec![0u8; scratch_len];
162
163        let mut best_dist = u8::MAX;
164        let mut best_token: Option<&'static str> = None;
165        let mut ambiguous = false;
166
167        for &candidate in self.vocab {
168            let cand_len = candidate.len();
169
170            // Skip candidates whose length difference alone exceeds our bound.
171            // This is a fast filter before the full Levenshtein computation.
172            let len_diff = token_len.abs_diff(cand_len);
173            if len_diff > MAX_EDIT_DISTANCE as usize {
174                continue;
175            }
176
177            let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);
178
179            if d < best_dist {
180                best_dist = d;
181                best_token = Some(candidate);
182                ambiguous = false;
183            } else if d == best_dist {
184                // Another candidate at the same distance — correction is
185                // ambiguous; do not auto-suggest.
186                ambiguous = true;
187            }
188        }
189
190        if ambiguous || best_dist > MAX_EDIT_DISTANCE {
191            return None;
192        }
193
194        let token = best_token?;
195        let confidence = correction_confidence(best_dist, token_len);
196
197        // Confidence too low to be meaningful — treat as no match.
198        if confidence < MIN_USEFUL_CONFIDENCE {
199            return None;
200        }
201
202        Some(FuzzyCorrection {
203            token,
204            distance: best_dist,
205            confidence,
206        })
207    }
208
209    /// Return every vocabulary entry within
210    /// [`MAX_EDIT_DISTANCE`] of `token`, paired with its distance.
211    ///
212    /// Behaves like [`Self::correct`] but does NOT collapse ambiguous
213    /// matches to `None`. The decoder uses this when the caller needs
214    /// to score multiple candidates against a downstream prior — for
215    /// REL TO trigraph fuzzy recovery, the corpus-weighted log-prior
216    /// breaks ties that the matcher itself cannot (issue #233).
217    /// Fast-paths the same as [`Self::correct`]:
218    ///
219    /// - `token` is already in vocab → returns an empty vec.
220    /// - `token.len() < MIN_FUZZY_LEN` → returns an empty vec.
221    ///
222    /// Output is ordered by ascending distance, then by the
223    /// vocabulary's lexicographic order (because the iteration walks
224    /// the sorted vocab slice). Capped by [`MAX_EDIT_DISTANCE`] so a
225    /// single call cannot run away on a tiny vocab; the priors-bake
226    /// vocabulary stays well bounded in practice.
227    pub fn correct_all(&self, token: &str) -> Vec<FuzzyCorrection> {
228        self.correct_all_with_floor(token, MIN_USEFUL_CONFIDENCE)
229    }
230
231    /// Like [`Self::correct_all`] but with a caller-controlled
232    /// confidence floor.
233    ///
234    /// `confidence_floor` MUST lie in `[0.0, 1.0]` — `correction_confidence`
235    /// returns values in that range, so a negative floor would silently
236    /// disable filtering (the comparison `confidence >= negative_floor`
237    /// is always true) and a floor `> 1.0` would silently drop every
238    /// match. A debug build panics on a misuse instead of producing a
239    /// release binary that returns counterintuitive empty / unfiltered
240    /// results.
241    ///
242    /// The default floor (`MIN_USEFUL_CONFIDENCE` = 0.45) excludes
243    /// distance-2 corrections of 3-char inputs, which is the right
244    /// safety policy for the standard fuzzy path because those
245    /// corrections are too speculative without surrounding context.
246    /// The decoder's REL TO trigraph expansion (issue #233) supplies
247    /// surrounding context — the candidate goes through the strict
248    /// REL TO parser, the resulting marking has a corpus-weighted
249    /// trigraph prior, and the decoder's `UNAMBIGUOUS_LOG_MARGIN`
250    /// breaks ties at score time. Lowering the floor for that
251    /// specific call site is what lets a typo like `ASU → AUS`
252    /// (distance 2 in plain Levenshtein) reach the scorer.
253    ///
254    /// Callers passing a floor of `0.0` get every match within
255    /// [`MAX_EDIT_DISTANCE`].
256    pub fn correct_all_with_floor(
257        &self,
258        token: &str,
259        confidence_floor: f32,
260    ) -> Vec<FuzzyCorrection> {
261        debug_assert!(
262            (0.0..=1.0).contains(&confidence_floor),
263            "confidence_floor must be in [0.0, 1.0], got {confidence_floor}"
264        );
265        if self.vocab.binary_search(&token).is_ok() {
266            return Vec::new();
267        }
268
269        let token_len = token.len();
270        if token_len < MIN_FUZZY_LEN {
271            return Vec::new();
272        }
273
274        let scratch_len = token_len + 1;
275        let mut prev = vec![0u8; scratch_len];
276        let mut curr = vec![0u8; scratch_len];
277
278        let mut hits: Vec<FuzzyCorrection> = Vec::new();
279        for &candidate in self.vocab {
280            let cand_len = candidate.len();
281            let len_diff = token_len.abs_diff(cand_len);
282            if len_diff > MAX_EDIT_DISTANCE as usize {
283                continue;
284            }
285            let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);
286            if d > MAX_EDIT_DISTANCE {
287                continue;
288            }
289            let confidence = correction_confidence(d, token_len);
290            if confidence < confidence_floor {
291                continue;
292            }
293            hits.push(FuzzyCorrection {
294                token: candidate,
295                distance: d,
296                confidence,
297            });
298        }
299        // Sort ascending by distance; preserve vocab order within a
300        // distance band (vocab is already sorted, so the secondary
301        // ordering falls out of the iteration without a re-sort).
302        hits.sort_by_key(|h| h.distance);
303        hits
304    }
305}
306
307// ---------------------------------------------------------------------------
308// Constants
309// ---------------------------------------------------------------------------
310
311/// Minimum input token length for fuzzy matching.
312///
313/// Tokens shorter than this are excluded. Single- and two-char tokens
314/// are too noisy for context-free edit-distance correction:
315///
316/// - **Single-char**: `S`, `C`, `U`, `R` are valid classification CAPCO
317///   abbreviations and at edit distance 1 from too many other
318///   single-char tokens to produce reliable corrections.
319/// - **Two-char**: tried briefly during issue #133 PR 7 to recover
320///   `UK → TK`-style typos, but reverted because of false-positive
321///   collisions with SAR identifier sub-compartment letters. Most
322///   visibly, the canonical Enron-corpus SAR fixture
323///   `SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN`
324///   has `RB` as a standalone 2-char token (sub-compartment of XRA),
325///   and `RB` is at edit distance 1 from `RS` (the IC dissem portion
326///   form for RSEN) — so 2-char fuzzy silently converted SAR
327///   sub-compartment letters into dissem controls. The fuzzy
328///   matcher has no context awareness to know "we're inside a SAR
329///   block, skip identifier-shaped tokens", so the safer choice is
330///   to keep 2-char tokens out of fuzzy and address the
331///   `UK→TK`-style cases via a context-aware structural pass in
332///   the decoder.
333pub const MIN_FUZZY_LEN: usize = 3;
334
335/// Maximum Levenshtein edit distance considered for a correction.
336///
337/// Edit distance 2 covers:
338/// - Transpositions (`SERCET` → `SECRET`, distance 2).
339/// - Single-char substitutions + insertions (`CONFIDETIAL` → `CONFIDENTIAL`,
340///   distance 1).
341/// - Two-step corrections for heavily OCR'd text (`SECRECT` → `SECRET`,
342///   distance 2).
343///
344/// Distance 3+ produces too many false positives against short CAPCO tokens
345/// and is therefore excluded.
346pub const MAX_EDIT_DISTANCE: u8 = 2;
347
348/// Minimum confidence score below which a correction is suppressed.
349///
350/// This threshold prevents low-quality guesses from entering the pipeline
351/// even when they satisfy the edit-distance bound — for example, a
352/// distance-2 correction of a 4-character input has so many possible sources
353/// that the prior is too weak to act on.
354const MIN_USEFUL_CONFIDENCE: f32 = 0.45;
355
356// ---------------------------------------------------------------------------
357// Confidence scoring
358// ---------------------------------------------------------------------------
359
360/// Derive a base confidence score from edit distance and token length.
361///
362/// # Rationale
363///
364/// The prior that an edit-distance-1 deviation from a known CAPCO token is a
365/// typo is much higher for long tokens than for short ones:
366/// - `NOFRON` → `NOFORN` (distance 1, length 6): almost certainly a typo.
367/// - `SB` → `SI` (distance 1, length 2): could be many things.
368///
369/// The formula is:
370/// ```text
371/// base = match distance {
372///     1 => 0.55 + 0.05 * min(token_len, 6).saturating_sub(3)  →  [0.55, 0.70]
373///     2 => 0.40 + 0.05 * min(token_len, 8).saturating_sub(5)  →  [0.40, 0.55]
374///     _ => 0.0
375/// }
376/// ```
377/// These are intentionally conservative. The engine's context factor
378/// (marking-region signal) raises the effective confidence at call time.
379fn correction_confidence(distance: u8, token_len: usize) -> f32 {
380    match distance {
381        0 => 1.0, // exact match — should not reach this path
382        1 => {
383            // 0.55 base, +0.05 per char over 3 (capped at 6 chars → 0.70).
384            let bonus = (token_len.min(6).saturating_sub(3)) as f32 * 0.05;
385            0.55 + bonus
386        }
387        2 => {
388            // 0.40 base, +0.05 per char over 5 (capped at 8 chars → 0.55).
389            let bonus = (token_len.min(8).saturating_sub(5)) as f32 * 0.05;
390            0.40 + bonus
391        }
392        _ => 0.0,
393    }
394}
395
396// ---------------------------------------------------------------------------
397// Levenshtein edit distance
398// ---------------------------------------------------------------------------
399
400/// Compute the Levenshtein edit distance between `a` and `b`.
401///
402/// Uses a two-row rolling-array approach — O(min(|a|, |b|)) space, O(|a|·|b|)
403/// time. Input strings are short CAPCO tokens (2–20 chars), so this runs in
404/// microseconds.
405///
406/// Returns the edit distance clamped to `u8::MAX` (any distance > 255 is
407/// already far beyond the useful range for CAPCO corrections).
408///
409/// Operates on bytes: two characters are "equal" iff they have the same byte
410/// representation. The CAPCO vocabulary is pure ASCII, so this matches a
411/// character-level edit distance for every expected input.
412///
413/// This wrapper allocates two `Vec<u8>` buffers per call. On hot paths that
414/// call Levenshtein for each of many candidates (e.g.,
415/// [`FuzzyVocabMatcher::correct`]), prefer [`levenshtein_with_scratch`] and
416/// reuse the scratch buffers across candidates.
417#[cfg(test)]
418pub(crate) fn levenshtein(a: &str, b: &str) -> u8 {
419    let shorter = a.len().min(b.len());
420    let mut prev = vec![0u8; shorter + 1];
421    let mut curr = vec![0u8; shorter + 1];
422    levenshtein_with_scratch(a, b, &mut prev, &mut curr)
423}
424
425/// Same as [`levenshtein`] but reuses caller-owned scratch buffers.
426///
427/// `prev_buf` and `curr_buf` must each have length strictly greater than
428/// `min(a.len(), b.len())`; their contents are overwritten. In debug builds
429/// this is enforced by a `debug_assert!`.
430pub(crate) fn levenshtein_with_scratch(
431    a: &str,
432    b: &str,
433    prev_buf: &mut [u8],
434    curr_buf: &mut [u8],
435) -> u8 {
436    // Ensure `a` is the shorter string (minimizes inner-loop iterations).
437    let (a, b) = if a.len() <= b.len() { (a, b) } else { (b, a) };
438
439    let a_bytes = a.as_bytes();
440    let b_bytes = b.as_bytes();
441    let n = a_bytes.len();
442    let m = b_bytes.len();
443
444    if n == 0 {
445        return m.min(u8::MAX as usize) as u8;
446    }
447
448    debug_assert!(
449        prev_buf.len() > n && curr_buf.len() > n,
450        "scratch buffers must have len > min(a.len(), b.len())"
451    );
452
453    // Local mutable bindings so `mem::swap` can rotate which buffer is `prev`
454    // and which is `curr` without moving bytes.
455    let (mut prev, mut curr) = (prev_buf, curr_buf);
456
457    // prev[i] holds the distance between a[0..i] and the previous b prefix
458    // (before the loop: b[0..0], and at outer iteration j: b[0..j-1]).
459    for (i, slot) in prev.iter_mut().enumerate().take(n + 1) {
460        *slot = i.min(u8::MAX as usize) as u8;
461    }
462
463    for j in 1..=m {
464        curr[0] = j.min(u8::MAX as usize) as u8;
465
466        for i in 1..=n {
467            let cost = if a_bytes[i - 1] == b_bytes[j - 1] {
468                0u8
469            } else {
470                1u8
471            };
472
473            let del = prev[i].saturating_add(1);
474            let ins = curr[i - 1].saturating_add(1);
475            let sub = prev[i - 1].saturating_add(cost);
476            curr[i] = del.min(ins).min(sub);
477        }
478
479        std::mem::swap(&mut prev, &mut curr);
480    }
481
482    prev[n]
483}
484
485// ---------------------------------------------------------------------------
486// Tests
487// ---------------------------------------------------------------------------
488
489#[cfg(test)]
490#[cfg_attr(coverage_nightly, coverage(off))]
491mod tests {
492    use super::*;
493
494    // ----- levenshtein -----
495
496    #[test]
497    fn lev_identical_strings() {
498        assert_eq!(levenshtein("SECRET", "SECRET"), 0);
499    }
500
501    #[test]
502    fn lev_single_transpose() {
503        // SERCET: R and C swapped — that's a substitution at position 3 and 4,
504        // which is distance 2 in standard Levenshtein (no transposition op).
505        // But the user's intuition is "one swap", which is distance 2.
506        assert_eq!(levenshtein("SERCET", "SECRET"), 2);
507    }
508
509    #[test]
510    fn lev_insertion() {
511        // CONFIDETIAL is missing the second N → distance 1.
512        assert_eq!(levenshtein("CONFIDETIAL", "CONFIDENTIAL"), 1);
513    }
514
515    #[test]
516    fn lev_transposition() {
517        // Adjacent transposition (O and R swapped) → distance 2 in standard
518        // Levenshtein (two substitutions: position 3 O→R, position 4 R→O).
519        assert_eq!(levenshtein("NOFORN", "NOFRON"), 2);
520    }
521
522    #[test]
523    fn lev_empty_vs_nonempty() {
524        assert_eq!(levenshtein("", "SECRET"), 6);
525        assert_eq!(levenshtein("SECRET", ""), 6);
526    }
527
528    #[test]
529    fn lev_symmetry() {
530        assert_eq!(
531            levenshtein("NOFORN", "NOFRON"),
532            levenshtein("NOFRON", "NOFORN")
533        );
534    }
535
536    // ----- FuzzyVocabMatcher -----
537
538    /// A minimal offline vocabulary for tests — does not require a Cargo build.
539    static TEST_VOCAB: &[&str] = &[
540        "CONFIDENTIAL",
541        "FOUO",
542        "NOFORN",
543        "SECRET",
544        "TOP SECRET",
545        "UNCLASSIFIED",
546    ];
547
548    fn matcher() -> FuzzyVocabMatcher<'static> {
549        FuzzyVocabMatcher::new(TEST_VOCAB)
550    }
551
552    #[test]
553    fn known_token_returns_none() {
554        // "SECRET" is in vocab — no correction needed.
555        assert!(matcher().correct("SECRET").is_none());
556    }
557
558    #[test]
559    fn short_token_returns_none() {
560        // Single-char inputs are below MIN_FUZZY_LEN.
561        assert!(matcher().correct("S").is_none());
562        // 2-char inputs are also below MIN_FUZZY_LEN — issue #133 PR 7
563        // experimented with lowering this to 2 but reverted because
564        // 2-char SAR identifier sub-compartments (most visibly `RB`
565        // in the canonical Enron-corpus SAR fixture) collide with
566        // `RS` (the RSEN portion form) at edit distance 1, silently
567        // corrupting SAR sub-compartment text into dissem controls.
568        // See `MIN_FUZZY_LEN` doc for the full PR-7 rationale.
569        assert!(matcher().correct("NF").is_none());
570    }
571
572    #[test]
573    fn confidetial_corrects_to_confidential() {
574        // One missing character — should correct.
575        let result = matcher().correct("CONFIDETIAL");
576        assert_eq!(result.as_ref().map(|c| c.token), Some("CONFIDENTIAL"));
577        // Distance should be 1.
578        assert_eq!(result.map(|c| c.distance), Some(1));
579    }
580
581    #[test]
582    fn nofron_corrects_to_noforn() {
583        // Adjacent transposition (O and R swapped) → distance 2 in Levenshtein.
584        let result = matcher().correct("NOFRON");
585        assert_eq!(result.map(|c| c.token), Some("NOFORN"));
586    }
587
588    #[test]
589    fn sercet_corrects_to_secret() {
590        // Common transposition typo: R and C transposed in "SERCET" vs "SECRET".
591        // Standard Levenshtein counts this as distance 2 (two substitutions).
592        let result = matcher().correct("SERCET");
593        assert_eq!(result.as_ref().map(|c| c.token), Some("SECRET"));
594        let c = result.unwrap();
595        // Distance-2, length-6 → confidence = 0.40 + 1*0.05 = 0.45.
596        assert!(
597            c.confidence > 0.44,
598            "confidence should be non-trivial: {}",
599            c.confidence
600        );
601    }
602
603    #[test]
604    fn confidence_increases_with_token_length() {
605        // Longer tokens at distance 1 should have higher confidence.
606        let short_conf = correction_confidence(1, 4); // e.g., FOUO-like
607        let long_conf = correction_confidence(1, 12); // e.g., CONFIDENTIAL-like
608        assert!(
609            long_conf > short_conf,
610            "expected long_conf {long_conf} > short_conf {short_conf}"
611        );
612    }
613
614    #[test]
615    fn completely_unrelated_string_returns_none() {
616        // "BANANA" is distance > 2 from every test vocab entry.
617        assert!(matcher().correct("BANANA").is_none());
618    }
619
620    #[test]
621    fn ambiguous_corrections_return_none() {
622        // Use a tiny local vocabulary where the input is tied between two
623        // candidates within MAX_EDIT_DISTANCE, so None is returned because the
624        // best correction is ambiguous rather than because all candidates are
625        // filtered out for being too far away.
626        let vocab = &["BOOK", "COOK"];
627        let matcher = FuzzyVocabMatcher::new(vocab);
628        assert!(matcher.correct("NOOK").is_none());
629    }
630
631    #[test]
632    fn distance_2_edit_returns_correction_for_long_tokens() {
633        // "UNCLASSIFEID" — two character errors in a long token.
634        let result = matcher().correct("UNCLASSIFEID");
635        assert_eq!(result.map(|c| c.token), Some("UNCLASSIFIED"));
636    }
637
638    #[test]
639    fn correction_confidence_distance1_scales_with_length() {
640        // Sanity-check the confidence formula directly.
641        // Use approximate comparison to avoid f32 precision noise.
642        let eps = 1e-5_f32;
643        assert!((correction_confidence(1, 3) - 0.55).abs() < eps); // 0.55 + 0 bonus
644        assert!((correction_confidence(1, 4) - 0.60).abs() < eps); // 0.55 + 1*0.05
645        assert!((correction_confidence(1, 6) - 0.70).abs() < eps); // 0.55 + 3*0.05 (capped)
646        assert!((correction_confidence(1, 12) - 0.70).abs() < eps); // capped at 6
647    }
648
649    // ----- Real-vocab regression tests (issue #133 root cause #1) -----
650    //
651    // These tests use `CapcoTokenSet::correction_vocab()` directly so a
652    // future change that removed the banner long-form additions from
653    // the extended vocab (or unintentionally narrowed it back to
654    // `ALL_CVE_TOKENS`) would fail here, not silently regress the
655    // SC-004 accuracy harness. The unit tests above use a minimal
656    // local TEST_VOCAB and would not catch the regression.
657
658    #[test]
659    fn real_vocab_corrects_noforon_to_noforn() {
660        // Issue #133 root cause #1, primary example from PR #136
661        // diagnostic: `NOFORON` is one insertion away from `NOFORN`.
662        // Before the long-form vocab fix this returned None because
663        // `NOFORN` was not in the matcher's vocabulary at all.
664        use marque_ism::CapcoTokenSet;
665        use marque_ism::token_set::TokenSet as _;
666        let vocab = CapcoTokenSet.correction_vocab();
667        let matcher = FuzzyVocabMatcher::new(vocab);
668        let result = matcher.correct("NOFORON");
669        assert_eq!(result.as_ref().map(|c| c.token), Some("NOFORN"));
670        assert_eq!(result.map(|c| c.distance), Some(1));
671    }
672
673    #[test]
674    fn real_vocab_corrects_nofron_to_noforn() {
675        // Adjacent transposition (R↔O): standard Levenshtein distance 2.
676        use marque_ism::CapcoTokenSet;
677        use marque_ism::token_set::TokenSet as _;
678        let vocab = CapcoTokenSet.correction_vocab();
679        let matcher = FuzzyVocabMatcher::new(vocab);
680        let result = matcher.correct("NOFRON");
681        assert_eq!(result.map(|c| c.token), Some("NOFORN"));
682    }
683
684    #[test]
685    fn real_vocab_corrects_orcon_typo() {
686        // Coverage for the §G.1 Table 4 ORCON / OC pair beyond NOFORN.
687        use marque_ism::CapcoTokenSet;
688        use marque_ism::token_set::TokenSet as _;
689        let vocab = CapcoTokenSet.correction_vocab();
690        let matcher = FuzzyVocabMatcher::new(vocab);
691        let result = matcher.correct("ORCN");
692        assert_eq!(result.as_ref().map(|c| c.token), Some("ORCON"));
693    }
694
695    #[test]
696    fn real_vocab_emits_multi_word_banner_for_whitespace_free_typo() {
697        // Pin the documented behavior of multi-word entries in
698        // `EXTENDED_CORRECTION_VOCAB`. The fuzzy matcher CAN emit a
699        // multi-word vocab entry as the correction for a
700        // whitespace-free typo (here: `SBUNOFORN` → `SBU NOFORN` at
701        // distance 1, single-character insertion of the space).
702        // The strict parser then accepts the corrected form via
703        // `parse_non_ic_full_form`, so the round-trip lands as the
704        // expected `NonIcDissem::SbuNf`.
705        //
706        // Pinning this lets us word the doc comment on
707        // `EXTENDED_CORRECTION_VOCAB` accurately — multi-word
708        // entries are reachable, not "inert".
709        use marque_ism::CapcoTokenSet;
710        use marque_ism::token_set::TokenSet as _;
711        let vocab = CapcoTokenSet.correction_vocab();
712        let matcher = FuzzyVocabMatcher::new(vocab);
713        let result = matcher.correct("SBUNOFORN");
714        assert_eq!(result.as_ref().map(|c| c.token), Some("SBU NOFORN"));
715        assert_eq!(result.map(|c| c.distance), Some(1));
716    }
717
718    #[test]
719    fn correction_confidence_distance2_scales_with_length() {
720        let eps = 1e-5_f32;
721        assert!((correction_confidence(2, 5) - 0.40).abs() < eps); // 0.40 + 0 bonus
722        assert!((correction_confidence(2, 6) - 0.45).abs() < eps); // 0.40 + 1*0.05
723        assert!((correction_confidence(2, 8) - 0.55).abs() < eps); // 0.40 + 3*0.05 (capped)
724        assert!((correction_confidence(2, 15) - 0.55).abs() < eps); // capped at 8
725    }
726
727    // ----- correct_all / correct_all_with_floor -----
728
729    #[test]
730    fn correct_all_returns_empty_for_known_token() {
731        // Fast-path: token is already in vocab → no alternates.
732        let result = matcher().correct_all("SECRET");
733        assert!(
734            result.is_empty(),
735            "known token must return empty vec from correct_all, got {result:?}"
736        );
737    }
738
739    #[test]
740    fn correct_all_returns_empty_for_short_token() {
741        // Tokens shorter than MIN_FUZZY_LEN → no alternates.
742        assert!(matcher().correct_all("S").is_empty());
743        assert!(matcher().correct_all("NF").is_empty());
744    }
745
746    #[test]
747    fn correct_all_returns_multiple_for_ambiguous_input() {
748        // `correct_all` differs from `correct`: it does NOT collapse tied
749        // candidates to None. Both BOOK and COOK are at distance 1 from NOOK,
750        // so `correct` returns None but `correct_all` returns both.
751        let vocab = &["BOOK", "COOK"];
752        let m = FuzzyVocabMatcher::new(vocab);
753        let result = m.correct_all("NOOK");
754        assert_eq!(
755            result.len(),
756            2,
757            "correct_all must return all tied candidates; got {result:?}"
758        );
759        let tokens: Vec<&str> = result.iter().map(|c| c.token).collect();
760        assert!(tokens.contains(&"BOOK"), "BOOK must be among alternates");
761        assert!(tokens.contains(&"COOK"), "COOK must be among alternates");
762    }
763
764    #[test]
765    fn correct_all_with_floor_filters_confidence() {
766        // A high floor should keep only distance-1 matches; a low floor also
767        // includes distance-2 matches. Use a vocab and token where both
768        // distance classes exist.
769        let vocab = &["NOFORN", "UNCLASSIFIED"];
770        let m = FuzzyVocabMatcher::new(vocab);
771
772        // "NOFORON" is distance 1 from "NOFORN". confidence_floor = MIN_USEFUL_CONFIDENCE
773        // keeps it; a floor of 0.80 would drop it (distance-1 on a 7-char token gives
774        // 0.55 + 2*0.05 = 0.65, below 0.80).
775        let with_default_floor = m.correct_all("NOFORON");
776        assert_eq!(
777            with_default_floor.len(),
778            1,
779            "default floor should keep the distance-1 match; got {with_default_floor:?}"
780        );
781        assert_eq!(with_default_floor[0].token, "NOFORN");
782
783        let with_high_floor = m.correct_all_with_floor("NOFORON", 0.80);
784        assert!(
785            with_high_floor.is_empty(),
786            "floor 0.80 must exclude NOFORON→NOFORN (confidence 0.65); got {with_high_floor:?}"
787        );
788    }
789
790    #[test]
791    fn correct_all_with_zero_floor_includes_distance2_on_short_token() {
792        // With floor=0.0, distance-2 corrections for 3-char inputs are included
793        // even though correction_confidence(2, 3) = 0.40 < MIN_USEFUL_CONFIDENCE.
794        // This is the REL TO trigraph expansion path from issue #233.
795        let vocab = &["AUS", "AUT"];
796        let m = FuzzyVocabMatcher::new(vocab);
797
798        // "ASU" is distance 2 from "AUS" (swap + sub). confidence = 0.40 (3-char, dist-2).
799        // Default floor (0.45) filters it out; zero floor includes it.
800        let default_floor = m.correct_all("ASU");
801        assert!(
802            default_floor.is_empty(),
803            "default floor must exclude distance-2 3-char correction; got {default_floor:?}"
804        );
805
806        let zero_floor = m.correct_all_with_floor("ASU", 0.0);
807        assert!(
808            !zero_floor.is_empty(),
809            "zero floor must include distance-2 3-char corrections; got {zero_floor:?}"
810        );
811        let tokens: Vec<&str> = zero_floor.iter().map(|c| c.token).collect();
812        assert!(
813            tokens.contains(&"AUS") || tokens.contains(&"AUT"),
814            "at least one of AUS/AUT must be a distance-2 candidate of ASU; got {zero_floor:?}"
815        );
816    }
817
818    #[test]
819    fn correct_all_result_is_sorted_by_distance_ascending() {
820        // The contract says output is ordered by ascending distance.
821        let vocab = &["NOFORN", "NORORN", "NONORN"];
822        let m = FuzzyVocabMatcher::new(vocab);
823        // "NOFORN" is distance 0 — but that's in vocab, so it's the token being corrected.
824        // Use "NOFRRN": distance 1 from NOFORN, distance 2 from NORORN/NONORN.
825        let result = m.correct_all("NOFRRN");
826        if result.len() > 1 {
827            for pair in result.windows(2) {
828                assert!(
829                    pair[0].distance <= pair[1].distance,
830                    "correct_all result not sorted by distance: {:?}",
831                    result
832                );
833            }
834        }
835    }
836}
marque_core/fuzzy.rs

marque_core/
fuzzy.rs