marque_core/fuzzy.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Vocabulary-aware fuzzy correction for CAPCO tokens.
6//!
7//! # Design
8//!
9//! CAPCO markings are built from a **closed vocabulary** of ~52 CVE tokens
10//! (classification levels, SCI controls, dissemination controls, and a handful
11//! of structural keywords). OCR and manual transcription errors produce
12//! near-miss variants — `SERCET`, `NOFRON`, `CONFIDETIAL` — that no rule ever
13//! fires on because the scanner never detects them as marking candidates.
14//!
15//! The approach here mirrors what makes [`typos`](https://github.com/crate-ci/typos)
16//! so effective at eliminating false positives, adapted for the closed-world
17//! property of CAPCO vocabulary:
18//!
19//! 1. **Closed-world validation first.** If the input token is already in the
20//! vocabulary, return `None` immediately — no correction needed, no false
21//! positive possible.
22//!
23//! 2. **Exhaustive near-miss search.** Because the vocabulary is tiny (~52
24//! tokens), computing Levenshtein edit distance to every known token is
25//! fast (microseconds).
26//!
27//! 3. **Ambiguity rejection.** If two or more vocabulary entries are equally
28//! close, the correction is ambiguous. Return `None` and let the engine
29//! surface it as a human-review item — exactly what `typos` does for words
30//! that could correct to multiple targets.
31//!
32//! 4. **Minimum-length guard.** Very short tokens (1-2 characters) are excluded
33//! from fuzzy matching because edit distance is semantically unreliable at
34//! that length. `C`, `S`, `U` are valid in context but look similar enough to
35//! dozens of other possibilities that any fuzzy suggestion would be noise.
36//! See [`MIN_FUZZY_LEN`] for the 2-char rationale (PR 7 SAR
37//! sub-compartment false-positives).
38//!
39//! 5. **Confidence scores.** Each `FuzzyCorrection` carries a base confidence
40//! derived from edit distance and token length. The calling engine multiplies
41//! this by a **context factor** (+0.10–0.15 when the token is inside a
42//! detected marking region) before comparing against the configured threshold.
43//!
44//! # Integration Points
45//!
46//! The [`FuzzyVocabMatcher`] is injected into the engine's pre-scanner step.
47//! In the default configuration it operates after the AhoCorasick corrections
48//! map pass — user-configured exact corrections take priority; the fuzzy matcher
49//! handles residual OCR noise the exact map doesn't cover.
50//!
51//! **WASM-safe**: no I/O, no platform-specific code, and only small transient
52//! heap allocations during edit-distance computation.
53
54/// A correction candidate produced by [`FuzzyVocabMatcher::correct`].
55///
56/// Callers should multiply `confidence` by a context factor before comparing
57/// against the engine threshold:
58/// - Inside a detected marking region (between `//` boundaries or `(...)`):
59/// multiply by `1.0 + 0.15 * context_strength` (practical range 1.10–1.15).
60/// - In open prose with no structural signal: use `confidence` as-is.
61#[derive(Debug, Clone, PartialEq)]
62pub struct FuzzyCorrection {
63 /// The suggested canonical token from the vocabulary.
64 pub token: &'static str,
65 /// Levenshtein edit distance between the input and `token`.
66 pub distance: u8,
67 /// Base confidence score in `[0.0, 1.0]`.
68 ///
69 /// Derived from `distance` and token length. Does **not** include a
70 /// context factor — callers should apply one before thresholding.
71 pub confidence: f32,
72}
73
74/// Vocabulary-aware fuzzy corrector for a closed token set.
75///
76/// Construct once per engine session from the vocabulary slice exposed by
77/// `marque_ism::TokenSet::correction_vocab`. The vocab must be sorted and
78/// deduplicated: the "is already valid" fast path uses [`slice::binary_search`],
79/// and the ambiguity check assumes each candidate appears at most once. For
80/// [`marque_ism::CapcoTokenSet`] the invariant is enforced at the source —
81/// `ALL_CVE_TOKENS` is emitted sorted and deduplicated by
82/// `marque-ism/build.rs` and verified by `token_set::tests`.
83///
84/// # Example
85/// ```
86/// use marque_core::fuzzy::FuzzyVocabMatcher;
87/// use marque_ism::CapcoTokenSet;
88/// use marque_ism::token_set::TokenSet as _;
89///
90/// let vocab = CapcoTokenSet.correction_vocab();
91/// let matcher = FuzzyVocabMatcher::new(vocab);
92///
93/// // "SERCET" is one transpose away from "SECRET"
94/// let result = matcher.correct("SERCET");
95/// assert_eq!(result.map(|c| c.token), Some("SECRET"));
96///
97/// // Known tokens → no correction
98/// assert!(matcher.correct("SECRET").is_none());
99///
100/// // Too ambiguous → no correction
101/// // (tokens equidistant from the input → ambiguous, return None)
102/// ```
103pub struct FuzzyVocabMatcher<'v> {
104 vocab: &'v [&'static str],
105}
106
107impl<'v> FuzzyVocabMatcher<'v> {
108 /// Create a new matcher over `vocab`.
109 ///
110 /// `vocab` must be the sorted, deduplicated CVE token slice returned by
111 /// [`TokenSet::correction_vocab`]. Construction is `O(1)` — the slice is
112 /// not copied or indexed. The slice itself may live on the caller's
113 /// `TokenSet` implementation (e.g., a `Vec<&'static str>` field), but each
114 /// entry must be `&'static str` so that [`FuzzyCorrection::token`] — which
115 /// borrows directly from the vocabulary — outlives the matcher.
116 pub fn new(vocab: &'v [&'static str]) -> Self {
117 Self { vocab }
118 }
119
120 /// Attempt to find a fuzzy correction for an unknown `token`.
121 ///
122 /// Returns `None` when:
123 /// - `token` is already a known vocabulary entry (no correction needed).
124 /// - `token` is too short (< `MIN_FUZZY_LEN` bytes).
125 /// - No vocabulary entry is within [`MAX_EDIT_DISTANCE`] edits.
126 /// - Multiple vocabulary entries tie at the closest distance (ambiguous).
127 ///
128 /// Returns `Some(FuzzyCorrection)` only when the correction is unambiguous.
129 ///
130 /// # ASCII invariant
131 ///
132 /// Length checks and the underlying edit-distance computation both operate
133 /// on byte counts. The CAPCO vocabulary is pure ASCII (classification
134 /// levels, SCI/dissem/SAR tokens, etc.), so byte count and character count
135 /// coincide for every expected input. Non-ASCII input is compared byte-wise
136 /// and will not produce meaningful corrections — which is the intended
137 /// behavior, since no non-ASCII candidate exists in the closed vocab.
138 pub fn correct(&self, token: &str) -> Option<FuzzyCorrection> {
139 // Fast path: token is already valid → nothing to correct.
140 if self.vocab.binary_search(&token).is_ok() {
141 return None;
142 }
143
144 let token_len = token.len();
145
146 // Very short tokens are too noisy for edit-distance correction.
147 if token_len < MIN_FUZZY_LEN {
148 return None;
149 }
150
151 // Scratch buffers reused across every candidate in this call. The
152 // rolling two-row array needs `shorter + 1` slots, where
153 // `shorter = min(token.len(), candidate.len())`. The length-diff
154 // filter below guarantees `|token.len() - candidate.len()| <=
155 // MAX_EDIT_DISTANCE`, so `shorter <= token_len`, and a single
156 // allocation of `token_len + 1` fits every candidate. This avoids
157 // the ~2 heap allocations per candidate (~100 per correction attempt)
158 // that allocating inside the levenshtein helper would incur.
159 let scratch_len = token_len + 1;
160 let mut prev = vec![0u8; scratch_len];
161 let mut curr = vec![0u8; scratch_len];
162
163 let mut best_dist = u8::MAX;
164 let mut best_token: Option<&'static str> = None;
165 let mut ambiguous = false;
166
167 for &candidate in self.vocab {
168 let cand_len = candidate.len();
169
170 // Skip candidates whose length difference alone exceeds our bound.
171 // This is a fast filter before the full Levenshtein computation.
172 let len_diff = token_len.abs_diff(cand_len);
173 if len_diff > MAX_EDIT_DISTANCE as usize {
174 continue;
175 }
176
177 let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);
178
179 if d < best_dist {
180 best_dist = d;
181 best_token = Some(candidate);
182 ambiguous = false;
183 } else if d == best_dist {
184 // Another candidate at the same distance — correction is
185 // ambiguous; do not auto-suggest.
186 ambiguous = true;
187 }
188 }
189
190 if ambiguous || best_dist > MAX_EDIT_DISTANCE {
191 return None;
192 }
193
194 let token = best_token?;
195 let confidence = correction_confidence(best_dist, token_len);
196
197 // Confidence too low to be meaningful — treat as no match.
198 if confidence < MIN_USEFUL_CONFIDENCE {
199 return None;
200 }
201
202 Some(FuzzyCorrection {
203 token,
204 distance: best_dist,
205 confidence,
206 })
207 }
208
209 /// Return every vocabulary entry within
210 /// [`MAX_EDIT_DISTANCE`] of `token`, paired with its distance.
211 ///
212 /// Behaves like [`Self::correct`] but does NOT collapse ambiguous
213 /// matches to `None`. The decoder uses this when the caller needs
214 /// to score multiple candidates against a downstream prior — for
215 /// REL TO trigraph fuzzy recovery, the corpus-weighted log-prior
216 /// breaks ties that the matcher itself cannot (issue #233).
217 /// Fast-paths the same as [`Self::correct`]:
218 ///
219 /// - `token` is already in vocab → returns an empty vec.
220 /// - `token.len() < MIN_FUZZY_LEN` → returns an empty vec.
221 ///
222 /// Output is ordered by ascending distance, then by the
223 /// vocabulary's lexicographic order (because the iteration walks
224 /// the sorted vocab slice). Capped by [`MAX_EDIT_DISTANCE`] so a
225 /// single call cannot run away on a tiny vocab; the priors-bake
226 /// vocabulary stays well bounded in practice.
227 pub fn correct_all(&self, token: &str) -> Vec<FuzzyCorrection> {
228 self.correct_all_with_floor(token, MIN_USEFUL_CONFIDENCE)
229 }
230
231 /// Like [`Self::correct_all`] but with a caller-controlled
232 /// confidence floor.
233 ///
234 /// `confidence_floor` MUST lie in `[0.0, 1.0]` — `correction_confidence`
235 /// returns values in that range, so a negative floor would silently
236 /// disable filtering (the comparison `confidence >= negative_floor`
237 /// is always true) and a floor `> 1.0` would silently drop every
238 /// match. A debug build panics on a misuse instead of producing a
239 /// release binary that returns counterintuitive empty / unfiltered
240 /// results.
241 ///
242 /// The default floor (`MIN_USEFUL_CONFIDENCE` = 0.45) excludes
243 /// distance-2 corrections of 3-char inputs, which is the right
244 /// safety policy for the standard fuzzy path because those
245 /// corrections are too speculative without surrounding context.
246 /// The decoder's REL TO trigraph expansion (issue #233) supplies
247 /// surrounding context — the candidate goes through the strict
248 /// REL TO parser, the resulting marking has a corpus-weighted
249 /// trigraph prior, and the decoder's `UNAMBIGUOUS_LOG_MARGIN`
250 /// breaks ties at score time. Lowering the floor for that
251 /// specific call site is what lets a typo like `ASU → AUS`
252 /// (distance 2 in plain Levenshtein) reach the scorer.
253 ///
254 /// Callers passing a floor of `0.0` get every match within
255 /// [`MAX_EDIT_DISTANCE`].
256 pub fn correct_all_with_floor(
257 &self,
258 token: &str,
259 confidence_floor: f32,
260 ) -> Vec<FuzzyCorrection> {
261 debug_assert!(
262 (0.0..=1.0).contains(&confidence_floor),
263 "confidence_floor must be in [0.0, 1.0], got {confidence_floor}"
264 );
265 if self.vocab.binary_search(&token).is_ok() {
266 return Vec::new();
267 }
268
269 let token_len = token.len();
270 if token_len < MIN_FUZZY_LEN {
271 return Vec::new();
272 }
273
274 let scratch_len = token_len + 1;
275 let mut prev = vec![0u8; scratch_len];
276 let mut curr = vec![0u8; scratch_len];
277
278 let mut hits: Vec<FuzzyCorrection> = Vec::new();
279 for &candidate in self.vocab {
280 let cand_len = candidate.len();
281 let len_diff = token_len.abs_diff(cand_len);
282 if len_diff > MAX_EDIT_DISTANCE as usize {
283 continue;
284 }
285 let d = levenshtein_with_scratch(token, candidate, &mut prev, &mut curr);
286 if d > MAX_EDIT_DISTANCE {
287 continue;
288 }
289 let confidence = correction_confidence(d, token_len);
290 if confidence < confidence_floor {
291 continue;
292 }
293 hits.push(FuzzyCorrection {
294 token: candidate,
295 distance: d,
296 confidence,
297 });
298 }
299 // Sort ascending by distance; preserve vocab order within a
300 // distance band (vocab is already sorted, so the secondary
301 // ordering falls out of the iteration without a re-sort).
302 hits.sort_by_key(|h| h.distance);
303 hits
304 }
305}
306
307// ---------------------------------------------------------------------------
308// Constants
309// ---------------------------------------------------------------------------
310
311/// Minimum input token length for fuzzy matching.
312///
313/// Tokens shorter than this are excluded. Single- and two-char tokens
314/// are too noisy for context-free edit-distance correction:
315///
316/// - **Single-char**: `S`, `C`, `U`, `R` are valid classification CAPCO
317/// abbreviations and at edit distance 1 from too many other
318/// single-char tokens to produce reliable corrections.
319/// - **Two-char**: tried briefly during issue #133 PR 7 to recover
320/// `UK → TK`-style typos, but reverted because of false-positive
321/// collisions with SAR identifier sub-compartment letters. Most
322/// visibly, the canonical Enron-corpus SAR fixture
323/// `SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN`
324/// has `RB` as a standalone 2-char token (sub-compartment of XRA),
325/// and `RB` is at edit distance 1 from `RS` (the IC dissem portion
326/// form for RSEN) — so 2-char fuzzy silently converted SAR
327/// sub-compartment letters into dissem controls. The fuzzy
328/// matcher has no context awareness to know "we're inside a SAR
329/// block, skip identifier-shaped tokens", so the safer choice is
330/// to keep 2-char tokens out of fuzzy and address the
331/// `UK→TK`-style cases via a context-aware structural pass in
332/// the decoder.
333pub const MIN_FUZZY_LEN: usize = 3;
334
335/// Maximum Levenshtein edit distance considered for a correction.
336///
337/// Edit distance 2 covers:
338/// - Transpositions (`SERCET` → `SECRET`, distance 2).
339/// - Single-char substitutions + insertions (`CONFIDETIAL` → `CONFIDENTIAL`,
340/// distance 1).
341/// - Two-step corrections for heavily OCR'd text (`SECRECT` → `SECRET`,
342/// distance 2).
343///
344/// Distance 3+ produces too many false positives against short CAPCO tokens
345/// and is therefore excluded.
346pub const MAX_EDIT_DISTANCE: u8 = 2;
347
348/// Minimum confidence score below which a correction is suppressed.
349///
350/// This threshold prevents low-quality guesses from entering the pipeline
351/// even when they satisfy the edit-distance bound — for example, a
352/// distance-2 correction of a 4-character input has so many possible sources
353/// that the prior is too weak to act on.
354const MIN_USEFUL_CONFIDENCE: f32 = 0.45;
355
356// ---------------------------------------------------------------------------
357// Confidence scoring
358// ---------------------------------------------------------------------------
359
360/// Derive a base confidence score from edit distance and token length.
361///
362/// # Rationale
363///
364/// The prior that an edit-distance-1 deviation from a known CAPCO token is a
365/// typo is much higher for long tokens than for short ones:
366/// - `NOFRON` → `NOFORN` (distance 1, length 6): almost certainly a typo.
367/// - `SB` → `SI` (distance 1, length 2): could be many things.
368///
369/// The formula is:
370/// ```text
371/// base = match distance {
372/// 1 => 0.55 + 0.05 * min(token_len, 6).saturating_sub(3) → [0.55, 0.70]
373/// 2 => 0.40 + 0.05 * min(token_len, 8).saturating_sub(5) → [0.40, 0.55]
374/// _ => 0.0
375/// }
376/// ```
377/// These are intentionally conservative. The engine's context factor
378/// (marking-region signal) raises the effective confidence at call time.
379fn correction_confidence(distance: u8, token_len: usize) -> f32 {
380 match distance {
381 0 => 1.0, // exact match — should not reach this path
382 1 => {
383 // 0.55 base, +0.05 per char over 3 (capped at 6 chars → 0.70).
384 let bonus = (token_len.min(6).saturating_sub(3)) as f32 * 0.05;
385 0.55 + bonus
386 }
387 2 => {
388 // 0.40 base, +0.05 per char over 5 (capped at 8 chars → 0.55).
389 let bonus = (token_len.min(8).saturating_sub(5)) as f32 * 0.05;
390 0.40 + bonus
391 }
392 _ => 0.0,
393 }
394}
395
396// ---------------------------------------------------------------------------
397// Levenshtein edit distance
398// ---------------------------------------------------------------------------
399
400/// Compute the Levenshtein edit distance between `a` and `b`.
401///
402/// Uses a two-row rolling-array approach — O(min(|a|, |b|)) space, O(|a|·|b|)
403/// time. Input strings are short CAPCO tokens (2–20 chars), so this runs in
404/// microseconds.
405///
406/// Returns the edit distance clamped to `u8::MAX` (any distance > 255 is
407/// already far beyond the useful range for CAPCO corrections).
408///
409/// Operates on bytes: two characters are "equal" iff they have the same byte
410/// representation. The CAPCO vocabulary is pure ASCII, so this matches a
411/// character-level edit distance for every expected input.
412///
413/// This wrapper allocates two `Vec<u8>` buffers per call. On hot paths that
414/// call Levenshtein for each of many candidates (e.g.,
415/// [`FuzzyVocabMatcher::correct`]), prefer [`levenshtein_with_scratch`] and
416/// reuse the scratch buffers across candidates.
417#[cfg(test)]
418pub(crate) fn levenshtein(a: &str, b: &str) -> u8 {
419 let shorter = a.len().min(b.len());
420 let mut prev = vec![0u8; shorter + 1];
421 let mut curr = vec![0u8; shorter + 1];
422 levenshtein_with_scratch(a, b, &mut prev, &mut curr)
423}
424
425/// Same as [`levenshtein`] but reuses caller-owned scratch buffers.
426///
427/// `prev_buf` and `curr_buf` must each have length strictly greater than
428/// `min(a.len(), b.len())`; their contents are overwritten. In debug builds
429/// this is enforced by a `debug_assert!`.
430pub(crate) fn levenshtein_with_scratch(
431 a: &str,
432 b: &str,
433 prev_buf: &mut [u8],
434 curr_buf: &mut [u8],
435) -> u8 {
436 // Ensure `a` is the shorter string (minimizes inner-loop iterations).
437 let (a, b) = if a.len() <= b.len() { (a, b) } else { (b, a) };
438
439 let a_bytes = a.as_bytes();
440 let b_bytes = b.as_bytes();
441 let n = a_bytes.len();
442 let m = b_bytes.len();
443
444 if n == 0 {
445 return m.min(u8::MAX as usize) as u8;
446 }
447
448 debug_assert!(
449 prev_buf.len() > n && curr_buf.len() > n,
450 "scratch buffers must have len > min(a.len(), b.len())"
451 );
452
453 // Local mutable bindings so `mem::swap` can rotate which buffer is `prev`
454 // and which is `curr` without moving bytes.
455 let (mut prev, mut curr) = (prev_buf, curr_buf);
456
457 // prev[i] holds the distance between a[0..i] and the previous b prefix
458 // (before the loop: b[0..0], and at outer iteration j: b[0..j-1]).
459 for (i, slot) in prev.iter_mut().enumerate().take(n + 1) {
460 *slot = i.min(u8::MAX as usize) as u8;
461 }
462
463 for j in 1..=m {
464 curr[0] = j.min(u8::MAX as usize) as u8;
465
466 for i in 1..=n {
467 let cost = if a_bytes[i - 1] == b_bytes[j - 1] {
468 0u8
469 } else {
470 1u8
471 };
472
473 let del = prev[i].saturating_add(1);
474 let ins = curr[i - 1].saturating_add(1);
475 let sub = prev[i - 1].saturating_add(cost);
476 curr[i] = del.min(ins).min(sub);
477 }
478
479 std::mem::swap(&mut prev, &mut curr);
480 }
481
482 prev[n]
483}
484
485// ---------------------------------------------------------------------------
486// Tests
487// ---------------------------------------------------------------------------
488
489#[cfg(test)]
490#[cfg_attr(coverage_nightly, coverage(off))]
491mod tests {
492 use super::*;
493
494 // ----- levenshtein -----
495
496 #[test]
497 fn lev_identical_strings() {
498 assert_eq!(levenshtein("SECRET", "SECRET"), 0);
499 }
500
501 #[test]
502 fn lev_single_transpose() {
503 // SERCET: R and C swapped — that's a substitution at position 3 and 4,
504 // which is distance 2 in standard Levenshtein (no transposition op).
505 // But the user's intuition is "one swap", which is distance 2.
506 assert_eq!(levenshtein("SERCET", "SECRET"), 2);
507 }
508
509 #[test]
510 fn lev_insertion() {
511 // CONFIDETIAL is missing the second N → distance 1.
512 assert_eq!(levenshtein("CONFIDETIAL", "CONFIDENTIAL"), 1);
513 }
514
515 #[test]
516 fn lev_transposition() {
517 // Adjacent transposition (O and R swapped) → distance 2 in standard
518 // Levenshtein (two substitutions: position 3 O→R, position 4 R→O).
519 assert_eq!(levenshtein("NOFORN", "NOFRON"), 2);
520 }
521
522 #[test]
523 fn lev_empty_vs_nonempty() {
524 assert_eq!(levenshtein("", "SECRET"), 6);
525 assert_eq!(levenshtein("SECRET", ""), 6);
526 }
527
528 #[test]
529 fn lev_symmetry() {
530 assert_eq!(
531 levenshtein("NOFORN", "NOFRON"),
532 levenshtein("NOFRON", "NOFORN")
533 );
534 }
535
536 // ----- FuzzyVocabMatcher -----
537
538 /// A minimal offline vocabulary for tests — does not require a Cargo build.
539 static TEST_VOCAB: &[&str] = &[
540 "CONFIDENTIAL",
541 "FOUO",
542 "NOFORN",
543 "SECRET",
544 "TOP SECRET",
545 "UNCLASSIFIED",
546 ];
547
548 fn matcher() -> FuzzyVocabMatcher<'static> {
549 FuzzyVocabMatcher::new(TEST_VOCAB)
550 }
551
552 #[test]
553 fn known_token_returns_none() {
554 // "SECRET" is in vocab — no correction needed.
555 assert!(matcher().correct("SECRET").is_none());
556 }
557
558 #[test]
559 fn short_token_returns_none() {
560 // Single-char inputs are below MIN_FUZZY_LEN.
561 assert!(matcher().correct("S").is_none());
562 // 2-char inputs are also below MIN_FUZZY_LEN — issue #133 PR 7
563 // experimented with lowering this to 2 but reverted because
564 // 2-char SAR identifier sub-compartments (most visibly `RB`
565 // in the canonical Enron-corpus SAR fixture) collide with
566 // `RS` (the RSEN portion form) at edit distance 1, silently
567 // corrupting SAR sub-compartment text into dissem controls.
568 // See `MIN_FUZZY_LEN` doc for the full PR-7 rationale.
569 assert!(matcher().correct("NF").is_none());
570 }
571
572 #[test]
573 fn confidetial_corrects_to_confidential() {
574 // One missing character — should correct.
575 let result = matcher().correct("CONFIDETIAL");
576 assert_eq!(result.as_ref().map(|c| c.token), Some("CONFIDENTIAL"));
577 // Distance should be 1.
578 assert_eq!(result.map(|c| c.distance), Some(1));
579 }
580
581 #[test]
582 fn nofron_corrects_to_noforn() {
583 // Adjacent transposition (O and R swapped) → distance 2 in Levenshtein.
584 let result = matcher().correct("NOFRON");
585 assert_eq!(result.map(|c| c.token), Some("NOFORN"));
586 }
587
588 #[test]
589 fn sercet_corrects_to_secret() {
590 // Common transposition typo: R and C transposed in "SERCET" vs "SECRET".
591 // Standard Levenshtein counts this as distance 2 (two substitutions).
592 let result = matcher().correct("SERCET");
593 assert_eq!(result.as_ref().map(|c| c.token), Some("SECRET"));
594 let c = result.unwrap();
595 // Distance-2, length-6 → confidence = 0.40 + 1*0.05 = 0.45.
596 assert!(
597 c.confidence > 0.44,
598 "confidence should be non-trivial: {}",
599 c.confidence
600 );
601 }
602
603 #[test]
604 fn confidence_increases_with_token_length() {
605 // Longer tokens at distance 1 should have higher confidence.
606 let short_conf = correction_confidence(1, 4); // e.g., FOUO-like
607 let long_conf = correction_confidence(1, 12); // e.g., CONFIDENTIAL-like
608 assert!(
609 long_conf > short_conf,
610 "expected long_conf {long_conf} > short_conf {short_conf}"
611 );
612 }
613
614 #[test]
615 fn completely_unrelated_string_returns_none() {
616 // "BANANA" is distance > 2 from every test vocab entry.
617 assert!(matcher().correct("BANANA").is_none());
618 }
619
620 #[test]
621 fn ambiguous_corrections_return_none() {
622 // Use a tiny local vocabulary where the input is tied between two
623 // candidates within MAX_EDIT_DISTANCE, so None is returned because the
624 // best correction is ambiguous rather than because all candidates are
625 // filtered out for being too far away.
626 let vocab = &["BOOK", "COOK"];
627 let matcher = FuzzyVocabMatcher::new(vocab);
628 assert!(matcher.correct("NOOK").is_none());
629 }
630
631 #[test]
632 fn distance_2_edit_returns_correction_for_long_tokens() {
633 // "UNCLASSIFEID" — two character errors in a long token.
634 let result = matcher().correct("UNCLASSIFEID");
635 assert_eq!(result.map(|c| c.token), Some("UNCLASSIFIED"));
636 }
637
638 #[test]
639 fn correction_confidence_distance1_scales_with_length() {
640 // Sanity-check the confidence formula directly.
641 // Use approximate comparison to avoid f32 precision noise.
642 let eps = 1e-5_f32;
643 assert!((correction_confidence(1, 3) - 0.55).abs() < eps); // 0.55 + 0 bonus
644 assert!((correction_confidence(1, 4) - 0.60).abs() < eps); // 0.55 + 1*0.05
645 assert!((correction_confidence(1, 6) - 0.70).abs() < eps); // 0.55 + 3*0.05 (capped)
646 assert!((correction_confidence(1, 12) - 0.70).abs() < eps); // capped at 6
647 }
648
649 // ----- Real-vocab regression tests (issue #133 root cause #1) -----
650 //
651 // These tests use `CapcoTokenSet::correction_vocab()` directly so a
652 // future change that removed the banner long-form additions from
653 // the extended vocab (or unintentionally narrowed it back to
654 // `ALL_CVE_TOKENS`) would fail here, not silently regress the
655 // SC-004 accuracy harness. The unit tests above use a minimal
656 // local TEST_VOCAB and would not catch the regression.
657
658 #[test]
659 fn real_vocab_corrects_noforon_to_noforn() {
660 // Issue #133 root cause #1, primary example from PR #136
661 // diagnostic: `NOFORON` is one insertion away from `NOFORN`.
662 // Before the long-form vocab fix this returned None because
663 // `NOFORN` was not in the matcher's vocabulary at all.
664 use marque_ism::CapcoTokenSet;
665 use marque_ism::token_set::TokenSet as _;
666 let vocab = CapcoTokenSet.correction_vocab();
667 let matcher = FuzzyVocabMatcher::new(vocab);
668 let result = matcher.correct("NOFORON");
669 assert_eq!(result.as_ref().map(|c| c.token), Some("NOFORN"));
670 assert_eq!(result.map(|c| c.distance), Some(1));
671 }
672
673 #[test]
674 fn real_vocab_corrects_nofron_to_noforn() {
675 // Adjacent transposition (R↔O): standard Levenshtein distance 2.
676 use marque_ism::CapcoTokenSet;
677 use marque_ism::token_set::TokenSet as _;
678 let vocab = CapcoTokenSet.correction_vocab();
679 let matcher = FuzzyVocabMatcher::new(vocab);
680 let result = matcher.correct("NOFRON");
681 assert_eq!(result.map(|c| c.token), Some("NOFORN"));
682 }
683
684 #[test]
685 fn real_vocab_corrects_orcon_typo() {
686 // Coverage for the §G.1 Table 4 ORCON / OC pair beyond NOFORN.
687 use marque_ism::CapcoTokenSet;
688 use marque_ism::token_set::TokenSet as _;
689 let vocab = CapcoTokenSet.correction_vocab();
690 let matcher = FuzzyVocabMatcher::new(vocab);
691 let result = matcher.correct("ORCN");
692 assert_eq!(result.as_ref().map(|c| c.token), Some("ORCON"));
693 }
694
695 #[test]
696 fn real_vocab_emits_multi_word_banner_for_whitespace_free_typo() {
697 // Pin the documented behavior of multi-word entries in
698 // `EXTENDED_CORRECTION_VOCAB`. The fuzzy matcher CAN emit a
699 // multi-word vocab entry as the correction for a
700 // whitespace-free typo (here: `SBUNOFORN` → `SBU NOFORN` at
701 // distance 1, single-character insertion of the space).
702 // The strict parser then accepts the corrected form via
703 // `parse_non_ic_full_form`, so the round-trip lands as the
704 // expected `NonIcDissem::SbuNf`.
705 //
706 // Pinning this lets us word the doc comment on
707 // `EXTENDED_CORRECTION_VOCAB` accurately — multi-word
708 // entries are reachable, not "inert".
709 use marque_ism::CapcoTokenSet;
710 use marque_ism::token_set::TokenSet as _;
711 let vocab = CapcoTokenSet.correction_vocab();
712 let matcher = FuzzyVocabMatcher::new(vocab);
713 let result = matcher.correct("SBUNOFORN");
714 assert_eq!(result.as_ref().map(|c| c.token), Some("SBU NOFORN"));
715 assert_eq!(result.map(|c| c.distance), Some(1));
716 }
717
718 #[test]
719 fn correction_confidence_distance2_scales_with_length() {
720 let eps = 1e-5_f32;
721 assert!((correction_confidence(2, 5) - 0.40).abs() < eps); // 0.40 + 0 bonus
722 assert!((correction_confidence(2, 6) - 0.45).abs() < eps); // 0.40 + 1*0.05
723 assert!((correction_confidence(2, 8) - 0.55).abs() < eps); // 0.40 + 3*0.05 (capped)
724 assert!((correction_confidence(2, 15) - 0.55).abs() < eps); // capped at 8
725 }
726
727 // ----- correct_all / correct_all_with_floor -----
728
729 #[test]
730 fn correct_all_returns_empty_for_known_token() {
731 // Fast-path: token is already in vocab → no alternates.
732 let result = matcher().correct_all("SECRET");
733 assert!(
734 result.is_empty(),
735 "known token must return empty vec from correct_all, got {result:?}"
736 );
737 }
738
739 #[test]
740 fn correct_all_returns_empty_for_short_token() {
741 // Tokens shorter than MIN_FUZZY_LEN → no alternates.
742 assert!(matcher().correct_all("S").is_empty());
743 assert!(matcher().correct_all("NF").is_empty());
744 }
745
746 #[test]
747 fn correct_all_returns_multiple_for_ambiguous_input() {
748 // `correct_all` differs from `correct`: it does NOT collapse tied
749 // candidates to None. Both BOOK and COOK are at distance 1 from NOOK,
750 // so `correct` returns None but `correct_all` returns both.
751 let vocab = &["BOOK", "COOK"];
752 let m = FuzzyVocabMatcher::new(vocab);
753 let result = m.correct_all("NOOK");
754 assert_eq!(
755 result.len(),
756 2,
757 "correct_all must return all tied candidates; got {result:?}"
758 );
759 let tokens: Vec<&str> = result.iter().map(|c| c.token).collect();
760 assert!(tokens.contains(&"BOOK"), "BOOK must be among alternates");
761 assert!(tokens.contains(&"COOK"), "COOK must be among alternates");
762 }
763
764 #[test]
765 fn correct_all_with_floor_filters_confidence() {
766 // A high floor should keep only distance-1 matches; a low floor also
767 // includes distance-2 matches. Use a vocab and token where both
768 // distance classes exist.
769 let vocab = &["NOFORN", "UNCLASSIFIED"];
770 let m = FuzzyVocabMatcher::new(vocab);
771
772 // "NOFORON" is distance 1 from "NOFORN". confidence_floor = MIN_USEFUL_CONFIDENCE
773 // keeps it; a floor of 0.80 would drop it (distance-1 on a 7-char token gives
774 // 0.55 + 2*0.05 = 0.65, below 0.80).
775 let with_default_floor = m.correct_all("NOFORON");
776 assert_eq!(
777 with_default_floor.len(),
778 1,
779 "default floor should keep the distance-1 match; got {with_default_floor:?}"
780 );
781 assert_eq!(with_default_floor[0].token, "NOFORN");
782
783 let with_high_floor = m.correct_all_with_floor("NOFORON", 0.80);
784 assert!(
785 with_high_floor.is_empty(),
786 "floor 0.80 must exclude NOFORON→NOFORN (confidence 0.65); got {with_high_floor:?}"
787 );
788 }
789
790 #[test]
791 fn correct_all_with_zero_floor_includes_distance2_on_short_token() {
792 // With floor=0.0, distance-2 corrections for 3-char inputs are included
793 // even though correction_confidence(2, 3) = 0.40 < MIN_USEFUL_CONFIDENCE.
794 // This is the REL TO trigraph expansion path from issue #233.
795 let vocab = &["AUS", "AUT"];
796 let m = FuzzyVocabMatcher::new(vocab);
797
798 // "ASU" is distance 2 from "AUS" (swap + sub). confidence = 0.40 (3-char, dist-2).
799 // Default floor (0.45) filters it out; zero floor includes it.
800 let default_floor = m.correct_all("ASU");
801 assert!(
802 default_floor.is_empty(),
803 "default floor must exclude distance-2 3-char correction; got {default_floor:?}"
804 );
805
806 let zero_floor = m.correct_all_with_floor("ASU", 0.0);
807 assert!(
808 !zero_floor.is_empty(),
809 "zero floor must include distance-2 3-char corrections; got {zero_floor:?}"
810 );
811 let tokens: Vec<&str> = zero_floor.iter().map(|c| c.token).collect();
812 assert!(
813 tokens.contains(&"AUS") || tokens.contains(&"AUT"),
814 "at least one of AUS/AUT must be a distance-2 candidate of ASU; got {zero_floor:?}"
815 );
816 }
817
818 #[test]
819 fn correct_all_result_is_sorted_by_distance_ascending() {
820 // The contract says output is ordered by ascending distance.
821 let vocab = &["NOFORN", "NORORN", "NONORN"];
822 let m = FuzzyVocabMatcher::new(vocab);
823 // "NOFORN" is distance 0 — but that's in vocab, so it's the token being corrected.
824 // Use "NOFRRN": distance 1 from NOFORN, distance 2 from NORORN/NONORN.
825 let result = m.correct_all("NOFRRN");
826 if result.len() > 1 {
827 for pair in result.windows(2) {
828 assert!(
829 pair[0].distance <= pair[1].distance,
830 "correct_all result not sorted by distance: {:?}",
831 result
832 );
833 }
834 }
835 }
836}