marque_ism/
token_set.rs

// SPDX-FileCopyrightText: 2026 Knitli Inc.
//
// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
//! CVE token vocabulary for the parser, plus an Aho-Corasick automaton over
//! that vocabulary.
//!
//! The automaton is built from all known CVE tokens lazily, on first access
//! (via `LazyLock`) — not at compile time. `CapcoTokenSet` is the `TokenSet`
//! implementation that is injected into the parser.
9
10use aho_corasick::AhoCorasick;
11use std::sync::LazyLock;
12
13use crate::generated::values;
14use crate::marking_forms::MARKING_FORMS;
15
/// The narrow slice of token-vocabulary behaviour that the parser relies on.
///
/// `CapcoTokenSet` implements it; an implementation is injected at engine
/// init.
pub trait TokenSet: Send + Sync {
    /// Looks `token` up among the known CVE values.
    ///
    /// Yields the canonical token string on a hit, `None` otherwise.
    fn canonicalize(&self, token: &str) -> Option<&'static str>;

    /// Reports whether `token` is a known country trigraph.
    fn is_trigraph(&self, token: &str) -> bool;

    /// Vocabulary that fuzzy correction compares unknown tokens against.
    ///
    /// The `marque_core::fuzzy` module consumes this slice. It must be
    /// sorted and deduplicated, because the "is already valid" check is a
    /// binary search.
    ///
    /// The slice borrows from the implementor, so the vocabulary may live
    /// on `self` (for instance in a `Vec` built at construction time)
    /// instead of in a global static. The entries themselves are
    /// `&'static str` because the fuzzy matcher hands canonical tokens back
    /// with `'static` lifetime in `FuzzyCorrection::token`.
    ///
    /// External `TokenSet` implementors that do not override this method
    /// get the default: an empty slice, which disables fuzzy correction.
    fn correction_vocab(&self) -> &[&'static str] {
        &[]
    }
}
43
44/// Aho-Corasick automaton over all CVE tokens — built once from generated data.
45static AUTOMATON: LazyLock<AhoCorasick> = LazyLock::new(|| {
46    AhoCorasick::builder()
47        .ascii_case_insensitive(false) // markings are case-sensitive
48        .build(values::ALL_CVE_TOKENS)
49        .expect("CVE token automaton construction failed")
50});
51
/// Classification words that never stand alone in `ALL_CVE_TOKENS` but are
/// still needed as fuzzy-correction targets (issue #133, PR 8).
///
/// `ALL_CVE_TOKENS` carries `TOP SECRET` only as a single multi-word entry;
/// bare `TOP` is absent. The decoder's `scan_token` tokenizer splits on
/// whitespace, so a mangled `TPP SECRET` reaches the fuzzy matcher as the
/// lone token `TPP`, which has no correction target unless `TOP` is in the
/// vocabulary. With `TOP` present, the standard edit-distance path repairs
/// the typo family seen in the SC-004 mangled corpus: `TPP`, `UOP`, `TDOP`,
/// `QTOP`, and `TOPW` all correct to `TOP`.
///
/// Round-trip safety: a correction that returns `TOP` lands as the bare
/// token `TOP`. The strict parser then joins it with the following `SECRET`
/// token through its usual two-word classification path, producing
/// `MarkingClassification::Us(Classification::TopSecret)`. The PR-8
/// integration tests in `decoder_recovery.rs` pin this round trip.
const CLASSIFICATION_STRUCTURAL_KEYWORDS: &[&str] = &["TOP"];
73
/// NATO classification words that `ALL_CVE_TOKENS` does not contain, added
/// as fuzzy-correction targets.
///
/// The strict parser recognises these NATO-specific multi-word forms:
/// `COSMIC TOP SECRET`, `COSMIC TOP SECRET-BOHEMIA`,
/// `COSMIC TOP SECRET-BALK`, and `COSMIC TOP SECRET ATOMAL`. As with the
/// bare `TOP` in `CLASSIFICATION_STRUCTURAL_KEYWORDS`, the decoder's
/// whitespace tokenizer splits these forms, so every word reaches the fuzzy
/// matcher on its own. If the words below were missing from the correction
/// vocabulary, OCR/transcription typos (`COSMID`, `BOHFMIA`, `ATOAML`,
/// `BBLE`) would become `TokenKind::Unknown` spans and the decoder would
/// discard the candidate.
///
/// Round-trip safety: the strict parser in `marque-core` recognises each
/// multi-word NATO form and maps it to the matching
/// `MarkingClassification::NonUs(NatoClassification::*)` variant, so a
/// fuzzy-corrected `COSMIC` / `BOHEMIA` / `ATOMAL` / `BALK` token ends up on
/// the correct classification after strict parsing.
///
/// Authority: CAPCO-2016 §H.7 p147–148 (NATO classification markings).
const NATO_CLASSIFICATION_KEYWORDS: &[&str] = &["ATOMAL", "BALK", "BOHEMIA", "COSMIC"];
93
/// Indicator words of the SAR full form, exposed to fuzzy correction
/// (issue #133, PR 6).
///
/// CAPCO-2016 §H.5 p100 defines the `SAR-` indicator and its
/// `SPECIAL ACCESS REQUIRED-` full form. The strict SAR parser
/// (`crates/core/src/parser.rs::parse_sar_category`) matches those
/// indicators with literal `starts_with` checks, so OCR/transcription typos
/// in the indicator words (`SPCIAL`, `CCESS`, `SEPCIAL`, etc.) must be
/// corrected to the canonical spelling before that parser runs.
///
/// Why the words are not in `ALL_CVE_TOKENS`: the ODNI `CVEnumISMSAR.xml` is
/// empty, because SAR program identifiers are agency-assigned codewords that
/// are not centrally registered. The structural SAR parser treats
/// `SAR-` / `SPECIAL ACCESS REQUIRED-` as fixed literal indicator strings,
/// but no vocabulary consulted by the fuzzy matcher contained those words,
/// so a typo in them could not be recovered.
///
/// Deliberate omissions:
///
/// * `REQUIRED` — in real corpus inputs it is always followed immediately by
///   `-<program-nickname>` (e.g. `REQUIRED-BUTTER`). The decoder's
///   `scan_token` keeps internal hyphens inside a single token, so
///   `REQUIRED-BUTTER` is one 15-character token that no correction
///   targeting `REQUIRED` (8 chars) can reach within
///   `MAX_EDIT_DISTANCE = 2`. Adding `REQUIRED` would therefore be a no-op
///   on this hot path. Revisit if a future fixture shows `REQUIRED` as an
///   isolated token (e.g. `SPECIAL ACCESS REQUIRED -BUTTER`).
/// * `SAR` — likewise always glued to a program identifier (`SAR-BP-J12`).
///   See `try_sar_indicator_repair` in `crates/engine/src/decoder.rs` for
///   the structural prefix-strip / missing-hyphen path that handles
///   `USAR-BP` / `SARBP`.
const SAR_STRUCTURAL_KEYWORDS: &[&str] = &["ACCESS", "SPECIAL"];
123
/// Single words from AEA and SCI Marking Titles that `ALL_CVE_TOKENS` does
/// not carry, added as fuzzy-correction targets.
///
/// The strict parser recognises the multi-word Marking Titles these words
/// belong to, but the decoder's whitespace tokeniser splits a title apart
/// and each word reaches the fuzzy matcher independently. Without the
/// entries below, OCR/transcription typos (`TAELNT`, `TALNET`, `FRMERLY`,
/// `KEYOLE`) become `TokenKind::Unknown` spans that make the decoder discard
/// the candidate.
///
/// * **`TALENT` / `KEYHOLE`** (§H.4 p73): the full Marking Title for TK is
///   "TALENT KEYHOLE". OCR commonly mangles individual words of long titles;
///   with both bare words in the vocabulary, `TAELNT KEYHOLE` →
///   `TALENT KEYHOLE` → `TK`.
///   NOTE(review): the unit test for these keywords cites §H.4 p71 for the
///   same title — confirm the page against CAPCO-2016 and align the two.
/// * **`FORMERLY`** (§H.6 p116): the full Marking Title for FRD is
///   "FORMERLY RESTRICTED DATA". A typo such as `FRMERLY RESTRICTED DATA`
///   reaches the fuzzy matcher as the token `FRMERLY`, which cannot be
///   recovered unless `FORMERLY` is in the vocabulary.
///
/// Round-trip safety: the strict parser already handles
/// `TALENT KEYHOLE` → `TK` and `FORMERLY RESTRICTED DATA` → `FRD` via the
/// `parse_sci_block` / `title_to_portion` paths respectively, so
/// fuzzy-corrected tokens land at the expected parsed values with no further
/// changes.
///
/// `NUCLEAR` (which appears in the CNWDI/TFNI/DOD-UCNI/DOE-UCNI titles) is
/// intentionally left out: it is a very common English word and would
/// produce excessive false-positive fuzzy corrections on unrelated text.
const AEA_SCI_STRUCTURAL_KEYWORDS: &[&str] = &["FORMERLY", "KEYHOLE", "TALENT"];
151
152/// Extended fuzzy-correction vocabulary: `ALL_CVE_TOKENS` ∪ banner long forms
153/// from [`MARKING_FORMS`] ∪ [`SAR_STRUCTURAL_KEYWORDS`] ∪
154/// [`CLASSIFICATION_STRUCTURAL_KEYWORDS`] ∪ [`NATO_CLASSIFICATION_KEYWORDS`] ∪
155/// [`AEA_SCI_STRUCTURAL_KEYWORDS`],
156/// sorted and deduplicated.
157///
158/// `ALL_CVE_TOKENS` carries only the **portion-form** abbreviations
159/// (`NF`, `OC`, `PR`, `XD`, `ND`) and a handful of single-form tokens
160/// (`RELIDO`, `FISA`, `FOUO`). The banner long forms — which are valid
161/// inputs the strict parser handles via
162/// [`crate::marking_forms::banner_to_portion`] — were missing from the
163/// vocabulary the fuzzy matcher consults, so an OCR/transcription typo
164/// like `NOFORON` (distance 1 from `NOFORN`) found no correction target
165/// and the decoder discarded it as `TokenKind::Unknown`. See issue #133.
166///
167/// Round-trip safety: the strict parser's `parse_dissem_full_form` and
168/// `parse_non_ic_full_form` already accept the banner forms here and
169/// translate them to the canonical portion enum, so a fuzzy correction
170/// returning `NOFORN` (rather than `NF`) lands on the same final
171/// [`crate::DissemControl::Nf`] after strict parsing. The SAR
172/// structural keywords (`SAR_STRUCTURAL_KEYWORDS`) are similarly
173/// round-trip safe: `parse_sar_category` accepts the canonical
174/// `SPECIAL ACCESS REQUIRED-` indicator literally, so a correction
175/// returning `SPECIAL` for `SPCIAL` lands at the same `SarMarking`
176/// after strict parsing.
177///
178/// Multi-word banner forms (`DEA SENSITIVE`, `SBU NOFORN`,
179/// `LES NOFORN`, `DOD UCNI`, `DOE UCNI`) are retained intentionally.
180/// The decoder's per-token fuzzy tokenizer (`scan_token` in
181/// `crates/engine/src/decoder.rs`) splits raw input on whitespace, so
182/// these never appear as a single *input* token to the matcher — but
183/// fuzzy correction can still emit one of them as the canonical
184/// *output* for a whitespace-free typo (e.g., `SBUNOFORN` →
185/// `SBU NOFORN`, distance 1, single-character insertion of the
186/// space). The strict parser then accepts the corrected multi-word
187/// form via `parse_non_ic_full_form` / `parse_dissem_full_form` and
188/// translates it to the canonical portion enum, so the round-trip
189/// lands at the expected `NonIcDissem::SbuNf` (or peer). Pinned by
190/// `marque-core::fuzzy::tests::real_vocab_emits_multi_word_banner_for_whitespace_free_typo`.
191static EXTENDED_CORRECTION_VOCAB: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
192    let mut v: Vec<&'static str> = values::ALL_CVE_TOKENS.to_vec();
193    for f in MARKING_FORMS {
194        v.push(f.banner);
195    }
196    v.extend_from_slice(SAR_STRUCTURAL_KEYWORDS);
197    v.extend_from_slice(CLASSIFICATION_STRUCTURAL_KEYWORDS);
198    v.extend_from_slice(NATO_CLASSIFICATION_KEYWORDS);
199    v.extend_from_slice(AEA_SCI_STRUCTURAL_KEYWORDS);
200    v.sort();
201    v.dedup();
202    v
203});
204
/// Stateless [`TokenSet`] implementation backed by the generated CVE token
/// and trigraph tables and by this module's extended correction vocabulary.
///
/// The type is a unit struct, so values are free to create and copy.
#[derive(Debug, Clone, Copy, Default)]
pub struct CapcoTokenSet;
206
207impl TokenSet for CapcoTokenSet {
208    fn canonicalize(&self, token: &str) -> Option<&'static str> {
209        // `ALL_CVE_TOKENS` is emitted sorted and deduplicated by build.rs,
210        // so an O(log n) binary search is correct and faster than the
211        // previous O(n) linear scan.
212        values::ALL_CVE_TOKENS
213            .binary_search(&token)
214            .ok()
215            .map(|i| values::ALL_CVE_TOKENS[i])
216    }
217
218    fn is_trigraph(&self, token: &str) -> bool {
219        // TRIGRAPHS is emitted sorted and deduplicated by build.rs, so
220        // binary_search is O(log n) over ~340 entries instead of the old
221        // O(n) `.contains()` linear scan. Hot path for every REL TO parse.
222        values::TRIGRAPHS.binary_search(&token).is_ok()
223    }
224
225    fn correction_vocab(&self) -> &[&'static str] {
226        EXTENDED_CORRECTION_VOCAB.as_slice()
227    }
228}
229
230impl CapcoTokenSet {
231    /// Returns a reference to the Aho-Corasick automaton built from all CVE tokens.
232    /// Reserved for Phase 2 multi-pattern matching when per-token spans are wired.
233    #[allow(dead_code)]
234    pub(crate) fn automaton() -> &'static AhoCorasick {
235        &AUTOMATON
236    }
237}
238
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    //! Pins the invariants the token set relies on: the generated tables
    //! are strictly sorted (binary search depends on it), and the extended
    //! correction vocabulary contains / excludes the expected entries.

    use super::*;

    #[test]
    fn all_cve_tokens_are_sorted_and_unique() {
        // `canonicalize` relies on binary_search, so the table must be
        // strictly ascending (sorted with no duplicates).
        let tokens = values::ALL_CVE_TOKENS;
        for window in tokens.windows(2) {
            assert!(
                window[0] < window[1],
                "ALL_CVE_TOKENS is not strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn trigraphs_are_sorted_and_unique() {
        // `is_trigraph` relies on binary_search, so the slice must be
        // strictly-sorted. If a future ODNI XSD update shuffles the order,
        // build.rs collects into a BTreeSet and this test catches any
        // regression of that contract.
        let trigraphs = values::TRIGRAPHS;
        for window in trigraphs.windows(2) {
            assert!(
                window[0] < window[1],
                "TRIGRAPHS is not strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn canonicalize_returns_known_token() {
        let set = CapcoTokenSet;
        // SECRET is in the banner-words we always emit.
        assert_eq!(set.canonicalize("SECRET"), Some("SECRET"));
    }

    #[test]
    fn canonicalize_returns_none_for_unknown() {
        let set = CapcoTokenSet;
        assert_eq!(set.canonicalize("BANANAPHONE"), None);
    }

    #[test]
    fn usa_is_a_known_trigraph() {
        let set = CapcoTokenSet;
        assert!(set.is_trigraph("USA"));
    }

    #[test]
    fn unknown_string_is_not_a_trigraph() {
        let set = CapcoTokenSet;
        assert!(!set.is_trigraph("XYZ_NOT_A_COUNTRY"));
    }

    #[test]
    fn correction_vocab_returns_sorted_nonempty_slice() {
        // The `TokenSet::correction_vocab` contract requires a sorted,
        // deduplicated slice; strict ordering of adjacent pairs checks both.
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(!vocab.is_empty(), "correction vocab must not be empty");
        for window in vocab.windows(2) {
            assert!(
                window[0] < window[1],
                "correction_vocab must be strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn correction_vocab_contains_core_classification_tokens() {
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["SECRET", "CONFIDENTIAL", "UNCLASSIFIED"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab must contain {expected:?}"
            );
        }
    }

    #[test]
    fn correction_vocab_excludes_non_ic_dissem_caveats() {
        // Regression guard for the non-IC dissem deny-list invariant.
        // ODNI's `CVEnumISMDissem.xml` is a UNION enum bundling IC
        // dissem controls (CAPCO source 1) with the ISOO CUI Registry
        // caveat tail (AC, AWP, DL_ONLY, FED_ONLY, FEDCON, NOCON) and
        // the DOD-SAP `WAIVED` entry. CAPCO-2016 line 283 explicitly
        // disclaims caveats from its scope. The `build.rs` of
        // `marque-ism` deny-lists those seven tokens so they never
        // enter the IC `DissemControl` enum or `ALL_CVE_TOKENS`. This
        // test pins that invariant — a future schema-update bump that
        // re-introduces them, or a deny-list typo, fails here loudly
        // rather than silently broadening the CAPCO grammar to accept
        // caveats as IC dissem controls.
        //
        // Tracking issue for the broader caveat / second-banner-line
        // data model: github.com/marquetools/marque#128.
        let vocab = CapcoTokenSet.correction_vocab();
        for forbidden in &[
            "WAIVED", "AC", "AWP", "DL_ONLY", "FED_ONLY", "FEDCON", "NOCON",
        ] {
            assert!(
                vocab.binary_search(forbidden).is_err(),
                "correction_vocab MUST NOT contain {forbidden:?} — \
                 it is a non-IC caveat (CAPCO-2016 line 283 \
                 disclaimer) that should be filtered by build.rs's \
                 NON_IC_DISSEM_DENY_LIST"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_dissem_banner_long_forms() {
        // Issue #133 root cause #1: the fuzzy matcher saw only
        // `ALL_CVE_TOKENS`, which carries the dissem **portion**
        // abbreviations (NF, OC, PR) plus `RELIDO`/`FISA`/`FOUO`,
        // but not the banner long forms (NOFORN, ORCON, PROPIN,
        // EXDIS, NODIS, …). So `NOFORON` had no edit-distance
        // candidate and the decoder discarded it. The extended
        // vocab pulls every entry's banner form from
        // `marking_forms::MARKING_FORMS`, with the strict parser's
        // `parse_dissem_full_form` then normalizing the matched
        // long form to the canonical portion enum.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &[
            "NOFORN",
            "ORCON",
            "ORCON-USGOV",
            "IMCON",
            "PROPIN",
            "RSEN",
            "LIMDIS",
            "EXDIS",
            "NODIS",
        ] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 banner long form per CAPCO-2016 §G.1 Table 4 \
                 (issue #133 root cause #1)"
            );
        }
    }

    #[test]
    fn correction_vocab_keeps_ic_dissem_controls() {
        // Companion to `correction_vocab_excludes_non_ic_dissem_caveats`:
        // make sure the deny-list didn't take a real IC dissem control
        // with it. Every entry below appears in CAPCO-2016 §A.5 page 38
        // as an IC dissem (or §H.8 for the per-marking detail page);
        // RAWFISA + EXEMPT_FROM_ICD501_DISCOVERY are post-CAPCO-2016
        // additions in the live ICRM XML, kept by the deny-list-rather-
        // than-allowlist approach so future IC additions flow through
        // automatically.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &[
            "RS",
            "FOUO",
            "OC",
            "OC-USGOV",
            "IMC",
            "NF",
            "PR",
            "REL",
            "RELIDO",
            "EYES",
            "DSEN",
            "RAWFISA",
            "FISA",
            "DISPLAYONLY",
            "EXEMPT_FROM_ICD501_DISCOVERY",
        ] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 IC dissem control per CAPCO-2016 §A.5 / §H.8 or \
                 a post-2016 ICRM addition"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_top_classification_keyword() {
        // Issue #133 PR 8: bare `TOP` lives outside `ALL_CVE_TOKENS`
        // because the CVE schema only lists the full multi-word
        // `TOP SECRET` classification entry. The decoder's
        // `scan_token` whitespace tokenizer arrives at the fuzzy
        // matcher with `TPP` (or other 3/4-char typos) as a
        // standalone token, so without `TOP` in the correction vocab
        // there's no fuzzy target and the candidate gets dropped.
        // Adding `TOP` here lets the standard edit-distance fuzzy
        // path recover `TPP→TOP` (dist 1), `UOP→TOP` (dist 1),
        // `TDOP→TOP` (dist 1, 4-char input via length-diff filter),
        // `QTOP→TOP` (dist 1), and `TOPW→TOP` (dist 1). Strict
        // parser then re-joins `TOP SECRET` into the canonical
        // multi-word classification.
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(
            vocab.binary_search(&"TOP").is_ok(),
            "correction_vocab MUST contain bare \"TOP\" — issue #133 PR 8 \
             classification typo recovery target",
        );
    }

    #[test]
    fn correction_vocab_contains_nato_classification_keywords() {
        // Companion to the TOP / SAR / AEA-SCI keyword tests: pins the
        // entries of `NATO_CLASSIFICATION_KEYWORDS`. These words occur
        // only inside multi-word NATO classifications (`COSMIC TOP
        // SECRET`, `COSMIC TOP SECRET ATOMAL`, …) that the whitespace
        // tokenizer splits apart, so each one must be a fuzzy-correction
        // target in its own right for typos like `COSMID` or `ATOAML`
        // to be recoverable. See the `NATO_CLASSIFICATION_KEYWORDS` doc
        // comment.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["ATOMAL", "BALK", "BOHEMIA", "COSMIC"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 NATO classification keyword per CAPCO-2016 §H.7 p147–148"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_sar_structural_keywords() {
        // Issue #133 PR 6: the SAR indicator keywords (`SPECIAL`,
        // `ACCESS`) live outside `ALL_CVE_TOKENS` because the ODNI
        // `CVEnumISMSAR.xml` is empty (SAR program identifiers are
        // agency-assigned and not centrally registered). The structural
        // SAR parser handles the `SPECIAL ACCESS REQUIRED-` indicator
        // as a literal string match, but the fuzzy matcher had no
        // vocabulary entry for `SPECIAL` or `ACCESS` — so an OCR typo
        // like `SPCIAL` (distance 1 from `SPECIAL`) produced no
        // correction, the token survived as `TokenKind::Unknown`,
        // and the decoder discarded the candidate via step 3a's
        // Unknown-span filter. This test pins the fix.
        //
        // `REQUIRED` and `SAR` are deliberately NOT in this list —
        // they are always glued to a program nickname / identifier
        // (`REQUIRED-BUTTER`, `SAR-BP-J12`) inside one `scan_token`
        // chunk, so adding them to the vocab is a no-op for the hot
        // path. See `SAR_STRUCTURAL_KEYWORDS` doc comment.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["ACCESS", "SPECIAL"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 SAR structural keyword per CAPCO-2016 §H.5 p100 \
                 (issue #133 PR 6)"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_aea_sci_structural_keywords() {
        // PR #256: AEA/SCI long-title structural keywords added so the fuzzy
        // matcher can recover OCR typos in "FORMERLY RESTRICTED DATA" (FRD,
        // §H.6) and "TALENT KEYHOLE" (TK, §H.4 p71). `NUCLEAR` is
        // intentionally excluded — see `AEA_SCI_STRUCTURAL_KEYWORDS` doc
        // comment.
        //
        // NOTE(review): the `AEA_SCI_STRUCTURAL_KEYWORDS` doc comment cites
        // §H.4 p73 for TALENT KEYHOLE while this test cites p71 — confirm
        // the page against CAPCO-2016 and align the two.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["FORMERLY", "KEYHOLE", "TALENT"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 AEA/SCI structural keyword per CAPCO-2016 §H.6 / §H.4 p71 \
                 (PR #256)"
            );
        }
    }
}
marque_ism/token_set.rs

marque_ism/
token_set.rs