marque_ism/token_set.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
//! Lazily built Aho-Corasick automaton over the CVE token vocabulary.
//!
//! The automaton is built from all known CVE tokens on first use (via
//! `LazyLock`); `CapcoTokenSet` is the `TokenSet` implementation injected
//! into the parser.
9
10use aho_corasick::AhoCorasick;
11use std::sync::LazyLock;
12
13use crate::generated::values;
14use crate::marking_forms::MARKING_FORMS;
15
/// Minimal interface the parser needs from the token set.
///
/// Implemented by `CapcoTokenSet`; injected at engine init.
pub trait TokenSet: Send + Sync {
    /// Returns the canonical token string if `token` is a known CVE value.
    fn canonicalize(&self, token: &str) -> Option<&'static str>;

    /// Returns `true` if `token` is a known country trigraph.
    fn is_trigraph(&self, token: &str) -> bool;

    /// Returns the vocabulary slice used for fuzzy correction lookups.
    ///
    /// Unknown tokens are compared against this vocabulary by the
    /// `marque_core::fuzzy` module. The slice must be sorted and
    /// deduplicated, because binary search is used for the "is already
    /// valid" check.
    ///
    /// The slice is borrowed from the implementor, so an implementation may
    /// keep the vocabulary on `self` (e.g., in a `Vec` built at construction
    /// time) instead of in a global static. Each entry is `&'static str`
    /// because the fuzzy matcher returns canonical tokens with `'static`
    /// lifetime in `FuzzyCorrection::token`.
    ///
    /// The default implementation returns an empty slice, which disables
    /// fuzzy correction for external `TokenSet` implementors that do not
    /// override it.
    fn correction_vocab(&self) -> &[&'static str] {
        &[]
    }
}
43
44/// Aho-Corasick automaton over all CVE tokens — built once from generated data.
45static AUTOMATON: LazyLock<AhoCorasick> = LazyLock::new(|| {
46 AhoCorasick::builder()
47 .ascii_case_insensitive(false) // markings are case-sensitive
48 .build(values::ALL_CVE_TOKENS)
49 .expect("CVE token automaton construction failed")
50});
51
/// Classification structural keywords not present as standalone entries in
/// `ALL_CVE_TOKENS`. Issue #133 PR 8.
///
/// `TOP SECRET` is in `ALL_CVE_TOKENS` as a single multi-word entry, but the
/// bare `TOP` is not — and the decoder's `scan_token` tokenizer splits on
/// whitespace, so an input like `TPP SECRET` arrives at the fuzzy matcher as
/// the standalone token `TPP` with no `TOP` correction target available.
/// Adding `TOP` to the fuzzy vocab lets the standard edit-distance path
/// recover the `TPP→TOP`, `UOP→TOP`, `TDOP→TOP`, `QTOP→TOP`, `TOPW→TOP`
/// family of typos seen in the SC-004 mangled corpus. The strict parser then
/// re-joins `TOP SECRET` into the canonical multi-word classification.
///
/// Round-trip safety: a fuzzy correction returning `TOP` for an input typo
/// lands as the bare token `TOP`, which the strict parser combines with the
/// following `SECRET` token into
/// `MarkingClassification::Us(Classification::TopSecret)` via the usual
/// two-word classification path. Round-trip pinned by the PR-8 integration
/// tests in `decoder_recovery.rs`.
const CLASSIFICATION_STRUCTURAL_KEYWORDS: &[&str] = &["TOP"];
73
/// NATO classification structural keywords not present in `ALL_CVE_TOKENS`.
///
/// NATO-specific classification words appear in multi-word forms that the
/// strict parser recognises: `COSMIC TOP SECRET`,
/// `COSMIC TOP SECRET-BOHEMIA`, `COSMIC TOP SECRET-BALK`,
/// `COSMIC TOP SECRET ATOMAL`. Like `TOP` in
/// `CLASSIFICATION_STRUCTURAL_KEYWORDS`, the decoder's whitespace tokenizer
/// splits these multi-word forms so each word arrives individually at the
/// fuzzy matcher. Without these entries in the correction vocab,
/// OCR/transcription typos (`COSMID`, `BOHFMIA`, `ATOAML`, `BBLE`) produce
/// `TokenKind::Unknown` spans and the decoder discards the candidate.
///
/// Round-trip safety: the strict parser in `marque-core` recognises each
/// multi-word NATO form and maps it to the corresponding
/// `MarkingClassification::NonUs(NatoClassification::*)` variant, so a
/// fuzzy-corrected `COSMIC` / `BOHEMIA` / `ATOMAL` / `BALK` token lands on
/// the correct classification after strict parsing.
///
/// Authority: CAPCO-2016 §H.7 p147–148 (NATO classification markings).
const NATO_CLASSIFICATION_KEYWORDS: &[&str] = &["ATOMAL", "BALK", "BOHEMIA", "COSMIC"];
93
/// SAR structural keywords (CAPCO-2016 §H.5 p100, "SAR-" indicator and
/// "SPECIAL ACCESS REQUIRED-" full form), included in the fuzzy correction
/// vocabulary so OCR/transcription typos in the indicator keywords
/// (`SPCIAL`, `CCESS`, `SEPCIAL`, etc.) get corrected to the canonical form
/// before the strict SAR parser's literal `starts_with` matches in
/// `crates/core/src/parser.rs::parse_sar_category` run.
///
/// These keywords are NOT in `ALL_CVE_TOKENS` because the ODNI
/// `CVEnumISMSAR.xml` is empty — SAR program identifiers are agency-assigned
/// codewords not centrally registered. The structural SAR parser handles
/// `SAR-`/`SPECIAL ACCESS REQUIRED-` as fixed literal indicator strings, but
/// the fuzzy matcher had no way to recover a typo in those keywords because
/// they weren't in any vocabulary it consulted. Issue #133 PR 6.
///
/// `REQUIRED` is intentionally excluded: in real corpus inputs it is always
/// followed immediately by `-<program-nickname>` (e.g., `REQUIRED-BUTTER`),
/// and the decoder's `scan_token` includes internal hyphens in a single
/// token, so `REQUIRED-BUTTER` is one 14-character token that no fuzzy
/// correction targeting `REQUIRED` (8 chars) can reach within
/// `MAX_EDIT_DISTANCE = 2`. Adding `REQUIRED` would be a no-op for this hot
/// path; if a future fixture surfaces with `REQUIRED` as an isolated token
/// (e.g., `SPECIAL ACCESS REQUIRED -BUTTER`), revisit.
///
/// `SAR` is similarly excluded because it is always glued to a program
/// identifier (`SAR-BP-J12`) — see `try_sar_indicator_repair` in
/// `crates/engine/src/decoder.rs` for the structural prefix-strip /
/// missing-hyphen path that handles `USAR-BP` / `SARBP`.
const SAR_STRUCTURAL_KEYWORDS: &[&str] = &["ACCESS", "SPECIAL"];
123
/// AEA and SCI structural keywords not present in `ALL_CVE_TOKENS`.
///
/// These individual words appear as components of multi-word Marking Titles
/// that the strict parser recognises. The decoder's whitespace tokeniser
/// splits them, so each word arrives at the fuzzy matcher independently.
/// Without these entries, OCR/transcription typos (`TAELNT`, `TALNET`,
/// `FRMERLY`, `KEYOLE`) produce `TokenKind::Unknown` spans that cause the
/// decoder to discard the candidate.
///
/// **TALENT / KEYHOLE** (§H.4 p73): The full Marking Title for TK is
/// "TALENT KEYHOLE". OCR commonly mangles individual words of long titles;
/// having both bare words in the vocab lets `TAELNT KEYHOLE` →
/// `TALENT KEYHOLE` → `TK`.
///
/// **FORMERLY** (§H.6 p116): The full Marking Title for FRD is "FORMERLY
/// RESTRICTED DATA". A typo like `FRMERLY RESTRICTED DATA` arrives at the
/// fuzzy matcher as the token `FRMERLY`; without `FORMERLY` in the vocab the
/// decoder cannot recover it.
///
/// Round-trip safety: the strict parser already handles `TALENT KEYHOLE` →
/// `TK` and `FORMERLY RESTRICTED DATA` → `FRD` via `parse_sci_block` /
/// `title_to_portion` paths respectively, so fuzzy-corrected tokens land at
/// the expected parsed values without further changes.
///
/// Note: `NUCLEAR` (appears in CNWDI/TFNI/DOD-UCNI/DOE-UCNI titles) is
/// intentionally excluded — it is a very common English word and would
/// produce excessive false-positive fuzzy corrections on unrelated text.
// NOTE(review): the unit test for these keywords cites §H.4 p71 for
// "TALENT KEYHOLE" while this comment cites p73 — confirm the page against
// CAPCO-2016 and align the two.
const AEA_SCI_STRUCTURAL_KEYWORDS: &[&str] = &["FORMERLY", "KEYHOLE", "TALENT"];
151
152/// Extended fuzzy-correction vocabulary: `ALL_CVE_TOKENS` ∪ banner long forms
153/// from [`MARKING_FORMS`] ∪ [`SAR_STRUCTURAL_KEYWORDS`] ∪
154/// [`CLASSIFICATION_STRUCTURAL_KEYWORDS`] ∪ [`NATO_CLASSIFICATION_KEYWORDS`] ∪
155/// [`AEA_SCI_STRUCTURAL_KEYWORDS`],
156/// sorted and deduplicated.
157///
158/// `ALL_CVE_TOKENS` carries only the **portion-form** abbreviations
159/// (`NF`, `OC`, `PR`, `XD`, `ND`) and a handful of single-form tokens
160/// (`RELIDO`, `FISA`, `FOUO`). The banner long forms — which are valid
161/// inputs the strict parser handles via
162/// [`crate::marking_forms::banner_to_portion`] — were missing from the
163/// vocabulary the fuzzy matcher consults, so an OCR/transcription typo
164/// like `NOFORON` (distance 1 from `NOFORN`) found no correction target
165/// and the decoder discarded it as `TokenKind::Unknown`. See issue #133.
166///
167/// Round-trip safety: the strict parser's `parse_dissem_full_form` and
168/// `parse_non_ic_full_form` already accept the banner forms here and
169/// translate them to the canonical portion enum, so a fuzzy correction
170/// returning `NOFORN` (rather than `NF`) lands on the same final
171/// [`crate::DissemControl::Nf`] after strict parsing. The SAR
172/// structural keywords (`SAR_STRUCTURAL_KEYWORDS`) are similarly
173/// round-trip safe: `parse_sar_category` accepts the canonical
174/// `SPECIAL ACCESS REQUIRED-` indicator literally, so a correction
175/// returning `SPECIAL` for `SPCIAL` lands at the same `SarMarking`
176/// after strict parsing.
177///
178/// Multi-word banner forms (`DEA SENSITIVE`, `SBU NOFORN`,
179/// `LES NOFORN`, `DOD UCNI`, `DOE UCNI`) are retained intentionally.
180/// The decoder's per-token fuzzy tokenizer (`scan_token` in
181/// `crates/engine/src/decoder.rs`) splits raw input on whitespace, so
182/// these never appear as a single *input* token to the matcher — but
183/// fuzzy correction can still emit one of them as the canonical
184/// *output* for a whitespace-free typo (e.g., `SBUNOFORN` →
185/// `SBU NOFORN`, distance 1, single-character insertion of the
186/// space). The strict parser then accepts the corrected multi-word
187/// form via `parse_non_ic_full_form` / `parse_dissem_full_form` and
188/// translates it to the canonical portion enum, so the round-trip
189/// lands at the expected `NonIcDissem::SbuNf` (or peer). Pinned by
190/// `marque-core::fuzzy::tests::real_vocab_emits_multi_word_banner_for_whitespace_free_typo`.
191static EXTENDED_CORRECTION_VOCAB: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
192 let mut v: Vec<&'static str> = values::ALL_CVE_TOKENS.to_vec();
193 for f in MARKING_FORMS {
194 v.push(f.banner);
195 }
196 v.extend_from_slice(SAR_STRUCTURAL_KEYWORDS);
197 v.extend_from_slice(CLASSIFICATION_STRUCTURAL_KEYWORDS);
198 v.extend_from_slice(NATO_CLASSIFICATION_KEYWORDS);
199 v.extend_from_slice(AEA_SCI_STRUCTURAL_KEYWORDS);
200 v.sort();
201 v.dedup();
202 v
203});
204
/// Zero-sized [`TokenSet`] implementation backed by the generated CVE tables.
///
/// The type carries no state of its own: every lookup reads module-level
/// tables, so values are free to construct, copy, and share.
#[derive(Debug, Clone, Copy, Default)]
pub struct CapcoTokenSet;
206
207impl TokenSet for CapcoTokenSet {
208 fn canonicalize(&self, token: &str) -> Option<&'static str> {
209 // `ALL_CVE_TOKENS` is emitted sorted and deduplicated by build.rs,
210 // so an O(log n) binary search is correct and faster than the
211 // previous O(n) linear scan.
212 values::ALL_CVE_TOKENS
213 .binary_search(&token)
214 .ok()
215 .map(|i| values::ALL_CVE_TOKENS[i])
216 }
217
218 fn is_trigraph(&self, token: &str) -> bool {
219 // TRIGRAPHS is emitted sorted and deduplicated by build.rs, so
220 // binary_search is O(log n) over ~340 entries instead of the old
221 // O(n) `.contains()` linear scan. Hot path for every REL TO parse.
222 values::TRIGRAPHS.binary_search(&token).is_ok()
223 }
224
225 fn correction_vocab(&self) -> &[&'static str] {
226 EXTENDED_CORRECTION_VOCAB.as_slice()
227 }
228}
229
230impl CapcoTokenSet {
231 /// Returns a reference to the Aho-Corasick automaton built from all CVE tokens.
232 /// Reserved for Phase 2 multi-pattern matching when per-token spans are wired.
233 #[allow(dead_code)]
234 pub(crate) fn automaton() -> &'static AhoCorasick {
235 &AUTOMATON
236 }
237}
238
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// Panics unless `items` is strictly ascending (sorted, no duplicates).
    ///
    /// `claim` is the message prefix naming the slice under test; the first
    /// offending pair is appended to it.
    fn assert_strictly_sorted(items: &[&str], claim: &str) {
        for pair in items.windows(2) {
            assert!(
                pair[0] < pair[1],
                "{claim}: {:?} >= {:?}",
                pair[0],
                pair[1],
            );
        }
    }

    /// Panics unless every entry of `expected` is found in the correction
    /// vocabulary. `why` is appended to the failure message to record the
    /// authority for the requirement.
    fn assert_vocab_contains(expected: &[&str], why: &str) {
        let vocab = CapcoTokenSet.correction_vocab();
        for token in expected {
            assert!(
                vocab.binary_search(token).is_ok(),
                "correction_vocab MUST contain {token:?} — {why}"
            );
        }
    }

    #[test]
    fn all_cve_tokens_are_sorted_and_unique() {
        assert_strictly_sorted(
            values::ALL_CVE_TOKENS,
            "ALL_CVE_TOKENS is not strictly sorted",
        );
    }

    #[test]
    fn trigraphs_are_sorted_and_unique() {
        // `is_trigraph` relies on binary_search, so the slice must be
        // strictly-sorted. If a future ODNI XSD update shuffles the order,
        // build.rs collects into a BTreeSet and this test catches any
        // regression of that contract.
        assert_strictly_sorted(values::TRIGRAPHS, "TRIGRAPHS is not strictly sorted");
    }

    #[test]
    fn canonicalize_returns_known_token() {
        let set = CapcoTokenSet;
        // SECRET is in the banner-words we always emit.
        assert_eq!(set.canonicalize("SECRET"), Some("SECRET"));
    }

    #[test]
    fn canonicalize_returns_none_for_unknown() {
        let set = CapcoTokenSet;
        assert_eq!(set.canonicalize("BANANAPHONE"), None);
    }

    #[test]
    fn usa_is_a_known_trigraph() {
        let set = CapcoTokenSet;
        assert!(set.is_trigraph("USA"));
    }

    #[test]
    fn unknown_string_is_not_a_trigraph() {
        let set = CapcoTokenSet;
        assert!(!set.is_trigraph("XYZ_NOT_A_COUNTRY"));
    }

    #[test]
    fn correction_vocab_returns_sorted_nonempty_slice() {
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(!vocab.is_empty(), "correction vocab must not be empty");
        assert_strictly_sorted(vocab, "correction_vocab must be strictly sorted");
    }

    #[test]
    fn correction_vocab_contains_core_classification_tokens() {
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["SECRET", "CONFIDENTIAL", "UNCLASSIFIED"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab must contain {expected:?}"
            );
        }
    }

    #[test]
    fn correction_vocab_excludes_non_ic_dissem_caveats() {
        // Regression guard for the non-IC dissem deny-list invariant.
        // ODNI's `CVEnumISMDissem.xml` is a UNION enum bundling IC
        // dissem controls (CAPCO source 1) with the ISOO CUI Registry
        // caveat tail (AC, AWP, DL_ONLY, FED_ONLY, FEDCON, NOCON) and
        // the DOD-SAP `WAIVED` entry. CAPCO-2016 line 283 explicitly
        // disclaims caveats from its scope. The `build.rs` of
        // `marque-ism` deny-lists those seven tokens so they never
        // enter the IC `DissemControl` enum or `ALL_CVE_TOKENS`. This
        // test pins that invariant — a future schema-update bump that
        // re-introduces them, or a deny-list typo, fails here loudly
        // rather than silently broadening the CAPCO grammar to accept
        // caveats as IC dissem controls.
        //
        // Tracking issue for the broader caveat / second-banner-line
        // data model: github.com/marquetools/marque#128.
        let vocab = CapcoTokenSet.correction_vocab();
        for forbidden in &[
            "WAIVED", "AC", "AWP", "DL_ONLY", "FED_ONLY", "FEDCON", "NOCON",
        ] {
            assert!(
                vocab.binary_search(forbidden).is_err(),
                "correction_vocab MUST NOT contain {forbidden:?} — \
                 it is a non-IC caveat (CAPCO-2016 line 283 \
                 disclaimer) that should be filtered by build.rs's \
                 NON_IC_DISSEM_DENY_LIST"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_dissem_banner_long_forms() {
        // Issue #133 root cause #1: the fuzzy matcher saw only
        // `ALL_CVE_TOKENS`, which carries the dissem **portion**
        // abbreviations (NF, OC, PR) plus `RELIDO`/`FISA`/`FOUO`,
        // but not the banner long forms (NOFORN, ORCON, PROPIN,
        // EXDIS, NODIS, …). So `NOFORON` had no edit-distance
        // candidate and the decoder discarded it. The extended
        // vocab pulls every entry's banner form from
        // `marking_forms::MARKING_FORMS`, with the strict parser's
        // `parse_dissem_full_form` then normalizing the matched
        // long form to the canonical portion enum.
        assert_vocab_contains(
            &[
                "NOFORN",
                "ORCON",
                "ORCON-USGOV",
                "IMCON",
                "PROPIN",
                "RSEN",
                "LIMDIS",
                "EXDIS",
                "NODIS",
            ],
            "banner long form per CAPCO-2016 §G.1 Table 4 \
             (issue #133 root cause #1)",
        );
    }

    #[test]
    fn correction_vocab_keeps_ic_dissem_controls() {
        // Companion to `correction_vocab_excludes_non_ic_dissem_caveats`:
        // make sure the deny-list didn't take a real IC dissem control
        // with it. Every entry below appears in CAPCO-2016 §A.5 page 38
        // as an IC dissem (or §H.8 for the per-marking detail page);
        // RAWFISA + EXEMPT_FROM_ICD501_DISCOVERY are post-CAPCO-2016
        // additions in the live ICRM XML, kept by the deny-list-rather-
        // than-allowlist approach so future IC additions flow through
        // automatically.
        assert_vocab_contains(
            &[
                "RS",
                "FOUO",
                "OC",
                "OC-USGOV",
                "IMC",
                "NF",
                "PR",
                "REL",
                "RELIDO",
                "EYES",
                "DSEN",
                "RAWFISA",
                "FISA",
                "DISPLAYONLY",
                "EXEMPT_FROM_ICD501_DISCOVERY",
            ],
            "IC dissem control per CAPCO-2016 §A.5 / §H.8 or \
             a post-2016 ICRM addition",
        );
    }

    #[test]
    fn correction_vocab_contains_top_classification_keyword() {
        // Issue #133 PR 8: bare `TOP` lives outside `ALL_CVE_TOKENS`
        // because the CVE schema only lists the full multi-word
        // `TOP SECRET` classification entry. The decoder's
        // `scan_token` whitespace tokenizer arrives at the fuzzy
        // matcher with `TPP` (or other 3/4-char typos) as a
        // standalone token, so without `TOP` in the correction vocab
        // there's no fuzzy target and the candidate gets dropped.
        // Adding `TOP` here lets the standard edit-distance fuzzy
        // path recover `TPP→TOP` (dist 1), `UOP→TOP` (dist 1),
        // `TDOP→TOP` (dist 1, 4-char input via length-diff filter),
        // `QTOP→TOP` (dist 1), and `TOPW→TOP` (dist 1). Strict
        // parser then re-joins `TOP SECRET` into the canonical
        // multi-word classification.
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(
            vocab.binary_search(&"TOP").is_ok(),
            "correction_vocab MUST contain bare \"TOP\" — issue #133 PR 8 \
             classification typo recovery target",
        );
    }

    #[test]
    fn correction_vocab_contains_sar_structural_keywords() {
        // Issue #133 PR 6: the SAR indicator keywords (`SPECIAL`,
        // `ACCESS`) live outside `ALL_CVE_TOKENS` because the ODNI
        // `CVEnumISMSAR.xml` is empty (SAR program identifiers are
        // agency-assigned and not centrally registered). The structural
        // SAR parser handles the `SPECIAL ACCESS REQUIRED-` indicator
        // as a literal string match, but the fuzzy matcher had no
        // vocabulary entry for `SPECIAL` or `ACCESS` — so an OCR typo
        // like `SPCIAL` (distance 1 from `SPECIAL`) produced no
        // correction, the token survived as `TokenKind::Unknown`,
        // and the decoder discarded the candidate via step 3a's
        // Unknown-span filter. This test pins the fix.
        //
        // `REQUIRED` and `SAR` are deliberately NOT in this list —
        // they are always glued to a program nickname / identifier
        // (`REQUIRED-BUTTER`, `SAR-BP-J12`) inside one `scan_token`
        // chunk, so adding them to the vocab is a no-op for the hot
        // path. See `SAR_STRUCTURAL_KEYWORDS` doc comment.
        assert_vocab_contains(
            &["ACCESS", "SPECIAL"],
            "SAR structural keyword per CAPCO-2016 §H.5 p100 \
             (issue #133 PR 6)",
        );
    }

    #[test]
    fn correction_vocab_contains_aea_sci_structural_keywords() {
        // PR #256: AEA/SCI long-title structural keywords added so the fuzzy
        // matcher can recover OCR typos in "FORMERLY RESTRICTED DATA" (FRD,
        // §H.6) and "TALENT KEYHOLE" (TK, §H.4 p71). `NUCLEAR` is
        // intentionally excluded — see `AEA_SCI_STRUCTURAL_KEYWORDS` doc
        // comment.
        assert_vocab_contains(
            &["FORMERLY", "KEYHOLE", "TALENT"],
            "AEA/SCI structural keyword per CAPCO-2016 §H.6 / §H.4 p71 \
             (PR #256)",
        );
    }
}