marque-ism 0.2.1

ISM vocabulary types, generated CVE enums, and Span for marque
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
// SPDX-FileCopyrightText: 2026 Knitli Inc.
//
// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0

//! Compile-time Aho-Corasick automaton over CVE token vocabulary.
//!
//! The automaton is built from all known CVE tokens at startup (via LazyLock)
//! and injected into the parser as a `TokenSet` implementation.

use aho_corasick::AhoCorasick;
use std::sync::LazyLock;

use crate::generated::values;
use crate::marking_forms::MARKING_FORMS;

/// Minimal interface the parser needs from the token set.
/// Implemented by `CapcoTokenSet`; injected at engine init.
pub trait TokenSet: Send + Sync {
    /// Returns the canonical token string if `token` is a known CVE value.
    fn canonicalize(&self, token: &str) -> Option<&'static str>;

    /// Returns true if `token` is a known country trigraph.
    fn is_trigraph(&self, token: &str) -> bool;

    /// Returns the vocabulary slice used for fuzzy correction lookups.
    ///
    /// This is the token vocabulary against which unknown tokens are compared
    /// by the `marque_core::fuzzy` module. Must be sorted and deduplicated
    /// (binary search is used for the "is already valid" check).
    ///
    /// The returned slice is borrowed from the implementor, which allows
    /// implementations to hold the vocabulary on `self` (e.g., in a `Vec`
    /// built at construction time) rather than in a global static. Each
    /// entry is `&'static str` because the fuzzy matcher returns canonical
    /// tokens with `'static` lifetime in `FuzzyCorrection::token`.
    ///
    /// The default implementation returns an empty slice, disabling fuzzy
    /// correction for external `TokenSet` implementors that do not override it.
    fn correction_vocab(&self) -> &[&'static str] {
        &[]
    }
}

/// Aho-Corasick automaton over all CVE tokens — built once from generated data.
static AUTOMATON: LazyLock<AhoCorasick> = LazyLock::new(|| {
    AhoCorasick::builder()
        .ascii_case_insensitive(false) // markings are case-sensitive
        .build(values::ALL_CVE_TOKENS)
        .expect("CVE token automaton construction failed")
});

/// Classification structural keywords not present as standalone
/// entries in `ALL_CVE_TOKENS`. Issue #133 PR 8.
///
/// `TOP SECRET` is in `ALL_CVE_TOKENS` as a single multi-word entry,
/// but the bare `TOP` is not — and the decoder's `scan_token`
/// tokenizer splits on whitespace, so an input like `TPP SECRET`
/// arrives at the fuzzy matcher as the standalone token `TPP` with
/// no `TOP` correction target available. Adding `TOP` to the fuzzy
/// vocab lets the standard edit-distance path recover the
/// `TPP→TOP`, `UOP→TOP`, `TDOP→TOP`, `QTOP→TOP`, `TOPW→TOP` family
/// of typos seen in the SC-004 mangled corpus. The strict parser
/// then re-joins `TOP SECRET` into the canonical multi-word
/// classification.
///
/// Round-trip safety: a fuzzy-correction returning `TOP` for an
/// input typo lands as the bare token `TOP`, which the strict
/// parser combines with the following `SECRET` token into
/// `MarkingClassification::Us(Classification::TopSecret)` via the
/// usual two-word classification path. Round-trip pinned by the
/// PR-8 integration tests in `decoder_recovery.rs`.
const CLASSIFICATION_STRUCTURAL_KEYWORDS: &[&str] = &["TOP"];

/// NATO classification structural keywords not present in `ALL_CVE_TOKENS`.
///
/// NATO-specific classification words appear in multi-word forms that the
/// strict parser recognises: `COSMIC TOP SECRET`, `COSMIC TOP SECRET-BOHEMIA`,
/// `COSMIC TOP SECRET-BALK`, `COSMIC TOP SECRET ATOMAL`. Like `TOP` above,
/// the decoder's whitespace tokenizer splits these multi-word forms so each
/// word arrives individually at the fuzzy matcher. Without these entries in
/// the correction vocab, OCR/transcription typos (`COSMID`, `BOHFMIA`,
/// `ATOAML`, `BBLE`) produce `TokenKind::Unknown` spans and the decoder
/// discards the candidate.
///
/// Round-trip safety: the strict parser in `marque-core` recognises each
/// multi-word NATO form and maps it to the corresponding
/// `MarkingClassification::NonUs(NatoClassification::*)` variant, so a
/// fuzzy-corrected `COSMIC` / `BOHEMIA` / `ATOMAL` / `BALK` token lands
/// on the correct classification after strict parsing.
///
/// Authority: CAPCO-2016 §H.7 p147–148 (NATO classification markings).
const NATO_CLASSIFICATION_KEYWORDS: &[&str] = &["ATOMAL", "BALK", "BOHEMIA", "COSMIC"];

/// SAR structural keywords (CAPCO-2016 §H.5 p100, "SAR-" indicator and
/// "SPECIAL ACCESS REQUIRED-" full form), included in the fuzzy
/// correction vocabulary so OCR/transcription typos in the indicator
/// keywords (`SPCIAL`, `CCESS`, `SEPCIAL`, etc.) get corrected to the
/// canonical form before the strict SAR parser's literal `starts_with`
/// matches in `crates/core/src/parser.rs::parse_sar_category` run.
///
/// These keywords are NOT in `ALL_CVE_TOKENS` because the ODNI
/// `CVEnumISMSAR.xml` is empty — SAR program identifiers are
/// agency-assigned codewords not centrally registered. The structural
/// SAR parser handles `SAR-`/`SPECIAL ACCESS REQUIRED-` as fixed
/// literal indicator strings, but the fuzzy matcher had no way to
/// recover a typo in those keywords because they weren't in any
/// vocabulary it consulted. Issue #133 PR 6.
///
/// `REQUIRED` is intentionally excluded: in real corpus inputs it is
/// always followed immediately by `-<program-nickname>` (e.g.,
/// `REQUIRED-BUTTER`), and the decoder's `scan_token` includes
/// internal hyphens in a single token, so `REQUIRED-BUTTER` is one
/// 14-character token that no fuzzy correction targeting `REQUIRED`
/// (8 chars) can reach within `MAX_EDIT_DISTANCE = 2`. Adding
/// `REQUIRED` would be a no-op for this hot path; if a future
/// fixture surfaces with `REQUIRED` as an isolated token (e.g.,
/// `SPECIAL ACCESS REQUIRED -BUTTER`), revisit. `SAR` is similarly
/// excluded because it is always glued to a program identifier
/// (`SAR-BP-J12`) — see `try_sar_indicator_repair` in
/// `crates/engine/src/decoder.rs` for the structural prefix-strip /
/// missing-hyphen path that handles `USAR-BP` / `SARBP`.
const SAR_STRUCTURAL_KEYWORDS: &[&str] = &["ACCESS", "SPECIAL"];

/// AEA and SCI structural keywords not present in `ALL_CVE_TOKENS`.
///
/// These individual words appear as components of multi-word Marking Titles
/// that the strict parser recognises. The decoder's whitespace tokeniser
/// splits them, so each word arrives at the fuzzy matcher independently. Without
/// these entries, OCR/transcription typos (`TAELNT`, `TALNET`, `FRMERLY`,
/// `KEYOLE`) produce `TokenKind::Unknown` spans that cause the decoder to
/// discard the candidate.
///
/// **TALENT / KEYHOLE** (§H.4 p73): The full Marking Title for TK is "TALENT
/// KEYHOLE". OCR commonly mangles individual words of long titles; having both
/// bare words in the vocab lets `TAELNT KEYHOLE` → `TALENT KEYHOLE` → `TK`.
///
/// **FORMERLY** (§H.6 p116): The full Marking Title for FRD is "FORMERLY
/// RESTRICTED DATA". A typo like `FRMERLY RESTRICTED DATA` arrives at the
/// fuzzy matcher as the token `FRMERLY`; without `FORMERLY` in the vocab the
/// decoder cannot recover it.
///
/// Round-trip safety: the strict parser already handles `TALENT KEYHOLE` →
/// `TK` and `FORMERLY RESTRICTED DATA` → `FRD` via `parse_sci_block` /
/// `title_to_portion` paths respectively, so fuzzy-corrected tokens land at
/// the expected parsed values without further changes.
///
/// Note: `NUCLEAR` (appears in CNWDI/TFNI/DOD-UCNI/DOE-UCNI titles) is
/// intentionally excluded — it is a very common English word and would produce
/// excessive false-positive fuzzy corrections on unrelated text.
const AEA_SCI_STRUCTURAL_KEYWORDS: &[&str] = &["FORMERLY", "KEYHOLE", "TALENT"];

/// Extended fuzzy-correction vocabulary: `ALL_CVE_TOKENS` ∪ banner long forms
/// from [`MARKING_FORMS`] ∪ [`SAR_STRUCTURAL_KEYWORDS`] ∪
/// [`CLASSIFICATION_STRUCTURAL_KEYWORDS`] ∪ [`NATO_CLASSIFICATION_KEYWORDS`] ∪
/// [`AEA_SCI_STRUCTURAL_KEYWORDS`],
/// sorted and deduplicated.
///
/// `ALL_CVE_TOKENS` carries only the **portion-form** abbreviations
/// (`NF`, `OC`, `PR`, `XD`, `ND`) and a handful of single-form tokens
/// (`RELIDO`, `FISA`, `FOUO`). The banner long forms — which are valid
/// inputs the strict parser handles via
/// [`crate::marking_forms::banner_to_portion`] — were missing from the
/// vocabulary the fuzzy matcher consults, so an OCR/transcription typo
/// like `NOFORON` (distance 1 from `NOFORN`) found no correction target
/// and the decoder discarded it as `TokenKind::Unknown`. See issue #133.
///
/// Round-trip safety: the strict parser's `parse_dissem_full_form` and
/// `parse_non_ic_full_form` already accept the banner forms here and
/// translate them to the canonical portion enum, so a fuzzy correction
/// returning `NOFORN` (rather than `NF`) lands on the same final
/// [`crate::DissemControl::Nf`] after strict parsing. The SAR
/// structural keywords (`SAR_STRUCTURAL_KEYWORDS`) are similarly
/// round-trip safe: `parse_sar_category` accepts the canonical
/// `SPECIAL ACCESS REQUIRED-` indicator literally, so a correction
/// returning `SPECIAL` for `SPCIAL` lands at the same `SarMarking`
/// after strict parsing.
///
/// Multi-word banner forms (`DEA SENSITIVE`, `SBU NOFORN`,
/// `LES NOFORN`, `DOD UCNI`, `DOE UCNI`) are retained intentionally.
/// The decoder's per-token fuzzy tokenizer (`scan_token` in
/// `crates/engine/src/decoder.rs`) splits raw input on whitespace, so
/// these never appear as a single *input* token to the matcher — but
/// fuzzy correction can still emit one of them as the canonical
/// *output* for a whitespace-free typo (e.g., `SBUNOFORN` →
/// `SBU NOFORN`, distance 1, single-character insertion of the
/// space). The strict parser then accepts the corrected multi-word
/// form via `parse_non_ic_full_form` / `parse_dissem_full_form` and
/// translates it to the canonical portion enum, so the round-trip
/// lands at the expected `NonIcDissem::SbuNf` (or peer). Pinned by
/// `marque-core::fuzzy::tests::real_vocab_emits_multi_word_banner_for_whitespace_free_typo`.
static EXTENDED_CORRECTION_VOCAB: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    let mut v: Vec<&'static str> = values::ALL_CVE_TOKENS.to_vec();
    for f in MARKING_FORMS {
        v.push(f.banner);
    }
    v.extend_from_slice(SAR_STRUCTURAL_KEYWORDS);
    v.extend_from_slice(CLASSIFICATION_STRUCTURAL_KEYWORDS);
    v.extend_from_slice(NATO_CLASSIFICATION_KEYWORDS);
    v.extend_from_slice(AEA_SCI_STRUCTURAL_KEYWORDS);
    v.sort();
    v.dedup();
    v
});

pub struct CapcoTokenSet;

impl TokenSet for CapcoTokenSet {
    fn canonicalize(&self, token: &str) -> Option<&'static str> {
        // `ALL_CVE_TOKENS` is emitted sorted and deduplicated by build.rs,
        // so an O(log n) binary search is correct and faster than the
        // previous O(n) linear scan.
        values::ALL_CVE_TOKENS
            .binary_search(&token)
            .ok()
            .map(|i| values::ALL_CVE_TOKENS[i])
    }

    fn is_trigraph(&self, token: &str) -> bool {
        // TRIGRAPHS is emitted sorted and deduplicated by build.rs, so
        // binary_search is O(log n) over ~340 entries instead of the old
        // O(n) `.contains()` linear scan. Hot path for every REL TO parse.
        values::TRIGRAPHS.binary_search(&token).is_ok()
    }

    fn correction_vocab(&self) -> &[&'static str] {
        EXTENDED_CORRECTION_VOCAB.as_slice()
    }
}

impl CapcoTokenSet {
    /// Returns a reference to the Aho-Corasick automaton built from all CVE tokens.
    /// Reserved for Phase 2 multi-pattern matching when per-token spans are wired.
    #[allow(dead_code)]
    pub(crate) fn automaton() -> &'static AhoCorasick {
        &AUTOMATON
    }
}

#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    #[test]
    fn all_cve_tokens_are_sorted_and_unique() {
        let tokens = values::ALL_CVE_TOKENS;
        for window in tokens.windows(2) {
            assert!(
                window[0] < window[1],
                "ALL_CVE_TOKENS is not strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn trigraphs_are_sorted_and_unique() {
        // `is_trigraph` relies on binary_search, so the slice must be
        // strictly-sorted. If a future ODNI XSD update shuffles the order,
        // build.rs collects into a BTreeSet and this test catches any
        // regression of that contract.
        let trigraphs = values::TRIGRAPHS;
        for window in trigraphs.windows(2) {
            assert!(
                window[0] < window[1],
                "TRIGRAPHS is not strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn canonicalize_returns_known_token() {
        let set = CapcoTokenSet;
        // SECRET is in the banner-words we always emit.
        assert_eq!(set.canonicalize("SECRET"), Some("SECRET"));
    }

    #[test]
    fn canonicalize_returns_none_for_unknown() {
        let set = CapcoTokenSet;
        assert_eq!(set.canonicalize("BANANAPHONE"), None);
    }

    #[test]
    fn usa_is_a_known_trigraph() {
        let set = CapcoTokenSet;
        assert!(set.is_trigraph("USA"));
    }

    #[test]
    fn unknown_string_is_not_a_trigraph() {
        let set = CapcoTokenSet;
        assert!(!set.is_trigraph("XYZ_NOT_A_COUNTRY"));
    }

    #[test]
    fn correction_vocab_returns_sorted_nonempty_slice() {
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(!vocab.is_empty(), "correction vocab must not be empty");
        for window in vocab.windows(2) {
            assert!(
                window[0] < window[1],
                "correction_vocab must be strictly sorted: {:?} >= {:?}",
                window[0],
                window[1],
            );
        }
    }

    #[test]
    fn correction_vocab_contains_core_classification_tokens() {
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["SECRET", "CONFIDENTIAL", "UNCLASSIFIED"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab must contain {expected:?}"
            );
        }
    }

    #[test]
    fn correction_vocab_excludes_non_ic_dissem_caveats() {
        // Regression guard for the non-IC dissem deny-list invariant.
        // ODNI's `CVEnumISMDissem.xml` is a UNION enum bundling IC
        // dissem controls (CAPCO source 1) with the ISOO CUI Registry
        // caveat tail (AC, AWP, DL_ONLY, FED_ONLY, FEDCON, NOCON) and
        // the DOD-SAP `WAIVED` entry. CAPCO-2016 line 283 explicitly
        // disclaims caveats from its scope. The `build.rs` of
        // `marque-ism` deny-lists those seven tokens so they never
        // enter the IC `DissemControl` enum or `ALL_CVE_TOKENS`. This
        // test pins that invariant — a future schema-update bump that
        // re-introduces them, or a deny-list typo, fails here loudly
        // rather than silently broadening the CAPCO grammar to accept
        // caveats as IC dissem controls.
        //
        // Tracking issue for the broader caveat / second-banner-line
        // data model: github.com/marquetools/marque#128.
        let vocab = CapcoTokenSet.correction_vocab();
        for forbidden in &[
            "WAIVED", "AC", "AWP", "DL_ONLY", "FED_ONLY", "FEDCON", "NOCON",
        ] {
            assert!(
                vocab.binary_search(forbidden).is_err(),
                "correction_vocab MUST NOT contain {forbidden:?} — \
                 it is a non-IC caveat (CAPCO-2016 line 283 \
                 disclaimer) that should be filtered by build.rs's \
                 NON_IC_DISSEM_DENY_LIST"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_dissem_banner_long_forms() {
        // Issue #133 root cause #1: the fuzzy matcher saw only
        // `ALL_CVE_TOKENS`, which carries the dissem **portion**
        // abbreviations (NF, OC, PR) plus `RELIDO`/`FISA`/`FOUO`,
        // but not the banner long forms (NOFORN, ORCON, PROPIN,
        // EXDIS, NODIS, …). So `NOFORON` had no edit-distance
        // candidate and the decoder discarded it. The extended
        // vocab pulls every entry's banner form from
        // `marking_forms::MARKING_FORMS`, with the strict parser's
        // `parse_dissem_full_form` then normalizing the matched
        // long form to the canonical portion enum.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &[
            "NOFORN",
            "ORCON",
            "ORCON-USGOV",
            "IMCON",
            "PROPIN",
            "RSEN",
            "LIMDIS",
            "EXDIS",
            "NODIS",
        ] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 banner long form per CAPCO-2016 §G.1 Table 4 \
                 (issue #133 root cause #1)"
            );
        }
    }

    #[test]
    fn correction_vocab_keeps_ic_dissem_controls() {
        // Companion to `correction_vocab_excludes_non_ic_dissem_caveats`:
        // make sure the deny-list didn't take a real IC dissem control
        // with it. Every entry below appears in CAPCO-2016 §A.5 page 38
        // as an IC dissem (or §H.8 for the per-marking detail page);
        // RAWFISA + EXEMPT_FROM_ICD501_DISCOVERY are post-CAPCO-2016
        // additions in the live ICRM XML, kept by the deny-list-rather-
        // than-allowlist approach so future IC additions flow through
        // automatically.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &[
            "RS",
            "FOUO",
            "OC",
            "OC-USGOV",
            "IMC",
            "NF",
            "PR",
            "REL",
            "RELIDO",
            "EYES",
            "DSEN",
            "RAWFISA",
            "FISA",
            "DISPLAYONLY",
            "EXEMPT_FROM_ICD501_DISCOVERY",
        ] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 IC dissem control per CAPCO-2016 §A.5 / §H.8 or \
                 a post-2016 ICRM addition"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_top_classification_keyword() {
        // Issue #133 PR 8: bare `TOP` lives outside `ALL_CVE_TOKENS`
        // because the CVE schema only lists the full multi-word
        // `TOP SECRET` classification entry. The decoder's
        // `scan_token` whitespace tokenizer arrives at the fuzzy
        // matcher with `TPP` (or other 3/4-char typos) as a
        // standalone token, so without `TOP` in the correction vocab
        // there's no fuzzy target and the candidate gets dropped.
        // Adding `TOP` here lets the standard edit-distance fuzzy
        // path recover `TPP→TOP` (dist 1), `UOP→TOP` (dist 1),
        // `TDOP→TOP` (dist 1, 4-char input via length-diff filter),
        // `QTOP→TOP` (dist 1), and `TOPW→TOP` (dist 1). Strict
        // parser then re-joins `TOP SECRET` into the canonical
        // multi-word classification.
        let vocab = CapcoTokenSet.correction_vocab();
        assert!(
            vocab.binary_search(&"TOP").is_ok(),
            "correction_vocab MUST contain bare \"TOP\" — issue #133 PR 8 \
             classification typo recovery target",
        );
    }

    #[test]
    fn correction_vocab_contains_sar_structural_keywords() {
        // Issue #133 PR 6: the SAR indicator keywords (`SPECIAL`,
        // `ACCESS`) live outside `ALL_CVE_TOKENS` because the ODNI
        // `CVEnumISMSAR.xml` is empty (SAR program identifiers are
        // agency-assigned and not centrally registered). The structural
        // SAR parser handles the `SPECIAL ACCESS REQUIRED-` indicator
        // as a literal string match, but the fuzzy matcher had no
        // vocabulary entry for `SPECIAL` or `ACCESS` — so an OCR typo
        // like `SPCIAL` (distance 1 from `SPECIAL`) produced no
        // correction, the token survived as `TokenKind::Unknown`,
        // and the decoder discarded the candidate via step 3a's
        // Unknown-span filter. This test pins the fix.
        //
        // `REQUIRED` and `SAR` are deliberately NOT in this list —
        // they are always glued to a program nickname / identifier
        // (`REQUIRED-BUTTER`, `SAR-BP-J12`) inside one `scan_token`
        // chunk, so adding them to the vocab is a no-op for the hot
        // path. See `SAR_STRUCTURAL_KEYWORDS` doc comment.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["ACCESS", "SPECIAL"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 SAR structural keyword per CAPCO-2016 §H.5 p100 \
                 (issue #133 PR 6)"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_aea_sci_structural_keywords() {
        // PR #256: AEA/SCI long-title structural keywords added so the fuzzy
        // matcher can recover OCR typos in "FORMERLY RESTRICTED DATA" (FRD,
        // §H.6) and "TALENT KEYHOLE" (TK, §H.4 p71). `NUCLEAR` is
        // intentionally excluded — see `AEA_SCI_STRUCTURAL_KEYWORDS` doc
        // comment.
        let vocab = CapcoTokenSet.correction_vocab();
        for expected in &["FORMERLY", "KEYHOLE", "TALENT"] {
            assert!(
                vocab.binary_search(expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 AEA/SCI structural keyword per CAPCO-2016 §H.6 / §H.4 p71 \
                 (PR #256)"
            );
        }
    }
}