use aho_corasick::AhoCorasick;
use std::sync::LazyLock;
use crate::generated::values;
use crate::marking_forms::MARKING_FORMS;
/// A queryable vocabulary of marking tokens.
///
/// Implementors must be `Send + Sync` so a token set can be shared across
/// threads.
pub trait TokenSet: Send + Sync {
    /// Returns the canonical `'static` spelling of `token` when the set
    /// recognises it, or `None` when it does not.
    fn canonicalize(&self, token: &str) -> Option<&'static str>;

    /// Reports whether `token` is one of the set's known trigraphs.
    fn is_trigraph(&self, token: &str) -> bool;

    /// Returns the vocabulary made available for token correction.
    ///
    /// The provided implementation yields an empty slice; implementors
    /// override it when they have a vocabulary to offer.
    fn correction_vocab(&self) -> &[&'static str] {
        &[]
    }
}
/// Aho-Corasick automaton over every entry of `values::ALL_CVE_TOKENS`,
/// constructed on first access.
///
/// ASCII case-insensitivity is explicitly disabled, so matching is
/// case-sensitive.
///
/// # Panics
///
/// Panics on first access if the automaton cannot be built from the token
/// table.
static AUTOMATON: LazyLock<AhoCorasick> = LazyLock::new(|| {
    let mut builder = AhoCorasick::builder();
    builder.ascii_case_insensitive(false);
    builder
        .build(values::ALL_CVE_TOKENS)
        .expect("CVE token automaton construction failed")
});
// The four keyword groups below are appended to `EXTENDED_CORRECTION_VOCAB`
// on top of the generated CVE tokens and the marking-form banners.

/// Bare classification keyword added to the correction vocabulary. The unit
/// tests describe "TOP" as a classification typo recovery target.
const CLASSIFICATION_STRUCTURAL_KEYWORDS: &[&str] = &["TOP"];
/// Keywords added to the correction vocabulary for NATO classifications.
/// NOTE(review): grouping inferred from the constant's name — confirm against
/// the marking reference.
const NATO_CLASSIFICATION_KEYWORDS: &[&str] = &["ATOMAL", "BALK", "BOHEMIA", "COSMIC"];
/// SAR structural keywords added to the correction vocabulary (the unit tests
/// cite CAPCO-2016 §H.5 p100).
const SAR_STRUCTURAL_KEYWORDS: &[&str] = &["ACCESS", "SPECIAL"];
/// AEA/SCI structural keywords added to the correction vocabulary (the unit
/// tests cite CAPCO-2016 §H.6 / §H.4 p71).
const AEA_SCI_STRUCTURAL_KEYWORDS: &[&str] = &["FORMERLY", "KEYHOLE", "TALENT"];
/// Sorted, de-duplicated correction vocabulary, built on first access.
///
/// It is the union of:
/// * every generated token in `values::ALL_CVE_TOKENS`,
/// * the `banner` string of each entry in `MARKING_FORMS`,
/// * the SAR, classification, NATO, and AEA/SCI structural keyword lists.
///
/// The result is sorted and de-duplicated, so it is strictly ascending and
/// callers may use `binary_search` on it.
static EXTENDED_CORRECTION_VOCAB: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    let mut vocab: Vec<&'static str> = values::ALL_CVE_TOKENS
        .iter()
        .copied()
        .chain(MARKING_FORMS.iter().map(|form| form.banner))
        .chain(SAR_STRUCTURAL_KEYWORDS.iter().copied())
        .chain(CLASSIFICATION_STRUCTURAL_KEYWORDS.iter().copied())
        .chain(NATO_CLASSIFICATION_KEYWORDS.iter().copied())
        .chain(AEA_SCI_STRUCTURAL_KEYWORDS.iter().copied())
        .collect();

    // Sources overlap (e.g. a banner may also be a CVE token), so collapse
    // duplicates after ordering; `dedup` only removes adjacent repeats.
    vocab.sort();
    vocab.dedup();
    vocab
});
/// Zero-sized [`TokenSet`] implementation backed by the generated tables
/// `values::ALL_CVE_TOKENS` and `values::TRIGRAPHS`, with
/// `EXTENDED_CORRECTION_VOCAB` as its correction vocabulary.
pub struct CapcoTokenSet;
impl TokenSet for CapcoTokenSet {
    /// Looks `token` up in `values::ALL_CVE_TOKENS` by exact match and
    /// returns the table's own `'static` entry when found.
    ///
    /// The lookup is a binary search, so it relies on the table being sorted.
    fn canonicalize(&self, token: &str) -> Option<&'static str> {
        match values::ALL_CVE_TOKENS.binary_search(&token) {
            Ok(index) => Some(values::ALL_CVE_TOKENS[index]),
            Err(_) => None,
        }
    }

    /// Returns `true` when `token` is an exact entry of `values::TRIGRAPHS`
    /// (binary search over the sorted table).
    fn is_trigraph(&self, token: &str) -> bool {
        matches!(values::TRIGRAPHS.binary_search(&token), Ok(_))
    }

    /// Exposes the lazily built, sorted `EXTENDED_CORRECTION_VOCAB`.
    fn correction_vocab(&self) -> &[&'static str] {
        &EXTENDED_CORRECTION_VOCAB[..]
    }
}
impl CapcoTokenSet {
#[allow(dead_code)]
pub(crate) fn automaton() -> &'static AhoCorasick {
&AUTOMATON
}
}
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// `canonicalize` binary-searches this table, so it must be strictly
    /// ascending (sorted with no duplicates).
    #[test]
    fn all_cve_tokens_are_sorted_and_unique() {
        for pair in values::ALL_CVE_TOKENS.windows(2) {
            let (left, right) = (pair[0], pair[1]);
            assert!(
                left < right,
                "ALL_CVE_TOKENS is not strictly sorted: {:?} >= {:?}",
                left,
                right,
            );
        }
    }

    /// `is_trigraph` binary-searches this table, so it must be strictly
    /// ascending as well.
    #[test]
    fn trigraphs_are_sorted_and_unique() {
        for pair in values::TRIGRAPHS.windows(2) {
            let (left, right) = (pair[0], pair[1]);
            assert!(
                left < right,
                "TRIGRAPHS is not strictly sorted: {:?} >= {:?}",
                left,
                right,
            );
        }
    }

    #[test]
    fn canonicalize_returns_known_token() {
        let canonical = CapcoTokenSet.canonicalize("SECRET");
        assert_eq!(canonical, Some("SECRET"));
    }

    #[test]
    fn canonicalize_returns_none_for_unknown() {
        let canonical = CapcoTokenSet.canonicalize("BANANAPHONE");
        assert_eq!(canonical, None);
    }

    #[test]
    fn usa_is_a_known_trigraph() {
        assert!(CapcoTokenSet.is_trigraph("USA"));
    }

    #[test]
    fn unknown_string_is_not_a_trigraph() {
        assert!(!CapcoTokenSet.is_trigraph("XYZ_NOT_A_COUNTRY"));
    }

    /// The remaining tests binary-search the vocabulary, which is only valid
    /// if this ordering invariant holds.
    #[test]
    fn correction_vocab_returns_sorted_nonempty_slice() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        assert!(!vocab.is_empty(), "correction vocab must not be empty");
        for pair in vocab.windows(2) {
            let (left, right) = (pair[0], pair[1]);
            assert!(
                left < right,
                "correction_vocab must be strictly sorted: {:?} >= {:?}",
                left,
                right,
            );
        }
    }

    #[test]
    fn correction_vocab_contains_core_classification_tokens() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        for expected in ["SECRET", "CONFIDENTIAL", "UNCLASSIFIED"] {
            assert!(
                vocab.binary_search(&expected).is_ok(),
                "correction_vocab must contain {expected:?}"
            );
        }
    }

    #[test]
    fn correction_vocab_excludes_non_ic_dissem_caveats() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        let denied = [
            "WAIVED", "AC", "AWP", "DL_ONLY", "FED_ONLY", "FEDCON", "NOCON",
        ];
        for forbidden in denied {
            assert!(
                vocab.binary_search(&forbidden).is_err(),
                "correction_vocab MUST NOT contain {forbidden:?} — \
                 it is a non-IC caveat (CAPCO-2016 line 283 \
                 disclaimer) that should be filtered by build.rs's \
                 NON_IC_DISSEM_DENY_LIST"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_dissem_banner_long_forms() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        let banner_long_forms = [
            "NOFORN",
            "ORCON",
            "ORCON-USGOV",
            "IMCON",
            "PROPIN",
            "RSEN",
            "LIMDIS",
            "EXDIS",
            "NODIS",
        ];
        for expected in banner_long_forms {
            assert!(
                vocab.binary_search(&expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 banner long form per CAPCO-2016 §G.1 Table 4 \
                 (issue #133 root cause #1)"
            );
        }
    }

    #[test]
    fn correction_vocab_keeps_ic_dissem_controls() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        let ic_dissem_controls = [
            "RS",
            "FOUO",
            "OC",
            "OC-USGOV",
            "IMC",
            "NF",
            "PR",
            "REL",
            "RELIDO",
            "EYES",
            "DSEN",
            "RAWFISA",
            "FISA",
            "DISPLAYONLY",
            "EXEMPT_FROM_ICD501_DISCOVERY",
        ];
        for expected in ic_dissem_controls {
            assert!(
                vocab.binary_search(&expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 IC dissem control per CAPCO-2016 §A.5 / §H.8 or \
                 a post-2016 ICRM addition"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_top_classification_keyword() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        assert!(
            vocab.binary_search(&"TOP").is_ok(),
            "correction_vocab MUST contain bare \"TOP\" — issue #133 PR 8 \
             classification typo recovery target",
        );
    }

    #[test]
    fn correction_vocab_contains_sar_structural_keywords() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        for expected in ["ACCESS", "SPECIAL"] {
            assert!(
                vocab.binary_search(&expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 SAR structural keyword per CAPCO-2016 §H.5 p100 \
                 (issue #133 PR 6)"
            );
        }
    }

    #[test]
    fn correction_vocab_contains_aea_sci_structural_keywords() {
        let token_set = CapcoTokenSet;
        let vocab = token_set.correction_vocab();
        for expected in ["FORMERLY", "KEYHOLE", "TALENT"] {
            assert!(
                vocab.binary_search(&expected).is_ok(),
                "correction_vocab MUST contain {expected:?} — \
                 AEA/SCI structural keyword per CAPCO-2016 §H.6 / §H.4 p71 \
                 (PR #256)"
            );
        }
    }
}