Skip to main content

marque_ism/
token_set.rs

//! Lazily built Aho-Corasick automaton over the CVE token vocabulary.
//!
//! The automaton is built from all known CVE tokens at run time, on first
//! access (via `LazyLock`) rather than at compile time. Token lookups are
//! exposed to the parser through the `TokenSet` trait, implemented here by
//! `CapcoTokenSet`.
5
6use aho_corasick::AhoCorasick;
7use std::sync::LazyLock;
8
9use crate::generated::values;
10
/// Lookup contract between the parser and the CVE token vocabulary.
///
/// `CapcoTokenSet` provides the implementation in this module, and an
/// implementation is injected at engine init. The `Send + Sync` supertraits
/// mean every implementor can be shared across threads.
pub trait TokenSet: Send + Sync {
    /// Looks `token` up in the CVE vocabulary.
    ///
    /// Yields the canonical `'static` spelling when the token is known and
    /// `None` otherwise.
    fn canonicalize(&self, token: &str) -> Option<&'static str>;

    /// Reports whether `token` is one of the known country trigraphs.
    fn is_trigraph(&self, token: &str) -> bool;
}
20
21/// Aho-Corasick automaton over all CVE tokens — built once from generated data.
22static AUTOMATON: LazyLock<AhoCorasick> = LazyLock::new(|| {
23    AhoCorasick::builder()
24        .ascii_case_insensitive(false) // markings are case-sensitive
25        .build(values::ALL_CVE_TOKENS)
26        .expect("CVE token automaton construction failed")
27});
28
/// Stateless handle onto the generated CVE token tables.
///
/// The type has no fields, so values are free to construct, copy and share.
/// The standard derives let it be embedded in `Debug`/`Default` containers
/// without hand-written impls.
#[derive(Debug, Clone, Copy, Default)]
pub struct CapcoTokenSet;
30
31impl TokenSet for CapcoTokenSet {
32    fn canonicalize(&self, token: &str) -> Option<&'static str> {
33        // `ALL_CVE_TOKENS` is emitted sorted and deduplicated by build.rs,
34        // so an O(log n) binary search is correct and faster than the
35        // previous O(n) linear scan.
36        values::ALL_CVE_TOKENS
37            .binary_search(&token)
38            .ok()
39            .map(|i| values::ALL_CVE_TOKENS[i])
40    }
41
42    fn is_trigraph(&self, token: &str) -> bool {
43        // TRIGRAPHS is emitted sorted and deduplicated by build.rs, so
44        // binary_search is O(log n) over ~340 entries instead of the old
45        // O(n) `.contains()` linear scan. Hot path for every REL TO parse.
46        values::TRIGRAPHS.binary_search(&token).is_ok()
47    }
48}
49
50impl CapcoTokenSet {
51    /// Returns a reference to the Aho-Corasick automaton built from all CVE tokens.
52    /// Reserved for Phase 2 multi-pattern matching when per-token spans are wired.
53    #[allow(dead_code)]
54    pub(crate) fn automaton() -> &'static AhoCorasick {
55        &AUTOMATON
56    }
57}
58
#[cfg(test)]
mod tests {
    use super::*;

    /// `canonicalize` binary-searches `ALL_CVE_TOKENS`, which is only valid
    /// when every entry is strictly less than its successor.
    #[test]
    fn all_cve_tokens_are_sorted_and_unique() {
        let tokens = values::ALL_CVE_TOKENS;
        for (prev, next) in tokens.iter().zip(tokens.iter().skip(1)) {
            assert!(
                prev < next,
                "ALL_CVE_TOKENS is not strictly sorted: {:?} >= {:?}",
                prev,
                next,
            );
        }
    }

    /// `is_trigraph` binary-searches `TRIGRAPHS`, so that slice must be
    /// strictly ascending as well. build.rs is expected to collect through a
    /// `BTreeSet`; this guards that contract should a future ODNI XSD update
    /// reorder the source data.
    #[test]
    fn trigraphs_are_sorted_and_unique() {
        let trigraphs = values::TRIGRAPHS;
        for (prev, next) in trigraphs.iter().zip(trigraphs.iter().skip(1)) {
            assert!(
                prev < next,
                "TRIGRAPHS is not strictly sorted: {:?} >= {:?}",
                prev,
                next,
            );
        }
    }

    #[test]
    fn canonicalize_returns_known_token() {
        // SECRET is in the banner-words we always emit.
        let canonical = CapcoTokenSet.canonicalize("SECRET");
        assert_eq!(canonical, Some("SECRET"));
    }

    #[test]
    fn canonicalize_returns_none_for_unknown() {
        let canonical = CapcoTokenSet.canonicalize("BANANAPHONE");
        assert_eq!(canonical, None);
    }

    #[test]
    fn usa_is_a_known_trigraph() {
        let known = CapcoTokenSet.is_trigraph("USA");
        assert!(known);
    }

    #[test]
    fn unknown_string_is_not_a_trigraph() {
        let known = CapcoTokenSet.is_trigraph("XYZ_NOT_A_COUNTRY");
        assert!(!known);
    }
}