// marque_ism/attrs.rs

//! `IsmAttributes` — the canonical in-memory representation of a classification marking.
//!
//! Mirrors the IC ISM XML attribute model. Every source format (free text, XML, web forms)
//! normalizes into this struct before rule validation.
//!
//! # Type design
//! Multi-value fields use `Box<[T]>` rather than `Vec<T>` to avoid over-allocation
//! after parsing. Most markings have 0–4 values per field.
//!
//! # Code generation
//! The CVE enum types (`SciControl`, `DissemControl`, `DeclassExemption`, `SarIdentifier`)
//! are generated by `build.rs` from ODNI CVE XML files into `crate::generated::values`
//! and re-exported from this module.

use std::fmt;

use crate::generated::values;
use crate::span::Span;

// Re-export generated enum types for convenience.
pub use values::{DeclassExemption, DissemControl, SarIdentifier, SciControl};

21/// Canonical in-memory representation of an IC classification marking.
22///
23/// Produced by `marque-core::parser` from scanner candidates.
24/// Consumed by `marque-rules::Rule` implementations for validation.
25#[non_exhaustive]
26#[derive(Debug, Clone, Default, PartialEq, Eq)]
27pub struct IsmAttributes {
28    /// Primary classification level. `None` means parsing failed to identify one.
29    pub classification: Option<Classification>,
30
31    /// SCI controls (e.g., SI, TK, HCS). Ordered per CAPCO block ordering rules.
32    pub sci_controls: Box<[SciControl]>,
33
34    /// Special Access Required identifiers.
35    pub sar_identifiers: Box<[SarIdentifier]>,
36
37    /// Dissemination controls (e.g., NOFORN, RELIDO, ORCON, FISA).
38    pub dissem_controls: Box<[DissemControl]>,
39
40    /// REL TO country trigraphs. USA must be present and first if non-empty.
41    pub rel_to: Box<[Trigraph]>,
42
43    /// Declassification date from CAB (free text, e.g., "20331231").
44    pub declassify_on: Option<Box<str>>,
45
46    /// Free-text "Classified By" identifier from CAB.
47    pub classified_by: Option<Box<str>>,
48
49    /// Free-text "Derived From" source from CAB.
50    pub derived_from: Option<Box<str>>,
51
52    /// Declassification exemption code from CAB (e.g., 25X1, 50X1-HUM).
53    pub declass_exemption: Option<DeclassExemption>,
54
55    /// Per-token byte spans into the *original source buffer*, recorded by
56    /// the parser as it walks the marking string. Phase 3 added this so
57    /// rules can point at the exact offending byte range instead of the
58    /// whole marking. Empty for CAB markings (CAB parsing is line-structured
59    /// and doesn't go through the token-walking path).
60    ///
61    /// Indexing convention: `token_spans` is in document order. To find the
62    /// span for the Nth `DissemControl`, walk the slice and pick the Nth
63    /// entry whose `kind == TokenKind::DissemControl`.
64    pub token_spans: Box<[TokenSpan]>,
65}
66
67/// One parser-recognized token plus its byte span in the original source.
68///
69/// Used by Phase 3 rules to surface byte-precise diagnostic spans without
70/// re-parsing the source. The `text` field carries the literal token bytes
71/// so rules that need the source content (E006, E007, E008 against migration
72/// keys) can look up entries without threading `&[u8] source` through every
73/// `Rule::check` signature.
74#[derive(Debug, Clone, PartialEq, Eq)]
75pub struct TokenSpan {
76    pub kind: TokenKind,
77    pub span: Span,
78    pub text: Box<str>,
79}
80
/// Discriminant for [`TokenSpan`]. Phase 3 rules read these to filter
/// token-span lookups by category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Classification level token (S, SECRET, TS, TOP SECRET, ...).
    Classification,
    /// SCI control token (SI, TK, HCS, ...).
    SciControl,
    /// SAR identifier token.
    SarIdentifier,
    /// Dissemination control token (NOFORN, NF, ORCON, OC, RELIDO, ...).
    DissemControl,
    /// REL TO country trigraph (USA, GBR, AUS, ...). One entry per trigraph,
    /// not one for the whole REL TO list.
    RelToTrigraph,
    /// Declassification exemption code in CAB or banner (25X1, 50X1-HUM).
    DeclassExemption,
    /// Declassification date in CAB or banner (YYYYMMDD or YYYY).
    DeclassDate,
    /// `//` separator between blocks. Recorded so E004 can detect extra or
    /// missing separator runs.
    Separator,
    /// A non-empty block that did not match any known token kind. E008 fires
    /// one diagnostic per `Unknown` entry.
    Unknown,
}

/// Classification level. These values are stable across CAPCO schema versions.
/// Hand-written rather than generated because the CVE uses abbreviations (R/C/S/TS/U)
/// while the tool needs both abbreviated and full-word forms.
///
/// Variant order is significant: the derived `PartialOrd`/`Ord` compare by
/// declaration order, so `Unclassified < Confidential < Secret < TopSecret`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Classification {
    Unclassified,
    Confidential,
    Secret,
    TopSecret,
}

impl Classification {
    /// Banner form (full words, no abbreviations).
    ///
    /// Returns a `'static` string; usable in `const` contexts.
    #[must_use]
    pub const fn banner_str(self) -> &'static str {
        match self {
            Self::Unclassified => "UNCLASSIFIED",
            Self::Confidential => "CONFIDENTIAL",
            Self::Secret => "SECRET",
            Self::TopSecret => "TOP SECRET",
        }
    }

    /// Portion form (abbreviation used in portion markings).
    ///
    /// Returns a `'static` string; usable in `const` contexts.
    #[must_use]
    pub const fn portion_str(self) -> &'static str {
        match self {
            Self::Unclassified => "U",
            Self::Confidential => "C",
            Self::Secret => "S",
            Self::TopSecret => "TS",
        }
    }
}

/// A 3-character country trigraph (e.g., USA, GBR, AUS).
/// Validated against CVE country code list at rule-check time; this type only
/// guarantees the shape of the bytes (three ASCII uppercase letters).
///
/// The inner bytes are private; construction goes through [`Trigraph::try_new`]
/// which enforces ASCII-uppercase invariants so that [`Trigraph::as_str`] can
/// return a `&str` infallibly without panicking at runtime.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Trigraph([u8; 3]);

impl Trigraph {
    /// The always-valid `USA` trigraph constant.
    pub const USA: Self = Self(*b"USA");

    /// Attempt to construct a trigraph from 3 bytes.
    ///
    /// Returns `None` unless every byte is an ASCII uppercase letter
    /// (`A`–`Z`), which is the invariant enforced by CAPCO for all valid
    /// country/entity codes.
    #[inline]
    #[must_use]
    pub const fn try_new(bytes: [u8; 3]) -> Option<Self> {
        // An array pattern checks all three positions at once and stays
        // usable in `const` contexts (no iterators needed).
        match bytes {
            [b'A'..=b'Z', b'A'..=b'Z', b'A'..=b'Z'] => Some(Self(bytes)),
            _ => None,
        }
    }

    /// Return the trigraph as a string slice.
    ///
    /// Infallible because construction via [`Trigraph::try_new`] (or the
    /// [`Trigraph::USA`] constant) guarantees ASCII-uppercase bytes, which
    /// are always valid UTF-8.
    #[inline]
    #[must_use]
    pub const fn as_str(&self) -> &str {
        // SAFETY: the inner array is private, and the only constructors
        // (`try_new` and the `USA` constant) store nothing but `A`–`Z`
        // bytes. ASCII is a subset of valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(&self.0) }
    }
}

impl fmt::Display for Trigraph {
    /// Writes the three letters. Uses `Formatter::pad` so caller-supplied
    /// width, fill and alignment flags (e.g. `{:<5}`) are honoured; plain
    /// `{}` output is just the trigraph text.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.pad(self.as_str())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn trigraph_usa_constant_is_valid() {
        assert_eq!(Trigraph::USA.as_str(), "USA");
    }

    #[test]
    fn trigraph_try_new_accepts_uppercase() {
        let t = Trigraph::try_new(*b"GBR").unwrap();
        assert_eq!(t.as_str(), "GBR");
    }

    #[test]
    fn trigraph_try_new_accepts_range_endpoints() {
        // `A` and `Z` are the inclusive ends of the accepted range.
        assert!(Trigraph::try_new(*b"AAA").is_some());
        assert!(Trigraph::try_new(*b"ZZZ").is_some());
    }

    #[test]
    fn trigraph_try_new_rejects_lowercase() {
        assert!(Trigraph::try_new(*b"usa").is_none());
    }

    #[test]
    fn trigraph_try_new_rejects_digits() {
        assert!(Trigraph::try_new(*b"US1").is_none());
    }

    #[test]
    fn trigraph_try_new_rejects_high_bytes() {
        assert!(Trigraph::try_new([0xFF, 0xFF, 0xFF]).is_none());
    }

    #[test]
    fn trigraph_try_new_rejects_bytes_adjacent_to_uppercase_range() {
        // `@` (0x40) sits just below `A`, `[` (0x5B) just above `Z`; check
        // each position so an off-by-one in any slot is caught.
        for bad in [b'@', b'['] {
            assert!(Trigraph::try_new([bad, b'S', b'A']).is_none());
            assert!(Trigraph::try_new([b'U', bad, b'A']).is_none());
            assert!(Trigraph::try_new([b'U', b'S', bad]).is_none());
        }
    }

    #[test]
    fn trigraph_display_matches_as_str() {
        let t = Trigraph::try_new(*b"AUS").unwrap();
        assert_eq!(t.to_string(), t.as_str());
    }

    #[test]
    fn classification_banner_and_portion_forms() {
        // Pin the exact strings: a non-emptiness check would not notice a
        // swapped or misspelled marking.
        let expected = [
            (Classification::Unclassified, "UNCLASSIFIED", "U"),
            (Classification::Confidential, "CONFIDENTIAL", "C"),
            (Classification::Secret, "SECRET", "S"),
            (Classification::TopSecret, "TOP SECRET", "TS"),
        ];
        for (level, banner, portion) in expected {
            assert_eq!(level.banner_str(), banner);
            assert_eq!(level.portion_str(), portion);
        }
    }

    #[test]
    fn classification_orders_by_sensitivity() {
        assert!(Classification::Unclassified < Classification::Confidential);
        assert!(Classification::Confidential < Classification::Secret);
        assert!(Classification::Secret < Classification::TopSecret);
    }
}