// marque_ism/attrs.rs
//! `IsmAttributes` — the canonical in-memory representation of a classification marking.
//!
//! Mirrors the IC ISM XML attribute model. Every source format (free text, XML, web forms)
//! normalizes into this struct before rule validation.
//!
//! # Type design
//! Multi-value fields use `Box<[T]>` rather than `Vec<T>` to avoid over-allocation
//! after parsing. Most markings have 0–4 values per field.
//!
//! # Code generation
//! CVE enum variants (`SciControl`, `DissemControl`, `DeclassExemption`, `SarIdentifier`)
//! are generated by `build.rs` from ODNI CVE XML files and re-exported from
//! `crate::generated::values`.
14
use crate::generated::values;
use crate::span::Span;

// Re-export generated enum types for convenience.
pub use values::{DeclassExemption, DissemControl, SarIdentifier, SciControl};
20
21/// Canonical in-memory representation of an IC classification marking.
22///
23/// Produced by `marque-core::parser` from scanner candidates.
24/// Consumed by `marque-rules::Rule` implementations for validation.
25#[non_exhaustive]
26#[derive(Debug, Clone, Default, PartialEq, Eq)]
27pub struct IsmAttributes {
28 /// Primary classification level. `None` means parsing failed to identify one.
29 pub classification: Option<Classification>,
30
31 /// SCI controls (e.g., SI, TK, HCS). Ordered per CAPCO block ordering rules.
32 pub sci_controls: Box<[SciControl]>,
33
34 /// Special Access Required identifiers.
35 pub sar_identifiers: Box<[SarIdentifier]>,
36
37 /// Dissemination controls (e.g., NOFORN, RELIDO, ORCON, FISA).
38 pub dissem_controls: Box<[DissemControl]>,
39
40 /// REL TO country trigraphs. USA must be present and first if non-empty.
41 pub rel_to: Box<[Trigraph]>,
42
43 /// Declassification date from CAB (free text, e.g., "20331231").
44 pub declassify_on: Option<Box<str>>,
45
46 /// Free-text "Classified By" identifier from CAB.
47 pub classified_by: Option<Box<str>>,
48
49 /// Free-text "Derived From" source from CAB.
50 pub derived_from: Option<Box<str>>,
51
52 /// Declassification exemption code from CAB (e.g., 25X1, 50X1-HUM).
53 pub declass_exemption: Option<DeclassExemption>,
54
55 /// Per-token byte spans into the *original source buffer*, recorded by
56 /// the parser as it walks the marking string. Phase 3 added this so
57 /// rules can point at the exact offending byte range instead of the
58 /// whole marking. Empty for CAB markings (CAB parsing is line-structured
59 /// and doesn't go through the token-walking path).
60 ///
61 /// Indexing convention: `token_spans` is in document order. To find the
62 /// span for the Nth `DissemControl`, walk the slice and pick the Nth
63 /// entry whose `kind == TokenKind::DissemControl`.
64 pub token_spans: Box<[TokenSpan]>,
65}
66
67/// One parser-recognized token plus its byte span in the original source.
68///
69/// Used by Phase 3 rules to surface byte-precise diagnostic spans without
70/// re-parsing the source. The `text` field carries the literal token bytes
71/// so rules that need the source content (E006, E007, E008 against migration
72/// keys) can look up entries without threading `&[u8] source` through every
73/// `Rule::check` signature.
74#[derive(Debug, Clone, PartialEq, Eq)]
75pub struct TokenSpan {
76 pub kind: TokenKind,
77 pub span: Span,
78 pub text: Box<str>,
79}
80
/// Discriminant for `TokenSpan`. Phase 3 rules read these to filter
/// token-span lookups by category.
///
/// The enum is a plain field-less `Copy` type, so it can be compared,
/// hashed, and passed by value freely.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenKind {
    /// Classification level token (S, SECRET, TS, TOP SECRET, ...).
    Classification,
    /// SCI control token (SI, TK, HCS, ...).
    SciControl,
    /// SAR identifier token.
    SarIdentifier,
    /// Dissemination control token (NOFORN, NF, ORCON, OC, RELIDO, ...).
    DissemControl,
    /// REL TO country trigraph (USA, GBR, AUS, ...). One per token, not the
    /// whole REL TO list.
    RelToTrigraph,
    /// Declassification exemption code in CAB or banner (25X1, 50X1-HUM).
    DeclassExemption,
    /// Declassification date in CAB or banner (YYYYMMDD or YYYY).
    DeclassDate,
    /// `//` separator between blocks. Recorded so E004 can detect extra/
    /// missing separator runs.
    Separator,
    /// A non-empty block that did not match any known token kind. E008 fires
    /// one diagnostic per `Unknown` entry.
    Unknown,
}
107
/// Classification level. These values are stable across CAPCO schema versions.
/// Hand-written rather than generated because the CVE uses abbreviations (R/C/S/TS/U)
/// while the tool needs both abbreviated and full-word forms.
///
/// Variants are declared from least to most restrictive, so the derived
/// `PartialOrd`/`Ord` gives `Unclassified < Confidential < Secret < TopSecret`.
/// Reordering the variants would change every comparison.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Classification {
    /// Banner `UNCLASSIFIED`, portion `U`.
    Unclassified,
    /// Banner `CONFIDENTIAL`, portion `C`.
    Confidential,
    /// Banner `SECRET`, portion `S`.
    Secret,
    /// Banner `TOP SECRET`, portion `TS`.
    TopSecret,
}

impl Classification {
    /// Banner form (full words, no abbreviations).
    ///
    /// `const` so the literal can be used in constant and static initializers.
    #[must_use]
    pub const fn banner_str(self) -> &'static str {
        match self {
            Self::Unclassified => "UNCLASSIFIED",
            Self::Confidential => "CONFIDENTIAL",
            Self::Secret => "SECRET",
            Self::TopSecret => "TOP SECRET",
        }
    }

    /// Portion form (abbreviation used in portion markings).
    ///
    /// `const` so the literal can be used in constant and static initializers.
    #[must_use]
    pub const fn portion_str(self) -> &'static str {
        match self {
            Self::Unclassified => "U",
            Self::Confidential => "C",
            Self::Secret => "S",
            Self::TopSecret => "TS",
        }
    }
}
140
/// A 3-character country trigraph (e.g., USA, GBR, AUS).
/// Validated against CVE country code list at rule-check time.
///
/// The inner bytes are private; construction goes through [`Trigraph::try_new`]
/// which enforces ASCII-uppercase invariants so that [`Trigraph::as_str`] can
/// return a `&str` infallibly without panicking at runtime.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Trigraph([u8; 3]);

impl Trigraph {
    /// The always-valid `USA` trigraph constant.
    pub const USA: Self = Self(*b"USA");

    /// Attempt to construct a trigraph from 3 bytes.
    ///
    /// Returns `None` unless every byte is an ASCII uppercase letter
    /// (`A`–`Z`), which is the invariant enforced by CAPCO for all valid
    /// country/entity codes.
    #[inline]
    #[must_use]
    pub const fn try_new(bytes: [u8; 3]) -> Option<Self> {
        // Manual index loop: iterator adaptors are not callable in `const fn`.
        let mut i = 0;
        while i < 3 {
            if !bytes[i].is_ascii_uppercase() {
                return None;
            }
            i += 1;
        }
        Some(Self(bytes))
    }

    /// Return the trigraph as a string slice.
    ///
    /// Infallible because construction via [`Trigraph::try_new`] (or the
    /// [`Trigraph::USA`] constant) guarantees ASCII-uppercase bytes, which
    /// are always valid UTF-8. `const` so it composes with the `const`
    /// constructor and the `USA` constant.
    #[inline]
    #[must_use]
    pub const fn as_str(&self) -> &str {
        // SAFETY: the inner field is private, and the only ways to obtain a
        // `Trigraph` from outside this module are `try_new` and the `USA`
        // constant, both of which hold ASCII uppercase letters only.
        // ASCII is a subset of valid UTF-8.
        unsafe { std::str::from_utf8_unchecked(&self.0) }
    }
}

impl std::fmt::Display for Trigraph {
    /// Writes the three letters, honoring width/alignment/fill flags.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` (rather than `write_str`) applies flags such as `{:<5}`, so
        // trigraphs line up in column-formatted output. With no flags the
        // output is exactly the three letters.
        f.pad(self.as_str())
    }
}
190
#[cfg(test)]
mod tests {
    use super::*;

    /// The `USA` constant renders as its three letters.
    #[test]
    fn trigraph_usa_constant_is_valid() {
        assert_eq!(Trigraph::USA.as_str(), "USA");
    }

    /// Three ASCII uppercase letters are accepted and round-trip through `as_str`.
    #[test]
    fn trigraph_try_new_accepts_uppercase() {
        let t = Trigraph::try_new(*b"GBR").unwrap();
        assert_eq!(t.as_str(), "GBR");
    }

    /// Lowercase letters violate the uppercase invariant.
    #[test]
    fn trigraph_try_new_rejects_lowercase() {
        assert!(Trigraph::try_new(*b"usa").is_none());
    }

    /// Digits are not letters and must be rejected.
    #[test]
    fn trigraph_try_new_rejects_digits() {
        assert!(Trigraph::try_new(*b"US1").is_none());
    }

    /// Non-ASCII bytes must be rejected; they would make `as_str` unsound.
    #[test]
    fn trigraph_try_new_rejects_high_bytes() {
        assert!(Trigraph::try_new([0xFF, 0xFF, 0xFF]).is_none());
    }

    /// Every level maps to its exact banner and portion string.
    #[test]
    fn classification_round_trip() {
        let expected = [
            (Classification::Unclassified, "UNCLASSIFIED", "U"),
            (Classification::Confidential, "CONFIDENTIAL", "C"),
            (Classification::Secret, "SECRET", "S"),
            (Classification::TopSecret, "TOP SECRET", "TS"),
        ];
        for (level, banner, portion) in expected.iter().copied() {
            assert_eq!(level.banner_str(), banner);
            assert_eq!(level.portion_str(), portion);
        }
    }

    /// Derived ordering follows declaration order, least to most restrictive.
    #[test]
    fn classification_orders_by_sensitivity() {
        assert!(Classification::Unclassified < Classification::Confidential);
        assert!(Classification::Confidential < Classification::Secret);
        assert!(Classification::Secret < Classification::TopSecret);
    }
}
233}