Skip to main content

marque_core/
parser.rs

1//! Phase 2/3: token extraction and structural parsing.
2//!
3//! Takes [`MarkingCandidate`] spans from the scanner and produces [`IsmAttributes`].
4//!
5//! # Phase 2 — Token Extraction
6//! A compile-time Aho-Corasick automaton (built from CVE token list in marque-capco)
7//! runs over each candidate span, identifying known tokens and their positions.
8//! Unrecognized tokens within a candidate boundary are themselves diagnostics.
9//!
10//! # Phase 3 — Structural Parsing
11//! Token sequence → IsmAttributes. Validates ordering and block structure.
12//! Produces `ParseError` for structural violations; these feed into the rule engine
13//! as diagnostics with associated fixes.
14//!
15//! Note: the Aho-Corasick automaton is injected via `TokenSet` to keep marque-core
16//! free of a direct dependency on marque-capco's generated data.
17
18use crate::error::CoreError;
19use marque_ism::attrs::{
20    Classification, DeclassExemption, DissemControl, IsmAttributes, SarIdentifier, SciControl,
21    TokenKind, TokenSpan, Trigraph,
22};
23// Note: unused import warnings for SarIdentifier are expected until the SAR CVE
24// has entries. The type is used in from_str() which returns None for now.
25use marque_ism::span::{MarkingCandidate, MarkingType, Span};
26use marque_ism::token_set::TokenSet;
27
/// Parse result for a single candidate.
#[derive(Debug)]
pub struct ParsedMarking {
    /// Structured ISM attributes extracted from the candidate's text.
    pub attrs: IsmAttributes,
    /// Absolute byte span of the whole candidate within the source buffer
    /// (copied unchanged from the scanner's `MarkingCandidate`).
    pub source_span: Span,
    /// Which marking form was parsed (portion, banner, or CAB).
    pub kind: MarkingType,
}
35
/// Phase 2+3 parser. Stateless; call [`Parser::parse`] per candidate.
///
/// Holds only a borrowed [`TokenSet`], so construction is free and a single
/// parser may be reused across all candidates of a document.
pub struct Parser<'t> {
    // Injected token automaton; keeps marque-core free of a direct
    // dependency on marque-capco's generated data (see module docs).
    tokens: &'t dyn TokenSet,
}
40
41impl<'t> Parser<'t> {
42    pub fn new(tokens: &'t dyn TokenSet) -> Self {
43        Self { tokens }
44    }
45
46    /// Parse a single scanner candidate into [`IsmAttributes`].
47    pub fn parse(
48        &self,
49        candidate: &MarkingCandidate,
50        source: &[u8],
51    ) -> Result<ParsedMarking, CoreError> {
52        let text = candidate
53            .span
54            .as_str(source)
55            .map_err(|_| CoreError::InvalidUtf8(candidate.span))?;
56        match candidate.kind {
57            MarkingType::Portion => self.parse_portion(text, candidate),
58            MarkingType::Banner => self.parse_banner(text, candidate),
59            MarkingType::Cab => self.parse_cab(text, candidate),
60            // PageBreak candidates are scanner-emitted boundaries with no
61            // parsable content. Engine::lint filters them out before calling
62            // `parse`; reaching this arm is a programming error in the
63            // pipeline, so a `MalformedMarking` is the right surface.
64            MarkingType::PageBreak => Err(CoreError::MalformedMarking(
65                "page-break candidate must not be parsed".to_owned(),
66            )),
67        }
68    }
69
70    fn parse_portion(
71        &self,
72        text: &str,
73        candidate: &MarkingCandidate,
74    ) -> Result<ParsedMarking, CoreError> {
75        // Strip outer parentheses: "(TS//SI//NF)" -> "TS//SI//NF"
76        // The inner-string offset is `candidate.span.start + 1` because
77        // the leading `(` is one byte (verified ASCII by the scanner).
78        let inner = text
79            .strip_prefix('(')
80            .and_then(|s| s.strip_suffix(')'))
81            .ok_or_else(|| CoreError::MalformedMarking(text.to_owned()))?;
82
83        let attrs =
84            self.parse_marking_string(inner, MarkingType::Portion, candidate.span.start + 1)?;
85        Ok(ParsedMarking {
86            attrs,
87            source_span: candidate.span,
88            kind: MarkingType::Portion,
89        })
90    }
91
92    fn parse_banner(
93        &self,
94        text: &str,
95        candidate: &MarkingCandidate,
96    ) -> Result<ParsedMarking, CoreError> {
97        // For banner candidates, `text` is the full line bytes from the
98        // scanner. `text.trim()` may consume leading whitespace, which
99        // shifts the per-token offsets. Compute the leading whitespace
100        // length so we can add it to candidate.span.start.
101        let trimmed = text.trim_start();
102        let lead_ws = text.len() - trimmed.len();
103        let trimmed = trimmed.trim_end();
104        let attrs = self.parse_marking_string(
105            trimmed,
106            MarkingType::Banner,
107            candidate.span.start + lead_ws,
108        )?;
109        Ok(ParsedMarking {
110            attrs,
111            source_span: candidate.span,
112            kind: MarkingType::Banner,
113        })
114    }
115
116    fn parse_cab(
117        &self,
118        text: &str,
119        candidate: &MarkingCandidate,
120    ) -> Result<ParsedMarking, CoreError> {
121        // CAB is line-structured: "Classified By: ...\nDerived From: ...\nDeclassify On: ..."
122        let mut attrs = IsmAttributes::default();
123
124        for line in text.lines() {
125            if let Some(val) = line.strip_prefix("Classified By:") {
126                attrs.classified_by = Some(val.trim().into());
127            } else if let Some(val) = line.strip_prefix("Derived From:") {
128                attrs.derived_from = Some(val.trim().into());
129            } else if let Some(val) = line.strip_prefix("Declassify On:") {
130                let s = val.trim();
131                if let Some(exemption) = DeclassExemption::parse(s) {
132                    attrs.declass_exemption = Some(exemption);
133                } else {
134                    attrs.declassify_on = Some(s.into());
135                }
136            }
137        }
138
139        Ok(ParsedMarking {
140            attrs,
141            source_span: candidate.span,
142            kind: MarkingType::Cab,
143        })
144    }
145
    /// Parse a marking string (without outer parentheses) into IsmAttributes.
    /// Handles both portion form (abbreviated) and banner form (full words).
    ///
    /// `s_offset` is the absolute byte offset of `s` within the original
    /// source buffer. Phase 3 uses it to record per-token absolute spans on
    /// `IsmAttributes::token_spans` so rules can point at byte-precise
    /// diagnostic locations.
    ///
    /// # Errors
    ///
    /// Returns [`CoreError::MalformedMarking`] only when `s` is empty. All
    /// other irregularities (unknown tokens, misplaced declass info) are
    /// recorded in `token_spans` for the rule engine instead of failing here.
    fn parse_marking_string(
        &self,
        s: &str,
        context: MarkingType,
        s_offset: usize,
    ) -> Result<IsmAttributes, CoreError> {
        let mut attrs = IsmAttributes::default();

        if s.is_empty() {
            return Err(CoreError::MalformedMarking(s.to_owned()));
        }

        // Walk separator (`//`) positions inside `s`. Each block is the
        // substring between consecutive separators (or string ends). Track
        // both the block content and its inner offset so we can compute
        // per-token absolute spans.
        let separators: Vec<usize> = s.match_indices("//").map(|(i, _)| i).collect();
        let mut block_ranges: Vec<(usize, usize)> = Vec::with_capacity(separators.len() + 1);
        let mut prev_end = 0usize;
        for &sep_start in &separators {
            block_ranges.push((prev_end, sep_start));
            prev_end = sep_start + 2; // skip the `//`
        }
        // Trailing block after the last separator (or the whole string when
        // there are no separators at all).
        block_ranges.push((prev_end, s.len()));

        let mut token_spans: Vec<TokenSpan> = Vec::new();

        // First block is the classification.
        let mut sci: Vec<SciControl> = Vec::new();
        let mut sar: Vec<SarIdentifier> = Vec::new();
        let mut dissem: Vec<DissemControl> = Vec::new();
        let mut rel_to: Vec<Trigraph> = Vec::new();

        for (idx, &(rel_start, rel_end)) in block_ranges.iter().enumerate() {
            let raw = &s[rel_start..rel_end];
            // Trim whitespace and recover the trimmed-content offset
            // within the raw block, then add to the absolute s_offset.
            // (`trim` is Unicode-aware; the offsets below are byte counts,
            // which stays correct because `len()` is in bytes.)
            let trimmed = raw.trim();
            // Empty blocks (e.g. the gap in `S////NF`) carry no token; the
            // surrounding separators are still recorded below.
            if trimmed.is_empty() {
                continue;
            }
            let trim_lead = raw.len() - raw.trim_start().len();
            let abs_start = s_offset + rel_start + trim_lead;
            let abs_end = abs_start + trimmed.len();
            let span = Span::new(abs_start, abs_end);

            if idx == 0 {
                // Classification block — track even when parse fails so the
                // span is available for diagnostics.
                attrs.classification = parse_classification(trimmed);
                token_spans.push(TokenSpan {
                    kind: TokenKind::Classification,
                    span,
                    text: trimmed.into(),
                });
                continue;
            }

            // Branch order matters: REL TO is matched before the generated
            // `parse()` fallbacks so trigraph lists never reach them.
            if trimmed.starts_with("REL TO") || trimmed.starts_with("REL ") {
                let parsed_trigraphs =
                    parse_rel_to_with_spans(trimmed, abs_start, self.tokens, &mut token_spans);
                rel_to.extend(parsed_trigraphs);
            } else if let Some(ctrl) = SciControl::parse(trimmed) {
                sci.push(ctrl);
                token_spans.push(TokenSpan {
                    kind: TokenKind::SciControl,
                    span,
                    text: trimmed.into(),
                });
            } else if let Some(ctrl) =
                DissemControl::parse(trimmed).or_else(|| parse_dissem_full_form(trimmed))
            {
                dissem.push(ctrl);
                token_spans.push(TokenSpan {
                    kind: TokenKind::DissemControl,
                    span,
                    text: trimmed.into(),
                });
            } else if let Some(sar_id) = SarIdentifier::parse(trimmed) {
                sar.push(sar_id);
                token_spans.push(TokenSpan {
                    kind: TokenKind::SarIdentifier,
                    span,
                    text: trimmed.into(),
                });
            } else if let Some(exemption) = DeclassExemption::parse(trimmed) {
                // Declass exemption codes (e.g., 25X1, 50X1-HUM) that appear
                // inside a banner or portion marking trigger E005 — they belong
                // in the CAB "Declassify On:" line, not in the marking string.
                attrs.declass_exemption = Some(exemption);
                token_spans.push(TokenSpan {
                    kind: TokenKind::DeclassExemption,
                    span,
                    text: trimmed.into(),
                });
            } else if is_declass_date(trimmed) {
                // Free-text declassification dates (YYYYMMDD or YYYY) that
                // appear inside a banner or portion also belong in the CAB.
                attrs.declassify_on = Some(trimmed.into());
                token_spans.push(TokenSpan {
                    kind: TokenKind::DeclassDate,
                    span,
                    text: trimmed.into(),
                });
            } else {
                // Unrecognized — Phase 3 records this as TokenKind::Unknown
                // so E008 can fire one diagnostic per unknown token without
                // re-parsing. E007 also walks Unknown tokens and looks each
                // up in the migration table to detect deprecated forms.
                token_spans.push(TokenSpan {
                    kind: TokenKind::Unknown,
                    span,
                    text: trimmed.into(),
                });
            }
        }

        attrs.sci_controls = sci.into_boxed_slice();
        attrs.sar_identifiers = sar.into_boxed_slice();
        attrs.dissem_controls = dissem.into_boxed_slice();
        attrs.rel_to = rel_to.into_boxed_slice();
        // Record separator spans (Phase 3 needs them for E004). Push them
        // here alongside block tokens, then sort by start offset so the
        // final slice is in document (source) order.
        for &sep_start in &separators {
            token_spans.push(TokenSpan {
                kind: TokenKind::Separator,
                span: Span::new(s_offset + sep_start, s_offset + sep_start + 2),
                text: "//".into(),
            });
        }
        token_spans.sort_unstable_by_key(|ts| ts.span.start);
        attrs.token_spans = token_spans.into_boxed_slice();

        let _ = context; // used for future context-aware validation

        Ok(attrs)
    }
291}
292
293/// Parse a classification string in either portion form (`"TS"`, `"S"`, `"C"`,
294/// `"U"`) or banner form (`"TOP SECRET"`, `"SECRET"`, ...).
295///
296/// Note: `Classification` is hand-written in `marque-ism::attrs` rather than
297/// generated from the CVE because the CVE only ships single-letter abbreviations
298/// and the tool needs both forms. Other CVE-derived enums (`SciControl`,
299/// `DissemControl`, `SarIdentifier`, `DeclassExemption`) go through their
300/// generated `parse()` methods.
301fn parse_classification(s: &str) -> Option<Classification> {
302    match s {
303        "TS" | "TOP SECRET" => Some(Classification::TopSecret),
304        "S" | "SECRET" => Some(Classification::Secret),
305        "C" | "CONFIDENTIAL" => Some(Classification::Confidential),
306        "U" | "UNCLASSIFIED" => Some(Classification::Unclassified),
307        _ => None,
308    }
309}
310
311/// Map a banner-form (full-word) dissemination control to its CVE
312/// abbreviation form. The CVE only ships abbreviations (`NF`, `OC`, ...),
313/// but banner markings use the full words (`NOFORN`, `ORCON`, ...) and the
314/// parser must accept both. Phase 3 added this fallback so banner-form
315/// markings parse cleanly into a typed `DissemControl`.
316///
317/// Rules that detect "banner uses portion abbreviation" (E001) read the
318/// raw token span via `attrs.token_spans` and inspect the original bytes,
319/// so this mapping does not lose the abbreviation-vs-full-word signal.
320fn parse_dissem_full_form(s: &str) -> Option<DissemControl> {
321    let abbrev = match s {
322        "NOFORN" => "NF",
323        "ORCON" => "OC",
324        "IMCON" => "IMC",
325        "DEA SENSITIVE" => "DSEN",
326        "PROPIN" => "PR",
327        "RELIDO" => "RELIDO",
328        _ => return None,
329    };
330    DissemControl::parse(abbrev)
331}
332
333/// Span-aware parse of a `REL TO ...` block. Records one
334/// `TokenKind::RelToTrigraph` per recognized country code.
335///
336/// `block_offset` is the absolute byte offset of `block` within the
337/// original source buffer.
338fn parse_rel_to_with_spans(
339    block: &str,
340    block_offset: usize,
341    tokens: &dyn TokenSet,
342    token_spans: &mut Vec<TokenSpan>,
343) -> Vec<Trigraph> {
344    // Skip the "REL TO" / "REL" prefix to land on the trigraph list. We
345    // need the offset of the *trigraph list* within `block` so that each
346    // trigraph's absolute span can be computed.
347    let prefix_skip = if let Some(rest) = block.strip_prefix("REL TO") {
348        block.len() - rest.len()
349    } else if let Some(rest) = block.strip_prefix("REL") {
350        block.len() - rest.len()
351    } else {
352        0
353    };
354    let after_rel = &block[prefix_skip..];
355
356    let mut out: Vec<Trigraph> = Vec::new();
357    // Walk comma-separated entries, tracking each entry's offset within
358    // `after_rel` so we can land an absolute span on the trigraph itself
359    // (not on any leading whitespace).
360    let mut cursor = 0usize;
361    for entry in after_rel.split(',') {
362        let entry_start_in_after = cursor;
363        // Advance past the entry and its trailing comma. On the final
364        // iteration this steps one past the end of `after_rel`, but the
365        // cursor is never read after the loop ends — the split iterator
366        // drives loop termination, not the cursor. usize addition here
367        // is bounded by the document size, so no overflow in practice.
368        cursor += entry.len() + 1;
369
370        let trim_lead = entry.len() - entry.trim_start().len();
371        let trimmed = entry.trim();
372        if trimmed.is_empty() || !tokens.is_trigraph(trimmed) {
373            continue;
374        }
375        let b = trimmed.as_bytes();
376        if b.len() != 3 {
377            continue;
378        }
379        let Some(t) = Trigraph::try_new([b[0], b[1], b[2]]) else {
380            continue;
381        };
382        out.push(t);
383        let abs_start = block_offset + prefix_skip + entry_start_in_after + trim_lead;
384        token_spans.push(TokenSpan {
385            kind: TokenKind::RelToTrigraph,
386            span: Span::new(abs_start, abs_start + 3),
387            text: trimmed.into(),
388        });
389    }
390    out
391}
392
393// SCI controls, dissemination controls, SAR identifiers, and declass
394// exemptions all parse via their generated `parse()` methods (see
395// `parse_marking_string` above). The single hand-coded path is
396// `parse_classification`, which is documented inline.
397
/// Returns `true` if `s` looks like an inline declassification date.
///
/// CAPCO allows `YYYYMMDD` (8-digit) or `YYYY` (4-digit, meaning declassify
/// at the start of that calendar year). Both forms are valid in a CAB but
/// are a violation (E005) if they appear directly in a banner or portion
/// marking string.
fn is_declass_date(s: &str) -> bool {
    // Byte length is character count here because only ASCII digits pass.
    (s.len() == 4 || s.len() == 8) && s.chars().all(|c| c.is_ascii_digit())
}
408
#[cfg(test)]
mod tests {
    use super::*;
    use marque_ism::span::{MarkingCandidate, MarkingType, Span};
    use marque_ism::token_set::CapcoTokenSet;

    /// Build a candidate covering `text` at the given absolute offset.
    fn make_candidate(text: &[u8], kind: MarkingType, offset: usize) -> MarkingCandidate {
        let span = Span::new(offset, offset + text.len());
        MarkingCandidate { span, kind }
    }

    /// Run the parser over `text` as a candidate of the given kind at offset 0.
    fn parse_as(text: &str, kind: MarkingType) -> ParsedMarking {
        let bytes = text.as_bytes();
        let tokens = CapcoTokenSet;
        Parser::new(&tokens)
            .parse(&make_candidate(bytes, kind, 0), bytes)
            .expect("parse should succeed")
    }

    fn parse_banner(text: &str) -> ParsedMarking {
        parse_as(text, MarkingType::Banner)
    }

    fn parse_portion(text: &str) -> ParsedMarking {
        parse_as(text, MarkingType::Portion)
    }

    /// First recorded token span of the given kind; panics if absent.
    fn span_of(parsed: &ParsedMarking, kind: TokenKind) -> &TokenSpan {
        parsed
            .attrs
            .token_spans
            .iter()
            .find(|t| t.kind == kind)
            .expect("expected token kind to be recorded")
    }

    /// All recorded token spans of the given kind, in document order.
    fn spans_of(parsed: &ParsedMarking, kind: TokenKind) -> Vec<&TokenSpan> {
        parsed
            .attrs
            .token_spans
            .iter()
            .filter(|t| t.kind == kind)
            .collect()
    }

    // --- declass exemption in banner (E005 detection) ---

    #[test]
    fn banner_with_declass_exemption_populates_attrs() {
        use marque_ism::DeclassExemption;
        // A banner string that (incorrectly) contains a declass exemption code.
        // parse_marking_string must populate declass_exemption so E005 can fire.
        let parsed = parse_banner("SECRET//25X1//NOFORN");
        assert_eq!(
            parsed.attrs.declass_exemption,
            Some(DeclassExemption::X25x1),
            "declass_exemption should be populated when 25X1 appears in banner"
        );
    }

    #[test]
    fn portion_with_declass_exemption_populates_attrs() {
        assert!(parse_portion("(SECRET//50X1-HUM)")
            .attrs
            .declass_exemption
            .is_some());
    }

    // --- declass date in banner (E005 detection) ---

    #[test]
    fn banner_with_declass_date_populates_attrs() {
        let parsed = parse_banner("SECRET//20301231//NOFORN");
        assert_eq!(
            parsed.attrs.declassify_on.as_deref(),
            Some("20301231"),
            "declassify_on should be populated when YYYYMMDD appears in banner"
        );
    }

    #[test]
    fn banner_with_four_digit_year_populates_attrs() {
        let parsed = parse_banner("SECRET//2035");
        assert_eq!(parsed.attrs.declassify_on.as_deref(), Some("2035"));
    }

    // --- normal banner (no declass tokens) ---

    #[test]
    fn banner_without_declass_leaves_fields_none() {
        let parsed = parse_banner("TOP SECRET//SI//NOFORN");
        assert!(parsed.attrs.declassify_on.is_none());
        assert!(parsed.attrs.declass_exemption.is_none());
    }

    // --- is_declass_date helper ---

    #[test]
    fn is_declass_date_accepts_yyyymmdd() {
        assert!(is_declass_date("20301231"));
    }

    #[test]
    fn is_declass_date_accepts_yyyy() {
        assert!(is_declass_date("2035"));
    }

    #[test]
    fn is_declass_date_rejects_non_digit() {
        for bad in ["2030X231", "YYYYMMDD"] {
            assert!(!is_declass_date(bad), "{bad} must be rejected");
        }
    }

    #[test]
    fn is_declass_date_rejects_wrong_length() {
        for bad in ["203012", "203012311"] {
            assert!(!is_declass_date(bad), "{bad} must be rejected");
        }
    }

    // --- token spans ---

    #[test]
    fn token_spans_track_offsets_in_banner() {
        let src = b"TOP SECRET//SI//NF";
        let parsed = parse_banner("TOP SECRET//SI//NF");

        // Two separators + classification + sci + dissem.
        for expected in [
            TokenKind::Separator,
            TokenKind::Classification,
            TokenKind::SciControl,
            TokenKind::DissemControl,
        ] {
            assert!(
                parsed.attrs.token_spans.iter().any(|t| t.kind == expected),
                "missing token kind {expected:?}"
            );
        }

        // Each recorded span must slice back to the exact source bytes.
        assert_eq!(
            span_of(&parsed, TokenKind::Classification)
                .span
                .as_str(src)
                .unwrap(),
            "TOP SECRET"
        );
        assert_eq!(
            span_of(&parsed, TokenKind::SciControl)
                .span
                .as_str(src)
                .unwrap(),
            "SI"
        );
        assert_eq!(
            span_of(&parsed, TokenKind::DissemControl)
                .span
                .as_str(src)
                .unwrap(),
            "NF"
        );
    }

    #[test]
    fn token_spans_strip_paren_in_portion() {
        let src = b"(SECRET//NF)";
        let parsed = parse_portion("(SECRET//NF)");

        // SECRET starts at byte 1 (after the open paren), runs to byte 7.
        let cls = span_of(&parsed, TokenKind::Classification);
        assert_eq!((cls.span.start, cls.span.end), (1, 7));
        assert_eq!(cls.span.as_str(src).unwrap(), "SECRET");

        // NF starts at byte 9 (after `SECRET//`).
        let dissem = span_of(&parsed, TokenKind::DissemControl);
        assert_eq!((dissem.span.start, dissem.span.end), (9, 11));
    }

    #[test]
    fn token_spans_record_unknown_token() {
        let parsed = parse_banner("SECRET//XYZZY//NOFORN");
        let unknowns = spans_of(&parsed, TokenKind::Unknown);
        assert_eq!(unknowns.len(), 1);
        assert_eq!(
            unknowns[0].span.as_str(b"SECRET//XYZZY//NOFORN").unwrap(),
            "XYZZY"
        );
    }

    #[test]
    fn token_spans_record_rel_to_trigraphs() {
        let src = b"SECRET//REL TO USA, GBR, AUS";
        let parsed = parse_banner("SECRET//REL TO USA, GBR, AUS");
        let texts: Vec<&str> = spans_of(&parsed, TokenKind::RelToTrigraph)
            .iter()
            .map(|t| t.span.as_str(src).unwrap())
            .collect();
        assert_eq!(texts, ["USA", "GBR", "AUS"]);
    }

    #[test]
    fn token_spans_record_separators() {
        let src = b"SECRET//NF";
        let parsed = parse_banner("SECRET//NF");
        let seps = spans_of(&parsed, TokenKind::Separator);
        assert_eq!(seps.len(), 1);
        assert_eq!(seps[0].span.as_str(src).unwrap(), "//");
    }
}