marque_core/
parser.rs

1//! Phase 2/3: token extraction and structural parsing.
2//!
3//! Takes [`MarkingCandidate`] spans from the scanner and produces [`IsmAttributes`].
4//!
5//! # Phase 2 — Token Extraction
6//! A compile-time Aho-Corasick automaton (built from CVE token list in marque-capco)
7//! runs over each candidate span, identifying known tokens and their positions.
8//! Unrecognized tokens within a candidate boundary are themselves diagnostics.
9//!
10//! # Phase 3 — Structural Parsing
11//! Token sequence → IsmAttributes. Validates ordering and block structure.
12//! Produces `ParseError` for structural violations; these feed into the rule engine
13//! as diagnostics with associated fixes.
14//!
15//! Note: the Aho-Corasick automaton is injected via `TokenSet` to keep marque-core
16//! free of a direct dependency on marque-capco's generated data.
17
18use crate::error::CoreError;
19use marque_ism::attrs::{
20    AeaMarking, Classification, DeclassExemption, DissemControl, FgiClassification, FgiMarker,
21    ForeignClassification, IsmAttributes, JointClassification, MarkingClassification,
22    NatoClassification, NonIcDissem, SarIdentifier, SciControl, TokenKind, TokenSpan, Trigraph,
23};
24use marque_ism::span::{MarkingCandidate, MarkingType, Span};
25use marque_ism::token_set::TokenSet;
26
27/// Parse result for a single candidate.
28#[derive(Debug)]
29pub struct ParsedMarking {
30    pub attrs: IsmAttributes,
31    pub source_span: Span,
32    pub kind: MarkingType,
33}
34
35/// Phase 2+3 parser. Stateless; call [`Parser::parse`] per candidate.
36pub struct Parser<'t> {
37    tokens: &'t dyn TokenSet,
38}
39
40impl<'t> Parser<'t> {
41    pub fn new(tokens: &'t dyn TokenSet) -> Self {
42        Self { tokens }
43    }
44
45    /// Parse a single scanner candidate into [`IsmAttributes`].
46    pub fn parse(
47        &self,
48        candidate: &MarkingCandidate,
49        source: &[u8],
50    ) -> Result<ParsedMarking, CoreError> {
51        let text = candidate
52            .span
53            .as_str(source)
54            .map_err(|_| CoreError::InvalidUtf8(candidate.span))?;
55        match candidate.kind {
56            MarkingType::Portion => self.parse_portion(text, candidate),
57            MarkingType::Banner => self.parse_banner(text, candidate),
58            MarkingType::Cab => self.parse_cab(text, candidate),
59            // PageBreak candidates are scanner-emitted boundaries with no
60            // parsable content. Engine::lint filters them out before calling
61            // `parse`; reaching this arm is a programming error in the
62            // pipeline, so a `MalformedMarking` is the right surface.
63            MarkingType::PageBreak => Err(CoreError::MalformedMarking(
64                "page-break candidate must not be parsed".to_owned(),
65            )),
66        }
67    }
68
69    fn parse_portion(
70        &self,
71        text: &str,
72        candidate: &MarkingCandidate,
73    ) -> Result<ParsedMarking, CoreError> {
74        // Strip outer parentheses: "(TS//SI//NF)" -> "TS//SI//NF"
75        // The inner-string offset is `candidate.span.start + 1` because
76        // the leading `(` is one byte (verified ASCII by the scanner).
77        let inner = text
78            .strip_prefix('(')
79            .and_then(|s| s.strip_suffix(')'))
80            .ok_or_else(|| CoreError::MalformedMarking(text.to_owned()))?;
81
82        let attrs =
83            self.parse_marking_string(inner, MarkingType::Portion, candidate.span.start + 1)?;
84        Ok(ParsedMarking {
85            attrs,
86            source_span: candidate.span,
87            kind: MarkingType::Portion,
88        })
89    }
90
91    fn parse_banner(
92        &self,
93        text: &str,
94        candidate: &MarkingCandidate,
95    ) -> Result<ParsedMarking, CoreError> {
96        // For banner candidates, `text` is the full line bytes from the
97        // scanner. `text.trim()` may consume leading whitespace, which
98        // shifts the per-token offsets. Compute the leading whitespace
99        // length so we can add it to candidate.span.start.
100        let trimmed = text.trim_start();
101        let lead_ws = text.len() - trimmed.len();
102        let trimmed = trimmed.trim_end();
103        let attrs = self.parse_marking_string(
104            trimmed,
105            MarkingType::Banner,
106            candidate.span.start + lead_ws,
107        )?;
108        Ok(ParsedMarking {
109            attrs,
110            source_span: candidate.span,
111            kind: MarkingType::Banner,
112        })
113    }
114
115    fn parse_cab(
116        &self,
117        text: &str,
118        candidate: &MarkingCandidate,
119    ) -> Result<ParsedMarking, CoreError> {
120        // CAB is line-structured: "Classified By: ...\nDerived From: ...\nDeclassify On: ..."
121        let mut attrs = IsmAttributes::default();
122
123        for line in text.lines() {
124            if let Some(val) = line.strip_prefix("Classified By:") {
125                attrs.classified_by = Some(val.trim().into());
126            } else if let Some(val) = line.strip_prefix("Derived From:") {
127                attrs.derived_from = Some(val.trim().into());
128            } else if let Some(val) = line.strip_prefix("Declassify On:") {
129                let s = val.trim();
130                if let Some(exemption) = DeclassExemption::parse(s) {
131                    attrs.declass_exemption = Some(exemption);
132                } else {
133                    attrs.declassify_on = Some(s.into());
134                }
135            }
136        }
137
138        Ok(ParsedMarking {
139            attrs,
140            source_span: candidate.span,
141            kind: MarkingType::Cab,
142        })
143    }
144
145    /// Parse a marking string (without outer parentheses) into IsmAttributes.
146    /// Handles both portion form (abbreviated) and banner form (full words).
147    ///
148    /// `s_offset` is the absolute byte offset of `s` within the original
149    /// source buffer. Phase 3 uses it to record per-token absolute spans on
150    /// `IsmAttributes::token_spans` so rules can point at byte-precise
151    /// diagnostic locations.
152    fn parse_marking_string(
153        &self,
154        s: &str,
155        context: MarkingType,
156        s_offset: usize,
157    ) -> Result<IsmAttributes, CoreError> {
158        let mut attrs = IsmAttributes::default();
159
160        if s.is_empty() {
161            return Err(CoreError::MalformedMarking(s.to_owned()));
162        }
163
164        // Walk separator (`//`) positions inside `s`. Each block is the
165        // substring between consecutive separators (or string ends). Track
166        // both the block content and its inner offset so we can compute
167        // per-token absolute spans.
168        let separators: Vec<usize> = s.match_indices("//").map(|(i, _)| i).collect();
169        let mut block_ranges: Vec<(usize, usize)> = Vec::with_capacity(separators.len() + 1);
170        let mut prev_end = 0usize;
171        for &sep_start in &separators {
172            block_ranges.push((prev_end, sep_start));
173            prev_end = sep_start + 2; // skip the `//`
174        }
175        block_ranges.push((prev_end, s.len()));
176
177        let mut token_spans: Vec<TokenSpan> = Vec::new();
178
179        let mut sci: Vec<SciControl> = Vec::new();
180        let mut sar: Vec<SarIdentifier> = Vec::new();
181        let mut aea: Vec<AeaMarking> = Vec::new();
182        let mut dissem: Vec<DissemControl> = Vec::new();
183        let mut non_ic: Vec<NonIcDissem> = Vec::new();
184        let mut rel_to: Vec<Trigraph> = Vec::new();
185
186        // When the marking starts with `//`, block 0 is empty and the
187        // classification is non-US (FGI, NATO, or JOINT). Block 1 carries
188        // the foreign classification.
189        let is_non_us = s.starts_with("//");
190
191        for (idx, &(rel_start, rel_end)) in block_ranges.iter().enumerate() {
192            let raw = &s[rel_start..rel_end];
193            let trimmed = raw.trim();
194            if trimmed.is_empty() {
195                continue;
196            }
197            let trim_lead = raw.len() - raw.trim_start().len();
198            let abs_start = s_offset + rel_start + trim_lead;
199            let abs_end = abs_start + trimmed.len();
200            let span = Span::new(abs_start, abs_end);
201
202            // ---------------------------------------------------------------
203            // Block 0: US classification (or empty for non-US markings)
204            // ---------------------------------------------------------------
205            if idx == 0 && !is_non_us {
206                attrs.classification = parse_classification(trimmed).map(MarkingClassification::Us);
207                token_spans.push(TokenSpan {
208                    kind: TokenKind::Classification,
209                    span,
210                    text: trimmed.into(),
211                });
212                continue;
213            }
214
215            // ---------------------------------------------------------------
216            // Block 1 when non-US: foreign classification
217            // ---------------------------------------------------------------
218            if idx == 1 && is_non_us {
219                if let Some(nato) = parse_nato_classification(trimmed) {
220                    attrs.classification = Some(MarkingClassification::Nato(nato));
221                } else if let Some(joint) = parse_joint_classification(trimmed) {
222                    attrs.classification = Some(MarkingClassification::Joint(joint));
223                } else if let Some(fgi) = parse_fgi_classification(trimmed) {
224                    attrs.classification = Some(MarkingClassification::Fgi(fgi));
225                } else {
226                    // Unrecognized non-US classification block.
227                    token_spans.push(TokenSpan {
228                        kind: TokenKind::Unknown,
229                        span,
230                        text: trimmed.into(),
231                    });
232                    continue;
233                }
234                token_spans.push(TokenSpan {
235                    kind: TokenKind::Classification,
236                    span,
237                    text: trimmed.into(),
238                });
239                continue;
240            }
241
242            // ---------------------------------------------------------------
243            // Remaining blocks: controls, markers, and fallbacks
244            // ---------------------------------------------------------------
245
246            if trimmed.starts_with("REL TO") || trimmed.starts_with("REL ") {
247                // Record the full block text before the individual trigraph tokens
248                // so token_spans maintains a logical ordering (block → constituents).
249                token_spans.push(TokenSpan {
250                    kind: TokenKind::RelToBlock,
251                    span,
252                    text: trimmed.into(),
253                });
254                let parsed_trigraphs =
255                    parse_rel_to_with_spans(trimmed, abs_start, self.tokens, &mut token_spans);
256                rel_to.extend(parsed_trigraphs);
257            } else if let Some(ctrl) = SciControl::parse(trimmed) {
258                sci.push(ctrl);
259                token_spans.push(TokenSpan {
260                    kind: TokenKind::SciControl,
261                    span,
262                    text: trimmed.into(),
263                });
264            } else if trimmed.starts_with("FGI")
265                && matches!(attrs.classification, Some(MarkingClassification::Us(_)))
266            {
267                // FGI marker in a US-classified marking (e.g., SECRET//FGI DEU//NF).
268                if let Some(marker) = parse_fgi_marker(trimmed) {
269                    attrs.fgi_marker = Some(marker);
270                    token_spans.push(TokenSpan {
271                        kind: TokenKind::FgiMarker,
272                        span,
273                        text: trimmed.into(),
274                    });
275                }
276            } else if let Some(ctrl) =
277                DissemControl::parse(trimmed).or_else(|| parse_dissem_full_form(trimmed))
278            {
279                dissem.push(ctrl);
280                token_spans.push(TokenSpan {
281                    kind: TokenKind::DissemControl,
282                    span,
283                    text: trimmed.into(),
284                });
285            } else if let Some(nic) = NonIcDissem::parse(trimmed) {
286                non_ic.push(nic);
287                token_spans.push(TokenSpan {
288                    kind: TokenKind::NonIcDissem,
289                    span,
290                    text: trimmed.into(),
291                });
292            } else if let Some(sar_id) = SarIdentifier::parse(trimmed) {
293                sar.push(sar_id);
294                token_spans.push(TokenSpan {
295                    kind: TokenKind::SarIdentifier,
296                    span,
297                    text: trimmed.into(),
298                });
299            } else if let Some(aea_marking) = AeaMarking::parse(trimmed) {
300                aea.push(aea_marking);
301                token_spans.push(TokenSpan {
302                    kind: TokenKind::AeaMarking,
303                    span,
304                    text: trimmed.into(),
305                });
306            } else if let Some(exemption) = DeclassExemption::parse(trimmed) {
307                attrs.declass_exemption = Some(exemption);
308                token_spans.push(TokenSpan {
309                    kind: TokenKind::DeclassExemption,
310                    span,
311                    text: trimmed.into(),
312                });
313            } else if is_declass_date(trimmed) {
314                attrs.declassify_on = Some(trimmed.into());
315                token_spans.push(TokenSpan {
316                    kind: TokenKind::DeclassDate,
317                    span,
318                    text: trimmed.into(),
319                });
320            } else if let Some(foreign) = try_parse_foreign_classification(trimmed) {
321                // Conflict: a foreign classification in a marking that already
322                // has a US classification. US wins at the greater of the two.
323                if let Some(MarkingClassification::Us(us_level)) = attrs.classification {
324                    let foreign_equiv = match &foreign {
325                        ForeignClassification::Nato(n) => n.us_equivalent(),
326                        ForeignClassification::Fgi(f) => f.level,
327                        ForeignClassification::Joint(j) => j.level,
328                    };
329                    let max_level = us_level.max(foreign_equiv);
330                    attrs.classification = Some(MarkingClassification::Conflict {
331                        us: max_level,
332                        foreign: Box::new(foreign),
333                    });
334                    token_spans.push(TokenSpan {
335                        kind: TokenKind::Classification,
336                        span,
337                        text: trimmed.into(),
338                    });
339                } else {
340                    // No prior US classification — just Unknown.
341                    token_spans.push(TokenSpan {
342                        kind: TokenKind::Unknown,
343                        span,
344                        text: trimmed.into(),
345                    });
346                }
347            } else if trimmed.contains('/') && !trimmed.starts_with("REL") {
348                // Multi-token block per CAPCO §D.1: multiple entries within a
349                // **single category** are separated by `/` (e.g., "SI/TK", "NF/RD").
350                // First, speculatively parse all sub-tokens. If all recognized sub-tokens
351                // belong to the same category, commit them. If categories are mixed
352                // (e.g., "SI/NF" — SCI + dissem in one block), the `/` is a stray
353                // separator that should have been `//`; emit the whole block as Unknown
354                // so E004 can detect and fix the missing `//`.
355
356                #[derive(Clone, Copy, PartialEq, Eq)]
357                enum SubKind {
358                    Sci,
359                    Dissem,
360                    NonIc,
361                    Sar,
362                    Aea,
363                    Unknown,
364                }
365
366                struct SubResult<'a> {
367                    kind: SubKind,
368                    tok: &'a str,
369                    span: Span,
370                    // Parsed values — stored here before committing.
371                    sci: Option<SciControl>,
372                    dissem: Option<DissemControl>,
373                    nic: Option<NonIcDissem>,
374                    sar: Option<SarIdentifier>,
375                    aea: Option<AeaMarking>,
376                }
377
378                let mut results: Vec<SubResult<'_>> = Vec::new();
379                for (sub_off, sub_tok) in split_slash_with_offsets(trimmed) {
380                    let sub_abs_start = abs_start + sub_off;
381                    let sub_span = Span::new(sub_abs_start, sub_abs_start + sub_tok.len());
382                    if let Some(ctrl) = SciControl::parse(sub_tok) {
383                        results.push(SubResult {
384                            kind: SubKind::Sci,
385                            tok: sub_tok,
386                            span: sub_span,
387                            sci: Some(ctrl),
388                            dissem: None,
389                            nic: None,
390                            sar: None,
391                            aea: None,
392                        });
393                    } else if let Some(ctrl) =
394                        DissemControl::parse(sub_tok).or_else(|| parse_dissem_full_form(sub_tok))
395                    {
396                        results.push(SubResult {
397                            kind: SubKind::Dissem,
398                            tok: sub_tok,
399                            span: sub_span,
400                            sci: None,
401                            dissem: Some(ctrl),
402                            nic: None,
403                            sar: None,
404                            aea: None,
405                        });
406                    } else if let Some(nic) = NonIcDissem::parse(sub_tok) {
407                        results.push(SubResult {
408                            kind: SubKind::NonIc,
409                            tok: sub_tok,
410                            span: sub_span,
411                            sci: None,
412                            dissem: None,
413                            nic: Some(nic),
414                            sar: None,
415                            aea: None,
416                        });
417                    } else if let Some(sar_id) = SarIdentifier::parse(sub_tok) {
418                        results.push(SubResult {
419                            kind: SubKind::Sar,
420                            tok: sub_tok,
421                            span: sub_span,
422                            sci: None,
423                            dissem: None,
424                            nic: None,
425                            sar: Some(sar_id),
426                            aea: None,
427                        });
428                    } else if let Some(aea_marking) = AeaMarking::parse(sub_tok) {
429                        results.push(SubResult {
430                            kind: SubKind::Aea,
431                            tok: sub_tok,
432                            span: sub_span,
433                            sci: None,
434                            dissem: None,
435                            nic: None,
436                            sar: None,
437                            aea: Some(aea_marking),
438                        });
439                    } else {
440                        results.push(SubResult {
441                            kind: SubKind::Unknown,
442                            tok: sub_tok,
443                            span: sub_span,
444                            sci: None,
445                            dissem: None,
446                            nic: None,
447                            sar: None,
448                            aea: None,
449                        });
450                    }
451                }
452
453                // Check category consistency: all parsed (non-Unknown) sub-tokens
454                // must share the same category for `/` to be a valid intra-block
455                // separator. Mixed categories (e.g., SCI + dissem) mean the `/`
456                // is a stray single-slash separator that should have been `//`.
457                let first_parsed_kind = results
458                    .iter()
459                    .find(|r| r.kind != SubKind::Unknown)
460                    .map(|r| r.kind);
461                let all_same_category = first_parsed_kind.is_some_and(|first| {
462                    results
463                        .iter()
464                        .filter(|r| r.kind != SubKind::Unknown)
465                        .all(|r| r.kind == first)
466                });
467
468                if first_parsed_kind.is_some() && !all_same_category {
469                    // Mixed categories: the `/` is a stray separator.
470                    // Emit the whole block as Unknown so E004 can detect it.
471                    token_spans.push(TokenSpan {
472                        kind: TokenKind::Unknown,
473                        span,
474                        text: trimmed.into(),
475                    });
476                } else {
477                    // Same category (or all unknown): commit sub-token results.
478                    for r in results {
479                        match r.kind {
480                            SubKind::Sci => {
481                                sci.push(r.sci.unwrap());
482                                token_spans.push(TokenSpan {
483                                    kind: TokenKind::SciControl,
484                                    span: r.span,
485                                    text: r.tok.into(),
486                                });
487                            }
488                            SubKind::Dissem => {
489                                dissem.push(r.dissem.unwrap());
490                                token_spans.push(TokenSpan {
491                                    kind: TokenKind::DissemControl,
492                                    span: r.span,
493                                    text: r.tok.into(),
494                                });
495                            }
496                            SubKind::NonIc => {
497                                non_ic.push(r.nic.unwrap());
498                                token_spans.push(TokenSpan {
499                                    kind: TokenKind::NonIcDissem,
500                                    span: r.span,
501                                    text: r.tok.into(),
502                                });
503                            }
504                            SubKind::Sar => {
505                                sar.push(r.sar.unwrap());
506                                token_spans.push(TokenSpan {
507                                    kind: TokenKind::SarIdentifier,
508                                    span: r.span,
509                                    text: r.tok.into(),
510                                });
511                            }
512                            SubKind::Aea => {
513                                aea.push(r.aea.unwrap());
514                                token_spans.push(TokenSpan {
515                                    kind: TokenKind::AeaMarking,
516                                    span: r.span,
517                                    text: r.tok.into(),
518                                });
519                            }
520                            SubKind::Unknown => {
521                                // Unrecognized sub-token within a same-category block.
522                                // E008 fires one diagnostic per Unknown span.
523                                token_spans.push(TokenSpan {
524                                    kind: TokenKind::Unknown,
525                                    span: r.span,
526                                    text: r.tok.into(),
527                                });
528                            }
529                        }
530                    }
531                }
532            } else {
533                token_spans.push(TokenSpan {
534                    kind: TokenKind::Unknown,
535                    span,
536                    text: trimmed.into(),
537                });
538            }
539        }
540
541        attrs.sci_controls = sci.into_boxed_slice();
542        attrs.sar_identifiers = sar.into_boxed_slice();
543        attrs.aea_markings = aea.into_boxed_slice();
544        attrs.dissem_controls = dissem.into_boxed_slice();
545        attrs.non_ic_dissem = non_ic.into_boxed_slice();
546        attrs.rel_to = rel_to.into_boxed_slice();
547        // Record separator spans (Phase 3 needs them for E004). Push them
548        // here alongside block tokens, then sort by start offset so the
549        // final slice is in document (source) order.
550        for &sep_start in &separators {
551            token_spans.push(TokenSpan {
552                kind: TokenKind::Separator,
553                span: Span::new(s_offset + sep_start, s_offset + sep_start + 2),
554                text: "//".into(),
555            });
556        }
557        token_spans.sort_unstable_by_key(|ts| ts.span.start);
558        attrs.token_spans = token_spans.into_boxed_slice();
559
560        let _ = context; // used for future context-aware validation
561
562        Ok(attrs)
563    }
564}
565
566/// Parse a classification string in either portion form (`"TS"`, `"S"`, `"C"`,
567/// `"R"`, `"U"`) or banner form (`"TOP SECRET"`, `"SECRET"`, ...).
568///
569/// Includes RESTRICTED/R for foreign-origin markings (between U and C).
570///
571/// Note: `Classification` is hand-written in `marque-ism::attrs` rather than
572/// generated from the CVE because the CVE only ships single-letter abbreviations
573/// and the tool needs both forms. Other CVE-derived enums (`SciControl`,
574/// `DissemControl`, `SarIdentifier`, `DeclassExemption`) go through their
575/// generated `parse()` methods.
576fn parse_classification(s: &str) -> Option<Classification> {
577    match s {
578        "TS" | "TOP SECRET" => Some(Classification::TopSecret),
579        "S" | "SECRET" => Some(Classification::Secret),
580        "C" | "CONFIDENTIAL" => Some(Classification::Confidential),
581        "R" | "RESTRICTED" => Some(Classification::Restricted),
582        "U" | "UNCLASSIFIED" => Some(Classification::Unclassified),
583        _ => None,
584    }
585}
586
587/// Parse a NATO classification string in either banner form (`"NATO SECRET"`,
588/// `"COSMIC TOP SECRET"`, etc.) or portion form (`"NS"`, `"CTS"`, etc.).
589///
590/// Includes SAP variants (ATOMAL, BOHEMIA, BALK). Longer patterns are checked
591/// first to avoid prefix ambiguity (e.g., `"COSMIC TOP SECRET ATOMAL"` before
592/// `"COSMIC TOP SECRET"`).
593fn parse_nato_classification(s: &str) -> Option<NatoClassification> {
594    // Check longer patterns first to avoid prefix matches.
595    match s {
596        // Banner forms (full words) — longer patterns first
597        "COSMIC TOP SECRET ATOMAL" => Some(NatoClassification::CosmicTopSecretAtomal),
598        "COSMIC TOP SECRET-BOHEMIA" => Some(NatoClassification::CosmicTopSecretBohemia),
599        "COSMIC TOP SECRET-BALK" => Some(NatoClassification::CosmicTopSecretBalk),
600        "COSMIC TOP SECRET" => Some(NatoClassification::CosmicTopSecret),
601        "NATO SECRET ATOMAL" => Some(NatoClassification::NatoSecretAtomal),
602        "NATO SECRET" => Some(NatoClassification::NatoSecret),
603        "NATO CONFIDENTIAL ATOMAL" => Some(NatoClassification::NatoConfidentialAtomal),
604        "NATO CONFIDENTIAL" => Some(NatoClassification::NatoConfidential),
605        "NATO RESTRICTED" => Some(NatoClassification::NatoRestricted),
606        "NATO UNCLASSIFIED" => Some(NatoClassification::NatoUnclassified),
607        // Portion forms — primary (CAPCO Register)
608        "CTSA" | "CTS-A" => Some(NatoClassification::CosmicTopSecretAtomal),
609        "CTS-B" => Some(NatoClassification::CosmicTopSecretBohemia),
610        "CTS-BALK" => Some(NatoClassification::CosmicTopSecretBalk),
611        "CTS" => Some(NatoClassification::CosmicTopSecret),
612        "NSAT" | "NS-A" => Some(NatoClassification::NatoSecretAtomal),
613        "NS" => Some(NatoClassification::NatoSecret),
614        "NCA" | "NC-A" => Some(NatoClassification::NatoConfidentialAtomal),
615        "NC" => Some(NatoClassification::NatoConfidential),
616        "NR" => Some(NatoClassification::NatoRestricted),
617        "NU" => Some(NatoClassification::NatoUnclassified),
618        _ => None,
619    }
620}
621
622/// Parse a JOINT classification block: `"JOINT S USA GBR"` or `"JOINT SECRET USA GBR"`.
623///
624/// Format: `JOINT` + classification level + space-delimited country trigraphs.
625/// Countries are space-delimited (NOT comma-delimited like REL TO).
626fn parse_joint_classification(s: &str) -> Option<JointClassification> {
627    let rest = s.strip_prefix("JOINT ")?;
628    let mut tokens = rest.split_whitespace();
629
630    // First token(s) after JOINT are the classification level.
631    // Handle two-word levels like "TOP SECRET".
632    let first = tokens.next()?;
633    let (level, remaining_start) = if first == "TOP" {
634        // Check if next token is "SECRET" to form "TOP SECRET"
635        let mut peek_tokens = rest.split_whitespace();
636        peek_tokens.next(); // skip "TOP"
637        if peek_tokens.next() == Some("SECRET") {
638            let level = parse_classification("TOP SECRET")?;
639            // Skip past "TOP SECRET" — countries start after
640            let after_ts = rest.find("SECRET").map(|i| i + "SECRET".len())?;
641            (level, after_ts)
642        } else {
643            return None; // "TOP" alone is not a valid level
644        }
645    } else {
646        let level = parse_classification(first)?;
647        let after_level = rest.find(first).map(|i| i + first.len())?;
648        (level, after_level)
649    };
650
651    // Remaining tokens are space-delimited country trigraphs.
652    let country_str = rest[remaining_start..].trim();
653    let mut countries = Vec::new();
654    for token in country_str.split_whitespace() {
655        if token.len() == 3 {
656            if let Some(t) = Trigraph::try_new(token.as_bytes().try_into().ok()?) {
657                countries.push(t);
658            }
659        }
660        // Skip non-trigraph tokens (tetragraphs like NATO handled later)
661    }
662
663    if countries.is_empty() {
664        return None; // JOINT must have at least one country
665    }
666
667    Some(JointClassification {
668        level,
669        countries: countries.into(),
670    })
671}
672
673/// Parse an FGI classification block: `"GBR S"`, `"DEU TS"`, `"GBR DEU S"`,
674/// or `"FGI S"` (FGI as placeholder for unknown country).
675///
676/// Format: one or more country trigraphs (or "FGI") + classification level.
677/// Countries are space-delimited. The last token is the classification level.
678///
679/// Returns `None` if no classification level is found (e.g., bare `"FGI"` with
680/// no level — that's an error, not a valid FGI classification).
681fn parse_fgi_classification(s: &str) -> Option<FgiClassification> {
682    let tokens: Vec<&str> = s.split_whitespace().collect();
683    if tokens.len() < 2 {
684        return None; // Need at least country + level
685    }
686
687    // Last token is the classification level. Handle "TOP SECRET" as two tokens.
688    let (level, country_end) = if tokens.len() >= 3
689        && tokens[tokens.len() - 2] == "TOP"
690        && tokens[tokens.len() - 1] == "SECRET"
691    {
692        (parse_classification("TOP SECRET")?, tokens.len() - 2)
693    } else {
694        (
695            parse_classification(tokens[tokens.len() - 1])?,
696            tokens.len() - 1,
697        )
698    };
699
700    // Preceding tokens are country trigraphs (or "FGI" placeholder).
701    let mut countries = Vec::new();
702    for &token in &tokens[..country_end] {
703        if token == "FGI" {
704            // FGI as placeholder for unknown country — countries stays empty
705            continue;
706        }
707        if token.len() == 3 {
708            if let Some(t) = Trigraph::try_new(token.as_bytes().try_into().ok()?) {
709                countries.push(t);
710            } else {
711                return None; // Invalid trigraph
712            }
713        } else {
714            return None; // Not a trigraph or "FGI"
715        }
716    }
717
718    Some(FgiClassification {
719        countries: countries.into(),
720        level,
721    })
722}
723
724/// Parse an FGI marker block in a US-classified marking: `"FGI"` or `"FGI DEU"` or `"FGI DEU GBR"`.
725///
726/// This is the FGI block between SAR and dissem controls in a US-classified
727/// marking (e.g., `SECRET//FGI DEU//NOFORN`). Not to be confused with
728/// [`parse_fgi_classification`] which parses a non-US classification.
729fn parse_fgi_marker(s: &str) -> Option<FgiMarker> {
730    if s == "FGI" {
731        return Some(FgiMarker {
732            countries: Box::new([]),
733        });
734    }
735
736    let rest = s.strip_prefix("FGI ")?;
737    let mut countries = Vec::new();
738    for token in rest.split_whitespace() {
739        if token.len() == 3 {
740            if let Some(t) = Trigraph::try_new(token.as_bytes().try_into().ok()?) {
741                countries.push(t);
742            }
743        }
744        // Skip non-trigraph tokens for now (tetragraphs like NATO)
745    }
746
747    Some(FgiMarker {
748        countries: countries.into(),
749    })
750}
751
752/// Attempt to parse a block as a foreign classification (NATO, JOINT, or FGI).
753///
754/// Used as a fallback in the block loop to detect conflict scenarios
755/// (e.g., `SECRET//NATO SECRET//NOFORN`) where a foreign classification
756/// appears alongside a US classification.
757fn try_parse_foreign_classification(s: &str) -> Option<ForeignClassification> {
758    if let Some(nato) = parse_nato_classification(s) {
759        Some(ForeignClassification::Nato(nato))
760    } else if let Some(joint) = parse_joint_classification(s) {
761        Some(ForeignClassification::Joint(joint))
762    } else {
763        parse_fgi_classification(s).map(ForeignClassification::Fgi)
764    }
765}
766
767/// Map a banner-form (full-word) dissemination control to its CVE
768/// abbreviation form. The CVE only ships abbreviations (`NF`, `OC`, ...),
769/// but banner markings use the full words (`NOFORN`, `ORCON`, ...) and the
770/// parser must accept both. Phase 3 added this fallback so banner-form
771/// markings parse cleanly into a typed `DissemControl`.
772///
773/// Rules that detect "banner uses portion abbreviation" (E001) read the
774/// raw token span via `attrs.token_spans` and inspect the original bytes,
775/// so this mapping does not lose the abbreviation-vs-full-word signal.
776///
777/// Mapping data sourced from [`marque_ism::marking_forms`].
778fn parse_dissem_full_form(s: &str) -> Option<DissemControl> {
779    let portion = marque_ism::marking_forms::banner_to_portion(s)?;
780    DissemControl::parse(portion)
781}
782
783/// Span-aware parse of a `REL TO ...` block. Records one
784/// `TokenKind::RelToTrigraph` per recognized country code.
785///
786/// `block_offset` is the absolute byte offset of `block` within the
787/// original source buffer.
788fn parse_rel_to_with_spans(
789    block: &str,
790    block_offset: usize,
791    tokens: &dyn TokenSet,
792    token_spans: &mut Vec<TokenSpan>,
793) -> Vec<Trigraph> {
794    // Skip the "REL TO" / "REL" prefix to land on the trigraph list. We
795    // need the offset of the *trigraph list* within `block` so that each
796    // trigraph's absolute span can be computed.
797    let prefix_skip = if let Some(rest) = block.strip_prefix("REL TO") {
798        block.len() - rest.len()
799    } else if let Some(rest) = block.strip_prefix("REL") {
800        block.len() - rest.len()
801    } else {
802        0
803    };
804    let after_rel = &block[prefix_skip..];
805
806    let mut out: Vec<Trigraph> = Vec::new();
807    // Walk comma-separated entries, tracking each entry's offset within
808    // `after_rel` so we can land an absolute span on the trigraph itself
809    // (not on any leading whitespace).
810    let mut cursor = 0usize;
811    for entry in after_rel.split(',') {
812        let entry_start_in_after = cursor;
813        // Advance past the entry and its trailing comma. On the final
814        // iteration this steps one past the end of `after_rel`, but the
815        // cursor is never read after the loop ends — the split iterator
816        // drives loop termination, not the cursor. usize addition here
817        // is bounded by the document size, so no overflow in practice.
818        cursor += entry.len() + 1;
819
820        let trim_lead = entry.len() - entry.trim_start().len();
821        let trimmed = entry.trim();
822        if trimmed.is_empty() || !tokens.is_trigraph(trimmed) {
823            continue;
824        }
825        let b = trimmed.as_bytes();
826        if b.len() != 3 {
827            continue;
828        }
829        let Some(t) = Trigraph::try_new([b[0], b[1], b[2]]) else {
830            continue;
831        };
832        out.push(t);
833        let abs_start = block_offset + prefix_skip + entry_start_in_after + trim_lead;
834        token_spans.push(TokenSpan {
835            kind: TokenKind::RelToTrigraph,
836            span: Span::new(abs_start, abs_start + 3),
837            text: trimmed.into(),
838        });
839    }
840    out
841}
842
843// SCI controls, dissemination controls, SAR identifiers, and declass
844// exemptions all parse via their generated `parse()` methods (see
845// `parse_marking_string` above). The single hand-coded path is
846// `parse_classification`, which is documented inline.
847
/// Reports whether `s` has the shape of an inline declassification date.
///
/// CAPCO allows `YYYYMMDD` (8-digit) or `YYYY` (4-digit, meaning declassify
/// at the start of that calendar year). Both forms are valid in a CAB but
/// are a violation (E005) if they appear directly in a banner or portion
/// marking string.
///
/// The check is purely lexical: `s` must be exactly 4 or 8 bytes long and
/// consist only of ASCII digits. No calendar validation is performed.
fn is_declass_date(s: &str) -> bool {
    match s.len() {
        4 | 8 => s.bytes().all(|byte| byte.is_ascii_digit()),
        _ => false,
    }
}
858
/// Breaks `s` apart at every `/` and yields `(offset, trimmed_token)` pairs,
/// where `offset` is the byte position of the trimmed token inside `s`.
///
/// Pieces that are empty after trimming (for example the gap inside `//`)
/// are dropped, but they still advance the running offset.
///
/// Used by the multi-token block fallback to handle CAPCO §D.1 blocks like
/// `"SI/TK"` or `"NF/LIMDIS"` where multiple entries share one `//` block.
fn split_slash_with_offsets(s: &str) -> Vec<(usize, &str)> {
    // Byte offset in `s` at which the piece currently being visited begins.
    let mut piece_start = 0usize;
    s.split('/')
        .filter_map(|piece| {
            let start = piece_start;
            // Account for the piece and the `/` that terminated it.
            piece_start += piece.len() + 1;

            let token = piece.trim();
            if token.is_empty() {
                return None;
            }
            let leading_ws = piece.len() - piece.trim_start().len();
            Some((start + leading_ws, token))
        })
        .collect()
}
877
878#[cfg(test)]
879mod tests {
880    use super::*;
881    use marque_ism::span::{MarkingCandidate, MarkingType, Span};
882    use marque_ism::token_set::CapcoTokenSet;
883
884    fn make_candidate(text: &[u8], kind: MarkingType, offset: usize) -> MarkingCandidate {
885        MarkingCandidate {
886            span: Span::new(offset, offset + text.len()),
887            kind,
888        }
889    }
890
891    fn parse_banner(text: &str) -> ParsedMarking {
892        let source = text.as_bytes();
893        let tokens = CapcoTokenSet;
894        let parser = Parser::new(&tokens);
895        let candidate = make_candidate(source, MarkingType::Banner, 0);
896        parser
897            .parse(&candidate, source)
898            .expect("parse should succeed")
899    }
900
901    fn parse_portion(text: &str) -> ParsedMarking {
902        let source = text.as_bytes();
903        let tokens = CapcoTokenSet;
904        let parser = Parser::new(&tokens);
905        let candidate = make_candidate(source, MarkingType::Portion, 0);
906        parser
907            .parse(&candidate, source)
908            .expect("parse should succeed")
909    }
910
911    // --- declass exemption in banner (E005 detection) ---
912
913    #[test]
914    fn banner_with_declass_exemption_populates_attrs() {
915        // A banner string that (incorrectly) contains a declass exemption code.
916        // parse_marking_string must populate declass_exemption so E005 can fire.
917        let parsed = parse_banner("SECRET//25X1//NOFORN");
918        assert!(
919            parsed.attrs.declass_exemption.is_some(),
920            "declass_exemption should be populated when 25X1 appears in banner"
921        );
922        use marque_ism::DeclassExemption;
923        assert_eq!(
924            parsed.attrs.declass_exemption,
925            Some(DeclassExemption::X25x1)
926        );
927    }
928
929    #[test]
930    fn portion_with_declass_exemption_populates_attrs() {
931        let parsed = parse_portion("(SECRET//50X1-HUM)");
932        assert!(parsed.attrs.declass_exemption.is_some());
933    }
934
935    // --- declass date in banner (E005 detection) ---
936
937    #[test]
938    fn banner_with_declass_date_populates_attrs() {
939        let parsed = parse_banner("SECRET//20301231//NOFORN");
940        assert_eq!(
941            parsed.attrs.declassify_on.as_deref(),
942            Some("20301231"),
943            "declassify_on should be populated when YYYYMMDD appears in banner"
944        );
945    }
946
947    #[test]
948    fn banner_with_four_digit_year_populates_attrs() {
949        let parsed = parse_banner("SECRET//2035");
950        assert_eq!(parsed.attrs.declassify_on.as_deref(), Some("2035"));
951    }
952
953    // --- normal banner (no declass tokens) ---
954
955    #[test]
956    fn banner_without_declass_leaves_fields_none() {
957        let parsed = parse_banner("TOP SECRET//SI//NOFORN");
958        assert!(parsed.attrs.declassify_on.is_none());
959        assert!(parsed.attrs.declass_exemption.is_none());
960    }
961
962    // --- is_declass_date helper ---
963
964    #[test]
965    fn is_declass_date_accepts_yyyymmdd() {
966        assert!(is_declass_date("20301231"));
967    }
968
969    #[test]
970    fn is_declass_date_accepts_yyyy() {
971        assert!(is_declass_date("2035"));
972    }
973
974    #[test]
975    fn is_declass_date_rejects_non_digit() {
976        assert!(!is_declass_date("2030X231"));
977        assert!(!is_declass_date("YYYYMMDD"));
978    }
979
980    #[test]
981    fn is_declass_date_rejects_wrong_length() {
982        assert!(!is_declass_date("203012"));
983        assert!(!is_declass_date("203012311"));
984    }
985
986    // --- token spans ---
987
988    #[test]
989    fn token_spans_track_offsets_in_banner() {
990        let parsed = parse_banner("TOP SECRET//SI//NF");
991        let kinds: Vec<TokenKind> = parsed.attrs.token_spans.iter().map(|t| t.kind).collect();
992        // Two separators + classification + sci + dissem.
993        assert!(kinds.contains(&TokenKind::Separator));
994        assert!(kinds.contains(&TokenKind::Classification));
995        assert!(kinds.contains(&TokenKind::SciControl));
996        assert!(kinds.contains(&TokenKind::DissemControl));
997
998        // Find each by kind and verify the byte slice matches.
999        let src = b"TOP SECRET//SI//NF";
1000        let cls = parsed
1001            .attrs
1002            .token_spans
1003            .iter()
1004            .find(|t| t.kind == TokenKind::Classification)
1005            .unwrap();
1006        assert_eq!(cls.span.as_str(src).unwrap(), "TOP SECRET");
1007
1008        let sci = parsed
1009            .attrs
1010            .token_spans
1011            .iter()
1012            .find(|t| t.kind == TokenKind::SciControl)
1013            .unwrap();
1014        assert_eq!(sci.span.as_str(src).unwrap(), "SI");
1015
1016        let dissem = parsed
1017            .attrs
1018            .token_spans
1019            .iter()
1020            .find(|t| t.kind == TokenKind::DissemControl)
1021            .unwrap();
1022        assert_eq!(dissem.span.as_str(src).unwrap(), "NF");
1023    }
1024
1025    #[test]
1026    fn token_spans_strip_paren_in_portion() {
1027        let parsed = parse_portion("(SECRET//NF)");
1028        let src = b"(SECRET//NF)";
1029        let cls = parsed
1030            .attrs
1031            .token_spans
1032            .iter()
1033            .find(|t| t.kind == TokenKind::Classification)
1034            .unwrap();
1035        // SECRET starts at byte 1 (after the open paren), runs to byte 7.
1036        assert_eq!(cls.span.start, 1);
1037        assert_eq!(cls.span.end, 7);
1038        assert_eq!(cls.span.as_str(src).unwrap(), "SECRET");
1039
1040        let dissem = parsed
1041            .attrs
1042            .token_spans
1043            .iter()
1044            .find(|t| t.kind == TokenKind::DissemControl)
1045            .unwrap();
1046        // NF starts at byte 9 (after `SECRET//`).
1047        assert_eq!(dissem.span.start, 9);
1048        assert_eq!(dissem.span.end, 11);
1049    }
1050
1051    #[test]
1052    fn token_spans_record_unknown_token() {
1053        let parsed = parse_banner("SECRET//XYZZY//NOFORN");
1054        let unknowns: Vec<&TokenSpan> = parsed
1055            .attrs
1056            .token_spans
1057            .iter()
1058            .filter(|t| t.kind == TokenKind::Unknown)
1059            .collect();
1060        assert_eq!(unknowns.len(), 1);
1061        assert_eq!(
1062            unknowns[0].span.as_str(b"SECRET//XYZZY//NOFORN").unwrap(),
1063            "XYZZY"
1064        );
1065    }
1066
1067    #[test]
1068    fn token_spans_record_rel_to_trigraphs() {
1069        let parsed = parse_banner("SECRET//REL TO USA, GBR, AUS");
1070        let trigraphs: Vec<&TokenSpan> = parsed
1071            .attrs
1072            .token_spans
1073            .iter()
1074            .filter(|t| t.kind == TokenKind::RelToTrigraph)
1075            .collect();
1076        assert_eq!(trigraphs.len(), 3);
1077        let src = b"SECRET//REL TO USA, GBR, AUS";
1078        assert_eq!(trigraphs[0].span.as_str(src).unwrap(), "USA");
1079        assert_eq!(trigraphs[1].span.as_str(src).unwrap(), "GBR");
1080        assert_eq!(trigraphs[2].span.as_str(src).unwrap(), "AUS");
1081    }
1082
1083    #[test]
1084    fn token_spans_record_separators() {
1085        let parsed = parse_banner("SECRET//NF");
1086        let seps: Vec<&TokenSpan> = parsed
1087            .attrs
1088            .token_spans
1089            .iter()
1090            .filter(|t| t.kind == TokenKind::Separator)
1091            .collect();
1092        assert_eq!(seps.len(), 1);
1093        let src = b"SECRET//NF";
1094        assert_eq!(seps[0].span.as_str(src).unwrap(), "//");
1095    }
1096
1097    // -----------------------------------------------------------------------
1098    // Non-US classification parsing
1099    // -----------------------------------------------------------------------
1100
1101    #[test]
1102    fn nato_banner_parses_all_variants() {
1103        for (input, expected) in [
1104            ("//NATO UNCLASSIFIED", NatoClassification::NatoUnclassified),
1105            ("//NATO RESTRICTED", NatoClassification::NatoRestricted),
1106            ("//NATO CONFIDENTIAL", NatoClassification::NatoConfidential),
1107            (
1108                "//NATO CONFIDENTIAL ATOMAL",
1109                NatoClassification::NatoConfidentialAtomal,
1110            ),
1111            ("//NATO SECRET", NatoClassification::NatoSecret),
1112            ("//NATO SECRET ATOMAL", NatoClassification::NatoSecretAtomal),
1113            ("//COSMIC TOP SECRET", NatoClassification::CosmicTopSecret),
1114            (
1115                "//COSMIC TOP SECRET ATOMAL",
1116                NatoClassification::CosmicTopSecretAtomal,
1117            ),
1118            (
1119                "//COSMIC TOP SECRET-BOHEMIA",
1120                NatoClassification::CosmicTopSecretBohemia,
1121            ),
1122            (
1123                "//COSMIC TOP SECRET-BALK",
1124                NatoClassification::CosmicTopSecretBalk,
1125            ),
1126        ] {
1127            let parsed = parse_banner(input);
1128            assert_eq!(
1129                parsed.attrs.classification,
1130                Some(MarkingClassification::Nato(expected)),
1131                "failed for banner: {input}"
1132            );
1133        }
1134    }
1135
1136    #[test]
1137    fn nato_portion_parses_all_variants() {
1138        for (input, expected) in [
1139            ("(//NU)", NatoClassification::NatoUnclassified),
1140            ("(//NR)", NatoClassification::NatoRestricted),
1141            ("(//NC)", NatoClassification::NatoConfidential),
1142            ("(//NCA)", NatoClassification::NatoConfidentialAtomal),
1143            ("(//NC-A)", NatoClassification::NatoConfidentialAtomal),
1144            ("(//NS)", NatoClassification::NatoSecret),
1145            ("(//NSAT)", NatoClassification::NatoSecretAtomal),
1146            ("(//NS-A)", NatoClassification::NatoSecretAtomal),
1147            ("(//CTS)", NatoClassification::CosmicTopSecret),
1148            ("(//CTSA)", NatoClassification::CosmicTopSecretAtomal),
1149            ("(//CTS-A)", NatoClassification::CosmicTopSecretAtomal),
1150            ("(//CTS-B)", NatoClassification::CosmicTopSecretBohemia),
1151            ("(//CTS-BALK)", NatoClassification::CosmicTopSecretBalk),
1152        ] {
1153            let parsed = parse_portion(input);
1154            assert_eq!(
1155                parsed.attrs.classification,
1156                Some(MarkingClassification::Nato(expected)),
1157                "failed for portion: {input}"
1158            );
1159        }
1160    }
1161
1162    #[test]
1163    fn nato_banner_with_rel_to() {
1164        let parsed = parse_banner("//NATO SECRET//REL TO USA, GBR");
1165        assert_eq!(
1166            parsed.attrs.classification,
1167            Some(MarkingClassification::Nato(NatoClassification::NatoSecret)),
1168        );
1169        assert_eq!(parsed.attrs.rel_to.len(), 2);
1170        assert_eq!(parsed.attrs.rel_to[0], Trigraph::USA);
1171    }
1172
1173    #[test]
1174    fn joint_banner_parses_correctly() {
1175        let parsed = parse_banner("//JOINT S USA GBR");
1176        match &parsed.attrs.classification {
1177            Some(MarkingClassification::Joint(j)) => {
1178                assert_eq!(j.level, Classification::Secret);
1179                assert_eq!(j.countries.len(), 2);
1180                assert_eq!(j.countries[0], Trigraph::USA);
1181                assert_eq!(j.countries[1].as_str(), "GBR");
1182            }
1183            other => panic!("expected Joint, got: {other:?}"),
1184        }
1185    }
1186
1187    #[test]
1188    fn joint_portion_with_rel_to() {
1189        let parsed = parse_portion("(//JOINT TS USA AUS GBR//REL TO USA, AUS, GBR)");
1190        match &parsed.attrs.classification {
1191            Some(MarkingClassification::Joint(j)) => {
1192                assert_eq!(j.level, Classification::TopSecret);
1193                assert_eq!(j.countries.len(), 3);
1194            }
1195            other => panic!("expected Joint, got: {other:?}"),
1196        }
1197        assert_eq!(parsed.attrs.rel_to.len(), 3);
1198    }
1199
1200    #[test]
1201    fn fgi_single_country_parses() {
1202        let parsed = parse_portion("(//GBR S//NF)");
1203        match &parsed.attrs.classification {
1204            Some(MarkingClassification::Fgi(f)) => {
1205                assert_eq!(f.level, Classification::Secret);
1206                assert_eq!(f.countries.len(), 1);
1207                assert_eq!(f.countries[0].as_str(), "GBR");
1208            }
1209            other => panic!("expected Fgi, got: {other:?}"),
1210        }
1211    }
1212
1213    #[test]
1214    fn fgi_multiple_countries_parses() {
1215        let parsed = parse_banner("//GBR DEU TS//NF");
1216        match &parsed.attrs.classification {
1217            Some(MarkingClassification::Fgi(f)) => {
1218                assert_eq!(f.level, Classification::TopSecret);
1219                assert_eq!(f.countries.len(), 2);
1220            }
1221            other => panic!("expected Fgi, got: {other:?}"),
1222        }
1223    }
1224
1225    #[test]
1226    fn fgi_placeholder_country_parses() {
1227        // FGI as placeholder for unknown country + level
1228        let parsed = parse_portion("(//FGI S//NF)");
1229        match &parsed.attrs.classification {
1230            Some(MarkingClassification::Fgi(f)) => {
1231                assert_eq!(f.level, Classification::Secret);
1232                assert!(
1233                    f.countries.is_empty(),
1234                    "FGI placeholder should have no countries"
1235                );
1236            }
1237            other => panic!("expected Fgi, got: {other:?}"),
1238        }
1239    }
1240
1241    #[test]
1242    fn fgi_no_level_is_error() {
1243        // //FGI// with no classification level — classification should be None
1244        let parsed = parse_banner("//FGI//NF");
1245        assert!(
1246            parsed.attrs.classification.is_none()
1247                || matches!(
1248                    parsed.attrs.classification,
1249                    Some(MarkingClassification::Us(_))
1250                ),
1251            "bare FGI with no level should not produce a valid non-US classification: {:?}",
1252            parsed.attrs.classification,
1253        );
1254    }
1255
1256    #[test]
1257    fn fgi_marker_in_us_marking() {
1258        let parsed = parse_banner("SECRET//FGI DEU//NOFORN");
1259        assert_eq!(
1260            parsed.attrs.classification,
1261            Some(MarkingClassification::Us(Classification::Secret)),
1262        );
1263        let marker = parsed
1264            .attrs
1265            .fgi_marker
1266            .as_ref()
1267            .expect("should have FGI marker");
1268        assert_eq!(marker.countries.len(), 1);
1269        assert_eq!(marker.countries[0].as_str(), "DEU");
1270    }
1271
1272    #[test]
1273    fn fgi_marker_no_countries() {
1274        let parsed = parse_banner("SECRET//FGI//NOFORN");
1275        assert_eq!(
1276            parsed.attrs.classification,
1277            Some(MarkingClassification::Us(Classification::Secret)),
1278        );
1279        let marker = parsed
1280            .attrs
1281            .fgi_marker
1282            .as_ref()
1283            .expect("should have FGI marker");
1284        assert!(marker.countries.is_empty());
1285    }
1286
1287    #[test]
1288    fn conflict_us_and_nato() {
1289        let parsed = parse_banner("SECRET//NATO SECRET//NOFORN");
1290        match &parsed.attrs.classification {
1291            Some(MarkingClassification::Conflict { us, foreign }) => {
1292                assert_eq!(*us, Classification::Secret);
1293                assert!(matches!(
1294                    foreign.as_ref(),
1295                    ForeignClassification::Nato(NatoClassification::NatoSecret)
1296                ));
1297            }
1298            other => panic!("expected Conflict, got: {other:?}"),
1299        }
1300    }
1301
1302    #[test]
1303    fn conflict_level_escalation() {
1304        // SECRET + COSMIC TOP SECRET → US escalates to TopSecret
1305        let parsed = parse_banner("SECRET//COSMIC TOP SECRET//NOFORN");
1306        match &parsed.attrs.classification {
1307            Some(MarkingClassification::Conflict { us, foreign }) => {
1308                assert_eq!(*us, Classification::TopSecret);
1309                assert!(matches!(
1310                    foreign.as_ref(),
1311                    ForeignClassification::Nato(NatoClassification::CosmicTopSecret)
1312                ));
1313            }
1314            other => panic!("expected Conflict with escalation, got: {other:?}"),
1315        }
1316    }
1317
1318    #[test]
1319    fn restricted_classification_parses() {
1320        let parsed = parse_banner("RESTRICTED//NF");
1321        assert_eq!(
1322            parsed.attrs.classification,
1323            Some(MarkingClassification::Us(Classification::Restricted)),
1324        );
1325    }
1326
1327    #[test]
1328    fn restricted_portion_parses() {
1329        let parsed = parse_portion("(R//NF)");
1330        assert_eq!(
1331            parsed.attrs.classification,
1332            Some(MarkingClassification::Us(Classification::Restricted)),
1333        );
1334    }
1335
1336    // -----------------------------------------------------------------------
1337    // Non-IC dissemination controls
1338    // -----------------------------------------------------------------------
1339
1340    #[test]
1341    fn non_ic_dissem_limdis_banner_form() {
1342        let parsed = parse_banner("UNCLASSIFIED//LIMDIS");
1343        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
1344        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Limdis,);
1345    }
1346
1347    #[test]
1348    fn non_ic_dissem_ds_portion_form() {
1349        let parsed = parse_portion("(U//DS)");
1350        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
1351        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Limdis);
1352    }
1353
1354    #[test]
1355    fn non_ic_dissem_les_nf() {
1356        let parsed = parse_portion("(U//LES-NF)");
1357        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
1358        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::LesNf);
1359        assert!(parsed.attrs.non_ic_dissem[0].carries_noforn());
1360    }
1361
1362    #[test]
1363    fn non_ic_dissem_sbu_nf_banner() {
1364        let parsed = parse_banner("UNCLASSIFIED//SBU NOFORN");
1365        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
1366        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::SbuNf);
1367    }
1368
1369    #[test]
1370    fn non_ic_dissem_not_confused_with_ic_dissem() {
1371        // SSI should be non-IC, not IC.
1372        let parsed = parse_portion("(U//SSI)");
1373        assert!(parsed.attrs.dissem_controls.is_empty());
1374        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
1375        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Ssi);
1376    }
1377
1378    #[test]
1379    fn non_ic_dissem_alongside_ic_dissem() {
1380        // Classified portion with both IC and non-IC dissem.
1381        let parsed = parse_portion("(C//NF//DS)");
1382        assert_eq!(parsed.attrs.dissem_controls.len(), 1); // NF
1383        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1); // DS = LIMDIS
1384    }
1385
1386    // -----------------------------------------------------------------------
1387    // Atomic Energy Act markings
1388    // -----------------------------------------------------------------------
1389
1390    #[test]
1391    fn aea_rd_parses() {
1392        let parsed = parse_banner("TOP SECRET//RD//NOFORN");
1393        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1394        assert_eq!(
1395            parsed.attrs.aea_markings[0],
1396            AeaMarking::Rd(marque_ism::RdBlock::default()),
1397        );
1398    }
1399
1400    #[test]
1401    fn aea_rd_cnwdi_compound() {
1402        // CNWDI is a hyphen-modifier of RD, not a separate // block.
1403        let parsed = parse_banner("SECRET//RD-CNWDI//NOFORN");
1404        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1405        match &parsed.attrs.aea_markings[0] {
1406            AeaMarking::Rd(rd) => {
1407                assert!(rd.cnwdi);
1408                assert!(rd.sigma.is_empty());
1409            }
1410            other => panic!("expected Rd with CNWDI, got: {other:?}"),
1411        }
1412    }
1413
1414    #[test]
1415    fn aea_rd_sigma_compound() {
1416        // SIGMA is a hyphen-modifier: RD-SIGMA 20
1417        let parsed = parse_banner("SECRET//RD-SIGMA 20//NOFORN");
1418        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1419        match &parsed.attrs.aea_markings[0] {
1420            AeaMarking::Rd(rd) => {
1421                assert!(!rd.cnwdi);
1422                assert_eq!(&*rd.sigma, &[20]);
1423            }
1424            other => panic!("expected Rd with SIGMA, got: {other:?}"),
1425        }
1426    }
1427
1428    #[test]
1429    fn aea_rd_cnwdi_sigma_compound() {
1430        let parsed = parse_banner("SECRET//RD-CNWDI-SIGMA 18 20//NOFORN");
1431        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1432        match &parsed.attrs.aea_markings[0] {
1433            AeaMarking::Rd(rd) => {
1434                assert!(rd.cnwdi);
1435                assert_eq!(&*rd.sigma, &[18, 20]);
1436            }
1437            other => panic!("expected Rd with CNWDI+SIGMA, got: {other:?}"),
1438        }
1439    }
1440
1441    #[test]
1442    fn aea_rd_sigma_portion() {
1443        // Portion form uses SG instead of SIGMA.
1444        let parsed = parse_portion("(TS//RD-SG 14//NF)");
1445        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1446        match &parsed.attrs.aea_markings[0] {
1447            AeaMarking::Rd(rd) => {
1448                assert_eq!(&*rd.sigma, &[14]);
1449            }
1450            other => panic!("expected Rd with SG, got: {other:?}"),
1451        }
1452    }
1453
1454    #[test]
1455    fn aea_frd_parses() {
1456        let parsed = parse_portion("(S//FRD//NF)");
1457        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1458        assert_eq!(
1459            parsed.attrs.aea_markings[0],
1460            AeaMarking::Frd(marque_ism::FrdBlock::default()),
1461        );
1462    }
1463
1464    #[test]
1465    fn aea_frd_sigma_compound() {
1466        let parsed = parse_banner("SECRET//FRD-SIGMA 14//NOFORN");
1467        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1468        match &parsed.attrs.aea_markings[0] {
1469            AeaMarking::Frd(frd) => {
1470                assert_eq!(&*frd.sigma, &[14]);
1471            }
1472            other => panic!("expected Frd with SIGMA, got: {other:?}"),
1473        }
1474    }
1475
1476    #[test]
1477    fn aea_dod_ucni_parses() {
1478        let parsed = parse_banner("UNCLASSIFIED//DOD UCNI");
1479        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1480        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::DodUcni);
1481    }
1482
1483    #[test]
1484    fn aea_dcni_portion_parses() {
1485        let parsed = parse_portion("(U//DCNI)");
1486        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1487        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::DodUcni);
1488    }
1489
1490    #[test]
1491    fn aea_tfni_parses() {
1492        let parsed = parse_banner("SECRET//TFNI//NOFORN");
1493        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1494        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::Tfni);
1495    }
1496
1497    #[test]
1498    fn aea_rd_n_shorthand() {
1499        // DoD shorthand: RD-N means RD-CNWDI
1500        let parsed = parse_portion("(S//RD-N//NF)");
1501        assert_eq!(parsed.attrs.aea_markings.len(), 1);
1502        match &parsed.attrs.aea_markings[0] {
1503            AeaMarking::Rd(rd) => assert!(rd.cnwdi),
1504            other => panic!("expected Rd with CNWDI from RD-N, got: {other:?}"),
1505        }
1506    }
1507
1508    // --- CAPCO §D.1 intra-block `/` separator ---
1509
1510    #[test]
1511    fn slash_separated_sci_in_single_block_parses() {
1512        // CAPCO §D.1: multiple SCI controls in one block, `/`-separated.
1513        // "(TS//SI/TK//NF)" must produce sci_controls: [Si, Tk], NOT Unknown.
1514        use marque_ism::SciControl;
1515        let parsed = parse_portion("(TS//SI/TK//NF)");
1516        assert_eq!(
1517            parsed.attrs.sci_controls.as_ref(),
1518            &[SciControl::Si, SciControl::Tk],
1519            "SI/TK block must yield two SCI controls"
1520        );
1521        // No Unknown token spans
1522        assert!(
1523            parsed
1524                .attrs
1525                .token_spans
1526                .iter()
1527                .all(|t| t.kind != TokenKind::Unknown),
1528            "no Unknown spans expected: {:?}",
1529            parsed.attrs.token_spans
1530        );
1531    }
1532
1533    #[test]
1534    fn slash_separated_sci_banner_parses() {
1535        // Same rule applies to banner markings.
1536        use marque_ism::SciControl;
1537        let parsed = parse_banner("TOP SECRET//SI/TK//NOFORN");
1538        assert_eq!(
1539            parsed.attrs.sci_controls.as_ref(),
1540            &[SciControl::Si, SciControl::Tk],
1541        );
1542    }
1543
1544    #[test]
1545    fn slash_separated_dissem_in_single_block_parses() {
1546        // Dissem controls can also share a block: "NF/RD" in one // block.
1547        use marque_ism::DissemControl;
1548        let parsed = parse_banner("SECRET//SI//NF/RELIDO");
1549        let dissem: Vec<DissemControl> = parsed.attrs.dissem_controls.to_vec();
1550        assert!(dissem.contains(&DissemControl::Nf), "must contain NF");
1551        assert!(
1552            dissem.contains(&DissemControl::Relido),
1553            "must contain RELIDO"
1554        );
1555    }
1556
1557    #[test]
1558    fn unrecognized_slash_token_emits_unknown() {
1559        // An unknown token like "XYZZY" in a slash block → Unknown span.
1560        let parsed = parse_portion("(S//XYZZY)");
1561        assert!(
1562            parsed
1563                .attrs
1564                .token_spans
1565                .iter()
1566                .any(|t| t.kind == TokenKind::Unknown),
1567            "XYZZY must produce Unknown span"
1568        );
1569    }
1570}
marque_core/parser.rs

marque_core/
parser.rs