marque_core/
parser.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Phase 2/3: token extraction and structural parsing.
6//!
7//! Takes [`MarkingCandidate`] spans from the scanner and produces [`IsmAttributes`].
8//!
9//! # Phase 2 — Token Extraction
10//! A compile-time Aho-Corasick automaton (built from CVE token list in marque-capco)
11//! runs over each candidate span, identifying known tokens and their positions.
12//! Unrecognized tokens within a candidate boundary are themselves diagnostics.
13//!
14//! # Phase 3 — Structural Parsing
15//! Token sequence → IsmAttributes. Validates ordering and block structure.
16//! Produces `ParseError` for structural violations; these feed into the rule engine
17//! as diagnostics with associated fixes.
18//!
19//! Note: the Aho-Corasick automaton is injected via `TokenSet` to keep marque-core
20//! free of a direct dependency on marque-capco's generated data.
21
22use crate::error::CoreError;
23use marque_ism::attrs::{
24    AeaMarking, Classification, CountryCode, DeclassExemption, DissemControl, FgiClassification,
25    FgiMarker, ForeignClassification, IsmAttributes, JointClassification, MarkingClassification,
26    NatoClassification, NonIcDissem, SarCompartment, SarIndicator, SarMarking, SarProgram,
27    SciCompartment, SciControl, SciControlBare, SciControlSystem, SciMarking, TokenKind, TokenSpan,
28};
29use marque_ism::date::IsmDate;
30use marque_ism::is_bare_cve_value;
31use marque_ism::span::{MarkingCandidate, MarkingType, Span};
32use marque_ism::token_set::TokenSet;
33use std::str::FromStr;
34
/// Parse result for a single candidate.
///
/// Produced by [`Parser::parse`]; bundles the extracted attributes with the
/// span and kind of the scanner candidate they were parsed from.
#[derive(Debug)]
pub struct ParsedMarking {
    /// Attributes extracted from the candidate text.
    pub attrs: IsmAttributes,
    /// Span of the candidate this result was parsed from (copied from
    /// `MarkingCandidate::span`, i.e. it still includes any outer
    /// parentheses or surrounding whitespace of the candidate).
    pub source_span: Span,
    /// Which kind of marking the candidate was parsed as.
    pub kind: MarkingType,
}
42
/// Phase 2+3 parser. Stateless; call [`Parser::parse`] per candidate.
///
/// The parser only borrows its [`TokenSet`]; it holds no per-candidate state,
/// so one instance can be reused for every candidate in a document.
pub struct Parser<'t> {
    /// Injected token set, borrowed for the parser's lifetime `'t`.
    tokens: &'t dyn TokenSet,
}
47
48impl<'t> Parser<'t> {
49    pub fn new(tokens: &'t dyn TokenSet) -> Self {
50        Self { tokens }
51    }
52
53    /// Parse a single scanner candidate into [`IsmAttributes`].
54    pub fn parse(
55        &self,
56        candidate: &MarkingCandidate,
57        source: &[u8],
58    ) -> Result<ParsedMarking, CoreError> {
59        let text = candidate
60            .span
61            .as_str(source)
62            .map_err(|_| CoreError::InvalidUtf8(candidate.span))?;
63        match candidate.kind {
64            MarkingType::Portion => self.parse_portion(text, candidate),
65            MarkingType::Banner => self.parse_banner(text, candidate),
66            MarkingType::Cab => self.parse_cab(text, candidate),
67            // PageBreak candidates are scanner-emitted boundaries with no
68            // parsable content. Engine::lint filters them out before calling
69            // `parse`; reaching this arm is a programming error in the
70            // pipeline, so a `MalformedMarking` is the right surface.
71            MarkingType::PageBreak => Err(CoreError::MalformedMarking(
72                "page-break candidate must not be parsed".to_owned(),
73            )),
74        }
75    }
76
77    fn parse_portion(
78        &self,
79        text: &str,
80        candidate: &MarkingCandidate,
81    ) -> Result<ParsedMarking, CoreError> {
82        // Strip outer parentheses: "(TS//SI//NF)" -> "TS//SI//NF"
83        // The inner-string offset is `candidate.span.start + 1` because
84        // the leading `(` is one byte (verified ASCII by the scanner).
85        let inner = text
86            .strip_prefix('(')
87            .and_then(|s| s.strip_suffix(')'))
88            .ok_or_else(|| CoreError::MalformedMarking(text.to_owned()))?;
89
90        let attrs =
91            self.parse_marking_string(inner, MarkingType::Portion, candidate.span.start + 1)?;
92        Ok(ParsedMarking {
93            attrs,
94            source_span: candidate.span,
95            kind: MarkingType::Portion,
96        })
97    }
98
99    fn parse_banner(
100        &self,
101        text: &str,
102        candidate: &MarkingCandidate,
103    ) -> Result<ParsedMarking, CoreError> {
104        // For banner candidates, `text` is the full line bytes from the
105        // scanner. `text.trim()` may consume leading whitespace, which
106        // shifts the per-token offsets. Compute the leading whitespace
107        // length so we can add it to candidate.span.start.
108        let trimmed = text.trim_start();
109        let lead_ws = text.len() - trimmed.len();
110        let trimmed = trimmed.trim_end();
111        let attrs = self.parse_marking_string(
112            trimmed,
113            MarkingType::Banner,
114            candidate.span.start + lead_ws,
115        )?;
116        Ok(ParsedMarking {
117            attrs,
118            source_span: candidate.span,
119            kind: MarkingType::Banner,
120        })
121    }
122
123    fn parse_cab(
124        &self,
125        text: &str,
126        candidate: &MarkingCandidate,
127    ) -> Result<ParsedMarking, CoreError> {
128        // CAB is line-structured: "Classified By: ...\nDerived From: ...\nDeclassify On: ..."
129        let mut attrs = IsmAttributes::default();
130
131        for line in text.lines() {
132            if let Some(val) = line.strip_prefix("Classified By:") {
133                attrs.classified_by = Some(val.trim().into());
134            } else if let Some(val) = line.strip_prefix("Derived From:") {
135                attrs.derived_from = Some(val.trim().into());
136            } else if let Some(val) = line.strip_prefix("Declassify On:") {
137                let s = val.trim();
138                if let Some(exemption) = DeclassExemption::parse(s) {
139                    attrs.declass_exemption = Some(exemption);
140                } else {
141                    // Attempt to parse as a typed IsmDate (YYYY, YYYYMMDD,
142                    // YYYY-MM-DD, etc.). Unrecognized strings are silently
143                    // dropped rather than stored as raw text, since the field
144                    // is now typed.
145                    attrs.declassify_on = IsmDate::from_str(s).ok();
146                }
147            }
148        }
149
150        Ok(ParsedMarking {
151            attrs,
152            source_span: candidate.span,
153            kind: MarkingType::Cab,
154        })
155    }
156
157    /// Parse a marking string (without outer parentheses) into IsmAttributes.
158    /// Handles both portion form (abbreviated) and banner form (full words).
159    ///
160    /// `s_offset` is the absolute byte offset of `s` within the original
161    /// source buffer. Phase 3 uses it to record per-token absolute spans on
162    /// `IsmAttributes::token_spans` so rules can point at byte-precise
163    /// diagnostic locations.
164    fn parse_marking_string(
165        &self,
166        s: &str,
167        context: MarkingType,
168        s_offset: usize,
169    ) -> Result<IsmAttributes, CoreError> {
170        let mut attrs = IsmAttributes::default();
171
172        if s.is_empty() {
173            return Err(CoreError::MalformedMarking(s.to_owned()));
174        }
175
176        // Walk separator (`//`) positions inside `s`. Each block is the
177        // substring between consecutive separators (or string ends). Track
178        // both the block content and its inner offset so we can compute
179        // per-token absolute spans.
180        let separators: Vec<usize> = s.match_indices("//").map(|(i, _)| i).collect();
181        let mut block_ranges: Vec<(usize, usize)> = Vec::with_capacity(separators.len() + 1);
182        let mut prev_end = 0usize;
183        for &sep_start in &separators {
184            block_ranges.push((prev_end, sep_start));
185            prev_end = sep_start + 2; // skip the `//`
186        }
187        block_ranges.push((prev_end, s.len()));
188
189        let mut token_spans: Vec<TokenSpan> = Vec::new();
190
191        let mut sci: Vec<SciControl> = Vec::new();
192        let mut sci_markings: Vec<SciMarking> = Vec::new();
193        // SAR: P2 wires the hand-written subparser. Only the FIRST SAR block
194        // encountered populates `attrs.sar_markings`; any subsequent SAR block
195        // is emitted as `TokenKind::Unknown` so rule E030 (indicator-repeat)
196        // can flag the duplicate.
197        let mut sar_captured = false;
198        let mut aea: Vec<AeaMarking> = Vec::new();
199        let mut dissem: Vec<DissemControl> = Vec::new();
200        let mut non_ic: Vec<NonIcDissem> = Vec::new();
201        let mut rel_to: Vec<CountryCode> = Vec::new();
202
203        // When the marking starts with `//`, block 0 is empty and the
204        // classification is non-US (FGI, NATO, or JOINT). Block 1 carries
205        // the foreign classification.
206        let is_non_us = s.starts_with("//");
207
208        for (idx, &(rel_start, rel_end)) in block_ranges.iter().enumerate() {
209            let raw = &s[rel_start..rel_end];
210            let trimmed = raw.trim();
211            if trimmed.is_empty() {
212                continue;
213            }
214            let trim_lead = raw.len() - raw.trim_start().len();
215            let abs_start = s_offset + rel_start + trim_lead;
216            let abs_end = abs_start + trimmed.len();
217            let span = Span::new(abs_start, abs_end);
218
219            // ---------------------------------------------------------------
220            // Block 0: US classification (or empty for non-US markings)
221            // ---------------------------------------------------------------
222            if idx == 0 && !is_non_us {
223                attrs.classification = parse_classification(trimmed).map(MarkingClassification::Us);
224                token_spans.push(TokenSpan {
225                    kind: TokenKind::Classification,
226                    span,
227                    text: trimmed.into(),
228                });
229                continue;
230            }
231
232            // ---------------------------------------------------------------
233            // Block 1 when non-US: foreign classification
234            // ---------------------------------------------------------------
235            if idx == 1 && is_non_us {
236                if let Some(nato) = parse_nato_classification(trimmed) {
237                    attrs.classification = Some(MarkingClassification::Nato(nato));
238                } else if let Some(joint) = parse_joint_classification(trimmed) {
239                    attrs.classification = Some(MarkingClassification::Joint(joint));
240                } else if let Some(fgi) = parse_fgi_classification(trimmed) {
241                    attrs.classification = Some(MarkingClassification::Fgi(fgi));
242                } else {
243                    // Unrecognized non-US classification block.
244                    token_spans.push(TokenSpan {
245                        kind: TokenKind::Unknown,
246                        span,
247                        text: trimmed.into(),
248                    });
249                    continue;
250                }
251                token_spans.push(TokenSpan {
252                    kind: TokenKind::Classification,
253                    span,
254                    text: trimmed.into(),
255                });
256                continue;
257            }
258
259            // ---------------------------------------------------------------
260            // Remaining blocks: controls, markers, and fallbacks
261            // ---------------------------------------------------------------
262
263            // SAR category block (must precede the other branches because a
264            // SAR block such as `SAR-BP-J12/CD` contains `/` and would be
265            // misrouted to the multi-slash fallback). §H.5 / §A.6.
266            if trimmed.starts_with("SAR-") || trimmed.starts_with("SPECIAL ACCESS REQUIRED-") {
267                if sar_captured {
268                    // Second (or later) SAR block in this marking. Leave the
269                    // whole block as Unknown so E030 (sar-indicator-repeat)
270                    // can surface it in P3.
271                    token_spans.push(TokenSpan {
272                        kind: TokenKind::Unknown,
273                        span,
274                        text: trimmed.into(),
275                    });
276                    continue;
277                }
278                if let Some((marking, sar_spans)) = parse_sar_category(trimmed, abs_start) {
279                    attrs.sar_markings = Some(marking);
280                    token_spans.extend(sar_spans);
281                    sar_captured = true;
282                    continue;
283                }
284                // Grammar rejection (e.g., `SAR-` with nothing after): fall
285                // through to the normal Unknown handling below.
286                token_spans.push(TokenSpan {
287                    kind: TokenKind::Unknown,
288                    span,
289                    text: trimmed.into(),
290                });
291                continue;
292            }
293
294            if trimmed.starts_with("REL TO") || trimmed.starts_with("REL ") {
295                // Record the full block text before the individual trigraph tokens
296                // so token_spans maintains a logical ordering (block → constituents).
297                token_spans.push(TokenSpan {
298                    kind: TokenKind::RelToBlock,
299                    span,
300                    text: trimmed.into(),
301                });
302                let parsed =
303                    parse_rel_to_with_spans(trimmed, abs_start, self.tokens, &mut token_spans);
304                rel_to.extend(parsed.countries);
305                dissem.extend(parsed.trailing_dissem);
306                non_ic.extend(parsed.trailing_non_ic);
307            } else if (trimmed.contains('-')
308                || trimmed.contains('/')
309                || is_bare_cve_value(trimmed)
310                // Standalone custom SCI control (e.g., `99` in §A.6 p16).
311                // Require at least one digit so pure-alpha tokens (which
312                // are far more likely to be typos, other-category markers
313                // like `FGI`, or scanner-test garbage like `XYZZY`) keep
314                // falling through to Unknown. Declass dates (4/8 digit)
315                // and known non-SCI markers are also excluded.
316                || (is_valid_custom_control(trimmed)
317                    && trimmed.bytes().any(|b| b.is_ascii_digit())
318                    && !is_known_non_sci_token(trimmed)
319                    && !is_declass_date(trimmed)))
320                && let Some(markings) = parse_sci_block(trimmed, abs_start, &mut token_spans)
321            {
322                // Structural SCI path (spec 003-sci-compartments §R2). Runs
323                // before the exact-match path so compound/sub-compartment
324                // forms like `SI-G ABCD` and `123/SI-G ABCD DEFG-MMM AACD`
325                // are recognized. Projects canonical enum values into
326                // `sci_controls` for back-compat with rules that read the
327                // flat enum view (E010, E011).
328                for marking in &markings {
329                    if let Some(ctrl) = marking.canonical_enum {
330                        sci.push(ctrl);
331                    }
332                }
333                sci_markings.extend(markings);
334            } else if let Some(ctrl) = SciControl::parse(trimmed) {
335                sci.push(ctrl);
336                token_spans.push(TokenSpan {
337                    kind: TokenKind::SciControl,
338                    span,
339                    text: trimmed.into(),
340                });
341            } else if trimmed.starts_with("FGI")
342                && matches!(attrs.classification, Some(MarkingClassification::Us(_)))
343            {
344                // FGI marker in a US-classified marking (e.g., SECRET//FGI DEU//NF).
345                if let Some(marker) = parse_fgi_marker(trimmed) {
346                    attrs.fgi_marker = Some(marker);
347                    token_spans.push(TokenSpan {
348                        kind: TokenKind::FgiMarker,
349                        span,
350                        text: trimmed.into(),
351                    });
352                }
353            } else if let Some(ctrl) =
354                DissemControl::parse(trimmed).or_else(|| parse_dissem_full_form(trimmed))
355            {
356                dissem.push(ctrl);
357                token_spans.push(TokenSpan {
358                    kind: TokenKind::DissemControl,
359                    span,
360                    text: trimmed.into(),
361                });
362            } else if let Some(nic) = parse_non_ic_full_form(trimmed) {
363                non_ic.push(nic);
364                token_spans.push(TokenSpan {
365                    kind: TokenKind::NonIcDissem,
366                    span,
367                    text: trimmed.into(),
368                });
369            } else if let Some(aea_marking) = AeaMarking::parse(trimmed) {
370                aea.push(aea_marking);
371                token_spans.push(TokenSpan {
372                    kind: TokenKind::AeaMarking,
373                    span,
374                    text: trimmed.into(),
375                });
376            } else if let Some(exemption) = DeclassExemption::parse(trimmed) {
377                attrs.declass_exemption = Some(exemption);
378                token_spans.push(TokenSpan {
379                    kind: TokenKind::DeclassExemption,
380                    span,
381                    text: trimmed.into(),
382                });
383            } else if is_declass_date(trimmed) {
384                attrs.declassify_on = IsmDate::from_str(trimmed).ok();
385                token_spans.push(TokenSpan {
386                    kind: TokenKind::DeclassDate,
387                    span,
388                    text: trimmed.into(),
389                });
390            } else if let Some(foreign) = try_parse_foreign_classification(trimmed) {
391                // Conflict: a foreign classification in a marking that already
392                // has a US classification. US wins at the greater of the two.
393                if let Some(MarkingClassification::Us(us_level)) = attrs.classification {
394                    let foreign_equiv = match &foreign {
395                        ForeignClassification::Nato(n) => n.us_equivalent(),
396                        ForeignClassification::Fgi(f) => f.level,
397                        ForeignClassification::Joint(j) => j.level,
398                    };
399                    let max_level = us_level.max(foreign_equiv);
400                    attrs.classification = Some(MarkingClassification::Conflict {
401                        us: max_level,
402                        foreign: Box::new(foreign),
403                    });
404                    token_spans.push(TokenSpan {
405                        kind: TokenKind::Classification,
406                        span,
407                        text: trimmed.into(),
408                    });
409                } else {
410                    // No prior US classification — just Unknown.
411                    token_spans.push(TokenSpan {
412                        kind: TokenKind::Unknown,
413                        span,
414                        text: trimmed.into(),
415                    });
416                }
417            } else if trimmed.contains('/') && !trimmed.starts_with("REL") {
418                // Multi-token block per CAPCO §D.1: multiple entries within a
419                // **single category** are separated by `/` (e.g., "SI/TK", "NF/RD").
420                // First, speculatively parse all sub-tokens. If all recognized sub-tokens
421                // belong to the same category, commit them. If categories are mixed
422                // (e.g., "SI/NF" — SCI + dissem in one block), the `/` is a stray
423                // separator that should have been `//`; emit the whole block as Unknown
424                // so E004 can detect and fix the missing `//`.
425
426                #[derive(Clone, Copy, PartialEq, Eq)]
427                enum SubKind {
428                    Sci,
429                    Dissem,
430                    NonIc,
431                    Aea,
432                    Unknown,
433                }
434
435                struct SubResult<'a> {
436                    kind: SubKind,
437                    tok: &'a str,
438                    span: Span,
439                    // Parsed values — stored here before committing.
440                    sci: Option<SciControl>,
441                    dissem: Option<DissemControl>,
442                    nic: Option<NonIcDissem>,
443                    aea: Option<AeaMarking>,
444                }
445
446                let mut results: Vec<SubResult<'_>> = Vec::new();
447                for (sub_off, sub_tok) in split_slash_with_offsets(trimmed) {
448                    let sub_abs_start = abs_start + sub_off;
449                    let sub_span = Span::new(sub_abs_start, sub_abs_start + sub_tok.len());
450                    if let Some(ctrl) = SciControl::parse(sub_tok) {
451                        results.push(SubResult {
452                            kind: SubKind::Sci,
453                            tok: sub_tok,
454                            span: sub_span,
455                            sci: Some(ctrl),
456                            dissem: None,
457                            nic: None,
458                            aea: None,
459                        });
460                    } else if let Some(ctrl) =
461                        DissemControl::parse(sub_tok).or_else(|| parse_dissem_full_form(sub_tok))
462                    {
463                        results.push(SubResult {
464                            kind: SubKind::Dissem,
465                            tok: sub_tok,
466                            span: sub_span,
467                            sci: None,
468                            dissem: Some(ctrl),
469                            nic: None,
470                            aea: None,
471                        });
472                    } else if let Some(nic) = parse_non_ic_full_form(sub_tok) {
473                        results.push(SubResult {
474                            kind: SubKind::NonIc,
475                            tok: sub_tok,
476                            span: sub_span,
477                            sci: None,
478                            dissem: None,
479                            nic: Some(nic),
480                            aea: None,
481                        });
482                    } else if let Some(aea_marking) = AeaMarking::parse(sub_tok) {
483                        results.push(SubResult {
484                            kind: SubKind::Aea,
485                            tok: sub_tok,
486                            span: sub_span,
487                            sci: None,
488                            dissem: None,
489                            nic: None,
490                            aea: Some(aea_marking),
491                        });
492                    } else {
493                        results.push(SubResult {
494                            kind: SubKind::Unknown,
495                            tok: sub_tok,
496                            span: sub_span,
497                            sci: None,
498                            dissem: None,
499                            nic: None,
500                            aea: None,
501                        });
502                    }
503                }
504
505                // Check category consistency: all parsed (non-Unknown) sub-tokens
506                // must share the same category for `/` to be a valid intra-block
507                // separator. Mixed categories (e.g., SCI + dissem) mean the `/`
508                // is a stray single-slash separator that should have been `//`.
509                let first_parsed_kind = results
510                    .iter()
511                    .find(|r| r.kind != SubKind::Unknown)
512                    .map(|r| r.kind);
513                let all_same_category = first_parsed_kind.is_some_and(|first| {
514                    results
515                        .iter()
516                        .filter(|r| r.kind != SubKind::Unknown)
517                        .all(|r| r.kind == first)
518                });
519
520                if first_parsed_kind.is_some() && !all_same_category {
521                    // Mixed categories: the `/` is a stray separator.
522                    // Emit the whole block as Unknown so E004 can detect it.
523                    token_spans.push(TokenSpan {
524                        kind: TokenKind::Unknown,
525                        span,
526                        text: trimmed.into(),
527                    });
528                } else {
529                    // Same category (or all unknown): commit sub-token results.
530                    for r in results {
531                        match r.kind {
532                            SubKind::Sci => {
533                                sci.push(r.sci.unwrap());
534                                token_spans.push(TokenSpan {
535                                    kind: TokenKind::SciControl,
536                                    span: r.span,
537                                    text: r.tok.into(),
538                                });
539                            }
540                            SubKind::Dissem => {
541                                dissem.push(r.dissem.unwrap());
542                                token_spans.push(TokenSpan {
543                                    kind: TokenKind::DissemControl,
544                                    span: r.span,
545                                    text: r.tok.into(),
546                                });
547                            }
548                            SubKind::NonIc => {
549                                non_ic.push(r.nic.unwrap());
550                                token_spans.push(TokenSpan {
551                                    kind: TokenKind::NonIcDissem,
552                                    span: r.span,
553                                    text: r.tok.into(),
554                                });
555                            }
556                            SubKind::Aea => {
557                                aea.push(r.aea.unwrap());
558                                token_spans.push(TokenSpan {
559                                    kind: TokenKind::AeaMarking,
560                                    span: r.span,
561                                    text: r.tok.into(),
562                                });
563                            }
564                            SubKind::Unknown => {
565                                // Unrecognized sub-token within a same-category block.
566                                // E008 fires one diagnostic per Unknown span.
567                                token_spans.push(TokenSpan {
568                                    kind: TokenKind::Unknown,
569                                    span: r.span,
570                                    text: r.tok.into(),
571                                });
572                            }
573                        }
574                    }
575                }
576            } else {
577                token_spans.push(TokenSpan {
578                    kind: TokenKind::Unknown,
579                    span,
580                    text: trimmed.into(),
581                });
582            }
583        }
584
585        attrs.sci_controls = sci.into_boxed_slice();
586        attrs.sci_markings = sci_markings.into_boxed_slice();
587        // `attrs.sar_markings` is populated inline by the SAR branch above
588        // when the first SAR category block is encountered; otherwise it
589        // defaults to `None` from `IsmAttributes::default()`. `sar_captured`
590        // is read in that branch to gate duplicate-block detection.
591        attrs.aea_markings = aea.into_boxed_slice();
592        attrs.dissem_controls = dissem.into_boxed_slice();
593        attrs.non_ic_dissem = non_ic.into_boxed_slice();
594        attrs.rel_to = rel_to.into_boxed_slice();
595        // Record separator spans (Phase 3 needs them for E004). Push them
596        // here alongside block tokens, then sort by start offset so the
597        // final slice is in document (source) order.
598        for &sep_start in &separators {
599            token_spans.push(TokenSpan {
600                kind: TokenKind::Separator,
601                span: Span::new(s_offset + sep_start, s_offset + sep_start + 2),
602                text: "//".into(),
603            });
604        }
605        token_spans.sort_unstable_by_key(|ts| ts.span.start);
606        attrs.token_spans = token_spans.into_boxed_slice();
607
608        let _ = context; // used for future context-aware validation
609
610        Ok(attrs)
611    }
612}
613
614/// Parse a classification string in either portion form (`"TS"`, `"S"`, `"C"`,
615/// `"R"`, `"U"`) or banner form (`"TOP SECRET"`, `"SECRET"`, ...).
616///
617/// Includes RESTRICTED/R for foreign-origin markings (between U and C).
618///
619/// Note: `Classification` is hand-written in `marque-ism::attrs` rather than
620/// generated from the CVE because the CVE only ships single-letter abbreviations
621/// and the tool needs both forms. Other CVE-derived enums (`SciControl`,
622/// `DissemControl`, `DeclassExemption`) go through their generated `parse()`
623/// methods. SAR is structural (not CVE-backed) and handled separately.
624fn parse_classification(s: &str) -> Option<Classification> {
625    match s {
626        "TS" | "TOP SECRET" => Some(Classification::TopSecret),
627        "S" | "SECRET" => Some(Classification::Secret),
628        "C" | "CONFIDENTIAL" => Some(Classification::Confidential),
629        "R" | "RESTRICTED" => Some(Classification::Restricted),
630        "U" | "UNCLASSIFIED" => Some(Classification::Unclassified),
631        _ => None,
632    }
633}
634
635/// Structural subparser for the SCI category block per CAPCO-2016 §A.6.
636///
637/// Grammar (spec 003-sci-compartments §R2):
638///
639/// ```text
640/// SCI_BLOCK      := SCI_SYSTEM ("/" SCI_SYSTEM)*
641/// SCI_SYSTEM     := CONTROL (-COMPARTMENT)*
642/// CONTROL        := BARE_CONTROL | CUSTOM_CONTROL
643/// BARE_CONTROL   := any bare CVE value (via is_bare_cve_value)
644/// CUSTOM_CONTROL := [A-Z0-9]{2,5} (not matching a BARE_CONTROL)
645/// COMPARTMENT    := COMP_ID (SPACE SUB_COMP)*
646/// COMP_ID        := [A-Z0-9]+
647/// SUB_COMP       := [A-Z0-9]+
648/// ```
649///
650/// Returns `Some(markings)` on successful structural parse, `None` on any
651/// grammar violation (dangling hyphens, leading hyphens, lowercase,
652/// empty compartments, invalid custom shape). On `None`, the caller falls
653/// back to the existing `SciControl::parse` exact-match path.
654///
655/// `canonical_enum` is populated via `format!("{ctrl}-{first_comp}").parse::<SciControl>()`
656/// ONLY when the matching compartment has no sub-compartments — sub-comps
657/// imply the compound is a structural anchor, not an atomic CVE value.
658///
659/// On success, emits TokenSpan entries (SciSystem / SciCompartment /
660/// SciSubCompartment) at byte-precise offsets relative to `base`.
661fn parse_sci_block(
662    text: &str,
663    base: usize,
664    tokens: &mut Vec<TokenSpan>,
665) -> Option<Vec<SciMarking>> {
666    if text.is_empty() {
667        return None;
668    }
669
670    // Buffer tokens into a local vec so we can discard them if any system
671    // fails to parse (all-or-nothing success semantics per spec).
672    let mut local_tokens: Vec<TokenSpan> = Vec::new();
673    let mut markings: Vec<SciMarking> = Vec::new();
674
675    // Split on `/` into per-system chunks, tracking byte offsets so each
676    // TokenSpan's `span` is accurate relative to the original source.
677    let mut chunk_start = 0usize;
678    let chunks: Vec<(usize, &str)> = {
679        let mut v = Vec::new();
680        for (i, ch) in text.char_indices() {
681            if ch == '/' {
682                v.push((chunk_start, &text[chunk_start..i]));
683                chunk_start = i + 1;
684            }
685        }
686        v.push((chunk_start, &text[chunk_start..]));
687        v
688    };
689
690    for (chunk_off, chunk) in chunks {
691        // No trim — grammar is strict; whitespace inside a chunk is
692        // meaningful only between sub-compartments (see below).
693        if chunk.is_empty() {
694            return None;
695        }
696        // Leading hyphen rejects immediately (e.g., `-SI`).
697        if chunk.starts_with('-') {
698            return None;
699        }
700
701        // Split chunk on first `-` into (control, rest). If no `-`, the
702        // whole chunk is the control with no compartments.
703        let (ctrl_str, rest_opt) = match chunk.find('-') {
704            Some(i) => (&chunk[..i], Some(&chunk[i + 1..])),
705            None => (chunk, None),
706        };
707
708        if ctrl_str.is_empty() {
709            return None;
710        }
711
712        // Recognize control: bare CVE first, then custom [A-Z0-9]{2,5}.
713        // A custom control must not collide with any other known category
714        // (Dissem / NonIcDissem / Sar / Aea / DeclassExemption) — otherwise
715        // a block like `SI/NF` would be mis-claimed as SCI instead of
716        // flagged as a stray `/` by E004.
717        let system: SciControlSystem = if let Some(bare) = SciControlBare::parse(ctrl_str) {
718            SciControlSystem::Published(bare)
719        } else if is_valid_custom_control(ctrl_str) && !is_known_non_sci_token(ctrl_str) {
720            SciControlSystem::Custom(ctrl_str.into())
721        } else {
722            return None;
723        };
724
725        // Emit a block-level SciControl span covering the full system
726        // chunk (control + compartments + sub-compartments), mirroring the
727        // existing exact-match path so rule consumers (E010, E011, and
728        // audit tooling that reads TokenKind::SciControl) continue to see
729        // one span per marking. The granular SciSystem/SciCompartment/
730        // SciSubCompartment spans below provide finer-grained structure
731        // for spec 003 rules (E032–E035).
732        let chunk_abs = base + chunk_off;
733        local_tokens.push(TokenSpan {
734            kind: TokenKind::SciControl,
735            span: Span::new(chunk_abs, chunk_abs + chunk.len()),
736            text: chunk.into(),
737        });
738        // Emit SciSystem token for the control identifier itself.
739        let ctrl_abs = base + chunk_off;
740        local_tokens.push(TokenSpan {
741            kind: TokenKind::SciSystem,
742            span: Span::new(ctrl_abs, ctrl_abs + ctrl_str.len()),
743            text: ctrl_str.into(),
744        });
745
746        // Parse compartments. `rest` is the substring after the first `-`.
747        // Each additional compartment is preceded by another `-`, and
748        // sub-compartments within a compartment are space-separated.
749        let mut compartments: Vec<SciCompartment> = Vec::new();
750        if let Some(rest) = rest_opt {
751            // Split `rest` on `-` into compartment segments. Strict grammar:
752            // empty segment (trailing or consecutive hyphen) → reject.
753            let rest_abs_base = base + chunk_off + ctrl_str.len() + 1; // +1 skips the `-`
754            let mut seg_start = 0usize;
755            let mut seg_offs: Vec<(usize, &str)> = Vec::new();
756            for (i, ch) in rest.char_indices() {
757                if ch == '-' {
758                    seg_offs.push((seg_start, &rest[seg_start..i]));
759                    seg_start = i + 1;
760                }
761            }
762            seg_offs.push((seg_start, &rest[seg_start..]));
763
764            for (seg_off, seg) in seg_offs {
765                if seg.is_empty() {
766                    return None; // dangling `-` or consecutive `--`
767                }
768                // Each compartment segment = COMP_ID (SPACE SUB_COMP)*
769                // Split on space.
770                let mut parts = seg.split(' ');
771                let comp_id = parts.next().unwrap(); // at least one part
772                if comp_id.is_empty() || !is_alnum_upper(comp_id) {
773                    return None;
774                }
775
776                let comp_abs = rest_abs_base + seg_off;
777                local_tokens.push(TokenSpan {
778                    kind: TokenKind::SciCompartment,
779                    span: Span::new(comp_abs, comp_abs + comp_id.len()),
780                    text: comp_id.into(),
781                });
782
783                let mut subs: Vec<Box<str>> = Vec::new();
784                // Track cursor within segment for sub-compartment offsets.
785                let mut sub_cursor = comp_id.len() + 1; // +1 skips the space
786                for sub in parts {
787                    if sub.is_empty() || !is_alnum_upper(sub) {
788                        return None;
789                    }
790                    let sub_abs = rest_abs_base + seg_off + sub_cursor;
791                    local_tokens.push(TokenSpan {
792                        kind: TokenKind::SciSubCompartment,
793                        span: Span::new(sub_abs, sub_abs + sub.len()),
794                        text: sub.into(),
795                    });
796                    subs.push(sub.into());
797                    sub_cursor += sub.len() + 1;
798                }
799
800                compartments.push(SciCompartment::new(comp_id.into(), subs.into_boxed_slice()));
801            }
802        }
803
804        // canonical_enum population (per data-model §canonical_enum):
805        // - No compartments → the bare control itself may be a CVE value
806        //   (e.g., `SI`, `TK`, `HCS`). Preserves pre-spec behavior.
807        // - One or more compartments → try `{ctrl}-{first_comp}` ONLY when
808        //   the first compartment has no sub-compartments. Sub-comps mean
809        //   the compound is a structural anchor, not an atomic CVE atom.
810        let canonical_enum = if compartments.is_empty() {
811            SciControl::parse(ctrl_str)
812        } else {
813            compartments
814                .first()
815                .filter(|c| c.sub_compartments.is_empty())
816                .and_then(|c| {
817                    let composite = format!("{}-{}", ctrl_str, c.identifier);
818                    SciControl::parse(&composite)
819                })
820        };
821
822        markings.push(SciMarking::new(
823            system,
824            compartments.into_boxed_slice(),
825            canonical_enum,
826        ));
827    }
828
829    tokens.extend(local_tokens);
830    Some(markings)
831}
832
833/// Custom control shape check: `[A-Z0-9]{2,5}` per spec §R1. Must not match
834/// a bare CVE value (caller dispatches to Published first, so this check is
835/// strictly the shape constraint).
836fn is_valid_custom_control(s: &str) -> bool {
837    let len = s.len();
838    (2..=5).contains(&len) && is_alnum_upper(s)
839}
840
/// Returns true if `s` is non-empty and consists solely of ASCII uppercase
/// letters and ASCII digits. Any other byte (lowercase, punctuation,
/// whitespace, or part of a multi-byte character) makes it false.
fn is_alnum_upper(s: &str) -> bool {
    match s.as_bytes() {
        [] => false,
        bytes => bytes
            .iter()
            .all(|b| matches!(b, b'A'..=b'Z' | b'0'..=b'9')),
    }
}
847
848/// Guard for the SCI structural subparser: returns true if `s` is a known
849/// non-SCI token (dissem, non-IC dissem, AEA marking, or declass exemption).
850/// Prevents `parse_sci_block` from claiming mixed-category slash blocks
851/// like `SI/NF` that should surface as stray-slash errors. SAR is
852/// structural (not CVE-backed) and handled by `parse_sar_category`.
853fn is_known_non_sci_token(s: &str) -> bool {
854    DissemControl::parse(s).is_some()
855        || parse_dissem_full_form(s).is_some()
856        || parse_non_ic_full_form(s).is_some()
857        || AeaMarking::parse(s).is_some()
858        || DeclassExemption::parse(s).is_some()
859}
860
861/// Parse a NATO classification string in either banner form (`"NATO SECRET"`,
862/// `"COSMIC TOP SECRET"`, etc.) or portion form (`"NS"`, `"CTS"`, etc.).
863///
864/// Includes SAP variants (ATOMAL, BOHEMIA, BALK). Longer patterns are checked
865/// first to avoid prefix ambiguity (e.g., `"COSMIC TOP SECRET ATOMAL"` before
866/// `"COSMIC TOP SECRET"`).
867fn parse_nato_classification(s: &str) -> Option<NatoClassification> {
868    // Check longer patterns first to avoid prefix matches.
869    match s {
870        // Banner forms (full words) — longer patterns first
871        "COSMIC TOP SECRET ATOMAL" => Some(NatoClassification::CosmicTopSecretAtomal),
872        "COSMIC TOP SECRET-BOHEMIA" => Some(NatoClassification::CosmicTopSecretBohemia),
873        "COSMIC TOP SECRET-BALK" => Some(NatoClassification::CosmicTopSecretBalk),
874        "COSMIC TOP SECRET" => Some(NatoClassification::CosmicTopSecret),
875        "NATO SECRET ATOMAL" => Some(NatoClassification::NatoSecretAtomal),
876        "NATO SECRET" => Some(NatoClassification::NatoSecret),
877        "NATO CONFIDENTIAL ATOMAL" => Some(NatoClassification::NatoConfidentialAtomal),
878        "NATO CONFIDENTIAL" => Some(NatoClassification::NatoConfidential),
879        "NATO RESTRICTED" => Some(NatoClassification::NatoRestricted),
880        "NATO UNCLASSIFIED" => Some(NatoClassification::NatoUnclassified),
881        // Portion forms — primary (CAPCO Register)
882        "CTSA" | "CTS-A" => Some(NatoClassification::CosmicTopSecretAtomal),
883        "CTS-B" => Some(NatoClassification::CosmicTopSecretBohemia),
884        "CTS-BALK" => Some(NatoClassification::CosmicTopSecretBalk),
885        "CTS" => Some(NatoClassification::CosmicTopSecret),
886        "NSAT" | "NS-A" => Some(NatoClassification::NatoSecretAtomal),
887        "NS" => Some(NatoClassification::NatoSecret),
888        "NCA" | "NC-A" => Some(NatoClassification::NatoConfidentialAtomal),
889        "NC" => Some(NatoClassification::NatoConfidential),
890        "NR" => Some(NatoClassification::NatoRestricted),
891        "NU" => Some(NatoClassification::NatoUnclassified),
892        _ => None,
893    }
894}
895
896/// Parse a JOINT classification block: `"JOINT S USA GBR"` or `"JOINT SECRET USA GBR"`.
897///
898/// Format: `JOINT` + classification level + space-delimited country trigraphs.
899/// Countries are space-delimited (NOT comma-delimited like REL TO).
900fn parse_joint_classification(s: &str) -> Option<JointClassification> {
901    let rest = s.strip_prefix("JOINT ")?;
902    let mut tokens = rest.split_whitespace();
903
904    // First token(s) after JOINT are the classification level.
905    // Handle two-word levels like "TOP SECRET".
906    let first = tokens.next()?;
907    let (level, remaining_start) = if first == "TOP" {
908        // Check if next token is "SECRET" to form "TOP SECRET"
909        let mut peek_tokens = rest.split_whitespace();
910        peek_tokens.next(); // skip "TOP"
911        if peek_tokens.next() == Some("SECRET") {
912            let level = parse_classification("TOP SECRET")?;
913            // Skip past "TOP SECRET" — countries start after
914            let after_ts = rest.find("SECRET").map(|i| i + "SECRET".len())?;
915            (level, after_ts)
916        } else {
917            return None; // "TOP" alone is not a valid level
918        }
919    } else {
920        let level = parse_classification(first)?;
921        let after_level = rest.find(first).map(|i| i + first.len())?;
922        (level, after_level)
923    };
924
925    // Remaining tokens are space-delimited country trigraphs.
926    //
927    // NOTE: JOINT classifications today drop non-3-byte tokens
928    // silently (tetragraphs like NATO never appear in real JOINT
929    // markings, but the parallel of issue #183's REL TO silent-drop
930    // is tracked as deferred scope for PR-B / a future issue).
931    let country_str = rest[remaining_start..].trim();
932    let mut countries = Vec::new();
933    for token in country_str.split_whitespace() {
934        if token.len() == 3 {
935            if let Some(t) = CountryCode::try_new(token.as_bytes()) {
936                countries.push(t);
937            }
938        }
939    }
940
941    if countries.is_empty() {
942        return None; // JOINT must have at least one country
943    }
944
945    Some(JointClassification {
946        level,
947        countries: countries.into(),
948    })
949}
950
951/// Parse an FGI classification block: `"GBR S"`, `"DEU TS"`, `"GBR DEU S"`,
952/// or `"FGI S"` (FGI as placeholder for unknown country).
953///
954/// Format: one or more country trigraphs (or "FGI") + classification level.
955/// Countries are space-delimited. The last token is the classification level.
956///
957/// Returns `None` if no classification level is found (e.g., bare `"FGI"` with
958/// no level — that's an error, not a valid FGI classification).
959fn parse_fgi_classification(s: &str) -> Option<FgiClassification> {
960    let tokens: Vec<&str> = s.split_whitespace().collect();
961    if tokens.len() < 2 {
962        return None; // Need at least country + level
963    }
964
965    // Last token is the classification level. Handle "TOP SECRET" as two tokens.
966    let (level, country_end) = if tokens.len() >= 3
967        && tokens[tokens.len() - 2] == "TOP"
968        && tokens[tokens.len() - 1] == "SECRET"
969    {
970        (parse_classification("TOP SECRET")?, tokens.len() - 2)
971    } else {
972        (
973            parse_classification(tokens[tokens.len() - 1])?,
974            tokens.len() - 1,
975        )
976    };
977
978    // Preceding tokens are country trigraphs (or "FGI" placeholder).
979    let mut countries = Vec::new();
980    for &token in &tokens[..country_end] {
981        if token == "FGI" {
982            // FGI as placeholder for unknown country — countries stays empty
983            continue;
984        }
985        if token.len() == 3 {
986            let t = CountryCode::try_new(token.as_bytes())?;
987            countries.push(t);
988        } else {
989            return None; // Not a trigraph or "FGI"
990        }
991    }
992
993    Some(FgiClassification {
994        countries: countries.into(),
995        level,
996    })
997}
998
999/// Parse an FGI marker block in a US-classified marking: `"FGI"` or `"FGI DEU"` or `"FGI DEU GBR"`.
1000///
1001/// This is the FGI block between SAR and dissem controls in a US-classified
1002/// marking (e.g., `SECRET//FGI DEU//NOFORN`). Not to be confused with
1003/// [`parse_fgi_classification`] which parses a non-US classification.
1004fn parse_fgi_marker(s: &str) -> Option<FgiMarker> {
1005    if s == "FGI" {
1006        return Some(FgiMarker {
1007            countries: Box::new([]),
1008        });
1009    }
1010
1011    let rest = s.strip_prefix("FGI ")?;
1012    let mut countries = Vec::new();
1013    for token in rest.split_whitespace() {
1014        if token.len() == 3 {
1015            if let Some(t) = CountryCode::try_new(token.as_bytes()) {
1016                countries.push(t);
1017            }
1018        }
1019        // Skip non-trigraph tokens for now (tetragraphs like NATO)
1020    }
1021
1022    Some(FgiMarker {
1023        countries: countries.into(),
1024    })
1025}
1026
1027/// Attempt to parse a block as a foreign classification (NATO, JOINT, or FGI).
1028///
1029/// Used as a fallback in the block loop to detect conflict scenarios
1030/// (e.g., `SECRET//NATO SECRET//NOFORN`) where a foreign classification
1031/// appears alongside a US classification.
1032fn try_parse_foreign_classification(s: &str) -> Option<ForeignClassification> {
1033    if let Some(nato) = parse_nato_classification(s) {
1034        Some(ForeignClassification::Nato(nato))
1035    } else if let Some(joint) = parse_joint_classification(s) {
1036        Some(ForeignClassification::Joint(joint))
1037    } else {
1038        parse_fgi_classification(s).map(ForeignClassification::Fgi)
1039    }
1040}
1041
1042/// Map a banner-form (full-word) dissemination control to its CVE
1043/// abbreviation form. The CVE only ships abbreviations (`NF`, `OC`, ...),
1044/// but banner markings use the full words (`NOFORN`, `ORCON`, ...) and the
1045/// parser must accept both. Phase 3 added this fallback so banner-form
1046/// markings parse cleanly into a typed `DissemControl`.
1047///
1048/// Rules that detect "banner uses portion abbreviation" (E001) read the
1049/// raw token span via `attrs.token_spans` and inspect the original bytes,
1050/// so this mapping does not lose the abbreviation-vs-full-word signal.
1051///
1052/// Mapping data sourced from [`marque_ism::marking_forms`].
1053fn parse_dissem_full_form(s: &str) -> Option<DissemControl> {
1054    // Accept both the Banner Line Abbreviation (e.g., "NOFORN") and the
1055    // long Marking Title (e.g., "NOT RELEASABLE TO FOREIGN NATIONALS")
1056    // per CAPCO-2016 §A.6 line 317: "Any control markings in the banner
1057    // line may be spelled out per the 'Marking Title' ... or abbreviated
1058    // as per the 'Authorized Abbreviation' ... in accordance with the
1059    // Register". Long-title acceptance is what lets the S001 style rule
1060    // observe banner-form tokens that use the full title — without it
1061    // the parser would tag those as Unknown and E008 would fire instead.
1062    let portion = marque_ism::marking_forms::banner_to_portion(s)
1063        .or_else(|| marque_ism::marking_forms::title_to_portion(s))?;
1064    DissemControl::parse(portion)
1065}
1066
1067/// Non-IC dissemination control parser covering both the Banner Line
1068/// Abbreviation (e.g., `"LIMDIS"`) and the long "Marking Title" form
1069/// (e.g., `"LIMITED DISTRIBUTION"`). Mirror of [`parse_dissem_full_form`]
1070/// for the §9 non-IC marking set so the S001 style rule can see title
1071/// tokens across both categories.
1072fn parse_non_ic_full_form(s: &str) -> Option<NonIcDissem> {
1073    NonIcDissem::parse(s).or_else(|| {
1074        let portion = marque_ism::marking_forms::title_to_portion(s)?;
1075        NonIcDissem::parse(portion)
1076    })
1077}
1078
/// Return type for [`parse_rel_to_with_spans`].
///
/// Carries both the recognized country codes and any dissem/non-IC controls
/// that were appended to a comma entry via an intra-segment `/` separator
/// (e.g., `REL TO USA, FVEY/NF` → countries=[USA, FVEY],
/// trailing_dissem=[NF]).
struct RelToParseResult {
    /// Country codes recognized in the `REL TO` list, in source order.
    countries: Vec<CountryCode>,
    /// Dissemination controls found after a `/` inside a comma entry.
    trailing_dissem: Vec<DissemControl>,
    /// Non-IC dissemination controls found after a `/` inside a comma entry.
    trailing_non_ic: Vec<NonIcDissem>,
}
1090
1091/// Span-aware parse of a `REL TO ...` block. Records one
1092/// `TokenKind::RelToTrigraph` per recognized country code.
1093///
1094/// When a comma entry ends with `/<control>` — e.g., the last entry is
1095/// `FVEY/NF` instead of just `FVEY` — the function splits on the `/` and
1096/// parses the tail as additional dissem/non-IC controls. This handles the
1097/// CAPCO portion-mark convention where dissem controls in the same `//`-slot
1098/// are separated by `/` (e.g., `(TS//REL TO USA, FVEY/NF)` is valid). The
1099/// caller must extend its own `dissem`/`non_ic` vecs from the returned
1100/// `trailing_dissem` / `trailing_non_ic` fields.
1101///
1102/// `block_offset` is the absolute byte offset of `block` within the
1103/// original source buffer.
1104fn parse_rel_to_with_spans(
1105    block: &str,
1106    block_offset: usize,
1107    tokens: &dyn TokenSet,
1108    token_spans: &mut Vec<TokenSpan>,
1109) -> RelToParseResult {
1110    // Skip the "REL TO" / "REL" prefix to land on the trigraph list. We
1111    // need the offset of the *trigraph list* within `block` so that each
1112    // trigraph's absolute span can be computed.
1113    let prefix_skip = if let Some(rest) = block.strip_prefix("REL TO") {
1114        block.len() - rest.len()
1115    } else if let Some(rest) = block.strip_prefix("REL") {
1116        block.len() - rest.len()
1117    } else {
1118        0
1119    };
1120    let after_rel = &block[prefix_skip..];
1121
1122    let mut countries: Vec<CountryCode> = Vec::new();
1123    let mut trailing_dissem: Vec<DissemControl> = Vec::new();
1124    let mut trailing_non_ic: Vec<NonIcDissem> = Vec::new();
1125    // Walk comma-separated entries, tracking each entry's offset within
1126    // `after_rel` so we can land an absolute span on the trigraph itself
1127    // (not on any leading whitespace).
1128    let mut cursor = 0usize;
1129    for entry in after_rel.split(',') {
1130        let entry_start_in_after = cursor;
1131        // Advance past the entry and its trailing comma. On the final
1132        // iteration this steps one past the end of `after_rel`, but the
1133        // cursor is never read after the loop ends — the split iterator
1134        // drives loop termination, not the cursor. usize addition here
1135        // is bounded by the document size, so no overflow in practice.
1136        cursor += entry.len() + 1;
1137
1138        let trim_lead = entry.len() - entry.trim_start().len();
1139        let trimmed = entry.trim();
1140        if trimmed.is_empty() {
1141            continue;
1142        }
1143        let abs_start = block_offset + prefix_skip + entry_start_in_after + trim_lead;
1144
1145        // If the entry contains `/`, the part before the slash is the country
1146        // code and the part(s) after are additional dissem/non-IC controls
1147        // packed into the same `//`-slot (e.g., `FVEY/NF` in `REL TO USA,
1148        // FVEY/NF`). CAPCO portion-mark syntax uses `/` as the intra-segment
1149        // control separator within a `//`-delimited slot (§A.4 / §D.1).
1150        if let Some(slash_pos) = trimmed.find('/') {
1151            let country_part = trimmed[..slash_pos].trim();
1152            let tail = trimmed[slash_pos + 1..].trim();
1153
1154            // Parse the country part (may be empty if the slash is leading).
1155            if !country_part.is_empty() {
1156                if tokens.is_trigraph(country_part) {
1157                    if let Some(t) = CountryCode::try_new(country_part.as_bytes()) {
1158                        countries.push(t);
1159                        token_spans.push(TokenSpan {
1160                            kind: TokenKind::RelToTrigraph,
1161                            span: Span::new(abs_start, abs_start + country_part.len()),
1162                            text: country_part.into(),
1163                        });
1164                    }
1165                } else {
1166                    token_spans.push(TokenSpan {
1167                        kind: TokenKind::Unknown,
1168                        span: Span::new(abs_start, abs_start + country_part.len()),
1169                        text: country_part.into(),
1170                    });
1171                }
1172            }
1173
1174            // Parse each `/`-separated tail token as a dissem or non-IC control.
1175            let tail_base = abs_start + slash_pos + 1;
1176            let mut tail_cursor = 0usize;
1177            for part in tail.split('/') {
1178                let part_trim_lead = part.len() - part.trim_start().len();
1179                let part = part.trim();
1180                let part_abs = tail_base + tail_cursor + part_trim_lead;
1181                tail_cursor += part.len() + part_trim_lead + 1; // +1 for `/`
1182                if part.is_empty() {
1183                    continue;
1184                }
1185                if let Some(ctrl) =
1186                    DissemControl::parse(part).or_else(|| parse_dissem_full_form(part))
1187                {
1188                    trailing_dissem.push(ctrl);
1189                    token_spans.push(TokenSpan {
1190                        kind: TokenKind::DissemControl,
1191                        span: Span::new(part_abs, part_abs + part.len()),
1192                        text: part.into(),
1193                    });
1194                } else if let Some(nic) = parse_non_ic_full_form(part) {
1195                    trailing_non_ic.push(nic);
1196                    token_spans.push(TokenSpan {
1197                        kind: TokenKind::NonIcDissem,
1198                        span: Span::new(part_abs, part_abs + part.len()),
1199                        text: part.into(),
1200                    });
1201                } else {
1202                    token_spans.push(TokenSpan {
1203                        kind: TokenKind::Unknown,
1204                        span: Span::new(part_abs, part_abs + part.len()),
1205                        text: part.into(),
1206                    });
1207                }
1208            }
1209            continue;
1210        }
1211
1212        if !tokens.is_trigraph(trimmed) {
1213            // Issue #233: emit an Unknown span for unrecognized
1214            // entries inside a REL TO block instead of silently
1215            // dropping them. The decoder's
1216            // ``DecoderRecognizer::recognize`` step 3a rejects any
1217            // candidate whose strict parse leaves Unknown spans,
1218            // which is what makes the fuzzy-trigraph expansion
1219            // (``try_rel_to_fuzzy_trigraph_candidates``) win the
1220            // score contest: the original "drop USB" candidate now
1221            // carries an Unknown span and is filtered out, leaving
1222            // the corpus-weighted log-prior to break ties between
1223            // the surviving fuzzy alternates (USA, UZB, …).
1224            //
1225            // Strict-path callers still see a clean ``rel_to`` slice
1226            // — the Unknown span is metadata for the decoder filter,
1227            // not a parser failure. Existing rules that walk
1228            // ``token_spans`` already handle ``TokenKind::Unknown``
1229            // (see E030 sar-indicator-repeat for the analogous
1230            // pattern at line ~263).
1231            token_spans.push(TokenSpan {
1232                kind: TokenKind::Unknown,
1233                span: Span::new(abs_start, abs_start + trimmed.len()),
1234                text: trimmed.into(),
1235            });
1236            continue;
1237        }
1238        // Issue #183: drop the historical `b.len() != 3` gate that
1239        // silently dropped tetragraphs (`FVEY`, `NATO`, `ACGU`, …)
1240        // and the longer registered codes (`EU`, `AUSTRALIA_GROUP`)
1241        // from `rel_to`. `is_trigraph` already covers the full
1242        // registered CVE recognition surface, including trigraphs,
1243        // tetragraphs, and longer special forms such as `EU` and
1244        // `AUSTRALIA_GROUP`; `CountryCode::try_new` accepts
1245        // 2..=16-byte codes in the CAPCO byte set, so any code that
1246        // passed `is_trigraph` will also pass `try_new` here.
1247        let Some(t) = CountryCode::try_new(trimmed.as_bytes()) else {
1248            continue;
1249        };
1250        countries.push(t);
1251        token_spans.push(TokenSpan {
1252            kind: TokenKind::RelToTrigraph,
1253            span: Span::new(abs_start, abs_start + trimmed.len()),
1254            text: trimmed.into(),
1255        });
1256    }
1257    RelToParseResult {
1258        countries,
1259        trailing_dissem,
1260        trailing_non_ic,
1261    }
1262}
1263
// CVE-backed values (SCI controls, dissemination controls, and declass
// exemptions) parse via their generated `parse()` methods (see
// `parse_marking_string` above). The hand-coded paths in this file are
// `parse_classification`, the NATO / JOINT / FGI classification parsers,
// and the structural SCI and SAR subparsers, each documented inline.
1268
1269/// Returns `true` if `s` looks like a syntactically and calendrically valid
1270/// inline declassification date.
1271///
1272/// CAPCO allows `YYYYMMDD` (8-digit) or `YYYY` (4-digit, meaning declassify
1273/// at the start of that calendar year). Both forms are valid in a CAB but
1274/// are a violation (E005) if they appear directly in a banner or portion
1275/// marking string.
1276///
1277/// Only strings that round-trip through [`IsmDate::from_str`] successfully
1278/// are accepted. This rejects impossible dates like `20301340` (month 13 /
1279/// day 40) that look like dates but would silently set `declassify_on` to
1280/// `None` and prevent E005 from firing.
1281fn is_declass_date(s: &str) -> bool {
1282    let bytes = s.as_bytes();
1283    if !matches!(bytes.len(), 4 | 8) || !bytes.iter().all(u8::is_ascii_digit) {
1284        return false;
1285    }
1286    IsmDate::from_str(s).is_ok()
1287}
1288
/// Splits `s` on `/` and returns `(offset, trimmed_token)` pairs, where
/// `offset` is the byte offset of the trimmed token within `s`. Pieces
/// that are empty or whitespace-only are omitted.
///
/// Used by the multi-token block fallback to handle CAPCO §D.1 blocks like
/// `"SI/TK"` or `"NF/LIMDIS"` where multiple entries share one `//` block.
fn split_slash_with_offsets(s: &str) -> Vec<(usize, &str)> {
    // Byte offset within `s` at which the next raw piece begins.
    let mut next_start = 0usize;
    s.split('/')
        .filter_map(|piece| {
            let piece_start = next_start;
            next_start += piece.len() + 1; // +1 for the `/` separator

            let token = piece.trim();
            if token.is_empty() {
                return None;
            }
            let leading_ws = piece.len() - piece.trim_start().len();
            Some((piece_start + leading_ws, token))
        })
        .collect()
}
1307
1308// ===========================================================================
1309// SAR subparser (§H.5 / §A.6)
1310// ===========================================================================
1311
1312/// Parse a single SAR category block.
1313///
1314/// `block_text` is the full block text (everything between `//` separators)
1315/// INCLUDING the `SAR-` or `SPECIAL ACCESS REQUIRED-` indicator prefix.
1316/// `base` is the absolute byte offset in the original source where
1317/// `block_text` starts.
1318///
1319/// Returns `Some((marking, spans))` when `block_text` starts with a recognized
1320/// SAR indicator AND the remainder is grammatically non-empty. Each returned
1321/// [`TokenSpan`] carries absolute byte offsets into the source.
1322///
1323/// Grammar (see spec `specs/002-sar-implementation/spec.md` §R2):
1324///
1325/// ```text
1326/// SAR_BLOCK    := INDICATOR PROGRAM ("/" PROGRAM)*
1327/// INDICATOR    := "SAR-" | "SPECIAL ACCESS REQUIRED-"
1328/// PROGRAM      := PROG_ID ( "-" COMPARTMENT )?
1329/// COMPARTMENT  := COMP_ID (" " SUB_COMP)*
1330/// PROG_ID      := [A-Z0-9]{2,3}           (SAR- form)
1331///               | [A-Z ]+                  (full-indicator form)
1332/// COMP_ID      := [A-Z0-9]+
1333/// SUB_COMP     := [A-Z0-9]+
1334/// ```
1335///
1336/// Rejection returns `None`:
1337/// - `SAR` without trailing hyphen.
1338/// - `SAR-` with an empty program identifier.
1339/// - A `//` sequence inside `block_text` (should not happen — the outer
1340///   category-block splitter would have handed us two separate blocks —
1341///   but we reject defensively).
1342/// - Empty string.
1343///
1344/// Ordering, classification, and roll-up constraints are NOT enforced here;
1345/// they are rule-layer (P3/P4) concerns.
1346fn parse_sar_category(block_text: &str, base: usize) -> Option<(SarMarking, Vec<TokenSpan>)> {
1347    // Defensive: `//` would mean the outer splitter gave us more than one
1348    // block. Refuse so the caller can record the text as Unknown and let
1349    // E030 handle it separately.
1350    if block_text.contains("//") {
1351        return None;
1352    }
1353
1354    // Identify the indicator variant. Longer prefix first so `SPECIAL
1355    // ACCESS REQUIRED-` wins over any putative `SAR-` substring.
1356    let (indicator, indicator_lit) = if block_text.starts_with("SPECIAL ACCESS REQUIRED-") {
1357        (SarIndicator::Full, "SPECIAL ACCESS REQUIRED-")
1358    } else if block_text.starts_with("SAR-") {
1359        (SarIndicator::Abbrev, "SAR-")
1360    } else {
1361        return None;
1362    };
1363    let rest_offset = indicator_lit.len();
1364    let rest = &block_text[rest_offset..];
1365    if rest.is_empty() {
1366        return None;
1367    }
1368
1369    let mut spans: Vec<TokenSpan> = Vec::new();
1370
1371    // Record the indicator span (does NOT include the first character of
1372    // the program identifier — only the literal `SAR-` / `SPECIAL ACCESS
1373    // REQUIRED-` including the trailing hyphen).
1374    spans.push(TokenSpan {
1375        kind: TokenKind::SarIndicator,
1376        span: Span::new(base, base + indicator_lit.len()),
1377        text: indicator_lit.into(),
1378    });
1379
1380    let mut programs: Vec<SarProgram> = Vec::new();
1381
1382    // Split the remainder on `/` into program chunks. Each chunk is a
1383    // `PROGRAM` production: `PROG_ID` optionally followed by `-COMPARTMENT`.
1384    let mut chunk_offset = rest_offset; // offset within block_text
1385    for (i, prog_chunk) in rest.split('/').enumerate() {
1386        if i > 0 {
1387            chunk_offset += 1; // account for the `/` just consumed
1388        }
1389        let program_base = base + chunk_offset;
1390
1391        let program = parse_sar_program(prog_chunk, program_base, indicator, &mut spans)?;
1392        programs.push(program);
1393        chunk_offset += prog_chunk.len();
1394    }
1395
1396    if programs.is_empty() {
1397        return None;
1398    }
1399
1400    Some((
1401        SarMarking::new(indicator, programs.into_boxed_slice()),
1402        spans,
1403    ))
1404}
1405
1406/// Parse a single `PROGRAM` production.
1407///
1408/// `chunk` is everything between adjacent `/` separators (or between the
1409/// indicator and the next `/`, or the tail of the block). `base` is the
1410/// absolute offset of `chunk[0]` in the source buffer. `indicator` drives
1411/// the shape of the program identifier only; compartment and
1412/// sub-compartment parsing is identical for both indicator forms.
1413///
1414/// Grammar: `PROG_ID ( "-" COMPARTMENT )? ( "-" COMPARTMENT )* `, where
1415/// `COMPARTMENT` is `COMP_ID (" " SUB_COMP)*`. `PROG_ID` shape is:
1416///
1417/// - **Abbrev** (`SAR-`): 2–3 alphanumeric characters.
1418/// - **Full** (`SPECIAL ACCESS REQUIRED-`): one or more uppercase ASCII
1419///   letters, optionally with spaces. Hyphens are NOT permitted inside
1420///   the program identifier for the full form — the first `-` always
1421///   marks the program/compartment boundary (CAPCO-2016 §H.5 p100).
1422///
1423/// Canonical example per §H.5 p100: `SAR-BP-J12 J54-K15/CD-...` decomposes
1424/// BP as two compartments `J12` (with sub-compartment `J54`) and `K15`.
1425/// Within one program the sequence alternates:
1426///   `PROG "-" COMP (" " SUB)* ( "-" COMP (" " SUB)* )*`
1427fn parse_sar_program(
1428    chunk: &str,
1429    base: usize,
1430    indicator: SarIndicator,
1431    spans: &mut Vec<TokenSpan>,
1432) -> Option<SarProgram> {
1433    if chunk.is_empty() {
1434        return None;
1435    }
1436
1437    // Split the chunk on `-`. The first segment is the program identifier;
1438    // each subsequent segment is a compartment (with optional space-joined
1439    // sub-compartments).
1440    let mut segments = split_with_offsets(chunk, '-');
1441    if segments.is_empty() {
1442        return None;
1443    }
1444
1445    // Program identifier: first segment. Shape check depends on indicator.
1446    let (prog_off, prog_id) = segments.remove(0);
1447    if prog_id.is_empty() {
1448        return None;
1449    }
1450    let prog_shape_ok = match indicator {
1451        // 2–3 alphanumeric chars.
1452        SarIndicator::Abbrev => {
1453            (2..=3).contains(&prog_id.len()) && prog_id.bytes().all(|b| b.is_ascii_alphanumeric())
1454        }
1455        // Uppercase ASCII letters with optional spaces; no digits, no
1456        // hyphens. Must contain at least one non-space byte.
1457        SarIndicator::Full => {
1458            prog_id.bytes().all(|b| b == b' ' || b.is_ascii_uppercase())
1459                && prog_id.bytes().any(|b| b != b' ')
1460        }
1461    };
1462    if !prog_shape_ok {
1463        return None;
1464    }
1465    spans.push(TokenSpan {
1466        kind: TokenKind::SarProgram,
1467        span: Span::new(base + prog_off, base + prog_off + prog_id.len()),
1468        text: prog_id.into(),
1469    });
1470
1471    // Remaining segments: each is a compartment, possibly with
1472    // space-separated sub-compartments.
1473    let mut compartments: Vec<SarCompartment> = Vec::with_capacity(segments.len());
1474    for (seg_off, seg) in segments {
1475        if seg.is_empty() {
1476            return None;
1477        }
1478        // Split segment on ` ` — first token is compartment, rest are subs.
1479        let mut parts = split_with_offsets(seg, ' ');
1480        let (comp_rel_off, comp_id) = parts.remove(0);
1481        if comp_id.is_empty() || !comp_id.bytes().all(|b| b.is_ascii_alphanumeric()) {
1482            return None;
1483        }
1484        let comp_abs_off = seg_off + comp_rel_off;
1485        spans.push(TokenSpan {
1486            kind: TokenKind::SarCompartment,
1487            span: Span::new(base + comp_abs_off, base + comp_abs_off + comp_id.len()),
1488            text: comp_id.into(),
1489        });
1490
1491        let mut subs: Vec<Box<str>> = Vec::with_capacity(parts.len());
1492        for (sub_rel_off, sub_id) in parts {
1493            if sub_id.is_empty() || !sub_id.bytes().all(|b| b.is_ascii_alphanumeric()) {
1494                return None;
1495            }
1496            let sub_abs_off = seg_off + sub_rel_off;
1497            spans.push(TokenSpan {
1498                kind: TokenKind::SarSubCompartment,
1499                span: Span::new(base + sub_abs_off, base + sub_abs_off + sub_id.len()),
1500                text: sub_id.into(),
1501            });
1502            subs.push(sub_id.into());
1503        }
1504
1505        compartments.push(SarCompartment::new(comp_id.into(), subs.into_boxed_slice()));
1506    }
1507
1508    Some(SarProgram::new(
1509        prog_id.into(),
1510        compartments.into_boxed_slice(),
1511    ))
1512}
1513
/// Split `s` on `delim`, pairing every token with the byte offset at which
/// it starts inside `s`.
///
/// Empty tokens are kept (unlike [`split_slash_with_offsets`]) so callers
/// can detect malformed input: `"-BP"` yields `[(0, ""), (1, "BP")]` and
/// `"J12--K15"` yields `[(0, "J12"), (4, ""), (5, "K15")]`. The result is
/// never empty — an empty `s` yields the single pair `(0, "")`.
fn split_with_offsets(s: &str, delim: char) -> Vec<(usize, &str)> {
    // Offsets are byte offsets, so step over the delimiter's UTF-8 width.
    let step = delim.len_utf8();
    s.split(delim)
        .scan(0usize, |cursor, piece| {
            let start = *cursor;
            *cursor = start + piece.len() + step;
            Some((start, piece))
        })
        .collect()
}
1527
1528#[cfg(test)]
1529#[cfg_attr(coverage_nightly, coverage(off))]
1530mod tests {
1531    use super::*;
1532    use marque_ism::span::{MarkingCandidate, MarkingType, Span};
1533    use marque_ism::token_set::CapcoTokenSet;
1534
1535    fn make_candidate(text: &[u8], kind: MarkingType, offset: usize) -> MarkingCandidate {
1536        MarkingCandidate {
1537            span: Span::new(offset, offset + text.len()),
1538            kind,
1539        }
1540    }
1541
1542    fn parse_banner(text: &str) -> ParsedMarking {
1543        let source = text.as_bytes();
1544        let tokens = CapcoTokenSet;
1545        let parser = Parser::new(&tokens);
1546        let candidate = make_candidate(source, MarkingType::Banner, 0);
1547        parser
1548            .parse(&candidate, source)
1549            .expect("parse should succeed")
1550    }
1551
1552    fn parse_portion(text: &str) -> ParsedMarking {
1553        let source = text.as_bytes();
1554        let tokens = CapcoTokenSet;
1555        let parser = Parser::new(&tokens);
1556        let candidate = make_candidate(source, MarkingType::Portion, 0);
1557        parser
1558            .parse(&candidate, source)
1559            .expect("parse should succeed")
1560    }
1561
1562    // --- declass exemption in banner (E005 detection) ---
1563
1564    #[test]
1565    fn banner_with_declass_exemption_populates_attrs() {
1566        // A banner string that (incorrectly) contains a declass exemption code.
1567        // parse_marking_string must populate declass_exemption so E005 can fire.
1568        let parsed = parse_banner("SECRET//25X1//NOFORN");
1569        assert!(
1570            parsed.attrs.declass_exemption.is_some(),
1571            "declass_exemption should be populated when 25X1 appears in banner"
1572        );
1573        use marque_ism::DeclassExemption;
1574        assert_eq!(
1575            parsed.attrs.declass_exemption,
1576            Some(DeclassExemption::X25x1)
1577        );
1578    }
1579
1580    #[test]
1581    fn portion_with_declass_exemption_populates_attrs() {
1582        let parsed = parse_portion("(SECRET//50X1-HUM)");
1583        assert!(parsed.attrs.declass_exemption.is_some());
1584    }
1585
1586    // --- declass date in banner (E005 detection) ---
1587
1588    #[test]
1589    fn banner_with_declass_date_populates_attrs() {
1590        let parsed = parse_banner("SECRET//20301231//NOFORN");
1591        assert_eq!(
1592            parsed.attrs.declassify_on,
1593            Some(marque_ism::IsmDate::Date(2030, 12, 31)),
1594            "declassify_on should be populated when YYYYMMDD appears in banner"
1595        );
1596    }
1597
1598    #[test]
1599    fn banner_with_four_digit_year_populates_attrs() {
1600        let parsed = parse_banner("SECRET//2035");
1601        assert_eq!(
1602            parsed.attrs.declassify_on,
1603            Some(marque_ism::IsmDate::Year(2035))
1604        );
1605    }
1606
1607    // --- normal banner (no declass tokens) ---
1608
1609    #[test]
1610    fn banner_without_declass_leaves_fields_none() {
1611        let parsed = parse_banner("TOP SECRET//SI//NOFORN");
1612        assert!(parsed.attrs.declassify_on.is_none());
1613        assert!(parsed.attrs.declass_exemption.is_none());
1614    }
1615
1616    // --- is_declass_date helper ---
1617
1618    #[test]
1619    fn is_declass_date_accepts_yyyymmdd() {
1620        assert!(is_declass_date("20301231"));
1621    }
1622
1623    #[test]
1624    fn is_declass_date_accepts_yyyy() {
1625        assert!(is_declass_date("2035"));
1626    }
1627
1628    #[test]
1629    fn is_declass_date_rejects_non_digit() {
1630        assert!(!is_declass_date("2030X231"));
1631        assert!(!is_declass_date("YYYYMMDD"));
1632    }
1633
1634    #[test]
1635    fn is_declass_date_rejects_wrong_length() {
1636        assert!(!is_declass_date("203012"));
1637        assert!(!is_declass_date("203012311"));
1638    }
1639
1640    #[test]
1641    fn is_declass_date_rejects_impossible_calendar_dates() {
1642        // Month 13 is impossible.
1643        assert!(!is_declass_date("20301340"));
1644        // Day 0 is impossible.
1645        assert!(!is_declass_date("20300100"));
1646        // 2003-02-31 doesn't exist (February has at most 29 days).
1647        assert!(!is_declass_date("20030231"));
1648        // 2003-04-31 doesn't exist (April has 30 days).
1649        assert!(!is_declass_date("20030431"));
1650    }
1651
1652    // --- token spans ---
1653
1654    #[test]
1655    fn token_spans_track_offsets_in_banner() {
1656        let parsed = parse_banner("TOP SECRET//SI//NF");
1657        let kinds: Vec<TokenKind> = parsed.attrs.token_spans.iter().map(|t| t.kind).collect();
1658        // Two separators + classification + sci + dissem.
1659        assert!(kinds.contains(&TokenKind::Separator));
1660        assert!(kinds.contains(&TokenKind::Classification));
1661        assert!(kinds.contains(&TokenKind::SciControl));
1662        assert!(kinds.contains(&TokenKind::DissemControl));
1663
1664        // Find each by kind and verify the byte slice matches.
1665        let src = b"TOP SECRET//SI//NF";
1666        let cls = parsed
1667            .attrs
1668            .token_spans
1669            .iter()
1670            .find(|t| t.kind == TokenKind::Classification)
1671            .unwrap();
1672        assert_eq!(cls.span.as_str(src).unwrap(), "TOP SECRET");
1673
1674        let sci = parsed
1675            .attrs
1676            .token_spans
1677            .iter()
1678            .find(|t| t.kind == TokenKind::SciControl)
1679            .unwrap();
1680        assert_eq!(sci.span.as_str(src).unwrap(), "SI");
1681
1682        let dissem = parsed
1683            .attrs
1684            .token_spans
1685            .iter()
1686            .find(|t| t.kind == TokenKind::DissemControl)
1687            .unwrap();
1688        assert_eq!(dissem.span.as_str(src).unwrap(), "NF");
1689    }
1690
1691    #[test]
1692    fn token_spans_strip_paren_in_portion() {
1693        let parsed = parse_portion("(SECRET//NF)");
1694        let src = b"(SECRET//NF)";
1695        let cls = parsed
1696            .attrs
1697            .token_spans
1698            .iter()
1699            .find(|t| t.kind == TokenKind::Classification)
1700            .unwrap();
1701        // SECRET starts at byte 1 (after the open paren), runs to byte 7.
1702        assert_eq!(cls.span.start, 1);
1703        assert_eq!(cls.span.end, 7);
1704        assert_eq!(cls.span.as_str(src).unwrap(), "SECRET");
1705
1706        let dissem = parsed
1707            .attrs
1708            .token_spans
1709            .iter()
1710            .find(|t| t.kind == TokenKind::DissemControl)
1711            .unwrap();
1712        // NF starts at byte 9 (after `SECRET//`).
1713        assert_eq!(dissem.span.start, 9);
1714        assert_eq!(dissem.span.end, 11);
1715    }
1716
1717    #[test]
1718    fn token_spans_record_unknown_token() {
1719        let parsed = parse_banner("SECRET//XYZZY//NOFORN");
1720        let unknowns: Vec<&TokenSpan> = parsed
1721            .attrs
1722            .token_spans
1723            .iter()
1724            .filter(|t| t.kind == TokenKind::Unknown)
1725            .collect();
1726        assert_eq!(unknowns.len(), 1);
1727        assert_eq!(
1728            unknowns[0].span.as_str(b"SECRET//XYZZY//NOFORN").unwrap(),
1729            "XYZZY"
1730        );
1731    }
1732
1733    #[test]
1734    fn token_spans_record_rel_to_trigraphs() {
1735        let parsed = parse_banner("SECRET//REL TO USA, GBR, AUS");
1736        let trigraphs: Vec<&TokenSpan> = parsed
1737            .attrs
1738            .token_spans
1739            .iter()
1740            .filter(|t| t.kind == TokenKind::RelToTrigraph)
1741            .collect();
1742        assert_eq!(trigraphs.len(), 3);
1743        let src = b"SECRET//REL TO USA, GBR, AUS";
1744        assert_eq!(trigraphs[0].span.as_str(src).unwrap(), "USA");
1745        assert_eq!(trigraphs[1].span.as_str(src).unwrap(), "GBR");
1746        assert_eq!(trigraphs[2].span.as_str(src).unwrap(), "AUS");
1747    }
1748
1749    // -----------------------------------------------------------------------
1750    // Issue #183 PR-A — country-code widening: REL TO must preserve
1751    // tetragraphs (FVEY, NATO, ACGU, …), `EU`, and `AUSTRALIA_GROUP`.
1752    // Pre-PR-A, every non-3-byte token was silently dropped at the
1753    // `b.len() != 3` gate in `parse_rel_to_with_spans`, so a marking
1754    // like `(S//REL TO USA, FVEY, GBR)` arrived at the rule layer as
1755    // `rel_to: [USA, GBR]` — FVEY gone with no diagnostic.
1756    // -----------------------------------------------------------------------
1757
1758    #[test]
1759    fn rel_to_preserves_tetragraph_fvey() {
1760        let parsed = parse_banner("SECRET//REL TO USA, FVEY, GBR");
1761        let codes: Vec<&str> = parsed.attrs.rel_to.iter().map(|c| c.as_str()).collect();
1762        assert_eq!(
1763            codes,
1764            vec!["USA", "FVEY", "GBR"],
1765            "FVEY tetragraph must land in rel_to (issue #183 silent-drop fix)"
1766        );
1767    }
1768
1769    #[test]
1770    fn rel_to_preserves_opaque_tetragraph_nato() {
1771        let parsed = parse_banner("SECRET//REL TO USA, NATO, GBR");
1772        let codes: Vec<&str> = parsed.attrs.rel_to.iter().map(|c| c.as_str()).collect();
1773        assert_eq!(
1774            codes,
1775            vec!["USA", "NATO", "GBR"],
1776            "NATO is in CVE TRIGRAPHS recognition set; rel_to must preserve it \
1777             even though membership expansion is deferred to Phase F"
1778        );
1779    }
1780
1781    #[test]
1782    fn rel_to_preserves_two_byte_eu() {
1783        let parsed = parse_banner("SECRET//REL TO USA, EU");
1784        let codes: Vec<&str> = parsed.attrs.rel_to.iter().map(|c| c.as_str()).collect();
1785        assert_eq!(
1786            codes,
1787            vec!["USA", "EU"],
1788            "EU (2-byte CVE entry) must round-trip through the parser"
1789        );
1790    }
1791
1792    #[test]
1793    fn rel_to_preserves_long_australia_group() {
1794        let parsed = parse_banner("SECRET//REL TO USA, AUSTRALIA_GROUP");
1795        let codes: Vec<&str> = parsed.attrs.rel_to.iter().map(|c| c.as_str()).collect();
1796        assert_eq!(
1797            codes,
1798            vec!["USA", "AUSTRALIA_GROUP"],
1799            "AUSTRALIA_GROUP (15-byte CVE entry, contains underscore) \
1800             must round-trip through the parser"
1801        );
1802    }
1803
1804    #[test]
1805    fn rel_to_token_span_widens_to_actual_code_length() {
1806        // Pre-PR-A the RelToTrigraph TokenSpan was hardcoded to 3
1807        // bytes (`Span::new(abs_start, abs_start + 3)`). Widening
1808        // matters because consumers — the E002 fix splice and
1809        // diagnostic underlines — read `span.as_str()` to anchor
1810        // their replacement / message at the exact source bytes.
1811        let parsed = parse_banner("SECRET//REL TO USA, FVEY, AUSTRALIA_GROUP");
1812        let trigraph_spans: Vec<&TokenSpan> = parsed
1813            .attrs
1814            .token_spans
1815            .iter()
1816            .filter(|t| t.kind == TokenKind::RelToTrigraph)
1817            .collect();
1818        let src = b"SECRET//REL TO USA, FVEY, AUSTRALIA_GROUP";
1819        assert_eq!(trigraph_spans[0].span.as_str(src).unwrap(), "USA");
1820        assert_eq!(trigraph_spans[1].span.as_str(src).unwrap(), "FVEY");
1821        assert_eq!(
1822            trigraph_spans[2].span.as_str(src).unwrap(),
1823            "AUSTRALIA_GROUP"
1824        );
1825    }
1826
1827    #[test]
1828    fn rel_to_drops_unrecognized_token_silently() {
1829        // Defensive: tokens outside the CVE recognition set
1830        // (`is_trigraph` is false) are still skipped — we widened
1831        // recognition, not the gate. `XYZQ` is a 4-char string not
1832        // in the CVE TRIGRAPHS list.
1833        let parsed = parse_banner("SECRET//REL TO USA, XYZQ, GBR");
1834        let codes: Vec<&str> = parsed.attrs.rel_to.iter().map(|c| c.as_str()).collect();
1835        assert_eq!(codes, vec!["USA", "GBR"]);
1836    }
1837
1838    #[test]
1839    fn token_spans_record_separators() {
1840        let parsed = parse_banner("SECRET//NF");
1841        let seps: Vec<&TokenSpan> = parsed
1842            .attrs
1843            .token_spans
1844            .iter()
1845            .filter(|t| t.kind == TokenKind::Separator)
1846            .collect();
1847        assert_eq!(seps.len(), 1);
1848        let src = b"SECRET//NF";
1849        assert_eq!(seps[0].span.as_str(src).unwrap(), "//");
1850    }
1851
1852    // -----------------------------------------------------------------------
1853    // Non-US classification parsing
1854    // -----------------------------------------------------------------------
1855
1856    #[test]
1857    fn nato_banner_parses_all_variants() {
1858        for (input, expected) in [
1859            ("//NATO UNCLASSIFIED", NatoClassification::NatoUnclassified),
1860            ("//NATO RESTRICTED", NatoClassification::NatoRestricted),
1861            ("//NATO CONFIDENTIAL", NatoClassification::NatoConfidential),
1862            (
1863                "//NATO CONFIDENTIAL ATOMAL",
1864                NatoClassification::NatoConfidentialAtomal,
1865            ),
1866            ("//NATO SECRET", NatoClassification::NatoSecret),
1867            ("//NATO SECRET ATOMAL", NatoClassification::NatoSecretAtomal),
1868            ("//COSMIC TOP SECRET", NatoClassification::CosmicTopSecret),
1869            (
1870                "//COSMIC TOP SECRET ATOMAL",
1871                NatoClassification::CosmicTopSecretAtomal,
1872            ),
1873            (
1874                "//COSMIC TOP SECRET-BOHEMIA",
1875                NatoClassification::CosmicTopSecretBohemia,
1876            ),
1877            (
1878                "//COSMIC TOP SECRET-BALK",
1879                NatoClassification::CosmicTopSecretBalk,
1880            ),
1881        ] {
1882            let parsed = parse_banner(input);
1883            assert_eq!(
1884                parsed.attrs.classification,
1885                Some(MarkingClassification::Nato(expected)),
1886                "failed for banner: {input}"
1887            );
1888        }
1889    }
1890
1891    #[test]
1892    fn nato_portion_parses_all_variants() {
1893        for (input, expected) in [
1894            ("(//NU)", NatoClassification::NatoUnclassified),
1895            ("(//NR)", NatoClassification::NatoRestricted),
1896            ("(//NC)", NatoClassification::NatoConfidential),
1897            ("(//NCA)", NatoClassification::NatoConfidentialAtomal),
1898            ("(//NC-A)", NatoClassification::NatoConfidentialAtomal),
1899            ("(//NS)", NatoClassification::NatoSecret),
1900            ("(//NSAT)", NatoClassification::NatoSecretAtomal),
1901            ("(//NS-A)", NatoClassification::NatoSecretAtomal),
1902            ("(//CTS)", NatoClassification::CosmicTopSecret),
1903            ("(//CTSA)", NatoClassification::CosmicTopSecretAtomal),
1904            ("(//CTS-A)", NatoClassification::CosmicTopSecretAtomal),
1905            ("(//CTS-B)", NatoClassification::CosmicTopSecretBohemia),
1906            ("(//CTS-BALK)", NatoClassification::CosmicTopSecretBalk),
1907        ] {
1908            let parsed = parse_portion(input);
1909            assert_eq!(
1910                parsed.attrs.classification,
1911                Some(MarkingClassification::Nato(expected)),
1912                "failed for portion: {input}"
1913            );
1914        }
1915    }
1916
1917    #[test]
1918    fn nato_banner_with_rel_to() {
1919        let parsed = parse_banner("//NATO SECRET//REL TO USA, GBR");
1920        assert_eq!(
1921            parsed.attrs.classification,
1922            Some(MarkingClassification::Nato(NatoClassification::NatoSecret)),
1923        );
1924        assert_eq!(parsed.attrs.rel_to.len(), 2);
1925        assert_eq!(parsed.attrs.rel_to[0], CountryCode::USA);
1926    }
1927
1928    #[test]
1929    fn joint_banner_parses_correctly() {
1930        let parsed = parse_banner("//JOINT S USA GBR");
1931        match &parsed.attrs.classification {
1932            Some(MarkingClassification::Joint(j)) => {
1933                assert_eq!(j.level, Classification::Secret);
1934                assert_eq!(j.countries.len(), 2);
1935                assert_eq!(j.countries[0], CountryCode::USA);
1936                assert_eq!(j.countries[1].as_str(), "GBR");
1937            }
1938            other => panic!("expected Joint, got: {other:?}"),
1939        }
1940    }
1941
1942    #[test]
1943    fn joint_banner_parses_top_secret_multi_word_level() {
        // The JOINT parser has a separate two-token path for the
        // multi-word `TOP SECRET` level (vs. the single-token `S` /
        // `TS` / `C` / `U` abbreviations). Exercises that two-token
        // branch of `parse_joint_classification`.
1948        let parsed = parse_banner("//JOINT TOP SECRET USA GBR");
1949        match &parsed.attrs.classification {
1950            Some(MarkingClassification::Joint(j)) => {
1951                assert_eq!(j.level, Classification::TopSecret);
1952                assert_eq!(j.countries.len(), 2);
1953                assert_eq!(j.countries[0], CountryCode::USA);
1954                assert_eq!(j.countries[1].as_str(), "GBR");
1955            }
1956            other => panic!("expected Joint(TopSecret), got: {other:?}"),
1957        }
1958    }
1959
1960    #[test]
1961    fn joint_banner_rejects_bare_top_without_secret() {
1962        // `TOP` alone is not a valid classification level — the
1963        // JOINT parser must return None and let the parent path
1964        // try other foreign-classification shapes. Exercises the
1965        // `else { return None; }` branch of the TOP-SECRET path.
1966        let parsed = parse_banner("//JOINT TOP USA GBR");
1967        assert!(
1968            !matches!(
1969                parsed.attrs.classification,
1970                Some(MarkingClassification::Joint(_))
1971            ),
1972            "bare TOP must not parse as a JOINT classification"
1973        );
1974    }
1975
1976    #[test]
1977    fn joint_portion_with_rel_to() {
1978        let parsed = parse_portion("(//JOINT TS USA AUS GBR//REL TO USA, AUS, GBR)");
1979        match &parsed.attrs.classification {
1980            Some(MarkingClassification::Joint(j)) => {
1981                assert_eq!(j.level, Classification::TopSecret);
1982                assert_eq!(j.countries.len(), 3);
1983            }
1984            other => panic!("expected Joint, got: {other:?}"),
1985        }
1986        assert_eq!(parsed.attrs.rel_to.len(), 3);
1987    }
1988
1989    #[test]
1990    fn fgi_single_country_parses() {
1991        let parsed = parse_portion("(//GBR S//NF)");
1992        match &parsed.attrs.classification {
1993            Some(MarkingClassification::Fgi(f)) => {
1994                assert_eq!(f.level, Classification::Secret);
1995                assert_eq!(f.countries.len(), 1);
1996                assert_eq!(f.countries[0].as_str(), "GBR");
1997            }
1998            other => panic!("expected Fgi, got: {other:?}"),
1999        }
2000    }
2001
2002    #[test]
2003    fn fgi_multiple_countries_parses() {
2004        let parsed = parse_banner("//GBR DEU TS//NF");
2005        match &parsed.attrs.classification {
2006            Some(MarkingClassification::Fgi(f)) => {
2007                assert_eq!(f.level, Classification::TopSecret);
2008                assert_eq!(f.countries.len(), 2);
2009            }
2010            other => panic!("expected Fgi, got: {other:?}"),
2011        }
2012    }
2013
2014    #[test]
2015    fn fgi_placeholder_country_parses() {
2016        // FGI as placeholder for unknown country + level
2017        let parsed = parse_portion("(//FGI S//NF)");
2018        match &parsed.attrs.classification {
2019            Some(MarkingClassification::Fgi(f)) => {
2020                assert_eq!(f.level, Classification::Secret);
2021                assert!(
2022                    f.countries.is_empty(),
2023                    "FGI placeholder should have no countries"
2024                );
2025            }
2026            other => panic!("expected Fgi, got: {other:?}"),
2027        }
2028    }
2029
2030    #[test]
2031    fn fgi_non_uppercase_trigraph_rejected() {
2032        // `CountryCode::try_new` accepts ASCII uppercase letter,
2033        // ASCII digit, or underscore (issue #183 widened the byte
2034        // set to cover `AX2`/`AX3` and `AUSTRALIA_GROUP`). A 3-byte
2035        // token containing a lowercase letter still fails that
2036        // check and trips the `CountryCode::try_new(...)?` rejection
2037        // path in `parse_fgi_classification`.
2038        let parsed = parse_banner("//Gbr S//NF");
2039        assert!(
2040            !matches!(
2041                parsed.attrs.classification,
2042                Some(MarkingClassification::Fgi(_))
2043            ),
2044            "Gbr should not parse as a valid FGI classification: {:?}",
2045            parsed.attrs.classification,
2046        );
2047    }
2048
2049    #[test]
2050    fn fgi_no_level_is_error() {
2051        // //FGI// with no classification level — classification should be None
2052        let parsed = parse_banner("//FGI//NF");
2053        assert!(
2054            parsed.attrs.classification.is_none()
2055                || matches!(
2056                    parsed.attrs.classification,
2057                    Some(MarkingClassification::Us(_))
2058                ),
2059            "bare FGI with no level should not produce a valid non-US classification: {:?}",
2060            parsed.attrs.classification,
2061        );
2062    }
2063
2064    #[test]
2065    fn fgi_marker_in_us_marking() {
2066        let parsed = parse_banner("SECRET//FGI DEU//NOFORN");
2067        assert_eq!(
2068            parsed.attrs.classification,
2069            Some(MarkingClassification::Us(Classification::Secret)),
2070        );
2071        let marker = parsed
2072            .attrs
2073            .fgi_marker
2074            .as_ref()
2075            .expect("should have FGI marker");
2076        assert_eq!(marker.countries.len(), 1);
2077        assert_eq!(marker.countries[0].as_str(), "DEU");
2078    }
2079
2080    #[test]
2081    fn fgi_marker_no_countries() {
2082        let parsed = parse_banner("SECRET//FGI//NOFORN");
2083        assert_eq!(
2084            parsed.attrs.classification,
2085            Some(MarkingClassification::Us(Classification::Secret)),
2086        );
2087        let marker = parsed
2088            .attrs
2089            .fgi_marker
2090            .as_ref()
2091            .expect("should have FGI marker");
2092        assert!(marker.countries.is_empty());
2093    }
2094
2095    #[test]
2096    fn conflict_us_and_nato() {
2097        let parsed = parse_banner("SECRET//NATO SECRET//NOFORN");
2098        match &parsed.attrs.classification {
2099            Some(MarkingClassification::Conflict { us, foreign }) => {
2100                assert_eq!(*us, Classification::Secret);
2101                assert!(matches!(
2102                    foreign.as_ref(),
2103                    ForeignClassification::Nato(NatoClassification::NatoSecret)
2104                ));
2105            }
2106            other => panic!("expected Conflict, got: {other:?}"),
2107        }
2108    }
2109
2110    #[test]
2111    fn conflict_level_escalation() {
2112        // SECRET + COSMIC TOP SECRET → US escalates to TopSecret
2113        let parsed = parse_banner("SECRET//COSMIC TOP SECRET//NOFORN");
2114        match &parsed.attrs.classification {
2115            Some(MarkingClassification::Conflict { us, foreign }) => {
2116                assert_eq!(*us, Classification::TopSecret);
2117                assert!(matches!(
2118                    foreign.as_ref(),
2119                    ForeignClassification::Nato(NatoClassification::CosmicTopSecret)
2120                ));
2121            }
2122            other => panic!("expected Conflict with escalation, got: {other:?}"),
2123        }
2124    }
2125
2126    #[test]
2127    fn restricted_classification_parses() {
2128        let parsed = parse_banner("RESTRICTED//NF");
2129        assert_eq!(
2130            parsed.attrs.classification,
2131            Some(MarkingClassification::Us(Classification::Restricted)),
2132        );
2133    }
2134
2135    #[test]
2136    fn restricted_portion_parses() {
2137        let parsed = parse_portion("(R//NF)");
2138        assert_eq!(
2139            parsed.attrs.classification,
2140            Some(MarkingClassification::Us(Classification::Restricted)),
2141        );
2142    }
2143
2144    // -----------------------------------------------------------------------
2145    // Non-IC dissemination controls
2146    // -----------------------------------------------------------------------
2147
2148    #[test]
2149    fn non_ic_dissem_limdis_banner_form() {
2150        let parsed = parse_banner("UNCLASSIFIED//LIMDIS");
2151        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
2152        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Limdis,);
2153    }
2154
2155    #[test]
2156    fn non_ic_dissem_ds_portion_form() {
2157        let parsed = parse_portion("(U//DS)");
2158        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
2159        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Limdis);
2160    }
2161
2162    #[test]
2163    fn non_ic_dissem_les_nf() {
2164        let parsed = parse_portion("(U//LES-NF)");
2165        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
2166        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::LesNf);
2167        assert!(parsed.attrs.non_ic_dissem[0].carries_noforn());
2168    }
2169
2170    #[test]
2171    fn non_ic_dissem_sbu_nf_banner() {
2172        let parsed = parse_banner("UNCLASSIFIED//SBU NOFORN");
2173        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
2174        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::SbuNf);
2175    }
2176
2177    #[test]
2178    fn non_ic_dissem_not_confused_with_ic_dissem() {
2179        // SSI should be non-IC, not IC.
2180        let parsed = parse_portion("(U//SSI)");
2181        assert!(parsed.attrs.dissem_controls.is_empty());
2182        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1);
2183        assert_eq!(parsed.attrs.non_ic_dissem[0], NonIcDissem::Ssi);
2184    }
2185
2186    #[test]
2187    fn non_ic_dissem_alongside_ic_dissem() {
2188        // Classified portion with both IC and non-IC dissem.
2189        let parsed = parse_portion("(C//NF//DS)");
2190        assert_eq!(parsed.attrs.dissem_controls.len(), 1); // NF
2191        assert_eq!(parsed.attrs.non_ic_dissem.len(), 1); // DS = LIMDIS
2192    }
2193
2194    // -----------------------------------------------------------------------
2195    // Atomic Energy Act markings
2196    // -----------------------------------------------------------------------
2197
2198    #[test]
2199    fn aea_rd_parses() {
2200        let parsed = parse_banner("TOP SECRET//RD//NOFORN");
2201        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2202        assert_eq!(
2203            parsed.attrs.aea_markings[0],
2204            AeaMarking::Rd(marque_ism::RdBlock::default()),
2205        );
2206    }
2207
2208    #[test]
2209    fn aea_rd_cnwdi_compound() {
2210        // CNWDI is a hyphen-modifier of RD, not a separate // block.
2211        let parsed = parse_banner("SECRET//RD-CNWDI//NOFORN");
2212        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2213        match &parsed.attrs.aea_markings[0] {
2214            AeaMarking::Rd(rd) => {
2215                assert!(rd.cnwdi);
2216                assert!(rd.sigma.is_empty());
2217            }
2218            other => panic!("expected Rd with CNWDI, got: {other:?}"),
2219        }
2220    }
2221
2222    #[test]
2223    fn aea_rd_sigma_compound() {
2224        // SIGMA is a hyphen-modifier: RD-SIGMA 20
2225        let parsed = parse_banner("SECRET//RD-SIGMA 20//NOFORN");
2226        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2227        match &parsed.attrs.aea_markings[0] {
2228            AeaMarking::Rd(rd) => {
2229                assert!(!rd.cnwdi);
2230                assert_eq!(&*rd.sigma, &[20]);
2231            }
2232            other => panic!("expected Rd with SIGMA, got: {other:?}"),
2233        }
2234    }
2235
2236    #[test]
2237    fn aea_rd_cnwdi_sigma_compound() {
2238        let parsed = parse_banner("SECRET//RD-CNWDI-SIGMA 18 20//NOFORN");
2239        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2240        match &parsed.attrs.aea_markings[0] {
2241            AeaMarking::Rd(rd) => {
2242                assert!(rd.cnwdi);
2243                assert_eq!(&*rd.sigma, &[18, 20]);
2244            }
2245            other => panic!("expected Rd with CNWDI+SIGMA, got: {other:?}"),
2246        }
2247    }
2248
2249    #[test]
2250    fn aea_rd_sigma_portion() {
2251        // Portion form uses SG instead of SIGMA.
2252        let parsed = parse_portion("(TS//RD-SG 14//NF)");
2253        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2254        match &parsed.attrs.aea_markings[0] {
2255            AeaMarking::Rd(rd) => {
2256                assert_eq!(&*rd.sigma, &[14]);
2257            }
2258            other => panic!("expected Rd with SG, got: {other:?}"),
2259        }
2260    }
2261
2262    #[test]
2263    fn aea_frd_parses() {
2264        let parsed = parse_portion("(S//FRD//NF)");
2265        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2266        assert_eq!(
2267            parsed.attrs.aea_markings[0],
2268            AeaMarking::Frd(marque_ism::FrdBlock::default()),
2269        );
2270    }
2271
2272    #[test]
2273    fn aea_frd_sigma_compound() {
2274        let parsed = parse_banner("SECRET//FRD-SIGMA 14//NOFORN");
2275        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2276        match &parsed.attrs.aea_markings[0] {
2277            AeaMarking::Frd(frd) => {
2278                assert_eq!(&*frd.sigma, &[14]);
2279            }
2280            other => panic!("expected Frd with SIGMA, got: {other:?}"),
2281        }
2282    }
2283
2284    #[test]
2285    fn aea_dod_ucni_parses() {
2286        let parsed = parse_banner("UNCLASSIFIED//DOD UCNI");
2287        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2288        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::DodUcni);
2289    }
2290
2291    #[test]
2292    fn aea_dcni_portion_parses() {
2293        let parsed = parse_portion("(U//DCNI)");
2294        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2295        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::DodUcni);
2296    }
2297
2298    #[test]
2299    fn aea_tfni_parses() {
2300        let parsed = parse_banner("SECRET//TFNI//NOFORN");
2301        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2302        assert_eq!(parsed.attrs.aea_markings[0], AeaMarking::Tfni);
2303    }
2304
2305    #[test]
2306    fn aea_rd_n_shorthand() {
2307        // DoD shorthand: RD-N means RD-CNWDI
2308        let parsed = parse_portion("(S//RD-N//NF)");
2309        assert_eq!(parsed.attrs.aea_markings.len(), 1);
2310        match &parsed.attrs.aea_markings[0] {
2311            AeaMarking::Rd(rd) => assert!(rd.cnwdi),
2312            other => panic!("expected Rd with CNWDI from RD-N, got: {other:?}"),
2313        }
2314    }
2315
2316    // --- CAPCO §D.1 intra-block `/` separator ---
2317
2318    #[test]
2319    fn slash_separated_sci_in_single_block_parses() {
2320        // CAPCO §D.1: multiple SCI controls in one block, `/`-separated.
2321        // "(TS//SI/TK//NF)" must produce sci_controls: [Si, Tk], NOT Unknown.
2322        use marque_ism::SciControl;
2323        let parsed = parse_portion("(TS//SI/TK//NF)");
2324        assert_eq!(
2325            parsed.attrs.sci_controls.as_ref(),
2326            &[SciControl::Si, SciControl::Tk],
2327            "SI/TK block must yield two SCI controls"
2328        );
2329        // No Unknown token spans
2330        assert!(
2331            parsed
2332                .attrs
2333                .token_spans
2334                .iter()
2335                .all(|t| t.kind != TokenKind::Unknown),
2336            "no Unknown spans expected: {:?}",
2337            parsed.attrs.token_spans
2338        );
2339    }
2340
2341    #[test]
2342    fn slash_separated_sci_banner_parses() {
2343        // Same rule applies to banner markings.
2344        use marque_ism::SciControl;
2345        let parsed = parse_banner("TOP SECRET//SI/TK//NOFORN");
2346        assert_eq!(
2347            parsed.attrs.sci_controls.as_ref(),
2348            &[SciControl::Si, SciControl::Tk],
2349        );
2350    }
2351
2352    #[test]
2353    fn slash_separated_dissem_in_single_block_parses() {
2354        // Dissem controls can also share a block: "NF/RD" in one // block.
2355        use marque_ism::DissemControl;
2356        let parsed = parse_banner("SECRET//SI//NF/RELIDO");
2357        let dissem: Vec<DissemControl> = parsed.attrs.dissem_controls.to_vec();
2358        assert!(dissem.contains(&DissemControl::Nf), "must contain NF");
2359        assert!(
2360            dissem.contains(&DissemControl::Relido),
2361            "must contain RELIDO"
2362        );
2363    }
2364
2365    #[test]
2366    fn unrecognized_slash_token_emits_unknown() {
2367        // An unknown token like "XYZZY" in a slash block → Unknown span.
2368        let parsed = parse_portion("(S//XYZZY)");
2369        assert!(
2370            parsed
2371                .attrs
2372                .token_spans
2373                .iter()
2374                .any(|t| t.kind == TokenKind::Unknown),
2375            "XYZZY must produce Unknown span"
2376        );
2377    }
2378
2379    // -----------------------------------------------------------------------
2380    // SCI structural subparser (spec 003-sci-compartments §R2 / P2)
2381    // -----------------------------------------------------------------------
2382
2383    #[test]
2384    fn sci_bare_single_still_parses_via_structural_path() {
2385        // Regression: `(U//SI//NF)` existing happy path. Structural parser
2386        // claims `SI` (bare CVE) and projects to sci_controls for
2387        // back-compat with E010/E011.
2388        use marque_ism::{SciControl, SciControlBare, SciControlSystem};
2389        let parsed = parse_portion("(U//SI//NF)");
2390        assert_eq!(parsed.attrs.sci_controls.as_ref(), &[SciControl::Si]);
2391        assert_eq!(parsed.attrs.sci_markings.len(), 1);
2392        let m = &parsed.attrs.sci_markings[0];
2393        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Si));
2394        assert!(m.compartments.is_empty());
2395        assert_eq!(m.canonical_enum, Some(SciControl::Si));
2396    }
2397
2398    #[test]
2399    fn sci_published_compound_si_g_parses() {
2400        // `SI-G` is a pre-registered CVE composite; canonical_enum must be Some(SiG).
2401        use marque_ism::{SciControl, SciControlBare, SciControlSystem};
2402        let parsed = parse_banner("SECRET//SI-G//NOFORN");
2403        let m = &parsed.attrs.sci_markings[0];
2404        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Si));
2405        assert_eq!(m.compartments.len(), 1);
2406        assert_eq!(m.compartments[0].identifier.as_ref(), "G");
2407        assert!(m.compartments[0].sub_compartments.is_empty());
2408        assert_eq!(m.canonical_enum, Some(SciControl::SiG));
2409        assert_eq!(parsed.attrs.sci_controls.as_ref(), &[SciControl::SiG]);
2410    }
2411
2412    #[test]
2413    fn sci_published_compound_hcs_p_parses() {
2414        use marque_ism::{SciControl, SciControlBare, SciControlSystem};
2415        let parsed = parse_banner("TOP SECRET//HCS-P//NOFORN");
2416        let m = &parsed.attrs.sci_markings[0];
2417        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Hcs));
2418        assert_eq!(m.compartments[0].identifier.as_ref(), "P");
2419        assert_eq!(m.canonical_enum, Some(SciControl::HcsP));
2420    }
2421
2422    #[test]
2423    fn sci_bare_tk_parses() {
2424        use marque_ism::{SciControl, SciControlBare, SciControlSystem};
2425        let parsed = parse_banner("SECRET//TK//NOFORN");
2426        let m = &parsed.attrs.sci_markings[0];
2427        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Tk));
2428        assert!(m.compartments.is_empty());
2429        assert_eq!(m.canonical_enum, Some(SciControl::Tk));
2430    }
2431
2432    #[test]
2433    fn sci_multi_system_si_tk_parses() {
2434        // `SI/TK` — two bare systems in one SCI block. Existing behavior.
2435        use marque_ism::SciControl;
2436        let parsed = parse_portion("(TS//SI/TK//NF)");
2437        assert_eq!(
2438            parsed.attrs.sci_controls.as_ref(),
2439            &[SciControl::Si, SciControl::Tk]
2440        );
2441        assert_eq!(parsed.attrs.sci_markings.len(), 2);
2442    }
2443
2444    #[test]
2445    fn sci_compound_with_sub_compartment_sets_canonical_none() {
2446        // `SI-G ABCD`: published system SI with compartment G and sub-comp
2447        // ABCD. Because the first compartment has sub-comps, canonical_enum
2448        // is None (the compound is a structural anchor, not an atomic CVE).
2449        use marque_ism::{SciControlBare, SciControlSystem};
2450        let parsed = parse_banner("SECRET//SI-G ABCD//NOFORN");
2451        assert_eq!(parsed.attrs.sci_markings.len(), 1);
2452        let m = &parsed.attrs.sci_markings[0];
2453        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Si));
2454        assert_eq!(m.compartments.len(), 1);
2455        assert_eq!(m.compartments[0].identifier.as_ref(), "G");
2456        assert_eq!(m.compartments[0].sub_compartments.len(), 1);
2457        assert_eq!(m.compartments[0].sub_compartments[0].as_ref(), "ABCD");
2458        assert_eq!(m.canonical_enum, None);
2459        // sci_controls projection: no canonical_enum → no entry
2460        assert!(parsed.attrs.sci_controls.is_empty());
2461    }
2462
2463    #[test]
2464    fn sci_capco_canonical_example_parses() {
2465        // CAPCO-2016 §A.6 p16 canonical example:
2466        //   TOP SECRET//123/SI-G ABCD DEFG-MMM AACD//ORCON/NOFORN
2467        use marque_ism::{SciControlBare, SciControlSystem};
2468        let parsed = parse_banner("TOP SECRET//123/SI-G ABCD DEFG-MMM AACD//ORCON/NOFORN");
2469        assert_eq!(parsed.attrs.sci_markings.len(), 2);
2470        // Marking 0: Custom("123"), no compartments.
2471        let m0 = &parsed.attrs.sci_markings[0];
2472        assert!(matches!(&m0.system, SciControlSystem::Custom(s) if s.as_ref() == "123"));
2473        assert!(m0.compartments.is_empty());
2474        assert_eq!(m0.canonical_enum, None);
2475        // Marking 1: Published(SI) with compartments G[ABCD, DEFG] and MMM[AACD].
2476        let m1 = &parsed.attrs.sci_markings[1];
2477        assert_eq!(m1.system, SciControlSystem::Published(SciControlBare::Si));
2478        assert_eq!(m1.compartments.len(), 2);
2479        assert_eq!(m1.compartments[0].identifier.as_ref(), "G");
2480        assert_eq!(m1.compartments[0].sub_compartments.len(), 2);
2481        assert_eq!(m1.compartments[0].sub_compartments[0].as_ref(), "ABCD");
2482        assert_eq!(m1.compartments[0].sub_compartments[1].as_ref(), "DEFG");
2483        assert_eq!(m1.compartments[1].identifier.as_ref(), "MMM");
2484        assert_eq!(m1.compartments[1].sub_compartments.len(), 1);
2485        assert_eq!(m1.compartments[1].sub_compartments[0].as_ref(), "AACD");
2486        // First compartment has sub-comps → canonical_enum is None.
2487        assert_eq!(m1.canonical_enum, None);
2488        // No Unknown spans in the SCI block.
2489        let sci_block_has_unknown = parsed
2490            .attrs
2491            .token_spans
2492            .iter()
2493            .any(|t| t.kind == TokenKind::Unknown);
2494        assert!(
2495            !sci_block_has_unknown,
2496            "canonical example must not produce Unknown tokens; got: {:?}",
2497            parsed.attrs.token_spans
2498        );
2499    }
2500
2501    #[test]
2502    fn sci_custom_numeric_99_direct_parse() {
2503        // Direct unit test of parse_sci_block: `99` → Custom("99").
2504        // In dispatch, `99` alone wouldn't pass the containment gate; this
2505        // exercises the parser's custom-only happy path.
2506        use marque_ism::SciControlSystem;
2507        let mut tokens = Vec::new();
2508        let result = parse_sci_block("99", 0, &mut tokens).expect("99 must parse");
2509        assert_eq!(result.len(), 1);
2510        assert!(matches!(&result[0].system, SciControlSystem::Custom(s) if s.as_ref() == "99"));
2511        assert!(result[0].compartments.is_empty());
2512        assert_eq!(result[0].canonical_enum, None);
2513    }
2514
2515    #[test]
2516    fn sci_structural_rejections_return_none() {
2517        // Dangling hyphen.
2518        let mut tokens = Vec::new();
2519        assert!(parse_sci_block("SI-", 0, &mut tokens).is_none());
2520        // Leading hyphen.
2521        let mut tokens = Vec::new();
2522        assert!(parse_sci_block("-SI", 0, &mut tokens).is_none());
2523        // Empty.
2524        let mut tokens = Vec::new();
2525        assert!(parse_sci_block("", 0, &mut tokens).is_none());
2526        // Lowercase.
2527        let mut tokens = Vec::new();
2528        assert!(parse_sci_block("si-g", 0, &mut tokens).is_none());
2529        // Consecutive hyphens.
2530        let mut tokens = Vec::new();
2531        assert!(parse_sci_block("SI--G", 0, &mut tokens).is_none());
2532        // Empty slash chunk.
2533        let mut tokens = Vec::new();
2534        assert!(parse_sci_block("SI/", 0, &mut tokens).is_none());
2535    }
2536
2537    #[test]
2538    fn sci_mixed_category_slash_block_falls_through() {
2539        // `SI/NF` has `/` and gate passes, but parse_sci_block must reject
2540        // because NF is a known dissem control — otherwise E004's
2541        // stray-slash detection would stop working.
2542        let parsed = parse_banner("SECRET//SI/NF");
2543        // The SI/NF block should NOT be claimed by structural SCI; it must
2544        // fall through to the existing intra-block `/` splitter which in
2545        // turn flags the mixed-category slash as Unknown.
2546        let has_unknown_block = parsed
2547            .attrs
2548            .token_spans
2549            .iter()
2550            .any(|t| t.kind == TokenKind::Unknown);
2551        assert!(
2552            has_unknown_block,
2553            "SI/NF must surface as Unknown for E004; got: {:?}",
2554            parsed.attrs.token_spans
2555        );
2556    }
2557
2558    #[test]
2559    fn sci_weird_sub_compartment_parses() {
2560        // `SI-G WEIRD FOO` — WEIRD and FOO both match [A-Z0-9]+ so the
2561        // grammar treats them as sub-compartments of G.
2562        use marque_ism::{SciControlBare, SciControlSystem};
2563        let parsed = parse_banner("SECRET//SI-G WEIRD FOO//NOFORN");
2564        let m = &parsed.attrs.sci_markings[0];
2565        assert_eq!(m.system, SciControlSystem::Published(SciControlBare::Si));
2566        assert_eq!(m.compartments.len(), 1);
2567        assert_eq!(m.compartments[0].identifier.as_ref(), "G");
2568        assert_eq!(m.compartments[0].sub_compartments.len(), 2);
2569        assert_eq!(m.compartments[0].sub_compartments[0].as_ref(), "WEIRD");
2570        assert_eq!(m.compartments[0].sub_compartments[1].as_ref(), "FOO");
2571    }
2572
2573    // -----------------------------------------------------------------------
2574    // CAB date parsing (parse_cab Declassify On: path)
2575    // -----------------------------------------------------------------------
2576
2577    fn parse_cab_text(text: &str) -> ParsedMarking {
2578        let source = text.as_bytes();
2579        let tokens = CapcoTokenSet;
2580        let parser = Parser::new(&tokens);
2581        let candidate = make_candidate(source, MarkingType::Cab, 0);
2582        parser
2583            .parse(&candidate, source)
2584            .expect("CAB parse should succeed")
2585    }
2586
2587    #[test]
2588    fn cab_declassify_on_yyyymmdd_populates_declassify_on() {
2589        let text = "Classified By: Jane Doe\nDeclassify On: 20301231";
2590        let parsed = parse_cab_text(text);
2591        assert_eq!(
2592            parsed.attrs.declassify_on,
2593            Some(marque_ism::IsmDate::Date(2030, 12, 31)),
2594            "YYYYMMDD in CAB should set declassify_on to Date"
2595        );
2596        assert!(parsed.attrs.declass_exemption.is_none());
2597    }
2598
2599    #[test]
2600    fn cab_declassify_on_yyyy_populates_declassify_on() {
2601        let text = "Declassify On: 2035";
2602        let parsed = parse_cab_text(text);
2603        assert_eq!(
2604            parsed.attrs.declassify_on,
2605            Some(marque_ism::IsmDate::Year(2035)),
2606            "YYYY in CAB should set declassify_on to Year"
2607        );
2608    }
2609
2610    #[test]
2611    fn cab_declassify_on_iso_date_populates_declassify_on() {
2612        // ISO hyphenated YYYY-MM-DD form is valid for the CAB "Declassify On:" line.
2613        let text = "Declassify On: 2030-12-31";
2614        let parsed = parse_cab_text(text);
2615        assert_eq!(
2616            parsed.attrs.declassify_on,
2617            Some(marque_ism::IsmDate::Date(2030, 12, 31)),
2618            "YYYY-MM-DD in CAB should set declassify_on to Date"
2619        );
2620    }
2621
2622    #[test]
2623    fn cab_declassify_on_exemption_sets_exemption_not_date() {
2624        // A declassification exemption code must not be stored in declassify_on.
2625        let text = "Declassify On: 50X1-HUM";
2626        let parsed = parse_cab_text(text);
2627        assert!(
2628            parsed.attrs.declassify_on.is_none(),
2629            "exemption code must not set declassify_on"
2630        );
2631        assert!(
2632            parsed.attrs.declass_exemption.is_some(),
2633            "exemption code must set declass_exemption"
2634        );
2635    }
2636
2637    #[test]
2638    fn cab_declassify_on_invalid_date_silently_ignored() {
2639        // Unrecognized strings are silently dropped — no panic, declassify_on stays None.
2640        let text = "Declassify On: UNRECOGNIZED";
2641        let parsed = parse_cab_text(text);
2642        assert!(
2643            parsed.attrs.declassify_on.is_none(),
2644            "unrecognized Declassify On value should leave declassify_on as None"
2645        );
2646        assert!(parsed.attrs.declass_exemption.is_none());
2647    }
2648
2649    #[test]
2650    fn cab_classified_by_and_derived_from_populated() {
2651        let text = "Classified By: Jane Doe\nDerived From: SCG-2024\nDeclassify On: 20301231";
2652        let parsed = parse_cab_text(text);
2653        assert_eq!(
2654            parsed.attrs.classified_by.as_deref(),
2655            Some("Jane Doe"),
2656            "classified_by should be populated"
2657        );
2658        assert_eq!(
2659            parsed.attrs.derived_from.as_deref(),
2660            Some("SCG-2024"),
2661            "derived_from should be populated"
2662        );
2663        assert_eq!(
2664            parsed.attrs.declassify_on,
2665            Some(marque_ism::IsmDate::Date(2030, 12, 31))
2666        );
2667    }
2668
2669    #[test]
2670    fn cab_without_declassify_on_leaves_both_none() {
2671        let text = "Classified By: Jane Doe\nDerived From: SCG-2024";
2672        let parsed = parse_cab_text(text);
2673        assert!(parsed.attrs.declassify_on.is_none());
2674        assert!(parsed.attrs.declass_exemption.is_none());
2675    }
2676
2677    // -----------------------------------------------------------------------
2678    // Portion declass date (is_declass_date path in parse_marking_string)
2679    // -----------------------------------------------------------------------
2680
2681    #[test]
2682    fn portion_with_yyyymmdd_sets_declassify_on() {
2683        // A portion that (erroneously) contains an inline declass date; the
2684        // parser must populate declassify_on so E005 can fire.
2685        let parsed = parse_portion("(SECRET//20301231//NOFORN)");
2686        assert_eq!(
2687            parsed.attrs.declassify_on,
2688            Some(marque_ism::IsmDate::Date(2030, 12, 31)),
2689            "YYYYMMDD in portion should set declassify_on"
2690        );
2691    }
2692
2693    #[test]
2694    fn portion_with_yyyy_sets_declassify_on() {
2695        let parsed = parse_portion("(SECRET//2035)");
2696        assert_eq!(
2697            parsed.attrs.declassify_on,
2698            Some(marque_ism::IsmDate::Year(2035)),
2699            "YYYY in portion should set declassify_on"
2700        );
2701    }
2702
2703    #[test]
2704    fn is_declass_date_rejects_leap_day_non_leap_year() {
2705        // 2003 is not a leap year; Feb 29 is impossible.
2706        assert!(!is_declass_date("20030229"));
2707    }
2708
2709    #[test]
2710    fn is_declass_date_accepts_leap_day_in_leap_year() {
2711        assert!(is_declass_date("20040229")); // 2004 is a leap year
2712        assert!(is_declass_date("20000229")); // 2000 is a leap year
2713    }
2714
2715    #[test]
2716    fn is_declass_date_rejects_day_zero() {
2717        assert!(!is_declass_date("20030100")); // day 0 is impossible
2718    }
2719}
2720
2721#[cfg(test)]
2722#[cfg_attr(coverage_nightly, coverage(off))]
2723mod sar_parse_tests {
2724    //! Direct unit tests for [`parse_sar_category`] plus integration-level
2725    //! tests that exercise the dispatch from `parse_marking_string`.
2726
2727    use super::*;
2728    use marque_ism::span::{MarkingCandidate, MarkingType, Span};
2729    use marque_ism::token_set::CapcoTokenSet;
2730
2731    // ---------------------------------------------------------------------
2732    // Direct subparser tests
2733    // ---------------------------------------------------------------------
2734
2735    #[test]
2736    fn single_program_no_compartments() {
2737        let (marking, spans) = parse_sar_category("SAR-BP", 0).expect("grammar accepts SAR-BP");
2738        assert_eq!(marking.indicator, SarIndicator::Abbrev);
2739        assert_eq!(marking.programs.len(), 1);
2740        assert_eq!(&*marking.programs[0].identifier, "BP");
2741        assert_eq!(marking.programs[0].compartments.len(), 0);
2742        // Spans: one indicator + one program.
2743        assert_eq!(
2744            spans
2745                .iter()
2746                .filter(|s| s.kind == TokenKind::SarIndicator)
2747                .count(),
2748            1
2749        );
2750        assert_eq!(
2751            spans
2752                .iter()
2753                .filter(|s| s.kind == TokenKind::SarProgram)
2754                .count(),
2755            1
2756        );
2757    }
2758
2759    #[test]
2760    fn three_programs_no_compartments() {
2761        let (marking, _) =
2762            parse_sar_category("SAR-BP/CD/XR", 0).expect("grammar accepts three programs");
2763        assert_eq!(marking.programs.len(), 3);
2764        let ids: Vec<&str> = marking.programs.iter().map(|p| &*p.identifier).collect();
2765        assert_eq!(ids, vec!["BP", "CD", "XR"]);
2766        for p in marking.programs.iter() {
2767            assert_eq!(p.compartments.len(), 0);
2768        }
2769    }
2770
2771    #[test]
2772    fn program_with_single_compartment() {
2773        let (marking, _) = parse_sar_category("SAR-BP-J12", 0).expect("grammar accepts");
2774        assert_eq!(marking.programs.len(), 1);
2775        let p = &marking.programs[0];
2776        assert_eq!(&*p.identifier, "BP");
2777        assert_eq!(p.compartments.len(), 1);
2778        assert_eq!(&*p.compartments[0].identifier, "J12");
2779        assert_eq!(p.compartments[0].sub_compartments.len(), 0);
2780    }
2781
2782    #[test]
2783    fn program_with_compartment_and_sub_compartment() {
2784        let (marking, _) = parse_sar_category("SAR-BP-J12 J54", 0).expect("grammar accepts");
2785        let p = &marking.programs[0];
2786        assert_eq!(p.compartments.len(), 1);
2787        let c = &p.compartments[0];
2788        assert_eq!(&*c.identifier, "J12");
2789        assert_eq!(c.sub_compartments.len(), 1);
2790        assert_eq!(&*c.sub_compartments[0], "J54");
2791    }
2792
2793    #[test]
2794    fn canonical_h5_p100_multi_program_example() {
2795        // The §H.5 p100 canonical decomposition:
2796        //   BP → [J12 (+ J54), K15]
2797        //   CD → [YYY (+ 456, 689)]
2798        //   XR → [XRA (+ RB)]
2799        let block = "SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB";
2800        let (marking, spans) = parse_sar_category(block, 0).expect("grammar accepts");
2801
2802        assert_eq!(marking.indicator, SarIndicator::Abbrev);
2803        assert_eq!(marking.programs.len(), 3);
2804
2805        // BP
2806        let bp = &marking.programs[0];
2807        assert_eq!(&*bp.identifier, "BP");
2808        assert_eq!(bp.compartments.len(), 2);
2809        assert_eq!(&*bp.compartments[0].identifier, "J12");
2810        assert_eq!(
2811            bp.compartments[0]
2812                .sub_compartments
2813                .iter()
2814                .map(|s| &**s)
2815                .collect::<Vec<_>>(),
2816            vec!["J54"]
2817        );
2818        assert_eq!(&*bp.compartments[1].identifier, "K15");
2819        assert_eq!(bp.compartments[1].sub_compartments.len(), 0);
2820
2821        // CD
2822        let cd = &marking.programs[1];
2823        assert_eq!(&*cd.identifier, "CD");
2824        assert_eq!(cd.compartments.len(), 1);
2825        assert_eq!(&*cd.compartments[0].identifier, "YYY");
2826        assert_eq!(
2827            cd.compartments[0]
2828                .sub_compartments
2829                .iter()
2830                .map(|s| &**s)
2831                .collect::<Vec<_>>(),
2832            vec!["456", "689"]
2833        );
2834
2835        // XR
2836        let xr = &marking.programs[2];
2837        assert_eq!(&*xr.identifier, "XR");
2838        assert_eq!(xr.compartments.len(), 1);
2839        assert_eq!(&*xr.compartments[0].identifier, "XRA");
2840        assert_eq!(
2841            xr.compartments[0]
2842                .sub_compartments
2843                .iter()
2844                .map(|s| &**s)
2845                .collect::<Vec<_>>(),
2846            vec!["RB"]
2847        );
2848
2849        // Spot-check span offsets: the indicator is at [0, 4) and the first
2850        // program "BP" is at [4, 6).
2851        let indicator = spans
2852            .iter()
2853            .find(|s| s.kind == TokenKind::SarIndicator)
2854            .unwrap();
2855        assert_eq!(indicator.span, Span::new(0, 4));
2856        assert_eq!(&*indicator.text, "SAR-");
2857        let first_prog = spans
2858            .iter()
2859            .find(|s| s.kind == TokenKind::SarProgram)
2860            .unwrap();
2861        assert_eq!(first_prog.span, Span::new(4, 6));
2862        assert_eq!(&*first_prog.text, "BP");
2863    }
2864
2865    #[test]
2866    fn full_form_single_program_with_space() {
2867        // `SPECIAL ACCESS REQUIRED-BUTTER POPCORN` — full form allows spaces
2868        // inside the nickname. No compartment decomposition at the lexical
2869        // level (see spec §R2 ambiguity note).
2870        let (marking, spans) =
2871            parse_sar_category("SPECIAL ACCESS REQUIRED-BUTTER POPCORN", 0).unwrap();
2872        assert_eq!(marking.indicator, SarIndicator::Full);
2873        assert_eq!(marking.programs.len(), 1);
2874        assert_eq!(&*marking.programs[0].identifier, "BUTTER POPCORN");
2875        assert_eq!(marking.programs[0].compartments.len(), 0);
2876
2877        // Indicator span is 24 bytes: `SPECIAL ACCESS REQUIRED-`.
2878        let indicator = spans
2879            .iter()
2880            .find(|s| s.kind == TokenKind::SarIndicator)
2881            .unwrap();
2882        assert_eq!(&*indicator.text, "SPECIAL ACCESS REQUIRED-");
2883        assert_eq!(indicator.span, Span::new(0, 24));
2884    }
2885
2886    #[test]
2887    fn full_form_with_compartment_and_sub() {
2888        // The grammar permits compartments under a full-form program
2889        // identically to the abbreviated form. Program nickname may
2890        // contain spaces; compartments and sub-compartments are still
2891        // alphanumeric without spaces.
2892        let (marking, _spans) =
2893            parse_sar_category("SPECIAL ACCESS REQUIRED-BUTTER POPCORN-J12 J54", 0)
2894                .expect("grammar accepts full form with compartment");
2895        assert_eq!(marking.indicator, SarIndicator::Full);
2896        assert_eq!(marking.programs.len(), 1);
2897        let prog = &marking.programs[0];
2898        assert_eq!(&*prog.identifier, "BUTTER POPCORN");
2899        assert_eq!(prog.compartments.len(), 1);
2900        assert_eq!(&*prog.compartments[0].identifier, "J12");
2901        assert_eq!(prog.compartments[0].sub_compartments.len(), 1);
2902        assert_eq!(&*prog.compartments[0].sub_compartments[0], "J54");
2903    }
2904
2905    #[test]
2906    fn full_form_rejects_digits_or_hyphens_in_nickname() {
2907        // Full-form nickname may only contain uppercase letters and
2908        // spaces; digits or hyphens inside the nickname are parsed as
2909        // compartment boundaries (hyphen) or as a shape violation
2910        // (digits).
2911        assert!(parse_sar_category("SPECIAL ACCESS REQUIRED-123", 0).is_none());
2912    }
2913
2914    #[test]
2915    fn rejects_double_slash_inside_block() {
2916        // Defensive: the outer category-block splitter wouldn't hand us
2917        // `SAR-BP//CD` (it splits on `//` first). But if it somehow did,
2918        // `parse_sar_category` refuses because `//` is a category separator
2919        // that should never appear inside a single block. The caller
2920        // records the text as Unknown so E030 can flag the repeat form.
2921        assert!(parse_sar_category("SAR-BP//CD", 0).is_none());
2922    }
2923
2924    #[test]
2925    fn rejects_missing_hyphen() {
2926        assert!(parse_sar_category("SAR", 0).is_none());
2927    }
2928
2929    #[test]
2930    fn rejects_empty_program() {
2931        assert!(parse_sar_category("SAR-", 0).is_none());
2932    }
2933
2934    #[test]
2935    fn rejects_empty_string() {
2936        assert!(parse_sar_category("", 0).is_none());
2937    }
2938
2939    #[test]
2940    fn rejects_non_sar_prefix() {
2941        assert!(parse_sar_category("NOFORN", 0).is_none());
2942        assert!(parse_sar_category("SI", 0).is_none());
2943    }
2944
2945    #[test]
2946    fn rejects_program_id_out_of_2_3_length() {
2947        // Single-char program id.
2948        assert!(parse_sar_category("SAR-B", 0).is_none());
2949        // Four-char program id.
2950        assert!(parse_sar_category("SAR-BPCD", 0).is_none());
2951    }
2952
2953    // ---------------------------------------------------------------------
2954    // Dispatch tests (through `parse_marking_string`)
2955    // ---------------------------------------------------------------------
2956
2957    fn make_banner(text: &str) -> ParsedMarking {
2958        let source = text.as_bytes();
2959        let tokens = CapcoTokenSet;
2960        let parser = Parser::new(&tokens);
2961        let candidate = MarkingCandidate {
2962            span: Span::new(0, source.len()),
2963            kind: MarkingType::Banner,
2964        };
2965        parser.parse(&candidate, source).expect("parse succeeds")
2966    }
2967
2968    #[test]
2969    fn banner_dispatch_populates_sar_markings() {
2970        let parsed = make_banner("TOP SECRET//SAR-BP//NOFORN");
2971        let sar = parsed
2972            .attrs
2973            .sar_markings
2974            .as_ref()
2975            .expect("SAR block must populate sar_markings");
2976        assert_eq!(sar.programs.len(), 1);
2977        assert_eq!(&*sar.programs[0].identifier, "BP");
2978
2979        // Token-span mix must include both the indicator and program token.
2980        let kinds: Vec<TokenKind> = parsed.attrs.token_spans.iter().map(|t| t.kind).collect();
2981        assert!(kinds.contains(&TokenKind::SarIndicator));
2982        assert!(kinds.contains(&TokenKind::SarProgram));
2983
2984        // Dissem accumulator still populated: NOFORN is present.
2985        assert!(
2986            parsed
2987                .attrs
2988                .dissem_controls
2989                .contains(&marque_ism::DissemControl::Nf),
2990            "NOFORN must still be recognized after the SAR block"
2991        );
2992    }
2993
2994    #[test]
2995    fn banner_dispatch_multi_program_canonical() {
2996        // The §H.5 p100 canonical line as a full banner.
2997        let parsed = make_banner("SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN");
2998        let sar = parsed.attrs.sar_markings.as_ref().expect("sar present");
2999        assert_eq!(sar.programs.len(), 3);
3000        let ids: Vec<&str> = sar.programs.iter().map(|p| &*p.identifier).collect();
3001        assert_eq!(ids, vec!["BP", "CD", "XR"]);
3002
3003        // Token-span offsets are absolute into the banner string. Find the
3004        // SarIndicator and verify its byte slice.
3005        let src = parsed
3006            .attrs
3007            .token_spans
3008            .iter()
3009            .find(|t| t.kind == TokenKind::SarIndicator)
3010            .expect("SarIndicator span present");
3011        assert_eq!(&*src.text, "SAR-");
3012        // `SECRET//` is 8 bytes, so `SAR-` starts at offset 8.
3013        assert_eq!(src.span, Span::new(8, 12));
3014    }
3015
3016    #[test]
3017    fn second_sar_block_becomes_unknown() {
3018        // Two SAR category blocks: the first populates `sar_markings`; the
3019        // second is left as `Unknown` so rule E030 can flag the repeat.
3020        let parsed = make_banner("SECRET//SAR-BP//SAR-CD//NOFORN");
3021        let sar = parsed
3022            .attrs
3023            .sar_markings
3024            .as_ref()
3025            .expect("first SAR block populates sar_markings");
3026        assert_eq!(sar.programs.len(), 1);
3027        assert_eq!(&*sar.programs[0].identifier, "BP");
3028
3029        // The `SAR-CD` block must appear as an Unknown span.
3030        let unknown_texts: Vec<&str> = parsed
3031            .attrs
3032            .token_spans
3033            .iter()
3034            .filter(|t| t.kind == TokenKind::Unknown)
3035            .map(|t| &*t.text)
3036            .collect();
3037        assert!(
3038            unknown_texts.contains(&"SAR-CD"),
3039            "duplicate SAR block must be recorded as Unknown, got: {unknown_texts:?}",
3040        );
3041    }
3042}
marque_core/parser.rs

marque_core/
parser.rs