1use crate::error::CoreError;
19use marque_ism::attrs::{
20 Classification, DeclassExemption, DissemControl, IsmAttributes, SarIdentifier, SciControl,
21 TokenKind, TokenSpan, Trigraph,
22};
23use marque_ism::span::{MarkingCandidate, MarkingType, Span};
26use marque_ism::token_set::TokenSet;
27
28#[derive(Debug)]
30pub struct ParsedMarking {
31 pub attrs: IsmAttributes,
32 pub source_span: Span,
33 pub kind: MarkingType,
34}
35
36pub struct Parser<'t> {
38 tokens: &'t dyn TokenSet,
39}
40
41impl<'t> Parser<'t> {
42 pub fn new(tokens: &'t dyn TokenSet) -> Self {
43 Self { tokens }
44 }
45
46 pub fn parse(
48 &self,
49 candidate: &MarkingCandidate,
50 source: &[u8],
51 ) -> Result<ParsedMarking, CoreError> {
52 let text = candidate
53 .span
54 .as_str(source)
55 .map_err(|_| CoreError::InvalidUtf8(candidate.span))?;
56 match candidate.kind {
57 MarkingType::Portion => self.parse_portion(text, candidate),
58 MarkingType::Banner => self.parse_banner(text, candidate),
59 MarkingType::Cab => self.parse_cab(text, candidate),
60 MarkingType::PageBreak => Err(CoreError::MalformedMarking(
65 "page-break candidate must not be parsed".to_owned(),
66 )),
67 }
68 }
69
70 fn parse_portion(
71 &self,
72 text: &str,
73 candidate: &MarkingCandidate,
74 ) -> Result<ParsedMarking, CoreError> {
75 let inner = text
79 .strip_prefix('(')
80 .and_then(|s| s.strip_suffix(')'))
81 .ok_or_else(|| CoreError::MalformedMarking(text.to_owned()))?;
82
83 let attrs =
84 self.parse_marking_string(inner, MarkingType::Portion, candidate.span.start + 1)?;
85 Ok(ParsedMarking {
86 attrs,
87 source_span: candidate.span,
88 kind: MarkingType::Portion,
89 })
90 }
91
92 fn parse_banner(
93 &self,
94 text: &str,
95 candidate: &MarkingCandidate,
96 ) -> Result<ParsedMarking, CoreError> {
97 let trimmed = text.trim_start();
102 let lead_ws = text.len() - trimmed.len();
103 let trimmed = trimmed.trim_end();
104 let attrs = self.parse_marking_string(
105 trimmed,
106 MarkingType::Banner,
107 candidate.span.start + lead_ws,
108 )?;
109 Ok(ParsedMarking {
110 attrs,
111 source_span: candidate.span,
112 kind: MarkingType::Banner,
113 })
114 }
115
116 fn parse_cab(
117 &self,
118 text: &str,
119 candidate: &MarkingCandidate,
120 ) -> Result<ParsedMarking, CoreError> {
121 let mut attrs = IsmAttributes::default();
123
124 for line in text.lines() {
125 if let Some(val) = line.strip_prefix("Classified By:") {
126 attrs.classified_by = Some(val.trim().into());
127 } else if let Some(val) = line.strip_prefix("Derived From:") {
128 attrs.derived_from = Some(val.trim().into());
129 } else if let Some(val) = line.strip_prefix("Declassify On:") {
130 let s = val.trim();
131 if let Some(exemption) = DeclassExemption::parse(s) {
132 attrs.declass_exemption = Some(exemption);
133 } else {
134 attrs.declassify_on = Some(s.into());
135 }
136 }
137 }
138
139 Ok(ParsedMarking {
140 attrs,
141 source_span: candidate.span,
142 kind: MarkingType::Cab,
143 })
144 }
145
146 fn parse_marking_string(
154 &self,
155 s: &str,
156 context: MarkingType,
157 s_offset: usize,
158 ) -> Result<IsmAttributes, CoreError> {
159 let mut attrs = IsmAttributes::default();
160
161 if s.is_empty() {
162 return Err(CoreError::MalformedMarking(s.to_owned()));
163 }
164
165 let separators: Vec<usize> = s.match_indices("//").map(|(i, _)| i).collect();
170 let mut block_ranges: Vec<(usize, usize)> = Vec::with_capacity(separators.len() + 1);
171 let mut prev_end = 0usize;
172 for &sep_start in &separators {
173 block_ranges.push((prev_end, sep_start));
174 prev_end = sep_start + 2; }
176 block_ranges.push((prev_end, s.len()));
177
178 let mut token_spans: Vec<TokenSpan> = Vec::new();
179
180 let mut sci: Vec<SciControl> = Vec::new();
182 let mut sar: Vec<SarIdentifier> = Vec::new();
183 let mut dissem: Vec<DissemControl> = Vec::new();
184 let mut rel_to: Vec<Trigraph> = Vec::new();
185
186 for (idx, &(rel_start, rel_end)) in block_ranges.iter().enumerate() {
187 let raw = &s[rel_start..rel_end];
188 let trimmed = raw.trim();
191 if trimmed.is_empty() {
192 continue;
193 }
194 let trim_lead = raw.len() - raw.trim_start().len();
195 let abs_start = s_offset + rel_start + trim_lead;
196 let abs_end = abs_start + trimmed.len();
197 let span = Span::new(abs_start, abs_end);
198
199 if idx == 0 {
200 attrs.classification = parse_classification(trimmed);
203 token_spans.push(TokenSpan {
204 kind: TokenKind::Classification,
205 span,
206 text: trimmed.into(),
207 });
208 continue;
209 }
210
211 if trimmed.starts_with("REL TO") || trimmed.starts_with("REL ") {
212 let parsed_trigraphs =
213 parse_rel_to_with_spans(trimmed, abs_start, self.tokens, &mut token_spans);
214 rel_to.extend(parsed_trigraphs);
215 } else if let Some(ctrl) = SciControl::parse(trimmed) {
216 sci.push(ctrl);
217 token_spans.push(TokenSpan {
218 kind: TokenKind::SciControl,
219 span,
220 text: trimmed.into(),
221 });
222 } else if let Some(ctrl) =
223 DissemControl::parse(trimmed).or_else(|| parse_dissem_full_form(trimmed))
224 {
225 dissem.push(ctrl);
226 token_spans.push(TokenSpan {
227 kind: TokenKind::DissemControl,
228 span,
229 text: trimmed.into(),
230 });
231 } else if let Some(sar_id) = SarIdentifier::parse(trimmed) {
232 sar.push(sar_id);
233 token_spans.push(TokenSpan {
234 kind: TokenKind::SarIdentifier,
235 span,
236 text: trimmed.into(),
237 });
238 } else if let Some(exemption) = DeclassExemption::parse(trimmed) {
239 attrs.declass_exemption = Some(exemption);
243 token_spans.push(TokenSpan {
244 kind: TokenKind::DeclassExemption,
245 span,
246 text: trimmed.into(),
247 });
248 } else if is_declass_date(trimmed) {
249 attrs.declassify_on = Some(trimmed.into());
252 token_spans.push(TokenSpan {
253 kind: TokenKind::DeclassDate,
254 span,
255 text: trimmed.into(),
256 });
257 } else {
258 token_spans.push(TokenSpan {
263 kind: TokenKind::Unknown,
264 span,
265 text: trimmed.into(),
266 });
267 }
268 }
269
270 attrs.sci_controls = sci.into_boxed_slice();
271 attrs.sar_identifiers = sar.into_boxed_slice();
272 attrs.dissem_controls = dissem.into_boxed_slice();
273 attrs.rel_to = rel_to.into_boxed_slice();
274 for &sep_start in &separators {
278 token_spans.push(TokenSpan {
279 kind: TokenKind::Separator,
280 span: Span::new(s_offset + sep_start, s_offset + sep_start + 2),
281 text: "//".into(),
282 });
283 }
284 token_spans.sort_unstable_by_key(|ts| ts.span.start);
285 attrs.token_spans = token_spans.into_boxed_slice();
286
287 let _ = context; Ok(attrs)
290 }
291}
292
293fn parse_classification(s: &str) -> Option<Classification> {
302 match s {
303 "TS" | "TOP SECRET" => Some(Classification::TopSecret),
304 "S" | "SECRET" => Some(Classification::Secret),
305 "C" | "CONFIDENTIAL" => Some(Classification::Confidential),
306 "U" | "UNCLASSIFIED" => Some(Classification::Unclassified),
307 _ => None,
308 }
309}
310
311fn parse_dissem_full_form(s: &str) -> Option<DissemControl> {
321 let abbrev = match s {
322 "NOFORN" => "NF",
323 "ORCON" => "OC",
324 "IMCON" => "IMC",
325 "DEA SENSITIVE" => "DSEN",
326 "PROPIN" => "PR",
327 "RELIDO" => "RELIDO",
328 _ => return None,
329 };
330 DissemControl::parse(abbrev)
331}
332
333fn parse_rel_to_with_spans(
339 block: &str,
340 block_offset: usize,
341 tokens: &dyn TokenSet,
342 token_spans: &mut Vec<TokenSpan>,
343) -> Vec<Trigraph> {
344 let prefix_skip = if let Some(rest) = block.strip_prefix("REL TO") {
348 block.len() - rest.len()
349 } else if let Some(rest) = block.strip_prefix("REL") {
350 block.len() - rest.len()
351 } else {
352 0
353 };
354 let after_rel = &block[prefix_skip..];
355
356 let mut out: Vec<Trigraph> = Vec::new();
357 let mut cursor = 0usize;
361 for entry in after_rel.split(',') {
362 let entry_start_in_after = cursor;
363 cursor += entry.len() + 1;
369
370 let trim_lead = entry.len() - entry.trim_start().len();
371 let trimmed = entry.trim();
372 if trimmed.is_empty() || !tokens.is_trigraph(trimmed) {
373 continue;
374 }
375 let b = trimmed.as_bytes();
376 if b.len() != 3 {
377 continue;
378 }
379 let Some(t) = Trigraph::try_new([b[0], b[1], b[2]]) else {
380 continue;
381 };
382 out.push(t);
383 let abs_start = block_offset + prefix_skip + entry_start_in_after + trim_lead;
384 token_spans.push(TokenSpan {
385 kind: TokenKind::RelToTrigraph,
386 span: Span::new(abs_start, abs_start + 3),
387 text: trimmed.into(),
388 });
389 }
390 out
391}
392
/// Reports whether `s` has the shape of a declassification date.
///
/// A match is exactly four or exactly eight bytes, all of them ASCII digits.
/// Only the shape is checked; the digits are not validated as a calendar
/// date.
fn is_declass_date(s: &str) -> bool {
    let byte_len = s.len();
    if byte_len != 4 && byte_len != 8 {
        return false;
    }
    s.bytes().all(|b| b.is_ascii_digit())
}
408
#[cfg(test)]
mod tests {
    use super::*;
    use marque_ism::span::{MarkingCandidate, MarkingType, Span};
    use marque_ism::token_set::CapcoTokenSet;

    /// Builds a candidate of `kind` covering `text` placed at `offset`.
    fn make_candidate(text: &[u8], kind: MarkingType, offset: usize) -> MarkingCandidate {
        let span = Span::new(offset, offset + text.len());
        MarkingCandidate { span, kind }
    }

    /// Parses `text` as a candidate of `kind` that spans the whole source.
    fn parse_as(text: &str, kind: MarkingType) -> ParsedMarking {
        let source = text.as_bytes();
        let tokens = CapcoTokenSet;
        let candidate = make_candidate(source, kind, 0);
        Parser::new(&tokens)
            .parse(&candidate, source)
            .expect("parse should succeed")
    }

    /// Parses `text` as a banner marking; panics if parsing fails.
    fn parse_banner(text: &str) -> ParsedMarking {
        parse_as(text, MarkingType::Banner)
    }

    /// Parses `text` as a portion marking; panics if parsing fails.
    fn parse_portion(text: &str) -> ParsedMarking {
        parse_as(text, MarkingType::Portion)
    }

    /// Returns every token span of `kind`, in stored order.
    fn spans_of(parsed: &ParsedMarking, kind: TokenKind) -> Vec<&TokenSpan> {
        parsed
            .attrs
            .token_spans
            .iter()
            .filter(|t| t.kind == kind)
            .collect()
    }

    /// Returns the first token span of `kind`; panics if there is none.
    fn first_span_of(parsed: &ParsedMarking, kind: TokenKind) -> &TokenSpan {
        parsed
            .attrs
            .token_spans
            .iter()
            .find(|t| t.kind == kind)
            .unwrap()
    }

    // --- declassification exemption / date population -----------------------

    #[test]
    fn banner_with_declass_exemption_populates_attrs() {
        use marque_ism::DeclassExemption;

        let parsed = parse_banner("SECRET//25X1//NOFORN");
        assert!(
            parsed.attrs.declass_exemption.is_some(),
            "declass_exemption should be populated when 25X1 appears in banner"
        );
        assert_eq!(
            parsed.attrs.declass_exemption,
            Some(DeclassExemption::X25x1)
        );
    }

    #[test]
    fn portion_with_declass_exemption_populates_attrs() {
        let parsed = parse_portion("(SECRET//50X1-HUM)");
        assert!(parsed.attrs.declass_exemption.is_some());
    }

    #[test]
    fn banner_with_declass_date_populates_attrs() {
        let parsed = parse_banner("SECRET//20301231//NOFORN");
        assert_eq!(
            parsed.attrs.declassify_on.as_deref(),
            Some("20301231"),
            "declassify_on should be populated when YYYYMMDD appears in banner"
        );
    }

    #[test]
    fn banner_with_four_digit_year_populates_attrs() {
        let parsed = parse_banner("SECRET//2035");
        assert_eq!(parsed.attrs.declassify_on.as_deref(), Some("2035"));
    }

    #[test]
    fn banner_without_declass_leaves_fields_none() {
        let parsed = parse_banner("TOP SECRET//SI//NOFORN");
        assert!(parsed.attrs.declassify_on.is_none());
        assert!(parsed.attrs.declass_exemption.is_none());
    }

    // --- is_declass_date shape checks ---------------------------------------

    #[test]
    fn is_declass_date_accepts_yyyymmdd() {
        assert!(is_declass_date("20301231"));
    }

    #[test]
    fn is_declass_date_accepts_yyyy() {
        assert!(is_declass_date("2035"));
    }

    #[test]
    fn is_declass_date_rejects_non_digit() {
        assert!(!is_declass_date("2030X231"));
        assert!(!is_declass_date("YYYYMMDD"));
    }

    #[test]
    fn is_declass_date_rejects_wrong_length() {
        assert!(!is_declass_date("203012"));
        assert!(!is_declass_date("203012311"));
    }

    // --- token span bookkeeping ---------------------------------------------

    #[test]
    fn token_spans_track_offsets_in_banner() {
        let text = "TOP SECRET//SI//NF";
        let src = text.as_bytes();
        let parsed = parse_banner(text);

        let kinds: Vec<TokenKind> = parsed.attrs.token_spans.iter().map(|t| t.kind).collect();
        for expected in [
            TokenKind::Separator,
            TokenKind::Classification,
            TokenKind::SciControl,
            TokenKind::DissemControl,
        ] {
            assert!(kinds.contains(&expected));
        }

        let cls = first_span_of(&parsed, TokenKind::Classification);
        assert_eq!(cls.span.as_str(src).unwrap(), "TOP SECRET");

        let sci = first_span_of(&parsed, TokenKind::SciControl);
        assert_eq!(sci.span.as_str(src).unwrap(), "SI");

        let dissem = first_span_of(&parsed, TokenKind::DissemControl);
        assert_eq!(dissem.span.as_str(src).unwrap(), "NF");
    }

    #[test]
    fn token_spans_strip_paren_in_portion() {
        let text = "(SECRET//NF)";
        let parsed = parse_portion(text);

        // Offsets are relative to the source, so the opening parenthesis
        // shifts the classification to byte 1.
        let cls = first_span_of(&parsed, TokenKind::Classification);
        assert_eq!(cls.span.start, 1);
        assert_eq!(cls.span.end, 7);
        assert_eq!(cls.span.as_str(text.as_bytes()).unwrap(), "SECRET");

        let dissem = first_span_of(&parsed, TokenKind::DissemControl);
        assert_eq!(dissem.span.start, 9);
        assert_eq!(dissem.span.end, 11);
    }

    #[test]
    fn token_spans_record_unknown_token() {
        let text = "SECRET//XYZZY//NOFORN";
        let parsed = parse_banner(text);

        let unknowns = spans_of(&parsed, TokenKind::Unknown);
        assert_eq!(unknowns.len(), 1);
        assert_eq!(unknowns[0].span.as_str(text.as_bytes()).unwrap(), "XYZZY");
    }

    #[test]
    fn token_spans_record_rel_to_trigraphs() {
        let text = "SECRET//REL TO USA, GBR, AUS";
        let src = text.as_bytes();
        let parsed = parse_banner(text);

        let trigraphs = spans_of(&parsed, TokenKind::RelToTrigraph);
        assert_eq!(trigraphs.len(), 3);
        assert_eq!(trigraphs[0].span.as_str(src).unwrap(), "USA");
        assert_eq!(trigraphs[1].span.as_str(src).unwrap(), "GBR");
        assert_eq!(trigraphs[2].span.as_str(src).unwrap(), "AUS");
    }

    #[test]
    fn token_spans_record_separators() {
        let text = "SECRET//NF";
        let parsed = parse_banner(text);

        let seps = spans_of(&parsed, TokenKind::Separator);
        assert_eq!(seps.len(), 1);
        assert_eq!(seps[0].span.as_str(text.as_bytes()).unwrap(), "//");
    }
}