use marque_ism::span::{MarkingCandidate, MarkingType, Span};
use memchr::memchr_iter;
/// Stateless entry point for the byte-level candidate scan.
///
/// All functionality is exposed through associated functions; the type itself
/// carries no data.
pub struct Scanner;

impl Scanner {
    /// Runs every detector over `source` and returns the combined candidates.
    ///
    /// The result is ordered by `span.start`; when two candidates share a start
    /// offset, the one with the lower [`kind_sort_priority`] comes first. The
    /// sort is unstable, so candidates that tie on both keys have no guaranteed
    /// relative order.
    pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
        let mut found = Vec::new();

        // Detector order only affects the pre-sort layout of `found`.
        Self::scan_portions(source, &mut found);
        Self::scan_banners(source, &mut found);
        Self::scan_cab(source, &mut found);
        Self::scan_page_breaks(source, &mut found);

        found.sort_unstable_by(|lhs, rhs| {
            let lhs_key = (lhs.span.start, kind_sort_priority(lhs.kind));
            let rhs_key = (rhs.span.start, kind_sort_priority(rhs.kind));
            lhs_key.cmp(&rhs_key)
        });
        found
    }

    /// Appends a zero-width `PageBreak` candidate for every page-break signal.
    ///
    /// Two signals are recognised:
    /// * each form-feed byte (`0x0c`), at its own offset;
    /// * the third `\n` of a newline run, at that newline's offset. `\r` bytes
    ///   are ignored while counting, any other byte (form feed included)
    ///   resets the run, and a run longer than three still yields only one
    ///   candidate.
    fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
        out.extend(memchr_iter(b'\x0c', source).map(|at| MarkingCandidate {
            span: Span::new(at, at),
            kind: MarkingType::PageBreak,
        }));

        let mut consecutive_newlines = 0usize;
        for (idx, byte) in source.iter().copied().enumerate() {
            match byte {
                b'\n' => {
                    consecutive_newlines += 1;
                    // Fire exactly once per run, on the third newline.
                    if consecutive_newlines == 3 {
                        out.push(MarkingCandidate {
                            span: Span::new(idx, idx),
                            kind: MarkingType::PageBreak,
                        });
                    }
                }
                // Carriage returns neither extend nor break a run.
                b'\r' => {}
                _ => consecutive_newlines = 0,
            }
        }
    }

    /// Appends a `Portion` candidate for every parenthesised group accepted by
    /// [`find_portion_end`] whose span length (parentheses included) lies in
    /// `3..=256`.
    fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
        for open in memchr_iter(b'(', source) {
            let Some(close) = find_portion_end(source, open) else {
                continue;
            };

            // `close + 1` so the span takes in the closing parenthesis.
            let span = Span::new(open, close + 1);
            let width = span.len();
            if !(3..=256).contains(&width) {
                continue;
            }

            out.push(MarkingCandidate {
                span,
                kind: MarkingType::Portion,
            });
        }
    }

    /// Appends a `Banner` candidate for every line whose ASCII-trimmed content
    /// starts with one of the known banner prefixes.
    ///
    /// Lines are split on `\n`. The emitted span covers the whole untrimmed
    /// line, excluding the terminating `\n`.
    fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
        const BANNER_PREFIXES: &[&[u8]] = &[
            b"TOP SECRET",
            b"TS//",
            b"SECRET",
            b"S//",
            b"CONFIDENTIAL",
            b"C//",
            b"RESTRICTED",
            b"UNCLASSIFIED",
            b"U//",
            b"//",
        ];

        // Byte offset of the current line within `source`.
        let mut line_start = 0usize;
        for line in source.split(|&byte| byte == b'\n') {
            let line_end = line_start + line.len();
            let content = trim_ascii(line);
            let looks_like_banner = BANNER_PREFIXES
                .iter()
                .any(|prefix| content.starts_with(prefix));

            if looks_like_banner {
                out.push(MarkingCandidate {
                    span: Span::new(line_start, line_end),
                    kind: MarkingType::Banner,
                });
            }

            // Step over the `\n` that ended this line.
            line_start = line_end + 1;
        }
    }

    /// Appends a `Cab` candidate for every occurrence of the literal label
    /// `Classified By:`.
    ///
    /// Each span runs from the label to the offset returned by
    /// [`find_cab_end`]; the search for the next label resumes at that offset.
    fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
        const CAB_LABEL: &[u8] = b"Classified By:";

        let mut cursor = 0;
        loop {
            let Some(offset) = find_subsequence(&source[cursor..], CAB_LABEL) else {
                break;
            };
            let label_at = cursor + offset;
            let block_end = find_cab_end(source, label_at);

            out.push(MarkingCandidate {
                span: Span::new(label_at, block_end),
                kind: MarkingType::Cab,
            });
            cursor = block_end;
        }
    }
}
/// Tie-break rank used when ordering candidates that share a start offset.
///
/// `PageBreak` ranks `0`; every other kind ranks `1`, so a page break sorts
/// ahead of anything else beginning at the same byte.
fn kind_sort_priority(kind: MarkingType) -> u8 {
    if matches!(kind, MarkingType::PageBreak) {
        0
    } else {
        1
    }
}
/// Locates the `)` that closes the parenthesis at index `open`.
///
/// Scans the bytes after `open` and returns the absolute index of the first
/// `)`. Returns `None` when a `\n`, `\r`, form feed, or another `(` is met
/// first, when the input ends before any `)`, or when `open + 1` lies beyond
/// the end of `source`.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let body_start = open + 1;
    let tail = source.get(body_start..)?;

    // The first "interesting" byte decides the outcome: `)` closes the group,
    // anything else in the set aborts it.
    let stop = tail
        .iter()
        .position(|&byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;

    (tail[stop] == b')').then_some(body_start + stop)
}
/// Returns the offset of the first occurrence of `needle` within `haystack`,
/// or `None` if it does not occur.
///
/// An empty `needle` matches at offset `0`, mirroring `str::find("")`. It is
/// handled up front because `slice::windows` panics when asked for windows of
/// size zero.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
/// Finds where the block beginning at `start` ends.
///
/// Walks forward from `start` and returns the absolute index of the second of
/// two consecutive `\n` bytes; `\r` bytes between them are ignored, so a
/// CRLF-terminated blank line counts as well. If no such pair exists the block
/// extends to `source.len()`.
///
/// # Panics
///
/// Panics if `start > source.len()`.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    let mut newline_pending = false;

    for (offset, byte) in source[start..].iter().copied().enumerate() {
        match byte {
            // Second newline in a row: the blank line terminates the block.
            b'\n' if newline_pending => return start + offset,
            b'\n' => newline_pending = true,
            // Carriage returns do not break up a newline pair.
            b'\r' => {}
            _ => newline_pending = false,
        }
    }

    source.len()
}
/// Returns `s` with leading and trailing ASCII whitespace removed.
///
/// "Whitespace" follows the standard library's `u8::is_ascii_whitespace`
/// definition, as used by the slice `trim_ascii_*` methods.
fn trim_ascii(s: &[u8]) -> &[u8] {
    let without_leading = s.trim_ascii_start();
    without_leading.trim_ascii_end()
}
/// Unit tests for the scanner's candidate detection and ordering rules.
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// A single parenthesised marking yields exactly one `Portion` candidate
    /// whose span covers both parentheses.
    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    /// A line starting with a known classification prefix is reported as a banner.
    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    /// A newline between the parentheses disqualifies the group.
    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    /// A form feed between the parentheses disqualifies the group, but the
    /// form feed itself is still reported as a page break.
    #[test]
    fn rejects_form_feed_in_portion() {
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    /// A form feed produces a zero-width `PageBreak` at its own offset.
    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    /// Three consecutive newlines produce one `PageBreak`, placed on the third.
    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    /// Two consecutive newlines are below the page-break threshold.
    #[test]
    fn double_newline_does_not_emit_page_break() {
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    /// `PageBreak` has a strictly lower sort priority than every content kind.
    #[test]
    fn page_break_sorts_before_co_located_content() {
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    /// A form feed in the middle of a newline run resets the run counter, so
    /// two newlines on either side never reach the three-newline threshold and
    /// only the form feed itself is reported.
    #[test]
    fn form_feed_splitting_blank_run_emits_single_page_break() {
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
        assert_eq!(
            breaks[0].span.start, 3,
            "the single page break must sit on the form feed"
        );
    }

    /// A line starting with `//` is reported as a banner.
    #[test]
    fn detects_non_us_banner_nato() {
        let src = b"//NATO SECRET//REL TO USA, GBR\n";
        let candidates = Scanner::scan(src);
        let banners: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::Banner)
            .collect();
        assert_eq!(banners.len(), 1);
    }

    /// The `//` prefix also matches the abbreviated form.
    #[test]
    fn detects_non_us_banner_portion_form() {
        let src = b"//NS//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    /// `RESTRICTED` is one of the recognised banner prefixes.
    #[test]
    fn detects_restricted_banner() {
        let src = b"RESTRICTED//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    /// A parenthesised group whose content starts with `//` is still a portion.
    #[test]
    fn non_us_portion_detected_by_existing_scanner() {
        let src = b"(//NS//REL TO USA, GBR)";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Portion));
    }

    /// Banner prefixes are only matched at the start of the trimmed line.
    #[test]
    fn double_slash_mid_line_is_not_banner() {
        let src = b"some text // not a marking\n";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Banner),
            "// in middle of line should not produce a banner candidate"
        );
    }
}