1use marque_ism::span::{MarkingCandidate, MarkingType, Span};
14use memchr::memchr_iter;
15
/// Stateless scanner over raw bytes.
///
/// The type carries no data; its functionality is exposed through associated
/// functions such as [`Scanner::scan`].
pub struct Scanner;
18
19impl Scanner {
20 pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
25 let mut candidates = Vec::new();
26
27 Self::scan_portions(source, &mut candidates);
28 Self::scan_banners(source, &mut candidates);
29 Self::scan_cab(source, &mut candidates);
30 Self::scan_page_breaks(source, &mut candidates);
31
32 candidates.sort_unstable_by(|a, b| {
38 a.span
39 .start
40 .cmp(&b.span.start)
41 .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
42 });
43 candidates
44 }
45
46 fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
55 for pos in memchr_iter(b'\x0c', source) {
59 out.push(MarkingCandidate {
60 span: Span::new(pos, pos),
61 kind: MarkingType::PageBreak,
62 });
63 }
64 let mut run = 0usize;
69 for (i, &b) in source.iter().enumerate() {
70 if b == b'\n' {
71 run += 1;
72 if run == 3 {
73 out.push(MarkingCandidate {
74 span: Span::new(i, i),
75 kind: MarkingType::PageBreak,
76 });
77 }
78 } else if b != b'\r' {
79 run = 0;
80 }
81 }
82 }
83
84 fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
85 for start in memchr_iter(b'(', source) {
87 if let Some(end) = find_portion_end(source, start) {
88 let span = Span::new(start, end + 1);
89 if span.len() >= 3 && span.len() <= 256 {
91 out.push(MarkingCandidate {
92 span,
93 kind: MarkingType::Portion,
94 });
95 }
96 }
97 }
98 }
99
100 fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
101 const BANNER_PREFIXES: &[&[u8]] = &[
106 b"TOP SECRET",
107 b"SECRET",
108 b"CONFIDENTIAL",
109 b"RESTRICTED",
110 b"UNCLASSIFIED",
111 b"//",
112 ];
113
114 for line in source.split(|&b| b == b'\n') {
115 let trimmed = trim_ascii(line);
116 if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
117 let start = line.as_ptr() as usize - source.as_ptr() as usize;
120 let end = start + line.len();
121 out.push(MarkingCandidate {
122 span: Span::new(start, end),
123 kind: MarkingType::Banner,
124 });
125 }
126 }
127 }
128
129 fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
130 const CAB_LABEL: &[u8] = b"Classified By:";
131 let mut search_from = 0;
132 while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
133 let pos = search_from + rel;
134 let end = find_cab_end(source, pos);
135 out.push(MarkingCandidate {
136 span: Span::new(pos, end),
137 kind: MarkingType::Cab,
138 });
139 search_from = end;
140 }
141 }
142}
143
144fn kind_sort_priority(kind: MarkingType) -> u8 {
149 match kind {
150 MarkingType::PageBreak => 0,
151 _ => 1,
152 }
153}
154
/// Finds the `)` that closes the parenthesis at index `open`.
///
/// Scanning starts at the byte after `open`. Returns the index of the first
/// `)` provided no line feed, carriage return, form feed or nested `(` is
/// encountered before it. Returns `None` if one of those bytes comes first,
/// if the input ends without a `)`, or if `open + 1` is past the end of
/// `source`.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let body_start = open + 1;
    let body = source.get(body_start..)?;

    // The first "interesting" byte decides the outcome: a closer succeeds,
    // any other terminator rejects the group.
    let stop = body
        .iter()
        .position(|&byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;

    (body[stop] == b')').then_some(body_start + stop)
}
174
/// Returns the offset of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at offset 0, mirroring `str::find("")`. Returns
/// `None` when `needle` does not occur, including when it is longer than
/// `haystack`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics on a window size of zero, so handle that up front.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
178
/// Returns the offset at which the block beginning at `start` ends.
///
/// The block ends at the second of two line feeds that have nothing but
/// carriage returns between them (i.e. at a blank line); the returned offset
/// is the index of that second line feed. If no such pair exists the block
/// extends to `source.len()`.
///
/// # Panics
///
/// Panics if `start > source.len()`.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    let mut pending_newline = false;
    for (offset, byte) in source[start..].iter().copied().enumerate() {
        match byte {
            b'\n' if pending_newline => return start + offset,
            b'\n' => pending_newline = true,
            // `\r` is transparent so `\r\n\r\n` counts as a blank line.
            b'\r' => {}
            _ => pending_newline = false,
        }
    }
    source.len()
}
194
/// Returns `s` with leading and trailing ASCII whitespace bytes removed.
fn trim_ascii(s: &[u8]) -> &[u8] {
    let without_leading = s.trim_ascii_start();
    without_leading.trim_ascii_end()
}
200
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The form feed itself is still reported as a page break.
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        // Page-break candidates are zero-width: start and end coincide.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        // Line feeds sit at offsets 5, 6 and 7; the candidate is anchored at the third.
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_precedes_banner_at_same_offset() {
        // The leading form feed is trimmed as ASCII whitespace when the banner
        // prefix is matched, so the banner line and the page break both start
        // at offset 0 and the sort tie-break decides their order.
        let src = b"\x0cSECRET//NOFORN\n";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 2);
        assert_eq!(candidates[0].kind, MarkingType::PageBreak);
        assert_eq!(candidates[0].span.start, 0);
        assert_eq!(candidates[1].kind, MarkingType::Banner);
        assert_eq!(candidates[1].span.start, 0);
    }

    #[test]
    fn form_feed_resets_blank_line_run() {
        // Two line feeds on each side of the form feed: the form feed resets
        // the line-feed counter, so neither side reaches three and only the
        // form feed itself is reported.
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }

    #[test]
    fn detects_non_us_banner_nato() {
        let src = b"//NATO SECRET//REL TO USA, GBR\n";
        let candidates = Scanner::scan(src);
        let banners: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::Banner)
            .collect();
        assert_eq!(banners.len(), 1);
    }

    #[test]
    fn detects_non_us_banner_portion_form() {
        let src = b"//NS//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn detects_restricted_banner() {
        let src = b"RESTRICTED//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn non_us_portion_detected_by_existing_scanner() {
        let src = b"(//NS//REL TO USA, GBR)";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Portion));
    }

    #[test]
    fn double_slash_mid_line_is_not_banner() {
        // Banner prefixes are matched only at the start of the trimmed line.
        let src = b"some text // not a marking\n";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Banner),
            "// in middle of line should not produce a banner candidate"
        );
    }
}