marque_core/scanner.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Phase 1: candidate detection — finds potential classification markings in a byte buffer.
6//!
7//! Uses `memchr` for SIMD-accelerated boundary detection. Zero heap allocation
8//! beyond the output `Vec<MarkingCandidate>`. Never invokes the parser.
9//!
10//! # Strategy
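//! - Portion candidates: scan for `(` with `memchr`, walk to `)` on the same
//!   line (a nested `(`, `\r`, `\n` or form feed aborts the walk), then apply
//!   a lightweight length gate (3 to 256 bytes including the parentheses).
//! - Banner candidates: scan for lines whose trimmed content begins with a
//!   known classification prefix (TOP SECRET, SECRET, CONFIDENTIAL, RESTRICTED,
//!   UNCLASSIFIED, the abbreviated `TS//`, `S//`, `C//`, `U//` forms, or a
//!   leading `//` for markings with an empty US classification slot).
//! - CAB candidates: scan for "Classified By:" label, walk to end of block.
//! - Page-break candidates: every form feed, plus the third `\n` of a run of
//!   three or more consecutive newlines.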
16
17use marque_ism::span::{MarkingCandidate, MarkingType, Span};
18use memchr::memchr_iter;
19
/// Phase 1 scanner. Stateless; call [`Scanner::scan`] on any byte buffer.
///
/// The type is a field-less unit struct that only serves as a namespace for
/// the scanning entry point, so it derives the cheap common traits: it can be
/// debug-printed, copied freely and default-constructed.
#[derive(Debug, Clone, Copy, Default)]
pub struct Scanner;
22
23impl Scanner {
24 /// Scan `source` for classification marking candidates.
25 ///
26 /// Returns candidates in source order. Allocation is proportional to
27 /// the number of candidates found, not source length.
28 pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
29 let mut candidates = Vec::new();
30
31 Self::scan_portions(source, &mut candidates);
32 Self::scan_banners(source, &mut candidates);
33 Self::scan_cab(source, &mut candidates);
34 Self::scan_page_breaks(source, &mut candidates);
35
36 // Sort by `(start, kind_priority)`. PageBreak gets priority 0 so
37 // it sorts before any content candidate at the same offset — the
38 // engine's PageContext reset must run before a co-located banner
39 // or portion is processed, otherwise the reset is defeated by an
40 // unstable secondary order.
41 candidates.sort_unstable_by(|a, b| {
42 a.span
43 .start
44 .cmp(&b.span.start)
45 .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
46 });
47 candidates
48 }
49
50 /// Phase 3 — emit a `MarkingType::PageBreak` candidate at every form-feed
51 /// (`\f`) byte and at the third consecutive `\n` of a `\n\n\n+` run.
52 /// The engine uses these to reset `PageContext` so banner/CAB rules on
53 /// the next page see a fresh aggregate.
54 ///
55 /// PageBreak spans are zero-length and carry no parsable content; the
56 /// parser will reject them, so the engine must filter them out *before*
57 /// calling `parser.parse`.
58 fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
59 // Form-feed: every `\f` is a hard page break in pretty much every
60 // ASCII document convention. memchr is overkill at this scale but
61 // matches the rest of the scanner's idiom.
62 for pos in memchr_iter(b'\x0c', source) {
63 out.push(MarkingCandidate {
64 span: Span::new(pos, pos),
65 kind: MarkingType::PageBreak,
66 });
67 }
68 // Three-or-more consecutive `\n` is a soft page break under our
69 // heuristic. We emit one candidate at the third newline, then skip
70 // ahead until we leave the run, so a single blank gap between
71 // paragraphs (`\n\n`) does NOT trip the reset.
72 let mut run = 0usize;
73 for (i, &b) in source.iter().enumerate() {
74 if b == b'\n' {
75 run += 1;
76 if run == 3 {
77 out.push(MarkingCandidate {
78 span: Span::new(i, i),
79 kind: MarkingType::PageBreak,
80 });
81 }
82 } else if b != b'\r' {
83 run = 0;
84 }
85 }
86 }
87
88 fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
89 // Find every `(` and walk forward to the matching `)`.
90 for start in memchr_iter(b'(', source) {
91 if let Some(end) = find_portion_end(source, start) {
92 let span = Span::new(start, end + 1);
93 // Heuristic gate: minimum length `(U)` = 3, max reasonable = 256
94 if span.len() >= 3 && span.len() <= 256 {
95 out.push(MarkingCandidate {
96 span,
97 kind: MarkingType::Portion,
98 });
99 }
100 }
101 }
102 }
103
104 fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
105 // Classification prefixes that can start a banner line.
106 // Full-form US classifications are listed first. Abbreviated US forms
107 // (`TS//`, `S//`, `C//`, `U//`) are included so rules like E001 (portion
108 // abbreviation in banner context) can fire on abbreviated banners.
109 // `//` detects non-US classifications (FGI, NATO, JOINT) where the
110 // US classification slot is empty. `RESTRICTED` supports foreign-origin
111 // markings with the RESTRICTED level.
112 const BANNER_PREFIXES: &[&[u8]] = &[
113 b"TOP SECRET",
114 b"TS//",
115 b"SECRET",
116 b"S//",
117 b"CONFIDENTIAL",
118 b"C//",
119 b"RESTRICTED",
120 b"UNCLASSIFIED",
121 b"U//",
122 b"//",
123 ];
124
125 for line in source.split(|&b| b == b'\n') {
126 let trimmed = trim_ascii(line);
127 if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
128 // `line` is a subslice produced by split(), so its pointer lies
129 // within `source`. Subtraction yields the byte offset safely.
130 let start = line.as_ptr() as usize - source.as_ptr() as usize;
131 let end = start + line.len();
132 out.push(MarkingCandidate {
133 span: Span::new(start, end),
134 kind: MarkingType::Banner,
135 });
136 }
137 }
138 }
139
140 fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
141 const CAB_LABEL: &[u8] = b"Classified By:";
142 let mut search_from = 0;
143 while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
144 let pos = search_from + rel;
145 let end = find_cab_end(source, pos);
146 out.push(MarkingCandidate {
147 span: Span::new(pos, end),
148 kind: MarkingType::Cab,
149 });
150 search_from = end;
151 }
152 }
153}
154
155/// Sort priority for `MarkingCandidate` kinds at equal start offsets.
156/// PageBreak sorts first so the engine's `PageContext` reset runs before
157/// any co-located content candidate is processed (banner/portion/CAB at
158/// the same byte offset as a page break — an edge case, but hardened).
159fn kind_sort_priority(kind: MarkingType) -> u8 {
160 match kind {
161 MarkingType::PageBreak => 0,
162 _ => 1,
163 }
164}
165
/// Locate the `)` that closes the portion opened at index `open`.
///
/// Returns the index of the closing parenthesis, or `None` when the bytes
/// after `open` reach a disqualifying byte first, run out without a `)`,
/// or `open + 1` lies beyond the end of `source`.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let body_start = open + 1;
    let body = source.get(body_start..)?;

    // The first "interesting" byte decides the outcome. Besides `)`, these
    // bytes end the walk because they cannot occur inside a single-line
    // portion marking:
    //   - `\n` / `\r`: a portion never spans lines
    //   - `(`: nested parentheses are never valid
    //   - `\x0c`: a form feed is a page break; accepting it would let a
    //     spurious Portion span shadow the PageBreak candidate
    let stop = body
        .iter()
        .position(|&byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;

    (body[stop] == b')').then_some(body_start + stop)
}
185
/// Return the index of the first occurrence of `needle` in `haystack`.
///
/// Yields `None` when `needle` does not occur, including when it is longer
/// than `haystack`. An empty `needle` matches at index 0, mirroring
/// `str::find("")`; it is handled up front because `slice::windows` panics
/// when asked for zero-sized windows.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
189
/// Find where the classification authority block beginning at `start` ends.
///
/// The block ends at the first blank line: the returned index is that of
/// the second of two `\n` bytes separated by nothing except `\r` bytes.
/// When no such pair follows `start`, the block runs to EOF and
/// `source.len()` is returned.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    let mut after_newline = false;
    for (offset, byte) in source[start..].iter().copied().enumerate() {
        match byte {
            // Second newline in a row: this is the blank line.
            b'\n' if after_newline => return start + offset,
            b'\n' => after_newline = true,
            // Carriage returns are transparent so CRLF blank lines count.
            b'\r' => {}
            _ => after_newline = false,
        }
    }
    source.len()
}
205
/// Strip leading and trailing ASCII whitespace from `s`.
///
/// Delegates to the standard-library slice trimming methods (stable since
/// Rust 1.80). `\r` counts as ASCII whitespace, so the stray carriage return
/// that CRLF input leaves on a line after splitting on `\n` is removed too.
fn trim_ascii(s: &[u8]) -> &[u8] {
    s.trim_ascii_start().trim_ascii_end()
}
211
/// Unit tests for the phase 1 scanner: portion, banner and page-break
/// detection, plus the ordering of candidates that share a start offset.
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        // A `\f` inside `(...)` is never a valid single-line portion.
        // Without this rejection the portion candidate would span the
        // form feed and shadow the PageBreak candidate at that offset.
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The PageBreak candidate at offset 3 should still be emitted.
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        // Form feed sits at offset 5 in `b"page1\x0cpage2"`.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        // Exactly one PageBreak — emitted at the *third* newline (offset 7),
        // not one per `\n` in the run.
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        // A normal paragraph break (`\n\n`) must NOT trip the reset, otherwise
        // every paragraph in a multi-page document looks like a fresh page.
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        // `\x0cSECRET\n`: the form feed at offset 0 yields a zero-length
        // PageBreak at 0. The banner scanner splits on `\n` only, so its
        // first line is `\x0cSECRET`; `trim_ascii` strips the leading form
        // feed (it is ASCII whitespace), the trimmed text matches the
        // `SECRET` prefix, and the Banner span covers the untrimmed line
        // 0..7. Both candidates therefore start at offset 0, and the sort
        // must place the PageBreak first so the engine reset runs before
        // the banner is processed.
        let src = b"\x0cSECRET\n";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 2);
        assert_eq!(candidates[0].kind, MarkingType::PageBreak);
        assert_eq!(candidates[0].span.start, 0);
        assert_eq!(candidates[1].kind, MarkingType::Banner);
        assert_eq!(candidates[1].span.start, 0);
        assert_eq!(candidates[1].span.end, 7);

        // The sort key itself: PageBreak ranks ahead of every content kind.
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_form_feed_inside_blank_run_emits_both() {
        // `\n\n\f\n\n` — the form feed itself is one PageBreak; the surrounding
        // newlines do not also trip the 3-newline heuristic because the run
        // is broken by the `\f`.
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }

    // --- Non-US banner detection ---

    #[test]
    fn detects_non_us_banner_nato() {
        let src = b"//NATO SECRET//REL TO USA, GBR\n";
        let candidates = Scanner::scan(src);
        let banners: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::Banner)
            .collect();
        assert_eq!(banners.len(), 1);
    }

    #[test]
    fn detects_non_us_banner_portion_form() {
        let src = b"//NS//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn detects_restricted_banner() {
        let src = b"RESTRICTED//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn non_us_portion_detected_by_existing_scanner() {
        // Portions starting with (// should already be detected via `(`.
        let src = b"(//NS//REL TO USA, GBR)";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Portion));
    }

    #[test]
    fn double_slash_mid_line_is_not_banner() {
        // `//` not at start of trimmed line should not produce a banner.
        let src = b"some text // not a marking\n";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Banner),
            "// in middle of line should not produce a banner candidate"
        );
    }
}