marque_core/scanner.rs
1//! Phase 1: candidate detection — finds potential classification markings in a byte buffer.
2//!
3//! Uses `memchr` for SIMD-accelerated boundary detection. Zero heap allocation
4//! beyond the output `Vec<MarkingCandidate>`. Never invokes the parser.
5//!
6//! # Strategy
//! - Portion candidates: scan for `(` with `memchr`, walk to the first `)` on
//!   the same line, and apply lightweight heuristics (span length between 3
//!   and 256 bytes; no nested `(`, line break, or form feed inside).
9//! - Banner candidates: scan for lines whose trimmed content begins with a
10//! known classification prefix (UNCLASSIFIED, CONFIDENTIAL, SECRET, TOP SECRET).
11//! - CAB candidates: scan for "Classified By:" label, walk to end of block.
12
13use marque_ism::span::{MarkingCandidate, MarkingType, Span};
14use memchr::memchr_iter;
15
16/// Phase 1 scanner. Stateless; call [`Scanner::scan`] on any byte buffer.
17pub struct Scanner;
18
19impl Scanner {
20 /// Scan `source` for classification marking candidates.
21 ///
22 /// Returns candidates in source order. Allocation is proportional to
23 /// the number of candidates found, not source length.
24 pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
25 let mut candidates = Vec::new();
26
27 Self::scan_portions(source, &mut candidates);
28 Self::scan_banners(source, &mut candidates);
29 Self::scan_cab(source, &mut candidates);
30 Self::scan_page_breaks(source, &mut candidates);
31
32 // Sort by `(start, kind_priority)`. PageBreak gets priority 0 so
33 // it sorts before any content candidate at the same offset — the
34 // engine's PageContext reset must run before a co-located banner
35 // or portion is processed, otherwise the reset is defeated by an
36 // unstable secondary order.
37 candidates.sort_unstable_by(|a, b| {
38 a.span
39 .start
40 .cmp(&b.span.start)
41 .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
42 });
43 candidates
44 }
45
46 /// Phase 3 — emit a `MarkingType::PageBreak` candidate at every form-feed
47 /// (`\f`) byte and at the third consecutive `\n` of a `\n\n\n+` run.
48 /// The engine uses these to reset `PageContext` so banner/CAB rules on
49 /// the next page see a fresh aggregate.
50 ///
51 /// PageBreak spans are zero-length and carry no parsable content; the
52 /// parser will reject them, so the engine must filter them out *before*
53 /// calling `parser.parse`.
54 fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
55 // Form-feed: every `\f` is a hard page break in pretty much every
56 // ASCII document convention. memchr is overkill at this scale but
57 // matches the rest of the scanner's idiom.
58 for pos in memchr_iter(b'\x0c', source) {
59 out.push(MarkingCandidate {
60 span: Span::new(pos, pos),
61 kind: MarkingType::PageBreak,
62 });
63 }
64 // Three-or-more consecutive `\n` is a soft page break under our
65 // heuristic. We emit one candidate at the third newline, then skip
66 // ahead until we leave the run, so a single blank gap between
67 // paragraphs (`\n\n`) does NOT trip the reset.
68 let mut run = 0usize;
69 for (i, &b) in source.iter().enumerate() {
70 if b == b'\n' {
71 run += 1;
72 if run == 3 {
73 out.push(MarkingCandidate {
74 span: Span::new(i, i),
75 kind: MarkingType::PageBreak,
76 });
77 }
78 } else if b != b'\r' {
79 run = 0;
80 }
81 }
82 }
83
84 fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
85 // Find every `(` and walk forward to the matching `)`.
86 for start in memchr_iter(b'(', source) {
87 if let Some(end) = find_portion_end(source, start) {
88 let span = Span::new(start, end + 1);
89 // Heuristic gate: minimum length `(U)` = 3, max reasonable = 256
90 if span.len() >= 3 && span.len() <= 256 {
91 out.push(MarkingCandidate {
92 span,
93 kind: MarkingType::Portion,
94 });
95 }
96 }
97 }
98 }
99
100 fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
101 // Classification prefixes that can start a banner line (full-word form only).
102 const BANNER_PREFIXES: &[&[u8]] =
103 &[b"TOP SECRET", b"SECRET", b"CONFIDENTIAL", b"UNCLASSIFIED"];
104
105 for line in source.split(|&b| b == b'\n') {
106 let trimmed = trim_ascii(line);
107 if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
108 // `line` is a subslice produced by split(), so its pointer lies
109 // within `source`. Subtraction yields the byte offset safely.
110 let start = line.as_ptr() as usize - source.as_ptr() as usize;
111 let end = start + line.len();
112 out.push(MarkingCandidate {
113 span: Span::new(start, end),
114 kind: MarkingType::Banner,
115 });
116 }
117 }
118 }
119
120 fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
121 const CAB_LABEL: &[u8] = b"Classified By:";
122 let mut search_from = 0;
123 while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
124 let pos = search_from + rel;
125 let end = find_cab_end(source, pos);
126 out.push(MarkingCandidate {
127 span: Span::new(pos, end),
128 kind: MarkingType::Cab,
129 });
130 search_from = end;
131 }
132 }
133}
134
135/// Sort priority for `MarkingCandidate` kinds at equal start offsets.
136/// PageBreak sorts first so the engine's `PageContext` reset runs before
137/// any co-located content candidate is processed (banner/portion/CAB at
138/// the same byte offset as a page break — an edge case, but hardened).
139fn kind_sort_priority(kind: MarkingType) -> u8 {
140 match kind {
141 MarkingType::PageBreak => 0,
142 _ => 1,
143 }
144}
145
/// Locate the `)` that closes the portion marking opened at `source[open]`.
///
/// Returns the index of that `)` within `source`, or `None` when the text
/// after `open` reaches something that cannot be part of a single-line
/// portion marking before any `)` is seen:
/// - `\n` / `\r`: a portion marking never spans lines
/// - `(`: nested parentheses are never valid
/// - `\x0c` (form feed): a page-break control byte cannot sit inside a
///   portion, and rejecting it keeps a spurious Portion from shadowing the
///   PageBreak candidate at that offset
///
/// `None` is also returned when the input ends first or `open` is the last
/// byte or beyond.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let tail = source.get(open + 1..)?;
    // Find the first byte that decides the outcome either way, then check
    // whether it was the closing parenthesis.
    let stop = tail
        .iter()
        .position(|byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;
    (tail[stop] == b')').then_some(open + 1 + stop)
}
165
/// Return the offset of the first occurrence of `needle` within `haystack`.
///
/// An empty `needle` matches at offset 0, as `str::find("")` does. `None` is
/// returned when `needle` does not occur, which includes the case where it is
/// longer than `haystack`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-sized windows, so the
    // empty needle has to be answered before reaching it.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
169
/// Find where the CAB block that begins at `start` ends.
///
/// The block ends at the first blank line: the returned offset is that of the
/// second `\n` in a pair of newlines separated by nothing but `\r` bytes.
/// When no blank line follows, the block runs to the end of the input and
/// `source.len()` is returned.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    let tail = &source[start..];
    // True while the most recent byte other than `\r` was a `\n`.
    let mut after_newline = false;
    for (rel, byte) in tail.iter().enumerate() {
        match *byte {
            b'\n' if after_newline => return start + rel,
            b'\n' => after_newline = true,
            // Carriage returns are transparent so CRLF blank lines count.
            b'\r' => {}
            _ => after_newline = false,
        }
    }
    source.len()
}
185
/// Strip leading and trailing ASCII whitespace from `s`.
///
/// Built on the standard-library slice trimmers (stable since Rust 1.80).
/// Because `\r` counts as ASCII whitespace, the trailing carriage return left
/// on CRLF lines produced by `split(b'\n')` is removed as well.
fn trim_ascii(s: &[u8]) -> &[u8] {
    s.trim_ascii_start().trim_ascii_end()
}
191
/// Unit tests for the Phase 1 scanner: candidate detection for portions,
/// banners and page breaks, plus the ordering guarantee for co-located
/// candidates.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        // A `\f` inside `(...)` is never a valid single-line portion.
        // Without this rejection the portion candidate would span the
        // form feed and shadow the PageBreak candidate at that offset.
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The PageBreak candidate at offset 3 should still be emitted.
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        // Form feed sits at offset 5 in `b"page1\x0cpage2"`.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        // Exactly one PageBreak — emitted at the *third* newline (offset 7),
        // not one per `\n` in the run.
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        // A normal paragraph break (`\n\n`) must NOT trip the reset, otherwise
        // every paragraph in a multi-page document looks like a fresh page.
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        // `\fSECRET\n`: the form feed at offset 0 yields a zero-length
        // PageBreak. The banner scanner trims ASCII whitespace (which
        // includes `\f`) only to match the prefix; the span it emits still
        // starts at the beginning of the line, i.e. also at offset 0. The
        // two candidates are therefore co-located, and the sort must place
        // the PageBreak first so the engine reset runs before the banner is
        // processed.
        let src = b"\x0cSECRET\n";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 2);
        assert_eq!(candidates[0].kind, MarkingType::PageBreak);
        assert_eq!(candidates[0].span.start, 0);
        assert_eq!(candidates[1].kind, MarkingType::Banner);
        assert_eq!(candidates[1].span.start, 0);

        // The sort key on its own: PageBreak outranks every content kind.
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_form_feed_inside_blank_run_emits_only_form_feed() {
        // `\n\n\f\n\n` — the form feed itself is one PageBreak; the surrounding
        // newlines do not also trip the 3-newline heuristic because the run
        // is broken by the `\f`.
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }
}
316}