// marque_core/scanner.rs

1//! Phase 1: candidate detection — finds potential classification markings in a byte buffer.
2//!
3//! Uses `memchr` for SIMD-accelerated boundary detection. Zero heap allocation
4//! beyond the output `Vec<MarkingCandidate>`. Never invokes the parser.
5//!
6//! # Strategy
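//! - Portion candidates: scan for `(` with `memchr`, walk to the closing `)`
//!   on the same line, and apply lightweight heuristics (no nested `(`, no
//!   form feed, total length between 3 and 256 bytes).
//! - Banner candidates: scan for lines whose trimmed content begins with a
//!   known classification prefix (TOP SECRET, SECRET, CONFIDENTIAL,
//!   RESTRICTED, UNCLASSIFIED) or with `//` (non-US markings whose US
//!   classification slot is empty).
//! - CAB candidates: scan for the "Classified By:" label and extend the
//!   candidate to the next blank line or the end of input.
//! - Page-break candidates: zero-length markers at every form feed and at the
//!   third newline of a run of three or more newlines.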
12
13use marque_ism::span::{MarkingCandidate, MarkingType, Span};
14use memchr::memchr_iter;
15
/// Phase 1 scanner. Stateless; call [`Scanner::scan`] on any byte buffer.
///
/// The type carries no data, so it is freely copyable and has a trivial
/// default; `Debug` is derived so it can appear inside larger debuggable
/// structures.
#[derive(Debug, Clone, Copy, Default)]
pub struct Scanner;
18
19impl Scanner {
20    /// Scan `source` for classification marking candidates.
21    ///
22    /// Returns candidates in source order. Allocation is proportional to
23    /// the number of candidates found, not source length.
24    pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
25        let mut candidates = Vec::new();
26
27        Self::scan_portions(source, &mut candidates);
28        Self::scan_banners(source, &mut candidates);
29        Self::scan_cab(source, &mut candidates);
30        Self::scan_page_breaks(source, &mut candidates);
31
32        // Sort by `(start, kind_priority)`. PageBreak gets priority 0 so
33        // it sorts before any content candidate at the same offset — the
34        // engine's PageContext reset must run before a co-located banner
35        // or portion is processed, otherwise the reset is defeated by an
36        // unstable secondary order.
37        candidates.sort_unstable_by(|a, b| {
38            a.span
39                .start
40                .cmp(&b.span.start)
41                .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
42        });
43        candidates
44    }
45
46    /// Phase 3 — emit a `MarkingType::PageBreak` candidate at every form-feed
47    /// (`\f`) byte and at the third consecutive `\n` of a `\n\n\n+` run.
48    /// The engine uses these to reset `PageContext` so banner/CAB rules on
49    /// the next page see a fresh aggregate.
50    ///
51    /// PageBreak spans are zero-length and carry no parsable content; the
52    /// parser will reject them, so the engine must filter them out *before*
53    /// calling `parser.parse`.
54    fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
55        // Form-feed: every `\f` is a hard page break in pretty much every
56        // ASCII document convention. memchr is overkill at this scale but
57        // matches the rest of the scanner's idiom.
58        for pos in memchr_iter(b'\x0c', source) {
59            out.push(MarkingCandidate {
60                span: Span::new(pos, pos),
61                kind: MarkingType::PageBreak,
62            });
63        }
64        // Three-or-more consecutive `\n` is a soft page break under our
65        // heuristic. We emit one candidate at the third newline, then skip
66        // ahead until we leave the run, so a single blank gap between
67        // paragraphs (`\n\n`) does NOT trip the reset.
68        let mut run = 0usize;
69        for (i, &b) in source.iter().enumerate() {
70            if b == b'\n' {
71                run += 1;
72                if run == 3 {
73                    out.push(MarkingCandidate {
74                        span: Span::new(i, i),
75                        kind: MarkingType::PageBreak,
76                    });
77                }
78            } else if b != b'\r' {
79                run = 0;
80            }
81        }
82    }
83
84    fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
85        // Find every `(` and walk forward to the matching `)`.
86        for start in memchr_iter(b'(', source) {
87            if let Some(end) = find_portion_end(source, start) {
88                let span = Span::new(start, end + 1);
89                // Heuristic gate: minimum length `(U)` = 3, max reasonable = 256
90                if span.len() >= 3 && span.len() <= 256 {
91                    out.push(MarkingCandidate {
92                        span,
93                        kind: MarkingType::Portion,
94                    });
95                }
96            }
97        }
98    }
99
100    fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
101        // Classification prefixes that can start a banner line.
102        // `//` detects non-US classifications (FGI, NATO, JOINT) where the
103        // US classification slot is empty. `RESTRICTED` supports foreign-origin
104        // markings with the RESTRICTED level.
105        const BANNER_PREFIXES: &[&[u8]] = &[
106            b"TOP SECRET",
107            b"SECRET",
108            b"CONFIDENTIAL",
109            b"RESTRICTED",
110            b"UNCLASSIFIED",
111            b"//",
112        ];
113
114        for line in source.split(|&b| b == b'\n') {
115            let trimmed = trim_ascii(line);
116            if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
117                // `line` is a subslice produced by split(), so its pointer lies
118                // within `source`. Subtraction yields the byte offset safely.
119                let start = line.as_ptr() as usize - source.as_ptr() as usize;
120                let end = start + line.len();
121                out.push(MarkingCandidate {
122                    span: Span::new(start, end),
123                    kind: MarkingType::Banner,
124                });
125            }
126        }
127    }
128
129    fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
130        const CAB_LABEL: &[u8] = b"Classified By:";
131        let mut search_from = 0;
132        while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
133            let pos = search_from + rel;
134            let end = find_cab_end(source, pos);
135            out.push(MarkingCandidate {
136                span: Span::new(pos, end),
137                kind: MarkingType::Cab,
138            });
139            search_from = end;
140        }
141    }
142}
143
144/// Sort priority for `MarkingCandidate` kinds at equal start offsets.
145/// PageBreak sorts first so the engine's `PageContext` reset runs before
146/// any co-located content candidate is processed (banner/portion/CAB at
147/// the same byte offset as a page break — an edge case, but hardened).
148fn kind_sort_priority(kind: MarkingType) -> u8 {
149    match kind {
150        MarkingType::PageBreak => 0,
151        _ => 1,
152    }
153}
154
/// Returns the index of the `)` that closes the portion opened at `open`.
///
/// Scanning starts just after `open` and stops at the first byte that
/// decides the outcome:
///   - `)` closes the portion, and its index in `source` is returned;
///   - `\n` / `\r` reject it, since a portion marking sits on a single line;
///   - `(` rejects it, since nested parentheses are never valid;
///   - `\x0c` (form feed) rejects it, so a PageBreak candidate is never
///     shadowed by a spurious Portion spanning the form feed.
///
/// Returns `None` on rejection, when no `)` follows, or when `open + 1` is
/// past the end of `source`.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let body_start = open + 1;
    let body = source.get(body_start..)?;
    let stop = body
        .iter()
        .position(|&byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;
    // Only a closing paren is a successful stop; every other stop byte
    // disqualifies the candidate.
    (body[stop] == b')').then_some(body_start + stop)
}
174
/// Returns the offset of the first occurrence of `needle` in `haystack`,
/// or `None` when it does not occur.
///
/// An empty `needle` matches at offset 0, mirroring `str::find("")`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        // `slice::windows` panics when asked for zero-sized windows, so the
        // empty needle has to be answered before reaching it.
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
178
/// Returns the offset at which the CAB block beginning at `start` ends.
///
/// The block ends at the first blank line: the returned offset is the index
/// of the second `\n` of two newlines separated by nothing but optional
/// `\r` bytes. If no blank line follows, the block runs to `source.len()`.
///
/// Panics if `start` is greater than `source.len()`.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    // True while the most recent byte other than `\r` was a `\n`.
    let mut after_newline = false;
    let blank_line = source[start..].iter().position(|&byte| match byte {
        // A newline directly after a newline is the blank line; otherwise
        // just remember that we have seen one.
        b'\n' => std::mem::replace(&mut after_newline, true),
        // Carriage returns are transparent so CRLF blank lines still match.
        b'\r' => false,
        _ => {
            after_newline = false;
            false
        }
    });
    blank_line.map_or(source.len(), |relative| start + relative)
}
194
/// Strips leading and trailing ASCII whitespace from `s`.
///
/// Relies on the standard-library trimming helpers (stable since Rust 1.80),
/// whose notion of whitespace includes `\r`, so lines produced by splitting
/// CRLF text on `\n` lose their trailing carriage return.
fn trim_ascii(s: &[u8]) -> &[u8] {
    s.trim_ascii_start().trim_ascii_end()
}
200
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the candidates of the given `kind`, preserving scan order.
    fn of_kind(candidates: &[MarkingCandidate], kind: MarkingType) -> Vec<&MarkingCandidate> {
        candidates.iter().filter(|c| c.kind == kind).collect()
    }

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        // A `\f` inside `(...)` is never a valid single-line portion.
        // Without this rejection the portion candidate would span the
        // form feed and shadow the PageBreak candidate at that offset.
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The PageBreak candidate at offset 3 should still be emitted.
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks = of_kind(&candidates, MarkingType::PageBreak);
        assert_eq!(breaks.len(), 1);
        // Form feed sits at offset 5 in `b"page1\x0cpage2"`.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks = of_kind(&candidates, MarkingType::PageBreak);
        // Exactly one PageBreak — emitted at the *third* newline (offset 7),
        // not one per `\n` in the run.
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn crlf_blank_line_run_emits_page_break() {
        // Carriage returns do not interrupt a newline run, so three CRLF
        // line endings in a row behave like `\n\n\n`. The third `\n` sits at
        // offset 10 (`page1` is 0..5, then `\r\n` pairs at 5/6, 7/8, 9/10).
        let src = b"page1\r\n\r\n\r\npage2";
        let candidates = Scanner::scan(src);
        let breaks = of_kind(&candidates, MarkingType::PageBreak);
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 10);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        // A normal paragraph break (`\n\n`) must NOT trip the reset, otherwise
        // every paragraph in a multi-page document looks like a fresh page.
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        // `\fSECRET\n`: the banner scanner trims ASCII whitespace — which
        // includes the form feed — before matching the prefix, but the span
        // it emits covers the whole untrimmed line. The Banner therefore
        // starts at offset 0, exactly where the zero-length PageBreak for the
        // `\f` sits. The sort must place the PageBreak first so the engine
        // reset runs before the banner is processed.
        let src = b"\x0cSECRET\n";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 2);
        assert_eq!(candidates[0].kind, MarkingType::PageBreak);
        assert_eq!(candidates[0].span.start, 0);
        assert_eq!(candidates[1].kind, MarkingType::Banner);
        assert_eq!(candidates[1].span.start, 0);

        // The sort key itself: PageBreak ranks ahead of every content kind.
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_form_feed_inside_blank_run_emits_both() {
        // `\n\n\f\n\n` — the form feed itself is one PageBreak; the surrounding
        // newlines do not also trip the 3-newline heuristic because the run
        // is broken by the `\f`. Despite the test name, exactly one PageBreak
        // is expected.
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks = of_kind(&candidates, MarkingType::PageBreak);
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }

    // --- CAB detection ---

    #[test]
    fn detects_cab_block_up_to_blank_line() {
        let src = b"Classified By: J. Doe\nDerived From: Source X\n\nBody text.\n";
        let candidates = Scanner::scan(src);
        let cabs = of_kind(&candidates, MarkingType::Cab);
        assert_eq!(cabs.len(), 1);
        assert_eq!(cabs[0].span.start, 0);
        // The block ends at the second `\n` of the blank line.
        let blank = src.windows(2).position(|w| w == b"\n\n").unwrap();
        assert_eq!(cabs[0].span.end, blank + 1);
    }

    // --- Non-US banner detection ---

    #[test]
    fn detects_non_us_banner_nato() {
        let src = b"//NATO SECRET//REL TO USA, GBR\n";
        let candidates = Scanner::scan(src);
        let banners = of_kind(&candidates, MarkingType::Banner);
        assert_eq!(banners.len(), 1);
    }

    #[test]
    fn detects_non_us_banner_portion_form() {
        let src = b"//NS//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn detects_restricted_banner() {
        let src = b"RESTRICTED//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn non_us_portion_detected_by_existing_scanner() {
        // Portions starting with (// should already be detected via `(`.
        let src = b"(//NS//REL TO USA, GBR)";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Portion));
    }

    #[test]
    fn double_slash_mid_line_is_not_banner() {
        // `//` not at start of trimmed line should not produce a banner.
        let src = b"some text // not a marking\n";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Banner),
            "// in middle of line should not produce a banner candidate"
        );
    }
}
371}