Skip to main content

marque_core/
scanner.rs

// SPDX-FileCopyrightText: 2026 Knitli Inc.
//
// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
//! Phase 1: candidate detection — finds potential classification markings in a byte buffer.
//!
//! Uses `memchr` for SIMD-accelerated boundary detection. Zero heap allocation
//! beyond the output `Vec<MarkingCandidate>`. Never invokes the parser.
//!
//! # Strategy
//! - Portion candidates: scan for `(` with `memchr`, walk to `)`, apply
//!   lightweight heuristics (length between 3 and 256 bytes, single line,
//!   no nested `(`).
//! - Banner candidates: scan for lines whose trimmed content begins with a
//!   known classification prefix (TOP SECRET, SECRET, CONFIDENTIAL,
//!   RESTRICTED, UNCLASSIFIED, the abbreviated `TS//`, `S//`, `C//`, `U//`
//!   forms, or a bare leading `//` for non-US markings).
//! - CAB candidates: scan for the "Classified By:" label, walk to the end of
//!   the block (first blank line or end of input).
//! - Page-break candidates: every form feed (`\f`) and the third newline of
//!   a run of three or more consecutive newlines.
16
17use marque_ism::span::{MarkingCandidate, MarkingType, Span};
18use memchr::memchr_iter;
19
/// Phase 1 scanner. Stateless; call [`Scanner::scan`] on any byte buffer.
///
/// This is a field-less unit struct: all functionality is exposed through
/// associated functions, so no instance is required to scan. The derives
/// make the marker type printable and trivially constructible/copyable.
#[derive(Debug, Clone, Copy, Default)]
pub struct Scanner;
22
23impl Scanner {
24    /// Scan `source` for classification marking candidates.
25    ///
26    /// Returns candidates in source order. Allocation is proportional to
27    /// the number of candidates found, not source length.
28    pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
29        let mut candidates = Vec::new();
30
31        Self::scan_portions(source, &mut candidates);
32        Self::scan_banners(source, &mut candidates);
33        Self::scan_cab(source, &mut candidates);
34        Self::scan_page_breaks(source, &mut candidates);
35
36        // Sort by `(start, kind_priority)`. PageBreak gets priority 0 so
37        // it sorts before any content candidate at the same offset — the
38        // engine's PageContext reset must run before a co-located banner
39        // or portion is processed, otherwise the reset is defeated by an
40        // unstable secondary order.
41        candidates.sort_unstable_by(|a, b| {
42            a.span
43                .start
44                .cmp(&b.span.start)
45                .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
46        });
47        candidates
48    }
49
50    /// Phase 3 — emit a `MarkingType::PageBreak` candidate at every form-feed
51    /// (`\f`) byte and at the third consecutive `\n` of a `\n\n\n+` run.
52    /// The engine uses these to reset `PageContext` so banner/CAB rules on
53    /// the next page see a fresh aggregate.
54    ///
55    /// PageBreak spans are zero-length and carry no parsable content; the
56    /// parser will reject them, so the engine must filter them out *before*
57    /// calling `parser.parse`.
58    fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
59        // Form-feed: every `\f` is a hard page break in pretty much every
60        // ASCII document convention. memchr is overkill at this scale but
61        // matches the rest of the scanner's idiom.
62        for pos in memchr_iter(b'\x0c', source) {
63            out.push(MarkingCandidate {
64                span: Span::new(pos, pos),
65                kind: MarkingType::PageBreak,
66            });
67        }
68        // Three-or-more consecutive `\n` is a soft page break under our
69        // heuristic. We emit one candidate at the third newline, then skip
70        // ahead until we leave the run, so a single blank gap between
71        // paragraphs (`\n\n`) does NOT trip the reset.
72        let mut run = 0usize;
73        for (i, &b) in source.iter().enumerate() {
74            if b == b'\n' {
75                run += 1;
76                if run == 3 {
77                    out.push(MarkingCandidate {
78                        span: Span::new(i, i),
79                        kind: MarkingType::PageBreak,
80                    });
81                }
82            } else if b != b'\r' {
83                run = 0;
84            }
85        }
86    }
87
88    fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
89        // Find every `(` and walk forward to the matching `)`.
90        for start in memchr_iter(b'(', source) {
91            if let Some(end) = find_portion_end(source, start) {
92                let span = Span::new(start, end + 1);
93                // Heuristic gate: minimum length `(U)` = 3, max reasonable = 256
94                if span.len() >= 3 && span.len() <= 256 {
95                    out.push(MarkingCandidate {
96                        span,
97                        kind: MarkingType::Portion,
98                    });
99                }
100            }
101        }
102    }
103
104    fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
105        // Classification prefixes that can start a banner line.
106        // Full-form US classifications are listed first. Abbreviated US forms
107        // (`TS//`, `S//`, `C//`, `U//`) are included so rules like E001 (portion
108        // abbreviation in banner context) can fire on abbreviated banners.
109        // `//` detects non-US classifications (FGI, NATO, JOINT) where the
110        // US classification slot is empty. `RESTRICTED` supports foreign-origin
111        // markings with the RESTRICTED level.
112        const BANNER_PREFIXES: &[&[u8]] = &[
113            b"TOP SECRET",
114            b"TS//",
115            b"SECRET",
116            b"S//",
117            b"CONFIDENTIAL",
118            b"C//",
119            b"RESTRICTED",
120            b"UNCLASSIFIED",
121            b"U//",
122            b"//",
123        ];
124
125        for line in source.split(|&b| b == b'\n') {
126            let trimmed = trim_ascii(line);
127            if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
128                // `line` is a subslice produced by split(), so its pointer lies
129                // within `source`. Subtraction yields the byte offset safely.
130                let start = line.as_ptr() as usize - source.as_ptr() as usize;
131                let end = start + line.len();
132                out.push(MarkingCandidate {
133                    span: Span::new(start, end),
134                    kind: MarkingType::Banner,
135                });
136            }
137        }
138    }
139
140    fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
141        const CAB_LABEL: &[u8] = b"Classified By:";
142        let mut search_from = 0;
143        while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
144            let pos = search_from + rel;
145            let end = find_cab_end(source, pos);
146            out.push(MarkingCandidate {
147                span: Span::new(pos, end),
148                kind: MarkingType::Cab,
149            });
150            search_from = end;
151        }
152    }
153}
154
155/// Sort priority for `MarkingCandidate` kinds at equal start offsets.
156/// PageBreak sorts first so the engine's `PageContext` reset runs before
157/// any co-located content candidate is processed (banner/portion/CAB at
158/// the same byte offset as a page break — an edge case, but hardened).
159fn kind_sort_priority(kind: MarkingType) -> u8 {
160    match kind {
161        MarkingType::PageBreak => 0,
162        _ => 1,
163    }
164}
165
/// Locate the `)` that closes a portion marking opened at index `open`.
///
/// Walks the bytes after `open` and returns the index (into `source`) of the
/// first `)`. Returns `None` when:
///   - `open + 1` is past the end of `source` (or would overflow `usize`),
///   - the input ends before a `)` is seen, or
///   - a byte that cannot legitimately appear inside a single-line portion
///     marking is met first:
///       - `\n` / `\r`: portion markings are always on a single line
///       - `(`: nested parens are never valid
///       - `\x0c` (form feed): a page-break control character cannot
///         appear inside a portion. Rejecting it here keeps a
///         PageBreak candidate from being shadowed by a spurious
///         Portion that spans the form feed.
///
/// The byte at `open` itself is not inspected.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    // `checked_add` keeps an absurd `open` (usize::MAX) from overflowing.
    let first = open.checked_add(1)?;
    let body = source.get(first..)?;

    // Index of the first byte that decides the outcome either way.
    let stop = body
        .iter()
        .position(|&b| matches!(b, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;

    // Only a closing paren is a success; every other stop byte is a reject.
    (body[stop] == b')').then_some(first + stop)
}
185
/// Return the index of the first occurrence of `needle` in `haystack`.
///
/// An empty `needle` matches at offset 0 (`slice::windows` would panic on a
/// zero window size, so that case is answered up front). Returns `None` when
/// `needle` does not occur, including when it is longer than `haystack`.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
189
/// Return the exclusive end offset of the CAB block that begins at `start`.
///
/// The block ends at the first blank line or at EOF. A blank line is a `\n`
/// that follows an earlier `\n` with nothing but `\r` bytes in between; the
/// returned offset is the index of that second `\n`. When no blank line is
/// found, or when `start` lies beyond the end of `source`, `source.len()`
/// is returned.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    // An out-of-range `start` has no block to walk: report EOF, don't panic.
    let Some(tail) = source.get(start..) else {
        return source.len();
    };

    let mut after_newline = false;
    for (offset, &byte) in tail.iter().enumerate() {
        match byte {
            // Second newline in a row: the blank line closes the block.
            b'\n' if after_newline => return start + offset,
            b'\n' => after_newline = true,
            // `\r` is transparent so `\r\n\r\n` also counts as a blank line.
            b'\r' => {}
            _ => after_newline = false,
        }
    }
    source.len()
}
205
/// Strip leading and trailing ASCII whitespace from `s`.
///
/// Delegates to the stdlib `<[u8]>::trim_ascii` (stable since Rust 1.80).
/// Its whitespace set is space, tab, `\n`, `\r` and form feed (`\x0c`), so
/// the `\r` left over from CRLF input split on `\n` is removed, and so is a
/// leading form feed.
fn trim_ascii(s: &[u8]) -> &[u8] {
    <[u8]>::trim_ascii(s)
}
211
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let candidates = Scanner::scan(src);
        assert_eq!(candidates.len(), 1);
        assert_eq!(candidates[0].kind, MarkingType::Portion);
        assert_eq!(candidates[0].span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().all(|c| c.kind != MarkingType::Portion));
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        // A `\f` inside `(...)` is never a valid single-line portion.
        // Without this rejection the portion candidate would span the
        // form feed and shadow the PageBreak candidate at that offset.
        let src = b"(TS\x0c//NF)";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Portion),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The PageBreak candidate at offset 3 should still be emitted.
        assert!(
            candidates
                .iter()
                .any(|c| c.kind == MarkingType::PageBreak && c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1);
        // Form feed sits at offset 5 in `b"page1\x0cpage2"`.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        // Exactly one PageBreak — emitted at the *third* newline (offset 7),
        // not one per `\n` in the run.
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        // A normal paragraph break (`\n\n`) must NOT trip the reset, otherwise
        // every paragraph in a multi-page document looks like a fresh page.
        let src = b"paragraph one\n\nparagraph two";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::PageBreak),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        // `\fSECRET\n`: the form feed at offset 0 yields a zero-length
        // PageBreak at 0. The banner pass reports the *raw* line span, whose
        // start is also 0, and still recognises the line because
        // `trim_ascii` strips the leading `\f` (form feed is ASCII
        // whitespace) before the prefix test. Both candidates therefore
        // start at offset 0, and the sort must place PageBreak first so the
        // engine reset runs before the banner is processed.
        let src = b"\x0cSECRET\n";
        let candidates = Scanner::scan(src);
        let at_zero: Vec<_> = candidates.iter().filter(|c| c.span.start == 0).collect();
        assert_eq!(
            at_zero.len(),
            2,
            "expected a PageBreak and a Banner co-located at offset 0"
        );
        assert_eq!(at_zero[0].kind, MarkingType::PageBreak);
        assert_eq!(at_zero[1].kind, MarkingType::Banner);

        // The sort key itself: PageBreak outranks every content kind.
        assert_eq!(kind_sort_priority(MarkingType::PageBreak), 0);
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Banner)
        );
        assert!(
            kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Portion)
        );
        assert!(kind_sort_priority(MarkingType::PageBreak) < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_form_feed_inside_blank_run_emits_both() {
        // `\n\n\f\n\n` — the form feed itself is one PageBreak; the surrounding
        // newlines do not also trip the 3-newline heuristic because the run
        // is broken by the `\f`. Only a single PageBreak is expected.
        let src = b"a\n\n\x0c\n\nb";
        let candidates = Scanner::scan(src);
        let breaks: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::PageBreak)
            .collect();
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }

    // --- Non-US banner detection ---

    #[test]
    fn detects_non_us_banner_nato() {
        let src = b"//NATO SECRET//REL TO USA, GBR\n";
        let candidates = Scanner::scan(src);
        let banners: Vec<_> = candidates
            .iter()
            .filter(|c| c.kind == MarkingType::Banner)
            .collect();
        assert_eq!(banners.len(), 1);
    }

    #[test]
    fn detects_non_us_banner_portion_form() {
        let src = b"//NS//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn detects_restricted_banner() {
        let src = b"RESTRICTED//NF\n";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Banner));
    }

    #[test]
    fn non_us_portion_detected_by_existing_scanner() {
        // Portions starting with (// should already be detected via `(`.
        let src = b"(//NS//REL TO USA, GBR)";
        let candidates = Scanner::scan(src);
        assert!(candidates.iter().any(|c| c.kind == MarkingType::Portion));
    }

    #[test]
    fn double_slash_mid_line_is_not_banner() {
        // `//` not at start of trimmed line should not produce a banner.
        let src = b"some text // not a marking\n";
        let candidates = Scanner::scan(src);
        assert!(
            candidates.iter().all(|c| c.kind != MarkingType::Banner),
            "// in middle of line should not produce a banner candidate"
        );
    }
}