Skip to main content

marque_core/
scanner.rs

1//! Phase 1: candidate detection — finds potential classification markings in a byte buffer.
2//!
3//! Uses `memchr` for SIMD-accelerated boundary detection. Zero heap allocation
4//! beyond the output `Vec<MarkingCandidate>`. Never invokes the parser.
5//!
6//! # Strategy
//! - Portion candidates: scan for `(` with `memchr`, walk to `)`, apply
//!   lightweight heuristics (single line, no nested `(` or form feed, length 3–256 bytes).
9//! - Banner candidates: scan for lines whose trimmed content begins with a
10//!   known classification prefix (UNCLASSIFIED, CONFIDENTIAL, SECRET, TOP SECRET).
11//! - CAB candidates: scan for "Classified By:" label, walk to end of block.
12
13use marque_ism::span::{MarkingCandidate, MarkingType, Span};
14use memchr::memchr_iter;
15
16/// Phase 1 scanner. Stateless; call [`Scanner::scan`] on any byte buffer.
17pub struct Scanner;
18
19impl Scanner {
20    /// Scan `source` for classification marking candidates.
21    ///
22    /// Returns candidates in source order. Allocation is proportional to
23    /// the number of candidates found, not source length.
24    pub fn scan(source: &[u8]) -> Vec<MarkingCandidate> {
25        let mut candidates = Vec::new();
26
27        Self::scan_portions(source, &mut candidates);
28        Self::scan_banners(source, &mut candidates);
29        Self::scan_cab(source, &mut candidates);
30        Self::scan_page_breaks(source, &mut candidates);
31
32        // Sort by `(start, kind_priority)`. PageBreak gets priority 0 so
33        // it sorts before any content candidate at the same offset — the
34        // engine's PageContext reset must run before a co-located banner
35        // or portion is processed, otherwise the reset is defeated by an
36        // unstable secondary order.
37        candidates.sort_unstable_by(|a, b| {
38            a.span
39                .start
40                .cmp(&b.span.start)
41                .then_with(|| kind_sort_priority(a.kind).cmp(&kind_sort_priority(b.kind)))
42        });
43        candidates
44    }
45
46    /// Phase 3 — emit a `MarkingType::PageBreak` candidate at every form-feed
47    /// (`\f`) byte and at the third consecutive `\n` of a `\n\n\n+` run.
48    /// The engine uses these to reset `PageContext` so banner/CAB rules on
49    /// the next page see a fresh aggregate.
50    ///
51    /// PageBreak spans are zero-length and carry no parsable content; the
52    /// parser will reject them, so the engine must filter them out *before*
53    /// calling `parser.parse`.
54    fn scan_page_breaks(source: &[u8], out: &mut Vec<MarkingCandidate>) {
55        // Form-feed: every `\f` is a hard page break in pretty much every
56        // ASCII document convention. memchr is overkill at this scale but
57        // matches the rest of the scanner's idiom.
58        for pos in memchr_iter(b'\x0c', source) {
59            out.push(MarkingCandidate {
60                span: Span::new(pos, pos),
61                kind: MarkingType::PageBreak,
62            });
63        }
64        // Three-or-more consecutive `\n` is a soft page break under our
65        // heuristic. We emit one candidate at the third newline, then skip
66        // ahead until we leave the run, so a single blank gap between
67        // paragraphs (`\n\n`) does NOT trip the reset.
68        let mut run = 0usize;
69        for (i, &b) in source.iter().enumerate() {
70            if b == b'\n' {
71                run += 1;
72                if run == 3 {
73                    out.push(MarkingCandidate {
74                        span: Span::new(i, i),
75                        kind: MarkingType::PageBreak,
76                    });
77                }
78            } else if b != b'\r' {
79                run = 0;
80            }
81        }
82    }
83
84    fn scan_portions(source: &[u8], out: &mut Vec<MarkingCandidate>) {
85        // Find every `(` and walk forward to the matching `)`.
86        for start in memchr_iter(b'(', source) {
87            if let Some(end) = find_portion_end(source, start) {
88                let span = Span::new(start, end + 1);
89                // Heuristic gate: minimum length `(U)` = 3, max reasonable = 256
90                if span.len() >= 3 && span.len() <= 256 {
91                    out.push(MarkingCandidate {
92                        span,
93                        kind: MarkingType::Portion,
94                    });
95                }
96            }
97        }
98    }
99
100    fn scan_banners(source: &[u8], out: &mut Vec<MarkingCandidate>) {
101        // Classification prefixes that can start a banner line (full-word form only).
102        const BANNER_PREFIXES: &[&[u8]] =
103            &[b"TOP SECRET", b"SECRET", b"CONFIDENTIAL", b"UNCLASSIFIED"];
104
105        for line in source.split(|&b| b == b'\n') {
106            let trimmed = trim_ascii(line);
107            if BANNER_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
108                // `line` is a subslice produced by split(), so its pointer lies
109                // within `source`. Subtraction yields the byte offset safely.
110                let start = line.as_ptr() as usize - source.as_ptr() as usize;
111                let end = start + line.len();
112                out.push(MarkingCandidate {
113                    span: Span::new(start, end),
114                    kind: MarkingType::Banner,
115                });
116            }
117        }
118    }
119
120    fn scan_cab(source: &[u8], out: &mut Vec<MarkingCandidate>) {
121        const CAB_LABEL: &[u8] = b"Classified By:";
122        let mut search_from = 0;
123        while let Some(rel) = find_subsequence(&source[search_from..], CAB_LABEL) {
124            let pos = search_from + rel;
125            let end = find_cab_end(source, pos);
126            out.push(MarkingCandidate {
127                span: Span::new(pos, end),
128                kind: MarkingType::Cab,
129            });
130            search_from = end;
131        }
132    }
133}
134
135/// Sort priority for `MarkingCandidate` kinds at equal start offsets.
136/// PageBreak sorts first so the engine's `PageContext` reset runs before
137/// any co-located content candidate is processed (banner/portion/CAB at
138/// the same byte offset as a page break — an edge case, but hardened).
139fn kind_sort_priority(kind: MarkingType) -> u8 {
140    match kind {
141        MarkingType::PageBreak => 0,
142        _ => 1,
143    }
144}
145
/// Locate the `)` that closes the portion marking opened at index `open`.
///
/// Returns the index of the closing parenthesis, or `None` when the
/// marking cannot be a valid single-line portion: the first significant
/// byte after `open` is a line break (`\n` / `\r`), a form feed (`\x0c`),
/// or a nested `(`; no `)` appears before the end of the buffer; or
/// `open + 1` lies beyond the buffer.
fn find_portion_end(source: &[u8], open: usize) -> Option<usize> {
    let body_start = open + 1;
    let body = source.get(body_start..)?;

    // The first byte from this set decides the outcome: `)` closes the
    // portion, anything else in the set disqualifies it. Rejecting the
    // form feed keeps a Portion from spanning a page-break byte.
    let stop = body
        .iter()
        .position(|&byte| matches!(byte, b')' | b'\n' | b'\r' | b'\x0c' | b'('))?;

    (body[stop] == b')').then_some(body_start + stop)
}
165
/// Return the offset of the first occurrence of `needle` in `haystack`.
///
/// Returns `None` when `needle` does not occur (including when it is
/// longer than `haystack`). An empty `needle` matches at offset 0.
fn find_subsequence(haystack: &[u8], needle: &[u8]) -> Option<usize> {
    // `slice::windows` panics when asked for zero-sized windows, so the
    // empty needle has to be answered before building the iterator.
    if needle.is_empty() {
        return Some(0);
    }
    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
169
/// Return the offset at which the CAB block beginning at `start` ends.
///
/// The block ends at the second `\n` of a blank line, meaning two newlines
/// separated by nothing except optional `\r` bytes. The returned offset is
/// the index of that second newline. If no blank line follows, the block
/// runs to the end of `source` and `source.len()` is returned.
fn find_cab_end(source: &[u8], start: usize) -> usize {
    let tail = &source[start..];

    // True while the most recent non-`\r` byte was a newline.
    let mut after_newline = false;
    let blank_line = tail.iter().position(|&byte| match byte {
        // Marks this newline as seen and reports a hit when the previous
        // significant byte was a newline too.
        b'\n' => std::mem::replace(&mut after_newline, true),
        // Carriage returns are transparent so CRLF blank lines are detected.
        b'\r' => false,
        _ => {
            after_newline = false;
            false
        }
    });

    blank_line.map_or(source.len(), |rel| start + rel)
}
185
/// Strip leading and trailing ASCII whitespace from `s`.
///
/// Trailing `\r` is removed as well, so lines split on `\n` from
/// CRLF-terminated input come back without their carriage return.
fn trim_ascii(s: &[u8]) -> &[u8] {
    // Trim each end separately using the stdlib helpers (stable since Rust 1.80).
    let without_leading = s.trim_ascii_start();
    without_leading.trim_ascii_end()
}
191
#[cfg(test)]
mod tests {
    use super::*;

    /// Scan `src` and keep only the candidates of the requested `kind`,
    /// in the order the scanner returned them.
    fn scan_kind(src: &[u8], kind: MarkingType) -> Vec<MarkingCandidate> {
        Scanner::scan(src)
            .into_iter()
            .filter(|candidate| candidate.kind == kind)
            .collect()
    }

    #[test]
    fn detects_portion_marking() {
        let src = b"(TS//SI//NF) This paragraph is classified.";
        let all = Scanner::scan(src);
        assert_eq!(all.len(), 1);
        let only = &all[0];
        assert_eq!(only.kind, MarkingType::Portion);
        assert_eq!(only.span.as_str(src).unwrap(), "(TS//SI//NF)");
    }

    #[test]
    fn detects_banner() {
        let src = b"TOP SECRET//NOFORN\n\nSome content here.\n";
        assert!(!scan_kind(src, MarkingType::Banner).is_empty());
    }

    #[test]
    fn rejects_newline_in_portion() {
        let src = b"(TS\n//NF) not a real marking";
        assert!(scan_kind(src, MarkingType::Portion).is_empty());
    }

    #[test]
    fn rejects_form_feed_in_portion() {
        // Parentheses containing a `\f` can never be a single-line
        // portion. If one were accepted, its span would cover the form
        // feed and shadow the PageBreak candidate at that offset.
        let src = b"(TS\x0c//NF)";
        assert!(
            scan_kind(src, MarkingType::Portion).is_empty(),
            "form feed inside portion parens must not produce a Portion candidate"
        );
        // The form feed at offset 3 must still yield its PageBreak.
        assert!(
            scan_kind(src, MarkingType::PageBreak)
                .iter()
                .any(|c| c.span.start == 3),
            "expected PageBreak at form-feed offset 3"
        );
    }

    #[test]
    fn detects_page_break_form_feed() {
        let src = b"page1\x0cpage2";
        let breaks = scan_kind(src, MarkingType::PageBreak);
        assert_eq!(breaks.len(), 1);
        // The form feed is byte 5 of `b"page1\x0cpage2"`; the span is zero-length.
        assert_eq!(breaks[0].span.start, 5);
        assert_eq!(breaks[0].span.end, 5);
    }

    #[test]
    fn detects_page_break_blank_line_run() {
        let src = b"page1\n\n\npage2";
        let breaks = scan_kind(src, MarkingType::PageBreak);
        // One PageBreak for the whole run, placed on the *third* newline
        // (offset 7) rather than one per `\n`.
        assert_eq!(breaks.len(), 1);
        assert_eq!(breaks[0].span.start, 7);
    }

    #[test]
    fn double_newline_does_not_emit_page_break() {
        // An ordinary paragraph gap (`\n\n`) must not reset the page,
        // or every paragraph would look like the start of a new page.
        let src = b"paragraph one\n\nparagraph two";
        assert!(
            scan_kind(src, MarkingType::PageBreak).is_empty(),
            "double newline should not produce a PageBreak candidate"
        );
    }

    #[test]
    fn page_break_sorts_before_co_located_content() {
        // The sort must put a PageBreak ahead of any content candidate
        // starting at the same offset so the engine reset runs first.
        // In `\fSECRET\n` the form feed is at offset 0 while the banner
        // line starts at offset 1, so that input does not produce
        // co-located candidates. The sort key is therefore checked
        // directly via `kind_sort_priority`.
        let page_break = kind_sort_priority(MarkingType::PageBreak);
        assert_eq!(page_break, 0);
        assert!(page_break < kind_sort_priority(MarkingType::Banner));
        assert!(page_break < kind_sort_priority(MarkingType::Portion));
        assert!(page_break < kind_sort_priority(MarkingType::Cab));
    }

    #[test]
    fn page_break_form_feed_inside_blank_run_emits_both() {
        // `\n\n\f\n\n`: the form feed produces one PageBreak. The `\f`
        // also interrupts the newline run, so neither side reaches three
        // newlines and the blank-line heuristic stays quiet.
        let src = b"a\n\n\x0c\n\nb";
        let breaks = scan_kind(src, MarkingType::PageBreak);
        assert_eq!(breaks.len(), 1, "only the form-feed should fire here");
    }
}
316}