Skip to main content

marque_engine/
recognizer.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! [`Recognizer`] implementations for the engine's strict dispatch path.
6//!
7//! Phase-4 PR-2 (T058 + T063) wraps `marque-core`'s existing strict
8//! parser behind the domain-neutral [`Recognizer`] trait so
9//! [`crate::Engine::lint`] dispatches parsing through
10//! `Arc<dyn Recognizer<S>>` instead of instantiating `Parser` inline.
11//! Phase-4 PR-3 will add a `DecoderRecognizer` alongside this one;
12//! both coexist behind the same trait object.
13//!
14//! ## Why this lives in `marque-engine`, not `marque-capco`
15//!
16//! Constitution VII forbids `marque-capco` from depending on
17//! `marque-core`. `StrictRecognizer` wraps `marque_core::Parser` and
18//! produces [`CapcoMarking`](marque_capco::CapcoMarking) values — it
19//! therefore needs both chains, and the constitutional dep-graph
20//! names `marque-engine` as the sole convergence crate. The scheme-
21//! adapter pattern from Phase 3 stays intact: `marque-capco` owns
22//! `CapcoScheme` / `CapcoMarking`; the engine owns dispatch.
23//!
24//! ## Span-offset contract
25//!
26//! The [`Recognizer`] trait contract is "given bytes, return a
27//! [`Parsed<M>`] whose internal spans are relative to the input
28//! bytes" (foundational-plan "spans are by offset into this buffer").
29//! Rules in `marque-capco` expect source-relative spans, so the engine
30//! shifts token spans after `recognize()` returns via
31//! [`shift_token_spans`]. That post-processing is the natural seam —
32//! the engine is the only code that sees both the full source buffer
33//! and the candidate's source offset.
34//!
35//! ## Zero-candidate = no fabricated marking
36//!
37//! On a strict-parse failure the recognizer returns
38//! `Parsed::Ambiguous { candidates: vec![] }` — the zero-candidate
39//! form mandated by the trait contract (foundational-plan
40//! line 609-612). Callers MUST treat that as "no plausible
41//! interpretation," not as a silently-fabricated marking.
42
43use marque_capco::{CapcoMarking, CapcoScheme};
44use marque_core::Parser;
45use marque_ism::{
46    CapcoTokenSet, Classification, IsmAttributes, MarkingClassification,
47    span::{MarkingCandidate, MarkingType, Span},
48};
49use marque_scheme::ambiguity::Parsed;
50use marque_scheme::recognizer::{ParseContext, Recognizer};
51
/// Recognizer for the strict dispatch path.
///
/// All recognition work is handed to the existing [`Parser`], which
/// accepts only the canonical CAPCO-2016 grammar — so anything this
/// type reports as a marking matched that grammar exactly (zero false
/// positives by construction).
///
/// The type carries no state and is trivially cheap to build and copy.
/// One instance sits behind an `Arc` for as long as its owning
/// [`crate::Engine`] lives.
#[derive(Clone, Copy, Debug, Default)]
pub struct StrictRecognizer;

impl StrictRecognizer {
    /// Build a strict-path recognizer. Usable in `const` contexts.
    pub const fn new() -> Self {
        StrictRecognizer
    }
}
67
68impl Recognizer<CapcoScheme> for StrictRecognizer {
69    fn recognize(&self, bytes: &[u8], _cx: &ParseContext) -> Parsed<CapcoMarking> {
70        // `_cx.strict_evidence` is always satisfied here — this
71        // recognizer only emits candidates that hit the strict grammar.
72        // `zone` / `position` are rule-side concerns, not parser input.
73        let Some(kind) = infer_marking_type(bytes) else {
74            return Parsed::Ambiguous {
75                candidates: Vec::new(),
76            };
77        };
78        let token_set = CapcoTokenSet;
79        let parser = Parser::new(&token_set);
80        let leading_ws = if matches!(kind, MarkingType::Portion) {
81            bytes.iter().take_while(|b| b.is_ascii_whitespace()).count()
82        } else {
83            0
84        };
85        let parse_bytes = &bytes[leading_ws..];
86        let candidate = MarkingCandidate {
87            span: Span::new(0, parse_bytes.len()),
88            kind,
89        };
90        match parser.parse(&candidate, parse_bytes) {
91            Ok(mut parsed) => {
92                if leading_ws != 0 {
93                    shift_token_spans(&mut parsed.attrs, leading_ws);
94                }
95                let marking = CapcoMarking::new(parsed.attrs);
96                // Reject `Us(Restricted)` markings. RESTRICTED is by
97                // definition a non-US classification level — see
98                // [`is_us_restricted`] for the full rationale and
99                // why `fgi_marker.is_some()` does not redeem the
100                // marking.
101                if is_us_restricted(&marking) {
102                    return Parsed::Ambiguous {
103                        candidates: Vec::new(),
104                    };
105                }
106                Parsed::Unambiguous(marking)
107            }
108            Err(_) => Parsed::Ambiguous {
109                candidates: Vec::new(),
110            },
111        }
112    }
113}
114
115/// True when the marking is classified as `Us(Restricted)`.
116///
117/// CAPCO §H.7: RESTRICTED is by definition a non-US classification
118/// level. A US document cannot be RESTRICTED. Every legitimate
119/// foreign-origin RESTRICTED form parses to a non-US variant of
120/// [`MarkingClassification`] — `Fgi(Restricted)` for `(//DEU R)` or
121/// `(//FGI DEU R)`, `Nato(NatoRestricted)` for `(//NR)` or fully-
122/// spelled `(//NATO RESTRICTED)`, `Joint(...)` for shared-origin
123/// markings — so those are unaffected by this predicate.
124///
125/// `Us(Restricted)` only appears when the strict parser blindly
126/// mapped a leading `R` token onto the US classification axis
127/// (`Classification::Restricted`'s portion abbreviation is `"R"`).
128/// That mapping is the bug. Every shape that produces it — bare
129/// `(R)`, `(R//NF)` (R first, dissem after), `R//USA, GBR` (banner
130/// shape, R first), `RESTRICTED//FGI DEU//NOFORN` (long-form R
131/// followed by a US-marking FGI block) — is invalid and must be
132/// rejected.
133///
134/// **Why `fgi_marker.is_some()` does not redeem the marking.**
135/// `fgi_marker` is the `FGI [LIST]` block parsed in *US-classified*
136/// markings (e.g., `SECRET//FGI DEU//NOFORN` → `Us(Secret)` +
137/// `fgi_marker: Some([DEU])`). The block annotates that a
138/// US-classified document references foreign-government
139/// information; it does not retroactively make the US-axis
140/// classification valid. `Us(Restricted)` + any `fgi_marker` value
141/// is still `Us(Restricted)`, still nonsense.
142///
143/// Used by both the strict recognizer and the decoder so the engine
144/// never produces a `Us(Restricted)` marking, regardless of what
145/// other tokens the input carried.
146pub(crate) fn is_us_restricted(marking: &CapcoMarking) -> bool {
147    matches!(
148        marking.0.classification,
149        Some(MarkingClassification::Us(Classification::Restricted))
150    )
151}
152
153/// Shift every source-relative byte offset recorded inside `attrs` by
154/// `delta`. Used by the engine to reconcile zero-origin spans produced
155/// by a [`Recognizer`] (which sees only the candidate's slice of the
156/// source) back to the full-source coordinates rules expect.
157///
158/// Only `IsmAttributes::token_spans` carries offsets today; if later
159/// structural fields (SCI / SAR marker spans) start recording source
160/// offsets, add the shift here — there is no alternative code path to
161/// keep in sync.
162///
163/// Crate-visibility only: this is an engine-internal seam. The engine
164/// is the only caller (PR-3's `DecoderRecognizer` will live in this
165/// same crate and call it the same way). Exposing it outside the
166/// crate would lock in an API surface before the `IsmAttributes`
167/// span story is finished.
168pub(crate) fn shift_token_spans(attrs: &mut IsmAttributes, delta: usize) {
169    if delta == 0 {
170        return;
171    }
172    for ts in attrs.token_spans.iter_mut() {
173        ts.span = Span::new(ts.span.start + delta, ts.span.end + delta);
174    }
175}
176
177/// Infer a [`MarkingType`] from the shape of `bytes`.
178///
179/// Mirrors the scanner's classification heuristic so the strict
180/// recognizer can reconstruct the parse path from bytes alone.
181/// Returns `None` only for empty input — the engine filters
182/// zero-length candidates before this point, but the null-return
183/// keeps the recognizer safe on hostile input.
184fn infer_marking_type(bytes: &[u8]) -> Option<MarkingType> {
185    let first = bytes.iter().copied().find(|&b| !b.is_ascii_whitespace())?;
186    if first == b'(' {
187        return Some(MarkingType::Portion);
188    }
189    if is_cab_head(bytes) {
190        return Some(MarkingType::Cab);
191    }
192    Some(MarkingType::Banner)
193}
194
/// CAB detection: does `bytes` open with one of the three line-initial
/// CAPCO-2016 §E authority heads?
///
/// Rough but sufficient — the scanner already filtered out anything
/// that doesn't look like a classification region, so prefix matching
/// on the known heads is reliable here.
///
/// Heads recognized (with trailing colon, matching CAPCO-2016 §E
/// labels exactly):
///
/// - `Classified By:` — §E.1 p31 (Original) and §E.2 p32 (Derivative);
///   always the first line of a CAB, and what the `marque-core`
///   scanner keys off of.
/// - `Derived From:` — §E.2 p32, derivative-classification CABs.
/// - `Declassify On:` — §E.1 p31 and §E.2 p32, both classification
///   paths.
///
/// The §E.1 original-classification `Classification Reason:` head is
/// intentionally not matched here — CAPCO-2016 §E.1 p31 spells that
/// label in full, and a bare `Reason:` prefix is not an authorized
/// CAPCO CAB label (it would collide with unrelated "Reason: ..."
/// text in prose). The scanner emits CAB candidates anchored on
/// `Classified By:`, so this helper is only ever reached on bytes
/// the scanner already classified as CAB-shaped; the non-head lines
/// (including `Classification Reason:`) live inside the candidate
/// body.
///
/// Leading whitespace is skipped before matching. Only the longest
/// valid UTF-8 prefix of `bytes` is examined, so a stray invalid byte
/// *after* the head (for example a Latin-1 name on the
/// `Classified By:` line) cannot hide an otherwise well-formed head.
/// Invalid bytes *before* the head still yield `false`.
fn is_cab_head(bytes: &[u8]) -> bool {
    const HEADS: [&str; 3] = ["Classified By:", "Derived From:", "Declassify On:"];

    let text = match std::str::from_utf8(bytes) {
        Ok(text) => text,
        // `valid_up_to` marks the end of the longest valid UTF-8
        // prefix, so re-decoding that prefix cannot fail; the empty
        // fallback only keeps this path panic-free.
        Err(err) => std::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap_or(""),
    };
    let trimmed = text.trim_start();
    HEADS.iter().any(|head| trimmed.starts_with(head))
}
228
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    //! Unit tests for marking-type inference, the strict recognizer's
    //! accept / reject behavior (including the `Us(Restricted)` gate),
    //! and the span-shifting helper.

    use super::*;

    #[test]
    fn infer_marking_type_portion_on_leading_paren() {
        assert_eq!(infer_marking_type(b"(TS//SI)"), Some(MarkingType::Portion));
        // Leading whitespace is tolerated — scanners may hand over
        // candidates with a small amount of leading whitespace.
        assert_eq!(infer_marking_type(b"  (S//NF)"), Some(MarkingType::Portion));
    }

    #[test]
    fn infer_marking_type_cab_on_authority_head() {
        assert_eq!(
            infer_marking_type(b"Classified By: X\nDerived From: Y"),
            Some(MarkingType::Cab)
        );
        assert_eq!(
            infer_marking_type(b"Declassify On: 20350101"),
            Some(MarkingType::Cab)
        );
    }

    #[test]
    fn infer_marking_type_bare_reason_prefix_is_not_cab() {
        // CAPCO-2016 §E.1 p31 spells the original-classification head
        // as "Classification Reason:", not bare "Reason:". A bare
        // "Reason:" prefix is indistinguishable from unrelated prose
        // text ("Reason: the quick brown fox…") and must fall through
        // to the Banner classification, not be promoted to CAB.
        assert_eq!(
            infer_marking_type(b"Reason: 1.4(c)"),
            Some(MarkingType::Banner),
        );
    }

    #[test]
    fn infer_marking_type_banner_otherwise() {
        assert_eq!(
            infer_marking_type(b"TOP SECRET//NOFORN"),
            Some(MarkingType::Banner)
        );
    }

    #[test]
    fn infer_marking_type_empty_input_returns_none() {
        // Both truly empty and whitespace-only input have no first
        // non-whitespace byte to classify.
        assert_eq!(infer_marking_type(b""), None);
        assert_eq!(infer_marking_type(b"   "), None);
    }

    #[test]
    fn strict_recognizer_resolves_portion_unambiguously() {
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"(S//NF)", &cx) {
            Parsed::Unambiguous(_) => {}
            other => panic!("expected Unambiguous, got {other:?}"),
        }
    }

    #[test]
    fn strict_recognizer_reports_spans_relative_to_untrimmed_input() {
        // Portion candidates may arrive with leading ASCII whitespace.
        // The recognizer parses the trimmed slice and then shifts the
        // token spans back, so the padded input must yield exactly the
        // unpadded spans moved right by the padding width. This is the
        // only path through `recognize` that performs a shift.
        const PADDING: usize = 2;
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        let Parsed::Unambiguous(plain) = rx.recognize(b"(S//NF)", &cx) else {
            panic!("strict parse should succeed");
        };
        let Parsed::Unambiguous(padded) = rx.recognize(b"  (S//NF)", &cx) else {
            panic!("strict parse should tolerate leading whitespace on a portion");
        };
        let expected: Vec<(usize, usize)> = plain
            .0
            .token_spans
            .iter()
            .map(|t| (t.span.start + PADDING, t.span.end + PADDING))
            .collect();
        let actual: Vec<(usize, usize)> = padded
            .0
            .token_spans
            .iter()
            .map(|t| (t.span.start, t.span.end))
            .collect();
        assert_eq!(actual, expected);
    }

    #[test]
    fn strict_recognizer_rejects_bare_restricted_portion() {
        // CAPCO §H.7: bare `(R)` is structurally indistinguishable from
        // a registered-mark glyph or list-item enumerator, and
        // RESTRICTED is a foreign classification level. The strict
        // parser reads the leading `R` on the US axis, producing
        // `Us(Restricted)`, which is never valid — so
        // `is_us_restricted` collapses the marking to zero-candidate
        // Ambiguous.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"(R)", &cx) {
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "bare (R) must be zero-candidate, got {} candidates",
                candidates.len()
            ),
            Parsed::Unambiguous(m) => panic!(
                "bare (R) must be rejected, got Unambiguous({:?})",
                m.0.classification
            ),
        }
    }

    #[test]
    fn strict_recognizer_rejects_restricted_with_dissem_only() {
        // `(R//NF)` parses to `Us(Restricted)` + `dissem_controls: [Nf]`,
        // no `fgi_marker`. Per CAPCO §H.7 the canonical form requires
        // a foreign-origin signal (FGI/tetragraph/trigraph) BEFORE the
        // R, not a dissem control AFTER. The predicate must reject so
        // a future refactor that loosened it (e.g., by treating REL TO
        // or NOFORN as foreign-origin evidence, which they are not) is
        // caught here.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"(R//NF)", &cx) {
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "(R//NF) must be zero-candidate, got {} candidates",
                candidates.len()
            ),
            Parsed::Unambiguous(m) => panic!(
                "(R//NF) must be rejected — `Us(Restricted)` with dissem \
                 control but no FGI marker is invalid; got Unambiguous({:?})",
                m.0.classification
            ),
        }
    }

    #[test]
    fn strict_recognizer_rejects_restricted_with_rel_to_only() {
        // Banner-shape `R//USA, GBR` — same rejection rationale as
        // `(R//NF)`. REL TO populates `rel_to` but is not foreign-
        // origin evidence; `R` first is the bug-case `Us(Restricted)`.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"R//USA, GBR", &cx) {
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "R//USA, GBR must be zero-candidate, got {} candidates",
                candidates.len()
            ),
            Parsed::Unambiguous(m) => panic!(
                "R//USA, GBR must be rejected — banner-shape \
                 `Us(Restricted)` with REL TO but no FGI marker is \
                 invalid; got Unambiguous({:?})",
                m.0.classification
            ),
        }
    }

    #[test]
    fn strict_recognizer_rejects_us_restricted_with_fgi_marker() {
        // `RESTRICTED//FGI DEU//NOFORN` is the parser shape that
        // led to the predicate's earlier `fgi_marker.is_none()`
        // hedge (PR #262 review). The strict parser sees `RESTRICTED`
        // first, lands `Us(Restricted)`, then parses the trailing
        // `FGI DEU` as the US-marking FGI block — producing
        // `classification: Us(Restricted), fgi_marker: Some([DEU])`.
        // The shape is still nonsense (a US doc cannot be RESTRICTED;
        // RESTRICTED is the foreign classification level), so the
        // recognizer must reject it. Pinning this case prevents a
        // future refactor from re-introducing an FGI-marker hedge
        // that would silently let `Us(Restricted)` slip through.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"RESTRICTED//FGI DEU//NOFORN", &cx) {
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "RESTRICTED//FGI DEU//NOFORN must be zero-candidate, \
                 got {} candidates",
                candidates.len()
            ),
            Parsed::Unambiguous(m) => panic!(
                "RESTRICTED//FGI DEU//NOFORN must be rejected — an FGI \
                 marker block does not redeem a Us(Restricted) \
                 classification; got Unambiguous({:?}, fgi_marker={:?})",
                m.0.classification, m.0.fgi_marker
            ),
        }
    }

    #[test]
    fn strict_recognizer_accepts_fgi_axis_restricted() {
        // The legitimate foreign-origin RESTRICTED form `(//FGI R//NF)`
        // parses to `MarkingClassification::Fgi(level=Restricted)` —
        // the FGI classification axis, NOT `Us(Restricted)`. The
        // rejection predicate matches only on `Us(Restricted)`, so
        // this shape passes through and the strict recognizer
        // produces an Unambiguous marking. Real RESTRICTED markings
        // never reach the bug path the predicate gates against.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        match rx.recognize(b"(//FGI R//NF)", &cx) {
            Parsed::Unambiguous(m) => {
                assert!(
                    !is_us_restricted(&m),
                    "FGI-axis RESTRICTED must not match the bare-`Us(Restricted)` predicate; \
                     classification = {:?}",
                    m.0.classification,
                );
            }
            other => panic!("expected Unambiguous for `(//FGI R//NF)`, got {other:?}"),
        }
    }

    #[test]
    fn is_us_restricted_distinguishes_us_secret() {
        // Defensive: only `Us(Restricted)` triggers the rejection; other
        // US classifications (Secret, Confidential, Unclassified) are
        // unaffected because they are valid US-axis classifications
        // that don't require foreign-origin context.
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        let Parsed::Unambiguous(m) = rx.recognize(b"(S)", &cx) else {
            panic!("(S) must parse to a SECRET portion");
        };
        assert!(
            !is_us_restricted(&m),
            "Us(Secret) must not match the bare-RESTRICTED predicate",
        );
    }

    #[test]
    fn strict_recognizer_returns_zero_candidate_on_parse_failure() {
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        // Missing closing paren — parser rejects; recognizer surfaces
        // zero-candidate Ambiguous per the trait contract.
        match rx.recognize(b"(S//NF", &cx) {
            Parsed::Ambiguous { candidates } => assert!(candidates.is_empty()),
            other => panic!("expected zero-candidate Ambiguous, got {other:?}"),
        }
    }

    #[test]
    fn shift_token_spans_is_identity_for_zero_delta() {
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        let Parsed::Unambiguous(mut marking) = rx.recognize(b"(S//NF)", &cx) else {
            panic!("strict parse should succeed");
        };
        let before: Vec<Span> = marking.0.token_spans.iter().map(|t| t.span).collect();
        shift_token_spans(&mut marking.0, 0);
        let after: Vec<Span> = marking.0.token_spans.iter().map(|t| t.span).collect();
        assert_eq!(before, after);
    }

    #[test]
    fn shift_token_spans_shifts_by_delta() {
        let rx = StrictRecognizer::new();
        let cx = ParseContext::default();
        let Parsed::Unambiguous(mut marking) = rx.recognize(b"(S//NF)", &cx) else {
            panic!("strict parse should succeed");
        };
        // Compare whole vectors rather than zipping: `zip` stops at the
        // shorter side and would hide a span being dropped or added.
        let expected: Vec<(usize, usize)> = marking
            .0
            .token_spans
            .iter()
            .map(|t| (t.span.start + 100, t.span.end + 100))
            .collect();
        shift_token_spans(&mut marking.0, 100);
        let after: Vec<(usize, usize)> = marking
            .0
            .token_spans
            .iter()
            .map(|t| (t.span.start, t.span.end))
            .collect();
        assert_eq!(after, expected);
    }

    #[test]
    fn strict_recognizer_is_send_sync_as_trait_object() {
        // Compile-time assertion: the exact `Arc<dyn Recognizer<…>>`
        // storage Engine holds must be `Send + Sync` so `BatchEngine`
        // workers can share one instance (Constitution VI, FR-023).
        // Also assert the concrete `StrictRecognizer` and `Box<dyn …>`
        // directly so a regression in either the impl or the storage
        // choice trips this gate — the `Recognizer: Send + Sync`
        // super-bound would make a bare `Box<dyn …>` check
        // self-satisfying and hide a real `StrictRecognizer` regression.
        fn assert_send_sync<T: Send + Sync + ?Sized>() {}
        assert_send_sync::<StrictRecognizer>();
        assert_send_sync::<std::sync::Arc<dyn Recognizer<CapcoScheme>>>();
        assert_send_sync::<Box<dyn Recognizer<CapcoScheme>>>();
    }
}