marque_engine/recognizer.rs
1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! [`Recognizer`] implementations for the engine's strict dispatch path.
6//!
7//! Phase-4 PR-2 (T058 + T063) wraps `marque-core`'s existing strict
8//! parser behind the domain-neutral [`Recognizer`] trait so
9//! [`crate::Engine::lint`] dispatches parsing through
10//! `Arc<dyn Recognizer<S>>` instead of instantiating `Parser` inline.
11//! Phase-4 PR-3 will add a `DecoderRecognizer` alongside this one;
12//! both coexist behind the same trait object.
13//!
14//! ## Why this lives in `marque-engine`, not `marque-capco`
15//!
16//! Constitution VII forbids `marque-capco` from depending on
17//! `marque-core`. `StrictRecognizer` wraps `marque_core::Parser` and
18//! produces [`CapcoMarking`](marque_capco::CapcoMarking) values — it
19//! therefore needs both chains, and the constitutional dep-graph
20//! names `marque-engine` as the sole convergence crate. The scheme-
21//! adapter pattern from Phase 3 stays intact: `marque-capco` owns
22//! `CapcoScheme` / `CapcoMarking`; the engine owns dispatch.
23//!
24//! ## Span-offset contract
25//!
26//! The [`Recognizer`] trait contract is "given bytes, return a
27//! [`Parsed<M>`] whose internal spans are relative to the input
28//! bytes" (foundational-plan "spans are by offset into this buffer").
29//! Rules in `marque-capco` expect source-relative spans, so the engine
30//! shifts token spans after `recognize()` returns via
31//! [`shift_token_spans`]. That post-processing is the natural seam —
32//! the engine is the only code that sees both the full source buffer
33//! and the candidate's source offset.
34//!
35//! ## Zero-candidate = no fabricated marking
36//!
37//! On a strict-parse failure the recognizer returns
38//! `Parsed::Ambiguous { candidates: vec![] }` — the zero-candidate
39//! form mandated by the trait contract (foundational-plan
40//! line 609-612). Callers MUST treat that as "no plausible
41//! interpretation," not as a silently-fabricated marking.
42
43use marque_capco::{CapcoMarking, CapcoScheme};
44use marque_core::Parser;
45use marque_ism::{
46 CapcoTokenSet, Classification, IsmAttributes, MarkingClassification,
47 span::{MarkingCandidate, MarkingType, Span},
48};
49use marque_scheme::ambiguity::Parsed;
50use marque_scheme::recognizer::{ParseContext, Recognizer};
51
/// Recognizer for the engine's strict dispatch path.
///
/// Every parse is delegated to the existing [`Parser`], which accepts
/// only the CAPCO-2016 canonical grammar — so, by construction, this
/// recognizer produces zero false positives.
///
/// The type holds no state and is trivially cheap to build; the engine
/// keeps one instance behind an `Arc` for as long as a
/// [`crate::Engine`] lives.
#[derive(Debug, Default, Clone, Copy)]
pub struct StrictRecognizer;

impl StrictRecognizer {
    /// Build a strict-path recognizer. Usable in `const` contexts.
    pub const fn new() -> Self {
        StrictRecognizer
    }
}
67
68impl Recognizer<CapcoScheme> for StrictRecognizer {
69 fn recognize(&self, bytes: &[u8], _cx: &ParseContext) -> Parsed<CapcoMarking> {
70 // `_cx.strict_evidence` is always satisfied here — this
71 // recognizer only emits candidates that hit the strict grammar.
72 // `zone` / `position` are rule-side concerns, not parser input.
73 let Some(kind) = infer_marking_type(bytes) else {
74 return Parsed::Ambiguous {
75 candidates: Vec::new(),
76 };
77 };
78 let token_set = CapcoTokenSet;
79 let parser = Parser::new(&token_set);
80 let leading_ws = if matches!(kind, MarkingType::Portion) {
81 bytes.iter().take_while(|b| b.is_ascii_whitespace()).count()
82 } else {
83 0
84 };
85 let parse_bytes = &bytes[leading_ws..];
86 let candidate = MarkingCandidate {
87 span: Span::new(0, parse_bytes.len()),
88 kind,
89 };
90 match parser.parse(&candidate, parse_bytes) {
91 Ok(mut parsed) => {
92 if leading_ws != 0 {
93 shift_token_spans(&mut parsed.attrs, leading_ws);
94 }
95 let marking = CapcoMarking::new(parsed.attrs);
96 // Reject `Us(Restricted)` markings. RESTRICTED is by
97 // definition a non-US classification level — see
98 // [`is_us_restricted`] for the full rationale and
99 // why `fgi_marker.is_some()` does not redeem the
100 // marking.
101 if is_us_restricted(&marking) {
102 return Parsed::Ambiguous {
103 candidates: Vec::new(),
104 };
105 }
106 Parsed::Unambiguous(marking)
107 }
108 Err(_) => Parsed::Ambiguous {
109 candidates: Vec::new(),
110 },
111 }
112 }
113}
114
115/// True when the marking is classified as `Us(Restricted)`.
116///
117/// CAPCO §H.7: RESTRICTED is by definition a non-US classification
118/// level. A US document cannot be RESTRICTED. Every legitimate
119/// foreign-origin RESTRICTED form parses to a non-US variant of
120/// [`MarkingClassification`] — `Fgi(Restricted)` for `(//DEU R)` or
121/// `(//FGI DEU R)`, `Nato(NatoRestricted)` for `(//NR)` or fully-
122/// spelled `(//NATO RESTRICTED)`, `Joint(...)` for shared-origin
123/// markings — so those are unaffected by this predicate.
124///
125/// `Us(Restricted)` only appears when the strict parser blindly
126/// mapped a leading `R` token onto the US classification axis
127/// (`Classification::Restricted`'s portion abbreviation is `"R"`).
128/// That mapping is the bug. Every shape that produces it — bare
129/// `(R)`, `(R//NF)` (R first, dissem after), `R//USA, GBR` (banner
130/// shape, R first), `RESTRICTED//FGI DEU//NOFORN` (long-form R
131/// followed by a US-marking FGI block) — is invalid and must be
132/// rejected.
133///
134/// **Why `fgi_marker.is_some()` does not redeem the marking.**
135/// `fgi_marker` is the `FGI [LIST]` block parsed in *US-classified*
136/// markings (e.g., `SECRET//FGI DEU//NOFORN` → `Us(Secret)` +
137/// `fgi_marker: Some([DEU])`). The block annotates that a
138/// US-classified document references foreign-government
139/// information; it does not retroactively make the US-axis
140/// classification valid. `Us(Restricted)` + any `fgi_marker` value
141/// is still `Us(Restricted)`, still nonsense.
142///
143/// Used by both the strict recognizer and the decoder so the engine
144/// never produces a `Us(Restricted)` marking, regardless of what
145/// other tokens the input carried.
146pub(crate) fn is_us_restricted(marking: &CapcoMarking) -> bool {
147 matches!(
148 marking.0.classification,
149 Some(MarkingClassification::Us(Classification::Restricted))
150 )
151}
152
153/// Shift every source-relative byte offset recorded inside `attrs` by
154/// `delta`. Used by the engine to reconcile zero-origin spans produced
155/// by a [`Recognizer`] (which sees only the candidate's slice of the
156/// source) back to the full-source coordinates rules expect.
157///
158/// Only `IsmAttributes::token_spans` carries offsets today; if later
159/// structural fields (SCI / SAR marker spans) start recording source
160/// offsets, add the shift here — there is no alternative code path to
161/// keep in sync.
162///
163/// Crate-visibility only: this is an engine-internal seam. The engine
164/// is the only caller (PR-3's `DecoderRecognizer` will live in this
165/// same crate and call it the same way). Exposing it outside the
166/// crate would lock in an API surface before the `IsmAttributes`
167/// span story is finished.
168pub(crate) fn shift_token_spans(attrs: &mut IsmAttributes, delta: usize) {
169 if delta == 0 {
170 return;
171 }
172 for ts in attrs.token_spans.iter_mut() {
173 ts.span = Span::new(ts.span.start + delta, ts.span.end + delta);
174 }
175}
176
177/// Infer a [`MarkingType`] from the shape of `bytes`.
178///
179/// Mirrors the scanner's classification heuristic so the strict
180/// recognizer can reconstruct the parse path from bytes alone.
181/// Returns `None` only for empty input — the engine filters
182/// zero-length candidates before this point, but the null-return
183/// keeps the recognizer safe on hostile input.
184fn infer_marking_type(bytes: &[u8]) -> Option<MarkingType> {
185 let first = bytes.iter().copied().find(|&b| !b.is_ascii_whitespace())?;
186 if first == b'(' {
187 return Some(MarkingType::Portion);
188 }
189 if is_cab_head(bytes) {
190 return Some(MarkingType::Cab);
191 }
192 Some(MarkingType::Banner)
193}
194
/// CAB detection: does `bytes` open with one of the three line-initial
/// CAPCO-2016 §E authority heads?
///
/// Rough but sufficient — the scanner already filtered out anything
/// that doesn't look like a classification region, so prefix matching
/// on the known heads is reliable here.
///
/// Heads recognized (with trailing colon, matching CAPCO-2016 §E
/// labels exactly):
///
/// - `Classified By:` — §E.1 p31 (Original) and §E.2 p32 (Derivative);
///   always the first line of a CAB, and what the `marque-core`
///   scanner keys off of.
/// - `Derived From:` — §E.2 p32, derivative-classification CABs.
/// - `Declassify On:` — §E.1 p31 and §E.2 p32, both classification
///   paths.
///
/// The §E.1 original-classification `Classification Reason:` head is
/// intentionally not matched here — CAPCO-2016 §E.1 p31 spells that
/// label in full, and a bare `Reason:` prefix is not an authorized
/// CAPCO CAB label (it would collide with unrelated "Reason: ..."
/// text in prose). The scanner emits CAB candidates anchored on
/// `Classified By:`, so this helper is only ever reached on bytes
/// the scanner already classified as CAB-shaped; the non-head lines
/// (including `Classification Reason:`) live inside the candidate
/// body.
///
/// Leading whitespace is skipped before matching. Only the leading
/// run of valid UTF-8 is inspected: a stray invalid byte somewhere
/// later in the candidate body must not change the answer to a
/// question about the candidate's first few bytes. Input that is
/// valid UTF-8 throughout is classified exactly as before.
fn is_cab_head(bytes: &[u8]) -> bool {
    const HEADS: [&str; 3] = ["Classified By:", "Derived From:", "Declassify On:"];

    let text = match std::str::from_utf8(bytes) {
        Ok(text) => text,
        // `valid_up_to()` marks the end of the longest valid prefix, so
        // re-slicing there cannot fail; the `Err` arm only exists to
        // avoid an `unwrap` on an engine hot path.
        Err(err) => match std::str::from_utf8(&bytes[..err.valid_up_to()]) {
            Ok(prefix) => prefix,
            Err(_) => return false,
        },
    };

    let trimmed = text.trim_start();
    HEADS.iter().any(|head| trimmed.starts_with(head))
}
228
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
    use super::*;

    /// Run the strict recognizer over `input` with a default context.
    fn recognize(input: &[u8]) -> Parsed<CapcoMarking> {
        StrictRecognizer::new().recognize(input, &ParseContext::default())
    }

    /// Recognize `input` and return the marking, panicking unless the
    /// result is `Parsed::Unambiguous`.
    #[track_caller]
    fn recognize_unambiguous(input: &[u8]) -> CapcoMarking {
        match recognize(input) {
            Parsed::Unambiguous(marking) => marking,
            other => panic!(
                "expected Unambiguous for `{}`, got {other:?}",
                String::from_utf8_lossy(input)
            ),
        }
    }

    /// Assert that `input` is reported as the zero-candidate
    /// `Parsed::Ambiguous` form. `why` is appended to the failure
    /// message when the recognizer accepts the input instead.
    #[track_caller]
    fn assert_zero_candidate(input: &[u8], why: &str) {
        let shown = String::from_utf8_lossy(input);
        match recognize(input) {
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "`{shown}` must be zero-candidate, got {} candidates",
                candidates.len()
            ),
            Parsed::Unambiguous(m) => panic!(
                "`{shown}` must be rejected — {why}; \
                 got Unambiguous({:?}, fgi_marker={:?})",
                m.0.classification, m.0.fgi_marker
            ),
        }
    }

    /// Snapshot the `(start, end)` offsets of every recorded token span.
    fn span_bounds(marking: &CapcoMarking) -> Vec<(usize, usize)> {
        marking
            .0
            .token_spans
            .iter()
            .map(|t| (t.span.start, t.span.end))
            .collect()
    }

    #[test]
    fn infer_marking_type_portion_on_leading_paren() {
        assert_eq!(infer_marking_type(b"(TS//SI)"), Some(MarkingType::Portion));
        // Leading whitespace is tolerated — scanners may hand over
        // candidates with a small amount of leading whitespace.
        assert_eq!(infer_marking_type(b"  (S//NF)"), Some(MarkingType::Portion));
    }

    #[test]
    fn infer_marking_type_cab_on_authority_head() {
        assert_eq!(
            infer_marking_type(b"Classified By: X\nDerived From: Y"),
            Some(MarkingType::Cab)
        );
        assert_eq!(
            infer_marking_type(b"Declassify On: 20350101"),
            Some(MarkingType::Cab)
        );
    }

    #[test]
    fn infer_marking_type_bare_reason_prefix_is_not_cab() {
        // CAPCO-2016 §E.1 p31 spells the original-classification head
        // as "Classification Reason:", not bare "Reason:". A bare
        // "Reason:" prefix is indistinguishable from unrelated prose
        // text ("Reason: the quick brown fox…") and must fall through
        // to the Banner classification, not be promoted to CAB.
        assert_eq!(
            infer_marking_type(b"Reason: 1.4(c)"),
            Some(MarkingType::Banner),
        );
    }

    #[test]
    fn infer_marking_type_banner_otherwise() {
        assert_eq!(
            infer_marking_type(b"TOP SECRET//NOFORN"),
            Some(MarkingType::Banner)
        );
    }

    #[test]
    fn infer_marking_type_empty_input_returns_none() {
        // Both truly empty and whitespace-only input have no first
        // non-whitespace byte to classify.
        assert_eq!(infer_marking_type(b""), None);
        assert_eq!(infer_marking_type(b"   "), None);
    }

    #[test]
    fn strict_recognizer_resolves_portion_unambiguously() {
        let _ = recognize_unambiguous(b"(S//NF)");
    }

    #[test]
    fn strict_recognizer_rejects_bare_restricted_portion() {
        // CAPCO §H.7: bare `(R)` is structurally indistinguishable from
        // a registered-mark glyph or list-item enumerator. The strict
        // parser maps the lone `R` onto the US axis, and
        // `is_us_restricted` rejects `Us(Restricted)` unconditionally,
        // so the result must collapse to zero-candidate Ambiguous.
        assert_zero_candidate(
            b"(R)",
            "bare `R` lands on the US axis and `Us(Restricted)` is never valid",
        );
    }

    #[test]
    fn strict_recognizer_rejects_restricted_with_dissem_only() {
        // `(R//NF)` parses to `Us(Restricted)` + `dissem_controls: [Nf]`.
        // A dissem control AFTER the `R` is not foreign-origin
        // evidence; the classification is still `Us(Restricted)` and
        // must be rejected. Pinning this catches a future refactor that
        // treats NOFORN (or REL TO) as redeeming the marking.
        assert_zero_candidate(
            b"(R//NF)",
            "a trailing dissem control does not redeem `Us(Restricted)`",
        );
    }

    #[test]
    fn strict_recognizer_rejects_restricted_with_rel_to_only() {
        // Banner-shape `R//USA, GBR` — same rejection rationale as
        // `(R//NF)`. REL TO populates `rel_to` but is not foreign-
        // origin evidence; `R` first is the bug-case `Us(Restricted)`.
        assert_zero_candidate(
            b"R//USA, GBR",
            "banner-shape `Us(Restricted)` is invalid and REL TO does not redeem it",
        );
    }

    #[test]
    fn strict_recognizer_rejects_us_restricted_with_fgi_marker() {
        // `RESTRICTED//FGI DEU//NOFORN` is the parser shape that
        // led to the predicate's earlier `fgi_marker.is_none()`
        // hedge (PR #262 review). The strict parser sees `RESTRICTED`
        // first, lands `Us(Restricted)`, then parses the trailing
        // `FGI DEU` as the US-marking FGI block — producing
        // `classification: Us(Restricted), fgi_marker: Some([DEU])`.
        // The shape is still nonsense (a US doc cannot be RESTRICTED;
        // RESTRICTED is the foreign classification level), so the
        // recognizer must reject it. Pinning this case prevents a
        // future refactor from re-introducing an FGI-marker hedge
        // that would silently let `Us(Restricted)` slip through.
        assert_zero_candidate(
            b"RESTRICTED//FGI DEU//NOFORN",
            "an FGI marker block does not redeem a Us(Restricted) classification",
        );
    }

    #[test]
    fn strict_recognizer_accepts_fgi_axis_restricted() {
        // The legitimate foreign-origin RESTRICTED form `(//FGI R//NF)`
        // parses to `MarkingClassification::Fgi(level=Restricted)` —
        // the FGI classification axis, NOT `Us(Restricted)`. The
        // rejection predicate matches only on `Us(Restricted)`, so
        // this shape passes through and the strict recognizer
        // produces an Unambiguous marking. Real RESTRICTED markings
        // never reach the bug path the predicate gates against.
        let m = recognize_unambiguous(b"(//FGI R//NF)");
        assert!(
            !is_us_restricted(&m),
            "FGI-axis RESTRICTED must not match the `Us(Restricted)` predicate; \
             classification = {:?}",
            m.0.classification,
        );
    }

    #[test]
    fn is_us_restricted_distinguishes_us_secret() {
        // Defensive: only `Us(Restricted)` triggers the rejection; other
        // US classifications (Secret, Confidential, Unclassified) are
        // unaffected because they are valid US-axis classifications.
        let m = recognize_unambiguous(b"(S)");
        assert!(
            !is_us_restricted(&m),
            "Us(Secret) must not match the `Us(Restricted)` predicate",
        );
    }

    #[test]
    fn strict_recognizer_returns_zero_candidate_on_parse_failure() {
        // Missing closing paren — parser rejects; recognizer surfaces
        // zero-candidate Ambiguous per the trait contract.
        assert_zero_candidate(b"(S//NF", "the closing paren is missing");
    }

    #[test]
    fn shift_token_spans_is_identity_for_zero_delta() {
        let mut marking = recognize_unambiguous(b"(S//NF)");
        let before = span_bounds(&marking);
        shift_token_spans(&mut marking.0, 0);
        assert_eq!(before, span_bounds(&marking));
    }

    #[test]
    fn shift_token_spans_shifts_by_delta() {
        let mut marking = recognize_unambiguous(b"(S//NF)");
        let before = span_bounds(&marking);
        // Guard against a vacuous pass: with no recorded spans the
        // per-span comparison below would check nothing.
        assert!(
            !before.is_empty(),
            "`(S//NF)` must record at least one token span for this test to be meaningful",
        );

        shift_token_spans(&mut marking.0, 100);
        let after = span_bounds(&marking);

        let expected: Vec<(usize, usize)> =
            before.iter().map(|&(s, e)| (s + 100, e + 100)).collect();
        // Comparing whole vectors also pins that the shift neither adds
        // nor drops spans.
        assert_eq!(after, expected);
    }

    #[test]
    fn strict_recognizer_is_send_sync_as_trait_object() {
        // Compile-time assertion: the exact `Arc<dyn Recognizer<…>>`
        // storage Engine holds must be `Send + Sync` so `BatchEngine`
        // workers can share one instance (Constitution VI, FR-023).
        // Also assert the concrete `StrictRecognizer` and `Box<dyn …>`
        // directly so a regression in either the impl or the storage
        // choice trips this gate — the `Recognizer: Send + Sync`
        // super-bound would make a bare `Box<dyn …>` check
        // self-satisfying and hide a real `StrictRecognizer` regression.
        fn assert_send_sync<T: Send + Sync + ?Sized>() {}
        assert_send_sync::<StrictRecognizer>();
        assert_send_sync::<std::sync::Arc<dyn Recognizer<CapcoScheme>>>();
        assert_send_sync::<Box<dyn Recognizer<CapcoScheme>>>();
    }
}
499}