marque_engine/
decoder.rs

1// SPDX-FileCopyrightText: 2026 Knitli Inc.
2//
3// SPDX-License-Identifier: LicenseRef-MarqueLicense-1.0
4
5//! Phase-D probabilistic [`Recognizer`] — the "decoder".
6//!
7//! This module implements the deep-scan half of the strict/deep-scan
8//! recognizer split introduced in Phase 4 PR-2. When the engine is
9//! configured for deep-scan (batch reconciliation mode,
10//! rule-escalated region, `--deep-scan` CLI flag), and the strict
11//! recognizer returns zero candidates for a marking region, the
12//! engine falls back to the decoder to recover mangled markings that
13//! are one of a small set of canonical-shape deviations away from a
14//! real CAPCO-2016 marking:
15//!
16//! - Edit-distance-1/2 token typos (`SERCET` → `SECRET`).
17//! - Token reordering within categories (`NOFORN//SECRET` →
18//!   `SECRET//NOFORN`).
19//! - CAPCO-2016-superseded tokens (`COMINT` → `SI`).
20//! - Case mistakes (`secret//noforn` → `SECRET//NOFORN`).
21//! - Garbled delimiters (`S ∕∕ NOFORN` → `S//NOFORN`).
22//!
23//! The decoder never fabricates a marking where none exists. When the
24//! observed tokens fit no CAPCO grammar template, it returns
25//! `Parsed::Ambiguous { candidates: vec![] }` — the zero-candidate
//! signal per foundational-plan lines 609-612.
27//!
28//! ## Why this lives in `marque-engine`, not `marque-capco`
29//!
30//! Same Constitution VII rationale as `StrictRecognizer` (PR-2):
31//! `marque-capco` may not depend on `marque-core`, but the decoder
32//! needs core's fuzzy-vocab matcher and strict parser to materialize
33//! candidates. `marque-engine` is the sole crate where both chains
34//! converge. The original tasks.md T059/T061 placement is amended in
35//! tasks.md itself.
36//!
37//! ## Scoring approach (foundational-plan §5.2)
38//!
39//! For each candidate the decoder computes:
40//!
41//! ```text
42//! log_posterior(candidate | observed)
43//!   = log_prior(candidate)                      // baked corpus priors (PR-1)
44//!   + Σ log_likelihood(feature | candidate)     // enumerated scored features
45//! ```
46//!
47//! The decoder currently scores the candidate-shape features it
48//! records from the closed [`FeatureId`] enum:
49//! `EditDistance1`, `EditDistance2`, `TokenReorder`,
50//! `SupersededToken`, and `BaseRateCommonMarking`. Each contributes
51//! a fixed log-odds delta documented at the feature's call site.
52//!
53//! [`FeatureId::StrictContextClassification`] is part of the audit-
54//! schema enum but is **not** currently a scored-feature term:
55//! classification-level context is enforced through the separate
56//! [`ParseContext::classification_floor`] hard filter (FR-011),
57//! which rejects below-floor candidates before scoring rather than
58//! adding a likelihood term to the posterior. [`FeatureId::CorpusOverrideInEffect`]
59//! is reserved for PR-5 when corpus-override is wired; the decoder
60//! does not emit it today. Turning either into an actual scored
61//! contributor requires a coordinated audit-schema bump
62//! (`MARQUE_AUDIT_SCHEMA`) per `marque-rules/src/confidence.rs` doc.
63//!
64//! The top candidate wins when its posterior exceeds the runner-up by
65//! a configured ratio; below that threshold the decoder returns
66//! `Parsed::Ambiguous { candidates }` so the engine can surface a
67//! diagnostic rather than auto-apply. `Candidate::prior_log_odds`
68//! carries the prior alone (sum of token log-priors); the
69//! per-feature log-odds deltas live only in
70//! `Candidate::evidence[i].log_odds`, so a resolver that reconstructs
71//! `prior_log_odds + Σ evidence.log_odds` recovers the decoder's
72//! internal posterior exactly, without double-counting.
73//!
74//! ## What this module is NOT
75//!
76//! - Not a full template-matching grammar engine. The MVP materializes
77//!   candidates by canonicalizing observed tokens and round-tripping
78//!   through the strict parser — the strict parser is the arbiter of
79//!   "is this a CAPCO-shape marking." If the canonicalized bytes
80//!   strict-parse, we have a candidate; if not, we discard.
81//! - Not a learning system. All priors are compile-time-baked `&'static`
82//!   tables from `marque_capco::priors` (Constitution III: no runtime
83//!   corpus override on WASM).
84//! - Not a fix applier. The decoder proposes `CapcoMarking` candidates;
85//!   the engine applies them through the normal `Diagnostic` /
86//!   `FixProposal` path with `FixSource::DecoderPosterior`.
87
88use std::collections::BTreeSet;
89
90use marque_capco::provenance::DecoderProvenance;
91use marque_capco::{CapcoMarking, CapcoScheme};
92use marque_core::{Parser, fuzzy::FuzzyVocabMatcher};
93use marque_ism::{
94    CapcoTokenSet, Classification, SciControl, SciControlBare, SciControlSystem,
95    span::{MarkingCandidate, MarkingType, Span},
96    token_set::TokenSet as _,
97};
98use marque_rules::confidence::{FeatureContribution, FeatureId};
99use marque_scheme::ambiguity::{Candidate, EvidenceFeature, Parsed};
100use marque_scheme::recognizer::{ParseContext, Recognizer};
101
102use crate::recognizer::{StrictRecognizer, is_us_restricted};
103
/// K=8 candidate bound per foundational-plan §5.2 and research.md R3.
///
/// Higher K burns latency without accuracy gain (diminishing returns
/// above 6 per the primary-source corpus analysis); lower K drops
/// recall on multi-token reorderings. Tunable in-place — the bound is
/// advisory, not a correctness invariant.
///
/// The bound is applied in two places in this module:
///
/// - `DecoderRecognizer::recognize` truncates the posterior-sorted
///   candidate list to `K_MAX_CANDIDATES` entries before the
///   unambiguous/ambiguous decision.
/// - `generate_candidate_bytes` stops emitting canonicalization
///   attempts once `K_MAX_CANDIDATES * 2` have been collected, which
///   bounds the strict-parse work done per input.
const K_MAX_CANDIDATES: usize = 8;
111
/// Runner-up posterior-ratio threshold for emitting `Unambiguous`.
///
/// The decoder computes `log_margin = top_posterior - runner_up_posterior`
/// in natural-log space. When `log_margin >= UNAMBIGUOUS_LOG_MARGIN`,
/// the decoder collapses to `Unambiguous(top)`; below the threshold it
/// returns `Ambiguous { candidates }` so the engine can surface a
/// diagnostic rather than auto-apply a close call. A lone surviving
/// candidate has no runner-up and is emitted as `Unambiguous` without
/// consulting this threshold.
///
/// `1.6` corresponds to `e^1.6 ≈ 4.95` — i.e., the top candidate is
/// roughly five times as likely as the runner-up given the observed
/// bytes. Because the margin is a difference of two log-posteriors,
/// `exp(log_margin)` is the *relative* odds of the top candidate
/// against the runner-up (`P(top)/P(runner_up)`); it says nothing
/// about the top candidate's absolute probability.
const UNAMBIGUOUS_LOG_MARGIN: f32 = 1.6;
125
/// The deep-scan ("Phase-D") probabilistic marking recognizer.
///
/// A stateless unit type: every prior it scores against is a baked
/// `&'static` table read at scoring time, so constructing one costs
/// nothing. The engine keeps a single instance behind an `Arc` for the
/// lifetime of one [`crate::Engine`].
///
/// Dispatch contract: the engine runs the strict recognizer first and
/// only hands deep-scan regions to the decoder (see the
/// `crate::Engine::lint` dispatch). If the decoder is nevertheless
/// called with `ParseContext::strict_evidence == true`, it defers to
/// the strict path by answering with a zero-candidate result.
#[derive(Clone, Copy, Debug, Default)]
pub struct DecoderRecognizer;

impl DecoderRecognizer {
    /// Build a decoder recognizer. Usable in `const` contexts.
    pub const fn new() -> Self {
        DecoderRecognizer
    }
}
146
147impl Recognizer<CapcoScheme> for DecoderRecognizer {
148    fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
149        // Strict-path callers get zero candidates so the engine's
150        // strict recognizer remains the authoritative answer under
151        // interactive-authoring latency (SC-001). The engine only
152        // invokes the decoder when `strict_evidence = false` is
153        // explicitly requested (deep-scan mode or rule-escalated
154        // region).
155        if cx.strict_evidence {
156            return Parsed::Ambiguous {
157                candidates: Vec::new(),
158            };
159        }
160
161        let Some(kind) = infer_marking_type(bytes) else {
162            return Parsed::Ambiguous {
163                candidates: Vec::new(),
164            };
165        };
166
167        // Prose-glue suppression: a single-letter portion candidate
168        // (`(s)`, `(c)`, `(u)`, `(r)`, …) immediately glued to a
169        // preceding word — `letter(s)`, `function(c)`, `loss(s)` —
170        // is overwhelmingly a plural-suffix or function-call-shaped
171        // prose glyph, not a real CAPCO marking. The strict recognizer
172        // doesn't have the surrounding-byte context to tell these
173        // apart; the engine populates `cx.preceded_by_whitespace`
174        // from the source byte preceding the candidate's span and
175        // hands it to the decoder so this fallback path doesn't
176        // resurrect the false positive that the strict path would
177        // never have produced (the strict parser is case-sensitive
178        // and rejects lowercase tokens, so `(s)` only reaches the
179        // decoder via the case-fold canonicalization).
180        //
181        // Bullets and numbered-list markers are not a problem — they
182        // always have whitespace between the bullet and the marking
183        // (`1. (S)`, `* (S//NF)`, `(a) (S)` all set
184        // `preceded_by_whitespace = true`).
185        if !cx.preceded_by_whitespace
186            && matches!(kind, MarkingType::Portion)
187            && is_single_letter_portion(bytes)
188        {
189            return Parsed::Ambiguous {
190                candidates: Vec::new(),
191            };
192        }
193
194        // 1. Canonicalize the observed bytes into zero-or-more
195        //    candidate byte-strings + per-candidate feature trace.
196        let canonical_attempts = generate_candidate_bytes(bytes);
197        if canonical_attempts.is_empty() {
198            return Parsed::Ambiguous {
199                candidates: Vec::new(),
200            };
201        }
202
203        // 2. Strict-parse each canonicalized attempt. Anything that
204        //    fails strict parsing is discarded — the strict parser is
205        //    the arbiter of "is this a CAPCO-shape marking." This is
206        //    what guarantees the decoder never fabricates a marking
207        //    shape the grammar forbids.
208        let token_set = CapcoTokenSet;
209        let parser = Parser::new(&token_set);
210        let synthetic_candidate = MarkingCandidate {
211            span: Span::new(0, 0), // re-set per attempt below
212            kind,
213        };
214        let mut scored: Vec<ScoredCandidate> = Vec::new();
215        for attempt in canonical_attempts {
216            let candidate = MarkingCandidate {
217                span: Span::new(0, attempt.bytes.len()),
218                ..synthetic_candidate
219            };
220            let Ok(mut parsed) = parser.parse(&candidate, &attempt.bytes) else {
221                continue;
222            };
223
224            // 3a. Reject partial canonicalizations. Any
225            //     `TokenKind::Unknown` span surviving strict parse of
226            //     the canonicalized bytes means the decoder passed an
227            //     uncorrectable token through unchanged (see Case 4
228            //     in `fuzzy_correct_tokens`). Accepting such a
229            //     candidate would silently drop the unknown token
230            //     from `token_spans` in step 3b and fabricate a
231            //     partial marking — e.g., `(SECRET//WIBBLE)` would
232            //     land as `classification: Some(Secret)` with
233            //     WIBBLE simply discarded. The correct behavior is
234            //     to discard the candidate so the decoder's output
235            //     set stays honest: either a token fully resolves or
236            //     the whole candidate goes away.
237            let has_unknown_token = parsed
238                .attrs
239                .token_spans
240                .iter()
241                .any(|s| matches!(s.kind, marque_ism::TokenKind::Unknown));
242            if has_unknown_token {
243                continue;
244            }
245
246            // 3b. Span-offset contract: `IsmAttributes::token_spans`
247            //     returned by the strict parser carry offsets into
248            //     `attempt.bytes` (the canonicalized buffer), NOT the
249            //     original `bytes` slice the caller passed to
250            //     `recognize()`. Propagating those spans would
251            //     violate the [`Recognizer`] contract — "spans are by
252            //     offset into [the input] buffer" — and misplace
253            //     downstream diagnostics/fixes whenever
254            //     canonicalization changed spacing, delimiter form,
255            //     token order, or token length (e.g., `COMINT` → `SI`
256            //     changes a 6-byte token to 2 bytes). Until we have a
257            //     proper source↔canonical span map, decoder-produced
258            //     markings must not carry token spans; downstream
259            //     CAPCO rules that consume `attrs.token_spans` fall
260            //     back to marking-level spans for decoder fixes.
261            //
262            //     Clearing happens AFTER the Unknown-token check
263            //     above — we need the spans to filter partial
264            //     canonicalizations, but must drop them before the
265            //     marking leaves the decoder.
266            parsed.attrs.token_spans = Box::new([]);
267            let marking = CapcoMarking::new(parsed.attrs);
268
269            // 3c. The strict parser is lenient — it accepts any
270            //     `BYTES//BYTES` shape and emits an `IsmAttributes`
271            //     with empty fields when nothing is recognized. Drop
272            //     such trivial parses so the decoder doesn't
273            //     fabricate a marking for prose like `FROBNITZ//WIBBLE`.
274            if !is_nontrivial_marking(&marking) {
275                continue;
276            }
277
278            // 3c-bis. Reject `Us(Restricted)` markings. Same rationale
279            //         as the strict recognizer (see [`is_us_restricted`]):
280            //         RESTRICTED is by definition a non-US classification,
281            //         so any candidate the parser landed on the US axis
282            //         is invalid regardless of what other tokens
283            //         (`fgi_marker`, dissem controls, REL TO) accompany
284            //         it. Real foreign-origin RESTRICTED markings parse
285            //         to `Fgi(...)` / `Nato(...)` / `Joint(...)` and
286            //         pass through.
287            if is_us_restricted(&marking) {
288                continue;
289            }
290
291            // 3d. FR-011 — drop candidates below the page's strict
292            //     classification floor.
293            if let Some(floor) = cx.classification_floor
294                && !meets_classification_floor(&marking, floor)
295            {
296                continue;
297            }
298
299            // 3e. Portion/Banner shapes REQUIRE a classification to
300            //     be a meaningful marking. The strict parser is
301            //     lenient — `(YS//NF)` parses to a marking with
302            //     `classification: None, dissem_controls: [Nf]`
303            //     because `YS` doesn't resolve to any
304            //     [`Classification`] variant. The decoder's
305            //     bag-of-tokens scorer rewards FEWER negative-log-
306            //     prior tokens, so without this filter the
307            //     no-classification candidate would outrank a
308            //     heuristic-corrected `(TS//NF)` candidate that
309            //     contributed both `TOP SECRET` and `NF` priors.
310            //
311            //     For CAB shapes the analogous completeness check
312            //     is "any of classified_by / derived_from /
313            //     declassify_on / declass_exemption is set" —
314            //     [`is_nontrivial_marking`] above already covers
315            //     that for the CAB code path. For
316            //     [`MarkingType::PageBreak`] this filter is
317            //     intentionally a no-op: page breaks are control
318            //     shapes the decoder shouldn't be asked to recover.
319            if matches!(kind, MarkingType::Portion | MarkingType::Banner)
320                && marking.0.classification.is_none()
321            {
322                continue;
323            }
324
325            // 4. Score: compute prior and posterior separately. The
326            //    prior is the sum of baked corpus log-priors over the
327            //    marking's canonical tokens; the posterior is the
328            //    prior plus the per-feature log-odds deltas recorded
329            //    during canonicalization. `Candidate::prior_log_odds`
330            //    is documented as the prior alone (see
331            //    `crates/scheme/src/ambiguity.rs`) and is combined
332            //    additively with `EvidenceFeature.log_odds` by any
333            //    downstream resolver — storing the full posterior
334            //    there would double-count the features once the
335            //    resolver re-adds them. Internal decoder sort /
336            //    threshold decisions use the posterior.
337            let (prior, posterior) = score_candidate(&attempt, &marking);
338            scored.push(ScoredCandidate {
339                marking,
340                prior,
341                posterior,
342                canonical_bytes: attempt.bytes.into_boxed_slice(),
343                features: attempt.features,
344                fix_source: attempt.fix_source,
345            });
346        }
347
348        if scored.is_empty() {
349            return Parsed::Ambiguous {
350                candidates: Vec::new(),
351            };
352        }
353
354        // 5. Drop any candidate with a non-finite posterior, sort
355        //    descending, keep top K=8.
356        //
357        // NaN posteriors should be impossible —
358        // `MISSING_TOKEN_LOG_PRIOR = -12.0` and every feature delta
359        // is a finite constant — but a future scoring change could
360        // introduce a NaN-producing codepath. Under `f32::total_cmp`
361        // with the descending comparator (`b.total_cmp(&a)`), `+NaN`
362        // would sort *ahead* of every finite posterior and become the
363        // "top" candidate — its NaN posterior would then propagate
364        // into `log_margin` and `DecoderProvenance::posterior`, where
365        // `Confidence::validate` would later panic at audit-record
366        // promotion. Filter non-finite candidates out before the sort
367        // so the dispatch can never see one.
368        //
369        // `debug_assert` keeps the original assumption (decoder code
370        // does not produce NaN today) loud in dev builds; the filter
371        // is the production safeguard for if that assumption ever
372        // breaks silently.
373        debug_assert!(
374            scored.iter().all(|c| c.posterior.is_finite()),
375            "decoder produced non-finite posterior — invariant violated"
376        );
377        scored.retain(|c| c.posterior.is_finite());
378        if scored.is_empty() {
379            return Parsed::Ambiguous {
380                candidates: Vec::new(),
381            };
382        }
383        scored.sort_by(|a, b| b.posterior.total_cmp(&a.posterior));
384        scored.truncate(K_MAX_CANDIDATES);
385
386        // 6. Decision: top-over-runner-up log margin on the posterior.
387        let top_score = scored[0].posterior;
388        let runner_up_score = scored
389            .get(1)
390            .map(|c| c.posterior)
391            .unwrap_or(f32::NEG_INFINITY);
392        let log_margin = top_score - runner_up_score;
393
394        if scored.len() == 1 || log_margin >= UNAMBIGUOUS_LOG_MARGIN {
395            // Move the top candidate out so we can hand `canonical_bytes`
396            // and `features` directly to provenance without an extra
397            // clone — the marking carries the heaviest payload and we
398            // only need it once.
399            let top = scored.swap_remove(0);
400            // `runner_up_ratio = exp(log_margin)`, but a sufficiently
401            // separated top vs. runner-up overflows `f32::exp()` to
402            // `+∞` (anything past `log_margin ≈ 88.7` saturates), and
403            // `Confidence::validate` would then reject the resulting
404            // record as non-finite — making `FixProposal::new` panic at
405            // the audit boundary on extreme score separations. Saturate
406            // at `f32::MAX` so the audit record carries "the ratio is
407            // enormous" instead of crashing the engine.
408            let runner_up_ratio = if runner_up_score.is_finite() {
409                let ratio = log_margin.exp();
410                Some(if ratio.is_finite() { ratio } else { f32::MAX })
411            } else {
412                None
413            };
414            let mut marking = top.marking;
415            marking.1 = Some(DecoderProvenance::new(
416                top.canonical_bytes,
417                top.posterior,
418                runner_up_ratio,
419                top.features
420                    .into_iter()
421                    .map(|f| FeatureContribution {
422                        id: f.id,
423                        delta: f.delta,
424                    })
425                    .collect::<Vec<_>>()
426                    .into_boxed_slice(),
427                top.fix_source,
428            ));
429            return Parsed::Unambiguous(marking);
430        }
431
432        // Ambiguous: return the whole K-truncated set with per-feature
433        // evidence so the engine can surface a user-visible diagnostic.
434        // `prior_log_odds` carries the prior alone; `evidence` carries
435        // the feature deltas. A resolver that re-computes the
436        // posterior as `prior + Σ evidence.log_odds` reproduces the
437        // decoder's internal score without double-counting.
438        Parsed::Ambiguous {
439            candidates: scored
440                .into_iter()
441                .map(|s| Candidate {
442                    marking: s.marking,
443                    evidence: s.features.iter().map(feature_entry_to_evidence).collect(),
444                    prior_log_odds: s.prior,
445                })
446                .collect(),
447        }
448    }
449}
450
/// One scored candidate kept in the decoder's working set.
///
/// `prior` and `posterior` are tracked separately so
/// `Candidate::prior_log_odds` can carry the prior alone (per the
/// trait-level contract in `crates/scheme/src/ambiguity.rs`) while
/// internal sort / threshold decisions use the posterior.
struct ScoredCandidate {
    /// Marking built from the strict parse of `canonical_bytes`. Its
    /// token spans have already been cleared by the time it is stored
    /// here, because they index the canonical buffer rather than the
    /// caller's input.
    marking: CapcoMarking,
    /// Sum of baked corpus log-priors over the marking's canonical
    /// tokens. No feature deltas included.
    prior: f32,
    /// `prior + Σ feature.delta`. Used for sorting and threshold
    /// comparisons inside the decoder; not stored in the emitted
    /// `Candidate` record.
    posterior: f32,
    /// Canonical byte string the strict parser accepted for this
    /// candidate. Threaded into [`DecoderProvenance::canonical_bytes`]
    /// when this candidate wins the Unambiguous collapse, so the
    /// engine can emit the decoder fix from the original mangled
    /// bytes to this canonical form (Phase 4 PR-4b, T068).
    canonical_bytes: Box<[u8]>,
    /// Feature trace carried over from the originating
    /// [`CanonicalAttempt`]. Projected to `FeatureContribution`s for
    /// provenance on the Unambiguous path, or to [`EvidenceFeature`]s
    /// (via [`feature_entry_to_evidence`]) on the Ambiguous path.
    features: Vec<FeatureEntry>,
    /// Provenance discriminator carried from the originating
    /// [`CanonicalAttempt`]. The engine maps this to
    /// [`Severity::Fix`](marque_rules::Severity::Fix) for
    /// `DecoderPosterior` and
    /// [`Severity::Warn`](marque_rules::Severity::Warn) for
    /// `DecoderClassificationHeuristic` (issue #133 PR 2).
    fix_source: marque_rules::FixSource,
}
481
/// One feature recorded during candidate generation, paired with its
/// log-odds contribution. The decoder accumulates these to reconstruct
/// `Confidence::features` at audit-emit time.
#[derive(Debug, Clone, Copy)]
struct FeatureEntry {
    /// Which member of the closed [`FeatureId`] enum this entry records.
    id: FeatureId,
    /// Log-odds delta this feature contributes on top of the prior.
    /// Surfaced as `FeatureContribution::delta` in provenance and as
    /// `EvidenceFeature::log_odds` in ambiguous candidates.
    delta: f32,
}
490
491/// Project a `FeatureEntry` onto the wire-shape [`EvidenceFeature`].
492///
493/// Routes the label through [`FeatureId::as_str`] — the single source
494/// of truth for the FeatureId → audit-record-string registry declared
495/// in `crates/rules/src/confidence.rs`. Lifted out of the inline
496/// closure in [`DecoderRecognizer::recognize`] so the projection is
497/// directly testable: a divergent local label registry (the PR #142 H2
498/// pre-fix shape) would now fail
499/// [`tests::feature_entry_to_evidence_uses_canonical_label_registry`]
500/// rather than going unnoticed because the dispatcher discards
501/// `Parsed::Ambiguous` results today.
502fn feature_entry_to_evidence(f: &FeatureEntry) -> EvidenceFeature {
503    EvidenceFeature {
504        label: f.id.as_str(),
505        log_odds: f.delta,
506    }
507}
508
/// A canonicalization attempt: the byte string the decoder will hand
/// to the strict parser, plus the features that transformation
/// represents. Zero or more attempts are generated per observed input.
struct CanonicalAttempt {
    /// Canonicalized bytes handed to the strict parser. Also the
    /// deduplication key: a later attempt with identical bytes is
    /// dropped at emit time, so the first emitter's trace and
    /// provenance win.
    bytes: Vec<u8>,
    /// Transforms applied to reach `bytes`, each with its log-odds
    /// delta. Scored into the posterior and carried into the audit
    /// trace of whichever candidate this attempt becomes.
    features: Vec<FeatureEntry>,
    /// Which decoder path produced this attempt. Defaults to
    /// [`marque_rules::FixSource::DecoderPosterior`] for the standard
    /// vocab-based pipeline (delimiter normalization, fuzzy
    /// correction, token reorder, superseded-token replacement).
    /// The position-aware classification heuristic emits attempts
    /// with [`marque_rules::FixSource::DecoderClassificationHeuristic`]
    /// (issue #133 PR 2) so the engine can downgrade to
    /// [`marque_rules::Severity::Warn`] and cap
    /// [`marque_rules::Confidence::rule`].
    fix_source: marque_rules::FixSource,
}
526
527// ---------------------------------------------------------------------------
528// Marking-type inference (mirrors `recognizer::infer_marking_type`)
529// ---------------------------------------------------------------------------
530
531/// Infer a [`MarkingType`] from the shape of `bytes`.
532///
533/// Same heuristic as the strict recognizer — portion on leading `(`,
534/// CAB on authority-head prefix, banner otherwise. Lives locally so
535/// the decoder doesn't need to poke into `StrictRecognizer` internals.
536fn infer_marking_type(bytes: &[u8]) -> Option<MarkingType> {
537    let first = bytes.iter().copied().find(|&b| !b.is_ascii_whitespace())?;
538    if first == b'(' {
539        return Some(MarkingType::Portion);
540    }
541    if is_cab_head(bytes) {
542        return Some(MarkingType::Cab);
543    }
544    Some(MarkingType::Banner)
545}
546
/// Report whether `bytes` begins with a classification-authority-block
/// head once leading whitespace is skipped.
///
/// The match is exact and case-sensitive against `Classified By:`,
/// `Derived From:` and `Declassify On:`. Input that is not valid UTF-8
/// is never a CAB head.
fn is_cab_head(bytes: &[u8]) -> bool {
    const CAB_HEADS: [&str; 3] = ["Classified By:", "Derived From:", "Declassify On:"];

    std::str::from_utf8(bytes).is_ok_and(|text| {
        let head = text.trim_start();
        CAB_HEADS.iter().any(|prefix| head.starts_with(*prefix))
    })
}
556
557// ---------------------------------------------------------------------------
558// Candidate byte generation
559// ---------------------------------------------------------------------------
560
561/// Generate bounded canonical-byte candidates from a mangled input.
562///
563/// Each returned [`CanonicalAttempt`] is a `Vec<u8>` the decoder will
564/// hand to the strict parser. Attempts cover the transforms named in
565/// the module docs:
566///
567/// - Case normalization (`secret//noforn` → `SECRET//NOFORN`).
568/// - Garbled-delimiter rewrite (`S ∕∕ NOFORN` → `S//NOFORN`).
569/// - Per-token fuzzy correction (edit-distance ≤ 2 via
570///   [`marque_core::fuzzy::FuzzyVocabMatcher`]).
571/// - Superseded-token replacement (`COMINT` → `SI`).
572/// - Token reordering — tried when categorical ordering is the obvious
573///   deviation (e.g., portion `A//B` where B is a classification and
574///   A isn't).
575///
576/// Bounded by [`K_MAX_CANDIDATES`] × 2 to keep the strict-parse pass
577/// bounded; duplicates (different feature traces producing the same
578/// canonical bytes) are deduplicated at emit time.
579fn generate_candidate_bytes(bytes: &[u8]) -> Vec<CanonicalAttempt> {
580    let Ok(text) = std::str::from_utf8(bytes) else {
581        return Vec::new();
582    };
583
584    // Strip surrounding whitespace; preserve leading `(` for portion
585    // detection so the strict parser's portion path stays keyed off
586    // the same first-non-whitespace byte the recognizer saw.
587    let trimmed = text.trim();
588    if trimmed.is_empty() {
589        return Vec::new();
590    }
591
592    let mut attempts: Vec<CanonicalAttempt> = Vec::new();
593    let mut emit =
594        |bytes: Vec<u8>, features: Vec<FeatureEntry>, fix_source: marque_rules::FixSource| {
595            // Hard cap at K_MAX_CANDIDATES × 2 — guarantees the strict-parse
596            // work downstream is bounded even if new transform stages are added.
597            if attempts.len() >= K_MAX_CANDIDATES * 2 {
598                return;
599            }
600            // Dedup by the canonical byte string — different transform
601            // sequences can converge on the same output. Emit-first wins:
602            // the standard vocab-based attempts are emitted before the
603            // heuristic attempt, so a heuristic candidate with bytes that
604            // converge on a vocab-based result is dropped here, preserving
605            // the more authoritative `FixSource::DecoderPosterior`
606            // provenance.
607            if !attempts.iter().any(|a| a.bytes == bytes) {
608                attempts.push(CanonicalAttempt {
609                    bytes,
610                    features,
611                    fix_source,
612                });
613            }
614        };
615
616    // ---- Raw: just trim + normalize delimiters/case. --------------
617    let (normalized, mut delim_features) = normalize_delimiters_and_case(trimmed);
618
619    // ---- REL TO structural repair (issue #133 PR 9) — applied as
620    //      PREPROCESSING (before fuzzy correction) rather than as a
621    //      competing candidate emission. All four PR-9 patterns are
622    //      safe to apply unconditionally:
623    //
624    //      - Patterns 1/2 (`REL OT ` / `RELT O ` → `REL TO `) are
625    //        literal-shape transforms. Neither pattern appears in any
626    //        valid CAPCO text — REL has exactly two valid extensions
627    //        (`REL TO` and `RELIDO`) — so the byte replacement is
628    //        collision-free.
629    //      - Patterns 3/4 (`A US` → `AUS`, `AU,S ` → `AUS, `) are
630    //        trigraph-guarded inside a `REL TO ` block: the fix only
631    //        fires when the joined 3-letter string is a known trigraph
632    //        AND the shorter prefix alone is not, so a false positive
633    //        would require the trigraph dictionary itself to disagree
634    //        with reality.
635    //
636    //      Applying as preprocessing avoids two scoring problems that
637    //      a separate-candidate emission would hit: (a) fuzzy
638    //      correction would silently rewrite `RELT` → `REL` before
639    //      pattern 2's header normalize could fire, and (b) REL TO
640    //      trigraphs do NOT contribute to the prior in
641    //      `canonical_tokens_for` (only classification, SCI, dissem,
642    //      NIC, AEA, FGI do — see issue #186 for the corpus-weighted
643    //      trigraph priors followup), so a separate fix candidate
644    //      would tie with the raw on prior and lose on emit-order.
645    //      Preprocessing eliminates the competing-raw-candidate
646    //      problem entirely.
647    //
648    //      When structural repair fires, push a `BaseRateCommonMarking`
649    //      feature onto `delim_features` so every candidate derived
650    //      from the repaired text inherits the marker. This mirrors
651    //      `try_insert_delimiter` and `try_sar_indicator_repair`
652    //      (which add their own per-candidate `BaseRateCommonMarking`)
653    //      and ensures the audit/provenance trace reflects that the
654    //      input required cleanup beyond delimiter/case normalization.
655    //      No dedicated `FeatureId` for structural repair exists in
656    //      the audit schema (`marque-mvp-2`); reusing
657    //      `BaseRateCommonMarking` keeps the schema closed and
658    //      composes additively with the other normalization paths
659    //      that share the same id.
660    let repaired_text = match try_rel_to_structural_repair(&normalized) {
661        Some(repaired) => {
662            delim_features.push(FeatureEntry {
663                id: FeatureId::BaseRateCommonMarking,
664                delta: -0.3,
665            });
666            repaired
667        }
668        None => normalized,
669    };
670
671    // ---- SCI delimiter repair (issue #198, #133 PR 10). Same
672    //      preprocessing-shape as the REL TO repair above: rewrites
673    //      concatenated CVE compounds (`HCSP → HCS-P`), missing
674    //      slashes between bare control systems (`SITK → SI/TK`), and
675    //      wrong-delimiter cases (`SI-TK → SI/TK`). All targets live
676    //      in `CVEnumISMSCIControls.xml` — no agency vocab. Sub-
677    //      compartments and unregistered compartments are out of
678    //      scope (issue #180). Push a `BaseRateCommonMarking`
679    //      penalty for the same reason as REL TO repair: a candidate
680    //      that arrived clean should outrank one that needed
681    //      structural cleanup when both produce the same shape.
682    let repaired_text = match try_sci_delimiter_repair(&repaired_text) {
683        Some(repaired) => {
684            delim_features.push(FeatureEntry {
685                id: FeatureId::BaseRateCommonMarking,
686                delta: -0.3,
687            });
688            repaired
689        }
690        None => repaired_text,
691    };
692
693    // ---- Per-token fuzzy correction on the repaired text. --------
694    let vocab = CapcoTokenSet.correction_vocab();
695    let matcher = FuzzyVocabMatcher::new(vocab);
696    let (fuzzy_corrected, fuzzy_features) = fuzzy_correct_tokens(&repaired_text, &matcher);
697
698    // Emit the straightforward "normalize + fuzzy-correct" attempt
699    // first — this covers typos (T046) and case/delimiter mangling
700    // by default.
701    let mut features = delim_features.clone();
702    features.extend(fuzzy_features.iter().copied());
703    emit(
704        fuzzy_corrected.clone().into_bytes(),
705        features,
706        marque_rules::FixSource::DecoderPosterior,
707    );
708
709    // ---- Also attempt a token-reorder pass. The reorder is gentle:
710    //      inside each `//`-separated segment, if the segment's tokens
711    //      look like they belong to multiple categories, we try a
712    //      canonical category ordering (classification first).
713    if let Some(reordered) = try_canonical_reorder(&fuzzy_corrected) {
714        let mut features = delim_features.clone();
715        features.extend(fuzzy_features.iter().copied());
716        features.push(FeatureEntry {
717            id: FeatureId::TokenReorder,
718            delta: -0.4,
719        });
720        emit(
721            reordered.into_bytes(),
722            features,
723            marque_rules::FixSource::DecoderPosterior,
724        );
725    }
726
727    // ---- Non-US prefix insertion. For bare non-US markings that
728    //      arrive with no `//` at all (e.g., `NS`, `JOINT S GBR USA`,
729    //      `CAN S`), emit a `//{body}` candidate so the strict parser
730    //      enters the non-US classification code path. The reorder pass
731    //      above handles inputs that already contain `//` but are
732    //      missing the leading empty-US-slot prefix.
733    if let Some(prefixed) = try_add_non_us_prefix(&fuzzy_corrected) {
734        let mut features = delim_features.clone();
735        features.extend(fuzzy_features.iter().copied());
736        features.push(FeatureEntry {
737            id: FeatureId::TokenReorder,
738            delta: -0.4,
739        });
740        emit(
741            prefixed.into_bytes(),
742            features,
743            marque_rules::FixSource::DecoderPosterior,
744        );
745    }
746
747    // ---- Missing-delimiter insertion (issue #133 PR 3). Walks the
748    //      fuzzy-corrected text, inserts `//` at category-transition
749    //      whitespace gaps. Tagged with `FixSource::DecoderPosterior`
750    //      because the recovery is structural (missing punctuation),
751    //      not a probabilistic guess like the classification heuristic
752    //      below — auto-applies at default threshold when its strict
753    //      parse + scoring outranks competing candidates.
754    if let Some(delim_inserted) = try_insert_delimiter(&fuzzy_corrected) {
755        let mut features = delim_features.clone();
756        features.extend(fuzzy_features.iter().copied());
757        // No FeatureId for delimiter insertion in the audit schema.
758        // Reuse `BaseRateCommonMarking` with a small negative delta
759        // to record that this attempt required cleanup beyond the
760        // raw input — keeps the canonical-arrived-clean attempt
761        // ranked higher when both produce the same shape.
762        features.push(FeatureEntry {
763            id: FeatureId::BaseRateCommonMarking,
764            delta: -0.3,
765        });
766        emit(
767            delim_inserted.into_bytes(),
768            features,
769            marque_rules::FixSource::DecoderPosterior,
770        );
771    }
772
773    // ---- SAR indicator-keyword structural repair (issue #133 PR 6).
774    //      Recovers `USAR-BP-J12...` (stray prefix on the SAR
775    //      indicator) and `SARBP` (missing hyphen between indicator
776    //      and program identifier). Same provenance / penalty story
777    //      as `try_insert_delimiter`: a `BaseRateCommonMarking` delta
778    //      records that the candidate required cleanup beyond raw
779    //      input, so a canonical-arrived-clean candidate beats a
780    //      SAR-repaired one with the same final shape.
781    if let Some(sar_repaired) = try_sar_indicator_repair(&fuzzy_corrected) {
782        let mut features = delim_features.clone();
783        features.extend(fuzzy_features.iter().copied());
784        features.push(FeatureEntry {
785            id: FeatureId::BaseRateCommonMarking,
786            delta: -0.3,
787        });
788        emit(
789            sar_repaired.into_bytes(),
790            features,
791            marque_rules::FixSource::DecoderPosterior,
792        );
793    }
794
795    // ---- Stray-character `/X/` recovery (issue #133 PR 7). Walks
796    //      the fuzzy-corrected text looking for the pattern
797    //      `<alnum>/<single_alnum_char>/<alnum>` — three transforms
798    //      emitted per match (drop X, attach X to right token,
799    //      attach X to left token). Step 3a's Unknown-token filter
800    //      acts as the natural disambiguator: only the transform
801    //      that produces a recognizable token survives. See
802    //      [`try_collapse_stray_char_slash`] for the recovery
803    //      shapes (`SI/U/NOFORN` → drop, `SI/N/OFORN` →
804    //      right-attach, `SECRE/T/REL TO` → left-attach).
805    for candidate in try_collapse_stray_char_slash(&fuzzy_corrected) {
806        let mut features = delim_features.clone();
807        features.extend(fuzzy_features.iter().copied());
808        features.push(FeatureEntry {
809            id: FeatureId::BaseRateCommonMarking,
810            delta: -0.3,
811        });
812        emit(
813            candidate.into_bytes(),
814            features,
815            marque_rules::FixSource::DecoderPosterior,
816        );
817    }
818
819    // ---- REL TO trigraph fuzzy-priors expansion (issue #233).
820    //      The standard fuzzy path in `fuzzy_correct_tokens` operates
821    //      against `correction_vocab()`, which deliberately excludes
822    //      country trigraphs (see the comment on `ALL_CVE_TOKENS` in
823    //      `crates/ism/build.rs` and the design rationale in
824    //      `EXTENDED_CORRECTION_VOCAB`). Trigraphs live in a separate
825    //      `TRIGRAPHS` slice reached via `is_trigraph`. So an unknown
826    //      3-char REL TO entry like `USB` doesn't get any fuzzy
827    //      correction — the standard fuzzy walk has nothing to match
828    //      against. The strict REL TO parser previously dropped
829    //      unknown entries silently; issue #233 makes
830    //      `parse_rel_to_with_spans` emit `TokenKind::Unknown` instead
831    //      so the dispatcher's step 3a rejects the "drop USB"
832    //      candidate.
833    //
834    //      With unknown entries no longer silently absorbed, the
835    //      candidate set must include real trigraph alternates for
836    //      the dispatcher to choose between. This block walks each
837    //      `REL TO ` block, finds 3-char entries that aren't valid
838    //      trigraphs, and emits one canonical-byte alternate per
839    //      candidate from a fuzzy match against the TRIGRAPHS
840    //      slice. The structural strict parse +
841    //      `score_candidate` (which sums `country_code_log_prior`
842    //      over the parsed `rel_to` slice) then picks the right
843    //      winner: USA dominates UZB by ~7 nats, far above
844    //      `UNAMBIGUOUS_LOG_MARGIN`.
845    //
846    //      Each alternate carries an `EditDistance1` /
847    //      `EditDistance2` feature so the audit trail records the
848    //      fuzzy work, plus a zero-delta `BaseRateCommonMarking`
849    //      feature whose role is purely audit-trail provenance —
850    //      "country-code priors were consulted on this candidate".
851    //      The actual scoring weight comes from `score_candidate`
852    //      summing `country_code_log_prior` over `attempt.rel_to`;
853    //      adding a non-zero delta here would double-count. The
854    //      other structural-cleanup paths in this file use `-0.3`
855    //      because they have no parallel score-time prior to back
856    //      them up; the trigraph path does, so the audit feature
857    //      is informational only. No new `FeatureId` variant —
858    //      adding one would bump the audit schema. Reusing
859    //      `BaseRateCommonMarking` matches the variant's existing
860    //      doc ("the candidate's base rate in the target corpus
861    //      dominates the posterior").
862    let trigraph_matcher = FuzzyVocabMatcher::new(marque_ism::TRIGRAPHS);
863    for (alt_text, edit_feature) in
864        try_rel_to_fuzzy_trigraph_candidates(&fuzzy_corrected, &trigraph_matcher)
865    {
866        let mut features = delim_features.clone();
867        features.extend(fuzzy_features.iter().copied());
868        features.push(edit_feature);
869        // Trigraph-prior acknowledgement (see comment above for the
870        // FeatureId reuse rationale + zero-delta justification).
871        features.push(FeatureEntry {
872            id: FeatureId::BaseRateCommonMarking,
873            delta: 0.0,
874        });
875        emit(
876            alt_text.into_bytes(),
877            features,
878            marque_rules::FixSource::DecoderPosterior,
879        );
880    }
881
882    // ---- REL TO USA-injection for short first entries (issue #234 PR-B).
883    //      Complementary to PR-A above: PR-A fuzzy-matches 3-char REL TO
884    //      entries; PR-B handles 1-2 char first entries that are below
885    //      `MIN_FUZZY_LEN`. The §H.8 p151 USA-first invariant gives us a
886    //      strong structural signal that fuzzy matching cannot exploit
887    //      on inputs that short — `SA → USA`, `S → USA`, etc. The
888    //      `BaseRateCommonMarking` audit delta keeps the audit schema
889    //      closed (no new `FeatureId` variant); see the doc on
890    //      `try_rel_to_usa_injection_candidates` for the rationale.
891    for (alt_text, prior_feature) in try_rel_to_usa_injection_candidates(&fuzzy_corrected) {
892        let mut features = delim_features.clone();
893        features.extend(fuzzy_features.iter().copied());
894        features.push(prior_feature);
895        emit(
896            alt_text.into_bytes(),
897            features,
898            marque_rules::FixSource::DecoderPosterior,
899        );
900    }
901
902    // ---- Position-aware classification heuristic (issue #133 PR 2).
903    //      Runs LAST so the dedup-keep-first guard above lets a
904    //      vocab-based attempt with the same canonical bytes win the
905    //      provenance contest — the heuristic only "wins" when no
906    //      vocab path produces the same shape.
907    //
908    //      Scoring intentionally adds NO `EditDistance1` penalty.
909    //      The heuristic's value comes from RECOGNIZING a
910    //      classification token where the vocab-only path would
911    //      leave the slot as `classification: None`. The added prior
912    //      contribution from the recognized classification (e.g.,
913    //      `log_prior("TOP SECRET")`) is what should put the
914    //      heuristic candidate ahead of the no-classification fuzzy
915    //      fallback. An EditDistance penalty would push the
916    //      heuristic candidate BELOW the no-classification candidate
917    //      and the fuzzy one would win — defeating the heuristic's
918    //      purpose. The audit-record provenance still distinguishes
919    //      this path through `FixSource::DecoderClassificationHeuristic`.
920    if let Some(heuristic_bytes) = try_classification_heuristic_fix(&fuzzy_corrected) {
921        let mut features = delim_features.clone();
922        features.extend(fuzzy_features.iter().copied());
923        emit(
924            heuristic_bytes.into_bytes(),
925            features,
926            marque_rules::FixSource::DecoderClassificationHeuristic,
927        );
928    }
929
930    attempts
931}
932
/// Expose the decoder's canonicalized byte attempts for diagnostics.
///
/// Runs the real [`generate_candidate_bytes`] over `bytes` and hands
/// back only the byte payload of each attempt, preserving emit order.
/// Feature traces and the internal [`CanonicalAttempt`] type stay
/// private on purpose: the diagnostic cares solely about the bytes the
/// strict parser will be asked to parse.
///
/// Compiled only under the `decoder-harness` feature, so production
/// builds never see it. Its one consumer is
/// `crates/engine/tests/decoder_diagnostic.rs` (issue #133 root-cause
/// tracing). Delegating to the production generator — instead of a
/// hand-rolled copy inside the diagnostic — means the two cannot drift
/// apart.
#[cfg(feature = "decoder-harness")]
pub fn diagnostic_canonical_attempts(bytes: &[u8]) -> Vec<Vec<u8>> {
    let mut payloads = Vec::new();
    for attempt in generate_candidate_bytes(bytes) {
        payloads.push(attempt.bytes);
    }
    payloads
}
952
953/// Normalize delimiters and case on a trimmed input.
954///
955/// - Fullwidth slash variants (`∕∕`, `/ /`, ` / / `, spaced `//`) all
956///   collapse to `//`.
957/// - ASCII alphabetic characters are upper-cased; the CAPCO grammar
958///   is case-sensitive uppercase (§B).
959/// - Leading `(` and trailing `)` are preserved so portion detection
960///   still works.
961///
962/// Returns the normalized string and the features that were applied.
963/// When normalization was actually needed, a `BaseRateCommonMarking`
964/// feature is recorded with a negative delta — the candidate pays a
965/// small penalty for having required case- or delimiter-cleanup
966/// rather than arriving in canonical form. A candidate that
967/// normalized cleanly and also resolved its tokens via fuzzy
968/// correction will still outrank a candidate that arrived dirty,
969/// but a canonical-from-the-start candidate beats both.
970fn normalize_delimiters_and_case(text: &str) -> (String, Vec<FeatureEntry>) {
971    let mut features = Vec::new();
972
973    // Collapse fullwidth and spaced slash variants.
974    // The order matters: we want multi-char sequences first.
975    let mut normalized: String = text.to_owned();
976    let replacements = [
977        ("∕∕", "//"),
978        (" // ", "//"),
979        ("// ", "//"),
980        (" //", "//"),
981        ("/ / ", "//"),
982        (" / / ", "//"),
983        ("/ /", "//"),
984    ];
985    let mut delim_changed = false;
986    for (from, to) in replacements {
987        if normalized.contains(from) {
988            normalized = normalized.replace(from, to);
989            delim_changed = true;
990        }
991    }
992
993    // Case normalization. If the input was all-lowercase or mixed-case
994    // (Title Case), uppercasing is a significant canonicalization the
995    // decoder flags (via the `BaseRateCommonMarking` feature below)
996    // so the posterior reflects that the candidate required cleanup.
997    let had_lowercase = normalized.chars().any(|c| c.is_ascii_lowercase());
998    if had_lowercase {
999        normalized = normalized.to_ascii_uppercase();
1000    }
1001
1002    if delim_changed || had_lowercase {
1003        // Record a `BaseRateCommonMarking` feature with a penalty
1004        // delta. The feature doesn't fit into one of the sharper
1005        // features (`EditDistance*`, `TokenReorder`,
1006        // `SupersededToken`), but it flags that we had to massage
1007        // the input — delimiters were non-canonical, or case was
1008        // wrong. A small negative delta means a canonical-input
1009        // candidate outranks an otherwise-equivalent normalized one,
1010        // which is the intent: "arrives clean" should be preferred
1011        // over "needed cleanup."
1012        features.push(FeatureEntry {
1013            id: FeatureId::BaseRateCommonMarking,
1014            delta: -0.3,
1015        });
1016    }
1017
1018    (normalized, features)
1019}
1020
1021/// Fuzzy-correct each whitespace/delimiter-separated token in `text`.
1022///
1023/// Tokens that are already canonical are passed through. Unknown
1024/// tokens are run through [`FuzzyVocabMatcher`]; if a correction is
1025/// unambiguous the replacement lands in the output and the appropriate
1026/// `EditDistance1`/`EditDistance2` feature is recorded. If no
1027/// correction is available, the token is dropped into the output
1028/// unchanged.
1029///
1030/// Note on pass-through safety: `marque_core::Parser` is lenient — it
1031/// does NOT reject the whole parse when an unknown token appears, it
1032/// emits the token as a `TokenKind::Unknown` span instead. So
1033/// dropping an uncorrectable token through this step does not by
1034/// itself reject the candidate. The decoder's outer loop
1035/// (`DecoderRecognizer::recognize` step 3a) checks for any Unknown
1036/// span on the strict-parse result and discards such candidates
1037/// before they reach scoring — that is where partial-canonicalization
1038/// candidates get filtered out.
1039///
1040/// Also consults [`SUPERSEDED_TOKEN_MAP`] for CAPCO-2016 retirement
1041/// pairs (currently just `COMINT` → `SI`), recording the
1042/// `SupersededToken` feature when triggered.
1043fn fuzzy_correct_tokens(
1044    text: &str,
1045    matcher: &FuzzyVocabMatcher<'_>,
1046) -> (String, Vec<FeatureEntry>) {
1047    let mut features = Vec::new();
1048    let mut out = String::with_capacity(text.len());
1049    let mut rest = text;
1050
1051    // We walk the text segment-by-segment, preserving the `//`,
1052    // `-`, `(`, `)`, `,`, and whitespace delimiters verbatim. Tokens
1053    // are the maximal runs of ASCII alphanumerics (plus `-` when it
1054    // appears between alphanumerics, to keep compounds like `SI-G`
1055    // intact).
1056    while !rest.is_empty() {
1057        // Take the non-token prefix (delimiters/whitespace/punct).
1058        let non_token_len = rest
1059            .chars()
1060            .take_while(|c| !is_token_char(*c))
1061            .map(|c| c.len_utf8())
1062            .sum::<usize>();
1063        if non_token_len > 0 {
1064            out.push_str(&rest[..non_token_len]);
1065            rest = &rest[non_token_len..];
1066            continue;
1067        }
1068        // Take the token: alnum + internal `-`.
1069        let token_len = scan_token(rest);
1070        if token_len == 0 {
1071            // Should not happen given the non-token prefix branch,
1072            // but guard against infinite loops on pathological input.
1073            break;
1074        }
1075        let (token, tail) = rest.split_at(token_len);
1076        rest = tail;
1077
1078        // Case 1: exact superseded token (e.g., standalone `COMINT` → `SI`).
1079        if let Some(replacement) = SUPERSEDED_TOKEN_MAP
1080            .iter()
1081            .find(|&&(from, _)| from == token)
1082            .map(|&(_, to)| to)
1083        {
1084            out.push_str(replacement);
1085            features.push(FeatureEntry {
1086                id: FeatureId::SupersededToken,
1087                delta: -0.2,
1088            });
1089            continue;
1090        }
1091
1092        // Case 1b: embedded superseded token — the deprecated keyword
1093        // appears as a substring within a longer token. Handles compound
1094        // prefixes (`COMINT-G` → `SI-G`), embedded substitutions
1095        // (`UNCLASCOMINTFIED` → `UNCLASSIFIED`, `FRD-COMINTGMA 14` →
1096        // `FRD-SIGMA 14`, `SENCOMINTTIVE` → `SENSITIVE`). The token !=
1097        // from guard ensures the exact-match case above is the only path
1098        // for bare superseded tokens. CAPCO-2016 §H.4 p74.
1099        let embedded_replacement = SUPERSEDED_TOKEN_MAP
1100            .iter()
1101            .find(|&&(from, _)| token != from && token.contains(from))
1102            .map(|&(from, to)| token.replace(from, to));
1103        if let Some(replaced) = embedded_replacement {
1104            out.push_str(&replaced);
1105            features.push(FeatureEntry {
1106                id: FeatureId::SupersededToken,
1107                delta: -0.2,
1108            });
1109            continue;
1110        }
1111
1112        // Case 2: already canonical (known CVE token or trigraph).
1113        // Check this first so we don't run a vocab scan + edit-
1114        // distance pass on tokens we already recognize.
1115        if CapcoTokenSet.canonicalize(token).is_some() || CapcoTokenSet.is_trigraph(token) {
1116            out.push_str(token);
1117            continue;
1118        }
1119
1120        // Case 3: fuzzy-correctable. Compute once and reuse; the
1121        // previous structure called `matcher.correct(token)` twice
1122        // on tokens that weren't already canonical, doubling the
1123        // vocab-scan cost on exactly the unknown-token hot path.
1124        if let Some(correction) = matcher.correct(token) {
1125            out.push_str(correction.token);
1126            // `FeatureId` is part of the audit-schema contract (see
1127            // `crates/rules/src/confidence.rs` and the
1128            // `MARQUE_AUDIT_SCHEMA` pin); a wildcard `_` arm on it
1129            // would silently absorb future-variant additions. Pair
1130            // each (id, delta) directly off `correction.distance` so
1131            // both arms are total over the only two outcomes the
1132            // outer guard permits (`distance > 0`, `distance <=
1133            // MAX_EDIT_DISTANCE = 2`).
1134            let feature = match correction.distance {
1135                // `correct` returns `None` for exact matches, so
1136                // `distance == 0` cannot reach here; `MAX_EDIT_DISTANCE
1137                // == 2` upstream caps `distance <= 2`.
1138                0 => None,
1139                1 => Some(FeatureEntry {
1140                    id: FeatureId::EditDistance1,
1141                    delta: -0.5,
1142                }),
1143                _ => Some(FeatureEntry {
1144                    id: FeatureId::EditDistance2,
1145                    delta: -1.2,
1146                }),
1147            };
1148            if let Some(entry) = feature {
1149                features.push(entry);
1150            }
1151            continue;
1152        }
1153
1154        // Case 4: unknown and uncorrectable. Pass through verbatim.
1155        // The strict parser will register this as a
1156        // `TokenKind::Unknown` span rather than failing the parse
1157        // outright, so the decoder's outer loop (step 3a of
1158        // `DecoderRecognizer::recognize`) is what filters the
1159        // resulting partial-canonicalization candidate out.
1160        out.push_str(token);
1161    }
1162
1163    (out, features)
1164}
1165
/// Whether `c` can be part of a token: ASCII letters and digits only.
/// The hyphen is not a token character by itself — [`scan_token`]
/// admits it solely as an internal separator.
fn is_token_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'A'..='Z' | 'a'..='z')
}
1171
/// Measure the token that begins at the start of `text`, in bytes.
///
/// A token is a run of ASCII alphanumerics. A `-` is absorbed into the
/// run only when it is not the first byte and the byte right after it
/// is alphanumeric, which keeps compounds like `SI-G` and `SAR-BP`
/// together. Returns `0` when `text` does not start with an
/// alphanumeric.
fn scan_token(text: &str) -> usize {
    let raw = text.as_bytes();
    raw.iter()
        .enumerate()
        .position(|(idx, &byte)| {
            if byte.is_ascii_alphanumeric() {
                return false;
            }
            // A hyphen continues the token only when flanked: something
            // was already consumed before it and an alphanumeric follows.
            let joins_compound = byte == b'-'
                && idx > 0
                && matches!(raw.get(idx + 1), Some(next) if next.is_ascii_alphanumeric());
            !joins_compound
        })
        .unwrap_or(raw.len())
}
1192
/// Map of CAPCO-2016-superseded tokens → their authoritative live
/// replacements. Each entry MUST cite a specific passage in
/// `crates/capco/docs/CAPCO-2016.md` (Constitution VIII). Adding an
/// entry without a verified citation is a correctness defect.
///
/// Each tuple is `(superseded, replacement)`: [`fuzzy_correct_tokens`]
/// compares the first element against the observed token (as a whole
/// token, then as a substring) and writes the second element in its
/// place.
///
/// - `COMINT` → `SI`: CAPCO-2016 §H.4 p74 ("The COMINT title for the
///   Special Intelligence (SI) control system is no longer valid.")
///   inside §H.4 SCI Control System Markings.
const SUPERSEDED_TOKEN_MAP: &[(&str, &str)] = &[("COMINT", "SI")];
1202
1203// ---------------------------------------------------------------------------
1204// Position-aware short-token classification heuristic (issue #133 PR 2)
1205// ---------------------------------------------------------------------------
1206
1207/// Try to fix a malformed leading classification token using a
1208/// keyboard-proximity heuristic.
1209///
1210/// `MIN_FUZZY_LEN = 3` blocks the vocab-based fuzzy matcher from
1211/// running on 1- and 2-character tokens — `R`, `W`, `YS`, `XS` etc.
1212/// are too short for edit-distance to be reliable against the closed
1213/// vocabulary alone. But when such a token sits at the **leading
1214/// classification position** of a portion or banner marking, the
1215/// position itself is strong evidence: the user intended a
1216/// classification level, and the malformed token is almost certainly
1217/// keyboard-adjacent to a real one.
1218///
1219/// This helper applies a small keyboard-proximity table to the first
1220/// whitespace-separated token of the first `//`-separated segment.
1221/// It returns the corrected text (with the leading token replaced)
1222/// when a rule fires. Returns `None` when the leading token is
1223/// already canonical, longer than 2 chars, or doesn't match any
1224/// rule.
1225///
1226/// # Confidence
1227///
1228/// The decoder tags this attempt's [`CanonicalAttempt::fix_source`]
1229/// with [`FixSource::DecoderClassificationHeuristic`]. The engine
1230/// then (a) downgrades the diagnostic severity to
1231/// [`Severity::Warn`](marque_rules::Severity::Warn) — always-visible
1232/// in `--check`, exits non-zero — and (b) caps
1233/// [`Confidence::rule`](marque_rules::Confidence) at `0.80` so
1234/// `combined ≤ 0.80` stays below the default `confidence_threshold`
1235/// of `0.95`. The heuristic only auto-applies in `--fix` mode when
1236/// the user has explicitly lowered the threshold, opting into the
1237/// heuristic's bar of evidence.
1238///
1239/// # Rules (CAPCO-2016 §A.2 classification levels: U, R, C, S, TS)
1240///
1241/// Length is checked first — a 2-char token never reaches the 1-char
1242/// table. The keyboard-proximity sets are derived from the standard
1243/// QWERTY layout: keys physically adjacent to S (`A`, `W`, `E`, `Z`)
1244/// likely correspond to S typos; keys adjacent to T (`R`, `Y`, `H`,
1245/// `G`, `F`) likely correspond to T typos when followed by an
1246/// S-cluster character (so the pair maps to `TS`). The table is
1247/// intentionally narrow — wider sets produce more false positives
1248/// in normal prose.
1249///
1250/// **Length 3** (issue #133 PR 8) — exactly one mapping:
1251/// - `OTP` → `TOP` (T↔O transposition; standard Levenshtein dist 2,
1252///   blocked by `MIN_USEFUL_CONFIDENCE` for 3-char inputs at dist 2,
1253///   so the vocab path can't catch it even with `TOP` in vocab).
1254///
1255/// The 3-char rule is intentionally a single hardcoded mapping —
1256/// the dense 3-char trigraph vocab (`TON`, `TUR`, `TWN`, …, 289
1257/// entries) means a wider "all transpositions of TOP" rule
1258/// would generate too many false positives. Other corpus-attested
1259/// 3-char `TOP` typos (`TPP`, `UOP`) are at standard Levenshtein
1260/// dist 1 from the bare `TOP` in `EXTENDED_CORRECTION_VOCAB` and
1261/// recover via the vocab path; only transposition (which standard
1262/// Levenshtein scores as dist 2) needs the heuristic. See
1263/// [`try_3char_classification_heuristic`] for the implementation
1264/// and the `try_3char_classification_heuristic_only_matches_otp`
1265/// regression-pin for the narrow-scope policy.
1266///
1267/// **Length 2** (checked second):
1268/// - `[T, R, Y, H, G][A, W, E, Z, S]` → `TS` (e.g., `RS`, `YS`, `HE`)
1269/// - `[F][A, W, E, Z, S]` → `TS` (e.g., `FS`, `FE`)
1270/// - `TP` → `TOP` (issue #133 PR 8; corpus-attested keyboard typo
1271///   where the middle `O` was elided; bare `TP` has no other
1272///   canonical CAPCO meaning).
1273/// - `TO` → `TOP` (issue #133 PR 8; same family — trailing `P`
1274///   elided).
1275///
1276/// **Length 1**:
1277/// - `[A, W, E, Z]` → `S` (S-key neighbors; bare `S` is canonical)
1278/// - `[V, F]` → `C` (C-key neighbors; bare `C` is canonical)
1279/// - `[X]` → `S` (X is between C and S on QWERTY; default to the
1280///   higher classification per the issue #133 PR 2 design note)
1281///
1282/// **Length 4+**: returns `None`. Long-token typos benefit from the
1283/// vocab-based fuzzy matcher (4-char `TDOP`/`QTOP`/`TOPW` recover
1284/// to `TOP` at edit distance 1 via the standard fuzzy path now
1285/// that `TOP` lives in `EXTENDED_CORRECTION_VOCAB`); the
1286/// keyboard-proximity heuristic adds nothing here.
1287///
1288/// **Bare canonical**: returns `None` when the leading token is
1289/// already a known classification short form (`U`, `R`, `C`, `S`,
1290/// `TS`) OR the bare leading word `TOP` of the two-word
1291/// `TOP SECRET` classification. PR 8 added `TOP` to the canonical
1292/// short-circuit set because the new length-3 `OTP→TOP` heuristic
1293/// would otherwise have to walk the heuristic path on every
1294/// already-canonical `TOP SECRET//...` input. The strict parser
1295/// already accepts all of these. See
1296/// [`is_canonical_short_classification`] for the implementation.
1297///
1298/// # CAB markings
1299///
1300/// Returns `None` when `text` looks like a CAB (Classification
1301/// Authority Block) — those are keyed authority lines, not
1302/// classification-leading shapes, and the heuristic would emit
1303/// nonsense if applied. The check mirrors [`is_cab_head`].
1304fn try_classification_heuristic_fix(text: &str) -> Option<String> {
1305    // Skip CAB shapes — they don't have a leading classification token.
1306    if is_cab_head(text.as_bytes()) {
1307        return None;
1308    }
1309
1310    // Strip portion-form parens (preserve them at output).
1311    let (open_paren, body, close_paren) = if text.starts_with('(') && text.ends_with(')') {
1312        ("(", &text[1..text.len() - 1], ")")
1313    } else {
1314        ("", text, "")
1315    };
1316
1317    // First `//`-separated segment carries the leading classification.
1318    let first_seg_end = body.find("//").unwrap_or(body.len());
1319    let first_seg = &body[..first_seg_end];
1320    let after_first_seg = &body[first_seg_end..];
1321
1322    // First whitespace-delimited token of that segment.
1323    let first_seg_trimmed_start = first_seg
1324        .char_indices()
1325        .find(|(_, c)| !c.is_whitespace())
1326        .map(|(i, _)| i)
1327        .unwrap_or(0);
1328    let leading_ws = &first_seg[..first_seg_trimmed_start];
1329    let after_leading_ws = &first_seg[first_seg_trimmed_start..];
1330    let token_end = after_leading_ws
1331        .find(char::is_whitespace)
1332        .unwrap_or(after_leading_ws.len());
1333    let first_token = &after_leading_ws[..token_end];
1334    let after_first_token = &after_leading_ws[token_end..];
1335
1336    // Bare canonical → no fix needed.
1337    if is_canonical_short_classification(first_token) {
1338        return None;
1339    }
1340
1341    // **Lone-input safety guard (issue #133 PR 4 / #176).** Skip the
1342    // heuristic when the input has no marking-shape signal beyond the
1343    // leading token — i.e., nothing after the first token within the
1344    // first segment AND no `//`-separated tail. The corpus measurement
1345    // committed at `tools/corpus-analysis/output/heuristic_frequencies.json`
1346    // validated heuristic confidence well above the acceptance
1347    // threshold only for the *in-context* case (trigger appears within
1348    // ~30 chars of `//` or a recognized vocab token). For lone inputs
1349    // the empirical FP rate against Enron body text is many orders of
1350    // magnitude higher — high-frequency triggers like `A` and `E` have
1351    // tens of thousands of unrestricted occurrences vs at most a few
1352    // hundred in marking-context, and a fix-and-warn that auto-applies
1353    // at default threshold would produce false positives on
1354    // parenthetical refs like `(A)` / `(W)` / `(F)` common in business
1355    // prose. Spot-check the evidence file directly for per-trigger
1356    // detail.
1357    //
1358    // Form-field input (`(YS)` typed into a portion-mark field)
1359    // SHOULD heuristic-fix at high confidence — the caller knows the
1360    // input is a marking attempt — but we don't yet have an input-
1361    // source signal to distinguish form-field from document-content.
1362    // Tracked in #176 (input-source signal on ParseContext); when
1363    // that lands, this safety guard becomes conditional on
1364    // `ParseContext::input_source == DocumentContent`.
1365    // Trailing whitespace doesn't count as "other content" — `(YS )`
1366    // is functionally equivalent to `(YS)` for the lone-case test.
1367    let has_other_marking_content = after_first_token.chars().any(|c| !c.is_whitespace())
1368        || after_first_seg.chars().any(|c| !c.is_whitespace());
1369    if !has_other_marking_content {
1370        return None;
1371    }
1372
1373    let replacement = match first_token.len() {
1374        3 => try_3char_classification_heuristic(first_token)?,
1375        2 => try_2char_classification_heuristic(first_token)?,
1376        1 => try_1char_classification_heuristic(first_token)?,
1377        _ => return None,
1378    };
1379
1380    Some(format!(
1381        "{open_paren}{leading_ws}{replacement}{after_first_token}{after_first_seg}{close_paren}"
1382    ))
1383}
1384
/// Reports whether `token` is already a canonical leading
/// classification token: one of the CAPCO-2016 short forms (`U`, `R`,
/// `C`, `S`, `TS`) or `TOP`, the bare first word of the two-word
/// `TOP SECRET` classification. The comparison is exact
/// (case-sensitive).
///
/// Full-word forms (`UNCLASSIFIED`, `RESTRICTED`, …) are deliberately
/// left out: a mangled full word is the vocab-based fuzzy matcher's
/// job (`SECRET` is in `correction_vocab`).
///
/// `TOP` was added by issue #133 PR 8. Before that, the whitespace
/// tokenizer in the caller saw `TOP` as non-canonical and ran the
/// heuristic on perfectly valid `TOP SECRET//...` input. That was
/// harmless while the length-3 arm always returned `None`, but became
/// a latent footgun once PR 8 made that arm return `Some`. Treating
/// bare `TOP` as canonical short-circuits the already-correct case.
fn is_canonical_short_classification(token: &str) -> bool {
    const CANONICAL_LEADING_TOKENS: &[&str] = &["U", "R", "C", "S", "TS", "TOP"];
    CANONICAL_LEADING_TOKENS.contains(&token)
}
1405
/// Keyboard-proximity rule for 2-byte leading tokens. Both bytes are
/// compared ASCII-case-insensitively. Two mappings, tried in order:
///
/// 1. **T-cluster + S-cluster → `TS`** (the original PR 2 rule): the
///    first byte is one of `T R Y H G F` and the second is one of
///    `A W E Z S`.
/// 2. **`TP` / `TO` → `TOP`** (issue #133 PR 8): corpus-attested
///    classification typos where the middle `O` (`TP`) or trailing `P`
///    (`TO`) was elided. Neither pair has another canonical CAPCO
///    meaning at the leading classification position — `TP` is not an
///    SCI control or dissem, and neither is `TO` (the `REL TO` keyword
///    is handled inside the structural REL TO parser, not here).
///
/// Rule 2 can only fire when rule 1 does not. `TS` itself satisfies
/// rule 1, but the caller filters it out beforehand via
/// `is_canonical_short_classification`, so the heuristic never runs
/// on it in practice.
///
/// Returns `None` for every other pair. `token` is expected to be
/// exactly two bytes long (checked with a debug assertion).
fn try_2char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 2);

    match (raw[0].to_ascii_uppercase(), raw[1].to_ascii_uppercase()) {
        // T plus its QWERTY neighbours (R, Y, H, G, F), followed by S
        // plus its QWERTY neighbours (A, W, E, Z). Wide enough to catch
        // the common slips, narrow enough to leave unrelated 2-char
        // prose alone.
        (b'T' | b'R' | b'Y' | b'H' | b'G' | b'F', b'A' | b'W' | b'E' | b'Z' | b'S') => {
            Some("TS")
        }
        // PR 8: literal `TP` / `TO` only. Broadening to something like
        // `T[A-Z]` would collide with too many real 2-char tokens in
        // non-marking prose.
        (b'T', b'P' | b'O') => Some("TOP"),
        _ => None,
    }
}
1451
/// Keyboard-proximity rule for 3-byte leading tokens (issue #133
/// PR 8). Maps a deliberately tiny set of corpus-attested typos in the
/// leading classification slot to their canonical form.
///
/// Distance-1 inputs such as `TPP→TOP` and `UOP→TOP` are already
/// recovered by the vocab-based fuzzy matcher now that `TOP` is in
/// `EXTENDED_CORRECTION_VOCAB`. This helper covers what that path
/// cannot reach:
///
/// - **`OTP` → `TOP`** — a T↔O transposition. Standard Levenshtein
///   scores a transposition as two substitutions (distance 2), and
///   the fuzzy matcher's `MIN_USEFUL_CONFIDENCE` floor (0.45) rejects
///   distance-2 corrections on 3-char inputs (confidence 0.40).
///   Moving the matcher to Damerau-Levenshtein would recover the case
///   but widen the false-positive surface across the whole vocab; a
///   targeted rule at the classification slot has the smaller blast
///   radius.
///
/// Every other 3-byte input yields `None`. The narrowness is
/// intentional: the 3-char trigraph vocab is dense (`TON`, `TUR`,
/// `TWN`, …). `token` is expected to be exactly three bytes long
/// (checked with a debug assertion).
fn try_3char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 3);
    // Per the original note, the upstream
    // `normalize_delimiters_and_case` pass already uppercases ASCII
    // before this helper runs; the case-insensitive compare is kept
    // only to stay consistent with the length-1 / length-2 helpers.
    let is_transposed_top = raw[..3].eq_ignore_ascii_case(b"OTP");
    is_transposed_top.then_some("TOP")
}
1491
/// Keyboard-proximity rule for 1-byte leading tokens. Maps the byte
/// (compared ASCII-case-insensitively) to `S` or `C` per the §A.2
/// short-form classification ladder; see the module-level table for
/// the per-character mapping rationale.
///
/// - `A`, `W`, `E`, `Z`, `X` → `S`
/// - `V`, `F` → `C`
/// - anything else → `None`
///
/// `token` is expected to be exactly one byte long (checked with a
/// debug assertion).
fn try_1char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 1);
    let key = raw[0].to_ascii_uppercase();

    // `X` sits between C and S on QWERTY. It resolves to the higher
    // classification (S) per the issue #133 PR 2 design note: for IC
    // compliance work the false-negative cost (under-classified)
    // outweighs the false-positive cost (over-classified).
    if b"AWEZX".contains(&key) {
        Some("S")
    } else if b"VF".contains(&key) {
        Some("C")
    } else {
        None
    }
}
1509
1510// ---------------------------------------------------------------------------
1511// Missing-delimiter insertion (issue #133 PR 3)
1512// ---------------------------------------------------------------------------
1513
1514/// Try to insert missing `//` segment separators at category-transition
1515/// boundaries.
1516///
1517/// CAPCO grammar requires `//` between segments —
1518/// `CLASSIFICATION//SCI_BLOCK//SAR_BLOCK//DISSEM_BLOCK`. Real-world
1519/// transcription frequently substitutes whitespace for one or more
1520/// `//` separators, producing inputs the strict parser cannot
1521/// recover (`SECRET//NOFORN EXDIS` strict-parses as
1522/// `classification: Secret, dissem: [Nf]` with `EXDIS` left as
1523/// `TokenKind::Unknown`; the decoder's step-3a Unknown-span filter
1524/// then discards the candidate).
1525///
1526/// This helper walks the input left-to-right and inserts `//` at
1527/// whitespace gaps that separate two distinct CAPCO segments. Two
1528/// rules drive insertion:
1529///
1530/// 1. **Classification → next segment.** Tokens at the start of the
1531///    input are classification-context (`U`, `R`, `C`, `S`, `TS`,
1532///    `UNCLASSIFIED`, …, plus the `TOP SECRET` two-word
1533///    classification). The first non-classification token after the
1534///    classification phrase, when no `//` has been emitted yet,
1535///    triggers `//` insertion before it. Covers the
1536///    `TOP SECRET HCS-P INTEL OPS//ORCON/NOFORN` / `SECRET REL TO
1537///    USA, AUS, GBR` family.
1538///
1539/// 2. **Hard-splitter dissem long-form.** A small set of unambiguous
1540///    long-form dissem control tokens (`NOFORN`, `ORCON`,
1541///    `ORCON-USGOV`, `PROPIN`, `IMCON`, `RELIDO`, `RSEN`,
1542///    `EYESONLY`, `EXDIS`, `NODIS`, `LIMDIS`, `FOUO`, `FISA`,
1543///    `DSEN`) ALWAYS start a new segment when they appear after a
1544///    whitespace gap, regardless of preceding context — these
1545///    tokens have no in-segment role inside SCI/SAR/REL TO
1546///    blocks. Covers the `NOFORN EXDIS` / `... SI NOFORN` /
1547///    `... HCS-P INTEL OPS ORCON/NOFORN` family. The full set is
1548///    pinned by [`is_hard_splitter_covers_documented_long_forms`].
1549///
1550/// Exceptions (do NOT insert):
1551///
1552/// - `SBU NOFORN` / `LES NOFORN` — non-IC dissem **banner long
1553///   forms** for `NonIcDissem::SbuNf` / `NonIcDissem::LesNf`. When
1554///   the previous token is `SBU` or `LES`, treat `NOFORN` as part
1555///   of the multi-word atom.
1556///
1557/// Returns `None` when no insertion was made — the caller should
1558/// not emit a duplicate of the input.
1559///
1560/// # Bounded
1561///
1562/// Hard-capped at [`MAX_DELIMITER_INSERTIONS`] insertions per call.
1563/// More than four insertions in a single marking is suspicious and
1564/// likely indicates the input isn't a CAPCO marking at all (or the
1565/// helper is wrong); rather than emit a wildly-rewritten candidate,
1566/// we cap and let the result strict-parse on the partial rewrite.
1567///
1568/// # SCI / SAR / SPECIAL-ACCESS-REQUIRED coverage
1569///
1570/// The PR-3-era doc note here used to defer SCI-starter (`TOP SECRET
1571/// SI ...`), SAR-prefix (`TOP SECRET SAR-BP ...`), and
1572/// `SPECIAL ACCESS REQUIRED-...` insertion to a follow-up. That defer
1573/// was based on a misread: rule 1 (classification → next segment)
1574/// already fires on every one of those shapes because
1575/// [`is_classification_token`] includes `TOP` and
1576/// [`is_classification_continuation`] handles the `TOP → SECRET`
1577/// special case, so the helper produces the canonical bytes for all
1578/// 17 MissingDelimiter fixtures in the SC-004 corpus. The remaining
1579/// 2/17 failures pre-PR-5 were a SCORING contest, not a missing
1580/// rewrite — handled by [`HARD_SPLITTER_ABSORPTION_PENALTY`] in
1581/// [`score_candidate`], not here.
1582fn try_insert_delimiter(text: &str) -> Option<String> {
1583    let bytes = text.as_bytes();
1584    let mut result = String::with_capacity(text.len() + 8);
1585    let mut insertions = 0;
1586
1587    let mut prev_token: Option<&str> = None;
1588    let mut in_classification = true;
1589    let mut seen_double_slash = false;
1590
1591    let mut i = 0;
1592    while i < bytes.len() {
1593        // Existing `//` delimiter — copy and reset state.
1594        if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
1595            result.push_str("//");
1596            seen_double_slash = true;
1597            in_classification = false;
1598            prev_token = None;
1599            i += 2;
1600            continue;
1601        }
1602
1603        // Whitespace run — collect, then look at next token.
1604        if bytes[i].is_ascii_whitespace() {
1605            let ws_start = i;
1606            while i < bytes.len() && bytes[i].is_ascii_whitespace() {
1607                i += 1;
1608            }
1609            let ws = &text[ws_start..i];
1610
1611            // Find the next token (alnum + internal `-`) starting at `i`.
1612            let token_start = i;
1613            while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
1614                i += 1;
1615            }
1616            if token_start == i {
1617                // Whitespace then non-token character (e.g., `,` or `/` or end).
1618                // Just copy the whitespace and continue.
1619                result.push_str(ws);
1620                continue;
1621            }
1622            let next_token = &text[token_start..i];
1623
1624            let should_insert = decide_insert_delimiter(
1625                prev_token,
1626                next_token,
1627                in_classification,
1628                seen_double_slash,
1629            );
1630
1631            if should_insert && insertions < MAX_DELIMITER_INSERTIONS {
1632                result.push_str("//");
1633                insertions += 1;
1634                seen_double_slash = true;
1635                in_classification = false;
1636            } else {
1637                result.push_str(ws);
1638            }
1639            result.push_str(next_token);
1640
1641            // Update state.
1642            if !is_classification_continuation(next_token, prev_token) {
1643                in_classification = false;
1644            }
1645            prev_token = Some(next_token);
1646            continue;
1647        }
1648
1649        // Non-whitespace, non-`//` character — likely a `/` (single
1650        // slash, used as intra-segment separator e.g.
1651        // `ORCON/NOFORN`), comma, paren, or part of a token. Copy
1652        // verbatim and continue. Tokens that contain only alnum + `-`
1653        // are handled in the whitespace branch via the lookahead;
1654        // the leading-token-at-position-0 case enters here.
1655        let other_start = i;
1656        // Take a token (alnum + internal `-`) if at one.
1657        if bytes[i].is_ascii_alphanumeric() {
1658            while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
1659                i += 1;
1660            }
1661            let leading_token = &text[other_start..i];
1662            result.push_str(leading_token);
1663            // Update prev_token / classification state for the
1664            // leading token (no insertion possible at position 0).
1665            if !is_classification_continuation(leading_token, prev_token) {
1666                in_classification = false;
1667            }
1668            prev_token = Some(leading_token);
1669            continue;
1670        }
1671
1672        // Single non-token character (`/`, `(`, `)`, `,`, or any
1673        // non-ASCII character — e.g., a stray `∕` that the upstream
1674        // delimiter normalizer didn't catch). Preserve the original
1675        // UTF-8 character verbatim instead of doing `bytes[i] as
1676        // char`, which would corrupt multi-byte sequences by emitting
1677        // each byte as a separate Latin-1 codepoint.
1678        let ch = text[i..]
1679            .chars()
1680            .next()
1681            .expect("byte index must remain on a char boundary");
1682        result.push(ch);
1683        i += ch.len_utf8();
1684    }
1685
1686    if insertions == 0 { None } else { Some(result) }
1687}
1688
/// Hard cap on the number of `//` insertions a single
/// [`try_insert_delimiter`] call may make. More than 4 in a single
/// marking is very suspicious — real markings rarely have that many
/// segments at all. Once the cap is reached, any further whitespace
/// gap is copied through unchanged, which limits how far the helper
/// can rewrite non-marking prose that happens to contain splitter
/// words.
const MAX_DELIMITER_INSERTIONS: usize = 4;
1695
1696/// Decide whether to insert `//` at a whitespace gap before
1697/// `next_token`. See [`try_insert_delimiter`] doc for the rules.
1698fn decide_insert_delimiter(
1699    prev_token: Option<&str>,
1700    next_token: &str,
1701    in_classification: bool,
1702    seen_double_slash: bool,
1703) -> bool {
1704    // Multi-word atom exceptions: don't split between SBU/LES and
1705    // their NOFORN companion (banner long forms for NonIcDissem
1706    // SbuNf/LesNf).
1707    if next_token == "NOFORN" && matches!(prev_token, Some("SBU") | Some("LES")) {
1708        return false;
1709    }
1710
1711    // Rule 1: classification → next segment. The first non-
1712    // classification token after the classification phrase, when no
1713    // `//` has been emitted yet.
1714    if in_classification && !seen_double_slash && !is_classification_token(next_token) {
1715        return true;
1716    }
1717
1718    // Rule 2: hard-splitter dissem long-form. These tokens always
1719    // start a new segment when they appear after whitespace.
1720    is_hard_splitter(next_token)
1721}
1722
/// Reports whether `token` is a classification short or long form
/// that can appear in classification context. `TOP` — the first word
/// of the two-word `TOP SECRET` classification — is included. The
/// comparison is exact (case-sensitive).
fn is_classification_token(token: &str) -> bool {
    const CLASSIFICATION_FORMS: &[&str] = &[
        // Short forms.
        "U",
        "R",
        "C",
        "S",
        "TS",
        // Long forms, plus the leading word of `TOP SECRET`.
        "TOP",
        "UNCLASSIFIED",
        "RESTRICTED",
        "CONFIDENTIAL",
        "SECRET",
    ];
    CLASSIFICATION_FORMS.contains(&token)
}
1739
1740/// True when `next_token` continues the classification phrase from
1741/// `prev_token`. Specifically: `TOP SECRET` is the only multi-word
1742/// classification CAPCO recognizes; `SECRET` after `TOP` continues
1743/// the classification.
1744fn is_classification_continuation(next_token: &str, prev_token: Option<&str>) -> bool {
1745    if next_token == "SECRET" && prev_token == Some("TOP") {
1746        return true;
1747    }
1748    is_classification_token(next_token)
1749}
1750
/// Reports whether `token` is an unambiguous segment-starting dissem
/// long-form. These tokens have no in-segment role inside SCI / SAR /
/// REL TO blocks, so one appearing after whitespace always signals a
/// missing `//` separator. Pinned by
/// `try_insert_delimiter_inserts_before_long_form_dissem`. The
/// comparison is exact (case-sensitive).
///
/// Deliberately NOT in this set:
///
/// - Dissem short forms (`NF`, `OC`, `PR`, `IMC`, `RS`) — 2–3 char
///   tokens that could collide with SAR compartment /
///   sub-compartment naming.
/// - SCI starters (`SI`, `HCS`, `TK`, `KDK`) — 2–3 char tokens that
///   appear in compartment context.
/// - SAR prefixes (`SAR-*`) — handled in v2 with classification-
///   context lookahead.
fn is_hard_splitter(token: &str) -> bool {
    const HARD_SPLITTERS: &[&str] = &[
        "NOFORN",
        "ORCON",
        "ORCON-USGOV",
        "PROPIN",
        "IMCON",
        "RELIDO",
        "RSEN",
        "EYESONLY",
        "FOUO",
        "FISA",
        "DSEN",
        "EXDIS",
        "NODIS",
        "LIMDIS",
    ];
    HARD_SPLITTERS.contains(&token)
}
1784
1785// ---------------------------------------------------------------------------
1786// SAR indicator-keyword structural repair (issue #133 PR 6)
1787// ---------------------------------------------------------------------------
1788
1789/// Repair stray-prefix and missing-hyphen mangling around the SAR
1790/// `SAR-` indicator (CAPCO-2016 §H.5 p100). Two structural patterns:
1791///
1792/// 1. **Prefix strip** — `<boundary>[A-Z]{1,3}SAR-` → `<boundary>SAR-`.
1793///    Strips ANY attached 1–3 letter ASCII-uppercase prefix before
1794///    the SAR indicator, including prefixes whose bytes happen to
1795///    spell a known CAPCO token (`U`, `S`, `SI`, `USA`, …). Canonical
1796///    CAPCO never glues a classification token, SCI control, or
1797///    trigraph directly to `SAR-` without a `//` separator, so a
1798///    prefix at a `//`/`(`/start boundary is OCR/transcription drift
1799///    regardless of whether the prefix bytes form a CVE token in
1800///    isolation. Recovers `SECRET//USAR-BP-J12...` →
1801///    `SECRET//SAR-BP-J12...` and `(USASAR-BP)` → `(SAR-BP)`. The
1802///    "smallest prefix that aligns with `SAR-`" wins (see
1803///    [`match_sar_prefix`]) so an ambiguous input like `USASAR-`
1804///    strips the longest aligning prefix (`USA`, length 3) — there
1805///    is no shorter alignment because `USASAR-` only contains `SAR-`
1806///    starting at offset 3. An earlier defensive guard that refused
1807///    to strip CAPCO-token prefixes was removed because it broke
1808///    the central `USAR-` case (`U` IS the UNCLASSIFIED portion
1809///    form); the test
1810///    `sar_indicator_repair_strips_even_capco_token_prefix` pins
1811///    the policy.
1812///
1813/// 2. **Missing-hyphen insertion** — `<boundary>SAR[A-Z0-9]{2,3}<delim>`
1814///    → `<boundary>SAR-[A-Z0-9]{2,3}<delim>`, where `<delim>` is `-`,
1815///    `/`, ASCII whitespace, or end-of-string. Recovers
1816///    `TOP SECRET//SARBP//NOFORN` → `TOP SECRET//SAR-BP//NOFORN` and
1817///    `SARBP-J12` → `SAR-BP-J12`.
1818///
1819/// Returns `None` when no change was made; the caller's `emit` dedup
1820/// would otherwise drop the duplicate candidate but the explicit
1821/// `None` saves the alloc.
1822///
1823/// # Why these patterns are structurally safe
1824///
1825/// Both patterns operate on the SAR **indicator keyword** (the literal
1826/// `SAR-` per §H.5 p100), not on the open-vocabulary program
1827/// identifier that follows. A prefix strip removes characters that
1828/// have no role in the CAPCO grammar — there is no marking syntax
1829/// where 1–3 alphabetic characters precede `SAR-` at a `//`/`(`/
1830/// start-of-string boundary. A missing-hyphen insertion adds the
1831/// syntactic separator the §H.5 grammar requires between the indicator
1832/// and the program identifier; it does not invent or modify the
1833/// identifier itself. Neither fix claims anything about SAR program-
1834/// identifier validity (which is agency-assigned and outside the
1835/// marque vocab — see `SAR_STRUCTURAL_KEYWORDS` in
1836/// `crates/ism/src/token_set.rs`). The corpus enhancement to fuzzy-
1837/// match against per-org SAR identifier lists is intentionally
1838/// deferred (issue follow-up): config-loaded vocab is a separate
1839/// trust boundary that needs its own design pass.
1840///
1841/// `SPECIAL ACCESS REQUIRED-` (the `Full` indicator form) is NOT
1842/// handled by this helper. The dominant `Full`-form failure mode in
1843/// the mangled corpus is a typo inside the indicator keywords
1844/// themselves (`SPCIAL`, `CCESS`, `SPECAL`), which is recovered by
1845/// the existing fuzzy matcher now that `SPECIAL` and `ACCESS` live in
1846/// `SAR_STRUCTURAL_KEYWORDS`. A `Full`-form analogue can land if a
1847/// future fixture surfaces with a stray prefix on
1848/// `SPECIAL ACCESS REQUIRED-`.
1849fn try_sar_indicator_repair(text: &str) -> Option<String> {
1850    // Cheap pre-check: if `SAR` doesn't appear at all, no repair is
1851    // possible. Saves the byte-walk cost on the overwhelmingly common
1852    // case where the input has no SAR block.
1853    if !text.contains("SAR") {
1854        return None;
1855    }
1856
1857    let bytes = text.as_bytes();
1858    // Lazy allocation: `result` stays `None` until the first repair
1859    // pattern matches, at which point we allocate and copy the
1860    // verbatim prefix `text[..first_match_start]` into it. Inputs that
1861    // contain `SAR` but no repair-eligible pattern (the common case
1862    // for canonical SAR markings like `SECRET//SAR-BP//NOFORN`) walk
1863    // the bytes without ever allocating the output string. The
1864    // bytes-walk-only-no-alloc path matters because every candidate
1865    // bytes attempt the decoder generates calls into this helper, so
1866    // a per-call allocation would multiply allocator pressure across
1867    // the K candidates / N inputs hot path of the recognizer.
1868    let mut result: Option<String> = None;
1869    // `last_copied` is the byte index up to which `result` has been
1870    // populated. When a repair fires, we batch-copy the verbatim span
1871    // `text[last_copied..i]` into `result` before pushing the
1872    // canonical replacement; on the final return we flush
1873    // `text[last_copied..]`. The batch-copy approach also avoids the
1874    // per-character `chars().next()` UTF-8 iteration cost on the
1875    // verbatim-byte stretches.
1876    let mut last_copied: usize = 0;
1877    let mut i = 0;
1878
1879    while i < bytes.len() {
1880        let at_boundary =
1881            i == 0 || matches!(bytes[i - 1], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r');
1882
1883        if at_boundary {
1884            // Pattern A: <prefix>SAR- where prefix is 1-3 ASCII
1885            // uppercase letters. The prefix is always treated as
1886            // noise to be stripped; a "known CAPCO word" defense
1887            // (refuse to strip if `U`, `USA`, `SI`, …) was tried
1888            // and rejected because it broke the central
1889            // `USAR-` case — `U` IS a CVE token (the
1890            // classification portion form for UNCLASSIFIED) but
1891            // canonical CAPCO never glues `U` directly to `SAR-`
1892            // without a `//` separator. Same logic applies to every
1893            // other CVE token in this position: a classification or
1894            // SCI control or trigraph that immediately precedes
1895            // `SAR-` with no separator is not a valid CAPCO marking
1896            // shape (the classification segment ends, `//` begins
1897            // the next segment, then SAR- starts the SAR block).
1898            // So an apparent prefix at a boundary directly before
1899            // `SAR-` is OCR/transcription drift regardless of
1900            // whether the prefix bytes spell a CAPCO token.
1901            if let Some((_prefix_len, post)) = match_sar_prefix(bytes, i) {
1902                let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
1903                r.push_str(&text[last_copied..i]);
1904                r.push_str("SAR-");
1905                last_copied = post;
1906                i = post;
1907                continue;
1908            }
1909
1910            // Pattern B: SAR<2-3 alnum><delim>. The CAPCO §H.5 p100
1911            // SAR program identifier (Abbrev form) is exactly 2-3
1912            // alphanumeric characters; the canonical form requires a
1913            // hyphen between SAR and the identifier. Inserting that
1914            // hyphen does not invent identifier vocabulary.
1915            if let Some(end) = match_sar_missing_hyphen(bytes, i) {
1916                let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
1917                r.push_str(&text[last_copied..i]);
1918                r.push_str("SAR-");
1919                r.push_str(&text[i + 3..end]);
1920                last_copied = end;
1921                i = end;
1922                continue;
1923            }
1924        }
1925
1926        // Default: advance past the current UTF-8 char without copying.
1927        // The verbatim span [last_copied..i] gets batch-copied into
1928        // `result` the next time a repair pattern fires (or flushed
1929        // on return below). Using char iteration rather than
1930        // `bytes[i] as char` keeps `i` aligned to char boundaries so
1931        // the `text[last_copied..i]` slice indexing is always valid
1932        // — multi-byte sequences (rare but possible in OCR'd input)
1933        // therefore round-trip intact.
1934        let ch = text[i..]
1935            .chars()
1936            .next()
1937            .expect("byte index must remain on a char boundary");
1938        i += ch.len_utf8();
1939    }
1940
1941    // Flush any verbatim trailing span into the result. If `result`
1942    // is still `None`, no repair fired, and we never allocated —
1943    // return `None` to signal the no-op path.
1944    result.map(|mut r| {
1945        r.push_str(&text[last_copied..]);
1946        r
1947    })
1948}
1949
/// Looks for `[A-Z]{1,3}SAR-` starting at byte position `i`.
///
/// On a match returns `(prefix_len, post_index)`, where `post_index`
/// is the byte index just past the `-` of `SAR-`; otherwise `None`.
///
/// Prefix lengths 1, 2, 3 are tried in that order and the
/// **smallest** one that lines up with a literal `SAR-` wins. That is
/// the conservative choice: a 1-char prefix (`U` in `USAR-`) is the
/// most likely OCR/transcription drift, and stripping fewer characters
/// is the lower-risk repair when an input is ambiguous between a
/// shorter and a longer prefix. The search stops as soon as a
/// candidate prefix would run past the end of `bytes` or contains a
/// byte that is not an ASCII uppercase letter.
fn match_sar_prefix(bytes: &[u8], i: usize) -> Option<(usize, usize)> {
    (1..=3usize)
        .map(|prefix_len| (prefix_len, i + prefix_len))
        // Once a prefix fails either check, every longer one fails too.
        .take_while(|&(_, sar_start)| {
            sar_start + 4 <= bytes.len()
                && bytes[i..sar_start].iter().all(u8::is_ascii_uppercase)
        })
        .find(|&(_, sar_start)| &bytes[sar_start..sar_start + 4] == b"SAR-")
        .map(|(prefix_len, sar_start)| (prefix_len, sar_start + 4))
}
1976
/// Detects a `SAR` identifier whose hyphen went missing: the literal
/// `SAR` at byte offset `i`, followed directly by a run of two or
/// three ASCII alphanumeric bytes, followed by a delimiter.
///
/// The delimiter is one of `-`, `/`, space, tab, `\n`, `\r`, or the
/// end of `bytes`. On a hit the returned value is the byte index of
/// that delimiter, i.e. one past the alphanumeric run.
///
/// Returns `None` for everything else. In particular the canonical
/// `SAR-` spelling never matches: the `-` ends the run immediately,
/// leaving a run length of zero.
fn match_sar_missing_hyphen(bytes: &[u8], i: usize) -> Option<usize> {
    const HEAD: &[u8] = b"SAR";

    // Everything after the literal `SAR`; bails out when `i` is out
    // of range or the literal is not present at `i`.
    let after_head = bytes.get(i..)?.strip_prefix(HEAD)?;

    let run = after_head
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric())
        .count();
    if !matches!(run, 2 | 3) {
        return None;
    }

    // The byte that stopped the run must be a delimiter; running off
    // the end of the input counts as one too.
    match after_head.get(run) {
        None | Some(b'-' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => Some(i + HEAD.len() + run),
        Some(_) => None,
    }
}
2002
2003// ---------------------------------------------------------------------------
2004// Stray-character `/X/` recovery (issue #133 PR 7)
2005// ---------------------------------------------------------------------------
2006
/// Stray-character recovery for the `<alnum>/<X>/<alnum>` shape, where
/// `X` is exactly one ASCII alphanumeric character.
///
/// The first occurrence of the shape (and only the first — see
/// "Scope" below) yields three candidate rewrites, always in this
/// order:
///
/// 1. **Drop X** — `A/X/B` → `A//B`. The character was noise between
///    two valid tokens: `SECRET//NOFORN/R/EXDIS` →
///    `SECRET//NOFORN//EXDIS`.
///
/// 2. **Right-attach X** — `A/X/B` → `A//XB`. The character is the
///    first letter of the token on the right, split off by a `/`:
///    `TOP SECRET//SI/N/OFORN` → `TOP SECRET//SI//NOFORN`.
///
/// 3. **Left-attach X** — `A/X/B` → `AX//B`. The character is the
///    last letter of the token on the left, split off by a `/`:
///    `SECRE/T/REL TO USA, AUS, GBR` → `SECRET//REL TO USA, AUS, GBR`.
///
/// No attempt is made here to pick a winner. The recognizer's step-3a
/// [`TokenKind::Unknown`](marque_ism::TokenKind::Unknown) filter does
/// that downstream: for a given input only one rewrite yields
/// fully-recognized tokens, while the others leave fragments
/// (`OFORN`, `NOFORNR`, `SECRER`, …) that strict parsing reports as
/// `TokenKind::Unknown` and that are discarded before scoring. The
/// vocabulary therefore chooses the transform implicitly and no
/// per-pattern lookup table is required.
///
/// Returns an empty `Vec` when the shape does not occur.
///
/// # Scope (PR 7)
///
/// Inputs carrying several stray-character patterns (e.g. `S/I/T/K`)
/// are not fully recovered by one call because only the first match
/// is expanded. The corpus has very few such inputs (1–2 in the
/// unresolved Typo set), and a multi-pass loop would complicate the
/// candidate cap in [`generate_candidate_bytes`] without proportional
/// benefit. A later PR can iterate if multi-pattern recovery becomes
/// load-bearing for SC-004 movement.
///
/// # Pattern boundary requirements
///
/// Both neighbours of the `/X/` triple must be ASCII alphanumeric.
/// Without that requirement, structural punctuation such as `(/X/)`
/// (start of portion form) would match even though there is no token
/// on either side for `X` to belong to, making any of the three
/// rewrites meaningless.
fn try_collapse_stray_char_slash(text: &str) -> Vec<String> {
    let bytes = text.as_bytes();
    let is_token_byte = |b: u8| b.is_ascii_alphanumeric();

    // `at` indexes the opening `/`. It needs one byte in front of it
    // and three behind it (`X`, the closing `/`, the right
    // neighbour), so the viable positions are `1..len - 3`; the range
    // is empty for inputs too short to hold the full shape.
    let first_hit = (1..bytes.len().saturating_sub(3)).find(|&at| {
        bytes[at] == b'/'
            && bytes[at + 2] == b'/'
            // Exactly one alphanumeric between the slashes; `/AB/` is a
            // short token, not a stray character.
            && is_token_byte(bytes[at + 1])
            // Token context on both sides.
            && is_token_byte(bytes[at - 1])
            && is_token_byte(bytes[at + 3])
    });
    let at = match first_hit {
        Some(at) => at,
        None => return Vec::new(),
    };

    // Every byte of the matched triple is ASCII, so `at` and `at + 3`
    // are char boundaries and the `str` slices below cannot panic.
    let left = &text[..at];
    let right = &text[at + 3..];
    let stray = char::from(bytes[at + 1]);

    vec![
        // 1. Drop X.
        format!("{}//{}", left, right),
        // 2. Right-attach X.
        format!("{}//{}{}", left, stray, right),
        // 3. Left-attach X.
        format!("{}{}//{}", left, stray, right),
    ]
}
2116
2117// ---------------------------------------------------------------------------
2118// REL TO structural repair (issue #133 PR 9)
2119// ---------------------------------------------------------------------------
2120
2121/// REL TO structural repair.
2122///
2123/// Recovers four classes of REL TO structural typos that produce no
2124/// valid REL TO block in the strict parse path. All four are
2125/// **structural** (literal-shape) repairs, not vocabulary-based fuzzy
2126/// guesses — they fire only when the observed pattern is invalid
2127/// CAPCO AND the corrected pattern is unambiguously the intended form.
2128/// The riskier per-trigraph fuzzy-correction cluster (e.g.,
2129/// `USB → USA`, `AUT → AUS`) is deferred to issue #186 because it
2130/// requires corpus-weighted priors + block-level CAPCO §H.8
2131/// invariants to disambiguate safely.
2132///
2133/// # Patterns
2134///
2135/// 1. **Header transposition** — `REL OT ` → `REL TO `. The CAPCO
2136///    `REL` token has exactly two valid extensions (`REL TO` and
2137///    `RELIDO`); `REL OT` cannot appear in any valid CAPCO marking,
2138///    so the literal-bytes replacement is collision-free.
2139///
2140/// 2. **Header token-boundary** — `RELT O ` → `REL TO `. `RELT` is
2141///    not a CVE token, and `T O` as adjacent single-letter tokens
2142///    has no valid CAPCO meaning. The replacement reconstructs the
2143///    intended `REL TO ` header by migrating the trailing `T` from
2144///    `RELT` to the start of `O`.
2145///
2146/// 3. **Entry token-boundary** — `,A US,` → `,AUS,` (within a
2147///    REL TO block). A 1-letter + space + 2-letter sequence between
2148///    commas only fires when the joined 3-letter string is a known
2149///    trigraph (`is_trigraph` check) AND the 1-letter alone is not a
2150///    trigraph. The trigraph guard is what makes this safe — without
2151///    it, `,A B,` → `,AB,` would fire for any combination, but with
2152///    it the only joins that survive are those that round-trip
2153///    through the strict REL TO parser as valid trigraphs.
2154///
2155/// 4. **Entry comma misplacement** — `AU,S ` → `AUS, ` (within a
2156///    REL TO block). A 2-letter run + comma + 1-letter + space only
2157///    fires when the joined 3-letter string is a known trigraph AND
2158///    the 2-letter run alone is not. Same trigraph guard as
2159///    pattern 3 — the structural transform requires the corrected
2160///    output to be a valid trigraph.
2161///
2162/// # Scope (PR 9)
2163///
2164/// Patterns 1 and 2 affect the literal `REL TO` header and run
2165/// regardless of what follows. Patterns 3 and 4 require a `REL TO `
2166/// header in the input — they scan from each `REL TO ` substring
2167/// forward to the next `//` (or end of text) and only operate on
2168/// comma-separated entries within that block.
2169///
2170/// All four transforms are conservative: their false-positive risk
2171/// is bounded by the literal patterns not appearing in any valid
2172/// CAPCO text (patterns 1, 2) or by the `is_trigraph` guard
2173/// rejecting joins that aren't real country codes (patterns 3, 4).
2174/// The trigraph dictionary itself is the source of authority — no
2175/// new vocabulary is invented.
2176///
2177/// Returns `None` when no pattern matched. Allocation behavior:
2178///
2179/// - Inputs with no `REL` substring short-circuit before any work.
2180/// - Inputs with `REL` but no header-typo pattern run the header
2181///   walk allocation-free; the entry-level pass then short-circuits
2182///   on inputs lacking a literal `REL TO ` anchor.
2183/// - Inputs containing `REL TO ` in canonical form walk the entries
2184///   without allocating until a fix actually fires.
2185///
2186/// Allocation only occurs once a pattern produces a fixed string.
2187fn try_rel_to_structural_repair(text: &str) -> Option<String> {
2188    // Cheap pre-check: if `REL` doesn't appear at all, no repair is
2189    // possible. Saves the byte-walk cost on the overwhelmingly common
2190    // case where the input has no REL block.
2191    if !text.contains("REL") {
2192        return None;
2193    }
2194
2195    let mut working: Option<String> = None;
2196    let mut any_change = false;
2197
2198    // Patterns 1 and 2: header normalization. Apply first so the
2199    // entry-level scan in patterns 3 and 4 sees a canonical `REL TO `
2200    // header to anchor on.
2201    if let Some(normalized) = try_rel_to_header_normalize(text) {
2202        working = Some(normalized);
2203        any_change = true;
2204    }
2205
2206    // Patterns 3 and 4: entry-level fixes. Operate on the
2207    // header-normalized text when patterns 1 or 2 fired, otherwise on
2208    // the raw input.
2209    let entry_input: &str = working.as_deref().unwrap_or(text);
2210    if let Some(entry_fixed) = try_rel_to_entry_normalize(entry_input) {
2211        working = Some(entry_fixed);
2212        any_change = true;
2213    }
2214
2215    if any_change { working } else { None }
2216}
2217
/// Header normalization (patterns 1 and 2).
///
/// Makes one left-to-right pass over `text` and rewrites every
/// token-boundary occurrence of `REL OT ` or `RELT O ` to `REL TO `.
/// Returns `None` when nothing was rewritten.
///
/// The output buffer is created lazily on the first hit, so text that
/// merely contains `REL` (canonical `REL TO USA, AUS, GBR` markings
/// being the usual case) is scanned without any allocation.
///
/// A hit is only accepted at a token boundary: the start of the text,
/// or directly after `/`, `(`, space, tab, `\n` or `\r`. This keeps
/// the rewrite from firing inside a longer alphanumeric run — in
/// `XREL OT` the leading `X` makes the whole thing one token, not a
/// `REL` header.
fn try_rel_to_header_normalize(text: &str) -> Option<String> {
    const CANONICAL: &str = "REL TO ";
    // Both misspellings have the same byte length as the canonical
    // header, so a rewrite never shifts the offsets of later text.
    const TYPOS: [&[u8]; 2] = [b"REL OT ", b"RELT O "];

    let bytes = text.as_bytes();
    let mut out: Option<String> = None;
    // Start of the input span not yet copied into `out`.
    let mut flushed = 0usize;
    let mut pos = 0usize;

    // Stepping one byte at a time is safe even for multi-byte text:
    // a hit has to begin with the ASCII byte `R`, which can never be a
    // UTF-8 continuation byte, so every offset used for slicing below
    // is a char boundary.
    while pos < bytes.len() {
        let starts_token = match pos.checked_sub(1) {
            None => true,
            Some(before) => {
                matches!(bytes[before], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r')
            }
        };
        let is_typo = starts_token && TYPOS.iter().any(|typo| bytes[pos..].starts_with(typo));
        if !is_typo {
            pos += 1;
            continue;
        }

        // Copy the untouched span in front of the hit, then the
        // canonical header in place of the misspelled one.
        let buf = out.get_or_insert_with(|| String::with_capacity(text.len()));
        buf.push_str(&text[flushed..pos]);
        buf.push_str(CANONICAL);
        pos += CANONICAL.len();
        flushed = pos;
    }

    // Append whatever follows the last hit. When there was no hit the
    // buffer was never created and `None` is returned as-is.
    if let Some(buf) = out.as_mut() {
        buf.push_str(&text[flushed..]);
    }
    out
}
2271
2272/// Patterns 3 and 4 — entry-level normalization within REL TO blocks.
2273///
2274/// Scans `text` for each `REL TO ` substring and processes the
2275/// comma-separated entries that follow until the next `//` (or end of
2276/// text). Two patterns apply per entry pair:
2277///
2278/// - **Token-boundary** — within a single entry, `<single-upper> <two-upper>`
2279///   is replaced with the joined 3-letter trigraph when the join is a
2280///   known trigraph and the 1-letter prefix alone is not.
2281///
2282/// - **Comma misplacement** — across an entry pair,
2283///   `<2-upper>,<1-upper><space>...` (entry N ends with two letters,
2284///   entry N+1 starts with one letter followed by a space and then
2285///   content) is replaced with `<3-upper joined>,` and the leading
2286///   character is stripped from entry N+1, when the join is a known
2287///   trigraph and the 2-letter prefix alone is not. The space guard
2288///   (the 1-upper must be followed by ASCII space) is what
2289///   distinguishes the misplacement shape from a legitimate
2290///   shorter-than-3 entry typo and is enforced by `fix_rel_to_block`.
2291///
2292/// Both patterns require the corrected output to be a known trigraph
2293/// (`CapcoTokenSet::is_trigraph`). The trigraph dictionary is the
2294/// arbiter of "valid country code" — no fuzzy guessing.
2295fn try_rel_to_entry_normalize(text: &str) -> Option<String> {
2296    // Cheap pre-check: entry-level patterns 3 and 4 only fire inside a
2297    // `REL TO ` block, so `apply_rel_to_entry_pass` cannot match
2298    // without that anchor. Skip the `to_owned()` allocation entirely
2299    // when the input has no `REL TO ` substring (the common path for
2300    // canonical inputs and for non-REL-TO segments of the broader
2301    // structural-repair caller).
2302    if !text.contains("REL TO ") {
2303        return None;
2304    }
2305
2306    let token_set = CapcoTokenSet;
2307    let mut any_change = false;
2308    let mut current: Option<String> = None;
2309
2310    // Loop until no further fix fires. Most inputs converge in one
2311    // pass; the loop guards against the rare case where fixing one
2312    // pattern exposes another (e.g., a comma misplacement that ends a
2313    // block adjacent to a token-boundary pattern in the next entry).
2314    // First iteration borrows `text`; subsequent iterations re-pass the
2315    // previously rewritten `String` so the only allocation is the one
2316    // produced by the first successful fix (and any further passes).
2317    loop {
2318        let input: &str = current.as_deref().unwrap_or(text);
2319        match apply_rel_to_entry_pass(input, &token_set) {
2320            Some(rewritten) => {
2321                current = Some(rewritten);
2322                any_change = true;
2323            }
2324            None => break,
2325        }
2326    }
2327
2328    if any_change { current } else { None }
2329}
2330
2331/// Single pass of REL TO entry normalization. Returns the rewritten
2332/// text on first fix, or `None` if no pattern matched.
2333fn apply_rel_to_entry_pass(text: &str, token_set: &CapcoTokenSet) -> Option<String> {
2334    let mut search_start = 0;
2335    while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2336        let header_end = search_start + rel_pos + "REL TO ".len();
2337        // Block ends at the next `//` (start of next category) or end
2338        // of text. The `//` boundary is always 2 bytes; we exclude it
2339        // from the block contents.
2340        let block_end = text[header_end..]
2341            .find("//")
2342            .map(|p| header_end + p)
2343            .unwrap_or(text.len());
2344        let block = &text[header_end..block_end];
2345
2346        if let Some((rel_local_offset, fixed_block)) = fix_rel_to_block(block, token_set) {
2347            let mut result = String::with_capacity(text.len());
2348            result.push_str(&text[..header_end]);
2349            result.push_str(&fixed_block);
2350            result.push_str(&text[block_end..]);
2351            // Suppress unused-variable warning when the helper returns
2352            // a fix — `rel_local_offset` is reserved for a future
2353            // localized-emit optimization but not needed today since
2354            // we rebuild the full text.
2355            let _ = rel_local_offset;
2356            return Some(result);
2357        }
2358
2359        search_start = block_end;
2360    }
2361    None
2362}
2363
2364/// Walk the comma-separated entries of one REL TO block; apply
2365/// pattern 3 (token-boundary inside an entry) and pattern 4 (comma
2366/// misplaced between adjacent entries) on first match. Returns
2367/// `(local_offset, rewritten_block)` for the first fix, or `None` if
2368/// the block is already canonical.
2369///
2370/// `local_offset` is the byte offset within `block` where the fix
2371/// landed; reserved for future localized emit optimizations.
2372fn fix_rel_to_block(block: &str, token_set: &CapcoTokenSet) -> Option<(usize, String)> {
2373    // Collect entries with their byte offsets within the block so a
2374    // fix can be emitted with precise positioning.
2375    let mut entries: Vec<(usize, &str)> = Vec::new();
2376    let mut cursor = 0;
2377    for entry in block.split(',') {
2378        entries.push((cursor, entry));
2379        cursor += entry.len() + 1; // +1 for the comma separator
2380    }
2381
2382    // Pattern 3: token-boundary inside a single entry.
2383    // `<lead-ws><single-upper> <two-upper><trail-ws>` → joined trigraph.
2384    for (entry_offset, entry) in &entries {
2385        let trimmed = entry.trim();
2386        // Need exactly 4 chars: `A US` shape. Anything else (3, 5, etc.)
2387        // is either canonical or a different recovery shape.
2388        if trimmed.len() != 4 {
2389            continue;
2390        }
2391        let bytes = trimmed.as_bytes();
2392        if !bytes[0].is_ascii_uppercase()
2393            || bytes[1] != b' '
2394            || !bytes[2].is_ascii_uppercase()
2395            || !bytes[3].is_ascii_uppercase()
2396        {
2397            continue;
2398        }
2399        let joined = format!(
2400            "{}{}{}",
2401            bytes[0] as char, bytes[2] as char, bytes[3] as char
2402        );
2403        if !token_set.is_trigraph(&joined) {
2404            continue;
2405        }
2406        // Defensive: don't fire if the 1-letter prefix is itself a
2407        // trigraph (no real CAPCO trigraph is 1-letter, but guard
2408        // anyway against future schema changes).
2409        let one_letter = std::str::from_utf8(&bytes[..1]).expect("ASCII upper");
2410        if token_set.is_trigraph(one_letter) {
2411            continue;
2412        }
2413
2414        // Rebuild the block: replace the 4-char entry contents with
2415        // the 3-char joined trigraph, preserving any leading/trailing
2416        // whitespace inside the entry.
2417        // entry = lead_ws + trimmed + trail_ws; replace `trimmed`
2418        // (4 chars) with `joined` (3 chars), preserving the
2419        // surrounding whitespace verbatim.
2420        let lead_ws_len = entry.len() - entry.trim_start().len();
2421        let mut rewritten_entry = String::with_capacity(entry.len() - 1);
2422        rewritten_entry.push_str(&entry[..lead_ws_len]);
2423        rewritten_entry.push_str(&joined);
2424        rewritten_entry.push_str(&entry[lead_ws_len + trimmed.len()..]);
2425
2426        let mut result = String::with_capacity(block.len());
2427        result.push_str(&block[..*entry_offset]);
2428        result.push_str(&rewritten_entry);
2429        result.push_str(&block[*entry_offset + entry.len()..]);
2430        return Some((*entry_offset, result));
2431    }
2432
2433    // Pattern 4: comma misplaced between entries.
2434    // entries[i] = `<2-upper>` (trimmed) AND
2435    // entries[i+1] = `<1-upper><space><rest>` (trimmed) AND
2436    // joined 3-letter is a trigraph AND 2-letter alone is not.
2437    for i in 0..entries.len().saturating_sub(1) {
2438        let (left_off, left_entry) = &entries[i];
2439        let (right_off, right_entry) = &entries[i + 1];
2440        let left_trim = left_entry.trim();
2441        let right_trim_start = right_entry.trim_start();
2442        if left_trim.len() != 2 || !left_trim.chars().all(|c| c.is_ascii_uppercase()) {
2443            continue;
2444        }
2445        let right_bytes = right_trim_start.as_bytes();
2446        if right_bytes.len() < 2 || !right_bytes[0].is_ascii_uppercase() || right_bytes[1] != b' ' {
2447            continue;
2448        }
2449        let joined = format!("{}{}", left_trim, right_bytes[0] as char);
2450        if !token_set.is_trigraph(&joined) {
2451            continue;
2452        }
2453        if token_set.is_trigraph(left_trim) {
2454            // 2-letter alone is already a trigraph (e.g., EU); the
2455            // comma might be intentional. Skip.
2456            continue;
2457        }
2458
2459        // Rebuild: left entry becomes `<lead-ws><joined>`, right
2460        // entry becomes ` <rest-after-first-char-and-space>` (we
2461        // strip the first char and the space, prepend a single
2462        // canonical space).
2463        let left_lead = left_entry.len() - left_entry.trim_start().len();
2464        let mut new_left = String::with_capacity(left_entry.len() + 1);
2465        new_left.push_str(&left_entry[..left_lead]);
2466        new_left.push_str(&joined);
2467
2468        let right_lead = right_entry.len() - right_trim_start.len();
2469        // Skip the first char and the following space.
2470        let after_first = &right_trim_start[2..];
2471        let mut new_right = String::with_capacity(right_entry.len());
2472        new_right.push_str(&right_entry[..right_lead]);
2473        new_right.push(' ');
2474        new_right.push_str(after_first);
2475
2476        // Emit: block[..left_off] + new_left + ',' + new_right + block[right_off+right_entry.len()..]
2477        let mut result = String::with_capacity(block.len() + 1);
2478        result.push_str(&block[..*left_off]);
2479        result.push_str(&new_left);
2480        result.push(',');
2481        result.push_str(&new_right);
2482        result.push_str(&block[*right_off + right_entry.len()..]);
2483        return Some((*left_off, result));
2484    }
2485
2486    None
2487}
2488
2489// ---------------------------------------------------------------------------
2490// REL TO trigraph fuzzy expansion (issue #233)
2491// ---------------------------------------------------------------------------
2492
2493/// Emit one canonical-byte alternate per fuzzy candidate for each
2494/// unknown 3- or 4-char REL TO entry.
2495///
2496/// The standard fuzzy path in [`fuzzy_correct_tokens`] operates against
2497/// the [`CapcoTokenSet::correction_vocab`] slice, which deliberately
2498/// excludes country trigraphs (the design comment on `ALL_CVE_TOKENS`
2499/// in `crates/ism/build.rs` calls this out — country codes live
2500/// exclusively in [`marque_ism::TRIGRAPHS`] and are reached through
2501/// [`CapcoTokenSet::is_trigraph`]). So a typo'd 3-char REL TO entry
2502/// like `USB` gets no correction from the standard pass — there's
2503/// nothing in the vocab to match it against. The strict parser then
2504/// emits a `TokenKind::Unknown` for the entry (issue #233 change in
2505/// `parse_rel_to_with_spans`), and the dispatcher's step 3a rejects
2506/// the "drop USB" candidate.
2507///
2508/// With the original candidate filtered out, this function provides
2509/// the alternates the dispatcher chooses between: it walks each
2510/// `REL TO ` block in `text`, finds 3- or 4-char comma-separated
2511/// entries that aren't already valid trigraphs/tetragraphs, asks the
2512/// trigraph-vocab matcher for all candidates within the edit-distance
2513/// bound, and emits one alternate text per candidate (with the
2514/// substitution applied in-place).
2515///
2516/// Each emitted alternate carries an `EditDistance1` /
2517/// `EditDistance2` feature (paired with the candidate's distance) so
2518/// the audit trail records the fuzzy work. The caller pushes a
2519/// `BaseRateCommonMarking` feature acknowledging the trigraph-prior
2520/// contribution. The decoder's `score_candidate` later sums the
2521/// trigraph-prior contribution over the parsed `rel_to` slice; the
2522/// popular-vs-rare log-prior delta (e.g., `log_prior(USA) -
2523/// log_prior(UZB)` ≈ +7 nats) decides which alternate wins the
2524/// `UNAMBIGUOUS_LOG_MARGIN` (~1.6 nat) contest.
2525///
2526/// **Scope**: 3-char (trigraph) and 4-char (tetragraph) ASCII
2527/// uppercase entries only. Two-letter entries (`EU`) are below
2528/// `MIN_FUZZY_LEN`; longer multi-char entries (`AUSTRALIA_GROUP`)
2529/// have low fuzzy-tie risk because their lengths rarely collide.
2530/// Only fires when the entry token is NOT already a valid
2531/// trigraph/tetragraph — so `AUT`, `UZB`, `FVEY`, `ACGU`, `ISAF`
2532/// in legitimate use pass through unchanged. 4-char scope added to
2533/// recover coalition-shorthand typos (`FVYE` → `FVEY`,
2534/// `SGAF` → `ISAF`); issue #246.
2535///
2536/// **CAPCO authority**: REL TO syntax is defined in CAPCO-2016 §H.8.
2537/// The trigraph/tetragraph dictionary itself comes from the ODNI CVE
2538/// schema in `CVEnumISMCATRelTo.xsd`, baked into
2539/// [`CapcoTokenSet::is_trigraph`] and into the
2540/// [`marque_ism::TRIGRAPHS`] slice this function fuzzy-matches against.
2541fn try_rel_to_fuzzy_trigraph_candidates(
2542    text: &str,
2543    trigraph_matcher: &FuzzyVocabMatcher<'_>,
2544) -> Vec<(String, FeatureEntry)> {
2545    let token_set = CapcoTokenSet;
2546    let mut out: Vec<(String, FeatureEntry)> = Vec::new();
2547
2548    let mut search_start = 0;
2549    while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2550        let header_end = search_start + rel_pos + "REL TO ".len();
2551        // Block ends at the EARLIEST of: `//` (next category), `\n`
2552        // (banner/CAB candidates from `Scanner::scan_banners` arrive
2553        // as full lines, so a REL TO line can have trailing prose
2554        // beyond the marking), or `)` (portion-form close). CAPCO
2555        // §H.8 / §A authority: `//` is the category separator; `,`
2556        // separates entries within the REL TO category itself.
2557        // Mirrors the corpus analyzer's terminator priority in
2558        // `tools/corpus-analysis/analyze.py` (`_extract_rel_to_trigraphs`).
2559        let tail = &text[header_end..];
2560        let block_len = ["//", "\n", ")"]
2561            .iter()
2562            .filter_map(|sep| tail.find(sep))
2563            .min()
2564            .unwrap_or(tail.len());
2565        let block_end = header_end + block_len;
2566        let block = &text[header_end..block_end];
2567
2568        // Walk the comma-separated entries with their byte offsets.
2569        let mut cursor = 0usize;
2570        for entry in block.split(',') {
2571            let entry_start = cursor;
2572            let entry_end = cursor + entry.len();
2573            cursor = entry_end + 1; // skip the comma
2574
2575            let trimmed = entry.trim();
2576            // 3-char (trigraph) or 4-char (tetragraph) ASCII-uppercase
2577            // entries only — see fn doc for scope rationale.
2578            let tlen = trimmed.len();
2579            if (tlen != 3 && tlen != 4) || !trimmed.bytes().all(|b| b.is_ascii_uppercase()) {
2580                continue;
2581            }
2582            // Skip already-valid trigraphs/tetragraphs (the matcher's
2583            // binary search would also short-circuit on a vocab hit, but
2584            // keeping the explicit check means a token like `FVEY`
2585            // appearing legitimately never gets multi-cast).
2586            if token_set.is_trigraph(trimmed) {
2587                continue;
2588            }
2589
2590            // Bypass the standard `MIN_USEFUL_CONFIDENCE` floor:
2591            // for a 3-char input, distance-2 corrections sit at
2592            // confidence 0.40, below the default 0.45 cutoff that
2593            // protects the standalone fuzzy path. Issue #233's score-
2594            // time tiebreak (corpus-weighted trigraph priors +
2595            // `UNAMBIGUOUS_LOG_MARGIN`) supplies the safety the
2596            // confidence-floor was substituting for; without lowering
2597            // it here, a typo like `ASU → AUS` (plain Levenshtein
2598            // distance 2) never reaches the scorer.
2599            let mut candidates = trigraph_matcher.correct_all_with_floor(trimmed, 0.0);
2600            if candidates.is_empty() {
2601                continue;
2602            }
2603
2604            // Drop candidates that would duplicate a trigraph already
2605            // present elsewhere in this REL TO block. CAPCO-2016 §H.8
2606            // does not state "no duplicates" as an explicit textual
2607            // prohibition — the REL TO grammar (§A.6 / §H.8 p131-150)
2608            // describes a list of country codes ordered USA-first then
2609            // ascending alphabetic, which structurally implies a set of
2610            // distinct codes but does not forbid repetition in so many
2611            // words. The reason we drop duplicates here is mechanical,
2612            // not citational: the bag-of-tokens scorer happens to
2613            // *reward* duplicates (each instance adds its log-prior
2614            // again), so without this filter an ambiguous typo
2615            // adjacent to a popular trigraph could collapse to
2616            // "REL TO USA, USA, GBR" because USA's log-prior
2617            // contribution is additive. Emitting a duplicate-creating
2618            // candidate would therefore be structurally redundant and
2619            // cause the scorer to erroneously favor it. The block's
2620            // other entries are computed by re-walking
2621            // `block.split(',')` and taking the trigraph form of any
2622            // 3-char ASCII-uppercase entry that's in the CVE
2623            // recognition set.
2624            let other_trigraphs: Vec<&str> = block
2625                .split(',')
2626                .map(str::trim)
2627                .filter(|e| {
2628                    let elen = e.len();
2629                    (elen == 3 || elen == 4)
2630                        && e.bytes().all(|b| b.is_ascii_uppercase())
2631                        && *e != trimmed
2632                        && token_set.is_trigraph(e)
2633                })
2634                .collect();
2635            candidates.retain(|c| !other_trigraphs.contains(&c.token));
2636            if candidates.is_empty() {
2637                continue;
2638            }
2639
2640            // Rank candidates by (distance, then country-code
2641            // log-prior). The plain Levenshtein hits for a 3-char
2642            // input often produce 20+ distance-2 candidates (every
2643            // other 3-char trigraph that shares one letter). Without
2644            // a prior-rank pre-filter, the K=16 attempt cap upstream
2645            // gets exhausted by low-prior alternates and the
2646            // high-prior ones get dropped. Sorting by (distance asc,
2647            // log-prior desc) keeps the most plausible candidates
2648            // first; we cap at TRIGRAPH_FUZZY_TOP_K per ambiguous
2649            // entry to bound the candidate-set growth.
2650            //
2651            // The cap value (4) is sized so a single ambiguous entry
2652            // doesn't crowd out the other decoder paths
2653            // (`fuzzy_corrected`, reorder, delimiter-insert, etc.):
2654            // 4 alternates ≤ K_MAX_CANDIDATES (8) leaves room for
2655            // the standard candidates the dispatcher also needs.
2656            const TRIGRAPH_FUZZY_TOP_K: usize = 4;
2657            candidates.sort_by(|a, b| {
2658                a.distance.cmp(&b.distance).then_with(|| {
2659                    let pa = marque_capco::priors::country_code_log_prior(a.token)
2660                        .unwrap_or(f32::NEG_INFINITY);
2661                    let pb = marque_capco::priors::country_code_log_prior(b.token)
2662                        .unwrap_or(f32::NEG_INFINITY);
2663                    pb.total_cmp(&pa)
2664                })
2665            });
2666            candidates.truncate(TRIGRAPH_FUZZY_TOP_K);
2667
2668            for cand in &candidates {
2669                // Reconstruct the full `text` with the entry replaced.
2670                // The 3-char trimmed sub-slice within the entry
2671                // preserves any surrounding whitespace.
2672                let lead_ws_len = entry.len() - entry.trim_start().len();
2673                let trail_ws_len = entry.len() - entry.trim_end().len();
2674                let mut rewritten_entry = String::with_capacity(entry.len());
2675                rewritten_entry.push_str(&entry[..lead_ws_len]);
2676                rewritten_entry.push_str(cand.token);
2677                rewritten_entry.push_str(&entry[entry.len() - trail_ws_len..]);
2678
2679                let mut alt = String::with_capacity(text.len());
2680                alt.push_str(&text[..header_end + entry_start]);
2681                alt.push_str(&rewritten_entry);
2682                alt.push_str(&text[header_end + entry_end..]);
2683
2684                // `FeatureId` is a closed audit-schema enum (see
2685                // `crates/rules/src/confidence.rs` and `MARQUE_AUDIT_SCHEMA`);
2686                // pair each (id, delta) directly off `cand.distance`
2687                // so the match is total over the only two outcomes
2688                // `cand.distance` can take here. The standalone fuzzy
2689                // matcher caps results at `MAX_EDIT_DISTANCE = 2`.
2690                let entry = if cand.distance <= 1 {
2691                    FeatureEntry {
2692                        id: FeatureId::EditDistance1,
2693                        delta: -0.5,
2694                    }
2695                } else {
2696                    FeatureEntry {
2697                        id: FeatureId::EditDistance2,
2698                        delta: -1.2,
2699                    }
2700                };
2701                out.push((alt, entry));
2702            }
2703        }
2704
2705        search_start = block_end;
2706    }
2707
2708    out
2709}
2710
2711// ---------------------------------------------------------------------------
2712// REL TO USA-injection for short first entries (issue #234 PR-B)
2713// ---------------------------------------------------------------------------
2714
2715/// Emit one canonical-byte alternate per REL TO block whose first
2716/// entry is a 1- or 2-character ASCII-uppercase token AND USA is not
2717/// otherwise present in the block. The alternate replaces that short
2718/// first entry with `USA`.
2719///
2720/// **Why complement to PR-A.** Issue #233's
2721/// [`try_rel_to_fuzzy_trigraph_candidates`] handles 3-char REL TO
2722/// entries: an unknown trigraph-shaped token gets fuzzy-matched
2723/// against the [`marque_ism::TRIGRAPHS`] vocabulary, and corpus-
2724/// weighted log-priors break ties at score time. That path
2725/// deliberately skips entries below `MIN_FUZZY_LEN = 3` (see the
2726/// `if trimmed.len() != 3` guard in `try_rel_to_fuzzy_trigraph_candidates`)
2727/// because `phf`-style fuzzy matching is unreliable on inputs that
2728/// short — a 2-char input is edit-distance-1 from many distinct
2729/// trigraphs and the mapper has no signal to break the tie.
2730///
2731/// For REL TO specifically, the §H.8 p150–151 grammar gives us a
2732/// stronger signal that fuzzy-matching cannot exploit: **USA must
2733/// always appear first**. So when we see a REL TO block whose first
2734/// entry is a 1- or 2-character ASCII-uppercase token, the most
2735/// likely intent — far above any other 3-char trigraph — is that
2736/// the user typed USA and dropped one or two characters. The fixture
2737/// at `tests/fixtures/mangled/typo/ad2bcfe3ac0b0765.json`
2738/// (`REL TO SA, AUS, GBR` → `REL TO USA, AUS, GBR`) is the canonical
2739/// case: `SA` is shape-incompatible with PR-A's 3-char floor, so
2740/// without this complementary path the decoder produces zero
2741/// candidates and the fixture fails recovery.
2742///
2743/// **CAPCO authority**: the USA-first invariant is CAPCO-2016 §H.8
2744/// p151: "After 'USA', list the required one or more trigraph country
2745/// codes in alphabetical order." E020 enforces that invariant at the
2746/// rule layer (via the `marque-capco`-private `canonicalize_trigraph_list`
2747/// helper). This decoder path operates one stage earlier — pre-strict-
2748/// parse, on raw text — so it does NOT call the rule-layer helper; it
2749/// emits a candidate text and lets the downstream pipeline (strict
2750/// parse + E020) verify and re-canonicalize as needed.
2751///
2752/// **Scope and guards** (mirrors PR-A's design):
2753///
2754/// - Fires only when the first entry's trimmed length is 1 or 2 ASCII
2755///   uppercase bytes (3-char entries belong to PR-A's domain).
2756/// - Skips when USA is already present elsewhere in the block — that
2757///   case isn't a USA-typo, it's an unrelated short prefix the user
2758///   may have meant differently. The block stays as-is.
2759/// - Skips when the block has fewer than two entries — a single
2760///   short entry plus nothing else doesn't fit the §H.8 p151
2761///   "USA + trigraph list" shape.
2762/// - Emits the substitution transform only — full canonicalization
2763///   (USA first, remaining trigraphs alphabetical, no duplicates) is
2764///   downstream. If the original list's tail (other than the
2765///   corrupted first entry) wasn't already alphabetical, E020 will
2766///   fire on the post-decode text and produce its own fix; if the
2767///   injection produced a duplicate (USA was already present in the
2768///   block under a different shape), the `already_has_usa` guard
2769///   above suppresses emit. Keeping the decoder text-level (no
2770///   `marque-capco` imports) avoids re-entering the rule layer
2771///   mid-recognition while preserving the single-source-of-truth
2772///   property — the canonical ordering rule lives in `marque-capco`,
2773///   and the decoder defers to whatever it produces post-parse.
2774/// - Audit signal: each candidate carries
2775///   [`FeatureId::BaseRateCommonMarking`] as provenance only, with
2776///   zero delta. This records that USA is the dominant trigraph in
2777///   the corpus prior without changing score or double-counting that
2778///   prior in the posterior. Reusing `BaseRateCommonMarking` (vs
2779///   introducing a new variant) keeps the audit schema closed —
2780///   `MARQUE_AUDIT_SCHEMA` stays at `marque-mvp-2`.
2781fn try_rel_to_usa_injection_candidates(text: &str) -> Vec<(String, FeatureEntry)> {
2782    let mut out: Vec<(String, FeatureEntry)> = Vec::new();
2783
2784    let mut search_start = 0;
2785    while let Some(rel_pos) = text[search_start..].find("REL TO ") {
2786        let header_end = search_start + rel_pos + "REL TO ".len();
2787        // Block ends at the EARLIEST of: `//` (next category), `\n`
2788        // (banner/CAB candidates from `Scanner::scan_banners` arrive
2789        // as full lines), or `)` (portion-form close). CAPCO §H.8 /
2790        // §A authority: `//` is the category separator; `,` separates
2791        // entries within the REL TO category itself. Mirrors the
2792        // terminator priority in `try_rel_to_fuzzy_trigraph_candidates`
2793        // and the corpus analyzer's `_extract_rel_to_trigraphs`.
2794        let tail = &text[header_end..];
2795        let block_len = ["//", "\n", ")"]
2796            .iter()
2797            .filter_map(|sep| tail.find(sep))
2798            .min()
2799            .unwrap_or(tail.len());
2800        let block_end = header_end + block_len;
2801        let block = &text[header_end..block_end];
2802
2803        // Walk entries with their byte offsets within the block.
2804        // Pre-size from comma count + 1 — typical REL TO blocks have
2805        // 2–6 entries, so this avoids reallocations on the common case.
2806        let entries: Vec<(usize, &str)> = {
2807            let mut v = Vec::with_capacity(block.bytes().filter(|&b| b == b',').count() + 1);
2808            let mut cursor = 0usize;
2809            for entry in block.split(',') {
2810                v.push((cursor, entry));
2811                cursor += entry.len() + 1; // +1 for the comma separator
2812            }
2813            v
2814        };
2815        if entries.len() < 2 {
2816            // Single-entry block: doesn't match the §H.8 p151
2817            // "USA + trigraph list" shape we're recovering.
2818            search_start = block_end;
2819            continue;
2820        }
2821
2822        // First entry is the candidate USA-typo position. The
2823        // structural guard is shape-only — len ∈ {1, 2}, all ASCII
2824        // uppercase. 3-char entries fall through to PR-A. Length 0
2825        // (e.g., a leading comma) is already filtered.
2826        let (first_entry_offset, first_entry) = entries[0];
2827        let trimmed = first_entry.trim();
2828        let is_short =
2829            (1..=2).contains(&trimmed.len()) && trimmed.bytes().all(|b| b.is_ascii_uppercase());
2830        if !is_short {
2831            search_start = block_end;
2832            continue;
2833        }
2834
2835        // Skip if USA is already present elsewhere in the block —
2836        // a USA-injection candidate would create a duplicate, which
2837        // E052 (issue #234 PR-B) would then need to dedup. Short-
2838        // circuit here rather than emit-and-redup.
2839        let already_has_usa = entries.iter().skip(1).any(|(_, e)| e.trim() == "USA");
2840        if already_has_usa {
2841            search_start = block_end;
2842            continue;
2843        }
2844
2845        // Build the substituted text. Preserve the entry's
2846        // surrounding whitespace (lead/trail) so the splice
2847        // round-trips through the strict parser the same way the
2848        // original would have.
2849        let lead_ws_len = first_entry.len() - first_entry.trim_start().len();
2850        let trail_ws_len = first_entry.len() - first_entry.trim_end().len();
2851        let mut rewritten_entry = String::with_capacity(first_entry.len() + 3);
2852        rewritten_entry.push_str(&first_entry[..lead_ws_len]);
2853        rewritten_entry.push_str("USA");
2854        rewritten_entry.push_str(&first_entry[first_entry.len() - trail_ws_len..]);
2855
2856        let mut alt = String::with_capacity(text.len() + 3);
2857        alt.push_str(&text[..header_end + first_entry_offset]);
2858        alt.push_str(&rewritten_entry);
2859        alt.push_str(&text[header_end + first_entry_offset + first_entry.len()..]);
2860
2861        // Audit-only provenance. The load-bearing scoring lives in
2862        // `score_candidate`, which sums `country_code_log_prior(USA)`
2863        // — already an extreme positive in the baked corpus prior —
2864        // over the parsed `rel_to` slice and is what carries the
2865        // candidate to victory. The `BaseRateCommonMarking` entry
2866        // here records the prior's contribution in the audit log
2867        // without double-counting it in the decoder's score, mirror-
2868        // ing PR-A's trigraph-prior treatment (delta = 0.0).
2869        let entry = FeatureEntry {
2870            id: FeatureId::BaseRateCommonMarking,
2871            delta: 0.0,
2872        };
2873        out.push((alt, entry));
2874
2875        search_start = block_end;
2876    }
2877
2878    out
2879}
2880
2881// ---------------------------------------------------------------------------
2882// SCI delimiter recovery (issue #198 — #133 PR 10)
2883// ---------------------------------------------------------------------------
2884
2885/// SCI delimiter recovery preprocessing — issue #198, #133 PR 10.
2886///
2887/// Repairs three classes of SCI delimiter typos against the closed
2888/// CVE vocabulary in `CVEnumISMSCIControls.xml`. Vocabulary checks
2889/// dispatch through the build-time-generated [`SciControlBare::parse`]
2890/// (bare control systems) and [`SciControl::parse`] (the full CVE set
2891/// including all registered control-compartment compounds), so the
2892/// repair surface tracks ODNI schema updates automatically — no
2893/// hand-maintained vocabulary slice to drift out of sync per
2894/// Constitution IV (Layer 1 generated predicates):
2895///
2896/// - **Pattern A (concatenated compound)**: a token equal to a compound
2897///   with the hyphen removed → canonical hyphenated form. `HCSP →
2898///   HCS-P`, `SIG → SI-G`, `TKKAND → TK-KAND`, etc.
2899/// - **Pattern B (concatenated bare control systems)**: a token of
2900///   length 4–6 that splits cleanly into two bare control systems →
2901///   slash-joined form (`SITK → SI/TK`, `HCSSI → HCS/SI`) per §A.6
2902///   p16 and the `TOP SECRET//ANB/SI/TK/XNB//NOFORN` example on p194.
2903///   Ambiguous splits bail out — see [`repair_sci_token`] for the
2904///   guard.
2905/// - **Pattern C (wrong delimiter)**: a token of the form
2906///   `<bare_cs>-<bare_cs>` that is NOT itself a registered compound →
2907///   slash-joined form. `SI-TK → SI/TK` (because `SI-TK` is not
2908///   registered), but `SI-G` is left alone (it IS registered — `-` is
2909///   the correct control-compartment separator per §A.6 p16).
2910///
2911/// **Out of scope** — sub-compartment fuzzy recovery (`ABCE → ABCD`),
2912/// unregistered-compartment recovery, and any rewrite that would
2913/// require fuzz-correcting against agency-assigned codewords. Those
2914/// require operator-supplied vocab (issue #180) — the engine cannot
2915/// invent identifiers it doesn't know are valid (Constitution VIII).
2916///
2917/// **Architectural shape** mirrors `try_rel_to_structural_repair`
2918/// (PR 9, #190): runs as preprocessing on the input string before
2919/// per-token fuzzy correction, returns `Some(repaired)` only when at
2920/// least one repair fired. The caller pushes a `BaseRateCommonMarking`
2921/// feature onto `delim_features` so every candidate derived from the
2922/// repaired text inherits the audit trace.
2923///
2924/// **Allocation behavior**: short-circuits without allocation when the
2925/// pre-check finds no SCI control system root in the text. The
2926/// per-token walk borrows the input until a fix actually fires.
2927fn try_sci_delimiter_repair(text: &str) -> Option<String> {
2928    if !contains_any_sci_root(text) {
2929        return None;
2930    }
2931
2932    // ASCII-only guard. The SCI control-system vocabulary
2933    // (`SciControlBare::ALL`) and the registered compound names
2934    // (`SciControl::ALL`) are pure ASCII, as are the delimiters this
2935    // function recognizes (`-`, `/`, `(`, `)`, space, tab, newline,
2936    // CR, comma). So any non-ASCII input cannot match any pattern;
2937    // bailing early avoids the byte-vs-char-boundary hazard that
2938    // would otherwise arise from indexing `text` with byte offsets.
2939    if !text.is_ascii() {
2940        return None;
2941    }
2942
2943    let bytes = text.as_bytes();
2944    let mut result: Option<String> = None;
2945    let mut last_copied = 0usize;
2946    let mut i = 0usize;
2947
2948    while i < bytes.len() {
2949        let at_boundary = i == 0
2950            || matches!(
2951                bytes[i - 1],
2952                b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','
2953            );
2954        if !at_boundary {
2955            i += 1;
2956            continue;
2957        }
2958
2959        let token_start = i;
2960        let token_end = bytes[token_start..]
2961            .iter()
2962            .position(|&b| matches!(b, b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','))
2963            .map(|n| token_start + n)
2964            .unwrap_or(bytes.len());
2965
2966        if token_start < token_end {
2967            let token = &text[token_start..token_end];
2968            if let Some(repaired) = repair_sci_token(token) {
2969                let r = result.get_or_insert_with(|| String::with_capacity(text.len()));
2970                r.push_str(&text[last_copied..token_start]);
2971                r.push_str(&repaired);
2972                last_copied = token_end;
2973            }
2974        }
2975
2976        // Advance past the token; the next iteration will re-check the
2977        // boundary before the byte after the delimiter (or terminate at
2978        // end-of-input).
2979        i = token_end + 1;
2980    }
2981
2982    result.map(|mut r| {
2983        r.push_str(&text[last_copied..]);
2984        r
2985    })
2986}
2987
2988/// Cheap pre-check for [`try_sci_delimiter_repair`]: returns true when
2989/// the input contains at least one bare SCI control system identifier
2990/// as a substring. False positives just mean we walk the bytes and
2991/// return `None` — no correctness impact, only a performance
2992/// optimization for the overwhelmingly common case where the input has
2993/// no SCI category at all.
fn contains_any_sci_root(text: &str) -> bool {
    // Substring probes, case-sensitive. A hit inside an unrelated word
    // (e.g. the `SI` in `CLASSIFIED`) is accepted: it only costs the
    // caller a token walk.
    // NOTE(review): this table is maintained by hand. Confirm it stays
    // in step with the generated bare control-system vocabulary — a
    // root missing here would make the pre-check skip repairable text.
    const SCI_ROOTS: [&str; 7] = ["HCS", "KLM", "MVL", "RSV", "BUR", "SI", "TK"];
    SCI_ROOTS.iter().any(|root| text.contains(*root))
}
3003
3004/// Per-token classifier for SCI delimiter repair. Returns the repaired
3005/// token if one of patterns A/B/C matches; otherwise `None`.
3006///
3007/// All vocabulary checks dispatch through the build-time-generated
3008/// [`SciControlBare::parse`] and [`SciControl::parse`] (from
3009/// `marque-ism`'s generated `values.rs`), so the repair surface tracks
3010/// `CVEnumISMSCIControls.xml` automatically. New CVE compounds added
3011/// in a future ODNI schema bump (e.g., a hypothetical `SI-XYZ`) are
3012/// auto-discovered by Pattern A without any code change here.
3013///
3014/// Pattern dispatch order:
3015/// 1. Pattern A (split into bare-CS prefix + suffix; if
3016///    `{prefix}-{suffix}` is a registered CVE value, return it)
3017/// 2. Pattern C (token contains `-`, neither side is a registered
3018///    compound's compartment, both halves are bare CS)
3019/// 3. Pattern B (no `-`, splits into two bare CS, unambiguous)
3020fn repair_sci_token(token: &str) -> Option<String> {
3021    if token.is_empty() {
3022        return None;
3023    }
3024
3025    // ASCII-only guard. The CVE vocabulary is pure ASCII, so a non-
3026    // ASCII token cannot match any pattern; bailing early ensures
3027    // the byte-offset slicing below (`token[..split]`,
3028    // `token[split..]`, `token[..dash_pos]`, `token[dash_pos + 1..]`)
3029    // never lands in the middle of a multi-byte UTF-8 sequence. This
3030    // is a defense-in-depth check — the only production caller
3031    // (`try_sci_delimiter_repair`) already gates on ASCII — but
3032    // keeping it here makes the function's invariant local and
3033    // self-evident for any future caller (e.g., a unit test).
3034    if !token.is_ascii() {
3035        return None;
3036    }
3037
3038    let len = token.len();
3039
3040    // Pattern A — concatenated registered compound. Walk every split
3041    // where the prefix is a bare control system; if `{prefix}-{suffix}`
3042    // is in the CVE vocabulary, return the canonical hyphenated form.
3043    // Bare CS lengths are 2 or 3; suffix length range comes from CVE
3044    // (max compartment-form suffix is 4 chars, e.g. TK-BLFH).
3045    if !token.contains('-') && (3..=8).contains(&len) {
3046        for &split in &[2usize, 3] {
3047            if split >= len {
3048                continue;
3049            }
3050            let prefix = &token[..split];
3051            let suffix = &token[split..];
3052            if SciControlBare::parse(prefix).is_some() {
3053                let canonical = format!("{prefix}-{suffix}");
3054                if SciControl::parse(&canonical).is_some() {
3055                    return Some(canonical);
3056                }
3057            }
3058        }
3059    }
3060
3061    // Pattern C — wrong delimiter (`-` between two bare CS). Skip if
3062    // the whole token is itself a registered CVE compound.
3063    if let Some(dash_pos) = token.find('-') {
3064        if SciControl::parse(token).is_some() {
3065            return None;
3066        }
3067        let prefix = &token[..dash_pos];
3068        let suffix = &token[dash_pos + 1..];
3069        if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
3070            return Some(format!("{prefix}/{suffix}"));
3071        }
3072        return None;
3073    }
3074
3075    // Pattern B — concatenated bare control systems (no delimiter).
3076    // Bare CS lengths are 2 or 3; the concatenation is therefore in
3077    // [4..=6]. Try splits at positions 2 and 3 (the only split points
3078    // that can yield two valid bare-CS halves) and require an
3079    // unambiguous match.
3080    if !(4..=6).contains(&len) {
3081        return None;
3082    }
3083    let mut found: Option<(&str, &str)> = None;
3084    for &split in &[2usize, 3] {
3085        if split >= len {
3086            continue;
3087        }
3088        let suffix_len = len - split;
3089        if !(2..=3).contains(&suffix_len) {
3090            continue;
3091        }
3092        let prefix = &token[..split];
3093        let suffix = &token[split..];
3094        if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
3095            if found.is_some() {
3096                return None;
3097            }
3098            found = Some((prefix, suffix));
3099        }
3100    }
3101    found.map(|(p, s)| format!("{p}/{s}"))
3102}
3103
3104// ---------------------------------------------------------------------------
3105// Token reordering
3106// ---------------------------------------------------------------------------
3107
3108/// Try to produce a canonical-order rewrite of `text`.
3109///
3110/// The CAPCO category order is: classification → SCI → SAR → dissem.
3111/// If the observed segments are out of order — e.g., `NOFORN//SECRET`
3112/// with dissem first — this helper swaps them into the canonical
3113/// order. Returns `None` when the input is already in canonical order
3114/// or when reordering doesn't apply (CAB lines, single-segment input).
3115fn try_canonical_reorder(text: &str) -> Option<String> {
3116    // Only banner/portion-shaped input (contains `//`) is reorderable
3117    // with this heuristic. CABs use keyed authority lines, not
3118    // category ordering.
3119    if !text.contains("//") {
3120        return None;
3121    }
3122
3123    // Portion form: `(C//NF)` — strip the surrounding parens for
3124    // reasoning, re-wrap at emit.
3125    let (prefix, body, suffix) = if text.starts_with('(') && text.ends_with(')') {
3126        ("(", &text[1..text.len() - 1], ")")
3127    } else {
3128        ("", text, "")
3129    };
3130
3131    let segments: Vec<&str> = body.split("//").collect();
3132    if segments.len() < 2 {
3133        return None;
3134    }
3135
3136    // Classify each segment by its dominant category. We only
3137    // reorder when exactly one segment is classification-dominant
3138    // and at least one other is dissem-dominant — otherwise the
3139    // input is too ambiguous for a clean swap.
3140    let mut class_segments: Vec<&str> = Vec::new();
3141    let mut dissem_segments: Vec<&str> = Vec::new();
3142    let mut other_segments: Vec<&str> = Vec::new();
3143    for seg in &segments {
3144        let seg = seg.trim();
3145        if seg.is_empty() {
3146            continue;
3147        }
3148        match classify_segment(seg) {
3149            SegmentClass::Classification => class_segments.push(seg),
3150            SegmentClass::Dissem => dissem_segments.push(seg),
3151            SegmentClass::Other => other_segments.push(seg),
3152        }
3153    }
3154
3155    if class_segments.is_empty() {
3156        return None;
3157    }
3158
3159    // Detect non-US markings: any classification segment is a NATO,
3160    // JOINT, or FGI classification (not a US classification level).
3161    let is_non_us = class_segments
3162        .iter()
3163        .any(|s| is_non_us_classification_segment(s));
3164
3165    // Already-canonical check: if the classification segment is the
3166    // first non-empty segment, no reorder is needed.
3167    // For non-US markings: also require that the body already starts
3168    // with `//` (the empty US classification slot). If the class is
3169    // first but the `//` prefix is absent, fall through to add it.
3170    if let Some(first) = segments.iter().find(|s| !s.trim().is_empty()) {
3171        if class_segments.contains(&first.trim()) {
3172            // US: already canonical.
3173            // Non-US: already canonical only when // prefix is present.
3174            if !is_non_us || body.starts_with("//") {
3175                return None;
3176            }
3177        }
3178    }
3179
3180    // Emit: classification → other (SCI/SAR/FGI blocks) → dissem.
3181    let mut ordered: Vec<&str> = Vec::new();
3182    ordered.extend(class_segments);
3183    ordered.extend(other_segments);
3184    ordered.extend(dissem_segments);
3185
3186    let joined = ordered.join("//");
3187
3188    // Non-US canonical form: `//{class}//{others}//{dissems}`. The
3189    // leading `//` represents the empty US classification slot (per
3190    // CAPCO-2016 §A.6) and signals the strict parser to use the
3191    // non-US classification code path.
3192    if is_non_us {
3193        Some(format!("{prefix}//{joined}{suffix}"))
3194    } else {
3195        Some(format!("{prefix}{joined}{suffix}"))
3196    }
3197}
3198
/// Which CAPCO category a `//`-separated segment primarily belongs to.
///
/// The bucket is decided from the segment's leading token(s) by
/// `classify_segment`, and consumed by `try_canonical_reorder`, which
/// emits classification segments first, `Other` segments next, and
/// dissem segments last.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SegmentClass {
    /// The segment leads with a known classification level: a
    /// single-token form (`U`, `C`, `S`, `TS`, `CONFIDENTIAL`, NATO
    /// abbreviations, `JOINT`, …) or a multi-word form such as
    /// `TOP SECRET` / `NATO SECRET`.
    Classification,
    /// The segment leads with a known dissemination control
    /// (`NOFORN`, `NF`, `ORCON`, …), including `REL` — so `REL TO`
    /// country lists land here — and the non-IC long-title forms
    /// (`LIMITED DISTRIBUTION`, `LAW ENFORCEMENT SENSITIVE`, …).
    Dissem,
    /// Anything that is neither of the above, e.g. SCI/SAR blocks and
    /// AEA markings such as `RESTRICTED DATA`.
    Other,
}
3212
3213fn classify_segment(seg: &str) -> SegmentClass {
3214    let first_token = seg.split_whitespace().next().unwrap_or("");
3215    // Strip trailing commas.
3216    let first_token = first_token.trim_end_matches(',');
3217    // Single-whitespace-token classifications only. `TOP SECRET` and
3218    // multi-word NATO/JOINT forms are handled by the separate
3219    // starts_with branches below.
3220    const CLASSIFICATIONS: &[&str] = &[
3221        "U",
3222        "R",
3223        "C",
3224        "S",
3225        "TS",
3226        "UNCLASSIFIED",
3227        "RESTRICTED",
3228        "CONFIDENTIAL",
3229        "SECRET",
3230        // NATO classification abbreviations (single-token forms).
3231        "NS",
3232        "NC",
3233        "NU",
3234        "CTS",
3235        "CTSA",
3236        "NSAT",
3237        "NCA",
3238        "CTS-B",
3239        "CTS-BALK",
3240        // JOINT classification indicator.
3241        "JOINT",
3242    ];
3243    // Dissemination-control tokens — IC (§H.8) and non-IC (§H.9).
3244    // SCI controls (HCS, SI, TK, and all their sub-compartment forms)
3245    // are NOT in this list — they belong to their own category under
3246    // CAPCO §A.6 and the canonical order places them between
3247    // classification and dissem. Classifying an HCS segment as Dissem
3248    // would drive `try_canonical_reorder` to move it past the dissem
3249    // block, corrupting the rewrite. SCI segments therefore fall
3250    // through to `SegmentClass::Other`, which the reorder helper
3251    // inserts between classification and dissem — the right spot per
3252    // CAPCO-2016 §A.6.
3253    //
3254    // AEA controls (RD, FRD, TFNI, CNWDI, SIGMA) are also omitted —
3255    // they appear between SCI and dissem per §A.6. A pre-check above
3256    // `CLASSIFICATIONS.contains` prevents "RESTRICTED DATA" from being
3257    // mistaken for the NATO RESTRICTED classification.
3258    //
3259    // "REL" is the first token of "REL TO {country-list}" segments.
3260    //
3261    // Non-IC dissem controls (§H.9): portion marks (DS, XD, ND,
3262    // SBU, SBU-NF, LES, LES-NF, SSI) and banner abbreviations
3263    // (LIMDIS, EXDIS, NODIS) are included so reordering places them
3264    // in the dissem block, not the SCI/AEA block (CAPCO-2016 §A.6).
3265    const DISSEMS: &[&str] = &[
3266        // §H.8 IC dissemination controls
3267        "NOFORN", "NF", "ORCON", "OC", "PROPIN", "PR", "IMCON", "IMC", "RELIDO", "RS", "RSEN",
3268        "DSEN", "FISA", "FOUO", "EYES", "REL",
3269        // §H.9 non-IC dissemination controls — portion marks
3270        "DS", "XD", "ND", "SBU", "SBU-NF", "LES", "LES-NF", "SSI",
3271        // §H.9 non-IC dissemination controls — banner abbreviations
3272        "LIMDIS", "EXDIS", "NODIS",
3273    ];
3274    // Pre-check: "RESTRICTED DATA" (AEA marking, §H.6) must not be
3275    // mistaken for the NATO RESTRICTED classification even though
3276    // "RESTRICTED" appears in CLASSIFICATIONS. The bare token
3277    // "RESTRICTED" IS valid as NATO classification; "RESTRICTED DATA"
3278    // and longer AEA forms are not. CAPCO-2016 §H.6 p113.
3279    if first_token == "RESTRICTED" && seg.split_whitespace().nth(1).is_some() {
3280        return SegmentClass::Other;
3281    }
3282    if CLASSIFICATIONS.contains(&first_token) {
3283        SegmentClass::Classification
3284    // Single-token dissem controls and multi-word non-IC long-title forms.
3285    // Multi-word forms cannot be single-token-matched because their first words
3286    // ("LIMITED", "NO", "EXCLUSIVE", "LAW", "SENSITIVE") are too ambiguous;
3287    // they are checked via starts_with here. CAPCO-2016 §H.8–9.
3288    } else if DISSEMS.contains(&first_token)
3289        || (first_token == "LIMITED" && seg.starts_with("LIMITED DISTRIBUTION"))
3290        || (first_token == "NO" && seg.starts_with("NO DISTRIBUTION"))
3291        || (first_token == "EXCLUSIVE" && seg.starts_with("EXCLUSIVE DISTRIBUTION"))
3292        || (first_token == "LAW" && seg.starts_with("LAW ENFORCEMENT SENSITIVE"))
3293        || (first_token == "SENSITIVE"
3294            && (seg.starts_with("SENSITIVE BUT UNCLASSIFIED")
3295                || seg.starts_with("SENSITIVE SECURITY INFORMATION")))
3296    {
3297        SegmentClass::Dissem
3298    } else if (first_token == "TOP" && seg.starts_with("TOP SECRET"))
3299        || (first_token == "COSMIC" && seg.starts_with("COSMIC TOP SECRET"))
3300        || (first_token == "NATO"
3301            && (seg.starts_with("NATO SECRET")
3302                || seg.starts_with("NATO CONFIDENTIAL")
3303                || seg.starts_with("NATO UNCLASSIFIED")
3304                || seg.starts_with("NATO RESTRICTED")))
3305    {
3306        SegmentClass::Classification
3307    } else if CapcoTokenSet.is_trigraph(first_token) {
3308        // FGI pattern: {registered country trigraph} {classification level}.
3309        // Validated against the authoritative CVEnumISMCATRelTo vocabulary so
3310        // typos like "OTP" (→ TOP) don't get mistaken for FGI country codes.
3311        let second = seg.split_whitespace().nth(1).unwrap_or("");
3312        let second = second.trim_end_matches(',');
3313        if matches!(
3314            second,
3315            "U" | "R"
3316                | "C"
3317                | "S"
3318                | "TS"
3319                | "UNCLASSIFIED"
3320                | "RESTRICTED"
3321                | "CONFIDENTIAL"
3322                | "SECRET"
3323        ) || (second == "TOP"
3324            && seg
3325                .split_whitespace()
3326                .nth(2)
3327                .is_some_and(|t| t.trim_end_matches(',') == "SECRET"))
3328        {
3329            SegmentClass::Classification
3330        } else {
3331            SegmentClass::Other
3332        }
3333    } else {
3334        SegmentClass::Other
3335    }
3336}
3337
3338/// Returns true when `seg` is a non-US classification segment: a NATO
3339/// classification abbreviation, a JOINT classification phrase, or an FGI
3340/// `{trigraph} {level}` pattern.
3341///
3342/// Used by `try_canonical_reorder` to decide whether the reordered output
3343/// needs a leading `//` (the empty US classification slot that signals the
3344/// strict parser to take the non-US code path).
3345fn is_non_us_classification_segment(seg: &str) -> bool {
3346    const NATO_ABBREVS: &[&str] = &[
3347        "NS", "NC", "NU", "CTS", "CTSA", "NSAT", "NCA", "CTS-B", "CTS-BALK",
3348    ];
3349    let mut tokens = seg.split_whitespace();
3350    let first = tokens.next().unwrap_or("");
3351    let first = first.trim_end_matches(',');
3352    if NATO_ABBREVS.contains(&first) {
3353        return true;
3354    }
3355    if first == "JOINT" {
3356        return true;
3357    }
3358    if first == "COSMIC" && seg.starts_with("COSMIC TOP SECRET") {
3359        return true;
3360    }
3361    if first == "NATO"
3362        && (seg.starts_with("NATO SECRET")
3363            || seg.starts_with("NATO CONFIDENTIAL")
3364            || seg.starts_with("NATO UNCLASSIFIED")
3365            || seg.starts_with("NATO RESTRICTED"))
3366    {
3367        return true;
3368    }
3369    // FGI: {registered country trigraph} {classification level}.
3370    // Validated against the authoritative CVEnumISMCATRelTo vocabulary so
3371    // typos like "OTP" (→ TOP) are not mistaken for FGI country codes.
3372    if CapcoTokenSet.is_trigraph(first) {
3373        let second = tokens.next().unwrap_or("");
3374        let second = second.trim_end_matches(',');
3375        if matches!(
3376            second,
3377            "U" | "R"
3378                | "C"
3379                | "S"
3380                | "TS"
3381                | "UNCLASSIFIED"
3382                | "RESTRICTED"
3383                | "CONFIDENTIAL"
3384                | "SECRET"
3385        ) {
3386            return true;
3387        }
3388        if second == "TOP"
3389            && tokens
3390                .next()
3391                .is_some_and(|t| t.trim_end_matches(',') == "SECRET")
3392        {
3393            return true;
3394        }
3395    }
3396    false
3397}
3398
3399/// Prepends the non-US leading `//` when the entire input (no existing `//`)
3400/// looks like a non-US classification segment.
3401///
3402/// This covers bare non-US markings like `NS`, `JOINT S GBR USA`, or
3403/// `CAN S` that arrive with no delimiter at all — `try_canonical_reorder`
3404/// cannot act on them because it requires at least two `//`-separated
3405/// segments. Emitting `//NS`, `//JOINT S GBR USA`, etc. lets the strict
3406/// parser recognize the non-US code path (CAPCO-2016 §A.6, parser block 1).
3407fn try_add_non_us_prefix(text: &str) -> Option<String> {
3408    // Only act when there is no `//` at all — try_canonical_reorder
3409    // handles the has-// but missing-prefix case.
3410    if text.contains("//") {
3411        return None;
3412    }
3413    let (prefix, body, suffix) = if text.starts_with('(') && text.ends_with(')') {
3414        ("(", &text[1..text.len() - 1], ")")
3415    } else {
3416        ("", text, "")
3417    };
3418    if is_non_us_classification_segment(body.trim()) {
3419        Some(format!("{prefix}//{body}{suffix}"))
3420    } else {
3421        None
3422    }
3423}
3424
3425// ---------------------------------------------------------------------------
3426// FR-011 strict-context floor
3427// ---------------------------------------------------------------------------
3428
3429/// True when `marking`'s classification level is ≥ `floor`.
3430///
3431/// FR-011 invariant. `floor` is the `Classification as u8` encoding
3432/// (Unclassified=0 … TopSecret=4) — see [`ParseContext::classification_floor`].
3433///
3434/// A marking with no classification info cannot clear a non-trivial
3435/// floor — return `false` so the candidate is dropped when the floor
3436/// is CONFIDENTIAL or above.
3437fn meets_classification_floor(marking: &CapcoMarking, floor: u8) -> bool {
3438    let Some(level) = marking_classification(marking) else {
3439        return floor == Classification::Unclassified as u8;
3440    };
3441    (level as u8) >= floor
3442}
3443
3444/// Extract the effective classification level from a parsed marking.
3445///
3446/// Delegates to [`marque_ism::MarkingClassification::effective_level`],
3447/// which handles all variants (`Us`, `Fgi`, `Nato`, `Joint`,
3448/// `Conflict`) by mapping each to the canonical [`Classification`]
3449/// ladder. NATO levels map through
3450/// [`NatoClassification::us_equivalent`](marque_ism::NatoClassification::us_equivalent).
3451fn marking_classification(marking: &CapcoMarking) -> Option<Classification> {
3452    marking
3453        .0
3454        .classification
3455        .as_ref()
3456        .map(|c| c.effective_level())
3457}
3458
/// True when `bytes` is a portion-shaped slice whose inner content
/// is exactly one ASCII letter — `(s)`, `(c)`, `(u)`, `(r)`, `(S)`,
/// etc.
///
/// Leading ASCII whitespace is skipped before the shape check; what
/// remains must be exactly `(`, one ASCII letter, `)` with nothing
/// after it. The strict recognizer already accepts a small amount of
/// leading whitespace on portion candidates
/// (`StrictRecognizer::recognize` strips it before parsing) and the
/// prose-glue heuristic must do the same so the caller's
/// `cx.preceded_by_whitespace` flag remains the authoritative signal
/// for "is this glued to a word."
///
/// Used by [`DecoderRecognizer::recognize`] for the prose-glue
/// suppression early-out. A 2-letter inner content like `(TS)` is
/// outside the heuristic's scope — multi-letter classification
/// abbrevs are rare in prose and don't share the plural-suffix
/// confusability that drives this filter.
fn is_single_letter_portion(bytes: &[u8]) -> bool {
    // Number of leading whitespace bytes; equals `bytes.len()` when the
    // slice is all whitespace, which leaves an empty (non-matching) tail.
    let leading_ws = bytes
        .iter()
        .take_while(|b| b.is_ascii_whitespace())
        .count();
    match &bytes[leading_ws..] {
        [b'(', letter, b')'] => letter.is_ascii_alphabetic(),
        _ => false,
    }
}
3493
3494/// Used inside the decoder itself to filter out lenient-parse-
3495/// accepts-anything results (`FROBNITZ//WIBBLE` trip-fires the
3496/// banner scanner and produces a zero-attribute parse); without
3497/// the filter, every `X//Y` prose fragment would materialize a
3498/// fabricated empty marking candidate.
3499fn is_nontrivial_marking(marking: &CapcoMarking) -> bool {
3500    let a = &marking.0;
3501    a.classification.is_some()
3502        || !a.sci_controls.is_empty()
3503        || a.sar_markings.is_some()
3504        || !a.aea_markings.is_empty()
3505        || a.fgi_marker.is_some()
3506        || !a.dissem_controls.is_empty()
3507        || !a.non_ic_dissem.is_empty()
3508        || !a.rel_to.is_empty()
3509        || a.classified_by.is_some()
3510        || a.derived_from.is_some()
3511        || a.declassify_on.is_some()
3512        || a.declass_exemption.is_some()
3513}
3514
3515/// True when the strict-parse result is complete enough that the
3516/// dispatcher should accept it and skip the decoder fallback.
3517///
3518/// The strict parser (`marque_core::Parser`) is lenient about
3519/// content: it categorizes tokens by *position* (the first token
3520/// inside `(...)` is marked as `TokenKind::Classification`
3521/// regardless of whether its text is a valid classification value),
3522/// and falls back to `TokenKind::Unknown` only for truly unplaceable
3523/// tokens. So a shape like `(SERCET//NOFORN)` parses to a marking
3524/// with `classification: None` (SERCET doesn't resolve to any
3525/// `Classification` variant), `dissem_controls: [Nf]` (NOFORN was
3526/// recognized), and a Classification-kind `TokenSpan` carrying the
3527/// literal text "SERCET". That result is *nontrivial* but also
3528/// *incomplete* — exactly the mangled-input case the decoder exists
3529/// to recover.
3530///
3531/// Predicate, kind-aware:
3532///
3533/// - [`MarkingType::Portion`] / [`MarkingType::Banner`]: complete
3534///   iff `classification.is_some()` AND no `TokenKind::Unknown`
3535///   spans survived. Both branches matter — SERCET→None catches
3536///   the classification-slot typo; the `Unknown` check catches
3537///   typos in the tail (e.g., `(S//FRBN)` where the classification
3538///   is fine but FRBN is mangled and lands as Unknown).
3539/// - [`MarkingType::Cab`]: complete iff any CAB field is present
3540///   (`classified_by` / `derived_from` / `declassify_on`).
3541///   CAB-kind input doesn't require a classification axis — an
3542///   isolated authority block stands on its own.
3543/// - Anything else: fall back to the generic nontrivial check.
3544fn strict_parse_is_complete(marking: &CapcoMarking, kind: MarkingType) -> bool {
3545    use marque_ism::TokenKind;
3546    let attrs = &marking.0;
3547    match kind {
3548        MarkingType::Portion | MarkingType::Banner => {
3549            attrs.classification.is_some()
3550                && !attrs
3551                    .token_spans
3552                    .iter()
3553                    .any(|s| matches!(s.kind, TokenKind::Unknown))
3554        }
3555        MarkingType::Cab => {
3556            attrs.classified_by.is_some()
3557                || attrs.derived_from.is_some()
3558                || attrs.declassify_on.is_some()
3559                || attrs.declass_exemption.is_some()
3560        }
3561        _ => is_nontrivial_marking(marking),
3562    }
3563}
3564
3565// ---------------------------------------------------------------------------
3566// Scoring
3567// ---------------------------------------------------------------------------
3568
/// Floor log-prior for canonical tokens that don't appear in the
/// baked `TOKEN_BASE_RATES` table.
///
/// Baked priors are `log((hits + 1) / (total + |V|))` with
/// Laplace smoothing over the non-IC Enron corpus (see
/// `tools/corpus-analysis/analyze.py::derive_priors`). A token the
/// corpus never observed still receives a non-zero smoothed prior in
/// that build; this constant exists for the different, rarer case
/// where the canonical-tokens iterator produces a string that was
/// not in the build's vocabulary at all (e.g., a CVE token added
/// after the last priors regeneration). Without this floor, such
/// tokens would silently contribute `0.0` to the sum — and since
/// every real log-prior is negative, a missing token would score
/// HIGHER than a known one, inverting the ranking.
///
/// Magnitude (`-12.0` nats ≈ log(6e-6)) is chosen to be strictly
/// lower than every log-prior the generator would emit for a
/// non-empty corpus: the Enron-derived values bottom out around
/// `-11.7` for the most infrequent observed tokens (see
/// `crates/capco/corpus/priors.json`).
///
/// [`HARD_SPLITTER_ABSORPTION_PENALTY`] and
/// [`CUSTOM_SCI_MARKING_PENALTY`] are both defined as this constant,
/// so changing the value here moves those structural penalties too.
const MISSING_TOKEN_LOG_PRIOR: f32 = -12.0;
3590
/// Posterior penalty applied when a candidate's strict parse buries a
/// reserved dissem-control token (a hard splitter — see
/// [`is_hard_splitter`]) inside a SAR or SCI sub-component slot.
///
/// **Why this exists.** Hard-splitter tokens (NOFORN, ORCON, EXDIS,
/// FOUO, …) have hard reserved meanings as dissem controls per
/// CAPCO-2016 §H.8/§H.9; they have no in-segment role inside SCI or
/// SAR sub-components. A strict parse that places such a token under
/// [`marque_ism::SarMarking`] or [`marque_ism::SciMarking`] is
/// essentially always an artifact of a missing `//` in the input —
/// the alternative parse with the token emitted as a dissem control
/// is the correct interpretation. (REL TO is intentionally excluded
/// from the penalty surface here: its payload is a list of country
/// trigraphs whose grammar accepts only 3-letter alpha codes drawn
/// from the CVE-derived trigraph table, so a 4+-char hard splitter
/// cannot land in a REL TO slot in the first place. The Copilot
/// review on PR #178 flagged a wider doc claim that suggested
/// otherwise — the doc is now scoped to the slots the penalty
/// actually defends.)
///
/// **Why scoring needs help.** The bag-of-tokens scorer
/// ([`score_candidate`]) sums log-priors for the marking's canonical
/// tokens, and `canonical_tokens_for` deliberately excludes SAR
/// program/compartment/sub-compartment text (open-set agency-assigned
/// codewords). So an absorbing parse contributes only the
/// classification's prior; the equivalent delim-inserted parse
/// contributes classification + the dissem token's prior, which is a
/// MORE NEGATIVE log-posterior. Without a corrective penalty the
/// absorbing parse always wins. SCI absorption usually self-resolves
/// because [`marque_core::Parser::parse`]'s SCI subgrammar produces
/// [`marque_ism::TokenKind::Unknown`] for non-alphanumeric/wrong-shape
/// compartment tokens (which step 3a then drops), but SAR's grammar
/// accepts any `[A-Z0-9]+` identifier and absorbs cleanly — leaving
/// SAR as the observed failure mode on the SC-004 corpus (the
/// `SAR-BP-J12 …` and `SPECIAL ACCESS REQUIRED-BUTTER POPCORN …`
/// fixtures pre-PR-5).
///
/// **Magnitude.** Empirically the absorbing-vs-delim-inserted spread
/// on those two fixtures is ~9 nats; the [`MISSING_TOKEN_LOG_PRIOR`]
/// floor (`-12.0`) gives a comfortable margin and is robust to small
/// future shifts in the priors table. Defining the penalty as
/// `MISSING_TOKEN_LOG_PRIOR` (rather than re-stating the literal)
/// keeps the two below-floor signals mechanically at parity for any
/// candidate that triggers both — a future ratchet of one constant
/// pulls the other along.
///
/// **Safety.** Hard-splitter tokens are all 4+ chars and have shapes
/// distinct from real SAR identifiers (`BP`, `CD`, `XR` are 2-char;
/// `BUTTER POPCORN`, `J12`, `K15`, `XRA` are alphanumeric short
/// codes that don't collide with the hard-splitter list). So this
/// penalty cannot fire on a legitimate SAR/SCI parse.
const HARD_SPLITTER_ABSORPTION_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
3641
/// Per-entry structural penalty for SCI markings whose control system
/// landed as [`SciControlSystem::Custom`]. Issue #133 PR 6.
///
/// **Why this penalty exists.** `marque_core::Parser`'s structural SCI
/// subparser (CAPCO-2016 §A.6 grammar) accepts any alphanumeric
/// identifier as a "custom" control system / compartment when the
/// segment text contains `-` or `/`. That branch was added so legal
/// compound SCI shapes (`SI-G ABCD DEFG-MMM AACD`) parse correctly,
/// but it has a side effect: a typo'd or stray segment like
/// `USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB` parses cleanly into
/// three `Custom`-system SCI markings (USAR/CD/XR with attached
/// compartments). The bag-of-tokens scorer can't tell that this is
/// the wrong interpretation — `Custom` SCI control systems don't
/// appear in `canonical_tokens_for`, so they don't shift the prior
/// either way, and the candidate ties with structurally-richer
/// alternatives like the SAR-repaired candidate that
/// `try_sar_indicator_repair` emits.
///
/// **What the penalty does.** Adds [`MISSING_TOKEN_LOG_PRIOR`] (the
/// same below-observed-floor magnitude as
/// [`HARD_SPLITTER_ABSORPTION_PENALTY`]) per `Custom`-system SCI
/// marking. The penalty is per-entry so candidates that absorbed
/// multiple stray segments (like the 3-segment USAR/CD/XR case) get
/// progressively worse posteriors, restoring the SAR-repair
/// candidate's lead by a margin that clears
/// [`UNAMBIGUOUS_LOG_MARGIN`].
///
/// **Magnitude.** Same `-12.0` as the hard-splitter penalty: both are
/// "this parse pattern is highly unlikely in well-formed CAPCO
/// markings" structural signals, and keeping them at parity by
/// definition (rather than literal duplication) lets a future
/// ratchet of one move both together. A single legitimate custom
/// control (the §A.6 p16 `99` example) gets one `-12.0` hit but
/// remains the sole candidate when no alternative interpretation
/// exists, so the dispatcher still emits `Unambiguous`.
///
/// **Safety / discriminator choice.** The discriminator is
/// `matches!(sm.system, SciControlSystem::Custom(_))`, NOT
/// `sm.canonical_enum.is_none()`. The two are NOT equivalent:
/// `canonical_enum` is also `None` for legitimate `Published`-system
/// SCI markings whenever the `{system}-{first_compartment}` pair
/// doesn't map to a CVE atom (per the `canonical_enum` doc in
/// `crates/scheme/src/scheme.rs` — populated only when "the bare
/// control or `{ctrl}-{first_comp}` matches a CVE value AND no
/// sub-compartments are present"). Using `canonical_enum` as the
/// discriminator would penalize legitimate `SI-G ABCD DEFG-MMM AACD`-
/// style markings (system=`Published(Si)`, sub-compartments present
/// → canonical_enum=None), broadly skewing scoring against rich
/// SCI shapes. Discriminating on `system` directly catches the
/// USAR/CD/XR custom-only case while leaving every published SCI
/// marking — bare or compound — untouched. A candidate with mixed
/// SCI (e.g., `SI-G ABCD//99`) gets a single penalty for the `99`
/// `Custom` entry only, which is a reasonable cost for a
/// structurally suspicious mixed shape. The penalty does NOT fire
/// on candidates with empty `sci_markings` — so the SAR-repaired
/// candidate (which projects no SCI) is unaffected.
const CUSTOM_SCI_MARKING_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
3699
3700// (`LENIENT_REL_PREFIX_PENALTY` removed — under the current PR-9
3701// architecture, `try_rel_to_structural_repair` runs as preprocessing
3702// on the normalized text before any candidate is emitted, so
3703// `RELT O ` / `REL OT ` patterns at a token boundary are rewritten
3704// to canonical `REL TO ` before scoring sees them. The defense-in-
3705// depth scorer penalty that PR 9 originally introduced was meant to
3706// break a tie between competing raw vs. repaired *candidates* —
3707// that tie no longer exists since the repair is no longer a
3708// separate candidate. The accuracy harness
3709// (`resolution_rate_at_0_85`, `resolution_rate_does_not_regress`,
3710// per-class floors) is the load-bearing regression gate for this
3711// recovery path. Issue #186 (REL TO trigraph corpus-weighted
3712// recovery) is the followup that handles the remaining lenient-
3713// header cases via priors rather than scorer penalties.)
3714
3715/// Bag-of-tokens scorer (foundational-plan §5.2).
3716///
3717/// Returns `(prior, posterior)` where:
3718///
3719/// - `prior` = Σ [`marque_capco::priors::token_log_prior`] over the
3720///   marking's canonical tokens **plus** Σ
3721///   [`marque_capco::priors::country_code_log_prior`] over the
3722///   marking's `rel_to` country codes (issue #233). This is the prior
3723///   alone — nothing else — and is what
3724///   [`Candidate::prior_log_odds`] is documented to carry (see
3725///   `crates/scheme/src/ambiguity.rs`). Tokens or country codes
3726///   missing from the baked tables contribute
3727///   [`MISSING_TOKEN_LOG_PRIOR`] (a below-observed-floor penalty)
3728///   rather than `0.0`. The country-code contribution is what lets
3729///   the decoder break fuzzy-correction ties between common (USA,
3730///   GBR, AUS) and rare-lookalike (USB-not-a-country, UZB, ASM, AUT)
3731///   trigraphs in REL TO blocks.
3732/// - `posterior` = `prior + Σ attempt.features[i].delta + structural
3733///   penalties`. This is the quantity the decoder sorts and thresholds
3734///   on. The only structural penalty today is
3735///   [`HARD_SPLITTER_ABSORPTION_PENALTY`], applied when the strict
3736///   parse buries a reserved dissem-control token in a SAR/SCI slot.
3737///
3738/// Splitting the two prevents the caller from writing the full
3739/// posterior into `Candidate::prior_log_odds` — that would double-
3740/// count the feature deltas once any resolver re-adds
3741/// `EvidenceFeature.log_odds`. Structural penalties are deliberately
3742/// folded into the posterior only (not the prior or the per-feature
3743/// log-odds): they are a likelihood statement about parse plausibility,
3744/// not a corpus-frequency claim about token co-occurrence.
3745///
3746/// Precision: computed in `f32` — the baked priors are already `f32`
3747/// and the feature deltas are small constants (single-digit magnitude
3748/// at most), so the accumulator doesn't need `f64` headroom for the
3749/// K=8 candidate set.
3750fn score_candidate(attempt: &CanonicalAttempt, marking: &CapcoMarking) -> (f32, f32) {
3751    // Prior: sum of baked log-priors for the canonical tokens that
3752    // appear in the parsed marking. Tokens missing from the baked
3753    // table receive the floor penalty rather than a neutral 0.0
3754    // contribution — see the MISSING_TOKEN_LOG_PRIOR doc.
3755    let mut prior: f32 = 0.0;
3756    let tokens = canonical_tokens_for(marking);
3757    for token in tokens {
3758        prior += marque_capco::priors::token_log_prior(token).unwrap_or(MISSING_TOKEN_LOG_PRIOR);
3759    }
3760
3761    // Country-code prior contribution (issue #233). REL TO country
3762    // codes are not part of the `canonical_tokens_for` set because
3763    // `CountryCode::as_str()` returns a borrowed `&str` rather than
3764    // `&'static str`, and because the per-token corpus coverage for
3765    // country codes used to be sparse. Issue #233 adds a parallel
3766    // `COUNTRY_CODE_BASE_RATES` table (issue #186 sub-feature 1) so
3767    // the decoder can break fuzzy ties between popular codes (USA,
3768    // GBR, AUS, FVEY, …) and rare lookalikes (UZB, ASM,
3769    // AUT-as-Austria) by log-prior delta rather than edit distance
3770    // alone. Look up each observed REL TO code at score-time —
3771    // shape-agnostic, so the loop handles 2-char (`EU`), 3-char, and
3772    // 4-char tetragraphs uniformly. Duplicate REL TO entries do not
3773    // provide additional evidence, so score each distinct country
3774    // code at most once. Unknown entries fall to
3775    // MISSING_TOKEN_LOG_PRIOR — the same penalty the non-country-code
3776    // path uses for unrecognized tokens, which is the correct
3777    // behavior for a candidate that resolved to a non-CVE country
3778    // string.
3779    let mut seen_rel_to_codes = BTreeSet::new();
3780    for country in marking.0.rel_to.iter() {
3781        if seen_rel_to_codes.insert(country.as_str()) {
3782            prior += marque_capco::priors::country_code_log_prior(country.as_str())
3783                .unwrap_or(MISSING_TOKEN_LOG_PRIOR);
3784        }
3785    }
3786
3787    // Posterior: prior plus feature deltas plus structural penalties.
3788    let feature_sum: f32 = attempt.features.iter().map(|f| f.delta).sum();
3789    let mut posterior = prior + feature_sum;
3790    if absorbs_hard_splitter_in_sar_or_sci(marking) {
3791        posterior += HARD_SPLITTER_ABSORPTION_PENALTY;
3792    }
3793    posterior += custom_sci_marking_penalty(marking);
3794
3795    (prior, posterior)
3796}
3797
3798/// Total per-entry penalty for SCI markings whose strict parse landed
3799/// with [`SciControlSystem::Custom`] as the control system. See
3800/// [`CUSTOM_SCI_MARKING_PENALTY`] for rationale, including why this
3801/// discriminates on `sm.system` rather than on
3802/// `sm.canonical_enum.is_none()`.
3803fn custom_sci_marking_penalty(marking: &CapcoMarking) -> f32 {
3804    let attrs = &marking.0;
3805    let custom_count = attrs
3806        .sci_markings
3807        .iter()
3808        .filter(|sm| matches!(sm.system, SciControlSystem::Custom(_)))
3809        .count();
3810    custom_count as f32 * CUSTOM_SCI_MARKING_PENALTY
3811}
3812
3813/// True when the strict parse of a candidate buries a hard-splitter
3814/// dissem-control token (NOFORN, ORCON, EXDIS, FOUO, …) inside a SAR
3815/// program/compartment/sub-compartment slot or an SCI compartment/
3816/// sub-compartment slot.
3817///
3818/// Used by [`score_candidate`] to apply
3819/// [`HARD_SPLITTER_ABSORPTION_PENALTY`] — the penalty exists because
3820/// SAR's grammar accepts any alphanumeric identifier and quietly
3821/// absorbs trailing dissem-control tokens that should have been
3822/// separated from the SAR block by `//`. See the
3823/// `HARD_SPLITTER_ABSORPTION_PENALTY` doc for the full rationale.
3824///
3825/// Identifiers are checked both as whole strings AND as whitespace-
3826/// separated word sequences. The whitespace split matters for the
3827/// `Full` SAR indicator form (`SPECIAL ACCESS REQUIRED-BUTTER
3828/// POPCORN`): a multi-word program nickname like `"BUTTER POPCORN"`
3829/// may have `NOFORN` absorbed as a trailing word, producing
3830/// `identifier: "BUTTER POPCORN NOFORN"`. Without the per-word
3831/// check, the absorption pattern slips past the whole-string
3832/// `is_hard_splitter` lookup.
3833fn absorbs_hard_splitter_in_sar_or_sci(marking: &CapcoMarking) -> bool {
3834    let attrs = &marking.0;
3835
3836    if let Some(sar) = attrs.sar_markings.as_ref() {
3837        for prog in sar.programs.iter() {
3838            if contains_hard_splitter_word(&prog.identifier) {
3839                return true;
3840            }
3841            for comp in prog.compartments.iter() {
3842                if contains_hard_splitter_word(&comp.identifier) {
3843                    return true;
3844                }
3845                if comp
3846                    .sub_compartments
3847                    .iter()
3848                    .any(|sub| contains_hard_splitter_word(sub))
3849                {
3850                    return true;
3851                }
3852            }
3853        }
3854    }
3855
3856    for sci in attrs.sci_markings.iter() {
3857        for comp in sci.compartments.iter() {
3858            if contains_hard_splitter_word(&comp.identifier) {
3859                return true;
3860            }
3861            if comp
3862                .sub_compartments
3863                .iter()
3864                .any(|sub| contains_hard_splitter_word(sub))
3865            {
3866                return true;
3867            }
3868        }
3869    }
3870
3871    false
3872}
3873
3874/// True when `s` is a hard-splitter token, or contains a hard-
3875/// splitter token as a whitespace-separated word. The per-word check
3876/// covers multi-word `Full` SAR program nicknames (`BUTTER POPCORN`)
3877/// that absorbed a trailing dissem-control word.
3878fn contains_hard_splitter_word(s: &str) -> bool {
3879    if is_hard_splitter(s) {
3880        return true;
3881    }
3882    s.split_whitespace().any(is_hard_splitter)
3883}
3884
3885/// Enumerate the canonical tokens present in `marking` that have a
3886/// `&'static str` representation suitable for
3887/// [`marque_capco::priors::TOKEN_BASE_RATES`] lookup.
3888///
3889/// Scored token families, by `IsmAttributes` field:
3890///
3891/// - `classification` — effective level's banner string
3892///   (`SECRET`, `TOP SECRET`, ...).
3893/// - `sci_controls` — each variant's `as_str()` (`SI`, `TK`, `HCS-P`, ...).
3894/// - `dissem_controls` — IC dissem variants' `as_str()`
3895///   (`NF`, `OC`, `RELIDO`, ...).
3896/// - `non_ic_dissem` — non-IC dissem variants' `banner_str()`
3897///   (`LIMDIS`, `EXDIS`, `NODIS`, `SBU`, `LES`, ...).
3898/// - `aea_markings` — category token `"AEA"` when any AEA marking is
3899///   present. Individual AEA sub-variants (RD / FRD / CNWDI /
3900///   SIGMA / UCNI variants) are not broken out for scoring because
3901///   the baked priors don't carry per-sub-variant base rates and
3902///   adding floor-penalty contributions for each variant would hurt
3903///   AEA-bearing candidates across the board.
3904/// - `fgi_marker` — category token `"FGI"` when an FGI marker is set.
3905///
3906/// Deliberately NOT included here:
3907///
3908/// - `sar_markings` — SAR program identifiers are agency-assigned
3909///   codewords (open set, not in the baked priors).
3910/// - `rel_to` country codes — scored separately in
3911///   [`score_candidate`] via
3912///   [`marque_capco::priors::country_code_log_prior`] (issue #233).
3913///   `CountryCode::as_str()` returns a `&str` tied to `&self`, not
3914///   `&'static str`, so the country-code contribution is summed at
3915///   score-time rather than collected here.
3916/// - CAB fields (`classified_by`, `derived_from`, `declassify_on`) —
3917///   free-form text, not CVE-enumerable.
3918///
3919/// Expansion work is tracked in future PRs alongside any priors
3920/// regeneration that widens coverage (e.g., counting SAR indicator
3921/// base rates from a larger corpus).
3922fn canonical_tokens_for(marking: &CapcoMarking) -> Vec<&'static str> {
3923    let attrs = &marking.0;
3924    let mut tokens: BTreeSet<&'static str> = BTreeSet::new();
3925
3926    if let Some(class) = attrs.classification.as_ref() {
3927        // Use the effective level's banner form as the classification
3928        // token — this is the form the priors corpus keys on for the
3929        // "common classification appears" prior.
3930        tokens.insert(class.effective_level().banner_str());
3931    }
3932
3933    for ctrl in attrs.sci_controls.iter() {
3934        tokens.insert(ctrl.as_str());
3935    }
3936    for dis in attrs.dissem_controls.iter() {
3937        tokens.insert(dis.as_str());
3938    }
3939    for nic in attrs.non_ic_dissem.iter() {
3940        // `NonIcDissem::banner_str` returns `&'static str` with the
3941        // banner form (LIMDIS, EXDIS, NODIS, SBU, LES, SSI,
3942        // SBU NOFORN, LES NOFORN). The compound forms ("SBU NOFORN",
3943        // "LES NOFORN") won't hit a single-token priors entry — they
3944        // fall to MISSING_TOKEN_LOG_PRIOR. That's fine: the
3945        // comparison against peer candidates remains consistent.
3946        tokens.insert(nic.banner_str());
3947    }
3948    if !attrs.aea_markings.is_empty() {
3949        tokens.insert("AEA");
3950    }
3951    if attrs.fgi_marker.is_some() {
3952        tokens.insert("FGI");
3953    }
3954
3955    tokens.into_iter().collect()
3956}
3957
3958// ---------------------------------------------------------------------------
3959// Strict + decoder dispatcher
3960// ---------------------------------------------------------------------------
3961
3962/// Recognizer that runs the strict path first and falls back to the
3963/// decoder when the strict parse yields no meaningful attributes.
3964///
3965/// Default recognizer installed by [`crate::Engine::new`]. Callers
3966/// that need strict-only dispatch (the SC-001 interactive-latency
3967/// benchmark, tests asserting strict behavior) install
3968/// [`StrictRecognizer`] explicitly via
3969/// [`crate::Engine::with_recognizer`].
3970///
3971/// Within this recognizer, dispatch is keyed off
3972/// [`ParseContext::strict_evidence`]:
3973///
3974/// - `strict_evidence = true`: collapse to strict-only behavior. The
3975///   decoder is not called. The engine never sets this; it's reserved
3976///   for callers (e.g., test code) that construct a `ParseContext`
3977///   directly and want to drive only the strict half of the dispatcher.
3978/// - `strict_evidence = false` (the engine default): try strict first.
3979///   Fall back to the decoder when the strict result is either (a)
3980///   zero-candidate `Ambiguous` or (b) `Unambiguous` with an empty /
3981///   trivial [`CapcoMarking`] (no classification, no SCI, no dissem,
3982///   no FGI, etc.). The trivial-Unambiguous case matters because
3983///   `marque_core::Parser` is lenient: it accepts arbitrary
3984///   `BYTES//BYTES` shapes and returns `Ok` with an empty
3985///   `IsmAttributes` when nothing in the input is a recognized CVE
3986///   token. Treating such a result as a successful parse would leave
3987///   the decoder dormant on exactly the mangled inputs it exists to
3988///   recover (`SERCET//NOFORN`, `NOFORN//SECRET`, …). Strict is
3989///   always called with `strict_evidence = true` internally; the
3990///   decoder is always called with `strict_evidence = false`
3991///   internally.
3992///
3993/// Other [`ParseContext`] fields (`zone`, `position`,
3994/// `classification_floor`) pass through unchanged.
3995#[derive(Debug, Default, Clone, Copy)]
3996pub struct StrictOrDecoderRecognizer {
3997    strict: StrictRecognizer,
3998    decoder: DecoderRecognizer,
3999}
4000
4001impl StrictOrDecoderRecognizer {
4002    pub const fn new() -> Self {
4003        Self {
4004            strict: StrictRecognizer::new(),
4005            decoder: DecoderRecognizer::new(),
4006        }
4007    }
4008}
4009
4010impl Recognizer<CapcoScheme> for StrictOrDecoderRecognizer {
4011    fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
4012        let strict_inner_cx = ParseContext {
4013            strict_evidence: true,
4014            ..cx.clone()
4015        };
4016        let strict_result = self.strict.recognize(bytes, &strict_inner_cx);
4017
4018        // When the outer caller asked for strict-only via
4019        // `strict_evidence = true`, collapse to the strict result —
4020        // never call the decoder. The engine never sets this flag (it
4021        // installs `StrictRecognizer` directly via `with_recognizer`
4022        // when a strict-only mode is needed); this branch exists for
4023        // direct callers that construct a `ParseContext` themselves
4024        // (e.g., test code).
4025        if cx.strict_evidence {
4026            return strict_result;
4027        }
4028
4029        // Infer the candidate kind from the byte shape so
4030        // `strict_parse_is_complete` can apply the right rule
4031        // (classification-requiring for portion/banner, CAB-field-
4032        // requiring for CAB). If inference fails the bytes are too
4033        // degenerate for either path — skip and return whatever the
4034        // strict path produced (most likely zero-candidate Ambiguous).
4035        let Some(kind) = infer_marking_type(bytes) else {
4036            return strict_result;
4037        };
4038
4039        // Complete strict parse — take it, decoder not needed.
4040        if matches!(&strict_result, Parsed::Unambiguous(m) if strict_parse_is_complete(m, kind)) {
4041            return strict_result;
4042        }
4043
4044        // Strict already produced non-empty candidates — keep them.
4045        if matches!(&strict_result, Parsed::Ambiguous { candidates } if !candidates.is_empty()) {
4046            return strict_result;
4047        }
4048
4049        // Remaining cases: either an incomplete-but-Unambiguous strict parse
4050        // (partial attrs, `TokenKind::Unknown` spans, missing classification,
4051        // etc.) or a zero-candidate strict Ambiguous. Both warrant a decoder
4052        // attempt. Cases:
4053        //   (a) Truly empty attrs (`FROBNITZ//WIBBLE`) — zero-candidate strict.
4054        //   (b) Partial attrs (`(SERCET//NOFORN)` — NOFORN parsed, SERCET
4055        //       left in a Classification-kind span with
4056        //       `attrs.classification = None`) — incomplete Unambiguous.
4057        let decoder_cx = ParseContext {
4058            strict_evidence: false,
4059            ..cx.clone()
4060        };
4061        let decoder_result = self.decoder.recognize(bytes, &decoder_cx);
4062
4063        // Only adopt the decoder result when it produced an Unambiguous
4064        // marking. If the decoder is also uncertain, preserve the strict
4065        // result so rules can still fire on any partial attrs — avoiding
4066        // deep-scan silently reducing observability/diagnostics on
4067        // mangled input.
4068        match decoder_result {
4069            Parsed::Unambiguous(_) => decoder_result,
4070            _ => strict_result,
4071        }
4072    }
4073}
4074
4075// ---------------------------------------------------------------------------
4076// Tests
4077// ---------------------------------------------------------------------------
4078
4079#[cfg(test)]
4080#[cfg_attr(coverage_nightly, coverage(off))]
4081mod tests {
4082    use super::*;
4083
4084    #[test]
4085    fn decoder_is_send_sync_as_trait_object() {
4086        fn assert_send_sync<T: Send + Sync + ?Sized>() {}
4087        assert_send_sync::<DecoderRecognizer>();
4088        assert_send_sync::<StrictOrDecoderRecognizer>();
4089        assert_send_sync::<std::sync::Arc<dyn Recognizer<CapcoScheme>>>();
4090    }
4091
4092    fn deep_cx() -> ParseContext {
4093        ParseContext {
4094            strict_evidence: false,
4095            zone: None,
4096            position: None,
4097            classification_floor: None,
4098            as_of: None,
4099            preceded_by_whitespace: true,
4100        }
4101    }
4102
4103    // ----- Missing-delimiter insertion (issue #133 PR 3) -----
4104
4105    #[test]
4106    fn try_insert_delimiter_inserts_before_long_form_dissem() {
4107        // Hard-splitter rule: long-form dissem after whitespace.
4108        let cases: &[(&str, &str)] = &[
4109            ("SECRET//NOFORN EXDIS", "SECRET//NOFORN//EXDIS"),
4110            ("SECRET//NOFORN ORCON", "SECRET//NOFORN//ORCON"),
4111            ("SECRET//SI ORCON", "SECRET//SI//ORCON"),
4112        ];
4113        for (input, expected) in cases {
4114            let result = try_insert_delimiter(input);
4115            assert_eq!(
4116                result.as_deref(),
4117                Some(*expected),
4118                "input {input:?} should produce {expected:?}; got {result:?}"
4119            );
4120        }
4121    }
4122
4123    #[test]
4124    fn try_insert_delimiter_classification_boundary() {
4125        // Rule 1: classification → next segment.
4126        let cases: &[(&str, &str)] = &[
4127            (
4128                "SECRET REL TO USA, AUS, GBR",
4129                "SECRET//REL TO USA, AUS, GBR",
4130            ),
4131            ("SECRET NOFORN", "SECRET//NOFORN"),
4132            ("TOP SECRET NOFORN", "TOP SECRET//NOFORN"),
4133        ];
4134        for (input, expected) in cases {
4135            let result = try_insert_delimiter(input);
4136            assert_eq!(
4137                result.as_deref(),
4138                Some(*expected),
4139                "input {input:?} should produce {expected:?}; got {result:?}"
4140            );
4141        }
4142    }
4143
4144    #[test]
4145    fn try_insert_delimiter_does_not_split_top_secret() {
4146        // TOP SECRET is the only multi-word classification — the
4147        // helper must not insert `//` between TOP and SECRET.
4148        // The first rule fires only on the first NON-classification
4149        // token; SECRET after TOP is a classification continuation.
4150        let result = try_insert_delimiter("TOP SECRET//NF");
4151        // No insertion needed at all (input is already canonical).
4152        assert_eq!(result, None);
4153    }
4154
4155    #[test]
4156    fn try_insert_delimiter_does_not_split_sbu_noforn() {
4157        // SBU NOFORN is the non-IC dissem banner long form for
4158        // SbuNf — must remain a single multi-word atom.
4159        let result = try_insert_delimiter("SECRET//SBU NOFORN");
4160        assert_eq!(result, None, "SBU NOFORN must not be split; got {result:?}");
4161    }
4162
4163    #[test]
4164    fn try_insert_delimiter_does_not_split_les_noforn() {
4165        // LES NOFORN is the non-IC dissem banner long form for
4166        // LesNf — must remain a single multi-word atom.
4167        let result = try_insert_delimiter("SECRET//LES NOFORN");
4168        assert_eq!(result, None, "LES NOFORN must not be split; got {result:?}");
4169    }
4170
4171    #[test]
4172    fn try_insert_delimiter_no_op_on_canonical() {
4173        // Already-canonical inputs produce None (no insertion).
4174        for input in &[
4175            "SECRET//NOFORN",
4176            "TOP SECRET//SI//NOFORN",
4177            "(S//NF)",
4178            "UNCLASSIFIED",
4179        ] {
4180            let result = try_insert_delimiter(input);
4181            assert_eq!(
4182                result, None,
4183                "input {input:?} is canonical; should produce None, got {result:?}"
4184            );
4185        }
4186    }
4187
4188    #[test]
4189    fn try_insert_delimiter_capped_at_max_insertions() {
4190        // Pathological input with many splitters — the cap should
4191        // limit insertions. Hard cap is `MAX_DELIMITER_INSERTIONS`
4192        // (4 today); 6 splitters in the input should produce at
4193        // most 4 insertions in the output.
4194        let input = "SECRET NOFORN ORCON PROPIN IMCON RELIDO RSEN";
4195        let result = try_insert_delimiter(input);
4196        assert!(result.is_some());
4197        let inserted = result.unwrap();
4198        let inserted_count = inserted.matches("//").count();
4199        assert!(
4200            inserted_count <= MAX_DELIMITER_INSERTIONS,
4201            "must not exceed MAX_DELIMITER_INSERTIONS={MAX_DELIMITER_INSERTIONS}; \
4202             got {inserted_count} insertions in {inserted:?}"
4203        );
4204    }
4205
4206    #[test]
4207    fn try_insert_delimiter_preserves_existing_double_slash() {
4208        // Existing `//` separators must be preserved verbatim.
4209        let result = try_insert_delimiter("SECRET//NOFORN EXDIS");
4210        let s = result.expect("should insert");
4211        // Two `//` total: one preserved in SECRET//NOFORN, plus one
4212        // inserted for NOFORN//EXDIS.
4213        let count = s.matches("//").count();
4214        assert_eq!(
4215            count, 2,
4216            "expected 2 `//` total (1 preserved + 1 inserted), got {count} in {s:?}"
4217        );
4218    }
4219
4220    #[test]
4221    fn try_insert_delimiter_preserves_non_ascii_characters_verbatim() {
4222        // Regression guard for PR #175 review: the helper used to do
4223        // `result.push(bytes[i] as char)` for non-token, non-`/`,
4224        // non-whitespace characters, which corrupts multi-byte UTF-8
4225        // sequences by emitting each byte as a separate Latin-1
4226        // codepoint (e.g., `∕` → 3 garbage codepoints). The fix
4227        // walks `text[i..].chars()` to take one full character and
4228        // advances `i` by `ch.len_utf8()`, preserving the original
4229        // UTF-8 byte sequence in the output.
4230        //
4231        // The fixture below has a stray `∕` (U+2215, 3 bytes in
4232        // UTF-8) that the upstream delimiter normalizer didn't catch.
4233        // The helper must echo the original bytes verbatim into the
4234        // output (no insertion would happen here — there's no
4235        // splitter token after the `∕`), and the round-trip must
4236        // preserve the `∕` character intact.
4237        let input = "SECRET ∕∕ NOFORN";
4238        let result = try_insert_delimiter(input);
4239        // Whether or not the helper emits a result depends on the
4240        // tokenization — what matters is that NO character in the
4241        // output corrupts the `∕` UTF-8 sequence. Test the result
4242        // (or the input passthrough if None).
4243        let was_some = result.is_some();
4244        let s = result.unwrap_or_else(|| input.to_string());
4245        assert!(
4246            s.is_char_boundary(s.len()),
4247            "output {s:?} must end on a char boundary"
4248        );
4249        // The `∕` character (U+2215) must survive intact in the
4250        // output. If the old `bytes[i] as char` shape was still in
4251        // play, the 3-byte UTF-8 sequence [0xE2, 0x88, 0x95] would
4252        // be emitted as three separate codepoints (U+00E2 U+0088
4253        // U+0095), and the original `∕` would not appear.
4254        assert!(
4255            !was_some || s.contains('∕'),
4256            "output {s:?} must preserve the U+2215 character when the \
4257             helper emitted any output"
4258        );
4259    }
4260
4261    #[test]
4262    fn is_hard_splitter_covers_documented_long_forms() {
4263        // Pin the hard-splitter set against accidental shrinkage —
4264        // every long-form dissem from the doc table must remain
4265        // a hard splitter.
4266        for token in &[
4267            "NOFORN",
4268            "ORCON",
4269            "ORCON-USGOV",
4270            "PROPIN",
4271            "IMCON",
4272            "RELIDO",
4273            "RSEN",
4274            "EYESONLY",
4275            "FOUO",
4276            "FISA",
4277            "DSEN",
4278            "EXDIS",
4279            "NODIS",
4280            "LIMDIS",
4281        ] {
4282            assert!(
4283                is_hard_splitter(token),
4284                "{token:?} must be a hard splitter (issue #133 PR 3)"
4285            );
4286        }
4287    }
4288
4289    #[test]
4290    fn is_hard_splitter_excludes_short_forms() {
4291        // Short-form abbreviations (NF, OC, PR, IMC, RS) are
4292        // intentionally excluded — they could collide with SAR
4293        // compartment / sub-compartment naming.
4294        for token in &["NF", "OC", "PR", "IMC", "RS"] {
4295            assert!(
4296                !is_hard_splitter(token),
4297                "{token:?} is intentionally NOT a hard splitter (collision risk)"
4298            );
4299        }
4300    }
4301
4302    // ----- Position-aware classification heuristic (issue #133 PR 2) -----
4303
4304    #[test]
4305    fn heuristic_2char_ts_cluster() {
4306        // T-cluster + S-cluster → TS. Cover the full 6×5 = 30
4307        // combinations that should fire, plus a couple that shouldn't.
4308        for first in &['T', 'R', 'Y', 'H', 'G', 'F'] {
4309            for second in &['A', 'W', 'E', 'Z', 'S'] {
4310                let token: String = [*first, *second].iter().collect();
4311                assert_eq!(
4312                    try_2char_classification_heuristic(&token),
4313                    Some("TS"),
4314                    "{token:?} should heuristic-fix to TS"
4315                );
4316            }
4317        }
4318        // Lowercase variants normalize via the helper's
4319        // to_ascii_uppercase.
4320        assert_eq!(try_2char_classification_heuristic("ys"), Some("TS"));
4321        assert_eq!(try_2char_classification_heuristic("Ys"), Some("TS"));
4322    }
4323
4324    #[test]
4325    fn heuristic_2char_no_match_outside_clusters() {
4326        // First char outside T-cluster → no match.
4327        for token in &["AS", "WS", "ZS", "BS", "DS", "QS"] {
4328            assert_eq!(
4329                try_2char_classification_heuristic(token),
4330                None,
4331                "{token:?} should not heuristic-fix"
4332            );
4333        }
4334        // Second char outside S-cluster → no match.
4335        for token in &["TR", "RY", "HG", "GH", "FB"] {
4336            assert_eq!(
4337                try_2char_classification_heuristic(token),
4338                None,
4339                "{token:?} should not heuristic-fix"
4340            );
4341        }
4342    }
4343
4344    #[test]
4345    fn heuristic_1char_s_cluster() {
4346        // S-key neighbors → S. Bare S is canonical and excluded by
4347        // the upstream `is_canonical_short_classification` guard, so
4348        // the helper returns Some("S") for S-key neighbors and the
4349        // outer logic suppresses the no-op case.
4350        for token in &["A", "W", "E", "Z"] {
4351            assert_eq!(
4352                try_1char_classification_heuristic(token),
4353                Some("S"),
4354                "{token:?} should heuristic-fix to S"
4355            );
4356        }
4357        // X is between C and S; defaults to S per the design note.
4358        assert_eq!(try_1char_classification_heuristic("X"), Some("S"));
4359    }
4360
4361    #[test]
4362    fn heuristic_1char_c_cluster() {
4363        // C-key neighbors → C.
4364        for token in &["V", "F"] {
4365            assert_eq!(
4366                try_1char_classification_heuristic(token),
4367                Some("C"),
4368                "{token:?} should heuristic-fix to C"
4369            );
4370        }
4371    }
4372
4373    #[test]
4374    fn heuristic_1char_no_match_outside_clusters() {
4375        // Letters not in any heuristic cluster.
4376        for token in &["B", "D", "G", "K", "M", "N", "Q", "T", "Y"] {
4377            assert_eq!(
4378                try_1char_classification_heuristic(token),
4379                None,
4380                "{token:?} should not heuristic-fix"
4381            );
4382        }
4383    }
4384
4385    #[test]
4386    fn heuristic_skips_canonical_classifications() {
4387        // Bare canonical short forms must not produce a heuristic
4388        // fix — the strict parser already accepts them.
4389        for canonical in &["U", "R", "C", "S", "TS"] {
4390            assert!(
4391                is_canonical_short_classification(canonical),
4392                "{canonical:?} should be recognized as canonical"
4393            );
4394        }
4395        // And the wrapper helper short-circuits these.
4396        assert_eq!(try_classification_heuristic_fix("(S//NF)"), None);
4397        assert_eq!(try_classification_heuristic_fix("(TS//NF)"), None);
4398        assert_eq!(try_classification_heuristic_fix("(C//NF)"), None);
4399        assert_eq!(try_classification_heuristic_fix("SECRET//NOFORN"), None);
4400    }
4401
4402    #[test]
4403    fn heuristic_fixes_portion_form() {
4404        assert_eq!(
4405            try_classification_heuristic_fix("(YS//NF)").as_deref(),
4406            Some("(TS//NF)")
4407        );
4408        assert_eq!(
4409            try_classification_heuristic_fix("(W//NF)").as_deref(),
4410            Some("(S//NF)")
4411        );
4412        assert_eq!(
4413            try_classification_heuristic_fix("(F//NF)").as_deref(),
4414            Some("(C//NF)")
4415        );
4416        // Lowercase first token (inside parens).
4417        assert_eq!(
4418            try_classification_heuristic_fix("(ys//NF)").as_deref(),
4419            Some("(TS//NF)")
4420        );
4421    }
4422
4423    #[test]
4424    fn heuristic_fixes_banner_form() {
4425        // Banner shapes don't have parens but otherwise behave the
4426        // same — leading classification token in the first segment.
4427        assert_eq!(
4428            try_classification_heuristic_fix("RS//NOFORN").as_deref(),
4429            Some("TS//NOFORN")
4430        );
4431        assert_eq!(
4432            try_classification_heuristic_fix("X//NOFORN").as_deref(),
4433            Some("S//NOFORN")
4434        );
4435    }
4436
4437    #[test]
4438    fn heuristic_skips_cab_shape() {
4439        // CAB lines don't have a leading classification token. The
4440        // `is_cab_head` short-circuit at the top of the helper should
4441        // catch every CAB-keyword prefix.
4442        assert_eq!(try_classification_heuristic_fix("Classified By: foo"), None);
4443        assert_eq!(try_classification_heuristic_fix("Derived From: bar"), None);
4444        assert_eq!(try_classification_heuristic_fix("Declassify On: baz"), None);
4445    }
4446
4447    #[test]
4448    fn heuristic_skips_long_token() {
4449        // 4+ char tokens fall through the length match arm — the
4450        // vocab fuzzy path handles them. 3-char tokens are mostly
4451        // handled by the vocab path too (now that PR 8 added bare
4452        // `TOP` to `EXTENDED_CORRECTION_VOCAB`, shapes like `TPP`
4453        // and `UOP` correct via dist-1 fuzzy); the 3-char heuristic
4454        // is intentionally narrow (only `OTP` → `TOP`) so unrelated
4455        // 3-char tokens like `YES` return None.
4456        assert_eq!(try_classification_heuristic_fix("(YES//NF)"), None);
4457        assert_eq!(try_classification_heuristic_fix("(SECT//NF)"), None);
4458        assert_eq!(try_classification_heuristic_fix("SECRET//NOFORN"), None);
4459    }
4460
4461    // ----- 3-char classification heuristic (issue #133 PR 8) -----
4462
4463    #[test]
4464    fn heuristic_recovers_otp_to_top_via_3char_rule() {
4465        // OTP → TOP: T↔O transposition. Standard Levenshtein dist 2
4466        // blocked by the vocab fuzzy path's `MIN_USEFUL_CONFIDENCE`
4467        // floor; the targeted 3-char heuristic is the recovery path.
4468        let cases: &[(&str, &str)] = &[
4469            ("OTP SECRET//NOFORN", "TOP SECRET//NOFORN"),
4470            ("(OTP//NF)", "(TOP//NF)"),
4471            ("OTP SECRET//SI//NOFORN", "TOP SECRET//SI//NOFORN"),
4472        ];
4473        for (input, expected) in cases {
4474            let result = try_classification_heuristic_fix(input);
4475            assert_eq!(
4476                result.as_deref(),
4477                Some(*expected),
4478                "input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
4479            );
4480        }
4481    }
4482
4483    #[test]
4484    fn try_3char_classification_heuristic_only_matches_otp() {
4485        // The 3-char heuristic is intentionally narrow (a single
4486        // hardcoded `OTP → TOP` mapping). Any other 3-char input
4487        // returns None and falls through to other recovery paths.
4488        // Pinned because the dense 3-char trigraph vocab (TON, TUR,
4489        // TWN, …) means a wider rule would generate too many false
4490        // positives.
4491        assert_eq!(try_3char_classification_heuristic("OTP"), Some("TOP"));
4492        for not_a_match in &["TON", "TPP", "UOP", "TIP", "TPO", "TOO", "ABC", "YES"] {
4493            assert_eq!(
4494                try_3char_classification_heuristic(not_a_match),
4495                None,
4496                "3-char heuristic must not fire on {not_a_match:?}",
4497            );
4498        }
4499    }
4500
4501    // ----- Extended 2-char heuristic for TP/TO → TOP -----
4502
4503    #[test]
4504    fn heuristic_recovers_tp_and_to_to_top_via_2char_rule() {
4505        // PR 8 extended the 2-char heuristic to map `TP`/`TO` → `TOP`.
4506        // These are corpus-attested classification typos where the
4507        // middle `O` (`TP`) or trailing `P` (`TO`) was elided. They
4508        // must not collide with the TS rule because neither `P` nor
4509        // `O` is in the S-cluster.
4510        let cases: &[(&str, &str)] = &[
4511            ("TP SECRET//NOFORN", "TOP SECRET//NOFORN"),
4512            ("TO SECRET//NOFORN", "TOP SECRET//NOFORN"),
4513            ("(TP//NF)", "(TOP//NF)"),
4514            ("(TO//NF)", "(TOP//NF)"),
4515        ];
4516        for (input, expected) in cases {
4517            let result = try_classification_heuristic_fix(input);
4518            assert_eq!(
4519                result.as_deref(),
4520                Some(*expected),
4521                "input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
4522            );
4523        }
4524    }
4525
4526    #[test]
4527    fn try_2char_classification_heuristic_ts_rule_takes_precedence() {
4528        // The TS rule (T-cluster + S-cluster pair) is checked first;
4529        // the TP/TO → TOP rule is a fallback. None of the TP/TO
4530        // characters are in the S-cluster (P, O), so there's no
4531        // ambiguity in practice — but pinning the precedence here
4532        // keeps a future widening of the TP/TO rule from silently
4533        // overriding the TS rule.
4534        // Pure T-cluster + S-cluster → TS.
4535        assert_eq!(try_2char_classification_heuristic("TS"), Some("TS"));
4536        assert_eq!(try_2char_classification_heuristic("RS"), Some("TS"));
4537        assert_eq!(try_2char_classification_heuristic("YS"), Some("TS"));
4538        // T + non-S-cluster → TOP (only for P/O).
4539        assert_eq!(try_2char_classification_heuristic("TP"), Some("TOP"));
4540        assert_eq!(try_2char_classification_heuristic("TO"), Some("TOP"));
4541        // T + other non-S-cluster → still None (don't broaden).
4542        assert_eq!(try_2char_classification_heuristic("TI"), None);
4543        assert_eq!(try_2char_classification_heuristic("TX"), None);
4544    }
4545
4546    #[test]
4547    fn is_canonical_short_classification_recognizes_top() {
4548        // PR 8 added bare `TOP` to the canonical-short set so the
4549        // classification heuristic doesn't fire on already-canonical
4550        // `TOP SECRET//...` input (whose first whitespace-token is
4551        // `TOP`). Pre-PR-8 this was a no-op because the length-3
4552        // heuristic always returned None; PR 8's OTP rule made it
4553        // load-bearing.
4554        assert!(is_canonical_short_classification("TOP"));
4555        // Existing canonical short forms still recognized.
4556        for s in &["U", "R", "C", "S", "TS"] {
4557            assert!(
4558                is_canonical_short_classification(s),
4559                "{s:?} must be recognized as canonical short classification",
4560            );
4561        }
4562        // Non-canonical or wrong-case forms still return false.
4563        assert!(!is_canonical_short_classification("TPP"));
4564        assert!(!is_canonical_short_classification("top")); // case-sensitive
4565        assert!(!is_canonical_short_classification("TOPS"));
4566    }
4567
4568    #[test]
4569    fn heuristic_skips_unknown_first_char() {
4570        // First char isn't in any heuristic cluster → no fix.
4571        assert_eq!(try_classification_heuristic_fix("(B//NF)"), None);
4572        assert_eq!(try_classification_heuristic_fix("(QS//NF)"), None);
4573    }
4574
4575    #[test]
4576    fn heuristic_skips_lone_inputs() {
4577        // Issue #133 PR 4 / #176 lone-input safety guard. The
4578        // heuristic must NOT fire on inputs without marking-shape
4579        // signals beyond the leading token — auto-applying lone-case
4580        // fixes would surface as false positives on parenthetical
4581        // refs like `(A)`, `(W)`, `(F)` that are common in business
4582        // prose. The corpus measurement at PR 4 found `A` alone has
4583        // 214,539 unrestricted body-text occurrences in the Enron
4584        // corpus vs 168 in marking-context — the lone-case FP rate
4585        // is ~3 orders of magnitude higher than the in-context rate.
4586        //
4587        // Form-field input (caller asserts the input IS a marking
4588        // attempt) should still fire; tracking via #176 — when the
4589        // input-source signal lands, this guard becomes conditional.
4590        for lone in &[
4591            "(YS)",  // 2-char trigger, parens, nothing else
4592            "(W)",   // 1-char trigger
4593            "(F)",   // 1-char trigger
4594            "(X)",   // 1-char trigger
4595            "YS",    // banner-shape lone
4596            "W",     // bare lone token
4597            "(YS )", // trailing whitespace only
4598        ] {
4599            assert_eq!(
4600                try_classification_heuristic_fix(lone),
4601                None,
4602                "lone input {lone:?} must not fire heuristic (#133 PR 4 / #176 lone-input guard)"
4603            );
4604        }
4605    }
4606
4607    #[test]
4608    fn heuristic_fires_when_marking_signal_present() {
4609        // Counterpart to `heuristic_skips_lone_inputs`. The guard is
4610        // about LONE inputs only; inputs with ANY marking content
4611        // beyond the leading token (a `//` separator OR another
4612        // whitespace-separated token in the first segment) still
4613        // fire normally.
4614        let cases: &[(&str, &str)] = &[
4615            ("(YS//NF)", "(TS//NF)"), // `//` separator after token
4616            ("(YS NF)", "(TS NF)"),   // whitespace + another token
4617            ("YS//NOFORN", "TS//NOFORN"),
4618            ("W//NF", "S//NF"),
4619        ];
4620        for (input, expected) in cases {
4621            let result = try_classification_heuristic_fix(input);
4622            assert_eq!(
4623                result.as_deref(),
4624                Some(*expected),
4625                "input {input:?} should heuristic-fix to {expected:?} \
4626                 (marking signal present); got {result:?}"
4627            );
4628        }
4629    }
4630
4631    #[test]
4632    fn decoder_defers_to_strict_when_strict_evidence_is_set() {
4633        let rx = DecoderRecognizer::new();
4634        let cx = ParseContext::default(); // strict_evidence = true
4635        match rx.recognize(b"(S//NF)", &cx) {
4636            Parsed::Ambiguous { candidates } => assert!(candidates.is_empty()),
4637            other => panic!("expected zero-candidate Ambiguous, got {other:?}"),
4638        }
4639    }
4640
4641    #[test]
4642    fn decoder_zero_candidate_on_no_template_fit() {
4643        let rx = DecoderRecognizer::new();
4644        // Neither token is in the vocabulary and no fuzzy match.
4645        match rx.recognize(b"FROBNITZ//WIBBLE", &deep_cx()) {
4646            Parsed::Ambiguous { candidates } => assert!(
4647                candidates.is_empty(),
4648                "unrecognized input must be zero-candidate, got {} candidate(s)",
4649                candidates.len()
4650            ),
4651            Parsed::Unambiguous(m) => panic!("unexpected strict match: {m:?}"),
4652        }
4653    }
4654
4655    #[test]
4656    fn score_candidate_splits_prior_and_posterior() {
4657        // Synthesize a fake attempt with known non-zero feature deltas
4658        // and verify the (prior, posterior) return tuple: posterior
4659        // must be prior + Σ feature.delta, and prior must NOT include
4660        // any of the feature deltas.
4661        let token_set = CapcoTokenSet;
4662        let parser = Parser::new(&token_set);
4663        let candidate = MarkingCandidate {
4664            span: Span::new(0, 14),
4665            kind: MarkingType::Banner,
4666        };
4667        let parsed = parser
4668            .parse(&candidate, b"SECRET//NOFORN")
4669            .expect("SECRET//NOFORN must parse");
4670        let marking = CapcoMarking::new(parsed.attrs);
4671
4672        let features = vec![
4673            FeatureEntry {
4674                id: FeatureId::EditDistance1,
4675                delta: -0.5,
4676            },
4677            FeatureId::TokenReorder.into(),
4678        ];
4679        let attempt = CanonicalAttempt {
4680            bytes: b"SECRET//NOFORN".to_vec(),
4681            features: features.clone(),
4682            fix_source: marque_rules::FixSource::DecoderPosterior,
4683        };
4684        let (prior, posterior) = score_candidate(&attempt, &marking);
4685
4686        let feature_sum: f32 = features.iter().map(|f| f.delta).sum();
4687        let reconstructed = prior + feature_sum;
4688        assert!(
4689            (reconstructed - posterior).abs() < 1e-6,
4690            "posterior must equal prior + Σ feature deltas; \
4691             prior={prior}, feature_sum={feature_sum}, posterior={posterior}"
4692        );
4693        // And the prior alone must differ from the posterior when
4694        // the features carry non-trivial deltas.
4695        assert!(
4696            (prior - posterior).abs() > f32::EPSILON,
4697            "prior_log_odds must exclude feature deltas; \
4698             prior={prior}, posterior={posterior}"
4699        );
4700    }
4701
4702    // Convenience conversion for the test above.
4703    impl From<FeatureId> for FeatureEntry {
4704        fn from(id: FeatureId) -> Self {
4705            Self { id, delta: -0.4 }
4706        }
4707    }
4708
4709    #[test]
4710    fn score_candidate_includes_country_code_prior_for_rel_to() {
4711        // Issue #233: `score_candidate` sums `country_code_log_prior` over
4712        // the `rel_to` slice of the parsed marking. A marking with TWO REL TO
4713        // entries must produce a strictly lower (more negative) prior than the
4714        // same marking with ONE entry, because each country code contributes a
4715        // negative log-prior term and GBR is a known high-frequency trigraph.
4716        let token_set = CapcoTokenSet;
4717        let parser = Parser::new(&token_set);
4718
4719        let one_candidate = MarkingCandidate {
4720            span: Span::new(0, 18),
4721            kind: MarkingType::Banner,
4722        };
4723        let one_parsed = parser
4724            .parse(&one_candidate, b"SECRET//REL TO USA")
4725            .expect("SECRET//REL TO USA must parse");
4726        let one_marking = CapcoMarking::new(one_parsed.attrs);
4727
4728        let two_candidate = MarkingCandidate {
4729            span: Span::new(0, 23),
4730            kind: MarkingType::Banner,
4731        };
4732        let two_parsed = parser
4733            .parse(&two_candidate, b"SECRET//REL TO USA, GBR")
4734            .expect("SECRET//REL TO USA, GBR must parse");
4735        let two_marking = CapcoMarking::new(two_parsed.attrs);
4736
4737        let no_features: Vec<FeatureEntry> = vec![];
4738        let attempt_one = CanonicalAttempt {
4739            bytes: b"SECRET//REL TO USA".to_vec(),
4740            features: no_features.clone(),
4741            fix_source: marque_rules::FixSource::DecoderPosterior,
4742        };
4743        let attempt_two = CanonicalAttempt {
4744            bytes: b"SECRET//REL TO USA, GBR".to_vec(),
4745            features: no_features.clone(),
4746            fix_source: marque_rules::FixSource::DecoderPosterior,
4747        };
4748
4749        let (prior_one, _) = score_candidate(&attempt_one, &one_marking);
4750        let (prior_two, _) = score_candidate(&attempt_two, &two_marking);
4751
4752        // GBR has a known negative log-prior, so adding it to the REL TO
4753        // list must make the total prior strictly more negative.
4754        assert!(
4755            prior_two < prior_one,
4756            "adding GBR to REL TO must lower (more negative) the prior via \
4757             country_code_log_prior; prior_one={prior_one}, prior_two={prior_two}"
4758        );
4759    }
4760
4761    #[test]
4762    fn score_candidate_deduplicates_rel_to_entries() {
4763        // Issue #233 dedup guard: a duplicate REL TO entry (e.g. "USA, USA")
4764        // must score identically to the deduplicated form ("USA") because
4765        // `seen_rel_to_codes` prevents double-counting.
4766        let token_set = CapcoTokenSet;
4767        let parser = Parser::new(&token_set);
4768
4769        let dup_candidate = MarkingCandidate {
4770            span: Span::new(0, 23),
4771            kind: MarkingType::Banner,
4772        };
4773        // Parser may or may not produce two rel_to entries for "USA, USA" —
4774        // the dedup guard must be robust either way: the prior must equal
4775        // that of a single "USA" entry.
4776        let dup_parsed = parser
4777            .parse(&dup_candidate, b"SECRET//REL TO USA, USA")
4778            .expect("SECRET//REL TO USA, USA must parse leniently");
4779        let dup_marking = CapcoMarking::new(dup_parsed.attrs);
4780
4781        let once_candidate = MarkingCandidate {
4782            span: Span::new(0, 18),
4783            kind: MarkingType::Banner,
4784        };
4785        let once_parsed = parser
4786            .parse(&once_candidate, b"SECRET//REL TO USA")
4787            .expect("SECRET//REL TO USA must parse");
4788        let once_marking = CapcoMarking::new(once_parsed.attrs);
4789
4790        let no_features: Vec<FeatureEntry> = vec![];
4791        let attempt_dup = CanonicalAttempt {
4792            bytes: b"SECRET//REL TO USA, USA".to_vec(),
4793            features: no_features.clone(),
4794            fix_source: marque_rules::FixSource::DecoderPosterior,
4795        };
4796        let attempt_once = CanonicalAttempt {
4797            bytes: b"SECRET//REL TO USA".to_vec(),
4798            features: no_features.clone(),
4799            fix_source: marque_rules::FixSource::DecoderPosterior,
4800        };
4801
4802        let (prior_dup, _) = score_candidate(&attempt_dup, &dup_marking);
4803        let (prior_once, _) = score_candidate(&attempt_once, &once_marking);
4804
4805        // Deduplication ensures the duplicate USA is only scored once, so
4806        // both priors must be equal (same base tokens + same single USA prior).
4807        assert!(
4808            (prior_dup - prior_once).abs() < 1e-5,
4809            "duplicate REL TO entry must not double-count the country-code prior; \
4810             prior_dup={prior_dup}, prior_once={prior_once}"
4811        );
4812    }
4813
4814    #[test]
4815    fn feature_entry_to_evidence_uses_canonical_label_registry() {
4816        // Regression guard for PR #142 H2: the projection from
4817        // `FeatureEntry` onto `EvidenceFeature::label` MUST route
4818        // through `FeatureId::as_str()` — the single source of truth
4819        // declared in `crates/rules/src/confidence.rs:208`. A divergent
4820        // local registry (the pre-fix shape, snake_case labels in a
4821        // duplicate match arm) produces wire-format drift the audit
4822        // emitter cannot detect, because today's dispatcher discards
4823        // `Parsed::Ambiguous` results and the bug stays latent.
4824        //
4825        // This test exhaustively covers every `FeatureId` variant. A
4826        // new variant added without an `as_str()` arm fails compilation
4827        // there (the match is exhaustive); a new variant whose label
4828        // diverges from `as_str()` here would have to be deliberately
4829        // wrong, since this test reads `id.as_str()` directly. The
4830        // load-bearing assertion is that `feature_entry_to_evidence`
4831        // does the same thing.
4832        for id in [
4833            FeatureId::EditDistance1,
4834            FeatureId::EditDistance2,
4835            FeatureId::TokenReorder,
4836            FeatureId::SupersededToken,
4837            FeatureId::BaseRateCommonMarking,
4838            FeatureId::StrictContextClassification,
4839            FeatureId::CorpusOverrideInEffect,
4840        ] {
4841            let entry = FeatureEntry { id, delta: -0.5 };
4842            let evidence = feature_entry_to_evidence(&entry);
4843            assert_eq!(
4844                evidence.label,
4845                id.as_str(),
4846                "decoder evidence label diverged from FeatureId::as_str() \
4847                 for {id:?}: got {label:?}, expected {expected:?}",
4848                label = evidence.label,
4849                expected = id.as_str(),
4850            );
4851            assert_eq!(evidence.log_odds, -0.5);
4852        }
4853    }
4854
4855    #[test]
4856    fn runner_up_ratio_saturates_on_extreme_log_margin() {
4857        // Regression guard for PR #127 review comment on decoder.rs:305:
4858        // when `log_margin` is large enough that `f32::exp()` overflows
4859        // (≈ ≥ 88.7 nats on f32), the previous code emitted `+∞` into
4860        // `Confidence::runner_up_ratio` and `Confidence::validate`
4861        // rejected the resulting record at the audit boundary,
4862        // panicking inside `FixProposal::new`. The fix saturates at
4863        // `f32::MAX`. We exercise both branches here with bare
4864        // `f32::exp` since the saturation logic is the same closed
4865        // expression used in `recognize`.
4866        for &log_margin in &[88.0_f32, 100.0_f32, 200.0_f32, 1000.0_f32] {
4867            let ratio = log_margin.exp();
4868            let clamped = if ratio.is_finite() { ratio } else { f32::MAX };
4869            assert!(
4870                clamped.is_finite(),
4871                "log_margin = {log_margin}: clamped ratio must be finite, got {clamped}"
4872            );
4873            assert!(
4874                clamped > 0.0,
4875                "log_margin = {log_margin}: clamped ratio must be > 0, got {clamped}"
4876            );
4877        }
4878        // And a sanity check on the in-band path: at the
4879        // UNAMBIGUOUS_LOG_MARGIN threshold, `exp()` returns a finite
4880        // value and we don't clamp.
4881        let at_threshold = UNAMBIGUOUS_LOG_MARGIN.exp();
4882        assert!(at_threshold.is_finite() && at_threshold > 1.0);
4883    }
4884
4885    #[test]
4886    fn strict_parse_is_complete_rejects_unknown_classification() {
4887        // This is the regression-guard for PR #114 review comment
4888        // on decoder.rs:946 — strict parse of `(SERCET//NOFORN)`
4889        // recognizes NOFORN but leaves `classification: None` because
4890        // SERCET doesn't resolve to any `Classification` variant.
4891        // Without the `strict_parse_is_complete` check, the
4892        // dispatcher would accept this as a complete strict result
4893        // and never fall through to the decoder.
4894        let token_set = CapcoTokenSet;
4895        let parser = Parser::new(&token_set);
4896        let candidate = MarkingCandidate {
4897            span: Span::new(0, 16),
4898            kind: MarkingType::Portion,
4899        };
4900        let parsed = parser
4901            .parse(&candidate, b"(SERCET//NOFORN)")
4902            .expect("strict parser should accept (SERCET//NOFORN) leniently");
4903        let marking = CapcoMarking::new(parsed.attrs);
4904        assert!(
4905            is_nontrivial_marking(&marking),
4906            "NOFORN survives as a dissem control → marking is nontrivial"
4907        );
4908        assert!(
4909            !strict_parse_is_complete(&marking, MarkingType::Portion),
4910            "SERCET left `classification: None` → strict parse is incomplete; \
4911             dispatcher must fall back to decoder. attrs = {:?}",
4912            marking.0,
4913        );
4914    }
4915
4916    #[test]
4917    fn strict_parse_is_complete_accepts_clean_marking() {
4918        let token_set = CapcoTokenSet;
4919        let parser = Parser::new(&token_set);
4920        let candidate = MarkingCandidate {
4921            span: Span::new(0, 7),
4922            kind: MarkingType::Portion,
4923        };
4924        let parsed = parser
4925            .parse(&candidate, b"(S//NF)")
4926            .expect("canonical portion must strict-parse");
4927        let marking = CapcoMarking::new(parsed.attrs);
4928        assert!(
4929            strict_parse_is_complete(&marking, MarkingType::Portion),
4930            "canonical (S//NF) must be accepted as complete; attrs = {:?}",
4931            marking.0,
4932        );
4933    }
4934
4935    #[test]
4936    fn strict_parse_is_complete_rejects_trailing_unknown_token() {
4937        // `(S//FRBN)` — classification parses (`S` → Secret) but the
4938        // tail token `FRBN` lands in an `Unknown` span. The
4939        // dispatcher must fall back so the decoder can resolve
4940        // `FRBN` → `NF` (or reject).
4941        let token_set = CapcoTokenSet;
4942        let parser = Parser::new(&token_set);
4943        let candidate = MarkingCandidate {
4944            span: Span::new(0, 9),
4945            kind: MarkingType::Portion,
4946        };
4947        let parsed = parser
4948            .parse(&candidate, b"(S//FRBN)")
4949            .expect("strict parser accepts (S//FRBN) leniently");
4950        let marking = CapcoMarking::new(parsed.attrs);
4951        // `S` resolved, so classification is Some — but the
4952        // Unknown-tail check still fires.
4953        assert!(
4954            !strict_parse_is_complete(&marking, MarkingType::Portion),
4955            "`FRBN` is Unknown-kind → strict parse is incomplete; attrs = {:?}",
4956            marking.0,
4957        );
4958    }
4959
4960    #[test]
4961    fn contains_hard_splitter_word_detects_per_word() {
4962        // Whole-string match.
4963        assert!(contains_hard_splitter_word("NOFORN"));
4964        assert!(contains_hard_splitter_word("ORCON"));
4965        assert!(contains_hard_splitter_word("EXDIS"));
4966        // Per-word match (the `Full` SAR-program-nickname absorption
4967        // shape — `BUTTER POPCORN NOFORN`).
4968        assert!(contains_hard_splitter_word("BUTTER POPCORN NOFORN"));
4969        assert!(contains_hard_splitter_word("ORCON BUTTER POPCORN"));
4970        assert!(contains_hard_splitter_word("BUTTER NOFORN POPCORN"));
4971        // Negatives — clean SAR identifiers must not match.
4972        assert!(!contains_hard_splitter_word("BP"));
4973        assert!(!contains_hard_splitter_word("J12"));
4974        assert!(!contains_hard_splitter_word("XRA"));
4975        assert!(!contains_hard_splitter_word("BUTTER POPCORN"));
4976        assert!(!contains_hard_splitter_word(""));
4977    }
4978
4979    #[test]
4980    fn absorbs_hard_splitter_detects_full_sar_program_with_trailing_noforn() {
4981        // The `SPECIAL ACCESS REQUIRED-BUTTER POPCORN NOFORN` shape:
4982        // strict parser builds a `Full`-indicator SAR with the program
4983        // identifier `"BUTTER POPCORN NOFORN"` (multi-word nickname,
4984        // NOFORN absorbed as the trailing word). Pinned to ensure the
4985        // per-word check in `contains_hard_splitter_word` keeps firing.
4986        use marque_ism::{IsmAttributes, SarIndicator, SarMarking, SarProgram};
4987        let sar = SarMarking::new(
4988            SarIndicator::Full,
4989            Box::new([SarProgram::new(
4990                Box::from("BUTTER POPCORN NOFORN"),
4991                Box::new([]),
4992            )]),
4993        );
4994        let mut attrs = IsmAttributes::default();
4995        attrs.sar_markings = Some(sar);
4996        let marking = CapcoMarking::new(attrs);
4997        assert!(
4998            absorbs_hard_splitter_in_sar_or_sci(&marking),
4999            "NOFORN as trailing word of multi-word SAR program identifier must be detected"
5000        );
5001    }
5002
5003    #[test]
5004    fn absorbs_hard_splitter_in_sar_detects_noforn_as_subcomp() {
5005        // Direct construction: a SAR program with NOFORN buried as a
5006        // sub-compartment of a normal compartment. Mirrors the parse
5007        // shape produced by `SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/
5008        // XR-XRA RB NOFORN` when the strict parser absorbs NOFORN at
5009        // the SAR-block tail.
5010        use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
5011        let sar = SarMarking::new(
5012            SarIndicator::Abbrev,
5013            Box::new([SarProgram::new(
5014                Box::from("BP"),
5015                Box::new([SarCompartment::new(
5016                    Box::from("J12"),
5017                    Box::new([Box::from("RB"), Box::from("NOFORN")]),
5018                )]),
5019            )]),
5020        );
5021        let mut attrs = IsmAttributes::default();
5022        attrs.sar_markings = Some(sar);
5023        let marking = CapcoMarking::new(attrs);
5024        assert!(
5025            absorbs_hard_splitter_in_sar_or_sci(&marking),
5026            "NOFORN as SAR sub-compartment must be detected as absorption"
5027        );
5028    }
5029
5030    #[test]
5031    fn absorbs_hard_splitter_in_sar_detects_noforn_as_compartment_identifier() {
5032        // PR #178 review (Codecov, decoder.rs:1795): pin the
5033        // SAR-compartment-IDENTIFIER branch (vs the sub-compartment
5034        // branch covered above). Some absorbing parses end up with the
5035        // hard splitter as the compartment identifier itself rather
5036        // than a sub-compartment leaf — e.g., a `SAR-BP NOFORN` shape
5037        // where the strict parser emits `BP` as the program and
5038        // `NOFORN` as a bare compartment with no sub-compartments.
5039        use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
5040        let sar = SarMarking::new(
5041            SarIndicator::Abbrev,
5042            Box::new([SarProgram::new(
5043                Box::from("BP"),
5044                Box::new([SarCompartment::new(Box::from("NOFORN"), Box::new([]))]),
5045            )]),
5046        );
5047        let mut attrs = IsmAttributes::default();
5048        attrs.sar_markings = Some(sar);
5049        let marking = CapcoMarking::new(attrs);
5050        assert!(
5051            absorbs_hard_splitter_in_sar_or_sci(&marking),
5052            "NOFORN as SAR compartment identifier must be detected as absorption"
5053        );
5054    }
5055
5056    #[test]
5057    fn absorbs_hard_splitter_accepts_clean_sar() {
5058        // Negative case: a SAR with realistic identifiers (`BP`, `J12`,
5059        // `RB`) and no hard-splitter token anywhere. Must NOT trigger
5060        // the penalty.
5061        use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
5062        let sar = SarMarking::new(
5063            SarIndicator::Abbrev,
5064            Box::new([SarProgram::new(
5065                Box::from("BP"),
5066                Box::new([SarCompartment::new(
5067                    Box::from("J12"),
5068                    Box::new([Box::from("RB"), Box::from("XRA")]),
5069                )]),
5070            )]),
5071        );
5072        let mut attrs = IsmAttributes::default();
5073        attrs.sar_markings = Some(sar);
5074        let marking = CapcoMarking::new(attrs);
5075        assert!(
5076            !absorbs_hard_splitter_in_sar_or_sci(&marking),
5077            "clean SAR identifiers must not trigger the absorption penalty"
5078        );
5079    }
5080
5081    #[test]
5082    fn absorbs_hard_splitter_in_sci_detects_orcon_as_subcomp() {
5083        // Defensive coverage for SCI absorption — the existing strict-
5084        // parser path drops most SCI absorption via the
5085        // `TokenKind::Unknown` filter in step 3a, but a future grammar
5086        // change that loosens SCI compartment shape could let a hard
5087        // splitter through. Pinned so the penalty stays defensive.
5088        use marque_ism::{
5089            IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
5090        };
5091        let sci = SciMarking::new(
5092            SciControlSystem::Published(SciControlBare::Si),
5093            Box::new([SciCompartment::new(
5094                Box::from("G"),
5095                Box::new([Box::from("ORCON")]),
5096            )]),
5097            None,
5098        );
5099        let mut attrs = IsmAttributes::default();
5100        attrs.sci_markings = Box::new([sci]);
5101        let marking = CapcoMarking::new(attrs);
5102        assert!(
5103            absorbs_hard_splitter_in_sar_or_sci(&marking),
5104            "ORCON as SCI sub-compartment must be detected as absorption"
5105        );
5106    }
5107
5108    #[test]
5109    fn absorbs_hard_splitter_in_sci_detects_orcon_as_compartment_identifier() {
5110        // PR #178 review (Codecov, decoder.rs:1811): pin the SCI-
5111        // compartment-IDENTIFIER branch (vs the sub-compartment branch
5112        // above). Defensive coverage — today's strict-parser SCI path
5113        // drops most absorption via the `TokenKind::Unknown` filter at
5114        // step 3a, but a future grammar change that lets a hard
5115        // splitter through as the compartment id needs the penalty
5116        // active.
5117        use marque_ism::{
5118            IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
5119        };
5120        let sci = SciMarking::new(
5121            SciControlSystem::Published(SciControlBare::Si),
5122            Box::new([SciCompartment::new(Box::from("ORCON"), Box::new([]))]),
5123            None,
5124        );
5125        let mut attrs = IsmAttributes::default();
5126        attrs.sci_markings = Box::new([sci]);
5127        let marking = CapcoMarking::new(attrs);
5128        assert!(
5129            absorbs_hard_splitter_in_sar_or_sci(&marking),
5130            "ORCON as SCI compartment identifier must be detected as absorption"
5131        );
5132    }
5133
5134    #[test]
5135    fn absorbs_hard_splitter_negative_on_empty_marking() {
5136        // Sanity floor: a marking with neither SAR nor SCI never
5137        // triggers the penalty.
5138        use marque_ism::IsmAttributes;
5139        let attrs = IsmAttributes::default();
5140        let marking = CapcoMarking::new(attrs);
5141        assert!(
5142            !absorbs_hard_splitter_in_sar_or_sci(&marking),
5143            "marking without SAR/SCI must not trigger the penalty"
5144        );
5145    }
5146
5147    #[test]
5148    fn decoder_resolves_sar_with_trailing_noforn_via_absorption_penalty() {
5149        // The SC-004 fixtures `SAR-BP-J12 …` and
5150        // `SPECIAL ACCESS REQUIRED-BUTTER POPCORN …` with a trailing
5151        // NOFORN have always produced the right candidate bytes from
5152        // `try_insert_delimiter`, but lost the scoring contest before
5153        // PR-5 because the absorbing strict parse contributed only the
5154        // classification's prior while the delim-inserted parse paid
5155        // the additional log-prior of NF. The
5156        // `HARD_SPLITTER_ABSORPTION_PENALTY` flips the contest; this
5157        // test pins both fixture shapes.
5158        let rx = DecoderRecognizer::new();
5159        for input in &[
5160            "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN NOFORN",
5161            "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB NOFORN",
5162        ] {
5163            let parsed = rx.recognize(input.as_bytes(), &deep_cx());
5164            match parsed {
5165                Parsed::Unambiguous(m) => {
5166                    assert!(
5167                        m.0.sar_markings.is_some(),
5168                        "input {input:?}: expected SAR present in winning candidate"
5169                    );
5170                    // PR #178 review (Copilot, decoder.rs:2841): assert
5171                    // the SPECIFIC dissem control we expect — `Nf`.
5172                    // The previous `!is_empty()` check would silently
5173                    // accept a future regression that emitted a
5174                    // different dissem token (e.g., a misclassified
5175                    // `Oc`/`Pr`) and still call the test green.
5176                    assert!(
5177                        m.0.dissem_controls
5178                            .iter()
5179                            .any(|d| matches!(d, marque_ism::DissemControl::Nf)),
5180                        "input {input:?}: expected NOFORN (DissemControl::Nf) to land \
5181                         as a dissem control (winning candidate must be the delim-\
5182                         inserted form, not the absorbing one); got dissem_controls = \
5183                         {:?}",
5184                        m.0.dissem_controls,
5185                    );
5186                    assert!(
5187                        !absorbs_hard_splitter_in_sar_or_sci(&m),
5188                        "input {input:?}: winning marking must not bury a hard \
5189                         splitter inside SAR/SCI"
5190                    );
5191                }
5192                other => panic!("input {input:?}: expected Unambiguous, got {other:?}"),
5193            }
5194        }
5195    }
5196
5197    #[test]
5198    fn decoder_rejects_trivial_strict_parse() {
5199        // The strict parser is lenient: it accepts `FROBNITZ//WIBBLE`
5200        // and emits an IsmAttributes with classification=None,
5201        // dissem_controls=[], sci_controls=[]. The decoder must treat
5202        // that as "no real parse" and drop the candidate — otherwise
5203        // it would fabricate an empty marking for arbitrary prose.
5204        let token_set = CapcoTokenSet;
5205        let parser = Parser::new(&token_set);
5206        let candidate = MarkingCandidate {
5207            span: Span::new(0, 16),
5208            kind: MarkingType::Banner,
5209        };
5210        let parsed = parser
5211            .parse(&candidate, b"FROBNITZ//WIBBLE")
5212            .expect("strict parser should accept arbitrary bytes");
5213        let marking = CapcoMarking::new(parsed.attrs);
5214        assert!(
5215            !is_nontrivial_marking(&marking),
5216            "empty marking must be filtered"
5217        );
5218    }
5219
5220    #[test]
5221    fn decoder_recovers_typo_sercet_to_secret() {
5222        let rx = DecoderRecognizer::new();
5223        match rx.recognize(b"SERCET//NOFORN", &deep_cx()) {
5224            Parsed::Unambiguous(m) => {
5225                assert_eq!(
5226                    marking_classification(&m),
5227                    Some(Classification::Secret),
5228                    "expected SECRET classification from SERCET fuzzy-correction"
5229                );
5230            }
5231            other => panic!("expected Unambiguous(SECRET//NOFORN), got {other:?}"),
5232        }
5233    }
5234
5235    #[test]
5236    fn decoder_recovers_case_mangled_input() {
5237        let rx = DecoderRecognizer::new();
5238        match rx.recognize(b"secret//noforn", &deep_cx()) {
5239            Parsed::Unambiguous(m) => {
5240                assert_eq!(marking_classification(&m), Some(Classification::Secret));
5241            }
5242            other => panic!("expected Unambiguous, got {other:?}"),
5243        }
5244    }
5245
5246    #[test]
5247    fn decoder_suppresses_prose_glue_single_letter_portion() {
5248        // Prose-glue heuristic: when the byte preceding the candidate
5249        // is NOT whitespace, a single-letter `(s)` / `(c)` is
5250        // overwhelmingly a plural-suffix (`letter(s)`) or function-
5251        // call glyph (`function(c)`). The decoder must produce zero
5252        // candidates so the engine doesn't synthesize a spurious R001
5253        // diagnostic.
5254        let rx = DecoderRecognizer::new();
5255        let glued = ParseContext {
5256            preceded_by_whitespace: false,
5257            ..deep_cx()
5258        };
5259        for input in &[b"(s)", b"(c)", b"(u)", b"(S)", b"(C)"] {
5260            match rx.recognize(*input, &glued) {
5261                Parsed::Ambiguous { candidates } => assert!(
5262                    candidates.is_empty(),
5263                    "{:?} glued to a word must produce zero candidates, got {}",
5264                    std::str::from_utf8(*input).unwrap_or("<bytes>"),
5265                    candidates.len(),
5266                ),
5267                Parsed::Unambiguous(_) => panic!(
5268                    "{:?} glued to a word must not resolve",
5269                    std::str::from_utf8(*input).unwrap_or("<bytes>"),
5270                ),
5271            }
5272        }
5273    }
5274
5275    #[test]
5276    fn decoder_canonicalizes_single_letter_when_preceded_by_whitespace() {
5277        // Counterpart to the prose-glue test: when
5278        // `preceded_by_whitespace = true` (the engine's start-of-buffer
5279        // / post-whitespace convention), single-letter portions still
5280        // canonicalize through the case-fold path. The heuristic only
5281        // suppresses the glued-to-a-word shape; mid-prose with leading
5282        // whitespace remains the decoder's responsibility (and is
5283        // governed separately by future per-token null-hypothesis
5284        // priors — see issue #258).
5285        let rx = DecoderRecognizer::new();
5286        match rx.recognize(b"(s)", &deep_cx()) {
5287            Parsed::Unambiguous(m) => {
5288                assert_eq!(
5289                    marking_classification(&m),
5290                    Some(Classification::Secret),
5291                    "lowercase (s) with preceded_by_whitespace=true must \
5292                     canonicalize to SECRET via the case-fold path"
5293                );
5294            }
5295            other => panic!("expected Unambiguous resolution, got {other:?}"),
5296        }
5297    }
5298
5299    #[test]
5300    fn decoder_rejects_bare_restricted_via_recognizer_predicate() {
5301        // `(R)` parses cleanly under the strict path's lenient
5302        // grammar but fails `is_us_restricted` at
5303        // both the strict recognizer and inside the decoder's
5304        // candidate loop (step 3c-bis). The decoder must produce
5305        // zero candidates regardless of preceded-by-whitespace.
5306        let rx = DecoderRecognizer::new();
5307        for cx in &[
5308            deep_cx(),
5309            ParseContext {
5310                preceded_by_whitespace: false,
5311                ..deep_cx()
5312            },
5313        ] {
5314            match rx.recognize(b"(r)", cx) {
5315                Parsed::Ambiguous { candidates } => assert!(
5316                    candidates.is_empty(),
5317                    "bare (r) must be zero-candidate (preceded_by_whitespace={}), got {}",
5318                    cx.preceded_by_whitespace,
5319                    candidates.len()
5320                ),
5321                Parsed::Unambiguous(m) => panic!(
5322                    "bare (r) must be rejected, got Unambiguous({:?})",
5323                    m.0.classification
5324                ),
5325            }
5326        }
5327    }
5328
5329    #[test]
5330    fn decoder_recovers_superseded_comint_to_si() {
5331        let rx = DecoderRecognizer::new();
5332        // SECRET//COMINT//NOFORN — COMINT is CAPCO-2016 §A.6 p16-superseded to SI.
5333        match rx.recognize(b"SECRET//COMINT//NOFORN", &deep_cx()) {
5334            Parsed::Unambiguous(m) => {
5335                assert_eq!(marking_classification(&m), Some(Classification::Secret));
5336                // Verify SI is in the SCI controls list after correction.
5337                let has_si =
5338                    m.0.sci_controls
5339                        .iter()
5340                        .any(|c| matches!(c, marque_ism::SciControl::Si));
5341                assert!(
5342                    has_si,
5343                    "expected SI in sci_controls after COMINT supersession"
5344                );
5345            }
5346            other => panic!("expected Unambiguous, got {other:?}"),
5347        }
5348    }
5349
5350    #[test]
5351    fn decoder_recovers_reordered_banner() {
5352        let rx = DecoderRecognizer::new();
5353        // Dissem-first mangled; canonical is classification-first.
5354        match rx.recognize(b"NOFORN//SECRET", &deep_cx()) {
5355            Parsed::Unambiguous(m) => {
5356                assert_eq!(marking_classification(&m), Some(Classification::Secret));
5357            }
5358            Parsed::Ambiguous { candidates } => {
5359                assert!(
5360                    !candidates.is_empty(),
5361                    "reordering should at least surface candidates"
5362                );
5363            }
5364        }
5365    }
5366
5367    #[test]
5368    fn decoder_honors_classification_floor_fr011() {
5369        let rx = DecoderRecognizer::new();
5370        // Input is "(U)" which canonicalizes to an UNCLASSIFIED
5371        // portion. With a Secret floor, the candidate must be
5372        // dropped.
5373        let cx = ParseContext {
5374            strict_evidence: false,
5375            zone: None,
5376            position: None,
5377            classification_floor: Some(Classification::Secret as u8),
5378            as_of: None,
5379            preceded_by_whitespace: true,
5380        };
5381        match rx.recognize(b"(U)", &cx) {
5382            Parsed::Ambiguous { candidates } => assert!(
5383                candidates.is_empty(),
5384                "UNCLASSIFIED below SECRET floor must produce zero candidates, got {}",
5385                candidates.len()
5386            ),
5387            Parsed::Unambiguous(m) => panic!(
5388                "expected zero-candidate, got Unambiguous({:?})",
5389                marking_classification(&m)
5390            ),
5391        }
5392    }
5393
5394    #[test]
5395    fn decoder_classification_floor_allows_equal_or_above() {
5396        let rx = DecoderRecognizer::new();
5397        // (S//NF) with Confidential floor — SECRET exceeds floor.
5398        let cx = ParseContext {
5399            strict_evidence: false,
5400            zone: None,
5401            position: None,
5402            classification_floor: Some(Classification::Confidential as u8),
5403            as_of: None,
5404            preceded_by_whitespace: true,
5405        };
5406        match rx.recognize(b"(S//NF)", &cx) {
5407            Parsed::Unambiguous(m) => {
5408                assert_eq!(marking_classification(&m), Some(Classification::Secret));
5409            }
5410            other => panic!("expected Unambiguous, got {other:?}"),
5411        }
5412    }
5413
5414    #[test]
5415    fn normalize_delimiters_collapses_garbled_slash() {
5416        let (out, _) = normalize_delimiters_and_case("S ∕∕ NOFORN");
5417        assert_eq!(out, "S//NOFORN");
5418    }
5419
5420    #[test]
5421    fn scan_token_captures_compound_with_hyphen() {
5422        assert_eq!(scan_token("SI-G ABCD"), 4); // "SI-G"
5423        assert_eq!(scan_token("HCS-P"), 5);
5424        assert_eq!(scan_token("SECRET//"), 6);
5425    }
5426
5427    #[test]
5428    fn try_canonical_reorder_swaps_dissem_first_banner() {
5429        assert_eq!(
5430            try_canonical_reorder("NOFORN//SECRET"),
5431            Some("SECRET//NOFORN".to_owned())
5432        );
5433    }
5434
5435    #[test]
5436    fn try_canonical_reorder_returns_none_when_already_canonical() {
5437        assert_eq!(try_canonical_reorder("SECRET//NOFORN"), None);
5438    }
5439
5440    #[test]
5441    fn classify_segment_treats_sci_as_other_not_dissem() {
5442        // HCS and SI are SCI controls per CAPCO §A.6, not dissem.
5443        // Regression guard for PR #114 review — previously HCS was
5444        // in `DISSEMS`, which caused `try_canonical_reorder` to
5445        // move an HCS segment to the very end of the banner/portion
5446        // (past the dissem block) and corrupt canonicalization.
5447        // SCI segments must fall through to `SegmentClass::Other`
5448        // so the reorder helper places them between classification
5449        // and dissem per §A.6.
5450        assert_eq!(classify_segment("HCS"), SegmentClass::Other);
5451        assert_eq!(classify_segment("HCS-P"), SegmentClass::Other);
5452        assert_eq!(classify_segment("SI"), SegmentClass::Other);
5453        assert_eq!(classify_segment("SI-G"), SegmentClass::Other);
5454        assert_eq!(classify_segment("TK"), SegmentClass::Other);
5455    }
5456
5457    #[test]
5458    fn classify_segment_non_ic_dissem_tokens() {
5459        // §H.9 abbreviations and long-title forms must classify as Dissem so
5460        // try_canonical_reorder places them after SCI, not in Other.
5461        // Regression guard for PR #256.
5462        for tok in &[
5463            "DS", "XD", "ND", "SBU", "SBU-NF", "LES", "LES-NF", "SSI", "LIMDIS", "EXDIS", "NODIS",
5464        ] {
5465            assert_eq!(
5466                classify_segment(tok),
5467                SegmentClass::Dissem,
5468                "classify_segment({tok:?}) should be Dissem"
5469            );
5470        }
5471        // Multi-word long-title forms.
5472        assert_eq!(
5473            classify_segment("LIMITED DISTRIBUTION"),
5474            SegmentClass::Dissem
5475        );
5476        assert_eq!(
5477            classify_segment("EXCLUSIVE DISTRIBUTION"),
5478            SegmentClass::Dissem
5479        );
5480        assert_eq!(classify_segment("NO DISTRIBUTION"), SegmentClass::Dissem);
5481        assert_eq!(
5482            classify_segment("LAW ENFORCEMENT SENSITIVE"),
5483            SegmentClass::Dissem
5484        );
5485        assert_eq!(
5486            classify_segment("SENSITIVE BUT UNCLASSIFIED"),
5487            SegmentClass::Dissem
5488        );
5489        assert_eq!(
5490            classify_segment("SENSITIVE SECURITY INFORMATION"),
5491            SegmentClass::Dissem
5492        );
5493    }
5494
5495    #[test]
5496    fn classify_segment_restricted_data_is_not_classification() {
5497        // "RESTRICTED DATA" (AEA, §H.6) must not be mistaken for the NATO
5498        // RESTRICTED classification even though "RESTRICTED" is in CLASSIFICATIONS.
5499        // Bare "RESTRICTED" (NATO classification) must still be Classification.
5500        // Regression guard for PR #256.
5501        assert_eq!(classify_segment("RESTRICTED DATA"), SegmentClass::Other);
5502        assert_eq!(
5503            classify_segment("RESTRICTED DATA-CNWDI"),
5504            SegmentClass::Other
5505        );
5506        assert_eq!(classify_segment("RESTRICTED"), SegmentClass::Classification);
5507    }
5508
5509    #[test]
5510    fn try_canonical_reorder_places_sci_between_classification_and_dissem() {
5511        // Dissem-first with an SCI segment in the middle — correct
5512        // canonical order is classification → SCI → dissem.
5513        assert_eq!(
5514            try_canonical_reorder("NOFORN//HCS-P//SECRET"),
5515            Some("SECRET//HCS-P//NOFORN".to_owned())
5516        );
5517    }
5518
5519    #[test]
5520    fn meets_classification_floor_rejects_below_floor() {
5521        // Synthesize a marking via the decoder and check the floor
5522        // predicate directly.
5523        let rx = DecoderRecognizer::new();
5524        let Parsed::Unambiguous(u_marking) = rx.recognize(b"(U)", &deep_cx()) else {
5525            panic!("(U) should decode to unambiguous UNCLASSIFIED");
5526        };
5527        // U below S floor → rejected.
5528        assert!(!meets_classification_floor(
5529            &u_marking,
5530            Classification::Secret as u8
5531        ));
5532        // U meets U floor.
5533        assert!(meets_classification_floor(
5534            &u_marking,
5535            Classification::Unclassified as u8
5536        ));
5537    }
5538
5539    // ----- SAR indicator-keyword structural repair (issue #133 PR 6) -----
5540
5541    #[test]
5542    fn sar_indicator_repair_strips_one_letter_prefix() {
5543        // The canonical USAR-BP shape from the mangled corpus.
5544        assert_eq!(
5545            try_sar_indicator_repair(
5546                "SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN"
5547            ),
5548            Some("SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN".to_owned())
5549        );
5550    }
5551
5552    #[test]
5553    fn sar_indicator_repair_strips_multi_letter_prefix() {
5554        // Two- and three-letter prefixes are still in the structural
5555        // window. `XYZ` isn't a CAPCO token or trigraph.
5556        assert_eq!(
5557            try_sar_indicator_repair("SECRET//ABSAR-BP//NOFORN"),
5558            Some("SECRET//SAR-BP//NOFORN".to_owned())
5559        );
5560        assert_eq!(
5561            try_sar_indicator_repair("SECRET//XYZSAR-BP//NOFORN"),
5562            Some("SECRET//SAR-BP//NOFORN".to_owned())
5563        );
5564    }
5565
5566    #[test]
5567    fn sar_indicator_repair_strips_even_capco_token_prefix() {
5568        // The prefix-strip pass intentionally does NOT defend
5569        // against prefixes that spell a CAPCO token in isolation
5570        // (`U`, `S`, `R`, `C`, `TS`, `SI`, `USA`, …). Canonical
5571        // CAPCO never glues a classification token, SCI control,
5572        // or trigraph directly to `SAR-` without a `//` separator,
5573        // so the apparent prefix at a `//`/`(`/start boundary is
5574        // OCR/transcription drift regardless of whether the bytes
5575        // happen to spell a known token. An earlier defensive check
5576        // that refused to strip such prefixes broke the central
5577        // `USAR-` recovery case (`U` is the UNCLASSIFIED portion
5578        // form). Pinned here so a future "be more conservative"
5579        // PR reviews the rationale before re-adding the guard.
5580        assert_eq!(
5581            try_sar_indicator_repair("SECRET//USASAR-BP//NOFORN"),
5582            Some("SECRET//SAR-BP//NOFORN".to_owned()),
5583            "must strip USA at boundary even though USA is a trigraph",
5584        );
5585        assert_eq!(
5586            try_sar_indicator_repair("(USAR-BP)"),
5587            Some("(SAR-BP)".to_owned()),
5588            "boundary `(` must also trigger the strip pass",
5589        );
5590    }
5591
5592    #[test]
5593    fn sar_indicator_repair_inserts_missing_hyphen_two_char_id() {
5594        // The canonical SARBP missing-hyphen shape.
5595        assert_eq!(
5596            try_sar_indicator_repair("TOP SECRET//SARBP//NOFORN"),
5597            Some("TOP SECRET//SAR-BP//NOFORN".to_owned())
5598        );
5599    }
5600
5601    #[test]
5602    fn sar_indicator_repair_inserts_missing_hyphen_three_char_id() {
5603        // 3-char alphanumeric program identifier per §H.5 p100.
5604        assert_eq!(
5605            try_sar_indicator_repair("TOP SECRET//SARABC//NOFORN"),
5606            Some("TOP SECRET//SAR-ABC//NOFORN".to_owned())
5607        );
5608    }
5609
5610    #[test]
5611    fn sar_indicator_repair_inserts_missing_hyphen_before_compound() {
5612        // `SARBP-J12` → `SAR-BP-J12`. The 2-char alnum run BP
5613        // terminates at the `-` delimiter; that's the missing-hyphen
5614        // pattern. The trailing `-J12` is preserved verbatim.
5615        assert_eq!(
5616            try_sar_indicator_repair("SECRET//SARBP-J12 J54//NOFORN"),
5617            Some("SECRET//SAR-BP-J12 J54//NOFORN".to_owned())
5618        );
5619    }
5620
5621    #[test]
5622    fn sar_indicator_repair_no_op_on_canonical() {
5623        // Canonical SAR shapes must pass through with `None`.
5624        let cases: &[&str] = &[
5625            "SECRET//SAR-BP//NOFORN",
5626            "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
5627            "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
5628            "SECRET//NOFORN",
5629        ];
5630        for input in cases {
5631            assert_eq!(
5632                try_sar_indicator_repair(input),
5633                None,
5634                "canonical input {input:?} must not be repaired"
5635            );
5636        }
5637    }
5638
5639    #[test]
5640    fn sar_indicator_repair_skips_non_boundary_sar() {
5641        // `SAR` embedded mid-token (no boundary char before `S`)
5642        // is not the indicator — could be a SAR program identifier
5643        // happening to contain the letters. Don't touch.
5644        assert_eq!(
5645            try_sar_indicator_repair("SECRET//FOO-USAR-BP"),
5646            None,
5647            "non-boundary SAR is not the indicator keyword"
5648        );
5649    }
5650
5651    #[test]
5652    fn sar_indicator_repair_skips_long_alnum_run() {
5653        // 4+ alphanumeric chars after SAR don't match the §H.5 p100
5654        // 2-3 char Abbrev-form identifier. The helper refuses to
5655        // insert a hyphen — inserting `SAR-ABCD` would be inventing
5656        // a malformed identifier.
5657        assert_eq!(
5658            try_sar_indicator_repair("SECRET//SARABCD//NOFORN"),
5659            None,
5660            "4-char alnum run violates §H.5 p100 2-3 char identifier"
5661        );
5662    }
5663
5664    #[test]
5665    fn sar_indicator_repair_returns_none_when_no_sar_substring() {
5666        // Pre-check fast path: if `SAR` doesn't appear in the input
5667        // at all, no repair is possible.
5668        assert_eq!(
5669            try_sar_indicator_repair("TOP SECRET//SI-G ABCD//NOFORN"),
5670            None
5671        );
5672        assert_eq!(try_sar_indicator_repair(""), None);
5673        assert_eq!(try_sar_indicator_repair("UNCLASSIFIED"), None);
5674    }
5675
5676    #[test]
5677    fn match_sar_prefix_detects_one_to_three_letter_prefix() {
5678        assert_eq!(match_sar_prefix(b"USAR-BP", 0), Some((1, 5)));
5679        assert_eq!(match_sar_prefix(b"ABSAR-BP", 0), Some((2, 6)));
5680        assert_eq!(match_sar_prefix(b"XYZSAR-BP", 0), Some((3, 7)));
5681    }
5682
5683    #[test]
5684    fn match_sar_prefix_rejects_no_prefix_or_no_sar() {
5685        assert_eq!(match_sar_prefix(b"SAR-BP", 0), None);
5686        assert_eq!(match_sar_prefix(b"USAR", 0), None);
5687        assert_eq!(match_sar_prefix(b"USARBP", 0), None);
5688    }
5689
5690    #[test]
5691    fn match_sar_missing_hyphen_detects_2_3_char_id() {
5692        assert_eq!(match_sar_missing_hyphen(b"SARBP/", 0), Some(5));
5693        assert_eq!(match_sar_missing_hyphen(b"SARABC ", 0), Some(6));
5694        // End-of-string also counts as a delim.
5695        assert_eq!(match_sar_missing_hyphen(b"SARBP", 0), Some(5));
5696    }
5697
5698    #[test]
5699    fn match_sar_missing_hyphen_rejects_canonical_and_too_long() {
5700        // `SAR-` already canonical (alnum run is 0).
5701        assert_eq!(match_sar_missing_hyphen(b"SAR-BP", 0), None);
5702        // 4-char alnum run is outside the §H.5 p100 2-3 window.
5703        assert_eq!(match_sar_missing_hyphen(b"SARABCD/", 0), None);
5704        // 1-char alnum run is also outside the window.
5705        assert_eq!(match_sar_missing_hyphen(b"SARB/", 0), None);
5706    }
5707
5708    #[test]
5709    fn match_sar_missing_hyphen_rejects_non_delim_following_char() {
5710        // Alnum run is in the §H.5 p100 2-3 window, but the byte
5711        // immediately after the run is non-alphanumeric AND not in
5712        // the delimiter set (`-`, `/`, ` `, `\t`, `\n`, `\r`).
5713        // Every non-delim non-alnum byte triggers the
5714        // `next_is_delim = false` branch and the helper returns
5715        // `None` — refusing to repair grammatically-suspicious
5716        // shapes (a SAR identifier doesn't terminate at `,`, `)`,
5717        // `;`, etc.). Direct-helper test because the higher-level
5718        // pinning in `try_sar_indicator_repair` only exercises a
5719        // subset of these via the boundary check upstream.
5720        let cases: &[&[u8]] = &[
5721            b"SARBP)",  // closing paren — same byte that ends a portion mark
5722            b"SARBP,",  // comma — common typo separator
5723            b"SARBP;",  // semicolon
5724            b"SARBP*",  // asterisk
5725            b"SARBP=",  // equals
5726            b"SARABC.", // period after 3-char id
5727            b"SARABC?", // question mark
5728        ];
5729        for input in cases {
5730            assert_eq!(
5731                match_sar_missing_hyphen(input, 0),
5732                None,
5733                "input {:?} has non-delim follower; helper must refuse repair",
5734                std::str::from_utf8(input).unwrap_or("<non-utf8>"),
5735            );
5736        }
5737    }
5738
5739    #[test]
5740    fn sar_indicator_repair_skips_pattern_b_with_non_delim_follower() {
5741        // End-to-end pinning of the same `next_is_delim = false`
5742        // rejection through `try_sar_indicator_repair`. `SARBP)`
5743        // appears at a `//` boundary (so `at_boundary` is true and
5744        // Pattern B is attempted), the alnum run is 2, but `)` isn't
5745        // in the delim set — the helper falls through to the
5746        // verbatim-copy default. Without the rejection branch we'd
5747        // emit `SAR-BP)`, silently inventing a hyphen for a
5748        // grammatically-suspicious input.
5749        assert_eq!(
5750            try_sar_indicator_repair("SECRET//SARBP)//NOFORN"),
5751            None,
5752            "Pattern B must refuse to fire when the post-alnum char isn't a delim",
5753        );
5754    }
5755
5756    // ----- Stray-character `/X/` recovery (issue #133 PR 7) -----
5757
5758    #[test]
5759    fn try_collapse_stray_char_slash_emits_three_transforms() {
5760        // Each `/X/` match emits exactly three candidate bytes
5761        // (drop, right-attach, left-attach). This pins the contract
5762        // and makes any future scope expansion (multi-pass, extra
5763        // transforms) a deliberate, reviewable change.
5764        let result = try_collapse_stray_char_slash("AB/X/CD");
5765        assert_eq!(result.len(), 3, "expected 3 candidates; got {result:?}");
5766        assert_eq!(result[0], "AB//CD"); // drop X
5767        assert_eq!(result[1], "AB//XCD"); // right-attach X to CD
5768        assert_eq!(result[2], "ABX//CD"); // left-attach X to AB
5769    }
5770
5771    #[test]
5772    fn try_collapse_stray_char_slash_returns_empty_when_no_pattern() {
5773        // Inputs without a `/X/` pattern produce no candidates.
5774        let cases: &[&str] = &[
5775            "SECRET",
5776            "SECRET//NOFORN",
5777            "SECRET//NOFORN//EXDIS",
5778            "(C)",
5779            "",
5780            // A `/` followed by 2+ alnum chars is NOT the pattern —
5781            // `/AB/` is a regular 2-char token between slashes.
5782            "SECRET/AB/CD",
5783            // `//` (canonical separator) doesn't match because the
5784            // single-char-between-slashes shape requires alnum at
5785            // bytes[i+1].
5786            "SECRET////NOFORN",
5787        ];
5788        for input in cases {
5789            assert!(
5790                try_collapse_stray_char_slash(input).is_empty(),
5791                "input {input:?} should not match /X/ pattern",
5792            );
5793        }
5794    }
5795
5796    #[test]
5797    fn try_collapse_stray_char_slash_requires_alnum_boundary() {
5798        // The pattern requires alnum on both sides of `/X/`. Without
5799        // both, the recovery is semantically meaningless (no token
5800        // to attach X to / no token next to the strip).
5801        // Leading boundary missing: `/X/Y` at position 0 has no
5802        // alnum at i-1.
5803        assert!(try_collapse_stray_char_slash("/X/Y").is_empty());
5804        // Trailing boundary missing: `Y/X/` has no alnum at i+3.
5805        assert!(try_collapse_stray_char_slash("Y/X/").is_empty());
5806        // Both alnum: matches.
5807        assert_eq!(
5808            try_collapse_stray_char_slash("Y/X/Z").len(),
5809            3,
5810            "alnum on both sides should match"
5811        );
5812    }
5813
5814    // ----- REL TO structural repair (issue #133 PR 9) -----
5815
5816    #[test]
5817    fn rel_to_header_normalize_fixes_rel_ot_transposition() {
5818        // Pattern 1: `REL OT ` (TO → OT) → `REL TO `.
5819        let result = try_rel_to_header_normalize("SECRET//REL OT USA, AUS, GBR");
5820        assert_eq!(
5821            result.as_deref(),
5822            Some("SECRET//REL TO USA, AUS, GBR"),
5823            "REL OT must rewrite to REL TO at //-boundary",
5824        );
5825    }
5826
5827    #[test]
5828    fn rel_to_header_normalize_fixes_relt_o_token_boundary() {
5829        // Pattern 2: `RELT O ` (T migrated from REL to start of next
5830        // token) → `REL TO `. The fuzzy pass would otherwise rewrite
5831        // `RELT` (4 chars) → `REL` (in-vocab DissemControl, distance
5832        // 1) and silently drop USA from the strict parse.
5833        let result = try_rel_to_header_normalize("SECRET//RELT O USA, AUS, GBR");
5834        assert_eq!(
5835            result.as_deref(),
5836            Some("SECRET//REL TO USA, AUS, GBR"),
5837            "RELT O must rewrite to REL TO at //-boundary",
5838        );
5839    }
5840
5841    #[test]
5842    fn rel_to_header_normalize_returns_none_on_canonical() {
5843        // Canonical `REL TO ` (and texts without REL at all) round-
5844        // trip unchanged.
5845        assert!(try_rel_to_header_normalize("SECRET//REL TO USA, AUS, GBR").is_none());
5846        assert!(try_rel_to_header_normalize("SECRET//NOFORN").is_none());
5847        assert!(try_rel_to_header_normalize("").is_none());
5848    }
5849
5850    #[test]
5851    fn rel_to_header_normalize_requires_token_boundary() {
5852        // The pattern must not fire when embedded inside a longer
5853        // alphanumeric run. Without the boundary check, `XREL OT Y`
5854        // would match the substring `REL OT` even though the leading
5855        // `X` makes the whole thing a single 6-char token.
5856        assert!(try_rel_to_header_normalize("XREL OT Y").is_none());
5857        assert!(try_rel_to_header_normalize("SOMETHINGRELT O Y").is_none());
5858    }
5859
5860    #[test]
5861    fn rel_to_entry_normalize_joins_a_us_to_aus() {
5862        // Pattern 3: 4-char entry `A US` joins to AUS only when the
5863        // joined 3-letter string is a known trigraph. AUS is a
5864        // trigraph; A alone is not.
5865        let result = try_rel_to_entry_normalize("SECRET//REL TO USA,A US, GBR");
5866        // The replacement preserves the entry's leading whitespace
5867        // (none here), so the rewritten block is `USA,AUS, GBR`.
5868        assert_eq!(
5869            result.as_deref(),
5870            Some("SECRET//REL TO USA,AUS, GBR"),
5871            "A US should join to AUS when is_trigraph(AUS) holds",
5872        );
5873    }
5874
5875    #[test]
5876    fn rel_to_entry_normalize_swaps_au_comma_s_to_aus_comma() {
5877        // Pattern 4: `<2-upper>,<1-upper><space>` swaps to
5878        // `<3-upper joined>,` only when the joined trigraph is
5879        // valid AND the 2-letter prefix alone is not a trigraph.
5880        let result = try_rel_to_entry_normalize("SECRET//REL TO USA, AU,S GBR");
5881        assert_eq!(
5882            result.as_deref(),
5883            Some("SECRET//REL TO USA, AUS, GBR"),
5884            "AU,S should swap to AUS, when is_trigraph(AUS) holds and AU is not a trigraph",
5885        );
5886    }
5887
5888    #[test]
5889    fn rel_to_entry_normalize_does_not_corrupt_eu_comma_pattern() {
5890        // EU is itself a valid 2-char trigraph entry. Pattern 4 must
5891        // not fire on `EU,X ` because `is_trigraph(EU)` is true —
5892        // this guards the rule "only fix when the prefix alone is
5893        // invalid". (Even though `EUX` may not be a trigraph and
5894        // wouldn't pass the join-is-trigraph guard either, the
5895        // prefix-is-trigraph check is the cleaner discriminator.)
5896        let result = try_rel_to_entry_normalize("SECRET//REL TO USA, EU, GBR");
5897        assert!(
5898            result.is_none(),
5899            "canonical EU entry must round-trip unchanged",
5900        );
5901    }
5902
5903    #[test]
5904    fn rel_to_entry_normalize_returns_none_outside_rel_to() {
5905        // No REL TO header → no entry-pass fixes. The patterns are
5906        // scoped to inside REL TO blocks specifically.
5907        assert!(try_rel_to_entry_normalize("SECRET//SI/TK//NOFORN").is_none());
5908        assert!(try_rel_to_entry_normalize("").is_none());
5909    }
5910
5911    #[test]
5912    fn rel_to_structural_repair_short_circuits_without_rel() {
5913        // Pre-check: text without `REL` returns None immediately,
5914        // skipping the byte walks.
5915        assert!(try_rel_to_structural_repair("SECRET//NOFORN").is_none());
5916        assert!(try_rel_to_structural_repair("(C)").is_none());
5917        assert!(try_rel_to_structural_repair("").is_none());
5918    }
5919
5920    // ----- SCI delimiter recovery (issue #198, #133 PR 10) -----
5921
5922    #[test]
5923    fn sci_delimiter_repair_concatenated_compound_hcsp() {
5924        // Pattern A: `HCSP` (registered compound `HCS-P` with hyphen
5925        // missing) → `HCS-P`.
5926        let result = try_sci_delimiter_repair("SECRET//HCSP//NOFORN");
5927        assert_eq!(
5928            result.as_deref(),
5929            Some("SECRET//HCS-P//NOFORN"),
5930            "HCSP must rewrite to HCS-P (CVE-registered compound)",
5931        );
5932    }
5933
5934    #[test]
5935    fn sci_delimiter_repair_concatenated_compound_hcso() {
5936        // Pattern A: HCSO → HCS-O.
5937        let result = try_sci_delimiter_repair("SECRET//HCSO//NOFORN");
5938        assert_eq!(result.as_deref(), Some("SECRET//HCS-O//NOFORN"));
5939    }
5940
5941    #[test]
5942    fn sci_delimiter_repair_concatenated_compound_sig() {
5943        // Pattern A: SIG → SI-G. The CVE list has SI-G; G is a
5944        // compartment of SI per §A.6 p16.
5945        let result = try_sci_delimiter_repair("SECRET//SIG//NOFORN");
5946        assert_eq!(result.as_deref(), Some("SECRET//SI-G//NOFORN"));
5947    }
5948
5949    #[test]
5950    fn sci_delimiter_repair_concatenated_compound_tkkand() {
5951        // Pattern A: TKKAND → TK-KAND. Tests that the longer
5952        // concatenated forms (6 chars) are matched correctly.
5953        let result = try_sci_delimiter_repair("SECRET//TKKAND//NOFORN");
5954        assert_eq!(result.as_deref(), Some("SECRET//TK-KAND//NOFORN"));
5955    }
5956
5957    #[test]
5958    fn sci_delimiter_repair_schema_coverage_bur_compounds() {
5959        // Pattern A is schema-driven via `SciControl::parse`, so it
5960        // covers every CVE compound automatically — including the
5961        // BUR-* family that an earlier hand-maintained list omitted.
5962        // Locks in the schema-derived contract: any future ODNI
5963        // schema bump that adds new compounds is auto-covered without
5964        // changes to this file.
5965        assert_eq!(
5966            try_sci_delimiter_repair("SECRET//BURBLG//NOFORN").as_deref(),
5967            Some("SECRET//BUR-BLG//NOFORN"),
5968        );
5969        assert_eq!(
5970            try_sci_delimiter_repair("SECRET//BURDTP//NOFORN").as_deref(),
5971            Some("SECRET//BUR-DTP//NOFORN"),
5972        );
5973        assert_eq!(
5974            try_sci_delimiter_repair("SECRET//BURWRG//NOFORN").as_deref(),
5975            Some("SECRET//BUR-WRG//NOFORN"),
5976        );
5977    }
5978
5979    #[test]
5980    fn sci_delimiter_repair_missing_slash_sitk() {
5981        // Pattern B: SITK → SI/TK. Per §A.6 p16 + p194 example,
5982        // multiple control systems within an SCI category use `/`.
5983        let result = try_sci_delimiter_repair("SECRET//SITK//NOFORN");
5984        assert_eq!(
5985            result.as_deref(),
5986            Some("SECRET//SI/TK//NOFORN"),
5987            "SITK must rewrite to SI/TK (two bare control systems concatenated)",
5988        );
5989    }
5990
5991    #[test]
5992    fn sci_delimiter_repair_missing_slash_hcssi() {
5993        // Pattern B: HCSSI → HCS/SI. Tests 3+2 split (HCS is len 3,
5994        // SI is len 2).
5995        let result = try_sci_delimiter_repair("SECRET//HCSSI//NOFORN");
5996        assert_eq!(result.as_deref(), Some("SECRET//HCS/SI//NOFORN"));
5997    }
5998
5999    #[test]
6000    fn sci_delimiter_repair_wrong_delimiter_si_dash_tk() {
6001        // Pattern C: SI-TK → SI/TK. The whole token is not a CVE
6002        // compound, both halves are bare CS, so `-` is wrong.
6003        let result = try_sci_delimiter_repair("SECRET//SI-TK//NOFORN");
6004        assert_eq!(
6005            result.as_deref(),
6006            Some("SECRET//SI/TK//NOFORN"),
6007            "SI-TK must rewrite to SI/TK (two bare CS, `-` is for control-compartment)",
6008        );
6009    }
6010
6011    #[test]
6012    fn sci_delimiter_repair_leaves_registered_compound_alone() {
6013        // Pattern C must NOT fire on registered compounds. SI-G is in
6014        // CVEnumISMSCIControls.xml — `-` is the correct separator.
6015        assert!(try_sci_delimiter_repair("SECRET//SI-G//NOFORN").is_none());
6016        assert!(try_sci_delimiter_repair("SECRET//HCS-P//NOFORN").is_none());
6017        assert!(try_sci_delimiter_repair("SECRET//TK-KAND//NOFORN").is_none());
6018    }
6019
6020    #[test]
6021    fn sci_delimiter_repair_returns_none_on_canonical() {
6022        // Already-canonical inputs round-trip unchanged.
6023        assert!(try_sci_delimiter_repair("SECRET//SI/TK//NOFORN").is_none());
6024        assert!(try_sci_delimiter_repair("SECRET//SI//NOFORN").is_none());
6025        assert!(try_sci_delimiter_repair("SECRET//NOFORN").is_none());
6026        assert!(try_sci_delimiter_repair("").is_none());
6027    }
6028
6029    #[test]
6030    fn sci_delimiter_repair_does_not_fire_on_word_substring() {
6031        // SIGMA contains "SIG" as a substring but is a single token
6032        // — Pattern A requires whole-token equality, not contains.
6033        assert!(try_sci_delimiter_repair("SIGMA").is_none());
6034        // SITE, SITS — same protection.
6035        assert!(try_sci_delimiter_repair("SITE").is_none());
6036        // SIGNAL — contains SIG; whole token is not in Pattern A.
6037        assert!(try_sci_delimiter_repair("SIGNAL").is_none());
6038    }
6039
6040    #[test]
6041    fn sci_delimiter_repair_short_circuits_without_sci_root() {
6042        // Pre-check: no SCI control system substring → no repair.
6043        assert!(try_sci_delimiter_repair("CONFIDENTIAL//NOFORN").is_none());
6044        assert!(try_sci_delimiter_repair("(C)").is_none());
6045        assert!(try_sci_delimiter_repair("").is_none());
6046    }
6047
6048    #[test]
6049    fn sci_delimiter_repair_does_not_panic_on_non_ascii() {
6050        // The function must not panic on multi-byte UTF-8 input. The
6051        // SCI vocabulary is pure ASCII, so any non-ASCII input is
6052        // unmatchable — bail early rather than risk a byte-offset
6053        // slice landing mid-codepoint. Inputs intentionally chosen
6054        // to exercise both the outer scanner (`try_sci_delimiter_repair`)
6055        // and the inner per-token classifier (`repair_sci_token`).
6056        assert!(try_sci_delimiter_repair("SECRET//SI/TK//日本語").is_none());
6057        assert!(try_sci_delimiter_repair("Ω SI TK").is_none());
6058        assert!(try_sci_delimiter_repair("こんにちは").is_none());
6059        // Direct call to the per-token helper with non-ASCII content.
6060        assert!(repair_sci_token("SI日").is_none());
6061        assert!(repair_sci_token("日本").is_none());
6062    }
6063
6064    #[test]
6065    fn repair_sci_token_rejects_partial_decompositions() {
6066        // HCSI = HCS+I (I not bare) or H+CSI (neither bare) — no
6067        // valid Pattern B decomposition.
6068        assert!(repair_sci_token("HCSI").is_none());
6069        // ABCDE — random, no valid CS decomposition.
6070        assert!(repair_sci_token("ABCDE").is_none());
6071        // BUR alone — bare CS by itself, len 3, fails Pattern B's
6072        // 4..=6 length check, no `-`, not in Pattern A. Returns None.
6073        assert!(repair_sci_token("BUR").is_none());
6074    }
6075
6076    #[test]
6077    fn try_collapse_stray_char_slash_processes_only_first_match() {
6078        // PR 7 scope: only the first `/X/` is processed. Multi-
6079        // pattern inputs need a future multi-pass extension.
6080        let result = try_collapse_stray_char_slash("A/X/B/Y/C");
6081        assert_eq!(result.len(), 3);
6082        // Each candidate carries only the first transform — the
6083        // second `/Y/` pattern is left in place verbatim.
6084        assert_eq!(result[0], "A//B/Y/C"); // drop first X
6085        assert_eq!(result[1], "A//XB/Y/C"); // right-attach first X
6086        assert_eq!(result[2], "AX//B/Y/C"); // left-attach first X
6087    }
6088
6089    #[test]
6090    fn decoder_recovers_drop_stray_char() {
6091        // End-to-end: `SECRET//NOFORN/R/EXDIS` resolves to the
6092        // canonical `SECRET//NOFORN//EXDIS` via the drop-X transform.
6093        // The right-attach (`SECRET//NOFORN//REXDIS` — REXDIS unknown)
6094        // and left-attach (`SECRET//NOFORNR//EXDIS` — NOFORNR unknown)
6095        // candidates are dropped by step 3a's Unknown-token filter.
6096        // Pinned per `tests/fixtures/mangled/typo/7885156a2c2c125f.json`.
6097        let rx = DecoderRecognizer::new();
6098        let Parsed::Unambiguous(marking) = rx.recognize(b"SECRET//NOFORN/R/EXDIS", &deep_cx())
6099        else {
6100            panic!("`/R/` between NOFORN and EXDIS must resolve via drop-X");
6101        };
6102        assert_eq!(
6103            marking
6104                .0
6105                .classification
6106                .as_ref()
6107                .map(|c| c.effective_level()),
6108            Some(Classification::Secret),
6109        );
6110        assert!(
6111            marking
6112                .0
6113                .dissem_controls
6114                .iter()
6115                .any(|d| matches!(d, marque_ism::DissemControl::Nf)),
6116            "NOFORN must survive; attrs = {:?}",
6117            marking.0,
6118        );
6119        assert!(
6120            marking
6121                .0
6122                .non_ic_dissem
6123                .iter()
6124                .any(|d| matches!(d, marque_ism::NonIcDissem::Exdis)),
6125            "EXDIS must survive; attrs = {:?}",
6126            marking.0,
6127        );
6128    }
6129
6130    #[test]
6131    fn decoder_recovers_right_attach_stray_char() {
6132        // End-to-end: `TOP SECRET//SI/N/OFORN` resolves to the
6133        // canonical `TOP SECRET//SI//NOFORN` via right-attach (the
6134        // `N` was the leading char of NOFORN). The drop candidate
6135        // (`TOP SECRET//SI//OFORN` — OFORN unknown) and left-attach
6136        // (`TOP SECRET//SIN//OFORN` — both unknown) are dropped by
6137        // step 3a's Unknown-token filter. Pinned per
6138        // `tests/fixtures/mangled/typo/2cb13fe4682ff31c.json`.
6139        let rx = DecoderRecognizer::new();
6140        let Parsed::Unambiguous(marking) = rx.recognize(b"TOP SECRET//SI/N/OFORN", &deep_cx())
6141        else {
6142            panic!("`/N/` before OFORN must resolve via right-attach");
6143        };
6144        assert_eq!(
6145            marking
6146                .0
6147                .classification
6148                .as_ref()
6149                .map(|c| c.effective_level()),
6150            Some(Classification::TopSecret),
6151        );
6152        assert!(
6153            marking
6154                .0
6155                .sci_controls
6156                .iter()
6157                .any(|c| matches!(c, marque_ism::SciControl::Si)),
6158            "SI must survive; attrs = {:?}",
6159            marking.0,
6160        );
6161        assert!(
6162            marking
6163                .0
6164                .dissem_controls
6165                .iter()
6166                .any(|d| matches!(d, marque_ism::DissemControl::Nf)),
6167            "NOFORN must be reconstructed; attrs = {:?}",
6168            marking.0,
6169        );
6170    }
6171
6172    #[test]
6173    fn decoder_recovers_left_attach_stray_char() {
6174        // End-to-end: `SECRE/T/REL TO USA, AUS, GBR` resolves to the
6175        // canonical `SECRET//REL TO USA, AUS, GBR` via left-attach
6176        // (the `T` was the trailing char of SECRET). The drop
6177        // (`SECRE//REL TO ...` — SECRE unknown) and right-attach
6178        // (`SECRE//TREL TO ...` — both unknown) are dropped by
6179        // step 3a. Pinned per
6180        // `tests/fixtures/mangled/typo/cff1d0ac74e901c3.json`.
6181        let rx = DecoderRecognizer::new();
6182        let Parsed::Unambiguous(marking) =
6183            rx.recognize(b"SECRE/T/REL TO USA, AUS, GBR", &deep_cx())
6184        else {
6185            panic!("`/T/` after SECRE must resolve via left-attach");
6186        };
6187        assert_eq!(
6188            marking
6189                .0
6190                .classification
6191                .as_ref()
6192                .map(|c| c.effective_level()),
6193            Some(Classification::Secret),
6194        );
6195        assert_eq!(
6196            marking.0.rel_to.len(),
6197            3,
6198            "REL TO must carry 3 trigraphs (USA, AUS, GBR); attrs = {:?}",
6199            marking.0,
6200        );
6201    }
6202
6203    #[test]
6204    fn decoder_recovers_usar_prefix_via_sar_indicator_repair() {
6205        // End-to-end recognizer test: the canonical USAR-BP fixture
6206        // shape from the mangled corpus must resolve unambiguously
6207        // to a SECRET marking with a SAR block. Pinned per
6208        // `tests/fixtures/mangled/typo/d04f45f7a4f5a8b4.json`.
6209        let rx = DecoderRecognizer::new();
6210        let Parsed::Unambiguous(marking) = rx.recognize(
6211            b"SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
6212            &deep_cx(),
6213        ) else {
6214            panic!("USAR-BP-... must resolve via SAR indicator repair");
6215        };
6216        assert_eq!(
6217            marking
6218                .0
6219                .classification
6220                .as_ref()
6221                .map(|c| c.effective_level()),
6222            Some(Classification::Secret),
6223        );
6224        assert!(
6225            marking.0.sar_markings.is_some(),
6226            "SAR block must be present after USAR→SAR repair; attrs = {:?}",
6227            marking.0,
6228        );
6229        assert!(
6230            marking
6231                .0
6232                .dissem_controls
6233                .iter()
6234                .any(|d| matches!(d, marque_ism::DissemControl::Nf)),
6235            "NOFORN must survive; attrs = {:?}",
6236            marking.0,
6237        );
6238    }
6239
6240    #[test]
6241    fn decoder_recovers_sarbp_missing_hyphen_via_sar_indicator_repair() {
6242        // End-to-end: `SARBP` (no hyphen) → `SAR-BP` (canonical) per
6243        // §H.5 p100. Pinned per
6244        // `tests/fixtures/mangled/typo/fbf5ed813c109c14.json`.
6245        let rx = DecoderRecognizer::new();
6246        let Parsed::Unambiguous(marking) = rx.recognize(b"TOP SECRET//SARBP//NOFORN", &deep_cx())
6247        else {
6248            panic!("SARBP must resolve via SAR indicator repair");
6249        };
6250        assert_eq!(
6251            marking
6252                .0
6253                .classification
6254                .as_ref()
6255                .map(|c| c.effective_level()),
6256            Some(Classification::TopSecret),
6257        );
6258        let sar = marking
6259            .0
6260            .sar_markings
6261            .as_ref()
6262            .expect("SAR block must be present");
6263        assert_eq!(sar.programs.len(), 1, "exactly one program; got {sar:?}");
6264        assert_eq!(
6265            &*sar.programs[0].identifier, "BP",
6266            "program identifier must be `BP` after hyphen insertion; got {sar:?}",
6267        );
6268    }
6269
6270    #[test]
6271    fn decoder_recovers_spcial_via_extended_correction_vocab() {
6272        // `SPCIAL` (typo in `SPECIAL`) — issue #133 PR 6 vocab
6273        // addition. The fuzzy matcher now finds `SPECIAL` at edit
6274        // distance 1, the strict SAR parser then matches the
6275        // `SPECIAL ACCESS REQUIRED-BUTTER POPCORN` indicator
6276        // literally. Pinned per
6277        // `tests/fixtures/mangled/typo/1f75ddd89b432949.json`.
6278        let rx = DecoderRecognizer::new();
6279        let Parsed::Unambiguous(marking) = rx.recognize(
6280            b"TOP SECRET//SPCIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
6281            &deep_cx(),
6282        ) else {
6283            panic!("SPCIAL must fuzzy-correct to SPECIAL");
6284        };
6285        assert_eq!(
6286            marking
6287                .0
6288                .classification
6289                .as_ref()
6290                .map(|c| c.effective_level()),
6291            Some(Classification::TopSecret),
6292        );
6293        let sar = marking
6294            .0
6295            .sar_markings
6296            .as_ref()
6297            .expect("SAR block must be present");
6298        assert_eq!(
6299            &*sar.programs[0].identifier, "BUTTER POPCORN",
6300            "Full-form program identifier must round-trip; got {sar:?}",
6301        );
6302    }
6303}
marque_engine/decoder.rs

marque_engine/
decoder.rs