use std::collections::BTreeSet;
use marque_capco::provenance::DecoderProvenance;
use marque_capco::{CapcoMarking, CapcoScheme};
use marque_core::{Parser, fuzzy::FuzzyVocabMatcher};
use marque_ism::{
CapcoTokenSet, Classification, SciControl, SciControlBare, SciControlSystem,
span::{MarkingCandidate, MarkingType, Span},
token_set::TokenSet as _,
};
use marque_rules::confidence::{FeatureContribution, FeatureId};
use marque_scheme::ambiguity::{Candidate, EvidenceFeature, Parsed};
use marque_scheme::recognizer::{ParseContext, Recognizer};
use crate::recognizer::{StrictRecognizer, is_us_restricted};
/// Upper bound on the number of scored candidates kept after sorting by
/// posterior. Candidate generation separately caps itself at twice this value.
const K_MAX_CANDIDATES: usize = 8;
/// Minimum gap between the best and second-best posterior (a difference of
/// log-space scores) at which the best candidate is reported as unambiguous.
const UNAMBIGUOUS_LOG_MARGIN: f32 = 1.6;
/// Zero-sized, stateless recognizer handle.
#[derive(Debug, Default, Clone, Copy)]
pub struct DecoderRecognizer;

impl DecoderRecognizer {
    /// Builds the recognizer; callable in `const` contexts.
    pub const fn new() -> Self {
        DecoderRecognizer
    }
}
impl Recognizer<CapcoScheme> for DecoderRecognizer {
/// Attempts to decode `bytes` into a CAPCO marking by generating repaired
/// candidate strings, parsing each one, filtering, and scoring the survivors.
///
/// Returns `Parsed::Unambiguous` (with `DecoderProvenance` attached) when a
/// single candidate survives or the best candidate beats the runner-up by at
/// least `UNAMBIGUOUS_LOG_MARGIN`; otherwise returns `Parsed::Ambiguous` with
/// the ranked candidates. Every early-out returns `Parsed::Ambiguous` with an
/// empty candidate list.
fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
// The decoder contributes nothing when the caller asks for strict evidence.
if cx.strict_evidence {
return Parsed::Ambiguous {
candidates: Vec::new(),
};
}
// No non-whitespace byte at all -> nothing to decode.
let Some(kind) = infer_marking_type(bytes) else {
return Parsed::Ambiguous {
candidates: Vec::new(),
};
};
// A single-letter portion that is not preceded by whitespace is rejected outright.
if !cx.preceded_by_whitespace
&& matches!(kind, MarkingType::Portion)
&& is_single_letter_portion(bytes)
{
return Parsed::Ambiguous {
candidates: Vec::new(),
};
}
let canonical_attempts = generate_candidate_bytes(bytes);
if canonical_attempts.is_empty() {
return Parsed::Ambiguous {
candidates: Vec::new(),
};
}
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
// Template candidate; the span is replaced per attempt below, `kind` is shared.
let synthetic_candidate = MarkingCandidate {
span: Span::new(0, 0), kind,
};
let mut scored: Vec<ScoredCandidate> = Vec::new();
for attempt in canonical_attempts {
// Each attempt is parsed as a standalone buffer, so the span covers all of it.
let candidate = MarkingCandidate {
span: Span::new(0, attempt.bytes.len()),
..synthetic_candidate
};
// Attempts the parser rejects are dropped silently.
let Ok(mut parsed) = parser.parse(&candidate, &attempt.bytes) else {
continue;
};
// Any token the parser could not classify disqualifies the attempt.
let has_unknown_token = parsed
.attrs
.token_spans
.iter()
.any(|s| matches!(s.kind, marque_ism::TokenKind::Unknown));
if has_unknown_token {
continue;
}
// Token spans are cleared before the marking is built.
// NOTE(review): presumably because they index into the synthetic attempt
// bytes rather than the caller's input — confirm.
parsed.attrs.token_spans = Box::new([]);
let marking = CapcoMarking::new(parsed.attrs);
if !is_nontrivial_marking(&marking) {
continue;
}
if is_us_restricted(&marking) {
continue;
}
// Honour the caller-supplied classification floor, when there is one.
if let Some(floor) = cx.classification_floor
&& !meets_classification_floor(&marking, floor)
{
continue;
}
// Portion and banner markings must carry a classification.
if matches!(kind, MarkingType::Portion | MarkingType::Banner)
&& marking.0.classification.is_none()
{
continue;
}
let (prior, posterior) = score_candidate(&attempt, &marking);
scored.push(ScoredCandidate {
marking,
prior,
posterior,
canonical_bytes: attempt.bytes.into_boxed_slice(),
features: attempt.features,
fix_source: attempt.fix_source,
});
}
if scored.is_empty() {
return Parsed::Ambiguous {
candidates: Vec::new(),
};
}
// Non-finite posteriors are treated as a bug in debug builds and are
// filtered out (rather than panicking) in release builds.
debug_assert!(
scored.iter().all(|c| c.posterior.is_finite()),
"decoder produced non-finite posterior — invariant violated"
);
scored.retain(|c| c.posterior.is_finite());
if scored.is_empty() {
return Parsed::Ambiguous {
candidates: Vec::new(),
};
}
// Highest posterior first; keep only the top K.
scored.sort_by(|a, b| b.posterior.total_cmp(&a.posterior));
scored.truncate(K_MAX_CANDIDATES);
let top_score = scored[0].posterior;
// With no runner-up the margin becomes +infinity.
let runner_up_score = scored
.get(1)
.map(|c| c.posterior)
.unwrap_or(f32::NEG_INFINITY);
let log_margin = top_score - runner_up_score;
if scored.len() == 1 || log_margin >= UNAMBIGUOUS_LOG_MARGIN {
// The remaining order no longer matters once we commit to the top entry.
let top = scored.swap_remove(0);
// Ratio of top to runner-up: exp of the log margin, clamped to f32::MAX on
// overflow; `None` when there was no runner-up.
let runner_up_ratio = if runner_up_score.is_finite() {
let ratio = log_margin.exp();
Some(if ratio.is_finite() { ratio } else { f32::MAX })
} else {
None
};
let mut marking = top.marking;
marking.1 = Some(DecoderProvenance::new(
top.canonical_bytes,
top.posterior,
runner_up_ratio,
top.features
.into_iter()
.map(|f| FeatureContribution {
id: f.id,
delta: f.delta,
})
.collect::<Vec<_>>()
.into_boxed_slice(),
top.fix_source,
));
return Parsed::Unambiguous(marking);
}
// Too close to call: hand back every surviving candidate, best first.
Parsed::Ambiguous {
candidates: scored
.into_iter()
.map(|s| Candidate {
marking: s.marking,
evidence: s.features.iter().map(feature_entry_to_evidence).collect(),
prior_log_odds: s.prior,
})
.collect(),
}
}
}
/// A candidate attempt that parsed and passed every filter, together with its scores.
struct ScoredCandidate {
// Marking built from the parsed attempt.
marking: CapcoMarking,
// First component returned by `score_candidate`; surfaced as `prior_log_odds`.
prior: f32,
// Second component returned by `score_candidate`; the ranking key.
posterior: f32,
// The attempt text that produced `marking`.
canonical_bytes: Box<[u8]>,
// Feature adjustments recorded while generating the attempt.
features: Vec<FeatureEntry>,
// Which decoder path produced the attempt.
fix_source: marque_rules::FixSource,
}
/// One scoring feature attached to a candidate: an identifier plus a signed adjustment.
#[derive(Debug, Clone, Copy)]
struct FeatureEntry {
// Which feature fired.
id: FeatureId,
// Adjustment contributed by the feature; exported as `log_odds` / `delta` downstream.
delta: f32,
}
fn feature_entry_to_evidence(f: &FeatureEntry) -> EvidenceFeature {
EvidenceFeature {
label: f.id.as_str(),
log_odds: f.delta,
}
}
/// One repaired rendering of the input that will be handed to the parser.
struct CanonicalAttempt {
// Candidate text, as UTF-8 bytes.
bytes: Vec<u8>,
// Feature adjustments accumulated by the repairs that produced `bytes`.
features: Vec<FeatureEntry>,
// Which decoder path produced this attempt.
fix_source: marque_rules::FixSource,
}
/// Guesses the marking kind from the raw bytes.
///
/// Returns `None` when the input holds no non-whitespace byte. Otherwise a
/// leading `(` means a portion marking, a recognised CAB header means a CAB
/// line, and anything else is treated as a banner.
fn infer_marking_type(bytes: &[u8]) -> Option<MarkingType> {
    let first = *bytes.iter().find(|b| !b.is_ascii_whitespace())?;
    let kind = if first == b'(' {
        MarkingType::Portion
    } else if is_cab_head(bytes) {
        MarkingType::Cab
    } else {
        MarkingType::Banner
    };
    Some(kind)
}
/// Reports whether `bytes` (after leading whitespace) begins with one of the
/// classification-authority-block headers. Invalid UTF-8 never matches, and
/// the comparison is case-sensitive.
fn is_cab_head(bytes: &[u8]) -> bool {
    const CAB_HEADS: [&str; 3] = ["Classified By:", "Derived From:", "Declassify On:"];
    match std::str::from_utf8(bytes) {
        Ok(text) => {
            let body = text.trim_start();
            CAB_HEADS.iter().any(|head| body.starts_with(*head))
        }
        Err(_) => false,
    }
}
/// Produces the list of repaired renderings of `bytes` to try parsing.
///
/// Pipeline: trim -> delimiter/case normalisation -> REL TO structural repair
/// -> SCI delimiter repair -> fuzzy token correction. The fuzzy-corrected text
/// is emitted first as the baseline attempt; every later attempt is a single
/// additional repair applied to that same fuzzy-corrected text.
///
/// Returns an empty vector for invalid UTF-8 or blank input. At most
/// `K_MAX_CANDIDATES * 2` attempts are kept and duplicates (by bytes) are
/// dropped, first occurrence winning — so emission order below is significant.
fn generate_candidate_bytes(bytes: &[u8]) -> Vec<CanonicalAttempt> {
let Ok(text) = std::str::from_utf8(bytes) else {
return Vec::new();
};
let trimmed = text.trim();
if trimmed.is_empty() {
return Vec::new();
}
let mut attempts: Vec<CanonicalAttempt> = Vec::new();
// Appends an attempt unless the cap is reached or identical bytes were already emitted.
let mut emit =
|bytes: Vec<u8>, features: Vec<FeatureEntry>, fix_source: marque_rules::FixSource| {
if attempts.len() >= K_MAX_CANDIDATES * 2 {
return;
}
if !attempts.iter().any(|a| a.bytes == bytes) {
attempts.push(CanonicalAttempt {
bytes,
features,
fix_source,
});
}
};
// `delim_features` collects the adjustments shared by every attempt.
let (normalized, mut delim_features) = normalize_delimiters_and_case(trimmed);
// Structural REL TO repair; adds one adjustment when it changes the text.
let repaired_text = match try_rel_to_structural_repair(&normalized) {
Some(repaired) => {
delim_features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: -0.3,
});
repaired
}
None => normalized,
};
// SCI delimiter repair, same treatment.
let repaired_text = match try_sci_delimiter_repair(&repaired_text) {
Some(repaired) => {
delim_features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: -0.3,
});
repaired
}
None => repaired_text,
};
let vocab = CapcoTokenSet.correction_vocab();
let matcher = FuzzyVocabMatcher::new(vocab);
let (fuzzy_corrected, fuzzy_features) = fuzzy_correct_tokens(&repaired_text, &matcher);
// Baseline attempt: shared adjustments plus the fuzzy-correction adjustments.
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
emit(
fuzzy_corrected.clone().into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
// Variant: segments put back into canonical order.
if let Some(reordered) = try_canonical_reorder(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(FeatureEntry {
id: FeatureId::TokenReorder,
delta: -0.4,
});
emit(
reordered.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variant: non-US prefix added (recorded under the same TokenReorder feature).
if let Some(prefixed) = try_add_non_us_prefix(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(FeatureEntry {
id: FeatureId::TokenReorder,
delta: -0.4,
});
emit(
prefixed.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variant: missing `//` delimiters inserted between whitespace-separated tokens.
if let Some(delim_inserted) = try_insert_delimiter(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: -0.3,
});
emit(
delim_inserted.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variant: malformed `SAR` indicator rewritten to `SAR-`.
if let Some(sar_repaired) = try_sar_indicator_repair(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: -0.3,
});
emit(
sar_repaired.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variants: each way of collapsing the first `/X/` pattern into a `//` delimiter.
for candidate in try_collapse_stray_char_slash(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: -0.3,
});
emit(
candidate.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variants: fuzzy trigraph replacements inside REL TO lists. Each carries the
// edit-distance feature returned by the helper plus a zero-delta marker entry.
let trigraph_matcher = FuzzyVocabMatcher::new(marque_ism::TRIGRAPHS);
for (alt_text, edit_feature) in
try_rel_to_fuzzy_trigraph_candidates(&fuzzy_corrected, &trigraph_matcher)
{
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(edit_feature);
features.push(FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: 0.0,
});
emit(
alt_text.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variants: a short first REL TO entry replaced by `USA`.
for (alt_text, prior_feature) in try_rel_to_usa_injection_candidates(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
features.push(prior_feature);
emit(
alt_text.into_bytes(),
features,
marque_rules::FixSource::DecoderPosterior,
);
}
// Variant: leading classification token guessed by heuristic. This is the only
// attempt tagged `DecoderClassificationHeuristic`, and it adds no extra feature.
if let Some(heuristic_bytes) = try_classification_heuristic_fix(&fuzzy_corrected) {
let mut features = delim_features.clone();
features.extend(fuzzy_features.iter().copied());
emit(
heuristic_bytes.into_bytes(),
features,
marque_rules::FixSource::DecoderClassificationHeuristic,
);
}
attempts
}
/// Harness-only hook exposing the raw candidate byte strings generated for
/// `bytes`, in emission order, without features or provenance.
#[cfg(feature = "decoder-harness")]
pub fn diagnostic_canonical_attempts(bytes: &[u8]) -> Vec<Vec<u8>> {
    let mut raw_attempts = Vec::new();
    for attempt in generate_candidate_bytes(bytes) {
        raw_attempts.push(attempt.bytes);
    }
    raw_attempts
}
/// Canonicalises `//` delimiters and upper-cases ASCII letters.
///
/// Returns the normalised text plus a feature list that holds a single
/// `BaseRateCommonMarking` entry (delta `-0.3`) when either a delimiter was
/// rewritten or lowercase ASCII was present, and is empty otherwise.
///
/// The replacement table is ordered deliberately. Spaced-slash spellings
/// (`/ /`) are collapsed first, longest pattern first, and only then is
/// whitespace around `//` trimmed. With the trimming patterns first, the
/// `" / / "` entry could never match (the shorter `"/ / "` consumed it) and
/// inputs such as `SECRET / / NOFORN` kept a stray space before the delimiter.
fn normalize_delimiters_and_case(text: &str) -> (String, Vec<FeatureEntry>) {
    const REPLACEMENTS: [(&str, &str); 7] = [
        // Unicode division-slash pair -> ASCII delimiter.
        ("∕∕", "//"),
        // Spaced-slash spellings, longest first so surrounding spaces go too.
        (" / / ", "//"),
        ("/ / ", "//"),
        ("/ /", "//"),
        // Finally trim whitespace hugging any `//` (including ones just created).
        (" // ", "//"),
        ("// ", "//"),
        (" //", "//"),
    ];

    let mut normalized = text.to_owned();
    let mut delim_changed = false;
    for (from, to) in REPLACEMENTS {
        if normalized.contains(from) {
            normalized = normalized.replace(from, to);
            delim_changed = true;
        }
    }

    // Only ASCII letters are case-folded; other characters are left untouched.
    let had_lowercase = normalized.bytes().any(|b| b.is_ascii_lowercase());
    if had_lowercase {
        normalized.make_ascii_uppercase();
    }

    let mut features = Vec::new();
    if delim_changed || had_lowercase {
        features.push(FeatureEntry {
            id: FeatureId::BaseRateCommonMarking,
            delta: -0.3,
        });
    }
    (normalized, features)
}
/// Walks `text` token by token and rewrites tokens that look like misspelled or
/// superseded vocabulary, copying everything else through unchanged.
///
/// Returns the rewritten text and one `FeatureEntry` per rewrite performed.
/// Per token, the first rule that applies wins:
/// 1. exact match in `SUPERSEDED_TOKEN_MAP` -> replacement;
/// 2. token merely contains a superseded token -> that substring is replaced;
/// 3. token is already canonical or a trigraph -> kept;
/// 4. fuzzy matcher offers a correction -> corrected;
/// 5. otherwise kept as-is.
fn fuzzy_correct_tokens(
text: &str,
matcher: &FuzzyVocabMatcher<'_>,
) -> (String, Vec<FeatureEntry>) {
let mut features = Vec::new();
let mut out = String::with_capacity(text.len());
let mut rest = text;
while !rest.is_empty() {
// Copy through any run of non-token characters (summed in bytes so the
// slice below stays on a char boundary).
let non_token_len = rest
.chars()
.take_while(|c| !is_token_char(*c))
.map(|c| c.len_utf8())
.sum::<usize>();
if non_token_len > 0 {
out.push_str(&rest[..non_token_len]);
rest = &rest[non_token_len..];
continue;
}
let token_len = scan_token(rest);
// Defensive: `rest` starts with a token character here, so a zero length is not expected.
if token_len == 0 {
break;
}
let (token, tail) = rest.split_at(token_len);
rest = tail;
// Rule 1: whole token is a superseded spelling.
if let Some(replacement) = SUPERSEDED_TOKEN_MAP
.iter()
.find(|&&(from, _)| from == token)
.map(|&(_, to)| to)
{
out.push_str(replacement);
features.push(FeatureEntry {
id: FeatureId::SupersededToken,
delta: -0.2,
});
continue;
}
// Rule 2: superseded spelling embedded inside a longer token (e.g. a hyphenated compound).
let embedded_replacement = SUPERSEDED_TOKEN_MAP
.iter()
.find(|&&(from, _)| token != from && token.contains(from))
.map(|&(from, to)| token.replace(from, to));
if let Some(replaced) = embedded_replacement {
out.push_str(&replaced);
features.push(FeatureEntry {
id: FeatureId::SupersededToken,
delta: -0.2,
});
continue;
}
// Rule 3: already valid — never fuzz a token the vocabulary accepts.
if CapcoTokenSet.canonicalize(token).is_some() || CapcoTokenSet.is_trigraph(token) {
out.push_str(token);
continue;
}
// Rule 4: fuzzy correction; the adjustment grows with edit distance
// (none at 0, -0.5 at 1, -1.2 for anything larger).
if let Some(correction) = matcher.correct(token) {
out.push_str(correction.token);
let feature = match correction.distance {
0 => None,
1 => Some(FeatureEntry {
id: FeatureId::EditDistance1,
delta: -0.5,
}),
_ => Some(FeatureEntry {
id: FeatureId::EditDistance2,
delta: -1.2,
}),
};
if let Some(entry) = feature {
features.push(entry);
}
continue;
}
// Rule 5: nothing applies; keep the token verbatim.
out.push_str(token);
}
(out, features)
}
/// True for the characters that can start or continue a vocabulary token:
/// ASCII digits and ASCII letters of either case.
fn is_token_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'A'..='Z' | 'a'..='z')
}
/// Returns the byte length of the token at the start of `text`.
///
/// A token is a run of ASCII alphanumerics that may contain hyphens, provided
/// each hyphen is neither the first byte nor followed by a non-alphanumeric
/// byte (or the end of the text).
fn scan_token(text: &str) -> usize {
    let raw = text.as_bytes();
    let continues_token = |idx: usize| -> bool {
        let byte = raw[idx];
        if byte.is_ascii_alphanumeric() {
            return true;
        }
        byte == b'-'
            && idx > 0
            && matches!(raw.get(idx + 1), Some(next) if next.is_ascii_alphanumeric())
    };
    (0..raw.len())
        .find(|&idx| !continues_token(idx))
        .unwrap_or(raw.len())
}
/// `(old, new)` pairs of token spellings that are rewritten during fuzzy
/// correction, both as whole tokens and when embedded in a longer token.
const SUPERSEDED_TOKEN_MAP: &[(&str, &str)] = &[("COMINT", "SI")];
/// Tries to replace a garbled leading classification token (1–3 bytes long)
/// with the classification it most plausibly stands for.
///
/// Returns `None` when the text is a CAB header, when the first token is
/// already a canonical short classification, when nothing but whitespace
/// follows the first token, or when no per-length heuristic recognises the
/// token. Otherwise returns the text with only that token replaced; enclosing
/// parentheses and all whitespace are preserved.
fn try_classification_heuristic_fix(text: &str) -> Option<String> {
    if is_cab_head(text.as_bytes()) {
        return None;
    }

    // Peel one pair of enclosing parentheses, remembering them for reassembly.
    let (open_paren, body, close_paren) = match text
        .strip_prefix('(')
        .and_then(|inner| inner.strip_suffix(')'))
    {
        Some(inner) => ("(", inner, ")"),
        None => ("", text, ""),
    };

    // First `//`-delimited segment, and everything from the delimiter onwards.
    let (first_seg, after_first_seg) = body.split_at(body.find("//").unwrap_or(body.len()));

    // Leading whitespace of the segment (empty when the segment is blank).
    let ws_len = first_seg
        .find(|c: char| !c.is_whitespace())
        .unwrap_or(0);
    let (leading_ws, after_leading_ws) = first_seg.split_at(ws_len);

    // First whitespace-delimited token of the segment.
    let token_len = after_leading_ws
        .find(char::is_whitespace)
        .unwrap_or(after_leading_ws.len());
    let (first_token, after_first_token) = after_leading_ws.split_at(token_len);

    if is_canonical_short_classification(first_token) {
        return None;
    }

    // A lone token carries too little context to justify guessing.
    let is_blank = |s: &str| s.chars().all(char::is_whitespace);
    if is_blank(after_first_token) && is_blank(after_first_seg) {
        return None;
    }

    let replacement = match first_token.len() {
        1 => try_1char_classification_heuristic(first_token)?,
        2 => try_2char_classification_heuristic(first_token)?,
        3 => try_3char_classification_heuristic(first_token)?,
        _ => return None,
    };

    Some(
        [
            open_paren,
            leading_ws,
            replacement,
            after_first_token,
            after_first_seg,
            close_paren,
        ]
        .concat(),
    )
}
/// True when `token` is exactly one of the accepted short classification
/// spellings (case-sensitive).
fn is_canonical_short_classification(token: &str) -> bool {
    const SHORT_FORMS: &[&str] = &["U", "R", "C", "S", "TS", "TOP"];
    SHORT_FORMS.contains(&token)
}
/// Guesses the classification meant by a two-byte token (case-insensitive).
///
/// A first byte from {T,R,Y,H,G,F} followed by one from {A,W,E,Z,S} maps to
/// `TS`; `T` followed by `P` or `O` maps to `TOP`; anything else is `None`.
/// The token is expected to be exactly two bytes long.
fn try_2char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 2);
    match (raw[0].to_ascii_uppercase(), raw[1].to_ascii_uppercase()) {
        (b'T' | b'R' | b'Y' | b'H' | b'G' | b'F', b'A' | b'W' | b'E' | b'Z' | b'S') => Some("TS"),
        (b'T', b'P' | b'O') => Some("TOP"),
        _ => None,
    }
}
/// Guesses the classification meant by a three-byte token: the transposition
/// `OTP` (any case) maps to `TOP`; everything else is `None`.
/// The token is expected to be exactly three bytes long.
fn try_3char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 3);
    match [raw[0], raw[1], raw[2]].map(|b| b.to_ascii_uppercase()) {
        [b'O', b'T', b'P'] => Some("TOP"),
        _ => None,
    }
}
/// Guesses the classification meant by a one-byte token (case-insensitive):
/// `A`, `W`, `E`, `Z` or `X` map to `S`; `V` or `F` map to `C`; anything else
/// is `None`. The token is expected to be exactly one byte long.
fn try_1char_classification_heuristic(token: &str) -> Option<&'static str> {
    let raw = token.as_bytes();
    debug_assert_eq!(raw.len(), 1);
    let key = raw[0].to_ascii_uppercase();
    if matches!(key, b'A' | b'W' | b'E' | b'Z' | b'X') {
        Some("S")
    } else if matches!(key, b'V' | b'F') {
        Some("C")
    } else {
        None
    }
}
/// Rebuilds `text`, replacing selected whitespace gaps between tokens with `//`.
///
/// Scans left to right, tracking the previous token, whether the scan is still
/// inside the leading classification, and whether a `//` has been seen. Whether
/// a gap becomes a delimiter is decided by `decide_insert_delimiter`; at most
/// `MAX_DELIMITER_INSERTIONS` gaps are converted. Returns `None` when nothing
/// was inserted.
fn try_insert_delimiter(text: &str) -> Option<String> {
let bytes = text.as_bytes();
let mut result = String::with_capacity(text.len() + 8);
let mut insertions = 0;
let mut prev_token: Option<&str> = None;
let mut in_classification = true;
let mut seen_double_slash = false;
let mut i = 0;
while i < bytes.len() {
// Existing delimiter: copy it and reset the per-segment state.
if bytes[i] == b'/' && i + 1 < bytes.len() && bytes[i + 1] == b'/' {
result.push_str("//");
seen_double_slash = true;
in_classification = false;
prev_token = None;
i += 2;
continue;
}
// Whitespace run: look at the token that follows to decide whether the
// gap should become a delimiter.
if bytes[i].is_ascii_whitespace() {
let ws_start = i;
while i < bytes.len() && bytes[i].is_ascii_whitespace() {
i += 1;
}
let ws = &text[ws_start..i];
let token_start = i;
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
i += 1;
}
// Whitespace not followed by a token: copy it through untouched.
if token_start == i {
result.push_str(ws);
continue;
}
let next_token = &text[token_start..i];
let should_insert = decide_insert_delimiter(
prev_token,
next_token,
in_classification,
seen_double_slash,
);
if should_insert && insertions < MAX_DELIMITER_INSERTIONS {
// The whitespace is dropped and replaced by the delimiter.
result.push_str("//");
insertions += 1;
seen_double_slash = true;
in_classification = false;
} else {
result.push_str(ws);
}
result.push_str(next_token);
if !is_classification_continuation(next_token, prev_token) {
in_classification = false;
}
prev_token = Some(next_token);
continue;
}
let other_start = i;
// Token not preceded by whitespace (start of text, or right after
// punctuation): copied verbatim, only the state is updated.
if bytes[i].is_ascii_alphanumeric() {
while i < bytes.len() && (bytes[i].is_ascii_alphanumeric() || bytes[i] == b'-') {
i += 1;
}
let leading_token = &text[other_start..i];
result.push_str(leading_token);
if !is_classification_continuation(leading_token, prev_token) {
in_classification = false;
}
prev_token = Some(leading_token);
continue;
}
// Any other character (possibly multi-byte): copy one whole char so the
// byte cursor stays on a char boundary.
let ch = text[i..]
.chars()
.next()
.expect("byte index must remain on a char boundary");
result.push(ch);
i += ch.len_utf8();
}
if insertions == 0 { None } else { Some(result) }
}
/// Maximum number of `//` delimiters `try_insert_delimiter` will insert into one text.
const MAX_DELIMITER_INSERTIONS: usize = 4;
/// Decides whether the whitespace before `next_token` should become `//`.
///
/// `NOFORN` directly after `SBU` or `LES` is never split off. Otherwise a
/// delimiter is inserted when the scan is leaving the classification (still in
/// it, no `//` seen yet, and `next_token` is not a classification word) or
/// when `next_token` is a hard splitter.
fn decide_insert_delimiter(
    prev_token: Option<&str>,
    next_token: &str,
    in_classification: bool,
    seen_double_slash: bool,
) -> bool {
    let glued_to_previous = next_token == "NOFORN" && matches!(prev_token, Some("SBU" | "LES"));
    if glued_to_previous {
        return false;
    }
    let leaving_classification =
        in_classification && !seen_double_slash && !is_classification_token(next_token);
    leaving_classification || is_hard_splitter(next_token)
}
/// True when `token` is a classification word, in either its short or its
/// spelled-out form (case-sensitive, exact match).
fn is_classification_token(token: &str) -> bool {
    const CLASSIFICATION_WORDS: &[&str] = &[
        "U",
        "R",
        "C",
        "S",
        "TS",
        "TOP",
        "UNCLASSIFIED",
        "RESTRICTED",
        "CONFIDENTIAL",
        "SECRET",
    ];
    CLASSIFICATION_WORDS.contains(&token)
}
/// True when `next_token` keeps the scan inside the classification: either it
/// completes `TOP SECRET` or it is itself a classification word.
fn is_classification_continuation(next_token: &str, prev_token: Option<&str>) -> bool {
    let completes_top_secret = prev_token == Some("TOP") && next_token == "SECRET";
    completes_top_secret || is_classification_token(next_token)
}
/// True when `token` is one of the control words that always warrants a `//`
/// in front of it (case-sensitive, exact match).
fn is_hard_splitter(token: &str) -> bool {
    const HARD_SPLITTERS: &[&str] = &[
        "NOFORN",
        "ORCON",
        "ORCON-USGOV",
        "PROPIN",
        "IMCON",
        "RELIDO",
        "RSEN",
        "EYESONLY",
        "FOUO",
        "FISA",
        "DSEN",
        "EXDIS",
        "NODIS",
        "LIMDIS",
    ];
    HARD_SPLITTERS.contains(&token)
}
/// Repairs malformed `SAR` indicators at token boundaries.
///
/// Two shapes are handled, checked in this order at every boundary position:
/// stray uppercase letters directly before `SAR-` (the letters are dropped),
/// and `SAR` immediately followed by a short alphanumeric run with no hyphen
/// (a hyphen is inserted). Returns `None` when no rewrite applied.
fn try_sar_indicator_repair(text: &str) -> Option<String> {
// Cheap pre-filter: nothing to do without the literal `SAR`.
if !text.contains("SAR") {
return None;
}
let bytes = text.as_bytes();
// Allocated lazily on the first rewrite so the no-op path does not allocate.
let mut result: Option<String> = None;
// Everything before `last_copied` has already been written to `result`.
let mut last_copied: usize = 0;
let mut i = 0;
while i < bytes.len() {
// Only positions at the start of the text or right after `/`, `(` or
// ASCII whitespace are considered.
let at_boundary =
i == 0 || matches!(bytes[i - 1], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r');
if at_boundary {
// Shape 1: `<1-3 uppercase letters>SAR-` -> `SAR-`.
if let Some((_prefix_len, post)) = match_sar_prefix(bytes, i) {
let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
r.push_str(&text[last_copied..i]);
r.push_str("SAR-");
last_copied = post;
i = post;
continue;
}
// Shape 2: `SARxx` / `SARxxx` -> `SAR-xx` / `SAR-xxx`.
if let Some(end) = match_sar_missing_hyphen(bytes, i) {
let r = result.get_or_insert_with(|| String::with_capacity(text.len() + 4));
r.push_str(&text[last_copied..i]);
r.push_str("SAR-");
r.push_str(&text[i + 3..end]);
last_copied = end;
i = end;
continue;
}
}
// Advance by one whole char so `i` stays on a char boundary.
let ch = text[i..]
.chars()
.next()
.expect("byte index must remain on a char boundary");
i += ch.len_utf8();
}
// Flush the untouched tail, but only if a rewrite actually happened.
result.map(|mut r| {
r.push_str(&text[last_copied..]);
r
})
}
/// Looks for `SAR-` preceded by one to three ASCII uppercase letters starting
/// at byte offset `i`.
///
/// Returns `(prefix_len, end)` for the shortest matching prefix, where `end`
/// is the offset just past `SAR-`. Returns `None` when the input is too
/// short, a prefix byte is not uppercase, or no prefix length matches.
fn match_sar_prefix(bytes: &[u8], i: usize) -> Option<(usize, usize)> {
    const INDICATOR: &[u8] = b"SAR-";
    for prefix_len in 1..=3usize {
        let sar_start = i + prefix_len;
        let sar_end = sar_start + INDICATOR.len();
        // Running out of input ends the search: longer prefixes need even more bytes.
        let window = bytes.get(sar_start..sar_end)?;
        if bytes[i..sar_start].iter().any(|b| !b.is_ascii_uppercase()) {
            return None;
        }
        if window == INDICATOR {
            return Some((prefix_len, sar_end));
        }
    }
    None
}
/// Detects `SAR` at offset `i` followed directly by a two- or three-byte ASCII
/// alphanumeric run that ends at the end of input or at `-`, `/`, or ASCII
/// whitespace.
///
/// Returns the offset just past that run, or `None` when the shape does not match.
fn match_sar_missing_hyphen(bytes: &[u8], i: usize) -> Option<usize> {
    if bytes.get(i..i + 3) != Some(&b"SAR"[..]) {
        return None;
    }
    let tail = &bytes[i + 3..];
    let run = tail
        .iter()
        .take_while(|b| b.is_ascii_alphanumeric())
        .count();
    if run < 2 || run > 3 {
        return None;
    }
    match tail.get(run) {
        None | Some(b'-' | b'/' | b' ' | b'\t' | b'\n' | b'\r') => Some(i + 3 + run),
        Some(_) => None,
    }
}
/// Finds the first `/X/` (single ASCII alphanumeric between two slashes) that
/// has an ASCII alphanumeric on both sides, and proposes three rewrites of it.
///
/// The rewrites, in order, are: drop the stray character (`//`), keep it after
/// the delimiter (`//X`), and keep it before the delimiter (`X//`). Returns an
/// empty vector when no such pattern exists; only the first occurrence is used.
fn try_collapse_stray_char_slash(text: &str) -> Vec<String> {
    let raw = text.as_bytes();
    let hit = raw.windows(3).enumerate().find(|(at, win)| {
        win[0] == b'/'
            && win[1].is_ascii_alphanumeric()
            && win[2] == b'/'
            && *at > 0
            && raw[*at - 1].is_ascii_alphanumeric()
            && matches!(raw.get(*at + 3), Some(next) if next.is_ascii_alphanumeric())
    });
    let Some((at, win)) = hit else {
        return Vec::new();
    };
    // All three matched bytes are ASCII, so both cut points are char boundaries.
    let stray = win[1] as char;
    let head = &text[..at];
    let tail = &text[at + 3..];
    vec![
        format!("{head}//{tail}"),
        format!("{head}//{stray}{tail}"),
        format!("{head}{stray}//{tail}"),
    ]
}
/// Runs the two REL TO repairs in sequence — header normalisation, then entry
/// normalisation on whatever the header step produced — and returns the final
/// text, or `None` when neither step changed anything.
fn try_rel_to_structural_repair(text: &str) -> Option<String> {
    // Cheap pre-filter: neither repair can fire without `REL`.
    if !text.contains("REL") {
        return None;
    }
    let header_fixed = try_rel_to_header_normalize(text);
    let entry_fixed = try_rel_to_entry_normalize(header_fixed.as_deref().unwrap_or(text));
    // The entry repair already includes the header repair when both fired.
    entry_fixed.or(header_fixed)
}
/// Rewrites the garbled headers `REL OT ` and `RELT O ` to `REL TO ` wherever
/// they begin at the start of the text or right after `/`, `(`, or ASCII
/// whitespace. Returns `None` when no header needed fixing.
fn try_rel_to_header_normalize(text: &str) -> Option<String> {
    const HEADER_LEN: usize = 7;
    let raw = text.as_bytes();
    let mut rebuilt: Option<String> = None;
    // Bytes before `copied_to` are already in `rebuilt`.
    let mut copied_to = 0usize;
    // Char positions below `resume_at` were consumed by an earlier rewrite.
    let mut resume_at = 0usize;

    for (pos, _) in text.char_indices() {
        if pos < resume_at {
            continue;
        }
        let at_boundary =
            pos == 0 || matches!(raw[pos - 1], b'/' | b'(' | b' ' | b'\t' | b'\n' | b'\r');
        if !at_boundary {
            continue;
        }
        let garbled = raw
            .get(pos..pos + HEADER_LEN)
            .map_or(false, |window| window == b"REL OT " || window == b"RELT O ");
        if !garbled {
            continue;
        }
        let out = rebuilt.get_or_insert_with(|| String::with_capacity(text.len()));
        out.push_str(&text[copied_to..pos]);
        out.push_str("REL TO ");
        copied_to = pos + HEADER_LEN;
        resume_at = copied_to;
    }

    rebuilt.map(|mut out| {
        out.push_str(&text[copied_to..]);
        out
    })
}
/// Repeatedly applies single-entry REL TO repairs until a pass finds nothing
/// left to fix. Returns the fully repaired text, or `None` when the text has
/// no `REL TO ` header or the very first pass changed nothing.
fn try_rel_to_entry_normalize(text: &str) -> Option<String> {
    if !text.contains("REL TO ") {
        return None;
    }
    let token_set = CapcoTokenSet;
    // Latest rewritten text; stays `None` until a pass succeeds.
    let mut latest: Option<String> = None;
    loop {
        let pass = apply_rel_to_entry_pass(latest.as_deref().unwrap_or(text), &token_set);
        match pass {
            Some(rewritten) => latest = Some(rewritten),
            None => return latest,
        }
    }
}
fn apply_rel_to_entry_pass(text: &str, token_set: &CapcoTokenSet) -> Option<String> {
let mut search_start = 0;
while let Some(rel_pos) = text[search_start..].find("REL TO ") {
let header_end = search_start + rel_pos + "REL TO ".len();
let block_end = text[header_end..]
.find("//")
.map(|p| header_end + p)
.unwrap_or(text.len());
let block = &text[header_end..block_end];
if let Some((rel_local_offset, fixed_block)) = fix_rel_to_block(block, token_set) {
let mut result = String::with_capacity(text.len());
result.push_str(&text[..header_end]);
result.push_str(&fixed_block);
result.push_str(&text[block_end..]);
let _ = rel_local_offset;
return Some(result);
}
search_start = block_end;
}
None
}
/// Applies one repair to a comma-separated REL TO entry list.
///
/// Two shapes are tried, in order:
/// 1. a single entry of the form `X YZ` (letter, space, two letters) whose
///    letters join into a known trigraph — the space is removed;
/// 2. a two-letter entry followed by an entry starting with `Z ` where the
///    three letters join into a known trigraph — i.e. a comma was typed in the
///    middle of a trigraph; the letter is moved across the comma.
///
/// Returns the byte offset (within `block`) of the first entry touched and the
/// rewritten block, or `None` when neither shape is present.
fn fix_rel_to_block(block: &str, token_set: &CapcoTokenSet) -> Option<(usize, String)> {
// Entries paired with their starting byte offset inside `block`.
let mut entries: Vec<(usize, &str)> = Vec::new();
let mut cursor = 0;
for entry in block.split(',') {
entries.push((cursor, entry));
// +1 skips the comma that `split` consumed.
cursor += entry.len() + 1; }
// Shape 1: `X YZ` inside a single entry.
for (entry_offset, entry) in &entries {
let trimmed = entry.trim();
if trimmed.len() != 4 {
continue;
}
let bytes = trimmed.as_bytes();
if !bytes[0].is_ascii_uppercase()
|| bytes[1] != b' '
|| !bytes[2].is_ascii_uppercase()
|| !bytes[3].is_ascii_uppercase()
{
continue;
}
let joined = format!(
"{}{}{}",
bytes[0] as char, bytes[2] as char, bytes[3] as char
);
if !token_set.is_trigraph(&joined) {
continue;
}
// Leave the entry alone if the lone letter is itself accepted by `is_trigraph`.
let one_letter = std::str::from_utf8(&bytes[..1]).expect("ASCII upper");
if token_set.is_trigraph(one_letter) {
continue;
}
// Rebuild the entry, preserving its leading and trailing whitespace.
let lead_ws_len = entry.len() - entry.trim_start().len();
let mut rewritten_entry = String::with_capacity(entry.len() - 1);
rewritten_entry.push_str(&entry[..lead_ws_len]);
rewritten_entry.push_str(&joined);
rewritten_entry.push_str(&entry[lead_ws_len + trimmed.len()..]);
let mut result = String::with_capacity(block.len());
result.push_str(&block[..*entry_offset]);
result.push_str(&rewritten_entry);
result.push_str(&block[*entry_offset + entry.len()..]);
return Some((*entry_offset, result));
}
// Shape 2: `XY` , `Z rest` across two adjacent entries.
for i in 0..entries.len().saturating_sub(1) {
let (left_off, left_entry) = &entries[i];
let (right_off, right_entry) = &entries[i + 1];
let left_trim = left_entry.trim();
let right_trim_start = right_entry.trim_start();
if left_trim.len() != 2 || !left_trim.chars().all(|c| c.is_ascii_uppercase()) {
continue;
}
let right_bytes = right_trim_start.as_bytes();
if right_bytes.len() < 2 || !right_bytes[0].is_ascii_uppercase() || right_bytes[1] != b' ' {
continue;
}
let joined = format!("{}{}", left_trim, right_bytes[0] as char);
if !token_set.is_trigraph(&joined) {
continue;
}
// Leave the pair alone if the two-letter entry is itself accepted by `is_trigraph`.
if token_set.is_trigraph(left_trim) {
continue;
}
// Left entry becomes the joined trigraph; its trailing whitespace is dropped.
let left_lead = left_entry.len() - left_entry.trim_start().len();
let mut new_left = String::with_capacity(left_entry.len() + 1);
new_left.push_str(&left_entry[..left_lead]);
new_left.push_str(&joined);
// Right entry loses its first letter and the space after it; a single
// space is written back in their place.
let right_lead = right_entry.len() - right_trim_start.len();
let after_first = &right_trim_start[2..];
let mut new_right = String::with_capacity(right_entry.len());
new_right.push_str(&right_entry[..right_lead]);
new_right.push(' ');
new_right.push_str(after_first);
let mut result = String::with_capacity(block.len() + 1);
result.push_str(&block[..*left_off]);
result.push_str(&new_left);
result.push(',');
result.push_str(&new_right);
result.push_str(&block[*right_off + right_entry.len()..]);
return Some((*left_off, result));
}
None
}
/// Proposes alternative texts in which one unrecognised REL TO entry is
/// replaced by a fuzzy-matched trigraph.
///
/// For every `REL TO ` block (ending at the first `//`, newline, or `)`), each
/// entry of three or four ASCII uppercase letters that is not already a
/// trigraph is looked up in `trigraph_matcher`. Matches already present
/// elsewhere in the block are discarded, and up to four of the remainder are
/// kept (closest edit distance first, then highest country-code prior).
///
/// Each result pairs the full rewritten text with an edit-distance feature.
fn try_rel_to_fuzzy_trigraph_candidates(
text: &str,
trigraph_matcher: &FuzzyVocabMatcher<'_>,
) -> Vec<(String, FeatureEntry)> {
let token_set = CapcoTokenSet;
let mut out: Vec<(String, FeatureEntry)> = Vec::new();
let mut search_start = 0;
while let Some(rel_pos) = text[search_start..].find("REL TO ") {
let header_end = search_start + rel_pos + "REL TO ".len();
let tail = &text[header_end..];
// The block ends at whichever terminator comes first, else at end of text.
let block_len = ["//", "\n", ")"]
.iter()
.filter_map(|sep| tail.find(sep))
.min()
.unwrap_or(tail.len());
let block_end = header_end + block_len;
let block = &text[header_end..block_end];
// `cursor` tracks each entry's byte offset within `block`.
let mut cursor = 0usize;
for entry in block.split(',') {
let entry_start = cursor;
let entry_end = cursor + entry.len();
cursor = entry_end + 1;
let trimmed = entry.trim();
let tlen = trimmed.len();
// Only 3- or 4-letter all-uppercase entries are candidates for correction.
if (tlen != 3 && tlen != 4) || !trimmed.bytes().all(|b| b.is_ascii_uppercase()) {
continue;
}
// Already valid: nothing to fix.
if token_set.is_trigraph(trimmed) {
continue;
}
let mut candidates = trigraph_matcher.correct_all_with_floor(trimmed, 0.0);
if candidates.is_empty() {
continue;
}
// Valid trigraphs already listed in this block; proposing one of them
// again would produce a duplicate entry.
let other_trigraphs: Vec<&str> = block
.split(',')
.map(str::trim)
.filter(|e| {
let elen = e.len();
(elen == 3 || elen == 4)
&& e.bytes().all(|b| b.is_ascii_uppercase())
&& *e != trimmed
&& token_set.is_trigraph(e)
})
.collect();
candidates.retain(|c| !other_trigraphs.contains(&c.token));
if candidates.is_empty() {
continue;
}
const TRIGRAPH_FUZZY_TOP_K: usize = 4;
// Closest edit distance first; ties broken by higher prior, with
// codes that have no prior sorted last.
candidates.sort_by(|a, b| {
a.distance.cmp(&b.distance).then_with(|| {
let pa = marque_capco::priors::country_code_log_prior(a.token)
.unwrap_or(f32::NEG_INFINITY);
let pb = marque_capco::priors::country_code_log_prior(b.token)
.unwrap_or(f32::NEG_INFINITY);
pb.total_cmp(&pa)
})
});
candidates.truncate(TRIGRAPH_FUZZY_TOP_K);
for cand in &candidates {
// Swap in the candidate while keeping the entry's surrounding whitespace.
let lead_ws_len = entry.len() - entry.trim_start().len();
let trail_ws_len = entry.len() - entry.trim_end().len();
let mut rewritten_entry = String::with_capacity(entry.len());
rewritten_entry.push_str(&entry[..lead_ws_len]);
rewritten_entry.push_str(cand.token);
rewritten_entry.push_str(&entry[entry.len() - trail_ws_len..]);
let mut alt = String::with_capacity(text.len());
alt.push_str(&text[..header_end + entry_start]);
alt.push_str(&rewritten_entry);
alt.push_str(&text[header_end + entry_end..]);
// Shadows the loop's `entry` for the rest of this iteration only:
// distance <= 1 maps to EditDistance1, anything larger to EditDistance2.
let entry = if cand.distance <= 1 {
FeatureEntry {
id: FeatureId::EditDistance1,
delta: -0.5,
}
} else {
FeatureEntry {
id: FeatureId::EditDistance2,
delta: -1.2,
}
};
out.push((alt, entry));
}
}
search_start = block_end;
}
out
}
/// Proposes alternative texts in which a truncated first REL TO entry is
/// replaced by `USA`.
///
/// A block qualifies when it has at least two entries, its first entry is one
/// or two ASCII uppercase letters, and no later entry is already `USA`. Each
/// result pairs the rewritten text with a zero-delta `BaseRateCommonMarking`
/// feature.
fn try_rel_to_usa_injection_candidates(text: &str) -> Vec<(String, FeatureEntry)> {
let mut out: Vec<(String, FeatureEntry)> = Vec::new();
let mut search_start = 0;
while let Some(rel_pos) = text[search_start..].find("REL TO ") {
let header_end = search_start + rel_pos + "REL TO ".len();
let tail = &text[header_end..];
// The block ends at the first `//`, newline, or `)`, else at end of text.
let block_len = ["//", "\n", ")"]
.iter()
.filter_map(|sep| tail.find(sep))
.min()
.unwrap_or(tail.len());
let block_end = header_end + block_len;
let block = &text[header_end..block_end];
// Entries paired with their byte offset inside `block`.
let entries: Vec<(usize, &str)> = {
let mut v = Vec::with_capacity(block.bytes().filter(|&b| b == b',').count() + 1);
let mut cursor = 0usize;
for entry in block.split(',') {
v.push((cursor, entry));
// +1 skips the comma that `split` consumed.
cursor += entry.len() + 1; }
v
};
// A single-entry list is left alone.
if entries.len() < 2 {
search_start = block_end;
continue;
}
let (first_entry_offset, first_entry) = entries[0];
let trimmed = first_entry.trim();
// Only a 1–2 letter uppercase stub is treated as a truncated first entry.
let is_short =
(1..=2).contains(&trimmed.len()) && trimmed.bytes().all(|b| b.is_ascii_uppercase());
if !is_short {
search_start = block_end;
continue;
}
// Injecting `USA` when it is already listed would duplicate it.
let already_has_usa = entries.iter().skip(1).any(|(_, e)| e.trim() == "USA");
if already_has_usa {
search_start = block_end;
continue;
}
// Swap in `USA` while keeping the entry's surrounding whitespace.
let lead_ws_len = first_entry.len() - first_entry.trim_start().len();
let trail_ws_len = first_entry.len() - first_entry.trim_end().len();
let mut rewritten_entry = String::with_capacity(first_entry.len() + 3);
rewritten_entry.push_str(&first_entry[..lead_ws_len]);
rewritten_entry.push_str("USA");
rewritten_entry.push_str(&first_entry[first_entry.len() - trail_ws_len..]);
let mut alt = String::with_capacity(text.len() + 3);
alt.push_str(&text[..header_end + first_entry_offset]);
alt.push_str(&rewritten_entry);
alt.push_str(&text[header_end + first_entry_offset + first_entry.len()..]);
let entry = FeatureEntry {
id: FeatureId::BaseRateCommonMarking,
delta: 0.0,
};
out.push((alt, entry));
search_start = block_end;
}
out
}
/// Repairs SCI control tokens whose internal delimiter is missing or wrong.
///
/// Splits the text into tokens at `/`, `(`, `)`, `,` and ASCII whitespace and
/// offers each token to `repair_sci_token`; repaired tokens are substituted in
/// place. Returns `None` when the text contains no SCI root substring, is not
/// pure ASCII, or no token was repaired.
fn try_sci_delimiter_repair(text: &str) -> Option<String> {
// Cheap pre-filter before any tokenising.
if !contains_any_sci_root(text) {
return None;
}
// The scan below indexes by byte, so it is restricted to ASCII input.
if !text.is_ascii() {
return None;
}
let bytes = text.as_bytes();
// Allocated lazily on the first repair.
let mut result: Option<String> = None;
// Everything before `last_copied` has already been written to `result`.
let mut last_copied = 0usize;
let mut i = 0usize;
while i < bytes.len() {
let at_boundary = i == 0
|| matches!(
bytes[i - 1],
b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','
);
// Defensive: `i` is only ever 0 or one past a delimiter (see the update
// at the bottom of the loop), so this branch is not expected to run.
if !at_boundary {
i += 1;
continue;
}
let token_start = i;
// The token extends to the next delimiter or the end of the text.
let token_end = bytes[token_start..]
.iter()
.position(|&b| matches!(b, b'/' | b'(' | b')' | b' ' | b'\t' | b'\n' | b'\r' | b','))
.map(|n| token_start + n)
.unwrap_or(bytes.len());
// An empty token means two delimiters sit back to back.
if token_start < token_end {
let token = &text[token_start..token_end];
if let Some(repaired) = repair_sci_token(token) {
let r = result.get_or_insert_with(|| String::with_capacity(text.len()));
r.push_str(&text[last_copied..token_start]);
r.push_str(&repaired);
last_copied = token_end;
}
}
// Step over the delimiter that ended this token.
i = token_end + 1;
}
// Flush the untouched tail, but only if a repair actually happened.
result.map(|mut r| {
r.push_str(&text[last_copied..]);
r
})
}
/// True when `text` contains any of the known SCI root strings as a substring
/// (case-sensitive; no token-boundary check is made).
fn contains_any_sci_root(text: &str) -> bool {
    const SCI_ROOTS: [&str; 7] = ["HCS", "KLM", "MVL", "RSV", "BUR", "SI", "TK"];
    SCI_ROOTS.iter().any(|root| text.contains(*root))
}
/// Attempts to repair a single SCI token, returning the corrected spelling or
/// `None` when the token is empty, non-ASCII, already valid, or not repairable.
///
/// Three repairs are tried, in order:
/// 1. missing hyphen — a hyphen-free token of 3–8 bytes that becomes a valid
///    `SciControl` when a hyphen is inserted after a 2- or 3-byte bare control;
/// 2. hyphen used as separator — a hyphenated token that is not itself valid
///    but whose two sides (around the first hyphen) are both bare controls is
///    rewritten with `/`;
/// 3. missing slash — a hyphen-free token of 4–6 bytes that splits into
///    exactly one pair of bare controls is rewritten with `/`.
fn repair_sci_token(token: &str) -> Option<String> {
if token.is_empty() {
return None;
}
// Byte-offset slicing below relies on the token being ASCII.
if !token.is_ascii() {
return None;
}
let len = token.len();
// Repair 1: insert a missing hyphen. The first split (2, then 3) that works wins.
if !token.contains('-') && (3..=8).contains(&len) {
for &split in &[2usize, 3] {
if split >= len {
continue;
}
let prefix = &token[..split];
let suffix = &token[split..];
if SciControlBare::parse(prefix).is_some() {
let canonical = format!("{prefix}-{suffix}");
if SciControl::parse(&canonical).is_some() {
return Some(canonical);
}
}
}
}
// Repair 2: hyphenated tokens. This branch always returns, so repair 3 is
// only reached by hyphen-free tokens.
if let Some(dash_pos) = token.find('-') {
// Already a valid control: leave it alone.
if SciControl::parse(token).is_some() {
return None;
}
let prefix = &token[..dash_pos];
let suffix = &token[dash_pos + 1..];
if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
return Some(format!("{prefix}/{suffix}"));
}
return None;
}
// Repair 3: two bare controls run together. Both halves are 2–3 bytes,
// hence the 4–6 byte window.
if !(4..=6).contains(&len) {
return None;
}
let mut found: Option<(&str, &str)> = None;
for &split in &[2usize, 3] {
if split >= len {
continue;
}
let suffix_len = len - split;
if !(2..=3).contains(&suffix_len) {
continue;
}
let prefix = &token[..split];
let suffix = &token[split..];
if SciControlBare::parse(prefix).is_some() && SciControlBare::parse(suffix).is_some() {
// Two valid splits make the token ambiguous: refuse to guess.
if found.is_some() {
return None;
}
found = Some((prefix, suffix));
}
}
found.map(|(p, s)| format!("{p}/{s}"))
}
/// Reorders the `//`-separated segments of a marking into
/// classification, then other, then dissemination order.
///
/// An enclosing `(`…`)` pair is preserved. Empty segments are dropped and
/// each segment is trimmed. If any classification segment is non-US (per
/// [`is_non_us_classification_segment`]) the result gets a leading `//`.
///
/// Returns `None` when:
/// * `text` has no `//`;
/// * no segment classifies as a classification; or
/// * the first non-empty segment is already a classification and either the
///   marking is US, or the body already starts with `//`.
fn try_canonical_reorder(text: &str) -> Option<String> {
    if !text.contains("//") {
        return None;
    }
    let (open, body, close) = match text.strip_prefix('(').and_then(|rest| rest.strip_suffix(')')) {
        Some(inner) => ("(", inner, ")"),
        None => ("", text, ""),
    };
    if body.split("//").count() < 2 {
        return None;
    }

    // Trimmed, non-empty segments in their original order.
    let pieces: Vec<&str> = body
        .split("//")
        .map(str::trim)
        .filter(|piece| !piece.is_empty())
        .collect();

    let mut classes: Vec<&str> = Vec::new();
    let mut dissems: Vec<&str> = Vec::new();
    let mut others: Vec<&str> = Vec::new();
    for &piece in &pieces {
        let bucket = match classify_segment(piece) {
            SegmentClass::Classification => &mut classes,
            SegmentClass::Dissem => &mut dissems,
            SegmentClass::Other => &mut others,
        };
        bucket.push(piece);
    }
    if classes.is_empty() {
        return None;
    }

    let non_us = classes
        .iter()
        .any(|seg| is_non_us_classification_segment(seg));
    let leads_with_classification = pieces.first().is_some_and(|first| classes.contains(first));
    if leads_with_classification && (!non_us || body.starts_with("//")) {
        return None;
    }

    let joined = classes
        .into_iter()
        .chain(others)
        .chain(dissems)
        .collect::<Vec<&str>>()
        .join("//");
    let lead = if non_us { "//" } else { "" };
    Some(format!("{open}{lead}{joined}{close}"))
}
/// Coarse category that `classify_segment` assigns to one `//`-delimited
/// segment of a marking. `try_canonical_reorder` uses it to emit segments in
/// the order classification, other, dissemination.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SegmentClass {
/// The segment starts with a classification keyword.
Classification,
/// The segment starts with a dissemination-control keyword.
Dissem,
/// Anything that is neither of the above.
Other,
}
/// Buckets one marking segment as classification, dissemination or other,
/// based on its leading whitespace-separated words (trailing commas ignored).
///
/// Arms are tried top to bottom; an arm whose guard fails falls through to
/// the next one, so a partial long form such as `"TOP"` without `"SECRET"`
/// still reaches the trigraph check and finally `Other`.
fn classify_segment(seg: &str) -> SegmentClass {
    let mut words = seg.split_whitespace().map(|w| w.trim_end_matches(','));
    let head = words.next().unwrap_or("");
    let second = words.next();
    let third = words.next();
    // Long forms are matched against the raw segment text, not the tokens.
    let begins = |phrase: &str| seg.starts_with(phrase);

    match head {
        // `RESTRICTED <something>` is not a bare classification.
        "RESTRICTED" if second.is_some() => SegmentClass::Other,

        "U" | "R" | "C" | "S" | "TS" | "UNCLASSIFIED" | "RESTRICTED" | "CONFIDENTIAL"
        | "SECRET" | "NS" | "NC" | "NU" | "CTS" | "CTSA" | "NSAT" | "NCA" | "CTS-B"
        | "CTS-BALK" | "JOINT" => SegmentClass::Classification,

        "NOFORN" | "NF" | "ORCON" | "OC" | "PROPIN" | "PR" | "IMCON" | "IMC" | "RELIDO"
        | "RS" | "RSEN" | "DSEN" | "FISA" | "FOUO" | "EYES" | "REL" | "DS" | "XD" | "ND"
        | "SBU" | "SBU-NF" | "LES" | "LES-NF" | "SSI" | "LIMDIS" | "EXDIS" | "NODIS" => {
            SegmentClass::Dissem
        }

        "LIMITED" if begins("LIMITED DISTRIBUTION") => SegmentClass::Dissem,
        "NO" if begins("NO DISTRIBUTION") => SegmentClass::Dissem,
        "EXCLUSIVE" if begins("EXCLUSIVE DISTRIBUTION") => SegmentClass::Dissem,
        "LAW" if begins("LAW ENFORCEMENT SENSITIVE") => SegmentClass::Dissem,
        "SENSITIVE"
            if begins("SENSITIVE BUT UNCLASSIFIED")
                || begins("SENSITIVE SECURITY INFORMATION") =>
        {
            SegmentClass::Dissem
        }

        "TOP" if begins("TOP SECRET") => SegmentClass::Classification,
        "COSMIC" if begins("COSMIC TOP SECRET") => SegmentClass::Classification,
        "NATO"
            if begins("NATO SECRET")
                || begins("NATO CONFIDENTIAL")
                || begins("NATO UNCLASSIFIED")
                || begins("NATO RESTRICTED") =>
        {
            SegmentClass::Classification
        }

        // `<trigraph> <level>` counts as a classification segment.
        _ if CapcoTokenSet.is_trigraph(head) => {
            let level_follows = match second.unwrap_or("") {
                "U" | "R" | "C" | "S" | "TS" | "UNCLASSIFIED" | "RESTRICTED"
                | "CONFIDENTIAL" | "SECRET" => true,
                "TOP" => third == Some("SECRET"),
                _ => false,
            };
            if level_follows {
                SegmentClass::Classification
            } else {
                SegmentClass::Other
            }
        }

        _ => SegmentClass::Other,
    }
}
/// Reports whether `seg` starts with a non-US classification.
///
/// Recognised forms (trailing commas on tokens are ignored):
/// * the abbreviations listed in the first arm, plus `JOINT`;
/// * the long forms `COSMIC TOP SECRET` and `NATO <level>`, matched against
///   the raw segment text;
/// * a trigraph (per `CapcoTokenSet::is_trigraph`) followed by a
///   classification level, including the two-word `TOP SECRET`.
fn is_non_us_classification_segment(seg: &str) -> bool {
    let mut words = seg.split_whitespace().map(|w| w.trim_end_matches(','));
    let head = words.next().unwrap_or("");

    match head {
        "NS" | "NC" | "NU" | "CTS" | "CTSA" | "NSAT" | "NCA" | "CTS-B" | "CTS-BALK"
        | "JOINT" => true,

        "COSMIC" if seg.starts_with("COSMIC TOP SECRET") => true,

        "NATO"
            if [
                "NATO SECRET",
                "NATO CONFIDENTIAL",
                "NATO UNCLASSIFIED",
                "NATO RESTRICTED",
            ]
            .iter()
            .any(|phrase| seg.starts_with(*phrase)) =>
        {
            true
        }

        // A failed guard above falls through to here, as in an if-chain.
        _ if CapcoTokenSet.is_trigraph(head) => match words.next().unwrap_or("") {
            "U" | "R" | "C" | "S" | "TS" | "UNCLASSIFIED" | "RESTRICTED" | "CONFIDENTIAL"
            | "SECRET" => true,
            "TOP" => words.next() == Some("SECRET"),
            _ => false,
        },

        _ => false,
    }
}
/// Prepends `//` to a marking that has no `//` at all but whose body is a
/// non-US classification (per [`is_non_us_classification_segment`]).
///
/// An enclosing `(`…`)` pair is kept outside the inserted prefix. The body is
/// trimmed only for the classification check; it is emitted untrimmed.
/// Returns `None` when `text` already contains `//` or the body is not a
/// non-US classification.
fn try_add_non_us_prefix(text: &str) -> Option<String> {
    if text.contains("//") {
        return None;
    }
    let unwrapped = text
        .strip_prefix('(')
        .and_then(|rest| rest.strip_suffix(')'));
    let (open, inner, close) = match unwrapped {
        Some(inner) => ("(", inner, ")"),
        None => ("", text, ""),
    };
    is_non_us_classification_segment(inner.trim()).then(|| format!("{open}//{inner}{close}"))
}
/// Checks a marking's classification level against a numeric `floor`.
///
/// With a classification present, its effective level (cast to `u8`) must be
/// at least `floor`. Without one, the marking passes only when `floor` equals
/// `Classification::Unclassified as u8`.
fn meets_classification_floor(marking: &CapcoMarking, floor: u8) -> bool {
    match marking_classification(marking) {
        Some(level) => floor <= level as u8,
        None => floor == Classification::Unclassified as u8,
    }
}
/// Returns the effective classification level of `marking`, or `None` when
/// the marking carries no classification.
fn marking_classification(marking: &CapcoMarking) -> Option<Classification> {
    let classification = marking.0.classification.as_ref()?;
    Some(classification.effective_level())
}
/// Reports whether `bytes` is exactly `(X)` for a single ASCII letter `X`,
/// ignoring ASCII whitespace on either side.
///
/// Whitespace is stripped from both ends before matching, so `b"(S) "` and
/// `b"(S)\n"` are recognised the same way as `b"  (S)"`. Empty and
/// all-whitespace inputs return `false`.
fn is_single_letter_portion(bytes: &[u8]) -> bool {
    let first = bytes.iter().position(|b| !b.is_ascii_whitespace());
    let last = bytes.iter().rposition(|b| !b.is_ascii_whitespace());
    let (Some(first), Some(last)) = (first, last) else {
        // Empty or whitespace-only input.
        return false;
    };
    matches!(&bytes[first..=last], [b'(', inner, b')'] if inner.is_ascii_alphabetic())
}
/// Returns `true` when at least one of the marking fields inspected below is
/// populated; `false` only when all of them are absent or empty.
fn is_nontrivial_marking(marking: &CapcoMarking) -> bool {
    let attrs = &marking.0;

    let has_classification_or_controls = attrs.classification.is_some()
        || !attrs.sci_controls.is_empty()
        || attrs.sar_markings.is_some()
        || !attrs.aea_markings.is_empty()
        || attrs.fgi_marker.is_some();

    let has_dissemination = !attrs.dissem_controls.is_empty()
        || !attrs.non_ic_dissem.is_empty()
        || !attrs.rel_to.is_empty();

    let has_authority_fields = attrs.classified_by.is_some()
        || attrs.derived_from.is_some()
        || attrs.declassify_on.is_some()
        || attrs.declass_exemption.is_some();

    has_classification_or_controls || has_dissemination || has_authority_fields
}
/// Decides whether a strict parse of the given `kind` can be accepted as-is.
///
/// * `Portion` / `Banner`: a classification was recognised and no token span
///   is of kind `TokenKind::Unknown`.
/// * `Cab`: at least one of `classified_by`, `derived_from`, `declassify_on`
///   or `declass_exemption` is present.
/// * any other kind: defers to [`is_nontrivial_marking`].
fn strict_parse_is_complete(marking: &CapcoMarking, kind: MarkingType) -> bool {
    use marque_ism::TokenKind;
    let attrs = &marking.0;
    match kind {
        MarkingType::Portion | MarkingType::Banner => {
            if attrs.classification.is_none() {
                return false;
            }
            attrs
                .token_spans
                .iter()
                .all(|span| !matches!(span.kind, TokenKind::Unknown))
        }
        MarkingType::Cab => [
            attrs.classified_by.is_some(),
            attrs.derived_from.is_some(),
            attrs.declassify_on.is_some(),
            attrs.declass_exemption.is_some(),
        ]
        .iter()
        .any(|&present| present),
        _ => is_nontrivial_marking(marking),
    }
}
/// Log-prior substituted in `score_candidate` when a token or country code
/// has no entry in the `marque_capco::priors` lookup.
const MISSING_TOKEN_LOG_PRIOR: f32 = -12.0;
/// Added once to a candidate's posterior when
/// `absorbs_hard_splitter_in_sar_or_sci` reports `true` for its marking.
const HARD_SPLITTER_ABSORPTION_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
/// Added to a candidate's posterior once per SCI marking whose system is
/// `SciControlSystem::Custom` (see `custom_sci_marking_penalty`).
const CUSTOM_SCI_MARKING_PENALTY: f32 = MISSING_TOKEN_LOG_PRIOR;
/// Scores one parsed candidate, returning `(prior, posterior)` log-odds.
///
/// * `prior` is the sum of the log-priors of the marking's canonical tokens
///   plus the log-prior of each distinct REL TO country code. Anything
///   missing from the prior tables is charged `MISSING_TOKEN_LOG_PRIOR`.
/// * `posterior` is `prior` plus the attempt's feature deltas, plus
///   `HARD_SPLITTER_ABSORPTION_PENALTY` when a hard-splitter word was
///   absorbed into a SAR/SCI identifier, plus the custom-SCI penalty.
fn score_candidate(attempt: &CanonicalAttempt, marking: &CapcoMarking) -> (f32, f32) {
    let token_prior = canonical_tokens_for(marking)
        .into_iter()
        .fold(0.0_f32, |acc, token| {
            acc + marque_capco::priors::token_log_prior(token).unwrap_or(MISSING_TOKEN_LOG_PRIOR)
        });

    // A country listed several times contributes its prior only once.
    let mut counted_codes = BTreeSet::new();
    let mut prior = token_prior;
    for country in marking.0.rel_to.iter() {
        let code = country.as_str();
        if !counted_codes.insert(code) {
            continue;
        }
        prior += marque_capco::priors::country_code_log_prior(code)
            .unwrap_or(MISSING_TOKEN_LOG_PRIOR);
    }

    let feature_sum: f32 = attempt.features.iter().map(|f| f.delta).sum();
    let evidence_adjusted = prior + feature_sum;
    let absorption_adjusted = if absorbs_hard_splitter_in_sar_or_sci(marking) {
        evidence_adjusted + HARD_SPLITTER_ABSORPTION_PENALTY
    } else {
        evidence_adjusted
    };
    let posterior = absorption_adjusted + custom_sci_marking_penalty(marking);
    (prior, posterior)
}
/// Total penalty for custom SCI systems: `CUSTOM_SCI_MARKING_PENALTY`
/// multiplied by the number of SCI markings whose system is
/// `SciControlSystem::Custom`.
fn custom_sci_marking_penalty(marking: &CapcoMarking) -> f32 {
    let mut custom_systems = 0usize;
    for sci in marking.0.sci_markings.iter() {
        if let SciControlSystem::Custom(_) = sci.system {
            custom_systems += 1;
        }
    }
    custom_systems as f32 * CUSTOM_SCI_MARKING_PENALTY
}
/// Detects a hard-splitter word (see [`contains_hard_splitter_word`]) that
/// ended up inside a SAR or SCI identifier.
///
/// Inspected places:
/// * SAR: each program identifier, each compartment identifier, and each
///   sub-compartment;
/// * SCI: each compartment identifier and each sub-compartment.
fn absorbs_hard_splitter_in_sar_or_sci(marking: &CapcoMarking) -> bool {
    let attrs = &marking.0;

    let sar_absorbs = attrs.sar_markings.as_ref().is_some_and(|sar| {
        sar.programs.iter().any(|prog| {
            contains_hard_splitter_word(&prog.identifier)
                || prog.compartments.iter().any(|comp| {
                    contains_hard_splitter_word(&comp.identifier)
                        || comp
                            .sub_compartments
                            .iter()
                            .any(|sub| contains_hard_splitter_word(sub))
                })
        })
    });
    if sar_absorbs {
        return true;
    }

    attrs.sci_markings.iter().any(|sci| {
        sci.compartments.iter().any(|comp| {
            contains_hard_splitter_word(&comp.identifier)
                || comp
                    .sub_compartments
                    .iter()
                    .any(|sub| contains_hard_splitter_word(sub))
        })
    })
}
fn contains_hard_splitter_word(s: &str) -> bool {
if is_hard_splitter(s) {
return true;
}
s.split_whitespace().any(is_hard_splitter)
}
fn canonical_tokens_for(marking: &CapcoMarking) -> Vec<&'static str> {
let attrs = &marking.0;
let mut tokens: BTreeSet<&'static str> = BTreeSet::new();
if let Some(class) = attrs.classification.as_ref() {
tokens.insert(class.effective_level().banner_str());
}
for ctrl in attrs.sci_controls.iter() {
tokens.insert(ctrl.as_str());
}
for dis in attrs.dissem_controls.iter() {
tokens.insert(dis.as_str());
}
for nic in attrs.non_ic_dissem.iter() {
tokens.insert(nic.banner_str());
}
if !attrs.aea_markings.is_empty() {
tokens.insert("AEA");
}
if attrs.fgi_marker.is_some() {
tokens.insert("FGI");
}
tokens.into_iter().collect()
}
/// Recognizer that holds both a [`StrictRecognizer`] and a
/// [`DecoderRecognizer`]. Its `Recognizer` implementation runs the strict
/// recognizer first and consults the decoder only when the strict result is
/// not accepted.
#[derive(Debug, Default, Clone, Copy)]
pub struct StrictOrDecoderRecognizer {
// Always consulted first.
strict: StrictRecognizer,
// Fallback used when the strict result is incomplete or empty.
decoder: DecoderRecognizer,
}
impl StrictOrDecoderRecognizer {
    /// Builds the combined recognizer from freshly constructed strict and
    /// decoder recognizers. Usable in `const` contexts.
    pub const fn new() -> Self {
        let strict = StrictRecognizer::new();
        let decoder = DecoderRecognizer::new();
        Self { strict, decoder }
    }
}
impl Recognizer<CapcoScheme> for StrictOrDecoderRecognizer {
    /// Runs the strict recognizer and falls back to the decoder only when the
    /// strict outcome is not usable.
    ///
    /// The strict outcome is returned unchanged when:
    /// * the caller asked for strict evidence (`cx.strict_evidence`);
    /// * the marking type cannot be inferred from `bytes`;
    /// * it is `Unambiguous` and passes [`strict_parse_is_complete`]; or
    /// * it is `Ambiguous` with at least one candidate.
    ///
    /// Otherwise the decoder is run with `strict_evidence: false`. Its result
    /// replaces the strict one only if it is `Unambiguous`; any other decoder
    /// outcome is discarded in favour of the strict outcome.
    fn recognize(&self, bytes: &[u8], cx: &ParseContext) -> Parsed<CapcoMarking> {
        // The strict pass always runs with strict evidence forced on.
        let strict_cx = ParseContext {
            strict_evidence: true,
            ..cx.clone()
        };
        let strict_outcome = self.strict.recognize(bytes, &strict_cx);
        if cx.strict_evidence {
            return strict_outcome;
        }
        let Some(kind) = infer_marking_type(bytes) else {
            return strict_outcome;
        };

        let strict_is_usable = matches!(
            &strict_outcome,
            Parsed::Unambiguous(marking) if strict_parse_is_complete(marking, kind)
        ) || matches!(
            &strict_outcome,
            Parsed::Ambiguous { candidates } if !candidates.is_empty()
        );
        if strict_is_usable {
            return strict_outcome;
        }

        let decoder_cx = ParseContext {
            strict_evidence: false,
            ..cx.clone()
        };
        match self.decoder.recognize(bytes, &decoder_cx) {
            decoded @ Parsed::Unambiguous(_) => decoded,
            _ => strict_outcome,
        }
    }
}
#[cfg(test)]
#[cfg_attr(coverage_nightly, coverage(off))]
mod tests {
use super::*;
#[test]
fn decoder_is_send_sync_as_trait_object() {
fn assert_send_sync<T: Send + Sync + ?Sized>() {}
assert_send_sync::<DecoderRecognizer>();
assert_send_sync::<StrictOrDecoderRecognizer>();
assert_send_sync::<std::sync::Arc<dyn Recognizer<CapcoScheme>>>();
}
fn deep_cx() -> ParseContext {
ParseContext {
strict_evidence: false,
zone: None,
position: None,
classification_floor: None,
as_of: None,
preceded_by_whitespace: true,
}
}
#[test]
fn try_insert_delimiter_inserts_before_long_form_dissem() {
let cases: &[(&str, &str)] = &[
("SECRET//NOFORN EXDIS", "SECRET//NOFORN//EXDIS"),
("SECRET//NOFORN ORCON", "SECRET//NOFORN//ORCON"),
("SECRET//SI ORCON", "SECRET//SI//ORCON"),
];
for (input, expected) in cases {
let result = try_insert_delimiter(input);
assert_eq!(
result.as_deref(),
Some(*expected),
"input {input:?} should produce {expected:?}; got {result:?}"
);
}
}
#[test]
fn try_insert_delimiter_classification_boundary() {
let cases: &[(&str, &str)] = &[
(
"SECRET REL TO USA, AUS, GBR",
"SECRET//REL TO USA, AUS, GBR",
),
("SECRET NOFORN", "SECRET//NOFORN"),
("TOP SECRET NOFORN", "TOP SECRET//NOFORN"),
];
for (input, expected) in cases {
let result = try_insert_delimiter(input);
assert_eq!(
result.as_deref(),
Some(*expected),
"input {input:?} should produce {expected:?}; got {result:?}"
);
}
}
#[test]
fn try_insert_delimiter_does_not_split_top_secret() {
let result = try_insert_delimiter("TOP SECRET//NF");
assert_eq!(result, None);
}
#[test]
fn try_insert_delimiter_does_not_split_sbu_noforn() {
let result = try_insert_delimiter("SECRET//SBU NOFORN");
assert_eq!(result, None, "SBU NOFORN must not be split; got {result:?}");
}
#[test]
fn try_insert_delimiter_does_not_split_les_noforn() {
let result = try_insert_delimiter("SECRET//LES NOFORN");
assert_eq!(result, None, "LES NOFORN must not be split; got {result:?}");
}
#[test]
fn try_insert_delimiter_no_op_on_canonical() {
for input in &[
"SECRET//NOFORN",
"TOP SECRET//SI//NOFORN",
"(S//NF)",
"UNCLASSIFIED",
] {
let result = try_insert_delimiter(input);
assert_eq!(
result, None,
"input {input:?} is canonical; should produce None, got {result:?}"
);
}
}
#[test]
fn try_insert_delimiter_capped_at_max_insertions() {
let input = "SECRET NOFORN ORCON PROPIN IMCON RELIDO RSEN";
let result = try_insert_delimiter(input);
assert!(result.is_some());
let inserted = result.unwrap();
let inserted_count = inserted.matches("//").count();
assert!(
inserted_count <= MAX_DELIMITER_INSERTIONS,
"must not exceed MAX_DELIMITER_INSERTIONS={MAX_DELIMITER_INSERTIONS}; \
got {inserted_count} insertions in {inserted:?}"
);
}
#[test]
fn try_insert_delimiter_preserves_existing_double_slash() {
let result = try_insert_delimiter("SECRET//NOFORN EXDIS");
let s = result.expect("should insert");
let count = s.matches("//").count();
assert_eq!(
count, 2,
"expected 2 `//` total (1 preserved + 1 inserted), got {count} in {s:?}"
);
}
#[test]
fn try_insert_delimiter_preserves_non_ascii_characters_verbatim() {
let input = "SECRET ∕∕ NOFORN";
let result = try_insert_delimiter(input);
let was_some = result.is_some();
let s = result.unwrap_or_else(|| input.to_string());
assert!(
s.is_char_boundary(s.len()),
"output {s:?} must end on a char boundary"
);
assert!(
!was_some || s.contains('∕'),
"output {s:?} must preserve the U+2215 character when the \
helper emitted any output"
);
}
#[test]
fn is_hard_splitter_covers_documented_long_forms() {
for token in &[
"NOFORN",
"ORCON",
"ORCON-USGOV",
"PROPIN",
"IMCON",
"RELIDO",
"RSEN",
"EYESONLY",
"FOUO",
"FISA",
"DSEN",
"EXDIS",
"NODIS",
"LIMDIS",
] {
assert!(
is_hard_splitter(token),
"{token:?} must be a hard splitter (issue #133 PR 3)"
);
}
}
#[test]
fn is_hard_splitter_excludes_short_forms() {
for token in &["NF", "OC", "PR", "IMC", "RS"] {
assert!(
!is_hard_splitter(token),
"{token:?} is intentionally NOT a hard splitter (collision risk)"
);
}
}
#[test]
fn heuristic_2char_ts_cluster() {
for first in &['T', 'R', 'Y', 'H', 'G', 'F'] {
for second in &['A', 'W', 'E', 'Z', 'S'] {
let token: String = [*first, *second].iter().collect();
assert_eq!(
try_2char_classification_heuristic(&token),
Some("TS"),
"{token:?} should heuristic-fix to TS"
);
}
}
assert_eq!(try_2char_classification_heuristic("ys"), Some("TS"));
assert_eq!(try_2char_classification_heuristic("Ys"), Some("TS"));
}
#[test]
fn heuristic_2char_no_match_outside_clusters() {
for token in &["AS", "WS", "ZS", "BS", "DS", "QS"] {
assert_eq!(
try_2char_classification_heuristic(token),
None,
"{token:?} should not heuristic-fix"
);
}
for token in &["TR", "RY", "HG", "GH", "FB"] {
assert_eq!(
try_2char_classification_heuristic(token),
None,
"{token:?} should not heuristic-fix"
);
}
}
#[test]
fn heuristic_1char_s_cluster() {
for token in &["A", "W", "E", "Z"] {
assert_eq!(
try_1char_classification_heuristic(token),
Some("S"),
"{token:?} should heuristic-fix to S"
);
}
assert_eq!(try_1char_classification_heuristic("X"), Some("S"));
}
#[test]
fn heuristic_1char_c_cluster() {
for token in &["V", "F"] {
assert_eq!(
try_1char_classification_heuristic(token),
Some("C"),
"{token:?} should heuristic-fix to C"
);
}
}
#[test]
fn heuristic_1char_no_match_outside_clusters() {
for token in &["B", "D", "G", "K", "M", "N", "Q", "T", "Y"] {
assert_eq!(
try_1char_classification_heuristic(token),
None,
"{token:?} should not heuristic-fix"
);
}
}
#[test]
fn heuristic_skips_canonical_classifications() {
for canonical in &["U", "R", "C", "S", "TS"] {
assert!(
is_canonical_short_classification(canonical),
"{canonical:?} should be recognized as canonical"
);
}
assert_eq!(try_classification_heuristic_fix("(S//NF)"), None);
assert_eq!(try_classification_heuristic_fix("(TS//NF)"), None);
assert_eq!(try_classification_heuristic_fix("(C//NF)"), None);
assert_eq!(try_classification_heuristic_fix("SECRET//NOFORN"), None);
}
#[test]
fn heuristic_fixes_portion_form() {
assert_eq!(
try_classification_heuristic_fix("(YS//NF)").as_deref(),
Some("(TS//NF)")
);
assert_eq!(
try_classification_heuristic_fix("(W//NF)").as_deref(),
Some("(S//NF)")
);
assert_eq!(
try_classification_heuristic_fix("(F//NF)").as_deref(),
Some("(C//NF)")
);
assert_eq!(
try_classification_heuristic_fix("(ys//NF)").as_deref(),
Some("(TS//NF)")
);
}
#[test]
fn heuristic_fixes_banner_form() {
assert_eq!(
try_classification_heuristic_fix("RS//NOFORN").as_deref(),
Some("TS//NOFORN")
);
assert_eq!(
try_classification_heuristic_fix("X//NOFORN").as_deref(),
Some("S//NOFORN")
);
}
#[test]
fn heuristic_skips_cab_shape() {
assert_eq!(try_classification_heuristic_fix("Classified By: foo"), None);
assert_eq!(try_classification_heuristic_fix("Derived From: bar"), None);
assert_eq!(try_classification_heuristic_fix("Declassify On: baz"), None);
}
#[test]
fn heuristic_skips_long_token() {
assert_eq!(try_classification_heuristic_fix("(YES//NF)"), None);
assert_eq!(try_classification_heuristic_fix("(SECT//NF)"), None);
assert_eq!(try_classification_heuristic_fix("SECRET//NOFORN"), None);
}
#[test]
fn heuristic_recovers_otp_to_top_via_3char_rule() {
let cases: &[(&str, &str)] = &[
("OTP SECRET//NOFORN", "TOP SECRET//NOFORN"),
("(OTP//NF)", "(TOP//NF)"),
("OTP SECRET//SI//NOFORN", "TOP SECRET//SI//NOFORN"),
];
for (input, expected) in cases {
let result = try_classification_heuristic_fix(input);
assert_eq!(
result.as_deref(),
Some(*expected),
"input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
);
}
}
#[test]
fn try_3char_classification_heuristic_only_matches_otp() {
assert_eq!(try_3char_classification_heuristic("OTP"), Some("TOP"));
for not_a_match in &["TON", "TPP", "UOP", "TIP", "TPO", "TOO", "ABC", "YES"] {
assert_eq!(
try_3char_classification_heuristic(not_a_match),
None,
"3-char heuristic must not fire on {not_a_match:?}",
);
}
}
#[test]
fn heuristic_recovers_tp_and_to_to_top_via_2char_rule() {
let cases: &[(&str, &str)] = &[
("TP SECRET//NOFORN", "TOP SECRET//NOFORN"),
("TO SECRET//NOFORN", "TOP SECRET//NOFORN"),
("(TP//NF)", "(TOP//NF)"),
("(TO//NF)", "(TOP//NF)"),
];
for (input, expected) in cases {
let result = try_classification_heuristic_fix(input);
assert_eq!(
result.as_deref(),
Some(*expected),
"input {input:?} should heuristic-fix to {expected:?}; got {result:?}"
);
}
}
#[test]
fn try_2char_classification_heuristic_ts_rule_takes_precedence() {
assert_eq!(try_2char_classification_heuristic("TS"), Some("TS"));
assert_eq!(try_2char_classification_heuristic("RS"), Some("TS"));
assert_eq!(try_2char_classification_heuristic("YS"), Some("TS"));
assert_eq!(try_2char_classification_heuristic("TP"), Some("TOP"));
assert_eq!(try_2char_classification_heuristic("TO"), Some("TOP"));
assert_eq!(try_2char_classification_heuristic("TI"), None);
assert_eq!(try_2char_classification_heuristic("TX"), None);
}
#[test]
fn is_canonical_short_classification_recognizes_top() {
assert!(is_canonical_short_classification("TOP"));
for s in &["U", "R", "C", "S", "TS"] {
assert!(
is_canonical_short_classification(s),
"{s:?} must be recognized as canonical short classification",
);
}
assert!(!is_canonical_short_classification("TPP"));
assert!(!is_canonical_short_classification("top")); assert!(!is_canonical_short_classification("TOPS"));
}
#[test]
fn heuristic_skips_unknown_first_char() {
assert_eq!(try_classification_heuristic_fix("(B//NF)"), None);
assert_eq!(try_classification_heuristic_fix("(QS//NF)"), None);
}
#[test]
fn heuristic_skips_lone_inputs() {
for lone in &[
"(YS)", "(W)", "(F)", "(X)", "YS", "W", "(YS )", ] {
assert_eq!(
try_classification_heuristic_fix(lone),
None,
"lone input {lone:?} must not fire heuristic (#133 PR 4 / #176 lone-input guard)"
);
}
}
#[test]
fn heuristic_fires_when_marking_signal_present() {
let cases: &[(&str, &str)] = &[
("(YS//NF)", "(TS//NF)"), ("(YS NF)", "(TS NF)"), ("YS//NOFORN", "TS//NOFORN"),
("W//NF", "S//NF"),
];
for (input, expected) in cases {
let result = try_classification_heuristic_fix(input);
assert_eq!(
result.as_deref(),
Some(*expected),
"input {input:?} should heuristic-fix to {expected:?} \
(marking signal present); got {result:?}"
);
}
}
#[test]
fn decoder_defers_to_strict_when_strict_evidence_is_set() {
let rx = DecoderRecognizer::new();
let cx = ParseContext::default(); match rx.recognize(b"(S//NF)", &cx) {
Parsed::Ambiguous { candidates } => assert!(candidates.is_empty()),
other => panic!("expected zero-candidate Ambiguous, got {other:?}"),
}
}
#[test]
fn decoder_zero_candidate_on_no_template_fit() {
let rx = DecoderRecognizer::new();
match rx.recognize(b"FROBNITZ//WIBBLE", &deep_cx()) {
Parsed::Ambiguous { candidates } => assert!(
candidates.is_empty(),
"unrecognized input must be zero-candidate, got {} candidate(s)",
candidates.len()
),
Parsed::Unambiguous(m) => panic!("unexpected strict match: {m:?}"),
}
}
#[test]
fn score_candidate_splits_prior_and_posterior() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let candidate = MarkingCandidate {
span: Span::new(0, 14),
kind: MarkingType::Banner,
};
let parsed = parser
.parse(&candidate, b"SECRET//NOFORN")
.expect("SECRET//NOFORN must parse");
let marking = CapcoMarking::new(parsed.attrs);
let features = vec![
FeatureEntry {
id: FeatureId::EditDistance1,
delta: -0.5,
},
FeatureId::TokenReorder.into(),
];
let attempt = CanonicalAttempt {
bytes: b"SECRET//NOFORN".to_vec(),
features: features.clone(),
fix_source: marque_rules::FixSource::DecoderPosterior,
};
let (prior, posterior) = score_candidate(&attempt, &marking);
let feature_sum: f32 = features.iter().map(|f| f.delta).sum();
let reconstructed = prior + feature_sum;
assert!(
(reconstructed - posterior).abs() < 1e-6,
"posterior must equal prior + Σ feature deltas; \
prior={prior}, feature_sum={feature_sum}, posterior={posterior}"
);
assert!(
(prior - posterior).abs() > f32::EPSILON,
"prior_log_odds must exclude feature deltas; \
prior={prior}, posterior={posterior}"
);
}
impl From<FeatureId> for FeatureEntry {
fn from(id: FeatureId) -> Self {
Self { id, delta: -0.4 }
}
}
#[test]
fn score_candidate_includes_country_code_prior_for_rel_to() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let one_candidate = MarkingCandidate {
span: Span::new(0, 18),
kind: MarkingType::Banner,
};
let one_parsed = parser
.parse(&one_candidate, b"SECRET//REL TO USA")
.expect("SECRET//REL TO USA must parse");
let one_marking = CapcoMarking::new(one_parsed.attrs);
let two_candidate = MarkingCandidate {
span: Span::new(0, 23),
kind: MarkingType::Banner,
};
let two_parsed = parser
.parse(&two_candidate, b"SECRET//REL TO USA, GBR")
.expect("SECRET//REL TO USA, GBR must parse");
let two_marking = CapcoMarking::new(two_parsed.attrs);
let no_features: Vec<FeatureEntry> = vec![];
let attempt_one = CanonicalAttempt {
bytes: b"SECRET//REL TO USA".to_vec(),
features: no_features.clone(),
fix_source: marque_rules::FixSource::DecoderPosterior,
};
let attempt_two = CanonicalAttempt {
bytes: b"SECRET//REL TO USA, GBR".to_vec(),
features: no_features.clone(),
fix_source: marque_rules::FixSource::DecoderPosterior,
};
let (prior_one, _) = score_candidate(&attempt_one, &one_marking);
let (prior_two, _) = score_candidate(&attempt_two, &two_marking);
assert!(
prior_two < prior_one,
"adding GBR to REL TO must lower (more negative) the prior via \
country_code_log_prior; prior_one={prior_one}, prior_two={prior_two}"
);
}
#[test]
fn score_candidate_deduplicates_rel_to_entries() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let dup_candidate = MarkingCandidate {
span: Span::new(0, 23),
kind: MarkingType::Banner,
};
let dup_parsed = parser
.parse(&dup_candidate, b"SECRET//REL TO USA, USA")
.expect("SECRET//REL TO USA, USA must parse leniently");
let dup_marking = CapcoMarking::new(dup_parsed.attrs);
let once_candidate = MarkingCandidate {
span: Span::new(0, 18),
kind: MarkingType::Banner,
};
let once_parsed = parser
.parse(&once_candidate, b"SECRET//REL TO USA")
.expect("SECRET//REL TO USA must parse");
let once_marking = CapcoMarking::new(once_parsed.attrs);
let no_features: Vec<FeatureEntry> = vec![];
let attempt_dup = CanonicalAttempt {
bytes: b"SECRET//REL TO USA, USA".to_vec(),
features: no_features.clone(),
fix_source: marque_rules::FixSource::DecoderPosterior,
};
let attempt_once = CanonicalAttempt {
bytes: b"SECRET//REL TO USA".to_vec(),
features: no_features.clone(),
fix_source: marque_rules::FixSource::DecoderPosterior,
};
let (prior_dup, _) = score_candidate(&attempt_dup, &dup_marking);
let (prior_once, _) = score_candidate(&attempt_once, &once_marking);
assert!(
(prior_dup - prior_once).abs() < 1e-5,
"duplicate REL TO entry must not double-count the country-code prior; \
prior_dup={prior_dup}, prior_once={prior_once}"
);
}
#[test]
fn feature_entry_to_evidence_uses_canonical_label_registry() {
for id in [
FeatureId::EditDistance1,
FeatureId::EditDistance2,
FeatureId::TokenReorder,
FeatureId::SupersededToken,
FeatureId::BaseRateCommonMarking,
FeatureId::StrictContextClassification,
FeatureId::CorpusOverrideInEffect,
] {
let entry = FeatureEntry { id, delta: -0.5 };
let evidence = feature_entry_to_evidence(&entry);
assert_eq!(
evidence.label,
id.as_str(),
"decoder evidence label diverged from FeatureId::as_str() \
for {id:?}: got {label:?}, expected {expected:?}",
label = evidence.label,
expected = id.as_str(),
);
assert_eq!(evidence.log_odds, -0.5);
}
}
#[test]
fn runner_up_ratio_saturates_on_extreme_log_margin() {
for &log_margin in &[88.0_f32, 100.0_f32, 200.0_f32, 1000.0_f32] {
let ratio = log_margin.exp();
let clamped = if ratio.is_finite() { ratio } else { f32::MAX };
assert!(
clamped.is_finite(),
"log_margin = {log_margin}: clamped ratio must be finite, got {clamped}"
);
assert!(
clamped > 0.0,
"log_margin = {log_margin}: clamped ratio must be > 0, got {clamped}"
);
}
let at_threshold = UNAMBIGUOUS_LOG_MARGIN.exp();
assert!(at_threshold.is_finite() && at_threshold > 1.0);
}
#[test]
fn strict_parse_is_complete_rejects_unknown_classification() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let candidate = MarkingCandidate {
span: Span::new(0, 16),
kind: MarkingType::Portion,
};
let parsed = parser
.parse(&candidate, b"(SERCET//NOFORN)")
.expect("strict parser should accept (SERCET//NOFORN) leniently");
let marking = CapcoMarking::new(parsed.attrs);
assert!(
is_nontrivial_marking(&marking),
"NOFORN survives as a dissem control → marking is nontrivial"
);
assert!(
!strict_parse_is_complete(&marking, MarkingType::Portion),
"SERCET left `classification: None` → strict parse is incomplete; \
dispatcher must fall back to decoder. attrs = {:?}",
marking.0,
);
}
#[test]
fn strict_parse_is_complete_accepts_clean_marking() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let candidate = MarkingCandidate {
span: Span::new(0, 7),
kind: MarkingType::Portion,
};
let parsed = parser
.parse(&candidate, b"(S//NF)")
.expect("canonical portion must strict-parse");
let marking = CapcoMarking::new(parsed.attrs);
assert!(
strict_parse_is_complete(&marking, MarkingType::Portion),
"canonical (S//NF) must be accepted as complete; attrs = {:?}",
marking.0,
);
}
#[test]
fn strict_parse_is_complete_rejects_trailing_unknown_token() {
let token_set = CapcoTokenSet;
let parser = Parser::new(&token_set);
let candidate = MarkingCandidate {
span: Span::new(0, 9),
kind: MarkingType::Portion,
};
let parsed = parser
.parse(&candidate, b"(S//FRBN)")
.expect("strict parser accepts (S//FRBN) leniently");
let marking = CapcoMarking::new(parsed.attrs);
assert!(
!strict_parse_is_complete(&marking, MarkingType::Portion),
"`FRBN` is Unknown-kind → strict parse is incomplete; attrs = {:?}",
marking.0,
);
}
#[test]
fn contains_hard_splitter_word_detects_per_word() {
assert!(contains_hard_splitter_word("NOFORN"));
assert!(contains_hard_splitter_word("ORCON"));
assert!(contains_hard_splitter_word("EXDIS"));
assert!(contains_hard_splitter_word("BUTTER POPCORN NOFORN"));
assert!(contains_hard_splitter_word("ORCON BUTTER POPCORN"));
assert!(contains_hard_splitter_word("BUTTER NOFORN POPCORN"));
assert!(!contains_hard_splitter_word("BP"));
assert!(!contains_hard_splitter_word("J12"));
assert!(!contains_hard_splitter_word("XRA"));
assert!(!contains_hard_splitter_word("BUTTER POPCORN"));
assert!(!contains_hard_splitter_word(""));
}
#[test]
fn absorbs_hard_splitter_detects_full_sar_program_with_trailing_noforn() {
use marque_ism::{IsmAttributes, SarIndicator, SarMarking, SarProgram};
let sar = SarMarking::new(
SarIndicator::Full,
Box::new([SarProgram::new(
Box::from("BUTTER POPCORN NOFORN"),
Box::new([]),
)]),
);
let mut attrs = IsmAttributes::default();
attrs.sar_markings = Some(sar);
let marking = CapcoMarking::new(attrs);
assert!(
absorbs_hard_splitter_in_sar_or_sci(&marking),
"NOFORN as trailing word of multi-word SAR program identifier must be detected"
);
}
#[test]
fn absorbs_hard_splitter_in_sar_detects_noforn_as_subcomp() {
use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
let sar = SarMarking::new(
SarIndicator::Abbrev,
Box::new([SarProgram::new(
Box::from("BP"),
Box::new([SarCompartment::new(
Box::from("J12"),
Box::new([Box::from("RB"), Box::from("NOFORN")]),
)]),
)]),
);
let mut attrs = IsmAttributes::default();
attrs.sar_markings = Some(sar);
let marking = CapcoMarking::new(attrs);
assert!(
absorbs_hard_splitter_in_sar_or_sci(&marking),
"NOFORN as SAR sub-compartment must be detected as absorption"
);
}
#[test]
fn absorbs_hard_splitter_in_sar_detects_noforn_as_compartment_identifier() {
use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
let sar = SarMarking::new(
SarIndicator::Abbrev,
Box::new([SarProgram::new(
Box::from("BP"),
Box::new([SarCompartment::new(Box::from("NOFORN"), Box::new([]))]),
)]),
);
let mut attrs = IsmAttributes::default();
attrs.sar_markings = Some(sar);
let marking = CapcoMarking::new(attrs);
assert!(
absorbs_hard_splitter_in_sar_or_sci(&marking),
"NOFORN as SAR compartment identifier must be detected as absorption"
);
}
#[test]
fn absorbs_hard_splitter_accepts_clean_sar() {
use marque_ism::{IsmAttributes, SarCompartment, SarIndicator, SarMarking, SarProgram};
let sar = SarMarking::new(
SarIndicator::Abbrev,
Box::new([SarProgram::new(
Box::from("BP"),
Box::new([SarCompartment::new(
Box::from("J12"),
Box::new([Box::from("RB"), Box::from("XRA")]),
)]),
)]),
);
let mut attrs = IsmAttributes::default();
attrs.sar_markings = Some(sar);
let marking = CapcoMarking::new(attrs);
assert!(
!absorbs_hard_splitter_in_sar_or_sci(&marking),
"clean SAR identifiers must not trigger the absorption penalty"
);
}
/// An SCI marking that lists `ORCON` among a compartment's sub-compartments
/// must be reported by `absorbs_hard_splitter_in_sar_or_sci`.
#[test]
fn absorbs_hard_splitter_in_sci_detects_orcon_as_subcomp() {
    use marque_ism::{
        IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
    };

    // Control system SI, compartment `G`, sub-compartment `ORCON`.
    let compartment = SciCompartment::new(Box::from("G"), Box::new([Box::from("ORCON")]));
    let sci = SciMarking::new(
        SciControlSystem::Published(SciControlBare::Si),
        Box::new([compartment]),
        None,
    );

    let mut attrs = IsmAttributes::default();
    attrs.sci_markings = Box::new([sci]);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&CapcoMarking::new(attrs));
    assert!(
        absorbed,
        "ORCON as SCI sub-compartment must be detected as absorption"
    );
}
/// An SCI marking whose compartment identifier is `ORCON` (with no
/// sub-compartments) must be reported by `absorbs_hard_splitter_in_sar_or_sci`.
#[test]
fn absorbs_hard_splitter_in_sci_detects_orcon_as_compartment_identifier() {
    use marque_ism::{
        IsmAttributes, SciCompartment, SciControlBare, SciControlSystem, SciMarking,
    };

    // Control system SI with a single compartment named `ORCON`.
    let compartment = SciCompartment::new(Box::from("ORCON"), Box::new([]));
    let sci = SciMarking::new(
        SciControlSystem::Published(SciControlBare::Si),
        Box::new([compartment]),
        None,
    );

    let mut attrs = IsmAttributes::default();
    attrs.sci_markings = Box::new([sci]);

    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&CapcoMarking::new(attrs));
    assert!(
        absorbed,
        "ORCON as SCI compartment identifier must be detected as absorption"
    );
}
/// A marking built from default attributes must not be flagged by
/// `absorbs_hard_splitter_in_sar_or_sci`.
#[test]
fn absorbs_hard_splitter_negative_on_empty_marking() {
    use marque_ism::IsmAttributes;

    let empty = CapcoMarking::new(IsmAttributes::default());
    let absorbed = absorbs_hard_splitter_in_sar_or_sci(&empty);
    assert!(
        !absorbed,
        "marking without SAR/SCI must not trigger the penalty"
    );
}
/// Banners whose SAR block is followed by a space-separated `NOFORN` must
/// decode unambiguously with SAR present, `NOFORN` as a dissem control, and no
/// hard splitter buried inside SAR/SCI.
#[test]
fn decoder_resolves_sar_with_trailing_noforn_via_absorption_penalty() {
    let rx = DecoderRecognizer::new();
    let cases = [
        "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN NOFORN",
        "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB NOFORN",
    ];
    for input in &cases {
        let m = match rx.recognize(input.as_bytes(), &deep_cx()) {
            Parsed::Unambiguous(m) => m,
            other => panic!("input {input:?}: expected Unambiguous, got {other:?}"),
        };

        assert!(
            m.0.sar_markings.is_some(),
            "input {input:?}: expected SAR present in winning candidate"
        );

        let noforn_is_dissem = m
            .0
            .dissem_controls
            .iter()
            .any(|d| matches!(d, marque_ism::DissemControl::Nf));
        assert!(
            noforn_is_dissem,
            "input {input:?}: expected NOFORN (DissemControl::Nf) to land \
             as a dissem control (winning candidate must be the delim-\
             inserted form, not the absorbing one); got dissem_controls = \
             {:?}",
            m.0.dissem_controls,
        );

        let absorbed = absorbs_hard_splitter_in_sar_or_sci(&m);
        assert!(
            !absorbed,
            "input {input:?}: winning marking must not bury a hard \
             splitter inside SAR/SCI"
        );
    }
}
/// The strict parser accepts the nonsense banner `FROBNITZ//WIBBLE`, but the
/// resulting marking must fail `is_nontrivial_marking`.
#[test]
fn decoder_rejects_trivial_strict_parse() {
    let input: &[u8] = b"FROBNITZ//WIBBLE";

    let token_set = CapcoTokenSet;
    let parser = Parser::new(&token_set);
    // The candidate span covers the whole input (16 bytes).
    let candidate = MarkingCandidate {
        span: Span::new(0, input.len()),
        kind: MarkingType::Banner,
    };

    let parsed = parser
        .parse(&candidate, input)
        .expect("strict parser should accept arbitrary bytes");

    let nontrivial = is_nontrivial_marking(&CapcoMarking::new(parsed.attrs));
    assert!(!nontrivial, "empty marking must be filtered");
}
/// `SERCET//NOFORN` must decode unambiguously with a SECRET classification.
#[test]
fn decoder_recovers_typo_sercet_to_secret() {
    let outcome = DecoderRecognizer::new().recognize(b"SERCET//NOFORN", &deep_cx());
    match outcome {
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(
                level,
                Some(Classification::Secret),
                "expected SECRET classification from SERCET fuzzy-correction"
            );
        }
        other => panic!("expected Unambiguous(SECRET//NOFORN), got {other:?}"),
    }
}
/// All-lowercase `secret//noforn` must decode unambiguously to SECRET.
#[test]
fn decoder_recovers_case_mangled_input() {
    let outcome = DecoderRecognizer::new().recognize(b"secret//noforn", &deep_cx());
    match outcome {
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(level, Some(Classification::Secret));
        }
        other => panic!("expected Unambiguous, got {other:?}"),
    }
}
/// With `preceded_by_whitespace` set to `false`, single-letter portion marks
/// must yield an `Ambiguous` result with zero candidates.
#[test]
fn decoder_suppresses_prose_glue_single_letter_portion() {
    let rx = DecoderRecognizer::new();
    let glued = ParseContext {
        preceded_by_whitespace: false,
        ..deep_cx()
    };

    let inputs: [&[u8]; 5] = [b"(s)", b"(c)", b"(u)", b"(S)", b"(C)"];
    for &input in &inputs {
        // Printable form of the input, used only in failure messages.
        let shown = std::str::from_utf8(input).unwrap_or("<bytes>");
        match rx.recognize(input, &glued) {
            Parsed::Unambiguous(_) => panic!("{:?} glued to a word must not resolve", shown,),
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "{:?} glued to a word must produce zero candidates, got {}",
                shown,
                candidates.len(),
            ),
        }
    }
}
/// Lowercase `(s)` under the `deep_cx()` context must decode unambiguously to
/// SECRET.
#[test]
fn decoder_canonicalizes_single_letter_when_preceded_by_whitespace() {
    let outcome = DecoderRecognizer::new().recognize(b"(s)", &deep_cx());
    match outcome {
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(
                level,
                Some(Classification::Secret),
                "lowercase (s) with preceded_by_whitespace=true must \
                 canonicalize to SECRET via the case-fold path"
            );
        }
        other => panic!("expected Unambiguous resolution, got {other:?}"),
    }
}
/// Bare `(r)` must yield zero candidates both under `deep_cx()` and under the
/// same context with `preceded_by_whitespace` set to `false`.
#[test]
fn decoder_rejects_bare_restricted_via_recognizer_predicate() {
    let rx = DecoderRecognizer::new();
    let contexts = [
        deep_cx(),
        ParseContext {
            preceded_by_whitespace: false,
            ..deep_cx()
        },
    ];

    for cx in contexts.iter() {
        let outcome = rx.recognize(b"(r)", cx);
        match outcome {
            Parsed::Unambiguous(m) => panic!(
                "bare (r) must be rejected, got Unambiguous({:?})",
                m.0.classification
            ),
            Parsed::Ambiguous { candidates } => assert!(
                candidates.is_empty(),
                "bare (r) must be zero-candidate (preceded_by_whitespace={}), got {}",
                cx.preceded_by_whitespace,
                candidates.len()
            ),
        }
    }
}
/// `SECRET//COMINT//NOFORN` must decode unambiguously to SECRET with `Si`
/// present in `sci_controls`.
#[test]
fn decoder_recovers_superseded_comint_to_si() {
    let outcome = DecoderRecognizer::new().recognize(b"SECRET//COMINT//NOFORN", &deep_cx());
    match outcome {
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(level, Some(Classification::Secret));

            let has_si = m
                .0
                .sci_controls
                .iter()
                .any(|c| matches!(c, marque_ism::SciControl::Si));
            assert!(
                has_si,
                "expected SI in sci_controls after COMINT supersession"
            );
        }
        other => panic!("expected Unambiguous, got {other:?}"),
    }
}
/// `NOFORN//SECRET` must either decode unambiguously to SECRET or, if
/// ambiguous, surface at least one candidate.
#[test]
fn decoder_recovers_reordered_banner() {
    let outcome = DecoderRecognizer::new().recognize(b"NOFORN//SECRET", &deep_cx());
    match outcome {
        Parsed::Ambiguous { candidates } => {
            let surfaced = !candidates.is_empty();
            assert!(surfaced, "reordering should at least surface candidates");
        }
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(level, Some(Classification::Secret));
        }
    }
}
/// With `classification_floor` set to SECRET, the portion mark `(U)` must
/// yield an `Ambiguous` result with zero candidates.
#[test]
fn decoder_honors_classification_floor_fr011() {
    let floor = Classification::Secret as u8;
    let cx = ParseContext {
        strict_evidence: false,
        preceded_by_whitespace: true,
        classification_floor: Some(floor),
        zone: None,
        position: None,
        as_of: None,
    };

    let outcome = DecoderRecognizer::new().recognize(b"(U)", &cx);
    match outcome {
        Parsed::Unambiguous(m) => panic!(
            "expected zero-candidate, got Unambiguous({:?})",
            marking_classification(&m)
        ),
        Parsed::Ambiguous { candidates } => assert!(
            candidates.is_empty(),
            "UNCLASSIFIED below SECRET floor must produce zero candidates, got {}",
            candidates.len()
        ),
    }
}
/// With `classification_floor` set to CONFIDENTIAL, `(S//NF)` must still
/// decode unambiguously to SECRET.
#[test]
fn decoder_classification_floor_allows_equal_or_above() {
    let floor = Classification::Confidential as u8;
    let cx = ParseContext {
        strict_evidence: false,
        preceded_by_whitespace: true,
        classification_floor: Some(floor),
        zone: None,
        position: None,
        as_of: None,
    };

    let outcome = DecoderRecognizer::new().recognize(b"(S//NF)", &cx);
    match outcome {
        Parsed::Unambiguous(m) => {
            let level = marking_classification(&m);
            assert_eq!(level, Some(Classification::Secret));
        }
        other => panic!("expected Unambiguous, got {other:?}"),
    }
}
/// `normalize_delimiters_and_case` must turn `S ∕∕ NOFORN` into `S//NOFORN`
/// in the first element of its returned tuple.
#[test]
fn normalize_delimiters_collapses_garbled_slash() {
    let normalized = normalize_delimiters_and_case("S ∕∕ NOFORN").0;
    assert_eq!(normalized, "S//NOFORN");
}
/// `scan_token` must return 4 for `SI-G ABCD`, 5 for `HCS-P`, and 6 for
/// `SECRET//`.
#[test]
fn scan_token_captures_compound_with_hyphen() {
    let compound_then_space = scan_token("SI-G ABCD");
    assert_eq!(compound_then_space, 4);

    let compound_at_end = scan_token("HCS-P");
    assert_eq!(compound_at_end, 5);

    let word_then_slashes = scan_token("SECRET//");
    assert_eq!(word_then_slashes, 6);
}
/// `NOFORN//SECRET` must be reordered to `SECRET//NOFORN`.
#[test]
fn try_canonical_reorder_swaps_dissem_first_banner() {
    let reordered = try_canonical_reorder("NOFORN//SECRET");
    assert_eq!(reordered, Some(String::from("SECRET//NOFORN")));
}
/// `SECRET//NOFORN` must yield `None` from `try_canonical_reorder`.
#[test]
fn try_canonical_reorder_returns_none_when_already_canonical() {
    let reordered = try_canonical_reorder("SECRET//NOFORN");
    assert_eq!(reordered, None);
}
/// The SCI tokens `HCS`, `HCS-P`, `SI`, `SI-G` and `TK` must all classify as
/// `SegmentClass::Other`.
#[test]
fn classify_segment_treats_sci_as_other_not_dissem() {
    let expected = SegmentClass::Other;
    assert_eq!(classify_segment("HCS"), expected);
    assert_eq!(classify_segment("HCS-P"), expected);
    assert_eq!(classify_segment("SI"), expected);
    assert_eq!(classify_segment("SI-G"), expected);
    assert_eq!(classify_segment("TK"), expected);
}
/// Non-IC dissemination tokens, in both abbreviated and spelled-out form, must
/// classify as `SegmentClass::Dissem`.
#[test]
fn classify_segment_non_ic_dissem_tokens() {
    let expected = SegmentClass::Dissem;

    // Abbreviated forms.
    let abbreviations = [
        "DS", "XD", "ND", "SBU", "SBU-NF", "LES", "LES-NF", "SSI", "LIMDIS", "EXDIS", "NODIS",
    ];
    for tok in &abbreviations {
        assert_eq!(
            classify_segment(tok),
            expected,
            "classify_segment({tok:?}) should be Dissem"
        );
    }

    // Spelled-out forms.
    assert_eq!(classify_segment("LIMITED DISTRIBUTION"), expected);
    assert_eq!(classify_segment("EXCLUSIVE DISTRIBUTION"), expected);
    assert_eq!(classify_segment("NO DISTRIBUTION"), expected);
    assert_eq!(classify_segment("LAW ENFORCEMENT SENSITIVE"), expected);
    assert_eq!(classify_segment("SENSITIVE BUT UNCLASSIFIED"), expected);
    assert_eq!(classify_segment("SENSITIVE SECURITY INFORMATION"), expected);
}
/// `RESTRICTED DATA` and `RESTRICTED DATA-CNWDI` classify as `Other`, while
/// bare `RESTRICTED` classifies as `Classification`.
#[test]
fn classify_segment_restricted_data_is_not_classification() {
    let plain_rd = classify_segment("RESTRICTED DATA");
    assert_eq!(plain_rd, SegmentClass::Other);

    let rd_with_suffix = classify_segment("RESTRICTED DATA-CNWDI");
    assert_eq!(rd_with_suffix, SegmentClass::Other);

    let bare_restricted = classify_segment("RESTRICTED");
    assert_eq!(bare_restricted, SegmentClass::Classification);
}
/// `NOFORN//HCS-P//SECRET` must be reordered to `SECRET//HCS-P//NOFORN`.
#[test]
fn try_canonical_reorder_places_sci_between_classification_and_dissem() {
    let reordered = try_canonical_reorder("NOFORN//HCS-P//SECRET");
    assert_eq!(reordered, Some(String::from("SECRET//HCS-P//NOFORN")));
}
/// A marking decoded from `(U)` must fail a SECRET floor and pass an
/// UNCLASSIFIED floor.
#[test]
fn meets_classification_floor_rejects_below_floor() {
    let outcome = DecoderRecognizer::new().recognize(b"(U)", &deep_cx());
    let Parsed::Unambiguous(u_marking) = outcome else {
        panic!("(U) should decode to unambiguous UNCLASSIFIED");
    };

    let clears_secret = meets_classification_floor(&u_marking, Classification::Secret as u8);
    assert!(!clears_secret);

    let clears_unclassified =
        meets_classification_floor(&u_marking, Classification::Unclassified as u8);
    assert!(clears_unclassified);
}
/// `USAR-...` must be repaired to `SAR-...`, leaving the rest of the banner
/// untouched.
#[test]
fn sar_indicator_repair_strips_one_letter_prefix() {
    let garbled = "SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN";
    let expected = "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN";
    assert_eq!(
        try_sar_indicator_repair(garbled),
        Some(String::from(expected))
    );
}
/// Two- and three-letter prefixes (`ABSAR`, `XYZSAR`) must both be repaired to
/// `SAR`.
#[test]
fn sar_indicator_repair_strips_multi_letter_prefix() {
    let expected = Some(String::from("SECRET//SAR-BP//NOFORN"));

    let two_letter = try_sar_indicator_repair("SECRET//ABSAR-BP//NOFORN");
    assert_eq!(two_letter, expected);

    let three_letter = try_sar_indicator_repair("SECRET//XYZSAR-BP//NOFORN");
    assert_eq!(three_letter, expected);
}
/// The prefix is stripped even when it is `USA`, and a `(` boundary also
/// triggers the strip.
#[test]
fn sar_indicator_repair_strips_even_capco_token_prefix() {
    let after_slashes = try_sar_indicator_repair("SECRET//USASAR-BP//NOFORN");
    assert_eq!(
        after_slashes,
        Some(String::from("SECRET//SAR-BP//NOFORN")),
        "must strip USA at boundary even though USA is a trigraph",
    );

    let after_paren = try_sar_indicator_repair("(USAR-BP)");
    assert_eq!(
        after_paren,
        Some(String::from("(SAR-BP)")),
        "boundary `(` must also trigger the strip pass",
    );
}
/// `SARBP` must be repaired to `SAR-BP`.
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_two_char_id() {
    let repaired = try_sar_indicator_repair("TOP SECRET//SARBP//NOFORN");
    assert_eq!(repaired, Some(String::from("TOP SECRET//SAR-BP//NOFORN")));
}
/// `SARABC` must be repaired to `SAR-ABC`.
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_three_char_id() {
    let repaired = try_sar_indicator_repair("TOP SECRET//SARABC//NOFORN");
    assert_eq!(repaired, Some(String::from("TOP SECRET//SAR-ABC//NOFORN")));
}
/// `SARBP-J12 J54` must be repaired to `SAR-BP-J12 J54`.
#[test]
fn sar_indicator_repair_inserts_missing_hyphen_before_compound() {
    let repaired = try_sar_indicator_repair("SECRET//SARBP-J12 J54//NOFORN");
    assert_eq!(
        repaired,
        Some(String::from("SECRET//SAR-BP-J12 J54//NOFORN"))
    );
}
/// Already-canonical banners must yield `None` from
/// `try_sar_indicator_repair`.
#[test]
fn sar_indicator_repair_no_op_on_canonical() {
    let canonical = [
        "SECRET//SAR-BP//NOFORN",
        "SECRET//SAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
        "TOP SECRET//SPECIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
        "SECRET//NOFORN",
    ];
    for input in canonical.iter() {
        let repaired = try_sar_indicator_repair(input);
        assert_eq!(
            repaired, None,
            "canonical input {input:?} must not be repaired"
        );
    }
}
/// `SECRET//FOO-USAR-BP` must yield `None` from `try_sar_indicator_repair`.
#[test]
fn sar_indicator_repair_skips_non_boundary_sar() {
    let repaired = try_sar_indicator_repair("SECRET//FOO-USAR-BP");
    assert_eq!(
        repaired, None,
        "non-boundary SAR is not the indicator keyword"
    );
}
/// `SARABCD` (four characters after `SAR`) must yield `None` from
/// `try_sar_indicator_repair`.
#[test]
fn sar_indicator_repair_skips_long_alnum_run() {
    let repaired = try_sar_indicator_repair("SECRET//SARABCD//NOFORN");
    assert_eq!(
        repaired, None,
        "4-char alnum run violates §H.5 p100 2-3 char identifier"
    );
}
/// Inputs without a `SAR` substring, including the empty string, must yield
/// `None`.
#[test]
fn sar_indicator_repair_returns_none_when_no_sar_substring() {
    let sci_only = try_sar_indicator_repair("TOP SECRET//SI-G ABCD//NOFORN");
    assert_eq!(sci_only, None);

    let empty = try_sar_indicator_repair("");
    assert_eq!(empty, None);

    let classification_only = try_sar_indicator_repair("UNCLASSIFIED");
    assert_eq!(classification_only, None);
}
/// `match_sar_prefix` at offset 0 must return the expected pair for one-, two-
/// and three-letter prefixes ahead of `SAR-`.
#[test]
fn match_sar_prefix_detects_one_to_three_letter_prefix() {
    let one_letter = match_sar_prefix(b"USAR-BP", 0);
    assert_eq!(one_letter, Some((1, 5)));

    let two_letters = match_sar_prefix(b"ABSAR-BP", 0);
    assert_eq!(two_letters, Some((2, 6)));

    let three_letters = match_sar_prefix(b"XYZSAR-BP", 0);
    assert_eq!(three_letters, Some((3, 7)));
}
/// `match_sar_prefix` must return `None` for `SAR-BP`, `USAR` and `USARBP`.
#[test]
fn match_sar_prefix_rejects_no_prefix_or_no_sar() {
    let no_prefix = match_sar_prefix(b"SAR-BP", 0);
    assert_eq!(no_prefix, None);

    let nothing_after_sar = match_sar_prefix(b"USAR", 0);
    assert_eq!(nothing_after_sar, None);

    let no_hyphen = match_sar_prefix(b"USARBP", 0);
    assert_eq!(no_hyphen, None);
}
/// `match_sar_missing_hyphen` must accept two- and three-character
/// identifiers followed by `/`, a space, or end of input.
#[test]
fn match_sar_missing_hyphen_detects_2_3_char_id() {
    let before_slash = match_sar_missing_hyphen(b"SARBP/", 0);
    assert_eq!(before_slash, Some(5));

    let before_space = match_sar_missing_hyphen(b"SARABC ", 0);
    assert_eq!(before_space, Some(6));

    let at_end = match_sar_missing_hyphen(b"SARBP", 0);
    assert_eq!(at_end, Some(5));
}
/// `match_sar_missing_hyphen` must return `None` for `SAR-BP`, `SARABCD/` and
/// `SARB/`.
#[test]
fn match_sar_missing_hyphen_rejects_canonical_and_too_long() {
    let canonical = match_sar_missing_hyphen(b"SAR-BP", 0);
    assert_eq!(canonical, None);

    let four_chars = match_sar_missing_hyphen(b"SARABCD/", 0);
    assert_eq!(four_chars, None);

    let one_char = match_sar_missing_hyphen(b"SARB/", 0);
    assert_eq!(one_char, None);
}
/// `match_sar_missing_hyphen` must return `None` when the identifier is
/// followed by any of `)`, `,`, `;`, `*`, `=`, `.` or `?`.
#[test]
fn match_sar_missing_hyphen_rejects_non_delim_following_char() {
    let cases: &[&[u8]] = &[
        b"SARBP)",
        b"SARBP,",
        b"SARBP;",
        b"SARBP*",
        b"SARBP=",
        b"SARABC.",
        b"SARABC?",
    ];
    for &input in cases {
        let matched = match_sar_missing_hyphen(input, 0);
        assert_eq!(
            matched,
            None,
            "input {:?} has non-delim follower; helper must refuse repair",
            std::str::from_utf8(input).unwrap_or("<non-utf8>"),
        );
    }
}
/// `SECRET//SARBP)//NOFORN` must yield `None` from
/// `try_sar_indicator_repair`.
#[test]
fn sar_indicator_repair_skips_pattern_b_with_non_delim_follower() {
    let repaired = try_sar_indicator_repair("SECRET//SARBP)//NOFORN");
    assert_eq!(
        repaired, None,
        "Pattern B must refuse to fire when the post-alnum char isn't a delim",
    );
}
/// `AB/X/CD` must produce exactly three candidates, in this order: `X`
/// dropped, `X` joined to the right segment, `X` joined to the left segment.
#[test]
fn try_collapse_stray_char_slash_emits_three_transforms() {
    let result = try_collapse_stray_char_slash("AB/X/CD");
    assert_eq!(result.len(), 3, "expected 3 candidates; got {result:?}");

    // Candidate order is part of the contract being checked.
    assert_eq!(result[0], "AB//CD");
    assert_eq!(result[1], "AB//XCD");
    assert_eq!(result[2], "ABX//CD");
}
/// Inputs that do not contain a `/X/` pattern must produce no candidates.
#[test]
fn try_collapse_stray_char_slash_returns_empty_when_no_pattern() {
    let non_matching = [
        "SECRET",
        "SECRET//NOFORN",
        "SECRET//NOFORN//EXDIS",
        "(C)",
        "",
        "SECRET/AB/CD",
        "SECRET////NOFORN",
    ];
    for input in non_matching.iter() {
        let candidates = try_collapse_stray_char_slash(input);
        assert!(
            candidates.is_empty(),
            "input {input:?} should not match /X/ pattern",
        );
    }
}
/// `/X/Y` and `Y/X/` produce no candidates; `Y/X/Z` produces three.
#[test]
fn try_collapse_stray_char_slash_requires_alnum_boundary() {
    let nothing_before = try_collapse_stray_char_slash("/X/Y");
    assert!(nothing_before.is_empty());

    let nothing_after = try_collapse_stray_char_slash("Y/X/");
    assert!(nothing_after.is_empty());

    let both_sides = try_collapse_stray_char_slash("Y/X/Z");
    assert_eq!(both_sides.len(), 3, "alnum on both sides should match");
}
/// `REL OT` must be rewritten to `REL TO`.
#[test]
fn rel_to_header_normalize_fixes_rel_ot_transposition() {
    assert_eq!(
        try_rel_to_header_normalize("SECRET//REL OT USA, AUS, GBR").as_deref(),
        Some("SECRET//REL TO USA, AUS, GBR"),
        "REL OT must rewrite to REL TO at //-boundary",
    );
}
/// `RELT O` must be rewritten to `REL TO`.
#[test]
fn rel_to_header_normalize_fixes_relt_o_token_boundary() {
    assert_eq!(
        try_rel_to_header_normalize("SECRET//RELT O USA, AUS, GBR").as_deref(),
        Some("SECRET//REL TO USA, AUS, GBR"),
        "RELT O must rewrite to REL TO at //-boundary",
    );
}
/// A canonical `REL TO` header, a banner without `REL TO`, and the empty
/// string must all yield `None`.
#[test]
fn rel_to_header_normalize_returns_none_on_canonical() {
    let canonical = try_rel_to_header_normalize("SECRET//REL TO USA, AUS, GBR");
    assert!(canonical.is_none());

    let no_rel_to = try_rel_to_header_normalize("SECRET//NOFORN");
    assert!(no_rel_to.is_none());

    let empty = try_rel_to_header_normalize("");
    assert!(empty.is_none());
}
/// `XREL OT Y` and `SOMETHINGRELT O Y` must yield `None`.
#[test]
fn rel_to_header_normalize_requires_token_boundary() {
    let glued_rel_ot = try_rel_to_header_normalize("XREL OT Y");
    assert!(glued_rel_ot.is_none());

    let glued_relt_o = try_rel_to_header_normalize("SOMETHINGRELT O Y");
    assert!(glued_relt_o.is_none());
}
/// The split entry `A US` must be joined into `AUS`.
#[test]
fn rel_to_entry_normalize_joins_a_us_to_aus() {
    assert_eq!(
        try_rel_to_entry_normalize("SECRET//REL TO USA,A US, GBR").as_deref(),
        Some("SECRET//REL TO USA,AUS, GBR"),
        "A US should join to AUS when is_trigraph(AUS) holds",
    );
}
/// The misplaced comma in `AU,S` must be moved to give `AUS,`.
#[test]
fn rel_to_entry_normalize_swaps_au_comma_s_to_aus_comma() {
    assert_eq!(
        try_rel_to_entry_normalize("SECRET//REL TO USA, AU,S GBR").as_deref(),
        Some("SECRET//REL TO USA, AUS, GBR"),
        "AU,S should swap to AUS, when is_trigraph(AUS) holds and AU is not a trigraph",
    );
}
/// A `REL TO` list containing the entry `EU` must yield `None`.
#[test]
fn rel_to_entry_normalize_does_not_corrupt_eu_comma_pattern() {
    let untouched = try_rel_to_entry_normalize("SECRET//REL TO USA, EU, GBR").is_none();
    assert!(untouched, "canonical EU entry must round-trip unchanged",);
}
/// A banner without `REL TO` and the empty string must both yield `None`.
#[test]
fn rel_to_entry_normalize_returns_none_outside_rel_to() {
    let no_rel_to = try_rel_to_entry_normalize("SECRET//SI/TK//NOFORN");
    assert!(no_rel_to.is_none());

    let empty = try_rel_to_entry_normalize("");
    assert!(empty.is_none());
}
/// `try_rel_to_structural_repair` must yield `None` for `SECRET//NOFORN`,
/// `(C)` and the empty string.
#[test]
fn rel_to_structural_repair_short_circuits_without_rel() {
    let banner = try_rel_to_structural_repair("SECRET//NOFORN");
    assert!(banner.is_none());

    let portion = try_rel_to_structural_repair("(C)");
    assert!(portion.is_none());

    let empty = try_rel_to_structural_repair("");
    assert!(empty.is_none());
}
/// `HCSP` must be repaired to `HCS-P`.
#[test]
fn sci_delimiter_repair_concatenated_compound_hcsp() {
    assert_eq!(
        try_sci_delimiter_repair("SECRET//HCSP//NOFORN").as_deref(),
        Some("SECRET//HCS-P//NOFORN"),
        "HCSP must rewrite to HCS-P (CVE-registered compound)",
    );
}
/// `HCSO` must be repaired to `HCS-O`.
#[test]
fn sci_delimiter_repair_concatenated_compound_hcso() {
    let repaired = try_sci_delimiter_repair("SECRET//HCSO//NOFORN");
    let expected = Some("SECRET//HCS-O//NOFORN");
    assert_eq!(repaired.as_deref(), expected);
}
/// `SIG` must be repaired to `SI-G`.
#[test]
fn sci_delimiter_repair_concatenated_compound_sig() {
    let repaired = try_sci_delimiter_repair("SECRET//SIG//NOFORN");
    let expected = Some("SECRET//SI-G//NOFORN");
    assert_eq!(repaired.as_deref(), expected);
}
/// `TKKAND` must be repaired to `TK-KAND`.
#[test]
fn sci_delimiter_repair_concatenated_compound_tkkand() {
    let repaired = try_sci_delimiter_repair("SECRET//TKKAND//NOFORN");
    let expected = Some("SECRET//TK-KAND//NOFORN");
    assert_eq!(repaired.as_deref(), expected);
}
/// `BURBLG`, `BURDTP` and `BURWRG` must each be repaired to the hyphenated
/// `BUR-` compound.
#[test]
fn sci_delimiter_repair_schema_coverage_bur_compounds() {
    let blg = try_sci_delimiter_repair("SECRET//BURBLG//NOFORN");
    assert_eq!(blg.as_deref(), Some("SECRET//BUR-BLG//NOFORN"),);

    let dtp = try_sci_delimiter_repair("SECRET//BURDTP//NOFORN");
    assert_eq!(dtp.as_deref(), Some("SECRET//BUR-DTP//NOFORN"),);

    let wrg = try_sci_delimiter_repair("SECRET//BURWRG//NOFORN");
    assert_eq!(wrg.as_deref(), Some("SECRET//BUR-WRG//NOFORN"),);
}
/// `SITK` must be repaired to `SI/TK`.
#[test]
fn sci_delimiter_repair_missing_slash_sitk() {
    assert_eq!(
        try_sci_delimiter_repair("SECRET//SITK//NOFORN").as_deref(),
        Some("SECRET//SI/TK//NOFORN"),
        "SITK must rewrite to SI/TK (two bare control systems concatenated)",
    );
}
/// `HCSSI` must be repaired to `HCS/SI`.
#[test]
fn sci_delimiter_repair_missing_slash_hcssi() {
    let repaired = try_sci_delimiter_repair("SECRET//HCSSI//NOFORN");
    let expected = Some("SECRET//HCS/SI//NOFORN");
    assert_eq!(repaired.as_deref(), expected);
}
/// `SI-TK` must be repaired to `SI/TK`.
#[test]
fn sci_delimiter_repair_wrong_delimiter_si_dash_tk() {
    assert_eq!(
        try_sci_delimiter_repair("SECRET//SI-TK//NOFORN").as_deref(),
        Some("SECRET//SI/TK//NOFORN"),
        "SI-TK must rewrite to SI/TK (two bare CS, `-` is for control-compartment)",
    );
}
/// The hyphenated compounds `SI-G`, `HCS-P` and `TK-KAND` must yield `None`.
#[test]
fn sci_delimiter_repair_leaves_registered_compound_alone() {
    let si_g = try_sci_delimiter_repair("SECRET//SI-G//NOFORN");
    assert!(si_g.is_none());

    let hcs_p = try_sci_delimiter_repair("SECRET//HCS-P//NOFORN");
    assert!(hcs_p.is_none());

    let tk_kand = try_sci_delimiter_repair("SECRET//TK-KAND//NOFORN");
    assert!(tk_kand.is_none());
}
/// Canonical SCI banners, a banner with no SCI, and the empty string must all
/// yield `None`.
#[test]
fn sci_delimiter_repair_returns_none_on_canonical() {
    let two_systems = try_sci_delimiter_repair("SECRET//SI/TK//NOFORN");
    assert!(two_systems.is_none());

    let one_system = try_sci_delimiter_repair("SECRET//SI//NOFORN");
    assert!(one_system.is_none());

    let no_sci = try_sci_delimiter_repair("SECRET//NOFORN");
    assert!(no_sci.is_none());

    let empty = try_sci_delimiter_repair("");
    assert!(empty.is_none());
}
/// The words `SIGMA`, `SITE` and `SIGNAL` must yield `None`.
#[test]
fn sci_delimiter_repair_does_not_fire_on_word_substring() {
    let sigma = try_sci_delimiter_repair("SIGMA");
    assert!(sigma.is_none());

    let site = try_sci_delimiter_repair("SITE");
    assert!(site.is_none());

    let signal = try_sci_delimiter_repair("SIGNAL");
    assert!(signal.is_none());
}
/// `CONFIDENTIAL//NOFORN`, `(C)` and the empty string must yield `None`.
#[test]
fn sci_delimiter_repair_short_circuits_without_sci_root() {
    let banner = try_sci_delimiter_repair("CONFIDENTIAL//NOFORN");
    assert!(banner.is_none());

    let portion = try_sci_delimiter_repair("(C)");
    assert!(portion.is_none());

    let empty = try_sci_delimiter_repair("");
    assert!(empty.is_none());
}
/// Non-ASCII input must return `None` from both `try_sci_delimiter_repair`
/// and `repair_sci_token` rather than panicking.
#[test]
fn sci_delimiter_repair_does_not_panic_on_non_ascii() {
    // Whole-marking entry point.
    let trailing_kanji = try_sci_delimiter_repair("SECRET//SI/TK//日本語");
    assert!(trailing_kanji.is_none());

    let leading_omega = try_sci_delimiter_repair("Ω SI TK");
    assert!(leading_omega.is_none());

    let all_kana = try_sci_delimiter_repair("こんにちは");
    assert!(all_kana.is_none());

    // Single-token helper.
    let mixed_token = repair_sci_token("SI日");
    assert!(mixed_token.is_none());

    let kanji_token = repair_sci_token("日本");
    assert!(kanji_token.is_none());
}
/// `repair_sci_token` must return `None` for `HCSI`, `ABCDE` and `BUR`.
#[test]
fn repair_sci_token_rejects_partial_decompositions() {
    let hcsi = repair_sci_token("HCSI");
    assert!(hcsi.is_none());

    let abcde = repair_sci_token("ABCDE");
    assert!(abcde.is_none());

    let bur = repair_sci_token("BUR");
    assert!(bur.is_none());
}
/// With two `/X/`-style runs in the input, only the first is transformed; the
/// trailing `/Y/C` is left as-is in all three candidates.
#[test]
fn try_collapse_stray_char_slash_processes_only_first_match() {
    let result = try_collapse_stray_char_slash("A/X/B/Y/C");
    assert_eq!(result.len(), 3);

    assert_eq!(result[0], "A//B/Y/C");
    assert_eq!(result[1], "A//XB/Y/C");
    assert_eq!(result[2], "AX//B/Y/C");
}
/// `SECRET//NOFORN/R/EXDIS` must decode unambiguously to SECRET with both
/// NOFORN and EXDIS present.
#[test]
fn decoder_recovers_drop_stray_char() {
    let outcome = DecoderRecognizer::new().recognize(b"SECRET//NOFORN/R/EXDIS", &deep_cx());
    let Parsed::Unambiguous(marking) = outcome else {
        panic!("`/R/` between NOFORN and EXDIS must resolve via drop-X");
    };
    let attrs = &marking.0;

    let level = attrs.classification.as_ref().map(|c| c.effective_level());
    assert_eq!(level, Some(Classification::Secret),);

    let has_noforn = attrs
        .dissem_controls
        .iter()
        .any(|d| matches!(d, marque_ism::DissemControl::Nf));
    assert!(has_noforn, "NOFORN must survive; attrs = {:?}", attrs,);

    let has_exdis = attrs
        .non_ic_dissem
        .iter()
        .any(|d| matches!(d, marque_ism::NonIcDissem::Exdis));
    assert!(has_exdis, "EXDIS must survive; attrs = {:?}", attrs,);
}
/// `TOP SECRET//SI/N/OFORN` must decode unambiguously to TOP SECRET with SI
/// and NOFORN present.
#[test]
fn decoder_recovers_right_attach_stray_char() {
    let outcome = DecoderRecognizer::new().recognize(b"TOP SECRET//SI/N/OFORN", &deep_cx());
    let Parsed::Unambiguous(marking) = outcome else {
        panic!("`/N/` before OFORN must resolve via right-attach");
    };
    let attrs = &marking.0;

    let level = attrs.classification.as_ref().map(|c| c.effective_level());
    assert_eq!(level, Some(Classification::TopSecret),);

    let has_si = attrs
        .sci_controls
        .iter()
        .any(|c| matches!(c, marque_ism::SciControl::Si));
    assert!(has_si, "SI must survive; attrs = {:?}", attrs,);

    let has_noforn = attrs
        .dissem_controls
        .iter()
        .any(|d| matches!(d, marque_ism::DissemControl::Nf));
    assert!(
        has_noforn,
        "NOFORN must be reconstructed; attrs = {:?}",
        attrs,
    );
}
/// `SECRE/T/REL TO USA, AUS, GBR` must decode unambiguously to SECRET with
/// three `rel_to` entries.
#[test]
fn decoder_recovers_left_attach_stray_char() {
    let outcome =
        DecoderRecognizer::new().recognize(b"SECRE/T/REL TO USA, AUS, GBR", &deep_cx());
    let Parsed::Unambiguous(marking) = outcome else {
        panic!("`/T/` after SECRE must resolve via left-attach");
    };
    let attrs = &marking.0;

    let level = attrs.classification.as_ref().map(|c| c.effective_level());
    assert_eq!(level, Some(Classification::Secret),);

    let releasable_to = attrs.rel_to.len();
    assert_eq!(
        releasable_to, 3,
        "REL TO must carry 3 trigraphs (USA, AUS, GBR); attrs = {:?}",
        attrs,
    );
}
/// A banner whose SAR indicator is written `USAR` must decode unambiguously
/// to SECRET with a SAR block and NOFORN present.
#[test]
fn decoder_recovers_usar_prefix_via_sar_indicator_repair() {
    let outcome = DecoderRecognizer::new().recognize(
        b"SECRET//USAR-BP-J12 J54-K15/CD-YYY 456 689/XR-XRA RB//NOFORN",
        &deep_cx(),
    );
    let Parsed::Unambiguous(marking) = outcome else {
        panic!("USAR-BP-... must resolve via SAR indicator repair");
    };
    let attrs = &marking.0;

    let level = attrs.classification.as_ref().map(|c| c.effective_level());
    assert_eq!(level, Some(Classification::Secret),);

    assert!(
        attrs.sar_markings.is_some(),
        "SAR block must be present after USAR→SAR repair; attrs = {:?}",
        attrs,
    );

    let has_noforn = attrs
        .dissem_controls
        .iter()
        .any(|d| matches!(d, marque_ism::DissemControl::Nf));
    assert!(has_noforn, "NOFORN must survive; attrs = {:?}", attrs,);
}
/// `TOP SECRET//SARBP//NOFORN` must decode unambiguously to TOP SECRET with
/// exactly one SAR program, identified as `BP`.
#[test]
fn decoder_recovers_sarbp_missing_hyphen_via_sar_indicator_repair() {
    let outcome = DecoderRecognizer::new().recognize(b"TOP SECRET//SARBP//NOFORN", &deep_cx());
    let Parsed::Unambiguous(marking) = outcome else {
        panic!("SARBP must resolve via SAR indicator repair");
    };
    let attrs = &marking.0;

    let level = attrs.classification.as_ref().map(|c| c.effective_level());
    assert_eq!(level, Some(Classification::TopSecret),);

    let sar = attrs
        .sar_markings
        .as_ref()
        .expect("SAR block must be present");
    // The length check comes first so the index below cannot go out of bounds.
    assert_eq!(sar.programs.len(), 1, "exactly one program; got {sar:?}");
    assert_eq!(
        &*sar.programs[0].identifier,
        "BP",
        "program identifier must be `BP` after hyphen insertion; got {sar:?}",
    );
}
/// A banner whose full-form SAR indicator is misspelled `SPCIAL ACCESS
/// REQUIRED` must decode unambiguously to TOP SECRET, with a SAR block whose
/// first program identifier is `BUTTER POPCORN`.
#[test]
fn decoder_recovers_spcial_via_extended_correction_vocab() {
    let rx = DecoderRecognizer::new();
    let Parsed::Unambiguous(marking) = rx.recognize(
        b"TOP SECRET//SPCIAL ACCESS REQUIRED-BUTTER POPCORN//NOFORN",
        &deep_cx(),
    ) else {
        panic!("SPCIAL must fuzzy-correct to SPECIAL");
    };
    assert_eq!(
        marking
            .0
            .classification
            .as_ref()
            .map(|c| c.effective_level()),
        Some(Classification::TopSecret),
    );
    let sar = marking
        .0
        .sar_markings
        .as_ref()
        .expect("SAR block must be present");
    // Fetch the first program with an explicit expectation instead of a raw
    // `[0]` index, so an empty program list fails with a descriptive message
    // rather than an index-out-of-bounds panic.
    let program = sar
        .programs
        .first()
        .expect("SAR block must carry at least one program");
    assert_eq!(
        &*program.identifier,
        "BUTTER POPCORN",
        "Full-form program identifier must round-trip; got {sar:?}",
    );
}
}