use std::collections::BTreeMap;
use std::path::Path;
use std::sync::Arc;
use blake3::Hasher;
use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Serialize};
use crate::interfaces::{Compressed, Encoder, FallbackReason, Format, Measurer, Model};
/// Names of every rule known to the encoder, in ascending alphabetical order.
///
/// `RuleSet::default_v1` seeds its `enabled`/`weights` maps from this list and
/// `RuleSet::from_toml_str` discards any key that is not listed here. The order
/// is the same as the one used by `EncoderTrace::as_pairs` and the
/// `EncoderTrace::IDX_*` constants.
pub const RULE_NAMES: &[&str] = &[
"and",
"ansi_stripped",
"arrow",
"blank_lines",
"failure",
"filler_removed",
"if_prefix",
"json_minified",
"json_records_table",
"numeric_range_lines",
"repeated_chunk_dict",
"repeated_lines",
"success",
"term_substitutions",
"tool_schema_semantic_table",
"trailing_ws",
"vs",
];
/// Inputs with fewer characters (`char`s, not bytes) than this are passed through
/// unchanged by `SymbolicEncoder::compress_traced_with`, tagged `Uncompressible`.
pub const MIN_INPUT_CHARS: usize = 32;
/// Inputs with more characters (`char`s, not bytes) than this are passed through
/// unchanged by `SymbolicEncoder::compress_traced_with`, tagged `OversizedInput`.
pub const MAX_INPUT_CHARS: usize = 256 * 1024;
/// `(phrase, abbreviation)` pairs used by the `term_substitutions` rule.
///
/// The order of this table is not significant: `SUB_RULES` re-sorts the pairs
/// longest-phrase-first before compiling them, so "authentication module" is
/// tried before "authentication". Phrases are matched case-insensitively and
/// only at word boundaries (see `SUB_RULES`).
///
/// Several phrases deliberately share an abbreviation (e.g. "rate limiting",
/// "rate limiter" and "rate limit" all become `RL`), so the mapping is not
/// reversible.
const TERM_SUBSTITUTIONS: &[(&str, &str)] = &[
("post-tool authorization check", "PTA"),
("post-tool authorization", "PTA"),
("policy engine", "PE"),
("session store", "SS"),
("failure store", "FS"),
("response pipeline", "RP"),
("rate limiting", "RL"),
("rate limiter", "RL"),
("rate limit", "RL"),
("authentication module", "A.mod"),
("authorization module", "Z.mod"),
("authentication service", "A.svc"),
("authorization service", "Z.svc"),
("authentication", "A"),
("authorization", "Z"),
("authenticate", "A"),
("authorize", "Z"),
("authenticated", "A'd"),
("authorized", "Z'd"),
("handler", "H"),
("request", "R"),
("response", "Rp"),
("permissions", "P"),
("permission", "P"),
("telemetry", "T"),
("validate", "V"),
("validates", "V"),
("validated", "V'd"),
("validation", "V"),
("database", "DB"),
("JSON", "J"),
("bearer token", "BT"),
("principal", "Pr"),
("resource", "Rs"),
("operation", "Op"),
("configuration file", "Cf"),
("environment variable", "Env"),
("integration test", "IT"),
("regular expression", "RE"),
("working directory", "WD"),
("breaking change", "BC"),
("circuit breaker", "CiB"),
("pattern matching", "PM"),
("race condition", "RC"),
("type checking", "Typ"),
("command line", "CL"),
("content block", "CB"),
("error message", "EM"),
("feature flag", "FF"),
("function call", "FC"),
("kill switch", "KS"),
("merge request", "MR"),
("pull request", "PR"),
("stack trace", "ST"),
("system prompt", "SP"),
("tool result", "TR"),
("user prompt", "UP"),
("code review", "CR"),
("tool call", "TC"),
("tool use", "TU"),
("unit test", "UT"),
];
/// Lower-case words dropped by the `filler_removed` rule.
///
/// `is_filler` compares a token against this list after trimming `. , ; :` from
/// both ends and lower-casing it, so every entry here must be lower-case ASCII.
/// Negations and hedges ("not", "never", "maybe", ...) are intentionally absent;
/// the `polarity_bearing_words_are_not_filler` unit test in this file enforces that.
const FILLER_WORDS: &[&str] = &[
"the", "a", "an", "of", "to", "in", "on", "at", "by", "with", "from", "is", "are", "was",
"were", "be", "been", "being", "that", "this", "these", "those", "it", "its", "as", "then",
"which", "who", "whom", "whose", "each", "any", "some", "all", "also", "such", "into", "onto",
"for", "about", "around", "over", "through", "during", "within", "per", "via",
"just", "only", "very", "quite", "really", "actually", "simply",
];
/// Compiled matchers for [`TERM_SUBSTITUTIONS`], built on first use.
///
/// Each entry pairs a case-insensitive, word-bounded regex for the phrase with
/// the abbreviation that replaces it. Entries are ordered longest phrase first
/// so that a longer phrase is rewritten before any shorter phrase it contains.
static SUB_RULES: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(|| {
    let mut ordered = TERM_SUBSTITUTIONS.to_vec();
    // Stable sort, descending by phrase length: equally long phrases keep
    // their relative order from the table.
    ordered.sort_by(|lhs, rhs| rhs.0.len().cmp(&lhs.0.len()));

    let mut compiled = Vec::with_capacity(ordered.len());
    for (phrase, abbrev) in ordered {
        let pattern = format!(r"\b{}\b", regex::escape(phrase));
        let matcher = RegexBuilder::new(&pattern)
            .case_insensitive(true)
            .build()
            .expect("static substitution pattern");
        compiled.push((matcher, abbrev));
    }
    compiled
});
// Lazily compiled patterns for the single-regex rules applied by
// `encode_symbolic_traced_with`. All but the last two are case-insensitive.

/// The word "if" together with the whitespace that follows it; the `if_prefix`
/// rule deletes each match.
static IF_PREFIX: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\bif\b\s+")
.case_insensitive(true)
.build()
.expect("if-prefix")
});
/// Success vocabulary; the `success` rule replaces each match with U+2713.
static SUCCESS: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\b(succeeds?|ok|success|grants? access|grants?)\b")
.case_insensitive(true)
.build()
.expect("success")
});
/// Failure vocabulary; the `failure` rule replaces each match with U+2717.
static FAILURE: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\b(fails?|failure|failed)\b")
.case_insensitive(true)
.build()
.expect("failure")
});
/// "Hands off to" verbs; the `arrow` rule replaces each match with U+2192.
/// Several alternatives include a trailing "to", which is itself a filler word,
/// so this pattern has to run before filler removal to see them.
static ARROW: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\b(returns?|forwarded? to|forwards? to|sends? to|invokes?|invoked)\b")
.case_insensitive(true)
.build()
.expect("arrow")
});
/// Comparison words; the `vs` rule replaces each match with the literal `vs`.
/// Because `vs` is one of the alternatives, text that already says "vs" is
/// matched (and counted) too, even though the replacement leaves it unchanged.
static VS: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\b(against|versus|vs\.?)\b")
.case_insensitive(true)
.build()
.expect("vs")
});
/// Conjunctions; the `and` rule replaces each match with `+`.
static AND: Lazy<Regex> = Lazy::new(|| {
RegexBuilder::new(r"\b(and|plus)\b")
.case_insensitive(true)
.build()
.expect("and")
});
/// Whitespace that precedes one of `. , ; :` or the glyphs U+2192 / U+2713 /
/// U+2717, plus any whitespace after it. Used in the final tidy-up to rewrite
/// the match as the captured character followed by a single space.
static PUNCT_GAP: Lazy<Regex> =
Lazy::new(|| Regex::new(r"\s+([.,;:\u{2192}\u{2713}\u{2717}])\s*").expect("punct-gap"));
/// Any run of whitespace; the final tidy-up collapses each run to one space.
static MULTI_WS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").expect("multi-ws"));
/// Reports whether `s` looks like Markdown-structured text rather than flat prose.
///
/// Returns `true` as soon as any of the following is found:
/// * a blank-line paragraph break (`"\n\n"`),
/// * a code fence (three backticks or `~~~`) at the start of the text or
///   directly after a newline,
/// * a line that, after leading whitespace, opens with an ATX heading
///   (`#` to `######` followed by a space), a bullet (`- `, `* `, `+ ` with
///   text after it), a numbered item (1-3 digits then `. ` or `) `), a
///   blockquote (`> `), or a table row (starts with `|` and has at least two
///   `|` characters).
pub(crate) fn has_structural_markers(s: &str) -> bool {
    /// Line-level half of the check; `line` is one entry of `str::lines`.
    fn line_is_structural(line: &str) -> bool {
        let body = line.trim_start();

        // ATX heading: one to six hashes, then a space.
        let after_hashes = body.trim_start_matches('#');
        let hash_count = body.len() - after_hashes.len();
        if (1..=6).contains(&hash_count) && after_hashes.starts_with(' ') {
            return true;
        }

        // Bullet with a non-empty item body.
        let is_bullet = ["- ", "* ", "+ "].iter().any(|marker| {
            body.strip_prefix(*marker)
                .map_or(false, |item| !item.is_empty())
        });
        if is_bullet {
            return true;
        }

        // Numbered item: ASCII digits are one byte each, so the count is also
        // a valid byte offset into `body`.
        let digit_count = body.bytes().take_while(u8::is_ascii_digit).count();
        if (1..=3).contains(&digit_count) {
            let tail = &body[digit_count..];
            if tail.starts_with(". ") || tail.starts_with(") ") {
                return true;
            }
        }

        // Blockquote or table row.
        body.starts_with("> ") || (body.starts_with('|') && body.matches('|').count() >= 2)
    }

    if s.contains("\n\n") {
        return true;
    }
    let fenced = [("\n```", "```"), ("\n~~~", "~~~")]
        .iter()
        .any(|(after_newline, at_start)| s.contains(*after_newline) || s.starts_with(*at_start));
    if fenced {
        return true;
    }
    s.lines().any(line_is_structural)
}
/// Trims any run of `.`, `,`, `;` or `:` from both ends of `word`.
///
/// Punctuation inside the word (as in `A.mod`) is left alone.
fn strip_punct(word: &str) -> &str {
    const EDGE_PUNCT: &[char] = &['.', ',', ';', ':'];
    word.trim_matches(EDGE_PUNCT)
}
fn is_filler(word: &str) -> bool {
let stripped = strip_punct(word).to_ascii_lowercase();
FILLER_WORDS.iter().any(|w| *w == stripped)
}
/// Per-rule statistics gathered while encoding one input.
///
/// Each `u32` field counts how often the rule of the same name fired. The
/// prose rules (`term_substitutions`, `if_prefix`, `success`, `failure`,
/// `arrow`, `vs`, `and`, `filler_removed`) are filled in by
/// `encode_symbolic_traced_with`. The remaining counters are not written by any
/// code in this part of the file; presumably other codecs populate them.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EncoderTrace {
    pub term_substitutions: u32,
    pub if_prefix: u32,
    pub success: u32,
    pub failure: u32,
    pub arrow: u32,
    pub vs: u32,
    pub and: u32,
    pub filler_removed: u32,
    pub ansi_stripped: u32,
    pub trailing_ws: u32,
    pub blank_lines: u32,
    pub json_minified: u32,
    pub json_records_table: u32,
    pub numeric_range_lines: u32,
    pub repeated_chunk_dict: u32,
    pub repeated_lines: u32,
    pub tool_schema_semantic_table: u32,
    /// Bytes removed by each rule, indexed by the `IDX_*` constants
    /// (alphabetical rule order, the same order as [`EncoderTrace::as_pairs`]).
    pub bytes_saved: [u64; 17],
}

impl EncoderTrace {
    // Slots of `bytes_saved`, one per rule, in alphabetical rule order.
    pub const IDX_AND: usize = 0;
    pub const IDX_ANSI_STRIPPED: usize = 1;
    pub const IDX_ARROW: usize = 2;
    pub const IDX_BLANK_LINES: usize = 3;
    pub const IDX_FAILURE: usize = 4;
    pub const IDX_FILLER_REMOVED: usize = 5;
    pub const IDX_IF_PREFIX: usize = 6;
    pub const IDX_JSON_MINIFIED: usize = 7;
    pub const IDX_JSON_RECORDS_TABLE: usize = 8;
    pub const IDX_NUMERIC_RANGE_LINES: usize = 9;
    pub const IDX_REPEATED_CHUNK_DICT: usize = 10;
    pub const IDX_REPEATED_LINES: usize = 11;
    pub const IDX_SUCCESS: usize = 12;
    pub const IDX_TERM_SUBSTITUTIONS: usize = 13;
    pub const IDX_TOOL_SCHEMA_SEMANTIC_TABLE: usize = 14;
    pub const IDX_TRAILING_WS: usize = 15;
    pub const IDX_VS: usize = 16;

    /// Returns `true` when at least one rule counter is non-zero.
    ///
    /// Counters are tested individually rather than summed: they can
    /// legitimately sit at `u32::MAX` (counts are clamped and `merge`
    /// saturates), and a plain sum would then overflow.
    #[must_use]
    pub fn any_fired(&self) -> bool {
        self.as_pairs().iter().any(|&(_, count)| count > 0)
    }

    /// Returns `(rule name, bytes saved)` for every rule, in the same order as
    /// [`EncoderTrace::as_pairs`].
    #[must_use]
    pub fn bytes_saved_pairs(&self) -> [(&'static str, u64); 17] {
        let counters = self.as_pairs();
        let mut out = [("", 0u64); 17];
        for ((slot, &(name, _)), &saved) in out
            .iter_mut()
            .zip(counters.iter())
            .zip(self.bytes_saved.iter())
        {
            *slot = (name, saved);
        }
        out
    }

    /// Returns `(rule name, fire count)` for every rule, sorted by rule name.
    #[must_use]
    pub fn as_pairs(&self) -> [(&'static str, u32); 17] {
        [
            ("and", self.and),
            ("ansi_stripped", self.ansi_stripped),
            ("arrow", self.arrow),
            ("blank_lines", self.blank_lines),
            ("failure", self.failure),
            ("filler_removed", self.filler_removed),
            ("if_prefix", self.if_prefix),
            ("json_minified", self.json_minified),
            ("json_records_table", self.json_records_table),
            ("numeric_range_lines", self.numeric_range_lines),
            ("repeated_chunk_dict", self.repeated_chunk_dict),
            ("repeated_lines", self.repeated_lines),
            ("success", self.success),
            ("term_substitutions", self.term_substitutions),
            (
                "tool_schema_semantic_table",
                self.tool_schema_semantic_table,
            ),
            ("trailing_ws", self.trailing_ws),
            ("vs", self.vs),
        ]
    }

    /// Adds every counter and every `bytes_saved` slot of `other` into `self`,
    /// saturating at the integer maximum instead of overflowing.
    pub fn merge(&mut self, other: EncoderTrace) {
        self.and = self.and.saturating_add(other.and);
        self.ansi_stripped = self.ansi_stripped.saturating_add(other.ansi_stripped);
        self.arrow = self.arrow.saturating_add(other.arrow);
        self.blank_lines = self.blank_lines.saturating_add(other.blank_lines);
        self.failure = self.failure.saturating_add(other.failure);
        self.filler_removed = self.filler_removed.saturating_add(other.filler_removed);
        self.if_prefix = self.if_prefix.saturating_add(other.if_prefix);
        self.json_minified = self.json_minified.saturating_add(other.json_minified);
        self.json_records_table = self
            .json_records_table
            .saturating_add(other.json_records_table);
        self.numeric_range_lines = self
            .numeric_range_lines
            .saturating_add(other.numeric_range_lines);
        self.repeated_chunk_dict = self
            .repeated_chunk_dict
            .saturating_add(other.repeated_chunk_dict);
        self.repeated_lines = self.repeated_lines.saturating_add(other.repeated_lines);
        self.success = self.success.saturating_add(other.success);
        self.term_substitutions = self
            .term_substitutions
            .saturating_add(other.term_substitutions);
        self.tool_schema_semantic_table = self
            .tool_schema_semantic_table
            .saturating_add(other.tool_schema_semantic_table);
        self.trailing_ws = self.trailing_ws.saturating_add(other.trailing_ws);
        self.vs = self.vs.saturating_add(other.vs);
        for (mine, theirs) in self.bytes_saved.iter_mut().zip(other.bytes_saved.iter()) {
            *mine = mine.saturating_add(*theirs);
        }
    }
}
/// A rule whose weight is strictly below this value is treated as disabled by
/// `RuleSet::is_enabled`, regardless of its `enabled` flag.
pub const ENABLE_WEIGHT_THRESHOLD: f32 = 0.05;
/// Per-rule switches and weights that decide which encoder rules run.
///
/// Every field is `#[serde(default)]`, so a serialized rule set may omit any
/// of them; a missing map deserializes as empty and a missing version as `None`.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuleSet {
/// Explicit on/off flag per rule name. A rule with no entry is not disabled
/// by this map (see `RuleSet::is_enabled`).
#[serde(default)]
pub enabled: BTreeMap<String, bool>,
/// Weight per rule name. `RuleSet::weight` reports 1.0 for a rule with no entry.
#[serde(default)]
pub weights: BTreeMap<String, f32>,
/// Free-form label identifying this rule set (the built-in constructors set
/// values such as "v1").
#[serde(default)]
pub version: Option<String>,
}
impl RuleSet {
    /// Builds the stock "v1" rule set.
    ///
    /// Every name in [`RULE_NAMES`] gets an entry. The four table/dictionary
    /// codecs and the `success`/`failure` glyph rules are switched off with
    /// weight 0.0; everything else is on with weight 1.0.
    #[must_use]
    pub fn default_v1() -> Self {
        const OFF_BY_DEFAULT: [&str; 6] = [
            "json_records_table",
            "numeric_range_lines",
            "repeated_chunk_dict",
            "tool_schema_semantic_table",
            "success",
            "failure",
        ];

        let mut enabled = BTreeMap::new();
        let mut weights = BTreeMap::new();
        for &rule in RULE_NAMES {
            let on = !OFF_BY_DEFAULT.contains(&rule);
            enabled.insert(rule.to_owned(), on);
            weights.insert(rule.to_owned(), if on { 1.0 } else { 0.0 });
        }

        let version = Some("v1".to_owned());
        Self {
            enabled,
            weights,
            version,
        }
    }

    /// [`RuleSet::default_v1`] with `success` and `failure` forced off and a
    /// distinct version label.
    #[must_use]
    pub fn safe_canary_v1() -> Self {
        let mut rules = Self::default_v1();
        for rule in ["success", "failure"].iter() {
            rules.enabled.insert((*rule).to_owned(), false);
            rules.weights.insert((*rule).to_owned(), 0.0);
        }
        rules.version = Some("safe-canary-v1-no-success-failure".to_owned());
        rules
    }

    /// [`RuleSet::safe_canary_v1`] with the four table/dictionary codecs
    /// switched on at weight 1.0 and a distinct version label.
    #[must_use]
    pub fn agentic_canary_v2() -> Self {
        const PROMOTED: [&str; 4] = [
            "json_records_table",
            "numeric_range_lines",
            "repeated_chunk_dict",
            "tool_schema_semantic_table",
        ];

        let mut rules = Self::safe_canary_v1();
        for rule in PROMOTED.iter() {
            rules.enabled.insert((*rule).to_owned(), true);
            rules.weights.insert((*rule).to_owned(), 1.0);
        }
        rules.version = Some("agentic-canary-v2-quality-ready-codecs".to_owned());
        rules
    }

    /// Reports whether `rule` should run.
    ///
    /// A rule is off when its `enabled` flag is explicitly `false`, or when it
    /// has a weight strictly below [`ENABLE_WEIGHT_THRESHOLD`]. A rule with
    /// neither a flag nor a weight is on.
    #[must_use]
    pub fn is_enabled(&self, rule: &str) -> bool {
        if self.enabled.get(rule) == Some(&false) {
            return false;
        }
        let below_threshold = self
            .weights
            .get(rule)
            .map_or(false, |weight| *weight < ENABLE_WEIGHT_THRESHOLD);
        !below_threshold
    }

    /// Returns the weight recorded for `rule`, or 1.0 when there is none.
    #[must_use]
    pub fn weight(&self, rule: &str) -> f32 {
        match self.weights.get(rule) {
            Some(weight) => *weight,
            None => 1.0,
        }
    }

    /// Parses a rule set from TOML text, dropping every `enabled`/`weights`
    /// key that is not one of [`RULE_NAMES`].
    ///
    /// # Errors
    /// Returns the TOML deserialization error when `s` does not parse.
    pub fn from_toml_str(s: &str) -> Result<Self, toml::de::Error> {
        let is_known = |name: &String| RULE_NAMES.contains(&name.as_str());
        let mut parsed: Self = toml::from_str(s)?;
        parsed.enabled.retain(|name, _| is_known(name));
        parsed.weights.retain(|name, _| is_known(name));
        Ok(parsed)
    }

    /// Reads `path` and parses it with [`RuleSet::from_toml_str`].
    ///
    /// # Errors
    /// Returns the I/O error when the file cannot be read, or a
    /// "ruleset parse: ..." error when its contents are not a valid rule set.
    pub fn from_toml_file(path: &Path) -> anyhow::Result<Self> {
        let raw = std::fs::read_to_string(path)?;
        match Self::from_toml_str(&raw) {
            Ok(rules) => Ok(rules),
            Err(e) => Err(anyhow::anyhow!("ruleset parse: {e}")),
        }
    }

    /// Serializes this rule set as pretty-printed TOML.
    ///
    /// # Errors
    /// Returns the TOML serialization error, if any.
    pub fn to_toml_string(&self) -> Result<String, toml::ser::Error> {
        toml::to_string_pretty(self)
    }
}
/// Encodes `text` with the default rule set and returns only the encoded
/// string, discarding the trace.
#[must_use]
pub fn encode_symbolic(text: &str) -> String {
    let (encoded, _trace) = encode_symbolic_traced(text);
    encoded
}
/// Encodes `text` with [`RuleSet::default_v1`] and returns the encoded string
/// together with the per-rule trace.
#[must_use]
pub fn encode_symbolic_traced(text: &str) -> (String, EncoderTrace) {
    let defaults = RuleSet::default_v1();
    encode_symbolic_traced_with(text, &defaults)
}
#[must_use]
pub fn encode_symbolic_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
let mut trace = EncoderTrace::default();
let mut t: String = text.to_owned();
if rs.is_enabled("term_substitutions") {
let before = t.len() as u64;
let mut fired = false;
for (re, short) in SUB_RULES.iter() {
let n = u32::try_from(re.find_iter(&t).count()).unwrap_or(u32::MAX);
if n > 0 {
trace.term_substitutions = trace.term_substitutions.saturating_add(n);
t = re.replace_all(&t, *short).into_owned();
fired = true;
}
}
if fired {
trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] =
before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("if_prefix") {
let n = u32::try_from(IF_PREFIX.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.if_prefix = n;
if n > 0 {
let before = t.len() as u64;
t = IF_PREFIX.replace_all(&t, "").into_owned();
trace.bytes_saved[EncoderTrace::IDX_IF_PREFIX] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("success") {
let n = u32::try_from(SUCCESS.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.success = n;
if n > 0 {
let before = t.len() as u64;
t = SUCCESS.replace_all(&t, "\u{2713}").into_owned();
trace.bytes_saved[EncoderTrace::IDX_SUCCESS] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("failure") {
let n = u32::try_from(FAILURE.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.failure = n;
if n > 0 {
let before = t.len() as u64;
t = FAILURE.replace_all(&t, "\u{2717}").into_owned();
trace.bytes_saved[EncoderTrace::IDX_FAILURE] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("arrow") {
let n = u32::try_from(ARROW.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.arrow = n;
if n > 0 {
let before = t.len() as u64;
t = ARROW.replace_all(&t, "\u{2192}").into_owned();
trace.bytes_saved[EncoderTrace::IDX_ARROW] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("vs") {
let n = u32::try_from(VS.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.vs = n;
if n > 0 {
let before = t.len() as u64;
t = VS.replace_all(&t, "vs").into_owned();
trace.bytes_saved[EncoderTrace::IDX_VS] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("and") {
let n = u32::try_from(AND.find_iter(&t).count()).unwrap_or(u32::MAX);
trace.and = n;
if n > 0 {
let before = t.len() as u64;
t = AND.replace_all(&t, "+").into_owned();
trace.bytes_saved[EncoderTrace::IDX_AND] = before.saturating_sub(t.len() as u64);
}
}
if rs.is_enabled("filler_removed") {
let before = t.len() as u64;
let words_before = t.split_whitespace().count();
let kept: Vec<&str> = t.split_whitespace().filter(|w| !is_filler(w)).collect();
let removed = u32::try_from(words_before.saturating_sub(kept.len())).unwrap_or(u32::MAX);
trace.filler_removed = removed;
t = kept.join(" ");
if removed > 0 {
trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED] =
before.saturating_sub(t.len() as u64);
}
}
t = PUNCT_GAP.replace_all(&t, "$1 ").into_owned();
t = MULTI_WS.replace_all(&t, " ").into_owned();
(t.trim().to_owned(), trace)
}
/// Encodes one inline fragment with `rs` and folds the fragment's trace into
/// `trace_accum`. Returns the encoded fragment.
fn compress_inline(body: &str, rs: &RuleSet, trace_accum: &mut EncoderTrace) -> String {
    let (encoded, fragment_trace) = encode_symbolic_traced_with(body, rs);
    trace_accum.merge(fragment_trace);
    encoded
}
/// Classification of a single input line, as produced by `classify_line`.
enum LineKind {
/// Empty or whitespace-only line.
Blank,
/// Line that opens (or closes) a fenced code block.
Fence,
/// ATX heading. `prefix` is the indentation, the hashes and one space;
/// `body` is the heading text that follows.
Heading { prefix: String, body: String },
/// Bulleted or numbered list item. `prefix` is the indentation plus the
/// marker (including its trailing space); `body` is the item text.
ListItem { prefix: String, body: String },
/// Blockquote line; `body` is the text after the leading `> `. The line's
/// indentation is not recorded.
Blockquote { body: String },
/// Table row (starts with `|` and contains at least two `|`).
Table,
/// Anything else.
Prose,
}
/// Classifies one line of Markdown-like text.
///
/// Checks run in this order and the first match wins: blank, code fence,
/// ATX heading, bullet item, numbered item (1-3 digits then `. ` or `) `),
/// blockquote, table row; anything else is prose. A bullet or number marker
/// with nothing after it is not treated as a list item.
fn classify_line(line: &str) -> LineKind {
    let trimmed = line.trim_start();
    if trimmed.is_empty() {
        return LineKind::Blank;
    }
    // Leading whitespace, preserved in the prefix of headings and list items.
    let indent = &line[..line.len() - trimmed.len()];

    if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
        return LineKind::Fence;
    }

    let after_hashes = trimmed.trim_start_matches('#');
    let hash_count = trimmed.len() - after_hashes.len();
    if (1..=6).contains(&hash_count) && after_hashes.starts_with(' ') {
        return LineKind::Heading {
            prefix: format!("{}{} ", indent, "#".repeat(hash_count)),
            body: after_hashes.trim_start().to_owned(),
        };
    }

    let bullet = ["- ", "* ", "+ "].iter().find_map(|marker| {
        trimmed
            .strip_prefix(*marker)
            .filter(|item| !item.is_empty())
            .map(|item| (*marker, item))
    });
    if let Some((marker, item)) = bullet {
        return LineKind::ListItem {
            prefix: format!("{}{}", indent, marker),
            body: item.to_owned(),
        };
    }

    // ASCII digits are single bytes, so the count doubles as a byte offset.
    let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    if (1..=3).contains(&digit_count) {
        let (number, tail) = trimmed.split_at(digit_count);
        let numbered = [". ", ") "].iter().find_map(|sep| {
            tail.strip_prefix(*sep)
                .filter(|item| !item.is_empty())
                .map(|item| (*sep, item))
        });
        if let Some((sep, item)) = numbered {
            return LineKind::ListItem {
                prefix: format!("{}{}{}", indent, number, sep),
                body: item.to_owned(),
            };
        }
    }

    match trimmed.strip_prefix("> ") {
        Some(quoted) => LineKind::Blockquote {
            body: quoted.to_owned(),
        },
        None if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 => LineKind::Table,
        None => LineKind::Prose,
    }
}
/// Encodes Markdown-like `text` while keeping its line structure.
///
/// The input is processed line by line:
/// * fenced code (from an opening fence line up to and including the next
///   fence line) and indented code (an indented, non-blank line directly after
///   a blank line or at the very start, plus the indented/blank lines that
///   follow) are copied verbatim;
/// * table rows are copied verbatim;
/// * headings, list items and blockquotes keep their marker and have only
///   their text run through the prose rules in `rs`;
/// * consecutive prose lines are joined with a single space and encoded as one
///   paragraph, emitted on one line;
/// * blank lines are emitted as empty lines.
///
/// The output ends with a newline exactly when `text` does.
///
/// Returns the encoded text and the trace merged over every encoded fragment.
pub fn encode_symbolic_structural_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
    let mut out = String::with_capacity(text.len());
    let mut trace = EncoderTrace::default();
    let mut prose_buf = String::new();
    let mut in_fence = false;
    let mut in_indented_code = false;
    // Starts as `true` so that an indented first line is treated as code.
    let mut prev_line_blank = true;

    // Encodes and emits the pending prose paragraph, if there is one.
    let flush = |prose_buf: &mut String, out: &mut String, trace: &mut EncoderTrace| {
        if prose_buf.is_empty() {
            return;
        }
        let compressed = compress_inline(prose_buf, rs, trace);
        out.push_str(&compressed);
        out.push('\n');
        prose_buf.clear();
    };

    for line in text.split('\n') {
        if in_fence {
            // Inside a fence everything is verbatim, including the closing line.
            out.push_str(line);
            out.push('\n');
            let trimmed = line.trim_start();
            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
                in_fence = false;
            }
            prev_line_blank = false;
            continue;
        }

        let is_indented = line.starts_with(" ") || line.starts_with('\t');
        let is_blank = line.trim().is_empty();
        if in_indented_code {
            if is_indented || is_blank {
                out.push_str(line);
                out.push('\n');
                prev_line_blank = is_blank;
                continue;
            }
            // First non-indented, non-blank line ends the code block and is
            // classified normally below.
            in_indented_code = false;
        } else if prev_line_blank && is_indented && !is_blank {
            flush(&mut prose_buf, &mut out, &mut trace);
            in_indented_code = true;
            out.push_str(line);
            out.push('\n');
            prev_line_blank = false;
            continue;
        }

        match classify_line(line) {
            LineKind::Fence => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push_str(line);
                out.push('\n');
                in_fence = true;
            }
            LineKind::Blank => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push('\n');
            }
            LineKind::Heading { prefix, body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str(&prefix);
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::ListItem { prefix, body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str(&prefix);
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::Blockquote { body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str("> ");
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::Table => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push_str(line);
                out.push('\n');
            }
            LineKind::Prose => {
                if !prose_buf.is_empty() {
                    prose_buf.push(' ');
                }
                prose_buf.push_str(line);
            }
        }
        prev_line_blank = is_blank;
    }
    flush(&mut prose_buf, &mut out, &mut trace);

    // Every emitted line was terminated with '\n', but `split('\n')` yields one
    // more segment than the input has newlines (a trailing '\n' shows up as a
    // final empty segment). Drop the single surplus terminator unconditionally;
    // skipping this for input that ends in '\n' would turn "a\n" into "a\n\n".
    if out.ends_with('\n') {
        out.pop();
    }
    (out, trace)
}
/// Encoder that rewrites prose into the symbolic short form and keeps the
/// result only when it is cheaper in tokens than the original.
pub struct SymbolicEncoder {
/// Token counter used to check model support and to compare the original
/// against the encoded text.
measurer: Arc<dyn Measurer>,
}
impl SymbolicEncoder {
#[must_use]
pub fn new(measurer: Arc<dyn Measurer>) -> Self {
Self { measurer }
}
fn hash(text: &str) -> String {
let mut h = Hasher::new();
h.update(text.as_bytes());
h.finalize().to_hex().to_string()
}
fn count_or_zero(&self, text: &str, model: &Model) -> u32 {
self.measurer.tokenize(text, model).unwrap_or(0)
}
fn build(
&self,
original: &str,
compressed: &str,
format: Format,
model: Model,
fallback: Option<FallbackReason>,
) -> Compressed {
let baseline = self.count_or_zero(original, &model);
let encoded = self.count_or_zero(compressed, &model);
Compressed {
content: compressed.to_owned(),
format,
baseline_tokens: baseline,
compressed_tokens: encoded,
model,
content_hash: Self::hash(original),
fallback,
}
}
}
impl SymbolicEncoder {
#[must_use]
pub fn compress_traced(&self, input: &str, model: Model) -> (Compressed, EncoderTrace) {
self.compress_traced_with(input, model, &RuleSet::default_v1())
}
#[must_use]
pub fn compress_traced_with(
&self,
input: &str,
model: Model,
rs: &RuleSet,
) -> (Compressed, EncoderTrace) {
if !self.measurer.supported(&model) {
return (
self.build(
input,
input,
Format::Prose,
model,
Some(FallbackReason::TokenizerMissing),
),
EncoderTrace::default(),
);
}
let chars = input.chars().count();
if chars < MIN_INPUT_CHARS {
return (
self.build(
input,
input,
Format::Prose,
model,
Some(FallbackReason::Uncompressible),
),
EncoderTrace::default(),
);
}
if chars > MAX_INPUT_CHARS {
return (
self.build(
input,
input,
Format::Prose,
model,
Some(FallbackReason::OversizedInput),
),
EncoderTrace::default(),
);
}
let (encoded, trace) = if has_structural_markers(input) {
encode_symbolic_structural_traced_with(input, rs)
} else {
encode_symbolic_traced_with(input, rs)
};
let baseline = self.count_or_zero(input, &model);
let candidate = self.count_or_zero(&encoded, &model);
if candidate >= baseline {
return (
self.build(
input,
input,
Format::Prose,
model,
Some(FallbackReason::Uncompressible),
),
EncoderTrace::default(),
);
}
(
self.build(input, &encoded, Format::Symbolic, model, None),
trace,
)
}
}
impl Encoder for SymbolicEncoder {
fn compress(&self, input: &str, model: Model) -> Compressed {
self.compress_traced(input, model).0
}
fn select_format(&self, input: &str, model: Model) -> Format {
if !self.measurer.supported(&model) || input.chars().count() < MIN_INPUT_CHARS {
return Format::Prose;
}
let encoded = encode_symbolic(input);
if self.count_or_zero(&encoded, &model) >= self.count_or_zero(input, &model) {
Format::Prose
} else {
Format::Symbolic
}
}
fn fallback(&self, input: &str, model: Model, reason: FallbackReason) -> Compressed {
self.build(input, input, Format::Prose, model, Some(reason))
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tokenizers::LocalMeasurer;
fn enc() -> SymbolicEncoder {
let m = LocalMeasurer::with_defaults().expect("measurer");
SymbolicEncoder::new(Arc::new(m))
}
#[test]
fn structural_gate_detects_paragraph_breaks() {
assert!(has_structural_markers("foo\n\nbar"));
}
#[test]
fn structural_gate_detects_headings() {
assert!(has_structural_markers("# Title\ncontent follows"));
assert!(has_structural_markers("content\n## Subheading\nmore"));
}
#[test]
fn structural_gate_detects_lists() {
assert!(has_structural_markers("intro\n- item one\n- item two"));
assert!(has_structural_markers("intro\n1. first\n2. second"));
}
#[test]
fn structural_gate_detects_fenced_code() {
assert!(has_structural_markers("prose\n```\ncode\n```"));
assert!(has_structural_markers("```rust\nfn main() {}\n```"));
}
#[test]
fn structural_gate_detects_tables_and_blockquotes() {
assert!(has_structural_markers(
"col\n| a | b |\n|---|---|\n| 1 | 2 |"
));
assert!(has_structural_markers("context\n> quoted line\nafter"));
}
#[test]
fn structural_gate_lets_flat_prose_through() {
assert!(!has_structural_markers(
"one sentence. another sentence. a third. no line breaks here."
));
}
#[test]
fn markdown_input_preserves_structure_through_compression() {
let md = "# Heading\n\nFirst paragraph with enough body to clear the thirty-two-char floor.\n\n- list item one\n- list item two\n\nSecond paragraph follows here.";
let (out, _trace) = enc().compress_traced(md, Model::ClaudeSonnet47);
let content = &out.content;
assert!(
content.contains("\n\n"),
"expected paragraph break preserved, got: {content:?}"
);
assert!(
content.starts_with("# "),
"expected heading prefix preserved, got: {content:?}"
);
assert!(
content.contains("\n- "),
"expected list-item marker preserved, got: {content:?}"
);
let newlines = content.matches('\n').count();
assert!(
newlines >= 4,
"expected >=4 newlines (paragraph + 2 list + blank), got {newlines} in {content:?}"
);
}
#[test]
fn structural_encoder_preserves_fenced_code_verbatim() {
use crate::RuleSet;
let md =
"Intro paragraph.\n\n```rust\nfn main() {\n println!(\"x\");\n}\n```\n\nEpilogue.";
let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
assert!(
out.contains("```rust\nfn main() {\n println!(\"x\");\n}\n```"),
"fenced code must be preserved byte-for-byte, got: {out:?}"
);
}
#[test]
fn structural_encoder_preserves_four_space_indented_code() {
use crate::RuleSet;
let md = "intro paragraph.\n\n fn check(token: &Token) -> bool {\n token.expires_at <= Utc::now()\n }\n\nepilogue paragraph.";
let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
assert!(
out.contains(" fn check(token: &Token) -> bool {"),
"four-space indented code must be verbatim, got: {out:?}"
);
assert!(
out.contains(" token.expires_at <= Utc::now()"),
"indented-code continuation (8 spaces) must be verbatim, got: {out:?}"
);
assert!(
out.contains(" }"),
"closing brace line must be verbatim, got: {out:?}"
);
}
#[test]
fn structural_encoder_preserves_tab_indented_code() {
use crate::RuleSet;
let md = "intro.\n\n\tlet x = 1;\n\tlet y = 2;\n\nafter.";
let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
assert!(
out.contains("\tlet x = 1;"),
"tab-indented code must be verbatim, got: {out:?}"
);
}
#[test]
fn structural_encoder_compresses_paragraph_body() {
use crate::RuleSet;
let md = "Title line no header.\n\nThe authentication module sends a request to the policy engine and it returns a result.";
let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
assert!(out.contains("\n\n"), "paragraph break preserved");
assert!(
!out.contains("authentication module"),
"expected term_substitutions to rewrite 'authentication module', got: {out:?}"
);
}
#[test]
fn substitutes_authorization_term() {
let out = encode_symbolic(
"The user authentication module sends the request to the policy engine.",
);
assert!(out.contains("A.mod"), "expected A.mod in {out}");
assert!(out.contains("PE"), "expected PE in {out}");
}
#[test]
fn drops_filler_words() {
let out = encode_symbolic("The user is in the system.");
let lc = out.to_lowercase();
assert!(!lc.split_whitespace().any(|w| w == "the"));
assert!(!lc.split_whitespace().any(|w| w == "is"));
}
#[test]
fn drops_expanded_prepositions_and_intensifiers() {
let out = encode_symbolic(
"The request is just really very important for the handler to actually log during the call.",
);
let lc = out.to_lowercase();
let words: std::collections::HashSet<_> = lc.split_whitespace().collect();
for stripped in ["for", "during", "just", "really", "very", "actually"] {
assert!(
!words.contains(stripped),
"filler `{stripped}` must be stripped from: {out}",
);
}
for kept in ["important", "log", "call"] {
assert!(
words.iter().any(|w| w.contains(kept)),
"content word `{kept}` must survive: {out}",
);
}
}
#[test]
fn polarity_bearing_words_are_not_filler() {
for forbidden in [
"not", "never", "no", "nothing", "maybe", "perhaps", "likely", "possibly",
] {
assert!(
!crate::encoder::FILLER_WORDS.contains(&forbidden),
"polarity-bearing word `{forbidden}` must NOT be in FILLER_WORDS",
);
}
}
#[test]
fn arrow_replacement() {
let out = encode_symbolic("The handler invokes the policy engine.");
assert!(out.contains('\u{2192}'), "missing arrow in {out}");
}
#[test]
fn success_glyph_replacement_when_rule_is_explicitly_enabled() {
let mut rs = RuleSet::default_v1();
rs.enabled.insert("success".to_owned(), true);
rs.weights.insert("success".to_owned(), 1.0);
let (out, _) =
encode_symbolic_traced_with("If validation succeeds the request continues.", &rs);
assert!(out.contains('\u{2713}'), "missing check in {out}");
}
#[test]
fn failure_glyph_replacement_when_rule_is_explicitly_enabled() {
let mut rs = RuleSet::default_v1();
rs.enabled.insert("failure".to_owned(), true);
rs.weights.insert("failure".to_owned(), 1.0);
let (out, _) =
encode_symbolic_traced_with("If validation fails the request is rejected.", &rs);
assert!(out.contains('\u{2717}'), "missing cross in {out}");
}
#[test]
fn default_v1_disables_success_and_failure_glyphs() {
let rs = RuleSet::default_v1();
assert_eq!(
rs.enabled.get("success").copied(),
Some(false),
"success must be OFF by default",
);
assert_eq!(
rs.enabled.get("failure").copied(),
Some(false),
"failure must be OFF by default",
);
let (out, trace) = encode_symbolic_traced_with(
"If validation succeeds the call fails and the handler logs it.",
&rs,
);
assert!(!out.contains('\u{2713}'));
assert!(!out.contains('\u{2717}'));
assert_eq!(trace.success, 0);
assert_eq!(trace.failure, 0);
}
#[test]
fn longer_term_wins_over_shorter() {
let out = encode_symbolic("The authentication module handles login.");
assert!(out.contains("A.mod"));
assert!(!out.contains("A module"));
}
#[test]
fn idempotent_on_minimal_input() {
let out = encode_symbolic("hi");
assert_eq!(out, "hi");
}
#[test]
fn compress_returns_symbolic_when_net_positive() {
let inp = "The authentication module forwards the request to the policy engine \
for validation against the session store.";
let out = enc().compress(inp, Model::Gpt4);
assert_eq!(out.format, Format::Symbolic);
assert!(out.compressed_tokens < out.baseline_tokens, "{out:?}");
assert!(out.fallback.is_none());
}
#[test]
fn compress_falls_back_when_too_short() {
let out = enc().compress("hello world", Model::Gpt4);
assert_eq!(out.format, Format::Prose);
assert!(matches!(out.fallback, Some(FallbackReason::Uncompressible)));
}
#[test]
fn compress_falls_back_for_unregistered_model() {
let out = enc().compress(
"The authentication module forwards the request to the policy engine.",
Model::Gemini25Pro,
);
assert_eq!(out.format, Format::Prose);
assert!(matches!(
out.fallback,
Some(FallbackReason::TokenizerMissing)
));
}
#[test]
fn select_format_matches_compress_choice() {
let inp = "The authentication module forwards the request to the policy engine \
for validation against the session store.";
let f = enc().select_format(inp, Model::Gpt4);
let c = enc().compress(inp, Model::Gpt4);
assert_eq!(f, c.format);
}
#[test]
fn explicit_fallback_returns_prose() {
let out = enc().fallback(
"The authentication module forwards the request.",
Model::Gpt4,
FallbackReason::QualityDegraded,
);
assert_eq!(out.format, Format::Prose);
assert!(matches!(
out.fallback,
Some(FallbackReason::QualityDegraded)
));
}
#[test]
fn content_hash_is_blake3_of_original_not_compressed() {
let inp = "The authentication module forwards the request to the policy engine.";
let out = enc().compress(inp, Model::Gpt4);
let mut h = Hasher::new();
h.update(inp.as_bytes());
assert_eq!(out.content_hash, h.finalize().to_hex().to_string());
}
#[test]
fn trace_records_term_substitution_count() {
let (_, t) = encode_symbolic_traced(
"The authentication module forwards a request to the policy engine \
for validation against the session store.",
);
assert!(t.term_substitutions >= 3, "{t:?}");
}
#[test]
fn trace_records_filler_removal_count() {
let (_, t) = encode_symbolic_traced("The user is in the system and is using the database.");
assert!(t.filler_removed >= 4, "{t:?}");
}
#[test]
fn trace_no_fire_for_neutral_text() {
let (_, t) = encode_symbolic_traced("Lorem ipsum dolor sit amet consectetur");
assert_eq!(t.term_substitutions, 0);
assert_eq!(t.if_prefix, 0);
assert!(!t.any_fired() || t.filler_removed > 0);
}
/// When several rules fire on one input, each contributing rule must report a positive
/// per-rule byte saving, a rule that cannot fire must report zero, and the per-rule
/// savings together must never exceed the overall shrink of the text.
#[test]
fn step9_bytes_saved_populated_when_multiple_rules_fire() {
    let input = "The authentication module forwards a request to the policy \
                 engine for validation against the session store.";
    let (out, trace) = encode_symbolic_traced(input);

    let saved_by_terms = trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS];
    let saved_by_filler = trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED];
    let saved_by_ansi = trace.bytes_saved[EncoderTrace::IDX_ANSI_STRIPPED];

    assert!(
        saved_by_terms > 0,
        "term_substitutions should have saved bytes; trace={trace:?}"
    );
    assert!(
        saved_by_filler > 0,
        "filler_removed should have saved bytes; trace={trace:?}"
    );
    assert_eq!(saved_by_ansi, 0, "ansi_stripped cannot fire on plain prose");

    // Per-rule attribution is bounded above by the total input -> output shrink.
    let sum_deltas = trace.bytes_saved.iter().copied().sum::<u64>();
    let total_delta = (input.len() as u64).saturating_sub(out.len() as u64);
    assert!(sum_deltas > 0, "at least one rule contributed");
    assert!(
        sum_deltas <= total_delta,
        "sum of per-rule deltas ({sum_deltas}) must not exceed \
         total shrink ({total_delta}); input={} output={}",
        input.len(),
        out.len()
    );
}
/// A two-character input (well under `MIN_INPUT_CHARS`) must come back with a
/// default, all-zero trace.
#[test]
fn compress_traced_returns_empty_trace_on_short_input() {
    let too_short = "hi";
    let (_compressed, trace) = enc().compress_traced(too_short, Model::Gpt4);
    let untouched = EncoderTrace::default();
    assert_eq!(trace, untouched);
}
/// Compressing for `Model::Gemini25Pro` (the model this test treats as unsupported)
/// must come back with a default, all-zero trace.
#[test]
fn compress_traced_returns_empty_trace_on_unsupported_model() {
    let text = "The authentication module forwards the request.";
    let (_compressed, trace) = enc().compress_traced(text, Model::Gemini25Pro);
    let untouched = EncoderTrace::default();
    assert_eq!(trace, untouched);
}
/// An input far beyond `MAX_INPUT_CHARS` must fall back to prose with the
/// `OversizedInput` reason and a default, all-zero trace.
#[test]
fn compress_traced_returns_empty_trace_on_oversized_input() {
    // Four characters repeated MAX_INPUT_CHARS times: four times the limit.
    let oversized = "abc ".repeat(MAX_INPUT_CHARS);
    let (result, trace) = enc().compress_traced(&oversized, Model::Gpt4);

    assert_eq!(result.format, Format::Prose);
    assert!(matches!(
        result.fallback,
        Some(FallbackReason::OversizedInput)
    ));
    assert_eq!(trace, EncoderTrace::default());
}
/// `as_pairs` must already be ordered by rule name: stably re-sorting a copy by name
/// must leave it unchanged.
#[test]
fn trace_pairs_are_alphabetical() {
    let pairs = EncoderTrace::default().as_pairs();

    // Stable sort on the name component only; a no-op iff the names are already ordered.
    let mut sorted = pairs;
    sorted.sort_by(|a, b| a.0.cmp(b.0));

    assert_eq!(pairs, sorted);
}
/// Encoding through `RuleSet::default_v1()` must give exactly the same text and trace
/// as the rule-set-free `encode_symbolic_traced` entry point.
#[test]
fn ruleset_default_v1_matches_constants_only_encoder() {
    let rs = RuleSet::default_v1();
    let inputs = [
        "The authentication module forwards the request to the policy engine.",
        "If validation succeeds the request continues. The handler invokes the rate limiter.",
        "User is authorized via the bearer token; resource handler validates the operation.",
    ];

    for inp in inputs {
        let (legacy_text, legacy_trace) = encode_symbolic_traced(inp);
        let (ruled_text, ruled_trace) = encode_symbolic_traced_with(inp, &rs);
        assert_eq!(
            legacy_text, ruled_text,
            "default_v1 must match legacy on `{inp}`"
        );
        assert_eq!(legacy_trace, ruled_trace, "trace must match on `{inp}`");
    }
}
/// Switching the `success` rule off keeps the check-mark glyph (U+2713) out of the
/// output and leaves the rule's trace counter at zero.
#[test]
fn ruleset_disabled_rule_does_not_fire() {
    let mut rs = RuleSet::default_v1();
    rs.enabled.insert("success".to_owned(), false);

    let sentence = "If validation succeeds the request continues to the handler.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);

    let has_success_glyph = out.contains('\u{2713}');
    assert!(!has_success_glyph, "success glyph must not appear: {out}");
    assert_eq!(trace.success, 0, "success rule trace must be zero");
}
/// A very low weight (0.02) on `arrow` must behave as if the rule were disabled:
/// no arrow glyph (U+2192) in the output and a zero trace counter.
#[test]
fn ruleset_weight_below_threshold_is_treated_as_off() {
    let mut rs = RuleSet::default_v1();
    rs.weights.insert("arrow".to_owned(), 0.02);

    let sentence = "The handler invokes the policy engine to validate the request.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);

    let has_arrow_glyph = out.contains('\u{2192}');
    assert!(!has_arrow_glyph, "arrow glyph must not appear: {out}");
    assert_eq!(trace.arrow, 0);
}
/// A weight of 0.2 on `arrow` must still count as enabled: the arrow glyph (U+2192)
/// appears in the output and the trace counter is positive.
#[test]
fn ruleset_weight_above_threshold_but_below_legacy_half_is_on() {
    let mut rs = RuleSet::default_v1();
    rs.weights.insert("arrow".to_owned(), 0.2);

    let sentence = "The handler invokes the policy engine to validate the request.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);

    let has_arrow_glyph = out.contains('\u{2192}');
    assert!(
        has_arrow_glyph,
        "arrow glyph must be applied at weight 0.2 under revised threshold: {out}"
    );
    assert!(trace.arrow > 0);
}
/// Loading a rule set from TOML keeps known rule names and silently discards unknown
/// ones from both the `[enabled]` and `[weights]` tables.
#[test]
fn ruleset_unrecognised_keys_are_dropped_on_load() {
    // NOTE: the literal's lines are deliberately flush-left; it is a raw string.
    let toml = r"
[enabled]
success = false
made_up_rule = true
[weights]
arrow = 0.3
another_made_up = 0.7
";
    let loaded = RuleSet::from_toml_str(toml).expect("parse");

    // The recognised key survives with its configured value ...
    let success_flag = loaded.enabled.get("success").copied();
    assert_eq!(success_flag, Some(false));
    // ... while the invented keys are gone from both tables.
    assert!(!loaded.enabled.contains_key("made_up_rule"));
    assert!(!loaded.weights.contains_key("another_made_up"));
}
/// Serialising a customised rule set to TOML and parsing it back preserves the enabled
/// flag, the weight (within 1e-6) and the version tag.
#[test]
fn ruleset_round_trip_through_toml() {
    let mut original = RuleSet::default_v1();
    original.enabled.insert("success".to_owned(), false);
    original.weights.insert("arrow".to_owned(), 0.42);
    original.version = Some("test-r1".to_owned());

    let serialized = original.to_toml_string().expect("serialize");
    let reloaded = RuleSet::from_toml_str(&serialized).expect("parse");

    assert_eq!(reloaded.enabled.get("success").copied(), Some(false));
    // Floating-point weight: compare with a tolerance rather than exactly.
    let weight_drift = (reloaded.weight("arrow") - 0.42).abs();
    assert!(weight_drift < 1e-6);
    assert_eq!(reloaded.version.as_deref(), Some("test-r1"));
}
/// Under the safe-canary rule set the words "succeeds" and "fails" stay in the output,
/// the `success`/`failure` rules never fire, and the version tag identifies the
/// profile.
#[test]
fn safe_canary_preserves_success_failure_words() {
    let rs = RuleSet::safe_canary_v1();
    let sentence =
        "If validation succeeds the request continues. If validation fails the request is rejected.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);

    // Case-insensitive check; lower-case once and reuse for both words.
    let lowered = out.to_lowercase();
    assert!(
        lowered.contains("succeeds"),
        "success word should remain: {out}"
    );
    assert!(
        lowered.contains("fails"),
        "failure word should remain: {out}"
    );

    assert_eq!(trace.success, 0);
    assert_eq!(trace.failure, 0);

    let version_tag = rs.version.as_deref();
    assert_eq!(version_tag, Some("safe-canary-v1-no-success-failure"));
}
/// The agentic-canary-v2 rule set turns on the four tool codecs at weight 1.0, keeps
/// the `success`/`failure` glyph rules off, and carries its own version tag.
#[test]
fn agentic_canary_v2_enables_quality_ready_tool_codecs() {
    let rs = RuleSet::agentic_canary_v2();

    let tool_codecs = [
        "json_records_table",
        "numeric_range_lines",
        "repeated_chunk_dict",
        "tool_schema_semantic_table",
    ];
    for name in tool_codecs {
        assert!(rs.is_enabled(name), "{name} should be enabled");
        let distance_from_full = (rs.weight(name) - 1.0).abs();
        assert!(distance_from_full < f32::EPSILON);
    }

    assert!(!rs.is_enabled("success"), "success glyphs stay disabled");
    assert!(!rs.is_enabled("failure"), "failure glyphs stay disabled");

    let version_tag = rs.version.as_deref();
    assert_eq!(version_tag, Some("agentic-canary-v2-quality-ready-codecs"));
}
/// With `term_substitutions` disabled in the rule set, the abbreviations that rule
/// would produce ("A.mod", "PE") must not appear in the compressed content.
#[test]
fn compress_traced_with_respects_ruleset_toggle() {
    let mut rs = RuleSet::default_v1();
    rs.enabled.insert("term_substitutions".to_owned(), false);

    let inp = "The authentication module forwards the request to the policy engine \
               for validation against the session store.";
    let (out, _) = enc().compress_traced_with(inp, Model::Gpt4, &rs);

    // Checked in the same order as before: "A.mod" first, then "PE".
    for abbreviation in ["A.mod", "PE"] {
        assert!(!out.content.contains(abbreviation), "{:?}", out.content);
    }
}
/// Latency smoke test for `compress`.
///
/// Times 100 back-to-back compressions of a repeated multi-rule paragraph, prints the
/// p50/p95/p99 latencies (in microseconds) to stderr, and fails if p95 reaches the
/// 50 000 µs debug ceiling.
///
/// All three percentiles use the nearest-rank definition (`ceil(p/100 * n)`-th
/// smallest sample). Previously p50 read index 50 (the 51st sample) while p95/p99 read
/// indices 94/98, so the reported median was off by one relative to the other two.
#[test]
fn compress_meets_section_10() {
    use std::time::Instant;

    const RUNS: usize = 100;

    let e = enc();
    let inp = "The authentication module forwards the request to the policy engine \
               for validation against the session store and then the response \
               pipeline returns the result. "
        .repeat(20);

    let mut samples: Vec<u128> = (0..RUNS)
        .map(|_| {
            let t = Instant::now();
            let _ = e.compress(&inp, Model::Gpt4);
            t.elapsed().as_micros()
        })
        .collect();
    samples.sort_unstable();

    // Nearest-rank percentile: 1-based rank ceil(p * n / 100), converted to a 0-based
    // index. For n = 100 this gives indices 49, 94 and 98.
    let percentile = |p: usize| samples[(p * samples.len() + 99) / 100 - 1];
    let p50 = percentile(50);
    let p95 = percentile(95);
    let p99 = percentile(99);

    eprintln!(
        "compress {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
        inp.len()
    );
    assert!(p95 < 50_000, "p95 {p95}us breaches debug ceiling");
}
/// Every `EncoderTrace::IDX_*` constant must point at the slot of `as_pairs` that
/// carries the matching rule name.
#[test]
fn idx_constants_match_as_pairs_order() {
    let names = EncoderTrace::default().as_pairs().map(|(label, _)| label);

    // (slot constant, rule name expected in that slot), checked in this order.
    let expectations = [
        (EncoderTrace::IDX_AND, "and"),
        (EncoderTrace::IDX_ANSI_STRIPPED, "ansi_stripped"),
        (EncoderTrace::IDX_ARROW, "arrow"),
        (EncoderTrace::IDX_BLANK_LINES, "blank_lines"),
        (EncoderTrace::IDX_FAILURE, "failure"),
        (EncoderTrace::IDX_FILLER_REMOVED, "filler_removed"),
        (EncoderTrace::IDX_IF_PREFIX, "if_prefix"),
        (EncoderTrace::IDX_JSON_MINIFIED, "json_minified"),
        (EncoderTrace::IDX_JSON_RECORDS_TABLE, "json_records_table"),
        (EncoderTrace::IDX_NUMERIC_RANGE_LINES, "numeric_range_lines"),
        (EncoderTrace::IDX_REPEATED_CHUNK_DICT, "repeated_chunk_dict"),
        (EncoderTrace::IDX_REPEATED_LINES, "repeated_lines"),
        (EncoderTrace::IDX_SUCCESS, "success"),
        (EncoderTrace::IDX_TERM_SUBSTITUTIONS, "term_substitutions"),
        (
            EncoderTrace::IDX_TOOL_SCHEMA_SEMANTIC_TABLE,
            "tool_schema_semantic_table",
        ),
        (EncoderTrace::IDX_TRAILING_WS, "trailing_ws"),
        (EncoderTrace::IDX_VS, "vs"),
    ];

    for (slot, expected_name) in expectations {
        assert_eq!(names[slot], expected_name);
    }
}
/// `bytes_saved_pairs` must be parallel to `as_pairs`: same length, same rule name in
/// every slot, and each slot carrying the byte count stored in `bytes_saved`.
///
/// Slots are addressed through the `EncoderTrace::IDX_*` constants rather than bare
/// numbers, so the test keeps targeting the intended rules if the rule list changes.
#[test]
fn bytes_saved_pairs_parallel_to_as_pairs() {
    let mut t = EncoderTrace::default();
    t.bytes_saved[EncoderTrace::IDX_AND] = 7;
    t.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] = 42;

    let counts = t.as_pairs();
    let bytes = t.bytes_saved_pairs();
    assert_eq!(counts.len(), bytes.len());

    // Names must line up slot by slot.
    for (i, (count, saved)) in counts.iter().zip(bytes.iter()).enumerate() {
        assert_eq!(count.0, saved.0, "name at index {i} diverges");
    }

    // The values written above surface under the right names.
    assert_eq!(bytes[EncoderTrace::IDX_AND], ("and", 7));
    assert_eq!(
        bytes[EncoderTrace::IDX_TERM_SUBSTITUTIONS],
        ("term_substitutions", 42)
    );

    // A default trace reports zero bytes saved for every rule.
    let d = EncoderTrace::default();
    assert!(d.bytes_saved_pairs().iter().all(|(_, b)| *b == 0));
}
/// `merge` adds `bytes_saved` slot by slot and clamps at `u64::MAX` instead of
/// overflowing.
#[test]
fn bytes_saved_merge_is_saturating_sum() {
    // Two arbitrary slots: one for an ordinary sum, one driven to saturation.
    const SUMMED: usize = 5;
    const SATURATED: usize = 9;

    let mut acc = EncoderTrace::default();
    acc.bytes_saved[SUMMED] = 100;

    let mut first = EncoderTrace::default();
    first.bytes_saved[SUMMED] = 50;
    first.bytes_saved[SATURATED] = u64::MAX;
    acc.merge(first);

    assert_eq!(acc.bytes_saved[SUMMED], 150);
    assert_eq!(acc.bytes_saved[SATURATED], u64::MAX);

    // Adding one more to an already-maxed slot must stay at the maximum.
    let mut second = EncoderTrace::default();
    second.bytes_saved[SATURATED] = 1;
    acc.merge(second);

    assert_eq!(acc.bytes_saved[SATURATED], u64::MAX);
}
}