pithy-core 0.0.2

//! Symbolic encoder -- Rust port of `research/f2_selector_oracle.py::encode_symbolic`.
//!
//! Implements the `Encoder` trait. The selector currently returns
//! `Format::Symbolic` for every input that passes a minimum-length floor;
//! richer dispatch (JIT-progressive, fragment-prose, structured-delim) is
//! tracked separately as F1-followup. Anything below the floor falls
//! through as `Format::Prose`.
//!
//! The substitution table and filler-word set are kept in sync with the
//! Python reference. Whenever you change one, port the change to the
//! other so research benchmarks and production stay consistent.
//!
//! Performance contract (DoD §10): `compress` must run in <5ms p95.
//! Latency budget is dominated by `Measurer::tokenize` (the regex pass
//! itself is ~50us on a 4kB input -- see `tests::compress_meets_section_10`).

use std::collections::BTreeMap;
use std::path::Path;
use std::sync::Arc;

use blake3::Hasher;
use once_cell::sync::Lazy;
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Serialize};

use crate::interfaces::{Compressed, Encoder, FallbackReason, Format, Measurer, Model};

/// Stable rule names emitted by `EncoderTrace::as_pairs`. Used as
/// the canonical keys for `RuleSet::enabled` / `weights` so the
/// closed-loop tuner attributes savings to the same identifier the
/// encoder fires under.
///
/// The list is alphabetical and has the same order as
/// `EncoderTrace::as_pairs` and the `EncoderTrace::IDX_*` constants.
/// When adding a rule, update all three (plus the `17`-element array
/// types on `EncoderTrace`) together.
pub const RULE_NAMES: &[&str] = &[
    "and",
    "ansi_stripped",
    "arrow",
    "blank_lines",
    "failure",
    "filler_removed",
    "if_prefix",
    "json_minified",
    "json_records_table",
    "numeric_range_lines",
    "repeated_chunk_dict",
    "repeated_lines",
    "success",
    "term_substitutions",
    "tool_schema_semantic_table",
    "trailing_ws",
    "vs",
];

/// Minimum input length (chars) below which the encoder bypasses to
/// `Format::Prose`.
///
/// Matches the v0.1 selector behaviour: tiny inputs cost more in dialect
/// pre/post-amble than they save in body compression.
///
/// NOTE(review): the comparison site is not in this part of the file —
/// confirm whether it counts `char`s or UTF-8 bytes.
pub const MIN_INPUT_CHARS: usize = 32;

/// Maximum input length (chars) the encoder will process. Inputs above
/// this fall through to Prose with `OversizedInput` so a pathological
/// caller cannot pin a worker on regex work.
///
/// 256 KiB chosen as a defensive ceiling: real LLM context windows top
/// out around 1M tokens (~4 MB UTF-8); anything passed to a single
/// encoder call above 256K is almost certainly a bug or attack.
///
/// NOTE(review): the name says "chars" while the rationale is phrased
/// in KiB (bytes). The enforcement site is not visible here — confirm
/// which unit is actually compared and align the wording.
pub const MAX_INPUT_CHARS: usize = 256 * 1024;

/// (long-form, abbreviation) substitution pairs.
///
/// Declaration order here is for readability only: `SUB_RULES` sorts a
/// copy of this table longest-long-form-first before compiling it, so
/// greedy matching binds the longest term first.
///
/// Several long forms intentionally share one abbreviation (for example
/// `authentication`/`authenticate` -> `A`, `rate limiting`/`rate limiter`/
/// `rate limit` -> `RL`), so the mapping is lossy and cannot be inverted
/// mechanically.
const TERM_SUBSTITUTIONS: &[(&str, &str)] = &[
    ("post-tool authorization check", "PTA"),
    ("post-tool authorization", "PTA"),
    ("policy engine", "PE"),
    ("session store", "SS"),
    ("failure store", "FS"),
    ("response pipeline", "RP"),
    ("rate limiting", "RL"),
    ("rate limiter", "RL"),
    ("rate limit", "RL"),
    ("authentication module", "A.mod"),
    ("authorization module", "Z.mod"),
    ("authentication service", "A.svc"),
    ("authorization service", "Z.svc"),
    ("authentication", "A"),
    ("authorization", "Z"),
    ("authenticate", "A"),
    ("authorize", "Z"),
    ("authenticated", "A'd"),
    ("authorized", "Z'd"),
    ("handler", "H"),
    ("request", "R"),
    ("response", "Rp"),
    ("permissions", "P"),
    ("permission", "P"),
    ("telemetry", "T"),
    ("validate", "V"),
    ("validates", "V"),
    ("validated", "V'd"),
    ("validation", "V"),
    ("database", "DB"),
    ("JSON", "J"),
    ("bearer token", "BT"),
    ("principal", "Pr"),
    ("resource", "Rs"),
    ("operation", "Op"),
    // 2026-04-24 multi-word expansion. All 26 pairs below are 1-token
    // wins verified by `scripts/tokenize_subs.py` with leading-space
    // tiktoken on `cl100k_base` (the Claude-compatible BPE used as
    // our tokenizer proxy in `LocalMeasurer`). Multi-word originals
    // are provably multi-token, so the saturation problem that killed
    // the single-word expansion in #12 does not apply here.
    //
    // Collision-checked against all prior short forms — none overlap.
    // Longest-first ordering happens automatically in the `SUB_RULES`
    // sort, so new pairs can be appended in any order.
    ("configuration file", "Cf"),
    ("environment variable", "Env"),
    ("integration test", "IT"),
    ("regular expression", "RE"),
    ("working directory", "WD"),
    ("breaking change", "BC"),
    ("circuit breaker", "CiB"),
    ("pattern matching", "PM"),
    ("race condition", "RC"),
    ("type checking", "Typ"),
    ("command line", "CL"),
    ("content block", "CB"),
    ("error message", "EM"),
    ("feature flag", "FF"),
    ("function call", "FC"),
    ("kill switch", "KS"),
    ("merge request", "MR"),
    ("pull request", "PR"),
    ("stack trace", "ST"),
    ("system prompt", "SP"),
    ("tool result", "TR"),
    ("user prompt", "UP"),
    ("code review", "CR"),
    ("tool call", "TC"),
    ("tool use", "TU"),
    ("unit test", "UT"),
];

/// Filler words that can be safely stripped from prose without
/// changing polarity or introducing ambiguity.
///
/// Every entry is lowercase ASCII; `is_filler` compares
/// case-insensitively after trimming edge punctuation, so keep new
/// entries lowercase.
///
/// Three groups (kept in a single flat list that `is_filler` scans):
///
/// 1. Original (v1) closed-class words: determiners, aux verbs,
///    demonstratives, relative pronouns, plus the basic prepositions
///    and connectives listed below (`of`, `to`, `in`, `then`, `also`,
///    ...). They never carry polarity and expand via the tokenizer
///    anyway.
///
/// 2. Pure prepositions added 2026-04-24: every token in this group
///    lacks polarity (no "not"/"no"/"without"-style contrast) and
///    occurs with very high frequency in agent prompts. Expected
///    marginal savings from ablation analysis of `filler_removed`'s
///    97% savings share.
///
/// 3. Hedges / intensifiers added 2026-04-24: modify degree, never
///    polarity. Dropping "very important" to "important" loses a
///    shade of nuance but preserves the core judgement — safe for
///    compression, and these are some of the most-seen words in
///    Claude's own CoT. Explicitly NOT including polarity-bearing
///    hedges like "maybe", "perhaps", "likely", "possibly" which
///    flip assertion to tentative.
const FILLER_WORDS: &[&str] = &[
    // Group 1 — v1 baseline.
    "the", "a", "an", "of", "to", "in", "on", "at", "by", "with", "from", "is", "are", "was",
    "were", "be", "been", "being", "that", "this", "these", "those", "it", "its", "as", "then",
    "which", "who", "whom", "whose", "each", "any", "some", "all", "also", "such", "into", "onto",
    // Group 2 — prepositions (2026-04-24 ablation expansion).
    "for", "about", "around", "over", "through", "during", "within", "per", "via",
    // Group 3 — degree-only intensifiers (2026-04-24 ablation expansion).
    "just", "only", "very", "quite", "really", "actually", "simply",
];

/// Compiled form of `TERM_SUBSTITUTIONS`: one case-insensitive,
/// word-bounded regex per long form, paired with its abbreviation and
/// ordered longest long form first.
static SUB_RULES: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(|| {
    let mut ordered = TERM_SUBSTITUTIONS.to_vec();
    // Stable sort, descending by long-form byte length; pairs of equal
    // length keep their declaration order.
    ordered.sort_by(|lhs, rhs| rhs.0.len().cmp(&lhs.0.len()));

    let mut rules = Vec::with_capacity(ordered.len());
    for (long_form, abbreviation) in ordered {
        // Escape the term so characters such as `-` are matched literally.
        let pattern = format!(r"\b{}\b", regex::escape(long_form));
        let compiled = RegexBuilder::new(&pattern)
            .case_insensitive(true)
            .build()
            .expect("static substitution pattern");
        rules.push((compiled, abbreviation));
    }
    rules
});

/// Matches the whole word `if` (case-insensitive) together with the
/// whitespace run that follows it. `encode_symbolic_traced_with`
/// deletes every match.
static IF_PREFIX: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\bif\b\s+")
        .case_insensitive(true)
        .build()
        .expect("if-prefix")
});

/// Matches success vocabulary as whole words, case-insensitively:
/// `succeed(s)`, `ok`, `success`, `grant(s) access`, `grant(s)`.
/// `grants? access` is listed before `grants?` so the two-word form is
/// tried first. Replaced by U+2713 in `encode_symbolic_traced_with`.
static SUCCESS: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(succeeds?|ok|success|grants? access|grants?)\b")
        .case_insensitive(true)
        .build()
        .expect("success")
});

/// Matches failure vocabulary as whole words, case-insensitively:
/// `fail(s)`, `failure`, `failed`. Replaced by U+2717 in
/// `encode_symbolic_traced_with`.
static FAILURE: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(fails?|failure|failed)\b")
        .case_insensitive(true)
        .build()
        .expect("failure")
});

/// Matches data-flow verbs as whole words, case-insensitively:
/// `return(s)`, `forward(ed) to`, `forward(s) to`, `send(s) to`,
/// `invoke(s)`, `invoked`. Replaced by U+2192 in
/// `encode_symbolic_traced_with`.
static ARROW: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(returns?|forwarded? to|forwards? to|sends? to|invokes?|invoked)\b")
        .case_insensitive(true)
        .build()
        .expect("arrow")
});

/// Matches comparison words as whole words, case-insensitively:
/// `against`, `versus`, `vs`, `vs.`. Replaced by the literal `vs` in
/// `encode_symbolic_traced_with`.
///
/// Because of the trailing `\b`, the optional `\.` is only consumed
/// when a word character follows the dot directly; in `vs. x` the
/// match is just `vs` and the dot stays in the text.
static VS: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(against|versus|vs\.?)\b")
        .case_insensitive(true)
        .build()
        .expect("vs")
});

/// Matches the whole words `and` / `plus`, case-insensitively.
/// Replaced by `+` in `encode_symbolic_traced_with`.
static AND: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(and|plus)\b")
        .case_insensitive(true)
        .build()
        .expect("and")
});

/// Matches a whitespace run that precedes one of `.`, `,`, `;`, `:`,
/// U+2192, U+2713 or U+2717, plus any whitespace after that character.
/// `encode_symbolic_traced_with` rewrites each match to the captured
/// character followed by a single space.
static PUNCT_GAP: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\s+([.,;:\u{2192}\u{2713}\u{2717}])\s*").expect("punct-gap"));

/// Matches any whitespace run (newlines included). Used by
/// `encode_symbolic_traced_with` to collapse each run to one space.
static MULTI_WS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").expect("multi-ws"));

/// Markdown / structured-content detector. Returns `true` when the
/// input carries semantics the symbolic pipeline cannot preserve:
/// paragraph breaks, fenced code, headings, list items, blockquotes,
/// or tables. Exists to protect `compress_traced_with` from the
/// unconditional `MULTI_WS` whitespace flattening that would
/// otherwise destroy those structural markers (B8).
///
/// Conservative by design: any one signal trips the gate. False
/// positives cost savings; false negatives cost structure — and the
/// latter is the contract violation the project is unwilling to
/// accept. See CONTRACT.md §V16.
///
/// Signals, each sufficient on its own:
/// - a newline followed by an empty or whitespace-only line (this
///   covers `"\n\n"`, `"\n  \n"` and CRLF `"\r\n\r\n"`);
/// - a line whose first non-whitespace characters are a code fence
///   (three backticks or `~~~`), the same test `classify_line` applies;
/// - an ATX heading (`#` through `######` followed by a space);
/// - an unordered list item (`- `, `* `, `+ ` with a non-empty body);
/// - an ordered list item (one to three ASCII digits then `. ` or `) `);
/// - a blockquote (`> `);
/// - a table row (starts with `|` and contains at least two pipes).
pub(crate) fn has_structural_markers(s: &str) -> bool {
    // Paragraph break. Walk newline to newline and look at what follows
    // each one: if only non-newline whitespace separates it from the next
    // newline, the line in between is blank.
    let mut rest = s;
    while let Some(newline_at) = rest.find('\n') {
        rest = &rest[newline_at + 1..];
        let after_blank = rest.trim_start_matches(|c: char| c != '\n' && c.is_whitespace());
        if after_blank.starts_with('\n') {
            return true;
        }
    }

    for line in s.lines() {
        let trimmed = line.trim_start();

        // Fenced code block, indented or not.
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            return true;
        }

        // ATX heading: `# ` through `###### `.
        if trimmed.starts_with('#') {
            let rest = trimmed.trim_start_matches('#');
            let hashes = trimmed.len() - rest.len();
            if (1..=6).contains(&hashes) && rest.starts_with(' ') {
                return true;
            }
        }

        // Unordered list item with a non-empty body.
        let bullet_body = ["- ", "* ", "+ "]
            .iter()
            .find_map(|marker| trimmed.strip_prefix(*marker));
        if matches!(bullet_body, Some(body) if !body.is_empty()) {
            return true;
        }

        // Ordered list item: `digits. ` or `digits) `. ASCII digits are one
        // byte each, so the digit count doubles as a byte offset.
        let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count();
        if (1..=3).contains(&digit_count) {
            let rest = &trimmed[digit_count..];
            if rest.starts_with(". ") || rest.starts_with(") ") {
                return true;
            }
        }

        // Blockquote.
        if trimmed.starts_with("> ") {
            return true;
        }

        // Table row: pipe-delimited, at least two pipes.
        if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 {
            return true;
        }
    }
    false
}

/// Trim any run of `.`, `,`, `;` or `:` from both ends of `word`.
/// Punctuation inside the word is left alone.
fn strip_punct(word: &str) -> &str {
    const EDGE_PUNCT: [char; 4] = ['.', ',', ';', ':'];
    word.trim_matches(&EDGE_PUNCT[..])
}

/// True when `word`, with edge punctuation trimmed, equals one of
/// `FILLER_WORDS` ignoring ASCII case.
fn is_filler(word: &str) -> bool {
    let core = strip_punct(word);
    // Every `FILLER_WORDS` entry is lowercase ASCII, so an ASCII
    // case-insensitive comparison is the same test as lowercasing `core`.
    FILLER_WORDS
        .iter()
        .any(|filler| filler.eq_ignore_ascii_case(core))
}

/// Per-rule fire counts emitted alongside an encode pass.
///
/// Each field counts how many *match instances* the rule produced on
/// this input. A non-firing rule reports 0; rules with no semantic
/// effect on a given input are useful exactly because they cost
/// nothing to evaluate. Use `EncoderTrace::any_fired` to gate
/// telemetry on whether the encoder did real work.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EncoderTrace {
    /// Total term-substitution match count across the whole
    /// `TERM_SUBSTITUTIONS` table.
    pub term_substitutions: u32,
    /// Number of `if ` prefix removals.
    pub if_prefix: u32,
    /// Success-glyph (`\u{2713}`) substitutions.
    pub success: u32,
    /// Failure-glyph (`\u{2717}`) substitutions.
    pub failure: u32,
    /// Arrow-glyph (`\u{2192}`) substitutions.
    pub arrow: u32,
    /// `vs` substitutions.
    pub vs: u32,
    /// `+` substitutions for `and`/`plus`.
    pub and: u32,
    /// Number of filler words removed.
    pub filler_removed: u32,
    /// Number of ANSI-escape sequences stripped from `tool_result`
    /// content blocks. Applied shim-side before the encoder runs to
    /// claw back tokens burned on terminal-color bytes the LLM never
    /// needs to see. Lossless for model consumption.
    pub ansi_stripped: u32,
    /// Number of lines where trailing `[ \t]+` was removed from
    /// `tool_result` content. Pure-waste whitespace that tokenizes
    /// into real input-billed tokens; LLMs ignore trailing spaces
    /// semantically. Lossless for model consumption.
    pub trailing_ws: u32,
    /// Number of excess blank lines collapsed in `tool_result`
    /// content. Runs of 3+ consecutive `\n` are squeezed to exactly
    /// two (one blank line preserved as a paragraph break). Preserves
    /// semantic section separation while dropping layout-only padding.
    pub blank_lines: u32,
    /// Number of JSON string payloads minified inside `tool_result`
    /// content. Lossless for valid JSON: only insignificant whitespace
    /// is removed.
    pub json_minified: u32,
    /// Number of homogeneous JSON record arrays rendered as compact
    /// TOON-like tables inside `tool_result` content. This preserves
    /// field names and scalar values but is no longer JSON syntax, so
    /// it is canary-only by default.
    pub json_records_table: u32,
    /// Number of numeric line ranges compacted in `tool_result`
    /// content. This is deterministic for lines that differ only by
    /// an incrementing number.
    pub numeric_range_lines: u32,
    /// Number of repeated chunk dictionaries emitted in `tool_result`
    /// content. Repeated log lines/sentences are represented once plus
    /// an explicit sequence, preserving reconstruction information.
    pub repeated_chunk_dict: u32,
    /// Number of consecutive repeated-line runs collapsed in
    /// `tool_result` content. Repetition count is preserved as `xN`,
    /// so log volume is compacted without hiding that repetition
    /// occurred.
    pub repeated_lines: u32,
    /// Number of simple tool-schema payloads rendered as compact
    /// semantic tables. Preserves tool name, description, required
    /// parameters, and property schemas for the supported simple schema
    /// subset; canary-only by default.
    pub tool_schema_semantic_table: u32,
    /// Per-rule byte-savings attribution, indexed identically to the
    /// pairs returned by `as_pairs` (alphabetical by rule name). Use
    /// the `IDX_*` constants to address an entry.
    ///
    /// PLAN_2026-04-24.md Step 9 substrate. Each entry is the UTF-8
    /// byte count a rule removed from the payload on THIS encoder pass
    /// — pre-rule body length minus post-rule body length, floored at
    /// zero. A rule that did not fire contributes 0, so zero-valued
    /// entries are signal, not noise.
    ///
    /// Default is an all-zero array (matches the default-non-firing
    /// trace). `encode_symbolic_traced_with` records the entries for
    /// the inline prose rules it applies. Entries for the `tool_result`
    /// cleanup rules are not written by that function (presumably the
    /// shim-side pipeline fills them — confirm against the caller), so
    /// for those a zero may mean "did not fire" or "not measured here".
    pub bytes_saved: [u64; 17],
}

impl EncoderTrace {
    /// Positional index of each rule in `bytes_saved` / `as_pairs`.
    /// Alphabetically ordered, so adding a new rule between existing
    /// ones shifts subsequent indices — readers MUST use these
    /// constants instead of hardcoded integers. Step-9 scaffold.
    pub const IDX_AND: usize = 0;
    pub const IDX_ANSI_STRIPPED: usize = 1;
    pub const IDX_ARROW: usize = 2;
    pub const IDX_BLANK_LINES: usize = 3;
    pub const IDX_FAILURE: usize = 4;
    pub const IDX_FILLER_REMOVED: usize = 5;
    pub const IDX_IF_PREFIX: usize = 6;
    pub const IDX_JSON_MINIFIED: usize = 7;
    pub const IDX_JSON_RECORDS_TABLE: usize = 8;
    pub const IDX_NUMERIC_RANGE_LINES: usize = 9;
    pub const IDX_REPEATED_CHUNK_DICT: usize = 10;
    pub const IDX_REPEATED_LINES: usize = 11;
    pub const IDX_SUCCESS: usize = 12;
    pub const IDX_TERM_SUBSTITUTIONS: usize = 13;
    pub const IDX_TOOL_SCHEMA_SEMANTIC_TABLE: usize = 14;
    pub const IDX_TRAILING_WS: usize = 15;
    pub const IDX_VS: usize = 16;

    /// True if any rule produced at least one substitution or removal.
    ///
    /// Checks each counter individually rather than summing them: the
    /// counters saturate at `u32::MAX` (see `merge`), so a sum could
    /// overflow — a panic in debug builds, and in release a wrap that
    /// can land on 0 and report "nothing fired".
    #[must_use]
    pub fn any_fired(&self) -> bool {
        self.as_pairs().iter().any(|&(_, count)| count > 0)
    }

    /// Render byte-savings attribution as a stable, sorted list of
    /// `(rule_name, bytes_saved)` pairs. Positional order matches
    /// `as_pairs`; non-firing rules contribute 0. Added with Step 9.
    ///
    /// Pairs with `as_pairs` for post-hoc analysis: counts tell you
    /// HOW OFTEN each rule fired, bytes_saved tells you HOW MANY BYTES
    /// each rule removed. Combined they let the bandit attribute
    /// savings to specific rules without re-running the encoder.
    #[must_use]
    pub fn bytes_saved_pairs(&self) -> [(&'static str, u64); 17] {
        // Borrow the rule names (and their order) from `as_pairs`, then
        // overwrite the placeholder with the recorded byte count.
        let mut out = self.as_pairs().map(|(name, _)| (name, 0u64));
        for (slot, saved) in out.iter_mut().zip(self.bytes_saved.iter()) {
            slot.1 = *saved;
        }
        out
    }

    /// Render as a stable, sorted list of `(rule_name, count)` pairs.
    /// Used by `pithy-cli analyze` and the audit ledger so a
    /// future rule rename does not break replay.
    #[must_use]
    pub fn as_pairs(&self) -> [(&'static str, u32); 17] {
        [
            ("and", self.and),
            ("ansi_stripped", self.ansi_stripped),
            ("arrow", self.arrow),
            ("blank_lines", self.blank_lines),
            ("failure", self.failure),
            ("filler_removed", self.filler_removed),
            ("if_prefix", self.if_prefix),
            ("json_minified", self.json_minified),
            ("json_records_table", self.json_records_table),
            ("numeric_range_lines", self.numeric_range_lines),
            ("repeated_chunk_dict", self.repeated_chunk_dict),
            ("repeated_lines", self.repeated_lines),
            ("success", self.success),
            ("term_substitutions", self.term_substitutions),
            (
                "tool_schema_semantic_table",
                self.tool_schema_semantic_table,
            ),
            ("trailing_ws", self.trailing_ws),
            ("vs", self.vs),
        ]
    }

    /// Sum-merge fire counts and byte savings from `other` into `self`.
    /// Every addition saturates, so merging never overflows. Used by the
    /// structural encoder to aggregate traces across compressed
    /// paragraph segments.
    pub fn merge(&mut self, other: EncoderTrace) {
        self.and = self.and.saturating_add(other.and);
        self.ansi_stripped = self.ansi_stripped.saturating_add(other.ansi_stripped);
        self.arrow = self.arrow.saturating_add(other.arrow);
        self.blank_lines = self.blank_lines.saturating_add(other.blank_lines);
        self.failure = self.failure.saturating_add(other.failure);
        self.filler_removed = self.filler_removed.saturating_add(other.filler_removed);
        self.if_prefix = self.if_prefix.saturating_add(other.if_prefix);
        self.json_minified = self.json_minified.saturating_add(other.json_minified);
        self.json_records_table = self
            .json_records_table
            .saturating_add(other.json_records_table);
        self.numeric_range_lines = self
            .numeric_range_lines
            .saturating_add(other.numeric_range_lines);
        self.repeated_chunk_dict = self
            .repeated_chunk_dict
            .saturating_add(other.repeated_chunk_dict);
        self.repeated_lines = self.repeated_lines.saturating_add(other.repeated_lines);
        self.success = self.success.saturating_add(other.success);
        self.term_substitutions = self
            .term_substitutions
            .saturating_add(other.term_substitutions);
        self.tool_schema_semantic_table = self
            .tool_schema_semantic_table
            .saturating_add(other.tool_schema_semantic_table);
        self.trailing_ws = self.trailing_ws.saturating_add(other.trailing_ws);
        self.vs = self.vs.saturating_add(other.vs);
        // Byte savings share the positional layout, so sum slot by slot.
        for (mine, theirs) in self.bytes_saved.iter_mut().zip(other.bytes_saved.iter()) {
            *mine = mine.saturating_add(*theirs);
        }
    }
}

/// Soft-gate threshold below which a rule's sampled weight flips it
/// off. Originally 0.5 (midpoint of the Beta prior), revised to 0.05
/// on 2026-04-24 after a corpus-distribution simulation over 126
/// audit events showed 0 of 15 observed rules could cross 0.5 under
/// any attribution scheme — the bounded reward signal (p90=0.15,
/// median=0.01 on this corpus) makes a 0.5 threshold semantically
/// "rule must save 50% of bytes by itself to stay enabled", which no
/// individual rule can achieve. 0.05 discriminates productive rules
/// (mean ≥ 0.09) from genuinely-dead rules (Beta converges to ≈ 0).
/// Must stay strictly greater than
/// `pithy_controller::bandit::ZERO_FIRE_MAX_WEIGHT` so the zero-fire
/// clamp still disables rules that have never observed evidence.
///
/// `RuleSet::is_enabled` compares with a strict `<`, so a weight of
/// exactly this value leaves the rule enabled.
pub const ENABLE_WEIGHT_THRESHOLD: f32 = 0.05;

/// Hot-reloadable rule configuration. Carries an enable flag and a
/// soft weight in `[0.0, 1.0]` for each rule named in `RULE_NAMES`.
/// `weights` is stored for the closed-loop bandit / experiment
/// generator; the encoder itself treats
/// `weight < ENABLE_WEIGHT_THRESHOLD` as "off" and otherwise consults
/// `enabled`.
///
/// Missing keys default to enabled / weight 1.0, so a sparse TOML
/// file can express only the deltas from an all-enabled ruleset.
///
/// The derived `Default` is the empty ruleset (no overrides, no
/// version), under which `is_enabled` reports every rule as on. That
/// is NOT the same as `RuleSet::default_v1`, which switches several
/// rules off explicitly.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuleSet {
    /// Per-rule on/off override. Keys come from `RULE_NAMES`.
    /// Anything not listed is enabled.
    #[serde(default)]
    pub enabled: BTreeMap<String, bool>,
    /// Per-rule weight in `[0.0, 1.0]`. Used by the bandit. The
    /// encoder also reads it in `is_enabled`: a weight below
    /// `ENABLE_WEIGHT_THRESHOLD` turns the rule off even when
    /// `enabled` says `true` or omits the key.
    #[serde(default)]
    pub weights: BTreeMap<String, f32>,
    /// Optional version tag, surfaced by the dashboard / statusline
    /// so an operator can tell at a glance which ruleset is live.
    #[serde(default)]
    pub version: Option<String>,
}

impl RuleSet {
    /// The v1 default ruleset: every name in `RULE_NAMES` gets an
    /// explicit entry. Stable inline rules and lossless tool-output
    /// cleanup are on (weight 1.0); the canary-only semantic rewrites
    /// and the two glyph rules are off (weight 0.0).
    ///
    /// `success` and `failure` have been off by default since
    /// 2026-04-24. A per-rule ablation study (commit d431b90,
    /// `scripts/ablate_rules.py`) found they fired 11 and 28 times on
    /// the real audit corpus yet "saved" -13 and -42 tokens: the
    /// unicode glyphs (✓, ✗) tokenize to *more* tokens than the ASCII
    /// words on the Claude tokenizers, so shipping them enabled hurt
    /// compression. Consumers that want the legacy behaviour can opt
    /// in explicitly.
    #[must_use]
    pub fn default_v1() -> Self {
        // Rules that ship disabled in v1.
        const OFF_BY_DEFAULT: [&str; 6] = [
            "json_records_table",
            "numeric_range_lines",
            "repeated_chunk_dict",
            "tool_schema_semantic_table",
            "success",
            "failure",
        ];

        let mut ruleset = Self {
            enabled: BTreeMap::new(),
            weights: BTreeMap::new(),
            version: Some("v1".to_owned()),
        };
        for &name in RULE_NAMES {
            let on = !OFF_BY_DEFAULT.contains(&name);
            let weight = if on { 1.0 } else { 0.0 };
            ruleset.enabled.insert(name.to_owned(), on);
            ruleset.weights.insert(name.to_owned(), weight);
        }
        ruleset
    }

    /// Lower-risk canary ruleset from the autoresearch loop: the
    /// success/failure glyph substitutions are forced off while
    /// lossless tool-output cleanup stays on. Keeping the English
    /// polarity words reduces semantic risk on prompts where
    /// `success`/`failure` are domain terms.
    ///
    /// `default_v1` already disables both rules, so in practice this
    /// differs from it only by the version tag.
    #[must_use]
    pub fn safe_canary_v1() -> Self {
        let mut ruleset = Self::default_v1();
        for glyph_rule in ["success", "failure"] {
            ruleset.enabled.insert(glyph_rule.to_owned(), false);
            ruleset.weights.insert(glyph_rule.to_owned(), 0.0);
        }
        ruleset.version = Some("safe-canary-v1-no-success-failure".to_owned());
        ruleset
    }

    /// Agentic canary ruleset from the quality-first autoresearch loop.
    /// Starts from `safe_canary_v1` and switches on the deterministic
    /// tool-output codecs, which change surface syntax but preserve
    /// reconstruction or explicit semantic fields for their supported
    /// structures.
    #[must_use]
    pub fn agentic_canary_v2() -> Self {
        let codecs = [
            "json_records_table",
            "numeric_range_lines",
            "repeated_chunk_dict",
            "tool_schema_semantic_table",
        ];
        let mut ruleset = Self::safe_canary_v1();
        for codec in codecs {
            ruleset.enabled.insert(codec.to_owned(), true);
            ruleset.weights.insert(codec.to_owned(), 1.0);
        }
        ruleset.version = Some("agentic-canary-v2-quality-ready-codecs".to_owned());
        ruleset
    }

    /// True if the named rule is enabled.
    ///
    /// A rule is off when `enabled` maps it to `false`, or when its
    /// weight is present and below [`ENABLE_WEIGHT_THRESHOLD`] (so a
    /// bandit can soft-demote without explicit toggling). A rule that
    /// appears in neither map is on.
    #[must_use]
    pub fn is_enabled(&self, rule: &str) -> bool {
        let switched_off = self.enabled.get(rule).copied() == Some(false);
        let demoted = self
            .weights
            .get(rule)
            .map_or(false, |w| *w < ENABLE_WEIGHT_THRESHOLD);
        !(switched_off || demoted)
    }

    /// Read effective weight for a rule. Missing entries return 1.0.
    #[must_use]
    pub fn weight(&self, rule: &str) -> f32 {
        match self.weights.get(rule) {
            Some(w) => *w,
            None => 1.0,
        }
    }

    /// Parse a TOML document into a `RuleSet`. Entries in `enabled` /
    /// `weights` whose rule name is not in `RULE_NAMES` are dropped
    /// after parsing (defensive against a corrupted ruleset file).
    ///
    /// # Errors
    /// Returns the underlying TOML parse error.
    pub fn from_toml_str(s: &str) -> Result<Self, toml::de::Error> {
        let is_known = |name: &String| RULE_NAMES.contains(&name.as_str());
        let mut parsed = toml::from_str::<Self>(s)?;
        parsed.enabled.retain(|name, _| is_known(name));
        parsed.weights.retain(|name, _| is_known(name));
        Ok(parsed)
    }

    /// Load a `RuleSet` from a TOML file on disk.
    ///
    /// # Errors
    /// I/O errors from reading `path`, or a parse error wrapped with a
    /// `ruleset parse:` prefix.
    pub fn from_toml_file(path: &Path) -> anyhow::Result<Self> {
        let raw = std::fs::read_to_string(path)?;
        match Self::from_toml_str(&raw) {
            Ok(ruleset) => Ok(ruleset),
            Err(e) => Err(anyhow::anyhow!("ruleset parse: {e}")),
        }
    }

    /// Render this ruleset as a TOML document. Used by the controller
    /// to write a tuned ruleset back to disk before signalling reload.
    ///
    /// # Errors
    /// Serialization errors from `toml::to_string_pretty`.
    pub fn to_toml_string(&self) -> Result<String, toml::ser::Error> {
        toml::to_string_pretty(self)
    }
}

/// Encode `text` with the symbolic pipeline and return only the
/// compressed string, discarding the per-rule trace. Pure function;
/// deterministic.
#[must_use]
pub fn encode_symbolic(text: &str) -> String {
    let (encoded, _trace) = encode_symbolic_traced(text);
    encoded
}

/// Encode `text` with the symbolic pipeline and return the compressed
/// string together with a per-rule firing trace.
///
/// Pure function; deterministic. Equivalent to calling
/// [`encode_symbolic_traced_with`] with [`RuleSet::default_v1`]. That
/// ruleset leaves the `success` / `failure` glyph rules disabled, so
/// those counters stay at 0 here. Kept as a stable convenience wrapper
/// for callers that pre-date hot-reloadable rules.
#[must_use]
pub fn encode_symbolic_traced(text: &str) -> (String, EncoderTrace) {
    let ruleset = RuleSet::default_v1();
    encode_symbolic_traced_with(text, &ruleset)
}

/// Run the symbolic-encoding pipeline against `text` under the
/// supplied `RuleSet`, returning the compressed text plus a
/// per-rule firing trace.
///
/// A disabled rule contributes 0 to the trace and produces no
/// substitutions in the output. The pipeline order is fixed (the
/// substitutions are applied longest-first, then the if/success/
/// failure/arrow/vs/and rules in declaration order, then filler
/// removal, then whitespace normalisation) so disabling a rule
/// never changes the order in which the remaining rules fire.
#[must_use]
pub fn encode_symbolic_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
    let mut trace = EncoderTrace::default();
    let mut t: String = text.to_owned();
    // Step-9 per-rule byte-delta attribution. Each rule snapshots
    // `t.len()` before the transform and records the UTF-8 byte delta
    // into `trace.bytes_saved[IDX_X]` when it fires. Matches the
    // shim-side pattern shipped for the tool-result pipeline in
    // commits 0e44d8f / 342741b. Closes the 17-rule roster.
    if rs.is_enabled("term_substitutions") {
        let before = t.len() as u64;
        let mut fired = false;
        for (re, short) in SUB_RULES.iter() {
            let n = u32::try_from(re.find_iter(&t).count()).unwrap_or(u32::MAX);
            if n > 0 {
                trace.term_substitutions = trace.term_substitutions.saturating_add(n);
                t = re.replace_all(&t, *short).into_owned();
                fired = true;
            }
        }
        if fired {
            trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] =
                before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("if_prefix") {
        let n = u32::try_from(IF_PREFIX.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.if_prefix = n;
        if n > 0 {
            let before = t.len() as u64;
            t = IF_PREFIX.replace_all(&t, "").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_IF_PREFIX] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("success") {
        let n = u32::try_from(SUCCESS.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.success = n;
        if n > 0 {
            let before = t.len() as u64;
            t = SUCCESS.replace_all(&t, "\u{2713}").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_SUCCESS] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("failure") {
        let n = u32::try_from(FAILURE.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.failure = n;
        if n > 0 {
            let before = t.len() as u64;
            t = FAILURE.replace_all(&t, "\u{2717}").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_FAILURE] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("arrow") {
        let n = u32::try_from(ARROW.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.arrow = n;
        if n > 0 {
            let before = t.len() as u64;
            t = ARROW.replace_all(&t, "\u{2192}").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_ARROW] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("vs") {
        let n = u32::try_from(VS.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.vs = n;
        if n > 0 {
            let before = t.len() as u64;
            t = VS.replace_all(&t, "vs").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_VS] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("and") {
        let n = u32::try_from(AND.find_iter(&t).count()).unwrap_or(u32::MAX);
        trace.and = n;
        if n > 0 {
            let before = t.len() as u64;
            t = AND.replace_all(&t, "+").into_owned();
            trace.bytes_saved[EncoderTrace::IDX_AND] = before.saturating_sub(t.len() as u64);
        }
    }
    if rs.is_enabled("filler_removed") {
        let before = t.len() as u64;
        let words_before = t.split_whitespace().count();
        let kept: Vec<&str> = t.split_whitespace().filter(|w| !is_filler(w)).collect();
        let removed = u32::try_from(words_before.saturating_sub(kept.len())).unwrap_or(u32::MAX);
        trace.filler_removed = removed;
        t = kept.join(" ");
        if removed > 0 {
            trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED] =
                before.saturating_sub(t.len() as u64);
        }
    }
    t = PUNCT_GAP.replace_all(&t, "$1 ").into_owned();
    t = MULTI_WS.replace_all(&t, " ").into_owned();
    (t.trim().to_owned(), trace)
}

/// Compress a single inline prose fragment (e.g. a heading body, a
/// list-item body, or a joined paragraph).
///
/// This is a thin wrapper: the fragment is handed unchanged to
/// [`encode_symbolic_traced_with`], so it goes through the full flat
/// rule pipeline (including its final whitespace normalisation). The
/// per-rule trace produced for the fragment is merged into
/// `trace_accum` so the caller can report rule fires across all
/// segments of a document.
///
/// Returns the compressed fragment text.
fn compress_inline(body: &str, rs: &RuleSet, trace_accum: &mut EncoderTrace) -> String {
    let (encoded, fragment_trace) = encode_symbolic_traced_with(body, rs);
    trace_accum.merge(fragment_trace);
    encoded
}

/// Classification for a single markdown line.
///
/// Produced by `classify_line`. Variants that carry a `body` hold the
/// text that follows the structural marker; variants that carry a
/// `prefix` hold the leading indentation plus the marker itself.
enum LineKind {
    /// Line is empty or consists only of whitespace.
    Blank,
    /// Line starts (after leading whitespace) with "```" or "~~~".
    Fence,
    /// ATX heading: `prefix` is indent + 1-6 `#` + one space, `body` is
    /// the heading text with leading whitespace removed.
    Heading { prefix: String, body: String },
    /// Bullet (`- `, `* `, `+ `) or ordered (`N. `, `N) `) list item:
    /// `prefix` is indent + marker, `body` is the rest of the line.
    ListItem { prefix: String, body: String },
    /// Line starting (after leading whitespace) with `> `; `body` is the
    /// text after that marker. The indentation is not recorded.
    Blockquote { body: String },
    /// Line starting with `|` that contains at least two `|` characters.
    Table,
    /// Anything that matched none of the above.
    Prose,
}

/// Classify one raw input line into a [`LineKind`].
///
/// Checks run in a fixed order and the first match wins: blank, fence,
/// ATX heading, bullet list item, ordered list item, blockquote, table
/// row, and finally prose. Leading whitespace is ignored for matching
/// but is preserved in the `prefix` of headings and list items.
///
/// This runs once per input line, so it avoids heap allocation until a
/// variant that actually owns text is returned.
fn classify_line(line: &str) -> LineKind {
    if line.trim().is_empty() {
        return LineKind::Blank;
    }
    let trimmed = line.trim_start();
    // Leading whitespace of the raw line; reused by every prefix below.
    let indent = &line[..line.len() - trimmed.len()];

    if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
        return LineKind::Fence;
    }

    // ATX heading: 1-6 '#' followed by a space. A line without any '#'
    // yields `hashes == 0` and falls through.
    let rest = trimmed.trim_start_matches('#');
    let hashes = trimmed.len() - rest.len();
    if (1..=6).contains(&hashes) && rest.starts_with(' ') {
        return LineKind::Heading {
            prefix: format!("{}{} ", indent, "#".repeat(hashes)),
            body: rest.trim_start().to_owned(),
        };
    }

    // Bullet list: marker must be followed by a non-empty body.
    for marker in ["- ", "* ", "+ "] {
        if let Some(body) = trimmed.strip_prefix(marker).filter(|b| !b.is_empty()) {
            return LineKind::ListItem {
                prefix: format!("{}{}", indent, marker),
                body: body.to_owned(),
            };
        }
    }

    // Ordered list: `\d{1,3}\. ` or `\d{1,3}\) `. ASCII digits are one
    // byte each, so the byte count is a valid split point.
    let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count();
    if (1..=3).contains(&digit_count) {
        let (digits, after_digits) = trimmed.split_at(digit_count);
        for sep in [". ", ") "] {
            if let Some(body) = after_digits.strip_prefix(sep).filter(|b| !b.is_empty()) {
                return LineKind::ListItem {
                    prefix: format!("{}{}{}", indent, digits, sep),
                    body: body.to_owned(),
                };
            }
        }
    }

    if let Some(body) = trimmed.strip_prefix("> ") {
        return LineKind::Blockquote {
            body: body.to_owned(),
        };
    }

    // Table row: starts with a pipe and has at least one more.
    if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 {
        return LineKind::Table;
    }

    LineKind::Prose
}

/// Segment-aware symbolic compression (Phase B of B8 fix).
///
/// Splits markdown input into block-level segments, compresses prose
/// content within each segment via [`encode_symbolic_traced_with`], and
/// rejoins with structural markers preserved. Compared with the flat
/// encoder, this path retains paragraph breaks, headings, list-item
/// markers, blockquotes, and fenced-code blocks while still applying
/// the rule set to the words inside each segment.
///
/// Verbatim-preserved blocks: fenced code (entire fence content),
/// indented code, table rows, blank lines.
///
/// Structure-preserving compressed blocks: heading prefixes kept,
/// list-item markers kept, blockquote prefix kept; the body text after
/// each marker is compressed inline.
///
/// Paragraph handling: consecutive plain-prose lines are joined with a
/// single space into one paragraph, compressed as one unit, and emitted
/// on a single line. Markdown renders soft-wrapped paragraphs the same
/// as single-line paragraphs, so this is lossless at the rendered-
/// document boundary.
///
/// Fence handling: a fence is closed only by a line made of the same
/// marker character as the opener, repeated at least as many times,
/// with nothing but whitespace after it. A "```" line inside a "~~~"
/// block (or a shorter run inside a longer fence) is therefore code,
/// not a closer.
///
/// Returns the rewritten text and the per-rule trace accumulated over
/// every compressed segment. The output has exactly the same
/// trailing-newline state as `text`.
pub fn encode_symbolic_structural_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
    let mut out = String::with_capacity(text.len());
    let mut trace = EncoderTrace::default();
    let mut prose_buf = String::new();
    // `Some((marker, run_len))` while inside a fenced code block, where
    // `marker` is '`' or '~' and `run_len` is the length of the opening
    // run. Needed to recognise the matching closer.
    let mut open_fence: Option<(char, usize)> = None;
    // T14: track whether we're currently inside an indented-code
    // block. Markdown (CommonMark §4.4) defines these as a run of
    // one-or-more lines that begin with at least 4 spaces (or 1
    // tab), opened by a blank line or the start of the document, and
    // closed by a line that is NOT 4-space/tab indented AND is not
    // blank. Lines inside the block are emitted verbatim; the prose
    // pipeline never sees them.
    let mut in_indented_code = false;
    let mut prev_line_blank = true; // start-of-doc counts as blank

    // Compress and emit the pending paragraph, if any, as one line.
    let flush = |prose_buf: &mut String, out: &mut String, trace: &mut EncoderTrace| {
        if prose_buf.is_empty() {
            return;
        }
        let compressed = compress_inline(prose_buf, rs, trace);
        out.push_str(&compressed);
        out.push('\n');
        prose_buf.clear();
    };

    for line in text.split('\n') {
        if let Some((marker, open_len)) = open_fence {
            // Inside a fence: emit verbatim; leave only on a matching
            // closing fence (same marker, run at least as long, no
            // trailing info string).
            out.push_str(line);
            out.push('\n');
            let candidate = line.trim_start();
            // `marker` is ASCII, so the char count is also a byte offset.
            let run = candidate.chars().take_while(|&c| c == marker).count();
            if run >= open_len && candidate[run..].trim().is_empty() {
                open_fence = None;
            }
            prev_line_blank = false;
            continue;
        }

        // T14 indented-code detection. The spec-correct minimum is
        // 4 spaces OR 1 tab. We recognise both. Staying inside the
        // block on later non-blank indented lines means only the
        // opening needs the blank-line precondition; continuation
        // lines can have any prose content as long as the indent is
        // still there.
        let is_indented = line.starts_with("    ") || line.starts_with('\t');
        let is_blank = line.trim().is_empty();
        if in_indented_code {
            if is_indented || is_blank {
                // still inside the block: emit verbatim
                out.push_str(line);
                out.push('\n');
                prev_line_blank = is_blank;
                continue;
            }
            // non-indented, non-blank line closes the block
            in_indented_code = false;
        } else if prev_line_blank && is_indented && !is_blank {
            // opening of an indented-code block
            flush(&mut prose_buf, &mut out, &mut trace);
            in_indented_code = true;
            out.push_str(line);
            out.push('\n');
            prev_line_blank = false;
            continue;
        }

        match classify_line(line) {
            LineKind::Fence => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push_str(line);
                out.push('\n');
                // Remember what opened the fence so only a matching
                // line can close it.
                let opener = line.trim_start();
                let marker = if opener.starts_with('~') { '~' } else { '`' };
                let open_len = opener.chars().take_while(|&c| c == marker).count();
                open_fence = Some((marker, open_len));
            }
            LineKind::Blank => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push('\n');
            }
            LineKind::Heading { prefix, body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str(&prefix);
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::ListItem { prefix, body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str(&prefix);
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::Blockquote { body } => {
                flush(&mut prose_buf, &mut out, &mut trace);
                let body_c = compress_inline(&body, rs, &mut trace);
                out.push_str("> ");
                out.push_str(&body_c);
                out.push('\n');
            }
            LineKind::Table => {
                flush(&mut prose_buf, &mut out, &mut trace);
                out.push_str(line);
                out.push('\n');
            }
            LineKind::Prose => {
                // Soft-wrapped paragraph lines are joined with a space
                // and compressed together at the next flush.
                if !prose_buf.is_empty() {
                    prose_buf.push(' ');
                }
                prose_buf.push_str(line);
            }
        }
        // T14: update the blank-line bookkeeping for the next
        // iteration. We use the raw input line's emptiness, not the
        // classification — a LineKind::Blank is always is_blank, but
        // we need the raw truth for indented-code detection.
        prev_line_blank = is_blank;
    }
    flush(&mut prose_buf, &mut out, &mut trace);

    // `split('\n')` yields one more segment than there are newlines in
    // `text` (a newline-terminated input ends with an empty segment),
    // and the final segment always leaves a '\n' at the end of `out`.
    // That last '\n' is therefore always surplus: dropping it mirrors
    // `text`'s trailing-newline state for terminated and non-terminated
    // input alike.
    if out.ends_with('\n') {
        out.pop();
    }
    (out, trace)
}

/// Production encoder backed by a `Measurer` for token accounting.
///
/// The measurer is consulted to check whether a model is supported and
/// to count tokens of the original and encoded text, which decides
/// whether the symbolic output is kept or the input falls back to prose.
pub struct SymbolicEncoder {
    /// Shared token counter used for every support check and token count.
    measurer: Arc<dyn Measurer>,
}

impl SymbolicEncoder {
    /// Wrap a measurer (typically `LocalMeasurer::with_defaults()`).
    #[must_use]
    pub fn new(measurer: Arc<dyn Measurer>) -> Self {
        Self { measurer }
    }

    fn hash(text: &str) -> String {
        let mut h = Hasher::new();
        h.update(text.as_bytes());
        h.finalize().to_hex().to_string()
    }

    fn count_or_zero(&self, text: &str, model: &Model) -> u32 {
        self.measurer.tokenize(text, model).unwrap_or(0)
    }

    fn build(
        &self,
        original: &str,
        compressed: &str,
        format: Format,
        model: Model,
        fallback: Option<FallbackReason>,
    ) -> Compressed {
        let baseline = self.count_or_zero(original, &model);
        let encoded = self.count_or_zero(compressed, &model);
        Compressed {
            content: compressed.to_owned(),
            format,
            baseline_tokens: baseline,
            compressed_tokens: encoded,
            model,
            content_hash: Self::hash(original),
            fallback,
        }
    }
}

impl SymbolicEncoder {
    /// Compress + return the per-rule firing trace alongside the
    /// `Compressed` output. The proxy / MCP server / Python SDK feed
    /// `EncoderTrace` into the audit ledger so post-hoc analysis can
    /// answer questions like "which rules pulled their weight on
    /// production traffic in 2026-W17?".
    ///
    /// On Prose fallback the trace is empty (`EncoderTrace::default`)
    /// because the pipeline never ran.
    #[must_use]
    pub fn compress_traced(&self, input: &str, model: Model) -> (Compressed, EncoderTrace) {
        self.compress_traced_with(input, model, &RuleSet::default_v1())
    }

    /// Same as [`compress_traced`] but consults the supplied
    /// [`RuleSet`] instead of the v1 defaults. Intended for the
    /// hot-reload path: the proxy reads the current ruleset under a
    /// read lock and passes a `&RuleSet` per request.
    #[must_use]
    pub fn compress_traced_with(
        &self,
        input: &str,
        model: Model,
        rs: &RuleSet,
    ) -> (Compressed, EncoderTrace) {
        if !self.measurer.supported(&model) {
            return (
                self.build(
                    input,
                    input,
                    Format::Prose,
                    model,
                    Some(FallbackReason::TokenizerMissing),
                ),
                EncoderTrace::default(),
            );
        }
        let chars = input.chars().count();
        if chars < MIN_INPUT_CHARS {
            return (
                self.build(
                    input,
                    input,
                    Format::Prose,
                    model,
                    Some(FallbackReason::Uncompressible),
                ),
                EncoderTrace::default(),
            );
        }
        if chars > MAX_INPUT_CHARS {
            return (
                self.build(
                    input,
                    input,
                    Format::Prose,
                    model,
                    Some(FallbackReason::OversizedInput),
                ),
                EncoderTrace::default(),
            );
        }
        // B8: for markdown-structured inputs, route through the
        // segment-aware encoder so paragraph breaks, headings,
        // lists, fences, blockquotes and tables survive; only the
        // body text inside each segment goes through the flat
        // pipeline that ends in MULTI_WS. Plain prose keeps the
        // original single-pass flat encoder.
        let (encoded, trace) = if has_structural_markers(input) {
            encode_symbolic_structural_traced_with(input, rs)
        } else {
            encode_symbolic_traced_with(input, rs)
        };
        let baseline = self.count_or_zero(input, &model);
        let candidate = self.count_or_zero(&encoded, &model);
        if candidate >= baseline {
            return (
                self.build(
                    input,
                    input,
                    Format::Prose,
                    model,
                    Some(FallbackReason::Uncompressible),
                ),
                EncoderTrace::default(),
            );
        }
        (
            self.build(input, &encoded, Format::Symbolic, model, None),
            trace,
        )
    }
}

impl Encoder for SymbolicEncoder {
    fn compress(&self, input: &str, model: Model) -> Compressed {
        self.compress_traced(input, model).0
    }

    fn select_format(&self, input: &str, model: Model) -> Format {
        if !self.measurer.supported(&model) || input.chars().count() < MIN_INPUT_CHARS {
            return Format::Prose;
        }
        let encoded = encode_symbolic(input);
        if self.count_or_zero(&encoded, &model) >= self.count_or_zero(input, &model) {
            Format::Prose
        } else {
            Format::Symbolic
        }
    }

    fn fallback(&self, input: &str, model: Model, reason: FallbackReason) -> Compressed {
        self.build(input, input, Format::Prose, model, Some(reason))
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizers::LocalMeasurer;

    /// Test fixture: a `SymbolicEncoder` over the default local measurer.
    fn enc() -> SymbolicEncoder {
        let measurer = LocalMeasurer::with_defaults().expect("measurer");
        SymbolicEncoder::new(Arc::new(measurer))
    }

    // B8 regression: the flat Symbolic pipeline collapses all
    // whitespace into single spaces via MULTI_WS. If it ever ran on a
    // markdown-structured input, all headings/lists/code fences/
    // paragraph breaks would be destroyed. These tests pin the
    // structural-marker gate (`has_structural_markers`) that routes
    // such input to the segment-aware structural encoder instead.

    #[test]
    fn structural_gate_detects_paragraph_breaks() {
        // A blank line between two paragraphs counts as structure.
        let two_paragraphs = "foo\n\nbar";
        assert!(has_structural_markers(two_paragraphs));
    }

    #[test]
    fn structural_gate_detects_headings() {
        // Heading on the first line, then a heading in the middle.
        let leading = "# Title\ncontent follows";
        let embedded = "content\n## Subheading\nmore";
        assert!(has_structural_markers(leading));
        assert!(has_structural_markers(embedded));
    }

    #[test]
    fn structural_gate_detects_lists() {
        // Bullet list, then ordered list.
        let bullets = "intro\n- item one\n- item two";
        let ordered = "intro\n1. first\n2. second";
        assert!(has_structural_markers(bullets));
        assert!(has_structural_markers(ordered));
    }

    #[test]
    fn structural_gate_detects_fenced_code() {
        // Bare fence after prose, then a fence with an info string.
        let bare_fence = "prose\n```\ncode\n```";
        let tagged_fence = "```rust\nfn main() {}\n```";
        assert!(has_structural_markers(bare_fence));
        assert!(has_structural_markers(tagged_fence));
    }

    #[test]
    fn structural_gate_detects_tables_and_blockquotes() {
        let table = "col\n| a | b |\n|---|---|\n| 1 | 2 |";
        let quote = "context\n> quoted line\nafter";
        assert!(has_structural_markers(table));
        assert!(has_structural_markers(quote));
    }

    #[test]
    fn structural_gate_lets_flat_prose_through() {
        // Single-line prose has no markers and must not trip the gate.
        let flat = "one sentence. another sentence. a third. no line breaks here.";
        assert!(!has_structural_markers(flat));
    }

    #[test]
    fn markdown_input_preserves_structure_through_compression() {
        // B8 Phase B: markdown goes through the structural encoder.
        // Whether the result is Symbolic (compression won) or Prose
        // (encoder could not beat baseline), the structure -- paragraph
        // breaks, heading prefix, list markers -- must be intact in the
        // returned content. Body text may be rewritten by the rules.
        let md = "# Heading\n\nFirst paragraph with enough body to clear the thirty-two-char floor.\n\n- list item one\n- list item two\n\nSecond paragraph follows here.";
        let content = enc().compress_traced(md, Model::ClaudeSonnet47).0.content;

        // Heading prefix preserved.
        assert!(
            content.starts_with("# "),
            "expected heading prefix preserved, got: {content:?}"
        );
        // Paragraph breaks preserved.
        assert!(
            content.contains("\n\n"),
            "expected paragraph break preserved, got: {content:?}"
        );
        // List-item markers preserved (at least one).
        assert!(
            content.contains("\n- "),
            "expected list-item marker preserved, got: {content:?}"
        );
        // Newline count must not collapse to zero.
        let newlines = content.matches('\n').count();
        assert!(
            newlines >= 4,
            "expected >=4 newlines (paragraph + 2 list + blank), got {newlines} in {content:?}"
        );
    }

    #[test]
    fn structural_encoder_preserves_fenced_code_verbatim() {
        use crate::RuleSet;
        // The fenced block must come out exactly as it went in.
        let fenced = "```rust\nfn main() {\n    println!(\"x\");\n}\n```";
        let md =
            "Intro paragraph.\n\n```rust\nfn main() {\n    println!(\"x\");\n}\n```\n\nEpilogue.";
        let rules = RuleSet::default_v1();
        let out = encode_symbolic_structural_traced_with(md, &rules).0;
        assert!(
            out.contains(fenced),
            "fenced code must be preserved byte-for-byte, got: {out:?}"
        );
    }

    // T14 regression tests: indented-code blocks (CommonMark §4.4)
    // should be preserved verbatim through the structural encoder.

    #[test]
    fn structural_encoder_preserves_four_space_indented_code() {
        use crate::RuleSet;
        let md = "intro paragraph.\n\n    fn check(token: &Token) -> bool {\n        token.expires_at <= Utc::now()\n    }\n\nepilogue paragraph.";
        let rules = RuleSet::default_v1();
        let out = encode_symbolic_structural_traced_with(md, &rules).0;

        // Opening line of the indented block.
        let opening = "    fn check(token: &Token) -> bool {";
        assert!(
            out.contains(opening),
            "four-space indented code must be verbatim, got: {out:?}"
        );
        // Deeper-indented continuation line.
        let continuation = "        token.expires_at <= Utc::now()";
        assert!(
            out.contains(continuation),
            "indented-code continuation (8 spaces) must be verbatim, got: {out:?}"
        );
        // Closing brace keeps its four-space indent.
        let closing = "    }";
        assert!(
            out.contains(closing),
            "closing brace line must be verbatim, got: {out:?}"
        );
    }

    #[test]
    fn structural_encoder_preserves_tab_indented_code() {
        use crate::RuleSet;
        // A single leading tab opens an indented-code block too.
        let md = "intro.\n\n\tlet x = 1;\n\tlet y = 2;\n\nafter.";
        let rules = RuleSet::default_v1();
        let out = encode_symbolic_structural_traced_with(md, &rules).0;
        assert!(
            out.contains("\tlet x = 1;"),
            "tab-indented code must be verbatim, got: {out:?}"
        );
    }

    #[test]
    fn structural_encoder_compresses_paragraph_body() {
        use crate::RuleSet;
        // The rule pipeline must still run on paragraph text while the
        // paragraph break itself is kept.
        let md = "Title line no header.\n\nThe authentication module sends a request to the policy engine and it returns a result.";
        let rules = RuleSet::default_v1();
        let out = encode_symbolic_structural_traced_with(md, &rules).0;
        assert!(out.contains("\n\n"), "paragraph break preserved");
        // Substitution evidence: the long term must no longer appear.
        let rewritten = !out.contains("authentication module");
        assert!(
            rewritten,
            "expected term_substitutions to rewrite 'authentication module', got: {out:?}"
        );
    }

    #[test]
    fn substitutes_authorization_term() {
        // The fixture exercises the "authentication module" and
        // "policy engine" substitutions.
        let input = "The user authentication module sends the request to the policy engine.";
        let out = encode_symbolic(input);
        assert!(out.contains("A.mod"), "expected A.mod in {out}");
        assert!(out.contains("PE"), "expected PE in {out}");
    }

    #[test]
    fn drops_filler_words() {
        // 'the', 'is', 'in' are all filler; 'user' and 'system' are
        // kept. Only 'the' and 'is' are asserted on.
        let lowered = encode_symbolic("The user is in the system.").to_lowercase();
        let words: Vec<&str> = lowered.split_whitespace().collect();
        assert!(!words.contains(&"the"));
        assert!(!words.contains(&"is"));
    }

    #[test]
    fn drops_expanded_prepositions_and_intensifiers() {
        // Regression gate for the 2026-04-24 FILLER_WORDS expansion.
        // Prepositions (for, about, through, during, via, per, over,
        // around, within) and degree-only intensifiers (just, only,
        // very, quite, really, actually, simply) must all strip.
        // Content words must survive.
        let input = "The request is just really very important for the handler to actually log during the call.";
        let out = encode_symbolic(input);
        let lowered = out.to_lowercase();
        let tokens: std::collections::HashSet<&str> = lowered.split_whitespace().collect();

        for stripped in ["for", "during", "just", "really", "very", "actually"] {
            let still_present = tokens.contains(stripped);
            assert!(
                !still_present,
                "filler `{stripped}` must be stripped from: {out}",
            );
        }
        // 'request' / 'handler' may be rewritten by TERM_SUBSTITUTIONS
        // (e.g. -> 'R', 'H'), so check content-bearing words that are
        // *not* on the substitution table.
        for kept in ["important", "log", "call"] {
            let survived = tokens.iter().any(|token| token.contains(kept));
            assert!(survived, "content word `{kept}` must survive: {out}",);
        }
    }

    #[test]
    fn polarity_bearing_words_are_not_filler() {
        // Guardrail pinned after the 2026-04-24 expansion: negations
        // (not / never / no / nothing) and epistemic hedges (maybe /
        // perhaps / likely / possibly) must never be on the filler
        // list, because dropping them silently flips the polarity or
        // assertion strength of the compressed prompt.
        let protected = [
            "not", "never", "no", "nothing", "maybe", "perhaps", "likely", "possibly",
        ];
        for forbidden in protected {
            let listed = crate::encoder::FILLER_WORDS.contains(&forbidden);
            assert!(
                !listed,
                "polarity-bearing word `{forbidden}` must NOT be in FILLER_WORDS",
            );
        }
    }

    #[test]
    fn arrow_replacement() {
        // `invokes` is a single-word ARROW match (unlike `forwards to`,
        // which needs `to` in the very next token slot), so the arrow
        // glyph must appear in the output.
        let input = "The handler invokes the policy engine.";
        let out = encode_symbolic(input);
        assert!(out.contains('\u{2192}'), "missing arrow in {out}");
    }

    #[test]
    fn success_glyph_replacement_when_rule_is_explicitly_enabled() {
        // `success` is off in `default_v1` as of 2026-04-24 (ablation
        // study: the glyph tokenizes to more tokens than the ASCII
        // word). The rule itself must still work for callers that opt
        // in, so enable it on an explicit ruleset.
        let mut rules = RuleSet::default_v1();
        rules.enabled.insert("success".to_owned(), true);
        rules.weights.insert("success".to_owned(), 1.0);
        let input = "If validation succeeds the request continues.";
        let out = encode_symbolic_traced_with(input, &rules).0;
        assert!(out.contains('\u{2713}'), "missing check in {out}");
    }

    #[test]
    fn failure_glyph_replacement_when_rule_is_explicitly_enabled() {
        // Mirror of the `success` opt-in test, for the `failure` rule.
        let mut rules = RuleSet::default_v1();
        rules.enabled.insert("failure".to_owned(), true);
        rules.weights.insert("failure".to_owned(), 1.0);
        let input = "If validation fails the request is rejected.";
        let out = encode_symbolic_traced_with(input, &rules).0;
        assert!(out.contains('\u{2717}'), "missing cross in {out}");
    }

    #[test]
    fn default_v1_disables_success_and_failure_glyphs() {
        // Regression gate for the 2026-04-24 ablation finding: the
        // `success` / `failure` glyph substitutions made output LARGER
        // on real corpora, so both must stay OFF in the first-boot /
        // fallback ruleset. Guards against a silent revert that would
        // reintroduce the -55-token-per-corpus regression.
        let rules = RuleSet::default_v1();
        let success_flag = rules.enabled.get("success").copied();
        assert_eq!(success_flag, Some(false), "success must be OFF by default",);
        let failure_flag = rules.enabled.get("failure").copied();
        assert_eq!(failure_flag, Some(false), "failure must be OFF by default",);

        // With both rules off, neither glyph appears and neither
        // counter moves.
        let input = "If validation succeeds the call fails and the handler logs it.";
        let (out, trace) = encode_symbolic_traced_with(input, &rules);
        assert!(!out.contains('\u{2713}'));
        assert!(!out.contains('\u{2717}'));
        assert_eq!(trace.success, 0);
        assert_eq!(trace.failure, 0);
    }

    #[test]
    fn longer_term_wins_over_shorter() {
        // The two-word term must be matched as a unit:
        // "authentication module" -> "A.mod", never
        // "authentication" -> "A" followed by a literal "module".
        let encoded = encode_symbolic("The authentication module handles login.");
        assert!(encoded.contains("A.mod"));
        assert!(!encoded.contains("A module"));
    }

    #[test]
    fn idempotent_on_minimal_input() {
        // A tiny input with nothing to rewrite comes back unchanged.
        assert_eq!(encode_symbolic("hi"), "hi");
    }

    #[test]
    fn compress_returns_symbolic_when_net_positive() {
        // Term-rich prose must compress to fewer tokens and therefore
        // be kept as Symbolic with no fallback reason.
        let inp = "The authentication module forwards the request to the policy engine \
                   for validation against the session store.";
        let encoder = enc();
        let out = encoder.compress(inp, Model::Gpt4);
        assert!(out.fallback.is_none());
        assert!(out.compressed_tokens < out.baseline_tokens, "{out:?}");
        assert_eq!(out.format, Format::Symbolic);
    }

    #[test]
    fn compress_falls_back_when_too_short() {
        // Below the minimum-length floor the input passes through as
        // Prose, tagged Uncompressible.
        let result = enc().compress("hello world", Model::Gpt4);
        assert_eq!(result.format, Format::Prose);
        let reason = result.fallback;
        assert!(matches!(reason, Some(FallbackReason::Uncompressible)));
    }

    #[test]
    fn compress_falls_back_for_unregistered_model() {
        // A model the measurer does not support yields a Prose
        // passthrough tagged TokenizerMissing.
        let input = "The authentication module forwards the request to the policy engine.";
        let result = enc().compress(input, Model::Gemini25Pro);
        assert_eq!(result.format, Format::Prose);
        let reason = result.fallback;
        assert!(matches!(reason, Some(FallbackReason::TokenizerMissing)));
    }

    #[test]
    fn select_format_matches_compress_choice() {
        // The format predicted up front must equal the format that
        // `compress` actually produces for the same input and model.
        let inp = "The authentication module forwards the request to the policy engine \
                   for validation against the session store.";
        let predicted = enc().select_format(inp, Model::Gpt4);
        let actual = enc().compress(inp, Model::Gpt4).format;
        assert_eq!(predicted, actual);
    }

    #[test]
    fn explicit_fallback_returns_prose() {
        // A caller-requested fallback returns Prose and carries the
        // caller's reason through unchanged.
        let input = "The authentication module forwards the request.";
        let result = enc().fallback(input, Model::Gpt4, FallbackReason::QualityDegraded);
        assert_eq!(result.format, Format::Prose);
        let reason = result.fallback;
        assert!(matches!(reason, Some(FallbackReason::QualityDegraded)));
    }

    #[test]
    fn content_hash_is_blake3_of_original_not_compressed() {
        let inp = "The authentication module forwards the request to the policy engine.";
        // Expected digest: BLAKE3 over the *original* input bytes.
        let mut hasher = Hasher::new();
        hasher.update(inp.as_bytes());
        let expected = hasher.finalize().to_hex().to_string();

        let out = enc().compress(inp, Model::Gpt4);
        assert_eq!(out.content_hash, expected);
    }

    #[test]
    fn trace_records_term_substitution_count() {
        // Candidate hits: 'authentication', 'request', 'validation',
        // 'session store', 'policy engine'. The exact number depends on
        // overlap handling, so only a floor is asserted.
        let input = "The authentication module forwards a request to the policy engine \
                     for validation against the session store.";
        let t = encode_symbolic_traced(input).1;
        assert!(t.term_substitutions >= 3, "{t:?}");
    }

    #[test]
    fn trace_records_filler_removal_count() {
        // Several fillers ('the', 'is', 'in', 'and', ...) occur here;
        // some may already be consumed by term hits, so only a floor
        // of four removals is asserted.
        let input = "The user is in the system and is using the database.";
        let t = encode_symbolic_traced(input).1;
        assert!(t.filler_removed >= 4, "{t:?}");
    }

    #[test]
    fn trace_no_fire_for_neutral_text() {
        // Text with no substitutable terms and no `if` prefix: those
        // two counters stay at zero, and if anything fired at all it
        // must include filler removal.
        let neutral = "Lorem ipsum dolor sit amet consectetur";
        let t = encode_symbolic_traced(neutral).1;
        assert_eq!(t.term_substitutions, 0);
        assert_eq!(t.if_prefix, 0);
        assert!(t.filler_removed > 0 || !t.any_fired());
    }

    #[test]
    fn step9_bytes_saved_populated_when_multiple_rules_fire() {
        // Step-9 integration test. The input is built to fire at least
        // term_substitutions and filler_removed. Checked properties:
        //   1. Every firing rule records bytes_saved > 0 in its slot.
        //   2. A rule that cannot fire stays at 0.
        //   3. The per-rule deltas sum to a positive value that does
        //      not exceed the overall input-to-output shrink.
        let input = "The authentication module forwards a request to the policy \
                     engine for validation against the session store.";
        let (out, trace) = encode_symbolic_traced(input);

        // (1) The two rules this input targets both saved bytes.
        let term_saved = trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS];
        assert!(
            term_saved > 0,
            "term_substitutions should have saved bytes; trace={trace:?}"
        );
        let filler_saved = trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED];
        assert!(
            filler_saved > 0,
            "filler_removed should have saved bytes; trace={trace:?}"
        );

        // (2) No ANSI escapes in plain prose, so that slot is zero.
        let ansi_saved = trace.bytes_saved[EncoderTrace::IDX_ANSI_STRIPPED];
        assert_eq!(ansi_saved, 0, "ansi_stripped cannot fire on plain prose");

        // (3) Rules compose -- rule N runs on rule N-1's output -- so
        // the sum of deltas is a lower bound on the total bytes
        // removed rather than an exact figure. It must be positive and
        // must not exceed the total.
        let total_delta = (input.len() as u64).saturating_sub(out.len() as u64);
        let sum_deltas: u64 = trace.bytes_saved.iter().copied().sum();
        assert!(sum_deltas > 0, "at least one rule contributed");
        assert!(
            sum_deltas <= total_delta,
            "sum of per-rule deltas ({sum_deltas}) must not exceed \
             total shrink ({total_delta}); input={} output={}",
            input.len(),
            out.len()
        );
    }

    #[test]
    fn compress_traced_returns_empty_trace_on_short_input() {
        // Too-short input bypasses the pipeline, so nothing is traced.
        let trace = enc().compress_traced("hi", Model::Gpt4).1;
        assert_eq!(trace, EncoderTrace::default());
    }

    /// Unsupported-model case (exercised with `Model::Gemini25Pro`): the
    /// returned trace must equal the all-default trace.
    #[test]
    fn compress_traced_returns_empty_trace_on_unsupported_model() {
        let prose = "The authentication module forwards the request.";
        let encoder = enc();
        let (_compressed, rule_trace) = encoder.compress_traced(prose, Model::Gemini25Pro);
        let untouched = EncoderTrace::default();
        assert_eq!(rule_trace, untouched);
    }

    /// Oversized input must come back as `Format::Prose`, flagged with
    /// `FallbackReason::OversizedInput`, and with an all-default trace.
    #[test]
    fn compress_traced_returns_empty_trace_on_oversized_input() {
        // Four characters per repetition, so the payload is
        // 4 x MAX_INPUT_CHARS characters long.
        let oversized = "abc ".repeat(MAX_INPUT_CHARS);
        let encoder = enc();
        let (compressed, rule_trace) = encoder.compress_traced(&oversized, Model::Gpt4);

        assert_eq!(compressed.format, Format::Prose);
        assert!(matches!(
            compressed.fallback,
            Some(FallbackReason::OversizedInput)
        ));
        assert_eq!(rule_trace, EncoderTrace::default());
    }

    /// `as_pairs` must already list its rule names in ascending order.
    #[test]
    fn trace_pairs_are_alphabetical() {
        let emitted = EncoderTrace::default().as_pairs();
        // Sort an independent copy by rule name; if the emitted order is
        // already alphabetical the stable sort leaves it untouched.
        let mut alphabetical = emitted;
        alphabetical.sort_by_key(|pair| pair.0);
        assert_eq!(emitted, alphabetical);
    }

    /// Equivalence check: encoding with `RuleSet::default_v1()` must give
    /// the same output and the same trace as the ruleset-free
    /// `encode_symbolic_traced` path.
    #[test]
    fn ruleset_default_v1_matches_constants_only_encoder() {
        let ruleset = RuleSet::default_v1();
        let samples = [
            "The authentication module forwards the request to the policy engine.",
            "If validation succeeds the request continues. The handler invokes the rate limiter.",
            "User is authorized via the bearer token; resource handler validates the operation.",
        ];
        for inp in samples {
            let legacy = encode_symbolic_traced(inp);
            let configured = encode_symbolic_traced_with(inp, &ruleset);
            // Compare encoded text first, then the per-rule trace.
            assert_eq!(
                legacy.0, configured.0,
                "default_v1 must match legacy on `{inp}`"
            );
            assert_eq!(legacy.1, configured.1, "trace must match on `{inp}`");
        }
    }

    /// Switching the `success` rule off in the ruleset must keep the
    /// check-mark glyph out of the output and the rule's trace at zero.
    #[test]
    fn ruleset_disabled_rule_does_not_fire() {
        const CHECK_MARK: char = '\u{2713}';
        let sentence = "If validation succeeds the request continues to the handler.";

        let mut rules = RuleSet::default_v1();
        rules.enabled.insert(String::from("success"), false);

        let (out, fired) = encode_symbolic_traced_with(sentence, &rules);
        let glyph_present = out.contains(CHECK_MARK);
        assert!(!glyph_present, "success glyph must not appear: {out}");
        assert_eq!(fired.success, 0, "success rule trace must be zero");
    }

    /// A rule whose weight is below `ENABLE_WEIGHT_THRESHOLD` must behave
    /// as if it were disabled.
    #[test]
    fn ruleset_weight_below_threshold_is_treated_as_off() {
        // 0.02 is strictly under `ENABLE_WEIGHT_THRESHOLD`. The threshold
        // was lowered from 0.5 to 0.05 on 2026-04-24, after corpus
        // simulation showed no individual rule could cross 0.5 on real
        // audit data. If the threshold moves again, adjust this weight
        // so it stays below it.
        const ARROW: char = '\u{2192}';
        let sentence = "The handler invokes the policy engine to validate the request.";

        let mut rules = RuleSet::default_v1();
        rules.weights.insert(String::from("arrow"), 0.02);

        let (out, fired) = encode_symbolic_traced_with(sentence, &rules);
        let glyph_present = out.contains(ARROW);
        assert!(!glyph_present, "arrow glyph must not appear: {out}");
        assert_eq!(fired.arrow, 0);
    }

    /// Regression gate for the 2026-04-24 threshold revision.
    #[test]
    fn ruleset_weight_above_threshold_but_below_legacy_half_is_on() {
        // A weight of 0.2 counted as "off" under the former 0.5 gate and
        // must count as "on" under the 0.05 gate. This guards against a
        // later revert quietly bringing back the threshold that disabled
        // every live rule on the real corpus.
        const ARROW: char = '\u{2192}';
        let sentence = "The handler invokes the policy engine to validate the request.";

        let mut rules = RuleSet::default_v1();
        rules.weights.insert(String::from("arrow"), 0.2);

        let (out, fired) = encode_symbolic_traced_with(sentence, &rules);
        let glyph_present = out.contains(ARROW);
        assert!(
            glyph_present,
            "arrow glyph must be applied at weight 0.2 under revised threshold: {out}"
        );
        assert!(fired.arrow > 0);
    }

    /// Keys that are not known rule names must not survive
    /// `RuleSet::from_toml_str`, while known keys keep their values.
    #[test]
    fn ruleset_unrecognised_keys_are_dropped_on_load() {
        // Each table carries one real rule name and one invented one.
        let source = r"
[enabled]
success = false
made_up_rule = true
[weights]
arrow = 0.3
another_made_up = 0.7
";
        let loaded = RuleSet::from_toml_str(source).expect("parse");

        // The recognised key is kept with the value from the document.
        assert_eq!(loaded.enabled.get("success"), Some(&false));
        // The invented keys are absent from both maps.
        assert!(loaded.enabled.get("made_up_rule").is_none());
        assert!(loaded.weights.get("another_made_up").is_none());
    }

    /// Serialising a customised ruleset to TOML and parsing it back must
    /// preserve the enabled flag, the weight, and the version string.
    #[test]
    fn ruleset_round_trip_through_toml() {
        let mut original = RuleSet::default_v1();
        original.version = Some(String::from("test-r1"));
        original.enabled.insert(String::from("success"), false);
        original.weights.insert(String::from("arrow"), 0.42);

        let serialized = original.to_toml_string().expect("serialize");
        let restored = RuleSet::from_toml_str(&serialized).expect("parse");

        assert_eq!(restored.enabled.get("success"), Some(&false));
        // Floating-point weight: compare within a small tolerance.
        let drift = (restored.weight("arrow") - 0.42).abs();
        assert!(drift < 1e-6);
        assert_eq!(restored.version.as_deref(), Some("test-r1"));
    }

    /// Under `RuleSet::safe_canary_v1()` the words "succeeds" and "fails"
    /// must stay in the output, the `success` / `failure` traces must be
    /// zero, and the ruleset must carry its expected version tag.
    #[test]
    fn safe_canary_preserves_success_failure_words() {
        let sentence = "If validation succeeds the request continues. If validation fails the request is rejected.";
        let canary = RuleSet::safe_canary_v1();
        let (out, fired) = encode_symbolic_traced_with(sentence, &canary);

        // Case-insensitive check for the surviving words.
        let kept_success = out.to_lowercase().contains("succeeds");
        assert!(kept_success, "success word should remain: {out}");
        let kept_failure = out.to_lowercase().contains("fails");
        assert!(kept_failure, "failure word should remain: {out}");

        assert_eq!(fired.success, 0);
        assert_eq!(fired.failure, 0);

        let version_tag = canary.version.as_deref();
        assert_eq!(version_tag, Some("safe-canary-v1-no-success-failure"));
    }

    /// `RuleSet::agentic_canary_v2()` must enable the four tool codecs at
    /// weight 1.0, keep `success` / `failure` disabled, and carry its
    /// expected version tag.
    #[test]
    fn agentic_canary_v2_enables_quality_ready_tool_codecs() {
        const TOOL_CODECS: [&str; 4] = [
            "json_records_table",
            "numeric_range_lines",
            "repeated_chunk_dict",
            "tool_schema_semantic_table",
        ];
        let canary = RuleSet::agentic_canary_v2();

        for name in TOOL_CODECS {
            assert!(canary.is_enabled(name), "{name} should be enabled");
            let distance_from_one = (canary.weight(name) - 1.0).abs();
            assert!(distance_from_one < f32::EPSILON);
        }

        assert!(
            !canary.is_enabled("success"),
            "success glyphs stay disabled"
        );
        assert!(
            !canary.is_enabled("failure"),
            "failure glyphs stay disabled"
        );

        let version_tag = canary.version.as_deref();
        assert_eq!(version_tag, Some("agentic-canary-v2-quality-ready-codecs"));
    }

    /// Disabling `term_substitutions` in the ruleset must be honoured by
    /// `compress_traced_with`.
    #[test]
    fn compress_traced_with_respects_ruleset_toggle() {
        let inp = "The authentication module forwards the request to the policy engine \
                   for validation against the session store.";

        let mut rules = RuleSet::default_v1();
        rules
            .enabled
            .insert(String::from("term_substitutions"), false);

        let encoder = enc();
        let (compressed, _trace) = encoder.compress_traced_with(inp, Model::Gpt4, &rules);

        // With substitutions off, the abbreviations checked here
        // (A.mod and PE) must be absent from the encoded content.
        let body = &compressed.content;
        assert!(!body.contains("A.mod"), "{:?}", compressed.content);
        assert!(!body.contains("PE"), "{:?}", compressed.content);
    }

    /// `DoD` §10 perf evidence for the encoder pipeline.
    ///
    /// Times 100 `compress` calls on a repeated prose paragraph, logs the
    /// p50 / p95 / p99 latencies in microseconds, and fails when p95
    /// reaches the 50ms debug-build ceiling.
    #[test]
    fn compress_meets_section_10() {
        use std::time::Instant;
        let e = enc();
        let inp = "The authentication module forwards the request to the policy engine \
                   for validation against the session store and then the response \
                   pipeline returns the result. "
            .repeat(20);
        let mut samples = Vec::with_capacity(100);
        for _ in 0..100 {
            let t = Instant::now();
            let _ = e.compress(&inp, Model::Gpt4);
            samples.push(t.elapsed().as_micros());
        }
        samples.sort_unstable();
        // Nearest-rank percentiles over 100 sorted samples: the p-th
        // percentile is the p-th smallest value, i.e. zero-based index
        // p - 1. (p50 previously read index 50, the 51st sample, which
        // was inconsistent with the p95 / p99 picks.)
        let p50 = samples[49];
        let p95 = samples[94];
        let p99 = samples[98];
        eprintln!(
            "compress {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
            inp.len()
        );
        // Debug-build ceiling 50ms (release target <5ms tracked in ROADMAP).
        assert!(p95 < 50_000, "p95 {p95}us breaches debug ceiling");
    }

    /// Pins every `EncoderTrace::IDX_*` constant to the position of its
    /// rule name in `as_pairs`.
    #[test]
    fn idx_constants_match_as_pairs_order() {
        // Renaming or reordering rule fields in EncoderTrace must not be
        // able to silently desync the IDX_* constants from as_pairs
        // positions; every consumer of bytes_saved[] relies on this.
        let names = EncoderTrace::default().as_pairs().map(|(n, _)| n);

        // (constant, rule name expected at that position)
        let expectations = [
            (EncoderTrace::IDX_AND, "and"),
            (EncoderTrace::IDX_ANSI_STRIPPED, "ansi_stripped"),
            (EncoderTrace::IDX_ARROW, "arrow"),
            (EncoderTrace::IDX_BLANK_LINES, "blank_lines"),
            (EncoderTrace::IDX_FAILURE, "failure"),
            (EncoderTrace::IDX_FILLER_REMOVED, "filler_removed"),
            (EncoderTrace::IDX_IF_PREFIX, "if_prefix"),
            (EncoderTrace::IDX_JSON_MINIFIED, "json_minified"),
            (EncoderTrace::IDX_JSON_RECORDS_TABLE, "json_records_table"),
            (EncoderTrace::IDX_NUMERIC_RANGE_LINES, "numeric_range_lines"),
            (EncoderTrace::IDX_REPEATED_CHUNK_DICT, "repeated_chunk_dict"),
            (EncoderTrace::IDX_REPEATED_LINES, "repeated_lines"),
            (EncoderTrace::IDX_SUCCESS, "success"),
            (EncoderTrace::IDX_TERM_SUBSTITUTIONS, "term_substitutions"),
            (
                EncoderTrace::IDX_TOOL_SCHEMA_SEMANTIC_TABLE,
                "tool_schema_semantic_table",
            ),
            (EncoderTrace::IDX_TRAILING_WS, "trailing_ws"),
            (EncoderTrace::IDX_VS, "vs"),
        ];
        for (slot, rule) in expectations {
            assert_eq!(names[slot], rule);
        }
    }

    /// Step-9 scaffold contract: `bytes_saved_pairs` returns the same
    /// rule names in the same order as `as_pairs`, so the audit-writer
    /// can zip them positionally without a name lookup.
    #[test]
    fn bytes_saved_pairs_parallel_to_as_pairs() {
        // Address slots through the IDX_* constants rather than literal
        // positions, so adding a rule that shifts the alphabetical order
        // cannot silently point this test at a different rule.
        let and_slot = EncoderTrace::IDX_AND;
        let term_slot = EncoderTrace::IDX_TERM_SUBSTITUTIONS;

        let mut t = EncoderTrace::default();
        t.bytes_saved[and_slot] = 7;
        t.bytes_saved[term_slot] = 42;

        let counts = t.as_pairs();
        let bytes = t.bytes_saved_pairs();
        assert_eq!(counts.len(), bytes.len());
        for (i, (count, saved)) in counts.iter().zip(bytes.iter()).enumerate() {
            assert_eq!(count.0, saved.0, "name at index {i} diverges");
        }
        assert_eq!(bytes[and_slot], ("and", 7));
        assert_eq!(bytes[term_slot], ("term_substitutions", 42));

        // Default trace is all-zero on the byte axis (matches the
        // non-firing counts). This pins the "zero is signal, not
        // noise" invariant documented in EncoderTrace::bytes_saved.
        let d = EncoderTrace::default();
        assert!(d.bytes_saved_pairs().iter().all(|(_, b)| *b == 0));
    }

    /// `merge` must add `bytes_saved` slot by slot and clamp at
    /// `u64::MAX` instead of overflowing.
    #[test]
    fn bytes_saved_merge_is_saturating_sum() {
        let mut acc = EncoderTrace::default();
        acc.bytes_saved[5] = 100;

        let mut incoming = EncoderTrace::default();
        incoming.bytes_saved[5] = 50;
        // Park slot 9 at the ceiling so the next merge has to saturate.
        incoming.bytes_saved[9] = u64::MAX;

        acc.merge(incoming);
        assert_eq!(acc.bytes_saved[5], 150);
        assert_eq!(acc.bytes_saved[9], u64::MAX);

        // One more byte on top of MAX: the slot stays at MAX.
        let mut one_more = EncoderTrace::default();
        one_more.bytes_saved[9] = 1;
        acc.merge(one_more);
        assert_eq!(acc.bytes_saved[9], u64::MAX);
    }
}