pithy_core/
encoder.rs

1//! Symbolic encoder -- Rust port of `research/f2_selector_oracle.py::encode_symbolic`.
2//!
3//! Implements the `Encoder` trait. The selector currently returns
4//! `Format::Symbolic` for every input that passes a minimum-length floor;
5//! richer dispatch (JIT-progressive, fragment-prose, structured-delim) is
6//! tracked separately as F1-followup. Anything below the floor falls
7//! through as `Format::Prose`.
8//!
9//! The substitution table and filler-word set are kept in sync with the
10//! Python reference. Whenever you change one, port the change to the
11//! other so research benchmarks and production stay consistent.
12//!
13//! Performance contract (DoD §10): `compress` must run in <5ms p95.
14//! Latency budget is dominated by `Measurer::tokenize` (the regex pass
15//! itself is ~50us on a 4kB input -- see `tests::compress_meets_section_10`).
16
17use std::collections::BTreeMap;
18use std::path::Path;
19use std::sync::Arc;
20
21use blake3::Hasher;
22use once_cell::sync::Lazy;
23use regex::{Regex, RegexBuilder};
24use serde::{Deserialize, Serialize};
25
26use crate::interfaces::{Compressed, Encoder, FallbackReason, Format, Measurer, Model};
27
28/// Stable rule names emitted by `EncoderTrace::as_pairs`. Used as
29/// the canonical keys for `RuleSet::enabled` / `weights` so the
30/// closed-loop tuner attributes savings to the same identifier the
31/// encoder fires under.
32pub const RULE_NAMES: &[&str] = &[
33    "and",
34    "ansi_stripped",
35    "arrow",
36    "blank_lines",
37    "failure",
38    "filler_removed",
39    "if_prefix",
40    "json_minified",
41    "json_records_table",
42    "numeric_range_lines",
43    "repeated_chunk_dict",
44    "repeated_lines",
45    "success",
46    "term_substitutions",
47    "tool_schema_semantic_table",
48    "trailing_ws",
49    "vs",
50];
51
52/// Minimum input length (chars) below which encoder bypasses to Prose.
53///
54/// Matches the v0.1 selector behaviour: tiny inputs cost more in dialect
55/// pre/post-amble than they save in body compression.
56pub const MIN_INPUT_CHARS: usize = 32;
57
58/// Maximum input length (chars) the encoder will process. Inputs above
59/// this fall through to Prose with `OversizedInput` so a pathological
60/// caller cannot pin a worker on regex work.
61///
62/// 256 KiB chosen as a defensive ceiling: real LLM context windows top
63/// out around 1M tokens (~4 MB UTF-8); anything passed to a single
64/// encoder call above 256K is almost certainly a bug or attack.
65pub const MAX_INPUT_CHARS: usize = 256 * 1024;
66
/// (long-form, abbreviation) substitution pairs. The table is kept in
/// authoring order; `SUB_RULES` sorts it longest-first when it compiles
/// the patterns so greedy matching binds the longest term first.
69const TERM_SUBSTITUTIONS: &[(&str, &str)] = &[
70    ("post-tool authorization check", "PTA"),
71    ("post-tool authorization", "PTA"),
72    ("policy engine", "PE"),
73    ("session store", "SS"),
74    ("failure store", "FS"),
75    ("response pipeline", "RP"),
76    ("rate limiting", "RL"),
77    ("rate limiter", "RL"),
78    ("rate limit", "RL"),
79    ("authentication module", "A.mod"),
80    ("authorization module", "Z.mod"),
81    ("authentication service", "A.svc"),
82    ("authorization service", "Z.svc"),
83    ("authentication", "A"),
84    ("authorization", "Z"),
85    ("authenticate", "A"),
86    ("authorize", "Z"),
87    ("authenticated", "A'd"),
88    ("authorized", "Z'd"),
89    ("handler", "H"),
90    ("request", "R"),
91    ("response", "Rp"),
92    ("permissions", "P"),
93    ("permission", "P"),
94    ("telemetry", "T"),
95    ("validate", "V"),
96    ("validates", "V"),
97    ("validated", "V'd"),
98    ("validation", "V"),
99    ("database", "DB"),
100    ("JSON", "J"),
101    ("bearer token", "BT"),
102    ("principal", "Pr"),
103    ("resource", "Rs"),
104    ("operation", "Op"),
105    // 2026-04-24 multi-word expansion. All 26 pairs below are 1-token
106    // wins verified by `scripts/tokenize_subs.py` with leading-space
107    // tiktoken on `cl100k_base` (the Claude-compatible BPE used as
108    // our tokenizer proxy in `LocalMeasurer`). Multi-word originals
109    // are provably multi-token so the saturation problem that killed
110    // the single-word expansion in #12 does not apply here.
111    //
112    // Collision-checked against all prior short forms — none overlap.
    // Longest-first ordering is applied automatically by the `SUB_RULES` sort.
114    ("configuration file", "Cf"),
115    ("environment variable", "Env"),
116    ("integration test", "IT"),
117    ("regular expression", "RE"),
118    ("working directory", "WD"),
119    ("breaking change", "BC"),
120    ("circuit breaker", "CiB"),
121    ("pattern matching", "PM"),
122    ("race condition", "RC"),
123    ("type checking", "Typ"),
124    ("command line", "CL"),
125    ("content block", "CB"),
126    ("error message", "EM"),
127    ("feature flag", "FF"),
128    ("function call", "FC"),
129    ("kill switch", "KS"),
130    ("merge request", "MR"),
131    ("pull request", "PR"),
132    ("stack trace", "ST"),
133    ("system prompt", "SP"),
134    ("tool result", "TR"),
135    ("user prompt", "UP"),
136    ("code review", "CR"),
137    ("tool call", "TC"),
138    ("tool use", "TU"),
139    ("unit test", "UT"),
140];
141
142/// Filler words that can be safely stripped from prose without
143/// changing polarity or introducing ambiguity.
144///
145/// Three groups (kept in a single flat list for lookup speed):
146///
147/// 1. Original (v1) closed-class determiners, aux verbs, demonstratives,
148///    and relative pronouns: never carry polarity, expand via the
149///    tokenizer anyway.
150///
/// 2. Pure prepositions added 2026-04-24: every token in this group
///    lacks polarity (no "not"/"no"/"without"-style contrast) and
///    occurs with very high frequency in agent prompts. They are
///    expected to yield marginal savings, per the ablation analysis
///    of `filler_removed`'s 97% savings share.
156///
157/// 3. Hedges / intensifiers added 2026-04-24: modify degree, never
158///    polarity. Dropping "very important" to "important" loses a
159///    shade of nuance but preserves the core judgement — safe for
160///    compression, and these are some of the most-seen words in
161///    Claude's own CoT. Explicitly NOT including polarity-bearing
162///    hedges like "maybe", "perhaps", "likely", "possibly" which
163///    flip assertion to tentative.
164const FILLER_WORDS: &[&str] = &[
165    // Group 1 — v1 baseline.
166    "the", "a", "an", "of", "to", "in", "on", "at", "by", "with", "from", "is", "are", "was",
167    "were", "be", "been", "being", "that", "this", "these", "those", "it", "its", "as", "then",
168    "which", "who", "whom", "whose", "each", "any", "some", "all", "also", "such", "into", "onto",
169    // Group 2 — prepositions (2026-04-24 ablation expansion).
170    "for", "about", "around", "over", "through", "during", "within", "per", "via",
171    // Group 3 — degree-only intensifiers (2026-04-24 ablation expansion).
172    "just", "only", "very", "quite", "really", "actually", "simply",
173];
174
175/// Pre-compiled per-substitution regex set (longest term first).
176static SUB_RULES: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(|| {
177    let mut subs: Vec<(&'static str, &'static str)> = TERM_SUBSTITUTIONS.to_vec();
178    subs.sort_by_key(|(long, _)| std::cmp::Reverse(long.len()));
179    subs.into_iter()
180        .map(|(long, short)| {
181            let pat = format!(r"\b{}\b", regex::escape(long));
182            let re = RegexBuilder::new(&pat)
183                .case_insensitive(true)
184                .build()
185                .expect("static substitution pattern");
186            (re, short)
187        })
188        .collect()
189});
190
191static IF_PREFIX: Lazy<Regex> = Lazy::new(|| {
192    RegexBuilder::new(r"\bif\b\s+")
193        .case_insensitive(true)
194        .build()
195        .expect("if-prefix")
196});
197
198static SUCCESS: Lazy<Regex> = Lazy::new(|| {
199    RegexBuilder::new(r"\b(succeeds?|ok|success|grants? access|grants?)\b")
200        .case_insensitive(true)
201        .build()
202        .expect("success")
203});
204
205static FAILURE: Lazy<Regex> = Lazy::new(|| {
206    RegexBuilder::new(r"\b(fails?|failure|failed)\b")
207        .case_insensitive(true)
208        .build()
209        .expect("failure")
210});
211
212static ARROW: Lazy<Regex> = Lazy::new(|| {
213    RegexBuilder::new(r"\b(returns?|forwarded? to|forwards? to|sends? to|invokes?|invoked)\b")
214        .case_insensitive(true)
215        .build()
216        .expect("arrow")
217});
218
219static VS: Lazy<Regex> = Lazy::new(|| {
220    RegexBuilder::new(r"\b(against|versus|vs\.?)\b")
221        .case_insensitive(true)
222        .build()
223        .expect("vs")
224});
225
226static AND: Lazy<Regex> = Lazy::new(|| {
227    RegexBuilder::new(r"\b(and|plus)\b")
228        .case_insensitive(true)
229        .build()
230        .expect("and")
231});
232
233static PUNCT_GAP: Lazy<Regex> =
234    Lazy::new(|| Regex::new(r"\s+([.,;:\u{2192}\u{2713}\u{2717}])\s*").expect("punct-gap"));
235
236static MULTI_WS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").expect("multi-ws"));
237
/// Markdown / structured-content detector. Returns `true` when the
/// input carries semantics the symbolic pipeline cannot preserve:
/// paragraph breaks, fenced code, headings, list items, blockquotes,
/// or tables. Exists to protect `compress_traced_with` from the
/// unconditional `MULTI_WS` whitespace flattening that would
/// otherwise destroy those structural markers (B8).
///
/// Conservative by design: any one signal trips the gate. False
/// positives cost savings; false negatives cost structure — and the
/// latter is the contract violation the project is unwilling to
/// accept. See CONTRACT.md §V16.
///
/// Signals checked, in order:
///
/// * a blank line between two newlines, where "blank" includes lines
///   holding only whitespace (so `"a\n  \nb"` and CRLF `"a\r\n\r\nb"`
///   both count);
/// * a line whose first non-blank characters are a ``` or `~~~` fence;
/// * an ATX heading (`#` through `######` followed by a space);
/// * an unordered list item (`- `, `* `, `+ ` followed by content);
/// * an ordered list item (1–3 ASCII digits followed by `. ` or `) `);
/// * a blockquote (`> `);
/// * a table row (starts with `|` and contains at least two pipes).
pub(crate) fn has_structural_markers(s: &str) -> bool {
    // Fast path for the common case: a literally empty line.
    if s.contains("\n\n") {
        return true;
    }
    // Paragraph breaks disguised by trailing whitespace or CRLF endings:
    // any whitespace-only segment that has a newline on both sides. The
    // first segment (no newline before it) and the last one (no newline
    // after it) are excluded so leading/trailing padding does not trip
    // the gate.
    let mut after_first_newline = s.split('\n').skip(1).peekable();
    while let Some(segment) = after_first_newline.next() {
        if after_first_newline.peek().is_some() && segment.trim().is_empty() {
            return true;
        }
    }
    for line in s.lines() {
        let trimmed = line.trim_start();
        // Fenced code block. Checked on the trimmed line, like every
        // other marker here, so an indented fence is not missed.
        if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
            return true;
        }
        // ATX heading: `# ` through `###### `.
        if trimmed.starts_with('#') {
            let rest = trimmed.trim_start_matches('#');
            let hashes = trimmed.len() - rest.len();
            if (1..=6).contains(&hashes) && rest.starts_with(' ') {
                return true;
            }
        }
        // Unordered list item with some content after the bullet.
        if let Some(after) = trimmed
            .strip_prefix("- ")
            .or_else(|| trimmed.strip_prefix("* "))
            .or_else(|| trimmed.strip_prefix("+ "))
        {
            if !after.is_empty() {
                return true;
            }
        }
        // Ordered list item: `digits. ` or `digits) `. ASCII digits are
        // one byte each, so the count is also a valid slice offset.
        let digit_count = trimmed.bytes().take_while(u8::is_ascii_digit).count();
        if (1..=3).contains(&digit_count) {
            let rest = &trimmed[digit_count..];
            if rest.starts_with(". ") || rest.starts_with(") ") {
                return true;
            }
        }
        // Blockquote.
        if trimmed.starts_with("> ") {
            return true;
        }
        // Table row: pipe-delimited, at least two pipes.
        if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 {
            return true;
        }
    }
    false
}
302
/// Returns `word` with any leading and trailing `.`, `,`, `;` or `:`
/// removed. Punctuation inside the word is left untouched.
fn strip_punct(word: &str) -> &str {
    const EDGE_PUNCT: &[char] = &['.', ',', ';', ':'];
    word.trim_matches(EDGE_PUNCT)
}
306
307fn is_filler(word: &str) -> bool {
308    let stripped = strip_punct(word).to_ascii_lowercase();
309    FILLER_WORDS.iter().any(|w| *w == stripped)
310}
311
/// Per-rule fire counts emitted alongside an encode pass.
///
/// Each field counts how many *match instances* the rule produced on
/// this input. A non-firing rule reports 0; rules with no semantic
/// effect on a given input are useful exactly because they cost
/// nothing to evaluate. Use `EncoderTrace::any_fired` to gate
/// telemetry on whether the encoder did real work.
///
/// Counters saturate at `u32::MAX` rather than wrapping.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EncoderTrace {
    /// Total term-substitution match count across the whole
    /// `TERM_SUBSTITUTIONS` table.
    pub term_substitutions: u32,
    /// Number of `if ` prefix removals.
    pub if_prefix: u32,
    /// Success-glyph (`\u{2713}`) substitutions.
    pub success: u32,
    /// Failure-glyph (`\u{2717}`) substitutions.
    pub failure: u32,
    /// Arrow-glyph (`\u{2192}`) substitutions.
    pub arrow: u32,
    /// `vs` substitutions.
    pub vs: u32,
    /// `+` substitutions for `and`/`plus`.
    pub and: u32,
    /// Number of filler words removed.
    pub filler_removed: u32,
    /// Number of ANSI-escape sequences stripped from `tool_result`
    /// content blocks. Applied shim-side before the encoder runs to
    /// claw back tokens burned on terminal-color bytes the LLM never
    /// needs to see. Lossless for model consumption.
    pub ansi_stripped: u32,
    /// Number of lines where trailing `[ \t]+` was removed from
    /// `tool_result` content. Pure-waste whitespace that tokenizes
    /// into real input-billed tokens; LLMs ignore trailing spaces
    /// semantically. Lossless for model consumption.
    pub trailing_ws: u32,
    /// Number of excess blank lines collapsed in `tool_result`
    /// content. Runs of 3+ consecutive `\n` are squeezed to exactly
    /// two (one blank line preserved as a paragraph break). Preserves
    /// semantic section separation while dropping layout-only padding.
    pub blank_lines: u32,
    /// Number of JSON string payloads minified inside `tool_result`
    /// content. Lossless for valid JSON: only insignificant whitespace
    /// is removed.
    pub json_minified: u32,
    /// Number of homogeneous JSON record arrays rendered as compact
    /// TOON-like tables inside `tool_result` content. This preserves
    /// field names and scalar values but is no longer JSON syntax, so
    /// it is canary-only by default.
    pub json_records_table: u32,
    /// Number of numeric line ranges compacted in `tool_result`
    /// content. This is deterministic for lines that differ only by
    /// an incrementing number.
    pub numeric_range_lines: u32,
    /// Number of repeated chunk dictionaries emitted in `tool_result`
    /// content. Repeated log lines/sentences are represented once plus
    /// an explicit sequence, preserving reconstruction information.
    pub repeated_chunk_dict: u32,
    /// Number of consecutive repeated-line runs collapsed in
    /// `tool_result` content. Repetition count is preserved as `xN`,
    /// so log volume is compacted without hiding that repetition
    /// occurred.
    pub repeated_lines: u32,
    /// Number of simple tool-schema payloads rendered as compact
    /// semantic tables. Preserves tool name, description, required
    /// parameters, and property schemas for the supported simple schema
    /// subset; canary-only by default.
    pub tool_schema_semantic_table: u32,
    /// Per-rule byte-savings attribution, indexed identically to the
    /// pairs returned by `as_pairs` (alphabetical by rule name); use
    /// the `IDX_*` constants to address a slot.
    ///
    /// PLAN_2026-04-24.md Step 9 substrate. Each entry is the UTF-8
    /// byte count a rule removed from the payload — pre-rule body
    /// length minus post-rule body length, floored at 0. A rule that
    /// did not fire, or did not run in the pass that produced this
    /// trace, contributes 0.
    ///
    /// Default is an all-zero array (matches the default-non-firing
    /// trace). `encode_symbolic_traced_with` records the delta for the
    /// inline rules it applies when they fire. Callers that need to
    /// tell "did not fire" apart from "saved nothing" should consult
    /// the matching fire counter or `any_fired()` on the same trace.
    pub bytes_saved: [u64; 17],
}

impl EncoderTrace {
    /// Positional index of each rule in `bytes_saved` / `as_pairs`.
    /// Alphabetically ordered, so adding a new rule between existing
    /// ones shifts subsequent indices — readers MUST use these
    /// constants instead of hardcoded integers. Step-9 scaffold.
    pub const IDX_AND: usize = 0;
    pub const IDX_ANSI_STRIPPED: usize = 1;
    pub const IDX_ARROW: usize = 2;
    pub const IDX_BLANK_LINES: usize = 3;
    pub const IDX_FAILURE: usize = 4;
    pub const IDX_FILLER_REMOVED: usize = 5;
    pub const IDX_IF_PREFIX: usize = 6;
    pub const IDX_JSON_MINIFIED: usize = 7;
    pub const IDX_JSON_RECORDS_TABLE: usize = 8;
    pub const IDX_NUMERIC_RANGE_LINES: usize = 9;
    pub const IDX_REPEATED_CHUNK_DICT: usize = 10;
    pub const IDX_REPEATED_LINES: usize = 11;
    pub const IDX_SUCCESS: usize = 12;
    pub const IDX_TERM_SUBSTITUTIONS: usize = 13;
    pub const IDX_TOOL_SCHEMA_SEMANTIC_TABLE: usize = 14;
    pub const IDX_TRAILING_WS: usize = 15;
    pub const IDX_VS: usize = 16;

    /// True if any rule produced at least one substitution or removal.
    ///
    /// Checks each counter individually instead of summing them:
    /// counters saturate at `u32::MAX`, so a sum could overflow (panic
    /// in debug builds, wrap — possibly to 0 — in release builds).
    #[must_use]
    pub fn any_fired(&self) -> bool {
        self.as_pairs().iter().any(|&(_, count)| count > 0)
    }

    /// Render byte-savings attribution as a stable, sorted list of
    /// `(rule_name, bytes_saved)` pairs. Positional order matches
    /// `as_pairs`; non-firing rules contribute 0. Added with Step 9.
    ///
    /// Pairs with `as_pairs` for post-hoc analysis: counts tell you
    /// HOW OFTEN each rule fired, bytes_saved tells you HOW MANY BYTES
    /// each rule removed. Combined they let the bandit attribute
    /// savings to specific rules without re-running the encoder.
    #[must_use]
    pub fn bytes_saved_pairs(&self) -> [(&'static str, u64); 17] {
        let counts = self.as_pairs();
        let mut out = [("", 0u64); 17];
        // All three arrays share the same length and ordering, so a
        // zip keeps names and byte totals aligned without indexing.
        for ((slot, &(name, _)), &saved) in out
            .iter_mut()
            .zip(counts.iter())
            .zip(self.bytes_saved.iter())
        {
            *slot = (name, saved);
        }
        out
    }

    /// Render as a stable, sorted list of `(rule_name, count)` pairs.
    /// Used by `pithy-cli analyze` and the audit ledger so a
    /// future rule rename does not break replay.
    ///
    /// The order is alphabetical by rule name and matches the `IDX_*`
    /// constants.
    #[must_use]
    pub fn as_pairs(&self) -> [(&'static str, u32); 17] {
        [
            ("and", self.and),
            ("ansi_stripped", self.ansi_stripped),
            ("arrow", self.arrow),
            ("blank_lines", self.blank_lines),
            ("failure", self.failure),
            ("filler_removed", self.filler_removed),
            ("if_prefix", self.if_prefix),
            ("json_minified", self.json_minified),
            ("json_records_table", self.json_records_table),
            ("numeric_range_lines", self.numeric_range_lines),
            ("repeated_chunk_dict", self.repeated_chunk_dict),
            ("repeated_lines", self.repeated_lines),
            ("success", self.success),
            ("term_substitutions", self.term_substitutions),
            (
                "tool_schema_semantic_table",
                self.tool_schema_semantic_table,
            ),
            ("trailing_ws", self.trailing_ws),
            ("vs", self.vs),
        ]
    }

    /// Sum-merge fire counts and byte savings from `other` into
    /// `self`. Used by the structural encoder to aggregate traces
    /// across compressed paragraph segments. Every addition saturates
    /// instead of overflowing.
    pub fn merge(&mut self, other: EncoderTrace) {
        self.and = self.and.saturating_add(other.and);
        self.ansi_stripped = self.ansi_stripped.saturating_add(other.ansi_stripped);
        self.arrow = self.arrow.saturating_add(other.arrow);
        self.blank_lines = self.blank_lines.saturating_add(other.blank_lines);
        self.failure = self.failure.saturating_add(other.failure);
        self.filler_removed = self.filler_removed.saturating_add(other.filler_removed);
        self.if_prefix = self.if_prefix.saturating_add(other.if_prefix);
        self.json_minified = self.json_minified.saturating_add(other.json_minified);
        self.json_records_table = self
            .json_records_table
            .saturating_add(other.json_records_table);
        self.numeric_range_lines = self
            .numeric_range_lines
            .saturating_add(other.numeric_range_lines);
        self.repeated_chunk_dict = self
            .repeated_chunk_dict
            .saturating_add(other.repeated_chunk_dict);
        self.repeated_lines = self.repeated_lines.saturating_add(other.repeated_lines);
        self.success = self.success.saturating_add(other.success);
        self.term_substitutions = self
            .term_substitutions
            .saturating_add(other.term_substitutions);
        self.tool_schema_semantic_table = self
            .tool_schema_semantic_table
            .saturating_add(other.tool_schema_semantic_table);
        self.trailing_ws = self.trailing_ws.saturating_add(other.trailing_ws);
        self.vs = self.vs.saturating_add(other.vs);
        // Byte savings are summed slot by slot; both arrays use the
        // same `IDX_*` layout.
        for (mine, theirs) in self.bytes_saved.iter_mut().zip(other.bytes_saved.iter()) {
            *mine = mine.saturating_add(*theirs);
        }
    }
}
531
532/// Soft-gate threshold below which a rule's sampled weight flips it
533/// off. Originally 0.5 (midpoint of the Beta prior), revised to 0.05
534/// on 2026-04-24 after a corpus-distribution simulation over 126
535/// audit events showed 0 of 15 observed rules could cross 0.5 under
536/// any attribution scheme — the bounded reward signal (p90=0.15,
537/// median=0.01 on this corpus) makes a 0.5 threshold semantically
538/// "rule must save 50% of bytes by itself to stay enabled", which no
539/// individual rule can achieve. 0.05 discriminates productive rules
540/// (mean ≥ 0.09) from genuinely-dead rules (Beta converges to ≈ 0).
541/// Must stay strictly greater than
542/// `pithy_controller::bandit::ZERO_FIRE_MAX_WEIGHT` so the zero-fire
543/// clamp still disables rules that have never observed evidence.
544pub const ENABLE_WEIGHT_THRESHOLD: f32 = 0.05;
545
/// Hot-reloadable rule configuration. Carries an enable flag and a
/// soft weight in `[0.0, 1.0]` for each rule named in `RULE_NAMES`.
/// `weights` is stored for the closed-loop bandit / experiment
/// generator; the encoder itself (via `RuleSet::is_enabled`) treats a
/// rule as off when `enabled` maps it to `false` OR when its weight is
/// below `ENABLE_WEIGHT_THRESHOLD`.
///
/// Missing keys default to enabled / weight 1.0, so a sparse TOML
/// file can express only the deltas from an all-enabled ruleset.
///
/// Note that the derived `Default` yields empty maps — i.e. every
/// rule enabled — which is NOT the same as `RuleSet::default_v1`,
/// where several rules are explicitly switched off.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuleSet {
    /// Per-rule on/off override. Keys come from `RULE_NAMES`.
    /// Anything not listed is enabled.
    #[serde(default)]
    pub enabled: BTreeMap<String, bool>,
    /// Per-rule weight in `[0.0, 1.0]`. Used by the bandit; the
    /// encoder also consults it: a stored weight below
    /// `ENABLE_WEIGHT_THRESHOLD` disables the rule even when
    /// `enabled` says `true`. Rules with no entry read as 1.0.
    #[serde(default)]
    pub weights: BTreeMap<String, f32>,
    /// Optional version tag, surfaced by the dashboard / statusline
    /// so an operator can tell at a glance which ruleset is live.
    #[serde(default)]
    pub version: Option<String>,
}
571
572impl RuleSet {
573    /// The v1 default ruleset. Stable inline rules and lossless
574    /// tool-output cleanup are enabled; canary-only semantic rewrites
575    /// stay disabled. This remains identical in observable behaviour
576    /// to the constants-only encoder for prompt text.
577    ///
578    /// `success` and `failure` are disabled by default as of
579    /// 2026-04-24: a per-rule ablation study (commit d431b90,
580    /// `scripts/ablate_rules.py`) showed they fire 11 and 28 times
581    /// respectively on the real audit corpus but produce -13 and
582    /// -42 tokens of "savings" — the unicode glyph substitutions
583    /// (✓, ✗) tokenize to *more* tokens than the ASCII originals
584    /// on the Claude tokenizers. Shipping them enabled actively
585    /// harmed compression. Consumers that want the legacy
586    /// behaviour can still opt in explicitly, and the
587    /// `safe_canary_v1` ruleset (introduced earlier for the same
588    /// reason) remains available as a typed checkpoint.
589    #[must_use]
590    pub fn default_v1() -> Self {
591        let mut enabled = BTreeMap::new();
592        let mut weights = BTreeMap::new();
593        for name in RULE_NAMES {
594            let default_on = !matches!(
595                *name,
596                "json_records_table"
597                    | "numeric_range_lines"
598                    | "repeated_chunk_dict"
599                    | "tool_schema_semantic_table"
600                    | "success"
601                    | "failure"
602            );
603            enabled.insert((*name).to_owned(), default_on);
604            weights.insert((*name).to_owned(), if default_on { 1.0 } else { 0.0 });
605        }
606        Self {
607            enabled,
608            weights,
609            version: Some("v1".to_owned()),
610        }
611    }
612
613    /// Lower-risk canary ruleset from the autoresearch loop: disable the
614    /// success/failure glyph substitutions while keeping lossless tool-output
615    /// cleanup enabled. This preserves English polarity words, which reduces
616    /// semantic risk on prompts where `success`/`failure` are domain terms.
617    #[must_use]
618    pub fn safe_canary_v1() -> Self {
619        let mut rs = Self::default_v1();
620        rs.enabled.insert("success".to_owned(), false);
621        rs.enabled.insert("failure".to_owned(), false);
622        rs.weights.insert("success".to_owned(), 0.0);
623        rs.weights.insert("failure".to_owned(), 0.0);
624        rs.version = Some("safe-canary-v1-no-success-failure".to_owned());
625        rs
626    }
627
628    /// Agentic canary ruleset from the quality-first autoresearch loop.
629    /// Enables deterministic tool-output codecs that change surface syntax
630    /// but preserve reconstruction or explicit semantic fields for their
631    /// supported structures.
632    #[must_use]
633    pub fn agentic_canary_v2() -> Self {
634        let mut rs = Self::safe_canary_v1();
635        for name in [
636            "json_records_table",
637            "numeric_range_lines",
638            "repeated_chunk_dict",
639            "tool_schema_semantic_table",
640        ] {
641            rs.enabled.insert(name.to_owned(), true);
642            rs.weights.insert(name.to_owned(), 1.0);
643        }
644        rs.version = Some("agentic-canary-v2-quality-ready-codecs".to_owned());
645        rs
646    }
647
648    /// True if the named rule is enabled. Missing entries default
649    /// to enabled; a weight below [`ENABLE_WEIGHT_THRESHOLD`] also
650    /// flips the rule off so a bandit can soft-demote without
651    /// explicit toggling.
652    #[must_use]
653    pub fn is_enabled(&self, rule: &str) -> bool {
654        if let Some(flag) = self.enabled.get(rule) {
655            if !*flag {
656                return false;
657            }
658        }
659        !matches!(self.weights.get(rule), Some(w) if *w < ENABLE_WEIGHT_THRESHOLD)
660    }
661
662    /// Read effective weight for a rule. Missing entries return 1.0.
663    #[must_use]
664    pub fn weight(&self, rule: &str) -> f32 {
665        self.weights.get(rule).copied().unwrap_or(1.0)
666    }
667
668    /// Parse a TOML document into a `RuleSet`. Any unrecognised key
669    /// is silently dropped (forward compatibility), and any rule
670    /// name not in `RULE_NAMES` is also dropped (defensive against
671    /// a corrupted ruleset file).
672    ///
673    /// # Errors
674    /// Returns the underlying TOML parse error.
675    pub fn from_toml_str(s: &str) -> Result<Self, toml::de::Error> {
676        let mut rs: Self = toml::from_str(s)?;
677        rs.enabled.retain(|k, _| RULE_NAMES.contains(&k.as_str()));
678        rs.weights.retain(|k, _| RULE_NAMES.contains(&k.as_str()));
679        Ok(rs)
680    }
681
682    /// Load a `RuleSet` from a TOML file on disk.
683    ///
684    /// # Errors
685    /// I/O or parse errors.
686    pub fn from_toml_file(path: &Path) -> anyhow::Result<Self> {
687        let s = std::fs::read_to_string(path)?;
688        Self::from_toml_str(&s).map_err(|e| anyhow::anyhow!("ruleset parse: {e}"))
689    }
690
691    /// Render this ruleset as a TOML document. Used by the controller
692    /// to write a tuned ruleset back to disk before signalling reload.
693    ///
694    /// # Errors
695    /// Serialization errors from `toml::to_string_pretty`.
696    pub fn to_toml_string(&self) -> Result<String, toml::ser::Error> {
697        toml::to_string_pretty(self)
698    }
699}
700
701/// Run the symbolic-encoding pipeline. Pure function; deterministic.
702#[must_use]
703pub fn encode_symbolic(text: &str) -> String {
704    encode_symbolic_traced(text).0
705}
706
707/// Run the symbolic-encoding pipeline AND return a per-rule firing trace.
708///
709/// Pure function; deterministic. Equivalent to
710/// [`encode_symbolic_traced_with`] with the v1 default ruleset
711/// (every rule enabled). Kept as a stable convenience wrapper for
712/// callers that pre-date hot-reloadable rules.
713#[must_use]
714pub fn encode_symbolic_traced(text: &str) -> (String, EncoderTrace) {
715    encode_symbolic_traced_with(text, &RuleSet::default_v1())
716}
717
718/// Run the symbolic-encoding pipeline against `text` under the
719/// supplied `RuleSet`, returning the compressed text plus a
720/// per-rule firing trace.
721///
722/// A disabled rule contributes 0 to the trace and produces no
723/// substitutions in the output. The pipeline order is fixed (the
724/// substitutions are applied longest-first, then the if/success/
725/// failure/arrow/vs/and rules in declaration order, then filler
726/// removal, then whitespace normalisation) so disabling a rule
727/// never changes the order in which the remaining rules fire.
728#[must_use]
729pub fn encode_symbolic_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
730    let mut trace = EncoderTrace::default();
731    let mut t: String = text.to_owned();
732    // Step-9 per-rule byte-delta attribution. Each rule snapshots
733    // `t.len()` before the transform and records the UTF-8 byte delta
734    // into `trace.bytes_saved[IDX_X]` when it fires. Matches the
735    // shim-side pattern shipped for the tool-result pipeline in
736    // commits 0e44d8f / 342741b. Closes the 17-rule roster.
737    if rs.is_enabled("term_substitutions") {
738        let before = t.len() as u64;
739        let mut fired = false;
740        for (re, short) in SUB_RULES.iter() {
741            let n = u32::try_from(re.find_iter(&t).count()).unwrap_or(u32::MAX);
742            if n > 0 {
743                trace.term_substitutions = trace.term_substitutions.saturating_add(n);
744                t = re.replace_all(&t, *short).into_owned();
745                fired = true;
746            }
747        }
748        if fired {
749            trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] =
750                before.saturating_sub(t.len() as u64);
751        }
752    }
753    if rs.is_enabled("if_prefix") {
754        let n = u32::try_from(IF_PREFIX.find_iter(&t).count()).unwrap_or(u32::MAX);
755        trace.if_prefix = n;
756        if n > 0 {
757            let before = t.len() as u64;
758            t = IF_PREFIX.replace_all(&t, "").into_owned();
759            trace.bytes_saved[EncoderTrace::IDX_IF_PREFIX] = before.saturating_sub(t.len() as u64);
760        }
761    }
762    if rs.is_enabled("success") {
763        let n = u32::try_from(SUCCESS.find_iter(&t).count()).unwrap_or(u32::MAX);
764        trace.success = n;
765        if n > 0 {
766            let before = t.len() as u64;
767            t = SUCCESS.replace_all(&t, "\u{2713}").into_owned();
768            trace.bytes_saved[EncoderTrace::IDX_SUCCESS] = before.saturating_sub(t.len() as u64);
769        }
770    }
771    if rs.is_enabled("failure") {
772        let n = u32::try_from(FAILURE.find_iter(&t).count()).unwrap_or(u32::MAX);
773        trace.failure = n;
774        if n > 0 {
775            let before = t.len() as u64;
776            t = FAILURE.replace_all(&t, "\u{2717}").into_owned();
777            trace.bytes_saved[EncoderTrace::IDX_FAILURE] = before.saturating_sub(t.len() as u64);
778        }
779    }
780    if rs.is_enabled("arrow") {
781        let n = u32::try_from(ARROW.find_iter(&t).count()).unwrap_or(u32::MAX);
782        trace.arrow = n;
783        if n > 0 {
784            let before = t.len() as u64;
785            t = ARROW.replace_all(&t, "\u{2192}").into_owned();
786            trace.bytes_saved[EncoderTrace::IDX_ARROW] = before.saturating_sub(t.len() as u64);
787        }
788    }
789    if rs.is_enabled("vs") {
790        let n = u32::try_from(VS.find_iter(&t).count()).unwrap_or(u32::MAX);
791        trace.vs = n;
792        if n > 0 {
793            let before = t.len() as u64;
794            t = VS.replace_all(&t, "vs").into_owned();
795            trace.bytes_saved[EncoderTrace::IDX_VS] = before.saturating_sub(t.len() as u64);
796        }
797    }
798    if rs.is_enabled("and") {
799        let n = u32::try_from(AND.find_iter(&t).count()).unwrap_or(u32::MAX);
800        trace.and = n;
801        if n > 0 {
802            let before = t.len() as u64;
803            t = AND.replace_all(&t, "+").into_owned();
804            trace.bytes_saved[EncoderTrace::IDX_AND] = before.saturating_sub(t.len() as u64);
805        }
806    }
807    if rs.is_enabled("filler_removed") {
808        let before = t.len() as u64;
809        let words_before = t.split_whitespace().count();
810        let kept: Vec<&str> = t.split_whitespace().filter(|w| !is_filler(w)).collect();
811        let removed = u32::try_from(words_before.saturating_sub(kept.len())).unwrap_or(u32::MAX);
812        trace.filler_removed = removed;
813        t = kept.join(" ");
814        if removed > 0 {
815            trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED] =
816                before.saturating_sub(t.len() as u64);
817        }
818    }
819    t = PUNCT_GAP.replace_all(&t, "$1 ").into_owned();
820    t = MULTI_WS.replace_all(&t, " ").into_owned();
821    (t.trim().to_owned(), trace)
822}
823
824/// Compress a single inline prose fragment (e.g. a heading body or list
825/// item body). Runs the same rule pipeline as
826/// [`encode_symbolic_traced_with`] but skips the line-flattening
827/// `MULTI_WS` step because the caller already knows the fragment is a
828/// single line. Trace counts from the inline pass are merged into
829/// `trace_accum` so the caller can report per-rule fires across all
830/// segments.
831fn compress_inline(body: &str, rs: &RuleSet, trace_accum: &mut EncoderTrace) -> String {
832    let (out, trace) = encode_symbolic_traced_with(body, rs);
833    trace_accum.merge(trace);
834    out
835}
836
/// Classification for a single markdown line, as produced by
/// `classify_line`.
enum LineKind {
    /// Empty or whitespace-only line.
    Blank,
    /// Line whose trimmed start is a ``` or ~~~ fence marker.
    Fence,
    /// ATX heading: `prefix` is the indent, the `#` run and one space;
    /// `body` is the heading text that follows.
    Heading { prefix: String, body: String },
    /// Bullet or ordered list item: `prefix` is the indent plus the
    /// marker (including its trailing space); `body` is the item text.
    ListItem { prefix: String, body: String },
    /// Line starting with `> ` after optional indent; `body` is the
    /// text after that marker.
    Blockquote { body: String },
    /// Line starting with `|` that contains at least two `|` characters.
    Table,
    /// Anything else.
    Prose,
}
847
848fn classify_line(line: &str) -> LineKind {
849    if line.trim().is_empty() {
850        return LineKind::Blank;
851    }
852    let trimmed = line.trim_start();
853    if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
854        return LineKind::Fence;
855    }
856    if trimmed.starts_with('#') {
857        let rest = trimmed.trim_start_matches('#');
858        let hashes = trimmed.len() - rest.len();
859        if (1..=6).contains(&hashes) && rest.starts_with(' ') {
860            let indent = &line[..line.len() - trimmed.len()];
861            let prefix = format!("{}{} ", indent, "#".repeat(hashes));
862            let body = rest.trim_start().to_owned();
863            return LineKind::Heading { prefix, body };
864        }
865    }
866    for marker in ["- ", "* ", "+ "] {
867        if let Some(body) = trimmed.strip_prefix(marker) {
868            if !body.is_empty() {
869                let indent = &line[..line.len() - trimmed.len()];
870                let prefix = format!("{}{}", indent, marker);
871                return LineKind::ListItem {
872                    prefix,
873                    body: body.to_owned(),
874                };
875            }
876        }
877    }
878    // Ordered list: `\d{1,3}\. ` or `\d{1,3}\) `.
879    let digits: String = trimmed.chars().take_while(char::is_ascii_digit).collect();
880    if !digits.is_empty() && digits.len() <= 3 {
881        let after_digits = &trimmed[digits.len()..];
882        for sep in [". ", ") "] {
883            if let Some(body) = after_digits.strip_prefix(sep) {
884                if !body.is_empty() {
885                    let indent = &line[..line.len() - trimmed.len()];
886                    let prefix = format!("{}{}{}", indent, digits, sep);
887                    return LineKind::ListItem {
888                        prefix,
889                        body: body.to_owned(),
890                    };
891                }
892            }
893        }
894    }
895    if let Some(body) = trimmed.strip_prefix("> ") {
896        return LineKind::Blockquote {
897            body: body.to_owned(),
898        };
899    }
900    if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 {
901        return LineKind::Table;
902    }
903    LineKind::Prose
904}
905
906/// Segment-aware symbolic compression (Phase B of B8 fix).
907///
908/// Splits markdown input into block-level segments, compresses prose
909/// content within each segment via [`encode_symbolic_traced_with`], and
910/// rejoins with structural markers preserved. Compared with the flat
911/// encoder, this path retains paragraph breaks, headings, list-item
912/// markers, blockquotes, and fenced-code blocks while still applying
913/// the rule set to the words inside each segment.
914///
915/// Verbatim-preserved blocks: fenced code (entire fence content),
916/// table rows, blank lines.
917///
918/// Structure-preserving compressed blocks: heading prefixes kept,
919/// list-item markers kept, blockquote prefix kept; the body text after
920/// each marker is compressed inline.
921///
922/// Paragraph handling: consecutive plain-prose lines are joined with a
923/// single space into one paragraph, compressed as one unit, and emitted
924/// on a single line. Markdown renders soft-wrapped paragraphs the same
925/// as single-line paragraphs, so this is lossless at the rendered-
926/// document boundary.
927pub fn encode_symbolic_structural_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
928    let mut out = String::with_capacity(text.len());
929    let mut trace = EncoderTrace::default();
930    let mut prose_buf = String::new();
931    let mut in_fence = false;
932    // T14: track whether we're currently inside an indented-code
933    // block. Markdown (CommonMark §4.4) defines these as a run of
934    // one-or-more lines that begin with at least 4 spaces (or 1
935    // tab), opened by a blank line or the start of the document, and
936    // closed by a line that is NOT 4-space/tab indented AND is not
937    // blank. Lines inside the block are emitted verbatim; the prose
938    // pipeline never sees them.
939    let mut in_indented_code = false;
940    let mut prev_line_blank = true; // start-of-doc counts as blank
941
942    let flush = |prose_buf: &mut String, out: &mut String, trace: &mut EncoderTrace| {
943        if prose_buf.is_empty() {
944            return;
945        }
946        let compressed = compress_inline(prose_buf, rs, trace);
947        out.push_str(&compressed);
948        out.push('\n');
949        prose_buf.clear();
950    };
951
952    for line in text.split('\n') {
953        if in_fence {
954            // Inside a fence: emit verbatim; flip on closing fence.
955            out.push_str(line);
956            out.push('\n');
957            let trimmed = line.trim_start();
958            if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
959                in_fence = false;
960            }
961            prev_line_blank = false;
962            continue;
963        }
964
965        // T14 indented-code detection. The spec-correct minimum is
966        // 4 spaces OR 1 tab. We recognise both. Staying inside the
967        // block on later non-blank indented lines means only the
968        // opening needs the blank-line precondition; continuation
969        // lines can have any prose content as long as the indent is
970        // still there.
971        let is_indented = line.starts_with("    ") || line.starts_with('\t');
972        let is_blank = line.trim().is_empty();
973        if in_indented_code {
974            if is_indented || is_blank {
975                // still inside the block: emit verbatim
976                out.push_str(line);
977                out.push('\n');
978                prev_line_blank = is_blank;
979                continue;
980            }
981            // non-indented, non-blank line closes the block
982            in_indented_code = false;
983        } else if prev_line_blank && is_indented && !is_blank {
984            // opening of an indented-code block
985            flush(&mut prose_buf, &mut out, &mut trace);
986            in_indented_code = true;
987            out.push_str(line);
988            out.push('\n');
989            prev_line_blank = false;
990            continue;
991        }
992
993        // prev_line_blank for the next iteration is set after the
994        // match arms below, from `is_blank` — see the trailing
995        // assignment after `LineKind::Prose`.
996        match classify_line(line) {
997            LineKind::Fence => {
998                flush(&mut prose_buf, &mut out, &mut trace);
999                out.push_str(line);
1000                out.push('\n');
1001                in_fence = true;
1002            }
1003            LineKind::Blank => {
1004                flush(&mut prose_buf, &mut out, &mut trace);
1005                out.push('\n');
1006            }
1007            LineKind::Heading { prefix, body } => {
1008                flush(&mut prose_buf, &mut out, &mut trace);
1009                let body_c = compress_inline(&body, rs, &mut trace);
1010                out.push_str(&prefix);
1011                out.push_str(&body_c);
1012                out.push('\n');
1013            }
1014            LineKind::ListItem { prefix, body } => {
1015                flush(&mut prose_buf, &mut out, &mut trace);
1016                let body_c = compress_inline(&body, rs, &mut trace);
1017                out.push_str(&prefix);
1018                out.push_str(&body_c);
1019                out.push('\n');
1020            }
1021            LineKind::Blockquote { body } => {
1022                flush(&mut prose_buf, &mut out, &mut trace);
1023                let body_c = compress_inline(&body, rs, &mut trace);
1024                out.push_str("> ");
1025                out.push_str(&body_c);
1026                out.push('\n');
1027            }
1028            LineKind::Table => {
1029                flush(&mut prose_buf, &mut out, &mut trace);
1030                out.push_str(line);
1031                out.push('\n');
1032            }
1033            LineKind::Prose => {
1034                if !prose_buf.is_empty() {
1035                    prose_buf.push(' ');
1036                }
1037                prose_buf.push_str(line);
1038            }
1039        }
1040        // T14: update the blank-line bookkeeping for the next
1041        // iteration. We use the raw input line's emptiness, not the
1042        // classification — a LineKind::Blank is always is_blank, but
1043        // we need the raw truth for indented-code detection.
1044        prev_line_blank = is_blank;
1045    }
1046    flush(&mut prose_buf, &mut out, &mut trace);
1047
1048    // Mirror `text`'s trailing-newline state so a non-terminated input
1049    // does not acquire an extra '\n' from the emitter.
1050    if !text.ends_with('\n') && out.ends_with('\n') {
1051        out.pop();
1052    }
1053    (out, trace)
1054}
1055
1056/// Production encoder backed by a `Measurer` for token accounting.
1057pub struct SymbolicEncoder {
1058    measurer: Arc<dyn Measurer>,
1059}
1060
1061impl SymbolicEncoder {
1062    /// Wrap a measurer (typically `LocalMeasurer::with_defaults()`).
1063    #[must_use]
1064    pub fn new(measurer: Arc<dyn Measurer>) -> Self {
1065        Self { measurer }
1066    }
1067
1068    fn hash(text: &str) -> String {
1069        let mut h = Hasher::new();
1070        h.update(text.as_bytes());
1071        h.finalize().to_hex().to_string()
1072    }
1073
1074    fn count_or_zero(&self, text: &str, model: &Model) -> u32 {
1075        self.measurer.tokenize(text, model).unwrap_or(0)
1076    }
1077
1078    fn build(
1079        &self,
1080        original: &str,
1081        compressed: &str,
1082        format: Format,
1083        model: Model,
1084        fallback: Option<FallbackReason>,
1085    ) -> Compressed {
1086        let baseline = self.count_or_zero(original, &model);
1087        let encoded = self.count_or_zero(compressed, &model);
1088        Compressed {
1089            content: compressed.to_owned(),
1090            format,
1091            baseline_tokens: baseline,
1092            compressed_tokens: encoded,
1093            model,
1094            content_hash: Self::hash(original),
1095            fallback,
1096        }
1097    }
1098}
1099
1100impl SymbolicEncoder {
1101    /// Compress + return the per-rule firing trace alongside the
1102    /// `Compressed` output. The proxy / MCP server / Python SDK feed
1103    /// `EncoderTrace` into the audit ledger so post-hoc analysis can
1104    /// answer questions like "which rules pulled their weight on
1105    /// production traffic in 2026-W17?".
1106    ///
1107    /// On Prose fallback the trace is empty (`EncoderTrace::default`)
1108    /// because the pipeline never ran.
1109    #[must_use]
1110    pub fn compress_traced(&self, input: &str, model: Model) -> (Compressed, EncoderTrace) {
1111        self.compress_traced_with(input, model, &RuleSet::default_v1())
1112    }
1113
1114    /// Same as [`compress_traced`] but consults the supplied
1115    /// [`RuleSet`] instead of the v1 defaults. Intended for the
1116    /// hot-reload path: the proxy reads the current ruleset under a
1117    /// read lock and passes a `&RuleSet` per request.
1118    #[must_use]
1119    pub fn compress_traced_with(
1120        &self,
1121        input: &str,
1122        model: Model,
1123        rs: &RuleSet,
1124    ) -> (Compressed, EncoderTrace) {
1125        if !self.measurer.supported(&model) {
1126            return (
1127                self.build(
1128                    input,
1129                    input,
1130                    Format::Prose,
1131                    model,
1132                    Some(FallbackReason::TokenizerMissing),
1133                ),
1134                EncoderTrace::default(),
1135            );
1136        }
1137        let chars = input.chars().count();
1138        if chars < MIN_INPUT_CHARS {
1139            return (
1140                self.build(
1141                    input,
1142                    input,
1143                    Format::Prose,
1144                    model,
1145                    Some(FallbackReason::Uncompressible),
1146                ),
1147                EncoderTrace::default(),
1148            );
1149        }
1150        if chars > MAX_INPUT_CHARS {
1151            return (
1152                self.build(
1153                    input,
1154                    input,
1155                    Format::Prose,
1156                    model,
1157                    Some(FallbackReason::OversizedInput),
1158                ),
1159                EncoderTrace::default(),
1160            );
1161        }
1162        // B8: for markdown-structured inputs, route through the
1163        // segment-aware encoder so paragraph breaks, headings,
1164        // lists, fences, blockquotes and tables survive; only the
1165        // body text inside each segment goes through the flat
1166        // pipeline that ends in MULTI_WS. Plain prose keeps the
1167        // original single-pass flat encoder.
1168        let (encoded, trace) = if has_structural_markers(input) {
1169            encode_symbolic_structural_traced_with(input, rs)
1170        } else {
1171            encode_symbolic_traced_with(input, rs)
1172        };
1173        let baseline = self.count_or_zero(input, &model);
1174        let candidate = self.count_or_zero(&encoded, &model);
1175        if candidate >= baseline {
1176            return (
1177                self.build(
1178                    input,
1179                    input,
1180                    Format::Prose,
1181                    model,
1182                    Some(FallbackReason::Uncompressible),
1183                ),
1184                EncoderTrace::default(),
1185            );
1186        }
1187        (
1188            self.build(input, &encoded, Format::Symbolic, model, None),
1189            trace,
1190        )
1191    }
1192}
1193
1194impl Encoder for SymbolicEncoder {
1195    fn compress(&self, input: &str, model: Model) -> Compressed {
1196        self.compress_traced(input, model).0
1197    }
1198
1199    fn select_format(&self, input: &str, model: Model) -> Format {
1200        if !self.measurer.supported(&model) || input.chars().count() < MIN_INPUT_CHARS {
1201            return Format::Prose;
1202        }
1203        let encoded = encode_symbolic(input);
1204        if self.count_or_zero(&encoded, &model) >= self.count_or_zero(input, &model) {
1205            Format::Prose
1206        } else {
1207            Format::Symbolic
1208        }
1209    }
1210
1211    fn fallback(&self, input: &str, model: Model, reason: FallbackReason) -> Compressed {
1212        self.build(input, input, Format::Prose, model, Some(reason))
1213    }
1214}
1215
1216#[cfg(test)]
1217mod tests {
1218    use super::*;
1219    use crate::tokenizers::LocalMeasurer;
1220
1221    fn enc() -> SymbolicEncoder {
1222        let m = LocalMeasurer::with_defaults().expect("measurer");
1223        SymbolicEncoder::new(Arc::new(m))
1224    }
1225
// B8 regression: the flat Symbolic pipeline collapses all whitespace
// into single spaces via MULTI_WS. If it ever ran over a
// markdown-structured input, all headings/lists/code fences/
// paragraph breaks would be destroyed. These tests pin the
// structural-content gate (`has_structural_markers`) that routes
// such inputs to the segment-aware structural encoder instead.
1232
#[test]
fn structural_gate_detects_paragraph_breaks() {
    // A blank line between two words is the minimal paragraph break.
    let two_paragraphs = "foo\n\nbar";
    assert!(has_structural_markers(two_paragraphs));
}
1237
#[test]
fn structural_gate_detects_headings() {
    // Heading on the first line, and heading in the middle of the text.
    let leading_heading = "# Title\ncontent follows";
    let inner_heading = "content\n## Subheading\nmore";
    assert!(has_structural_markers(leading_heading));
    assert!(has_structural_markers(inner_heading));
}
1243
#[test]
fn structural_gate_detects_lists() {
    // Both bullet and ordered lists count as structure.
    let bullet_list = "intro\n- item one\n- item two";
    let ordered_list = "intro\n1. first\n2. second";
    assert!(has_structural_markers(bullet_list));
    assert!(has_structural_markers(ordered_list));
}
1249
#[test]
fn structural_gate_detects_fenced_code() {
    // A bare fence after prose, and a fence with an info string.
    let bare_fence = "prose\n```\ncode\n```";
    let tagged_fence = "```rust\nfn main() {}\n```";
    assert!(has_structural_markers(bare_fence));
    assert!(has_structural_markers(tagged_fence));
}
1255
#[test]
fn structural_gate_detects_tables_and_blockquotes() {
    let table = "col\n| a | b |\n|---|---|\n| 1 | 2 |";
    let blockquote = "context\n> quoted line\nafter";
    assert!(has_structural_markers(table));
    assert!(has_structural_markers(blockquote));
}
1263
#[test]
fn structural_gate_lets_flat_prose_through() {
    // Single-line prose carries no markdown structure.
    let flat = "one sentence. another sentence. a third. no line breaks here.";
    assert!(!has_structural_markers(flat));
}
1270
#[test]
fn markdown_input_preserves_structure_through_compression() {
    // B8 Phase B: markdown input goes through the structural encoder.
    // Whether the result is Symbolic (compression won) or Prose (it
    // could not beat the baseline), paragraph breaks, the heading
    // prefix and list-item markers must all still be present in the
    // returned content. Body text inside each segment may be rewritten.
    let md = "# Heading\n\nFirst paragraph with enough body to clear the thirty-two-char floor.\n\n- list item one\n- list item two\n\nSecond paragraph follows here.";
    let (compressed, _) = enc().compress_traced(md, Model::ClaudeSonnet47);
    let content = compressed.content.as_str();

    let has_paragraph_break = content.contains("\n\n");
    assert!(
        has_paragraph_break,
        "expected paragraph break preserved, got: {content:?}"
    );

    let keeps_heading_prefix = content.starts_with("# ");
    assert!(
        keeps_heading_prefix,
        "expected heading prefix preserved, got: {content:?}"
    );

    // At least one list-item marker must survive.
    let keeps_list_marker = content.contains("\n- ");
    assert!(
        keeps_list_marker,
        "expected list-item marker preserved, got: {content:?}"
    );

    // The newline count must not collapse.
    let newlines = content.bytes().filter(|&b| b == b'\n').count();
    assert!(
        newlines >= 4,
        "expected >=4 newlines (paragraph + 2 list + blank), got {newlines} in {content:?}"
    );
}
1306
#[test]
fn structural_encoder_preserves_fenced_code_verbatim() {
    use crate::RuleSet;
    // The fenced block must come out exactly as it went in.
    let md =
        "Intro paragraph.\n\n```rust\nfn main() {\n    println!(\"x\");\n}\n```\n\nEpilogue.";
    let rules = RuleSet::default_v1();
    let (out, _) = encode_symbolic_structural_traced_with(md, &rules);
    let fenced_block = "```rust\nfn main() {\n    println!(\"x\");\n}\n```";
    assert!(
        out.contains(fenced_block),
        "fenced code must be preserved byte-for-byte, got: {out:?}"
    );
}
1318
1319    // T14 regression tests: indented-code blocks (CommonMark §4.4)
1320    // should be preserved verbatim through the structural encoder.
1321
#[test]
fn structural_encoder_preserves_four_space_indented_code() {
    use crate::RuleSet;
    let md = "intro paragraph.\n\n    fn check(token: &Token) -> bool {\n        token.expires_at <= Utc::now()\n    }\n\nepilogue paragraph.";
    let rules = RuleSet::default_v1();
    let (out, _) = encode_symbolic_structural_traced_with(md, &rules);

    // Each line of the indented block must appear unchanged.
    let opening_line = "    fn check(token: &Token) -> bool {";
    let continuation_line = "        token.expires_at <= Utc::now()";
    let closing_line = "    }";
    assert!(
        out.contains(opening_line),
        "four-space indented code must be verbatim, got: {out:?}"
    );
    assert!(
        out.contains(continuation_line),
        "indented-code continuation (8 spaces) must be verbatim, got: {out:?}"
    );
    assert!(
        out.contains(closing_line),
        "closing brace line must be verbatim, got: {out:?}"
    );
}
1340
#[test]
fn structural_encoder_preserves_tab_indented_code() {
    use crate::RuleSet;
    // A single leading tab also opens an indented-code block.
    let md = "intro.\n\n\tlet x = 1;\n\tlet y = 2;\n\nafter.";
    let rules = RuleSet::default_v1();
    let (out, _) = encode_symbolic_structural_traced_with(md, &rules);
    let first_code_line = "\tlet x = 1;";
    assert!(
        out.contains(first_code_line),
        "tab-indented code must be verbatim, got: {out:?}"
    );
}
1351
#[test]
fn structural_encoder_compresses_paragraph_body() {
    use crate::RuleSet;
    // Term substitution should still fire inside paragraphs while the
    // paragraph break between them is kept.
    let md = "Title line no header.\n\nThe authentication module sends a request to the policy engine and it returns a result.";
    let rules = RuleSet::default_v1();
    let (out, _) = encode_symbolic_structural_traced_with(md, &rules);

    let keeps_break = out.contains("\n\n");
    assert!(keeps_break, "paragraph break preserved");

    // Substitution evidence: the long term must be gone from the output.
    let still_verbose = out.contains("authentication module");
    assert!(
        !still_verbose,
        "expected term_substitutions to rewrite 'authentication module', got: {out:?}"
    );
}
1365
#[test]
fn substitutes_authorization_term() {
    // Both multi-word terms in the sentence must be shortened.
    let sentence = "The user authentication module sends the request to the policy engine.";
    let out = encode_symbolic(sentence);
    let has_module_abbrev = out.contains("A.mod");
    let has_engine_abbrev = out.contains("PE");
    assert!(has_module_abbrev, "expected A.mod in {out}");
    assert!(has_engine_abbrev, "expected PE in {out}");
}
1374
#[test]
fn drops_filler_words() {
    let out = encode_symbolic("The user is in the system.");
    // 'the', 'is', 'in' are all filler; we keep 'user' and 'system'.
    let lc = out.to_lowercase();
    let words: Vec<&str> = lc.split_whitespace().collect();
    assert!(!words.contains(&"the"));
    assert!(!words.contains(&"is"));
}
1383
#[test]
fn drops_expanded_prepositions_and_intensifiers() {
    // Regression gate for the 2026-04-24 FILLER_WORDS expansion.
    // Prepositions (for, about, through, during, via, per, over,
    // around, within) and degree-only intensifiers (just, only,
    // very, quite, really, actually, simply) must all strip.
    // Content words must survive.
    let sentence =
        "The request is just really very important for the handler to actually log during the call.";
    let out = encode_symbolic(sentence);
    let lc = out.to_lowercase();
    let words: Vec<&str> = lc.split_whitespace().collect();

    for stripped in ["for", "during", "just", "really", "very", "actually"] {
        let still_present = words.iter().any(|w| *w == stripped);
        assert!(
            !still_present,
            "filler `{stripped}` must be stripped from: {out}",
        );
    }

    // 'request' and 'handler' may be rewritten by TERM_SUBSTITUTIONS,
    // so assert on content words that are *not* on the subst table.
    for kept in ["important", "log", "call"] {
        let survived = words.iter().any(|w| w.contains(kept));
        assert!(survived, "content word `{kept}` must survive: {out}");
    }
}
1412
#[test]
fn polarity_bearing_words_are_not_filler() {
    // Explicit guardrail: the filler list must never include
    // polarity-bearing words (not / never / no / nothing) nor
    // epistemic hedges (maybe / perhaps / likely / possibly),
    // because dropping them silently flips assertion strength
    // or polarity on the compressed prompt. Regression pin for
    // the 2026-04-24 expansion.
    let must_survive = [
        "not", "never", "no", "nothing", "maybe", "perhaps", "likely", "possibly",
    ];
    for forbidden in must_survive {
        let listed_as_filler = crate::encoder::FILLER_WORDS.contains(&forbidden);
        assert!(
            !listed_as_filler,
            "polarity-bearing word `{forbidden}` must NOT be in FILLER_WORDS",
        );
    }
}
1430
#[test]
fn arrow_replacement() {
    // ARROW regex matches the contiguous tokens; `invokes` is a
    // single-word match (unlike `forwards to` which needs `to` in
    // the very next token slot).
    let sentence = "The handler invokes the policy engine.";
    let out = encode_symbolic(sentence);
    let has_arrow = out.contains('\u{2192}');
    assert!(has_arrow, "missing arrow in {out}");
}
1439
#[test]
fn success_glyph_replacement_when_rule_is_explicitly_enabled() {
    // `success` is disabled in `default_v1` as of 2026-04-24
    // (ablation study: glyph tokenizes to more tokens than the
    // ASCII word). The underlying rule still works when a caller
    // opts in, so pin that capability with an explicit ruleset.
    let rs = {
        let mut opted_in = RuleSet::default_v1();
        opted_in.enabled.insert("success".to_owned(), true);
        opted_in.weights.insert("success".to_owned(), 1.0);
        opted_in
    };
    let sentence = "If validation succeeds the request continues.";
    let (out, _) = encode_symbolic_traced_with(sentence, &rs);
    let has_check = out.contains('\u{2713}');
    assert!(has_check, "missing check in {out}");
}
1454
#[test]
fn failure_glyph_replacement_when_rule_is_explicitly_enabled() {
    // Companion to `success_glyph_replacement_when_rule_is_explicitly_enabled`.
    let rs = {
        let mut opted_in = RuleSet::default_v1();
        opted_in.enabled.insert("failure".to_owned(), true);
        opted_in.weights.insert("failure".to_owned(), 1.0);
        opted_in
    };
    let sentence = "If validation fails the request is rejected.";
    let (out, _) = encode_symbolic_traced_with(sentence, &rs);
    let has_cross = out.contains('\u{2717}');
    assert!(has_cross, "missing cross in {out}");
}
1465
#[test]
fn default_v1_disables_success_and_failure_glyphs() {
    // Regression gate for the 2026-04-24 ablation finding:
    // `success` and `failure` glyph substitutions made output
    // LARGER on real corpora, so they must stay OFF in the
    // first-boot / fallback ruleset. Prevents a silent revert
    // from reintroducing the -55-token-per-corpus regression.
    let rs = RuleSet::default_v1();
    let success_flag = rs.enabled.get("success").copied();
    let failure_flag = rs.enabled.get("failure").copied();
    assert_eq!(success_flag, Some(false), "success must be OFF by default");
    assert_eq!(failure_flag, Some(false), "failure must be OFF by default");

    // With both rules off, neither glyph may appear and neither
    // counter may move.
    let sentence = "If validation succeeds the call fails and the handler logs it.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);
    let has_check = out.contains('\u{2713}');
    let has_cross = out.contains('\u{2717}');
    assert!(!has_check);
    assert!(!has_cross);
    assert_eq!(trace.success, 0);
    assert_eq!(trace.failure, 0);
}
1493
#[test]
fn longer_term_wins_over_shorter() {
    // The two-word term "authentication module" must become
    // "A.mod"; substituting only "authentication" -> "A" would
    // leave "A module" in the output instead.
    let encoded = encode_symbolic("The authentication module handles login.");
    let has_long_form = encoded.contains("A.mod");
    let has_short_form = encoded.contains("A module");
    assert!(has_long_form);
    assert!(!has_short_form);
}
1501
#[test]
fn idempotent_on_minimal_input() {
    // A two-character input must come back unchanged.
    let input = "hi";
    assert_eq!(encode_symbolic(input), input);
}
1507
#[test]
fn compress_returns_symbolic_when_net_positive() {
    // A term-rich sentence should compress: symbolic format, fewer
    // tokens than the baseline, and no fallback reason recorded.
    let encoder = enc();
    let out = encoder.compress(
        "The authentication module forwards the request to the policy engine \
         for validation against the session store.",
        Model::Gpt4,
    );
    assert_eq!(out.format, Format::Symbolic);
    assert!(out.compressed_tokens < out.baseline_tokens, "{out:?}");
    assert!(out.fallback.is_none());
}
1517
#[test]
fn compress_falls_back_when_too_short() {
    // "hello world" is expected to bypass encoding and be reported
    // as prose with the `Uncompressible` reason.
    let encoder = enc();
    let result = encoder.compress("hello world", Model::Gpt4);
    assert_eq!(result.format, Format::Prose);
    assert!(matches!(
        result.fallback,
        Some(FallbackReason::Uncompressible)
    ));
}
1524
#[test]
fn compress_falls_back_for_unregistered_model() {
    // Compressing for `Gemini25Pro` with the test encoder must
    // yield prose tagged `TokenizerMissing`.
    let sentence = "The authentication module forwards the request to the policy engine.";
    let result = enc().compress(sentence, Model::Gemini25Pro);
    assert_eq!(result.format, Format::Prose);
    assert!(matches!(
        result.fallback,
        Some(FallbackReason::TokenizerMissing)
    ));
}
1537
#[test]
fn select_format_matches_compress_choice() {
    // `select_format` must predict the format that `compress`
    // actually produces for the same input and model.
    let sentence = "The authentication module forwards the request to the policy engine \
                    for validation against the session store.";
    let predicted = enc().select_format(sentence, Model::Gpt4);
    let produced = enc().compress(sentence, Model::Gpt4);
    assert_eq!(predicted, produced.format);
}
1546
#[test]
fn explicit_fallback_returns_prose() {
    // A caller-requested fallback must yield prose and carry the
    // reason it was given.
    let sentence = "The authentication module forwards the request.";
    let result = enc().fallback(sentence, Model::Gpt4, FallbackReason::QualityDegraded);
    assert_eq!(result.format, Format::Prose);
    assert!(matches!(
        result.fallback,
        Some(FallbackReason::QualityDegraded)
    ));
}
1560
#[test]
fn content_hash_is_blake3_of_original_not_compressed() {
    // `content_hash` must equal the hex BLAKE3 digest of the bytes
    // handed to `compress`, i.e. the original input.
    let inp = "The authentication module forwards the request to the policy engine.";
    let expected = Hasher::new()
        .update(inp.as_bytes())
        .finalize()
        .to_hex()
        .to_string();
    let out = enc().compress(inp, Model::Gpt4);
    assert_eq!(out.content_hash, expected);
}
1569
#[test]
fn trace_records_term_substitution_count() {
    // The sentence mentions several substitutable terms
    // ('authentication', 'request', 'validation', 'session store',
    // 'policy engine'). The exact count varies with overlap, so
    // only require that the counter shows the rule fired
    // repeatedly.
    let sentence = "The authentication module forwards a request to the policy engine \
                    for validation against the session store.";
    let t = encode_symbolic_traced(sentence).1;
    assert!(t.term_substitutions >= 3, "{t:?}");
}
1581
#[test]
fn trace_records_filler_removal_count() {
    // The sentence is dense with filler words ('the', 'is', 'in',
    // 'and'). Some may be consumed by other rules first, so only a
    // floor of four removals is required.
    let sentence = "The user is in the system and is using the database.";
    let t = encode_symbolic_traced(sentence).1;
    assert!(t.filler_removed >= 4, "{t:?}");
}
1588
#[test]
fn trace_no_fire_for_neutral_text() {
    // Lorem-ipsum text must not trigger term substitution or the
    // if-prefix rule.
    let t = encode_symbolic_traced("Lorem ipsum dolor sit amet consectetur").1;
    assert_eq!(t.term_substitutions, 0);
    assert_eq!(t.if_prefix, 0);
    // If anything fired at all, filler removal must be among the
    // rules that did.
    if t.any_fired() {
        assert!(t.filler_removed > 0);
    }
}
1596
#[test]
fn step9_bytes_saved_populated_when_multiple_rules_fire() {
    // Step-9 integration test. The input is built so that at least
    // term_substitutions and filler_removed fire. Checks:
    //   1. Each of those rules reports bytes_saved > 0 in its slot.
    //   2. A rule that cannot fire here stays at 0.
    //   3. The per-rule bytes_saved total is positive and no larger
    //      than the overall shrink (input.len - output.len).
    let input = "The authentication module forwards a request to the policy \
                 engine for validation against the session store.";
    let (out, trace) = encode_symbolic_traced(input);
    let saved = &trace.bytes_saved;

    // (1) The two rules this input targets both saved bytes.
    assert!(
        saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] > 0,
        "term_substitutions should have saved bytes; trace={trace:?}"
    );
    assert!(
        saved[EncoderTrace::IDX_FILLER_REMOVED] > 0,
        "filler_removed should have saved bytes; trace={trace:?}"
    );

    // (2) Plain prose has no ANSI sequences, so that slot is zero.
    assert_eq!(
        saved[EncoderTrace::IDX_ANSI_STRIPPED],
        0,
        "ansi_stripped cannot fire on plain prose"
    );

    // (3) Rules compose: rule N sees rule N-1's output, so the sum
    // of per-rule deltas is a LOWER bound on the bytes removed
    // rather than an exact figure. It must still be positive and
    // must not exceed the total shrink.
    let sum_deltas = saved.iter().copied().sum::<u64>();
    let total_delta = (input.len() as u64).saturating_sub(out.len() as u64);
    assert!(sum_deltas > 0, "at least one rule contributed");
    assert!(
        sum_deltas <= total_delta,
        "sum of per-rule deltas ({sum_deltas}) must not exceed \
         total shrink ({total_delta}); input={} output={}",
        input.len(),
        out.len()
    );
}
1644
#[test]
fn compress_traced_returns_empty_trace_on_short_input() {
    // A two-character input must leave the trace at its default.
    let trace = enc().compress_traced("hi", Model::Gpt4).1;
    assert_eq!(trace, EncoderTrace::default());
}
1650
#[test]
fn compress_traced_returns_empty_trace_on_unsupported_model() {
    // Compressing for `Gemini25Pro` must leave the trace at its
    // default.
    let sentence = "The authentication module forwards the request.";
    let trace = enc().compress_traced(sentence, Model::Gemini25Pro).1;
    assert_eq!(trace, EncoderTrace::default());
}
1659
#[test]
fn compress_traced_returns_empty_trace_on_oversized_input() {
    // Four characters repeated MAX_INPUT_CHARS times: the encoder
    // must answer with prose, the `OversizedInput` reason, and a
    // default trace.
    let oversized = "abc ".repeat(MAX_INPUT_CHARS);
    let (result, trace) = enc().compress_traced(&oversized, Model::Gpt4);
    assert_eq!(result.format, Format::Prose);
    assert!(matches!(
        result.fallback,
        Some(FallbackReason::OversizedInput)
    ));
    assert_eq!(trace, EncoderTrace::default());
}
1668
#[test]
fn trace_pairs_are_alphabetical() {
    // `as_pairs` must already be ordered by rule name: a stable
    // sort by name of a copy must leave it unchanged.
    let pairs = EncoderTrace::default().as_pairs();
    let mut by_name = pairs;
    by_name.sort_by(|(left, _), (right, _)| left.cmp(right));
    assert_eq!(pairs, by_name);
}
1677
#[test]
fn ruleset_default_v1_matches_constants_only_encoder() {
    // Equivalence check: encoding with the `default_v1` ruleset
    // must give the same output and the same trace as the legacy
    // constants-only entry point.
    let rs = RuleSet::default_v1();
    for inp in [
        "The authentication module forwards the request to the policy engine.",
        "If validation succeeds the request continues. The handler invokes the rate limiter.",
        "User is authorized via the bearer token; resource handler validates the operation.",
    ] {
        let (legacy_out, legacy_trace) = encode_symbolic_traced(inp);
        let (ruleset_out, ruleset_trace) = encode_symbolic_traced_with(inp, &rs);
        assert_eq!(
            legacy_out, ruleset_out,
            "default_v1 must match legacy on `{inp}`"
        );
        assert_eq!(legacy_trace, ruleset_trace, "trace must match on `{inp}`");
    }
}
1695
#[test]
fn ruleset_disabled_rule_does_not_fire() {
    // `success` is already off in `default_v1`, so inserting
    // `false` on its own would pass even if the `enabled` flag
    // were ignored. Give the rule full weight, prove it fires when
    // enabled (same sentence the explicit opt-in test uses), then
    // flip ONLY the flag: the difference between the two runs is
    // attributable to `enabled` alone.
    let input = "If validation succeeds the request continues.";
    let mut rs = RuleSet::default_v1();
    rs.weights.insert("success".to_owned(), 1.0);

    // Control run: rule on, glyph expected.
    rs.enabled.insert("success".to_owned(), true);
    let (control, _) = encode_symbolic_traced_with(input, &rs);
    assert!(
        control.contains('\u{2713}'),
        "control run must emit the success glyph: {control}"
    );

    // Same ruleset with the flag cleared: the rule must stay silent.
    rs.enabled.insert("success".to_owned(), false);
    let (out, trace) = encode_symbolic_traced_with(input, &rs);
    assert!(
        !out.contains('\u{2713}'),
        "success glyph must not appear: {out}"
    );
    assert_eq!(trace.success, 0, "success rule trace must be zero");
}
1710
#[test]
fn ruleset_weight_below_threshold_is_treated_as_off() {
    // 0.02 sits strictly below `ENABLE_WEIGHT_THRESHOLD`, which
    // was revised from 0.5 to 0.05 on 2026-04-24 after corpus
    // simulation showed no individual rule could cross 0.5 on real
    // audit data. Update this value if the threshold moves again.
    let ruleset = {
        let mut low_weight = RuleSet::default_v1();
        low_weight.weights.insert("arrow".to_owned(), 0.02);
        low_weight
    };
    let sentence = "The handler invokes the policy engine to validate the request.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &ruleset);
    assert!(
        !out.contains('\u{2192}'),
        "arrow glyph must not appear: {out}"
    );
    assert_eq!(trace.arrow, 0);
}
1730
#[test]
fn ruleset_weight_above_threshold_but_below_legacy_half_is_on() {
    // Guards the 2026-04-24 threshold revision: a weight of 0.2
    // counted as "off" under the old 0.5 gate and must count as
    // "on" under the 0.05 gate. A revert would silently restore
    // the threshold that disabled every live rule on the real
    // corpus.
    let ruleset = {
        let mut mid_weight = RuleSet::default_v1();
        mid_weight.weights.insert("arrow".to_owned(), 0.2);
        mid_weight
    };
    let sentence = "The handler invokes the policy engine to validate the request.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &ruleset);
    assert!(
        out.contains('\u{2192}'),
        "arrow glyph must be applied at weight 0.2 under revised threshold: {out}"
    );
    assert!(trace.arrow > 0);
}
1750
#[test]
fn ruleset_unrecognised_keys_are_dropped_on_load() {
    // The fixture mixes one known and one unknown key in each
    // table. Loading must keep the known entries and discard the
    // unknown ones.
    let toml = r"
[enabled]
success = false
made_up_rule = true
[weights]
arrow = 0.3
another_made_up = 0.7
";
    let rs = RuleSet::from_toml_str(toml).expect("parse");

    // Known keys survive with the values from the file. Without
    // the weight check, a loader that dropped every weight would
    // still pass this test.
    assert_eq!(rs.enabled.get("success").copied(), Some(false));
    assert!(
        (rs.weight("arrow") - 0.3).abs() < 1e-6,
        "recognised weight must be kept, got {}",
        rs.weight("arrow")
    );

    // Unknown keys are gone from both tables.
    assert!(!rs.enabled.contains_key("made_up_rule"));
    assert!(!rs.weights.contains_key("another_made_up"));
}
1766
#[test]
fn ruleset_round_trip_through_toml() {
    // Serialise a customised ruleset to TOML and parse it back: the
    // toggle, the weight, and the version string must all survive.
    let original = {
        let mut customised = RuleSet::default_v1();
        customised.enabled.insert("success".to_owned(), false);
        customised.weights.insert("arrow".to_owned(), 0.42);
        customised.version = Some("test-r1".to_owned());
        customised
    };
    let serialised = original.to_toml_string().expect("serialize");
    let reloaded = RuleSet::from_toml_str(&serialised).expect("parse");
    assert_eq!(reloaded.enabled.get("success").copied(), Some(false));
    assert!((reloaded.weight("arrow") - 0.42).abs() < 1e-6);
    assert_eq!(reloaded.version.as_deref(), Some("test-r1"));
}
1779
#[test]
fn safe_canary_preserves_success_failure_words() {
    // Under the safe canary ruleset the words "succeeds" and
    // "fails" must stay in the output, neither glyph rule may
    // record a hit, and the ruleset must carry its version tag.
    let rs = RuleSet::safe_canary_v1();
    let sentence = "If validation succeeds the request continues. If validation fails the request is rejected.";
    let (out, trace) = encode_symbolic_traced_with(sentence, &rs);

    // Case-insensitive word checks share one lowercased copy.
    let lowered = out.to_lowercase();
    assert!(
        lowered.contains("succeeds"),
        "success word should remain: {out}"
    );
    assert!(
        lowered.contains("fails"),
        "failure word should remain: {out}"
    );

    assert_eq!(trace.success, 0);
    assert_eq!(trace.failure, 0);
    assert_eq!(
        rs.version.as_deref(),
        Some("safe-canary-v1-no-success-failure")
    );
}
1802
#[test]
fn agentic_canary_v2_enables_quality_ready_tool_codecs() {
    // The four tool codecs must be enabled at weight 1.0, the
    // success/failure glyph rules must remain off, and the ruleset
    // must carry its version tag.
    const TOOL_CODECS: [&str; 4] = [
        "json_records_table",
        "numeric_range_lines",
        "repeated_chunk_dict",
        "tool_schema_semantic_table",
    ];
    let rs = RuleSet::agentic_canary_v2();
    for name in TOOL_CODECS {
        assert!(rs.is_enabled(name), "{name} should be enabled");
        assert!((rs.weight(name) - 1.0).abs() < f32::EPSILON);
    }
    assert!(!rs.is_enabled("success"), "success glyphs stay disabled");
    assert!(!rs.is_enabled("failure"), "failure glyphs stay disabled");
    assert_eq!(
        rs.version.as_deref(),
        Some("agentic-canary-v2-quality-ready-codecs")
    );
}
1822
#[test]
fn compress_traced_with_respects_ruleset_toggle() {
    // Switch `term_substitutions` off and run the full compress
    // path: none of the canonical abbreviations (A.mod, PE, SS) may
    // show up in the returned content.
    let mut rs = RuleSet::default_v1();
    rs.enabled.insert("term_substitutions".to_owned(), false);
    let inp = "The authentication module forwards the request to the policy engine \
               for validation against the session store.";
    let (out, _) = enc().compress_traced_with(inp, Model::Gpt4, &rs);
    // The input itself contains none of these strings, so any
    // occurrence would have to come from a substitution.
    for abbreviation in ["A.mod", "PE", "SS"] {
        assert!(!out.content.contains(abbreviation), "{:?}", out.content);
    }
}
1835
/// `DoD` §10 perf evidence for the encoder pipeline.
///
/// Times 100 `compress` calls on a repeated paragraph, prints the
/// p50/p95/p99 latencies, and fails if p95 reaches the debug-build
/// ceiling.
#[test]
fn compress_meets_section_10() {
    use std::time::Instant;
    let e = enc();
    let inp = "The authentication module forwards the request to the policy engine \
               for validation against the session store and then the response \
               pipeline returns the result. "
        .repeat(20);
    let mut samples = Vec::with_capacity(100);
    for _ in 0..100 {
        let t = Instant::now();
        let _ = e.compress(&inp, Model::Gpt4);
        samples.push(t.elapsed().as_micros());
    }
    samples.sort_unstable();
    // Nearest-rank percentiles over 100 sorted samples: the p-th
    // percentile is the p-th smallest sample, i.e. index p - 1.
    // (p50 previously read index 50, one rank above the convention
    // used for p95 and p99.)
    let p50 = samples[49];
    let p95 = samples[94];
    let p99 = samples[98];
    eprintln!(
        "compress {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
        inp.len()
    );
    // Debug-build ceiling 50ms (release target <5ms tracked in ROADMAP).
    assert!(p95 < 50_000, "p95 {p95}us breaches debug ceiling");
}
1862
#[test]
fn idx_constants_match_as_pairs_order() {
    // Pins each IDX_* constant to the position of its rule name in
    // `as_pairs`, so renaming or reordering EncoderTrace fields
    // cannot silently desync the two. Every consumer of
    // bytes_saved[] relies on this contract.
    let names = EncoderTrace::default().as_pairs().map(|(rule, _)| rule);
    let expected = [
        (EncoderTrace::IDX_AND, "and"),
        (EncoderTrace::IDX_ANSI_STRIPPED, "ansi_stripped"),
        (EncoderTrace::IDX_ARROW, "arrow"),
        (EncoderTrace::IDX_BLANK_LINES, "blank_lines"),
        (EncoderTrace::IDX_FAILURE, "failure"),
        (EncoderTrace::IDX_FILLER_REMOVED, "filler_removed"),
        (EncoderTrace::IDX_IF_PREFIX, "if_prefix"),
        (EncoderTrace::IDX_JSON_MINIFIED, "json_minified"),
        (EncoderTrace::IDX_JSON_RECORDS_TABLE, "json_records_table"),
        (EncoderTrace::IDX_NUMERIC_RANGE_LINES, "numeric_range_lines"),
        (EncoderTrace::IDX_REPEATED_CHUNK_DICT, "repeated_chunk_dict"),
        (EncoderTrace::IDX_REPEATED_LINES, "repeated_lines"),
        (EncoderTrace::IDX_SUCCESS, "success"),
        (EncoderTrace::IDX_TERM_SUBSTITUTIONS, "term_substitutions"),
        (
            EncoderTrace::IDX_TOOL_SCHEMA_SEMANTIC_TABLE,
            "tool_schema_semantic_table",
        ),
        (EncoderTrace::IDX_TRAILING_WS, "trailing_ws"),
        (EncoderTrace::IDX_VS, "vs"),
    ];
    for (idx, rule) in expected {
        assert_eq!(names[idx], rule);
    }
}
1903
#[test]
fn bytes_saved_pairs_parallel_to_as_pairs() {
    // Step-9 scaffold contract: `bytes_saved_pairs` returns the
    // same rule names in the same order as `as_pairs`, so the
    // audit-writer can zip them positionally without a name lookup.
    //
    // Slots are addressed through the IDX_* constants rather than
    // bare numbers so this test keeps tracking the right rules if
    // the table is ever reordered.
    let mut t = EncoderTrace::default();
    t.bytes_saved[EncoderTrace::IDX_AND] = 7;
    t.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] = 42;

    let counts = t.as_pairs();
    let bytes = t.bytes_saved_pairs();
    assert_eq!(counts.len(), bytes.len());
    for (i, (count, saved)) in counts.iter().zip(bytes.iter()).enumerate() {
        assert_eq!(count.0, saved.0, "name at index {i} diverges");
    }
    assert_eq!(bytes[EncoderTrace::IDX_AND], ("and", 7));
    assert_eq!(
        bytes[EncoderTrace::IDX_TERM_SUBSTITUTIONS],
        ("term_substitutions", 42)
    );

    // Default trace is all-zero on the byte axis (matches the
    // non-firing counts). This pins the "zero is signal, not
    // noise" invariant documented in EncoderTrace::bytes_saved.
    let d = EncoderTrace::default();
    assert!(d.bytes_saved_pairs().iter().all(|(_, b)| *b == 0));
}
1928
#[test]
fn bytes_saved_merge_is_saturating_sum() {
    // Two arbitrary slots: one exercises plain addition, the other
    // exercises saturation at u64::MAX.
    const SUM_SLOT: usize = 5;
    const SATURATED_SLOT: usize = 9;

    let mut acc = EncoderTrace::default();
    acc.bytes_saved[SUM_SLOT] = 100;

    let mut first = EncoderTrace::default();
    first.bytes_saved[SUM_SLOT] = 50;
    first.bytes_saved[SATURATED_SLOT] = u64::MAX;
    acc.merge(first);
    assert_eq!(acc.bytes_saved[SUM_SLOT], 150);
    assert_eq!(acc.bytes_saved[SATURATED_SLOT], u64::MAX);

    // Merging one more byte into a slot already at MAX must leave
    // it at MAX instead of overflowing.
    let mut second = EncoderTrace::default();
    second.bytes_saved[SATURATED_SLOT] = 1;
    acc.merge(second);
    assert_eq!(acc.bytes_saved[SATURATED_SLOT], u64::MAX);
}
1946}
pithy_core/encoder.rs

pithy_core/
encoder.rs