1use std::collections::BTreeMap;
18use std::path::Path;
19use std::sync::Arc;
20
21use blake3::Hasher;
22use once_cell::sync::Lazy;
23use regex::{Regex, RegexBuilder};
24use serde::{Deserialize, Serialize};
25
26use crate::interfaces::{Compressed, Encoder, FallbackReason, Format, Measurer, Model};
27
/// Canonical identifiers of every encoder rule, in ascending alphabetical order.
///
/// The order matches the `EncoderTrace::IDX_*` slot numbers and the rows of
/// `EncoderTrace::as_pairs`. `RuleSet::from_toml_str` uses this list to drop
/// unknown keys, and `RuleSet::default_v1` uses it to seed its maps.
pub const RULE_NAMES: &[&str] = &[
    "and",
    "ansi_stripped",
    "arrow",
    "blank_lines",
    "failure",
    "filler_removed",
    "if_prefix",
    "json_minified",
    "json_records_table",
    "numeric_range_lines",
    "repeated_chunk_dict",
    "repeated_lines",
    "success",
    "term_substitutions",
    "tool_schema_semantic_table",
    "trailing_ws",
    "vs",
];
51
/// Lower size gate, in `char`s: `SymbolicEncoder::compress_traced_with` returns
/// shorter inputs unchanged with `FallbackReason::Uncompressible`.
pub const MIN_INPUT_CHARS: usize = 32;

/// Upper size gate, in `char`s: `SymbolicEncoder::compress_traced_with` returns
/// longer inputs unchanged with `FallbackReason::OversizedInput`.
pub const MAX_INPUT_CHARS: usize = 256 * 1024;
66
/// `(phrase, short form)` pairs used by the `term_substitutions` rule.
///
/// Source order does not matter: `SUB_RULES` re-sorts the table longest phrase
/// first before compiling, so multi-word phrases are rewritten before the
/// shorter phrases they contain. Matching is whole-word and case-insensitive;
/// the short form is inserted verbatim.
///
/// Several phrases share one short form (`rate limiting` / `rate limiter` /
/// `rate limit` all become `RL`), so the mapping is lossy by design.
///
/// NOTE: the short forms `A` and `IT` lower-case to the filler words `a` and
/// `it` in `FILLER_WORDS`; any case-insensitive filler matching has to account
/// for that.
const TERM_SUBSTITUTIONS: &[(&str, &str)] = &[
    ("post-tool authorization check", "PTA"),
    ("post-tool authorization", "PTA"),
    ("policy engine", "PE"),
    ("session store", "SS"),
    ("failure store", "FS"),
    ("response pipeline", "RP"),
    ("rate limiting", "RL"),
    ("rate limiter", "RL"),
    ("rate limit", "RL"),
    ("authentication module", "A.mod"),
    ("authorization module", "Z.mod"),
    ("authentication service", "A.svc"),
    ("authorization service", "Z.svc"),
    ("authentication", "A"),
    ("authorization", "Z"),
    ("authenticate", "A"),
    ("authorize", "Z"),
    ("authenticated", "A'd"),
    ("authorized", "Z'd"),
    ("handler", "H"),
    ("request", "R"),
    ("response", "Rp"),
    ("permissions", "P"),
    ("permission", "P"),
    ("telemetry", "T"),
    ("validate", "V"),
    ("validates", "V"),
    ("validated", "V'd"),
    ("validation", "V"),
    ("database", "DB"),
    ("JSON", "J"),
    ("bearer token", "BT"),
    ("principal", "Pr"),
    ("resource", "Rs"),
    ("operation", "Op"),
    ("configuration file", "Cf"),
    ("environment variable", "Env"),
    ("integration test", "IT"),
    ("regular expression", "RE"),
    ("working directory", "WD"),
    ("breaking change", "BC"),
    ("circuit breaker", "CiB"),
    ("pattern matching", "PM"),
    ("race condition", "RC"),
    ("type checking", "Typ"),
    ("command line", "CL"),
    ("content block", "CB"),
    ("error message", "EM"),
    ("feature flag", "FF"),
    ("function call", "FC"),
    ("kill switch", "KS"),
    ("merge request", "MR"),
    ("pull request", "PR"),
    ("stack trace", "ST"),
    ("system prompt", "SP"),
    ("tool result", "TR"),
    ("user prompt", "UP"),
    ("code review", "CR"),
    ("tool call", "TC"),
    ("tool use", "TU"),
    ("unit test", "UT"),
];
141
/// Lower-case words that the `filler_removed` rule drops.
///
/// `is_filler` compares each whitespace-separated token against this list after
/// trimming `.`, `,`, `;`, `:` from both ends and lower-casing it (ASCII only).
/// Negations and hedges such as `not`, `never`, `maybe`, `likely` are kept out
/// of the list on purpose; the test suite asserts their absence.
const FILLER_WORDS: &[&str] = &[
    // Articles, copulas, demonstratives and core prepositions.
    "the", "a", "an", "of", "to", "in", "on", "at", "by", "with", "from", "is", "are", "was",
    "were", "be", "been", "being", "that", "this", "these", "those", "it", "its", "as", "then",
    "which", "who", "whom", "whose", "each", "any", "some", "all", "also", "such", "into", "onto",
    // Further prepositions.
    "for", "about", "around", "over", "through", "during", "within", "per", "via",
    // Intensifiers.
    "just", "only", "very", "quite", "really", "actually", "simply",
];
174
175static SUB_RULES: Lazy<Vec<(Regex, &'static str)>> = Lazy::new(|| {
177 let mut subs: Vec<(&'static str, &'static str)> = TERM_SUBSTITUTIONS.to_vec();
178 subs.sort_by_key(|(long, _)| std::cmp::Reverse(long.len()));
179 subs.into_iter()
180 .map(|(long, short)| {
181 let pat = format!(r"\b{}\b", regex::escape(long));
182 let re = RegexBuilder::new(&pat)
183 .case_insensitive(true)
184 .build()
185 .expect("static substitution pattern");
186 (re, short)
187 })
188 .collect()
189});
190
/// The whole word `if` (any case) together with the whitespace after it.
/// The `if_prefix` rule deletes every match.
static IF_PREFIX: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\bif\b\s+")
        .case_insensitive(true)
        .build()
        .expect("if-prefix")
});

/// Success vocabulary (any case). The `success` rule replaces each match with
/// U+2713 (check mark).
static SUCCESS: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(succeeds?|ok|success|grants? access|grants?)\b")
        .case_insensitive(true)
        .build()
        .expect("success")
});

/// Failure vocabulary (any case). The `failure` rule replaces each match with
/// U+2717 (ballot X).
static FAILURE: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(fails?|failure|failed)\b")
        .case_insensitive(true)
        .build()
        .expect("failure")
});

/// Hand-off verbs (any case). The `arrow` rule replaces each match with
/// U+2192 (rightwards arrow).
static ARROW: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(returns?|forwarded? to|forwards? to|sends? to|invokes?|invoked)\b")
        .case_insensitive(true)
        .build()
        .expect("arrow")
});

/// Comparison words (any case). The `vs` rule replaces each match with `vs`.
///
/// NOTE: the closing `\b` after the optional `.` only holds when a word
/// character follows the dot, so in `vs. x` the match is just `vs` and the dot
/// stays in the text. A bare `vs` also matches and is replaced by itself, which
/// counts as a hit with zero bytes saved.
static VS: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(against|versus|vs\.?)\b")
        .case_insensitive(true)
        .build()
        .expect("vs")
});

/// Conjunctions (any case). The `and` rule replaces each match with `+`.
static AND: Lazy<Regex> = Lazy::new(|| {
    RegexBuilder::new(r"\b(and|plus)\b")
        .case_insensitive(true)
        .build()
        .expect("and")
});

/// Whitespace before one of `. , ; :` or the three rule glyphs, plus any
/// whitespace after it. The encoder rewrites each match as the mark followed by
/// a single space.
static PUNCT_GAP: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"\s+([.,;:\u{2192}\u{2713}\u{2717}])\s*").expect("punct-gap"));

/// Any run of whitespace; the encoder collapses each run to one space.
static MULTI_WS: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").expect("multi-ws"));
237
/// Reports whether `s` looks like structured Markdown rather than flat prose.
///
/// Returns `true` when the text contains a blank-line paragraph break, a
/// ``` or `~~~` fence at the very start of a line, or any line that (after
/// leading whitespace) is an ATX heading, a non-empty bullet item, an ordered
/// list marker of up to three digits, a `> ` blockquote, or a table row that
/// starts with `|` and has at least two pipes.
pub(crate) fn has_structural_markers(s: &str) -> bool {
    // Per-line test, applied to a line whose leading whitespace is already removed.
    fn line_is_structural(t: &str) -> bool {
        // ATX heading: one to six '#' followed by a space.
        let hashes = t.bytes().take_while(|b| *b == b'#').count();
        if (1..=6).contains(&hashes) && t[hashes..].starts_with(' ') {
            return true;
        }

        // Bullet item with a non-empty body.
        let bullet_body = ["- ", "* ", "+ "]
            .iter()
            .find_map(|marker| t.strip_prefix(*marker));
        if matches!(bullet_body, Some(rest) if !rest.is_empty()) {
            return true;
        }

        // Ordered list marker: one to three ASCII digits, then ". " or ") ".
        let digits = t.bytes().take_while(u8::is_ascii_digit).count();
        if (1..=3).contains(&digits) {
            let rest = &t[digits..];
            if rest.starts_with(". ") || rest.starts_with(") ") {
                return true;
            }
        }

        // Blockquote or table row.
        t.starts_with("> ") || (t.starts_with('|') && t.matches('|').count() >= 2)
    }

    // Whole-text signals: paragraph break, or a fence with no indentation.
    let opens_with_fence = s.starts_with("```") || s.starts_with("~~~");
    let fence_after_newline = s.contains("\n```") || s.contains("\n~~~");
    if s.contains("\n\n") || opens_with_fence || fence_after_newline {
        return true;
    }

    s.lines().map(str::trim_start).any(line_is_structural)
}
302
/// Trims `.`, `,`, `;` and `:` from both ends of `word`; interior characters
/// are left alone.
fn strip_punct(word: &str) -> &str {
    const EDGE_MARKS: &[char] = &['.', ',', ';', ':'];
    word.trim_matches(EDGE_MARKS)
}
306
307fn is_filler(word: &str) -> bool {
308 let stripped = strip_punct(word).to_ascii_lowercase();
309 FILLER_WORDS.iter().any(|w| *w == stripped)
310}
311
/// Per-rule hit counters and byte savings recorded while encoding.
///
/// Each `u32` field counts how often the rule of the same name fired.
/// `bytes_saved` holds the matching byte savings, indexed by the `IDX_*`
/// constants (the same alphabetical order as `as_pairs`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EncoderTrace {
    /// Matches rewritten by the term-substitution table.
    pub term_substitutions: u32,
    /// `if ` prefixes removed.
    pub if_prefix: u32,
    /// Success words replaced by the check-mark glyph.
    pub success: u32,
    /// Failure words replaced by the cross glyph.
    pub failure: u32,
    /// Hand-off verbs replaced by the arrow glyph.
    pub arrow: u32,
    /// Comparison words normalized to `vs`.
    pub vs: u32,
    /// Conjunctions replaced by `+`.
    pub and: u32,
    /// Filler words dropped.
    pub filler_removed: u32,
    // The remaining counters are only merged and reported in this part of the
    // file; the rules that increment them are not visible here.
    pub ansi_stripped: u32,
    pub trailing_ws: u32,
    pub blank_lines: u32,
    pub json_minified: u32,
    pub json_records_table: u32,
    pub numeric_range_lines: u32,
    pub repeated_chunk_dict: u32,
    pub repeated_lines: u32,
    pub tool_schema_semantic_table: u32,
    /// Bytes saved per rule, indexed by the `IDX_*` constants.
    pub bytes_saved: [u64; 17],
}

impl EncoderTrace {
    // Slot numbers into `bytes_saved`, in alphabetical rule-name order.
    pub const IDX_AND: usize = 0;
    pub const IDX_ANSI_STRIPPED: usize = 1;
    pub const IDX_ARROW: usize = 2;
    pub const IDX_BLANK_LINES: usize = 3;
    pub const IDX_FAILURE: usize = 4;
    pub const IDX_FILLER_REMOVED: usize = 5;
    pub const IDX_IF_PREFIX: usize = 6;
    pub const IDX_JSON_MINIFIED: usize = 7;
    pub const IDX_JSON_RECORDS_TABLE: usize = 8;
    pub const IDX_NUMERIC_RANGE_LINES: usize = 9;
    pub const IDX_REPEATED_CHUNK_DICT: usize = 10;
    pub const IDX_REPEATED_LINES: usize = 11;
    pub const IDX_SUCCESS: usize = 12;
    pub const IDX_TERM_SUBSTITUTIONS: usize = 13;
    pub const IDX_TOOL_SCHEMA_SEMANTIC_TABLE: usize = 14;
    pub const IDX_TRAILING_WS: usize = 15;
    pub const IDX_VS: usize = 16;

    /// Returns `true` when at least one rule counter is non-zero.
    ///
    /// Counters saturate at `u32::MAX`, so they are tested one by one instead
    /// of being summed: a plain sum would overflow (panic in debug builds, wrap
    /// in release builds) once a saturated counter meets any other non-zero one.
    #[must_use]
    pub fn any_fired(&self) -> bool {
        self.as_pairs().iter().any(|&(_, hits)| hits > 0)
    }

    /// Returns `(rule name, bytes saved)` rows in the same order as `as_pairs`.
    #[must_use]
    pub fn bytes_saved_pairs(&self) -> [(&'static str, u64); 17] {
        let counts = self.as_pairs();
        let mut out = [("", 0u64); 17];
        for ((slot, (name, _)), saved) in out
            .iter_mut()
            .zip(counts.iter())
            .zip(self.bytes_saved.iter())
        {
            *slot = (*name, *saved);
        }
        out
    }

    /// Returns `(rule name, hit count)` rows in alphabetical rule-name order,
    /// which is also the `IDX_*` slot order.
    #[must_use]
    pub fn as_pairs(&self) -> [(&'static str, u32); 17] {
        [
            ("and", self.and),
            ("ansi_stripped", self.ansi_stripped),
            ("arrow", self.arrow),
            ("blank_lines", self.blank_lines),
            ("failure", self.failure),
            ("filler_removed", self.filler_removed),
            ("if_prefix", self.if_prefix),
            ("json_minified", self.json_minified),
            ("json_records_table", self.json_records_table),
            ("numeric_range_lines", self.numeric_range_lines),
            ("repeated_chunk_dict", self.repeated_chunk_dict),
            ("repeated_lines", self.repeated_lines),
            ("success", self.success),
            ("term_substitutions", self.term_substitutions),
            (
                "tool_schema_semantic_table",
                self.tool_schema_semantic_table,
            ),
            ("trailing_ws", self.trailing_ws),
            ("vs", self.vs),
        ]
    }

    /// Adds every counter and byte total of `other` into `self`, saturating
    /// instead of overflowing.
    pub fn merge(&mut self, other: EncoderTrace) {
        self.and = self.and.saturating_add(other.and);
        self.ansi_stripped = self.ansi_stripped.saturating_add(other.ansi_stripped);
        self.arrow = self.arrow.saturating_add(other.arrow);
        self.blank_lines = self.blank_lines.saturating_add(other.blank_lines);
        self.failure = self.failure.saturating_add(other.failure);
        self.filler_removed = self.filler_removed.saturating_add(other.filler_removed);
        self.if_prefix = self.if_prefix.saturating_add(other.if_prefix);
        self.json_minified = self.json_minified.saturating_add(other.json_minified);
        self.json_records_table = self
            .json_records_table
            .saturating_add(other.json_records_table);
        self.numeric_range_lines = self
            .numeric_range_lines
            .saturating_add(other.numeric_range_lines);
        self.repeated_chunk_dict = self
            .repeated_chunk_dict
            .saturating_add(other.repeated_chunk_dict);
        self.repeated_lines = self.repeated_lines.saturating_add(other.repeated_lines);
        self.success = self.success.saturating_add(other.success);
        self.term_substitutions = self
            .term_substitutions
            .saturating_add(other.term_substitutions);
        self.tool_schema_semantic_table = self
            .tool_schema_semantic_table
            .saturating_add(other.tool_schema_semantic_table);
        self.trailing_ws = self.trailing_ws.saturating_add(other.trailing_ws);
        self.vs = self.vs.saturating_add(other.vs);
        for (mine, theirs) in self.bytes_saved.iter_mut().zip(other.bytes_saved.iter()) {
            *mine = mine.saturating_add(*theirs);
        }
    }
}
531
/// Minimum weight a rule needs to stay enabled: `RuleSet::is_enabled` treats a
/// rule whose configured weight is strictly below this value as disabled, even
/// when its `enabled` flag is `true`.
pub const ENABLE_WEIGHT_THRESHOLD: f32 = 0.05;
545
/// Per-rule switches and weights that control which encoder rules run.
///
/// Every field is `#[serde(default)]`, so a partial (or empty) document
/// deserializes; rules missing from both maps count as enabled with weight 1.0
/// (see `is_enabled` and `weight`).
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct RuleSet {
    /// Explicit on/off flag per rule name; `false` disables the rule outright.
    #[serde(default)]
    pub enabled: BTreeMap<String, bool>,
    /// Weight per rule name; a weight below `ENABLE_WEIGHT_THRESHOLD` disables
    /// the rule.
    #[serde(default)]
    pub weights: BTreeMap<String, f32>,
    /// Free-form label identifying the rule set; the built-in presets fill it in.
    #[serde(default)]
    pub version: Option<String>,
}
571
572impl RuleSet {
573 #[must_use]
590 pub fn default_v1() -> Self {
591 let mut enabled = BTreeMap::new();
592 let mut weights = BTreeMap::new();
593 for name in RULE_NAMES {
594 let default_on = !matches!(
595 *name,
596 "json_records_table"
597 | "numeric_range_lines"
598 | "repeated_chunk_dict"
599 | "tool_schema_semantic_table"
600 | "success"
601 | "failure"
602 );
603 enabled.insert((*name).to_owned(), default_on);
604 weights.insert((*name).to_owned(), if default_on { 1.0 } else { 0.0 });
605 }
606 Self {
607 enabled,
608 weights,
609 version: Some("v1".to_owned()),
610 }
611 }
612
613 #[must_use]
618 pub fn safe_canary_v1() -> Self {
619 let mut rs = Self::default_v1();
620 rs.enabled.insert("success".to_owned(), false);
621 rs.enabled.insert("failure".to_owned(), false);
622 rs.weights.insert("success".to_owned(), 0.0);
623 rs.weights.insert("failure".to_owned(), 0.0);
624 rs.version = Some("safe-canary-v1-no-success-failure".to_owned());
625 rs
626 }
627
628 #[must_use]
633 pub fn agentic_canary_v2() -> Self {
634 let mut rs = Self::safe_canary_v1();
635 for name in [
636 "json_records_table",
637 "numeric_range_lines",
638 "repeated_chunk_dict",
639 "tool_schema_semantic_table",
640 ] {
641 rs.enabled.insert(name.to_owned(), true);
642 rs.weights.insert(name.to_owned(), 1.0);
643 }
644 rs.version = Some("agentic-canary-v2-quality-ready-codecs".to_owned());
645 rs
646 }
647
648 #[must_use]
653 pub fn is_enabled(&self, rule: &str) -> bool {
654 if let Some(flag) = self.enabled.get(rule) {
655 if !*flag {
656 return false;
657 }
658 }
659 !matches!(self.weights.get(rule), Some(w) if *w < ENABLE_WEIGHT_THRESHOLD)
660 }
661
662 #[must_use]
664 pub fn weight(&self, rule: &str) -> f32 {
665 self.weights.get(rule).copied().unwrap_or(1.0)
666 }
667
668 pub fn from_toml_str(s: &str) -> Result<Self, toml::de::Error> {
676 let mut rs: Self = toml::from_str(s)?;
677 rs.enabled.retain(|k, _| RULE_NAMES.contains(&k.as_str()));
678 rs.weights.retain(|k, _| RULE_NAMES.contains(&k.as_str()));
679 Ok(rs)
680 }
681
682 pub fn from_toml_file(path: &Path) -> anyhow::Result<Self> {
687 let s = std::fs::read_to_string(path)?;
688 Self::from_toml_str(&s).map_err(|e| anyhow::anyhow!("ruleset parse: {e}"))
689 }
690
691 pub fn to_toml_string(&self) -> Result<String, toml::ser::Error> {
697 toml::to_string_pretty(self)
698 }
699}
700
701#[must_use]
703pub fn encode_symbolic(text: &str) -> String {
704 encode_symbolic_traced(text).0
705}
706
707#[must_use]
714pub fn encode_symbolic_traced(text: &str) -> (String, EncoderTrace) {
715 encode_symbolic_traced_with(text, &RuleSet::default_v1())
716}
717
718#[must_use]
729pub fn encode_symbolic_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
730 let mut trace = EncoderTrace::default();
731 let mut t: String = text.to_owned();
732 if rs.is_enabled("term_substitutions") {
738 let before = t.len() as u64;
739 let mut fired = false;
740 for (re, short) in SUB_RULES.iter() {
741 let n = u32::try_from(re.find_iter(&t).count()).unwrap_or(u32::MAX);
742 if n > 0 {
743 trace.term_substitutions = trace.term_substitutions.saturating_add(n);
744 t = re.replace_all(&t, *short).into_owned();
745 fired = true;
746 }
747 }
748 if fired {
749 trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] =
750 before.saturating_sub(t.len() as u64);
751 }
752 }
753 if rs.is_enabled("if_prefix") {
754 let n = u32::try_from(IF_PREFIX.find_iter(&t).count()).unwrap_or(u32::MAX);
755 trace.if_prefix = n;
756 if n > 0 {
757 let before = t.len() as u64;
758 t = IF_PREFIX.replace_all(&t, "").into_owned();
759 trace.bytes_saved[EncoderTrace::IDX_IF_PREFIX] = before.saturating_sub(t.len() as u64);
760 }
761 }
762 if rs.is_enabled("success") {
763 let n = u32::try_from(SUCCESS.find_iter(&t).count()).unwrap_or(u32::MAX);
764 trace.success = n;
765 if n > 0 {
766 let before = t.len() as u64;
767 t = SUCCESS.replace_all(&t, "\u{2713}").into_owned();
768 trace.bytes_saved[EncoderTrace::IDX_SUCCESS] = before.saturating_sub(t.len() as u64);
769 }
770 }
771 if rs.is_enabled("failure") {
772 let n = u32::try_from(FAILURE.find_iter(&t).count()).unwrap_or(u32::MAX);
773 trace.failure = n;
774 if n > 0 {
775 let before = t.len() as u64;
776 t = FAILURE.replace_all(&t, "\u{2717}").into_owned();
777 trace.bytes_saved[EncoderTrace::IDX_FAILURE] = before.saturating_sub(t.len() as u64);
778 }
779 }
780 if rs.is_enabled("arrow") {
781 let n = u32::try_from(ARROW.find_iter(&t).count()).unwrap_or(u32::MAX);
782 trace.arrow = n;
783 if n > 0 {
784 let before = t.len() as u64;
785 t = ARROW.replace_all(&t, "\u{2192}").into_owned();
786 trace.bytes_saved[EncoderTrace::IDX_ARROW] = before.saturating_sub(t.len() as u64);
787 }
788 }
789 if rs.is_enabled("vs") {
790 let n = u32::try_from(VS.find_iter(&t).count()).unwrap_or(u32::MAX);
791 trace.vs = n;
792 if n > 0 {
793 let before = t.len() as u64;
794 t = VS.replace_all(&t, "vs").into_owned();
795 trace.bytes_saved[EncoderTrace::IDX_VS] = before.saturating_sub(t.len() as u64);
796 }
797 }
798 if rs.is_enabled("and") {
799 let n = u32::try_from(AND.find_iter(&t).count()).unwrap_or(u32::MAX);
800 trace.and = n;
801 if n > 0 {
802 let before = t.len() as u64;
803 t = AND.replace_all(&t, "+").into_owned();
804 trace.bytes_saved[EncoderTrace::IDX_AND] = before.saturating_sub(t.len() as u64);
805 }
806 }
807 if rs.is_enabled("filler_removed") {
808 let before = t.len() as u64;
809 let words_before = t.split_whitespace().count();
810 let kept: Vec<&str> = t.split_whitespace().filter(|w| !is_filler(w)).collect();
811 let removed = u32::try_from(words_before.saturating_sub(kept.len())).unwrap_or(u32::MAX);
812 trace.filler_removed = removed;
813 t = kept.join(" ");
814 if removed > 0 {
815 trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED] =
816 before.saturating_sub(t.len() as u64);
817 }
818 }
819 t = PUNCT_GAP.replace_all(&t, "$1 ").into_owned();
820 t = MULTI_WS.replace_all(&t, " ").into_owned();
821 (t.trim().to_owned(), trace)
822}
823
824fn compress_inline(body: &str, rs: &RuleSet, trace_accum: &mut EncoderTrace) -> String {
832 let (out, trace) = encode_symbolic_traced_with(body, rs);
833 trace_accum.merge(trace);
834 out
835}
836
/// Classification of one source line, produced by `classify_line` for the
/// structural encoder.
enum LineKind {
    /// Empty or whitespace-only line.
    Blank,
    /// Line that starts (after indentation) with ``` or `~~~`.
    Fence,
    /// ATX heading; `prefix` is indentation + hashes + one space, `body` the title text.
    Heading { prefix: String, body: String },
    /// Bullet or ordered list item; `prefix` is indentation + marker, `body` the item text.
    ListItem { prefix: String, body: String },
    /// `> ` blockquote line; `body` is the text after the marker.
    Blockquote { body: String },
    /// Table row: starts with `|` and contains at least two pipes.
    Table,
    /// Anything else; treated as paragraph text.
    Prose,
}
847
848fn classify_line(line: &str) -> LineKind {
849 if line.trim().is_empty() {
850 return LineKind::Blank;
851 }
852 let trimmed = line.trim_start();
853 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
854 return LineKind::Fence;
855 }
856 if trimmed.starts_with('#') {
857 let rest = trimmed.trim_start_matches('#');
858 let hashes = trimmed.len() - rest.len();
859 if (1..=6).contains(&hashes) && rest.starts_with(' ') {
860 let indent = &line[..line.len() - trimmed.len()];
861 let prefix = format!("{}{} ", indent, "#".repeat(hashes));
862 let body = rest.trim_start().to_owned();
863 return LineKind::Heading { prefix, body };
864 }
865 }
866 for marker in ["- ", "* ", "+ "] {
867 if let Some(body) = trimmed.strip_prefix(marker) {
868 if !body.is_empty() {
869 let indent = &line[..line.len() - trimmed.len()];
870 let prefix = format!("{}{}", indent, marker);
871 return LineKind::ListItem {
872 prefix,
873 body: body.to_owned(),
874 };
875 }
876 }
877 }
878 let digits: String = trimmed.chars().take_while(char::is_ascii_digit).collect();
880 if !digits.is_empty() && digits.len() <= 3 {
881 let after_digits = &trimmed[digits.len()..];
882 for sep in [". ", ") "] {
883 if let Some(body) = after_digits.strip_prefix(sep) {
884 if !body.is_empty() {
885 let indent = &line[..line.len() - trimmed.len()];
886 let prefix = format!("{}{}{}", indent, digits, sep);
887 return LineKind::ListItem {
888 prefix,
889 body: body.to_owned(),
890 };
891 }
892 }
893 }
894 }
895 if let Some(body) = trimmed.strip_prefix("> ") {
896 return LineKind::Blockquote {
897 body: body.to_owned(),
898 };
899 }
900 if trimmed.starts_with('|') && trimmed.matches('|').count() >= 2 {
901 return LineKind::Table;
902 }
903 LineKind::Prose
904}
905
906pub fn encode_symbolic_structural_traced_with(text: &str, rs: &RuleSet) -> (String, EncoderTrace) {
928 let mut out = String::with_capacity(text.len());
929 let mut trace = EncoderTrace::default();
930 let mut prose_buf = String::new();
931 let mut in_fence = false;
932 let mut in_indented_code = false;
940 let mut prev_line_blank = true; let flush = |prose_buf: &mut String, out: &mut String, trace: &mut EncoderTrace| {
943 if prose_buf.is_empty() {
944 return;
945 }
946 let compressed = compress_inline(prose_buf, rs, trace);
947 out.push_str(&compressed);
948 out.push('\n');
949 prose_buf.clear();
950 };
951
952 for line in text.split('\n') {
953 if in_fence {
954 out.push_str(line);
956 out.push('\n');
957 let trimmed = line.trim_start();
958 if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
959 in_fence = false;
960 }
961 prev_line_blank = false;
962 continue;
963 }
964
965 let is_indented = line.starts_with(" ") || line.starts_with('\t');
972 let is_blank = line.trim().is_empty();
973 if in_indented_code {
974 if is_indented || is_blank {
975 out.push_str(line);
977 out.push('\n');
978 prev_line_blank = is_blank;
979 continue;
980 }
981 in_indented_code = false;
983 } else if prev_line_blank && is_indented && !is_blank {
984 flush(&mut prose_buf, &mut out, &mut trace);
986 in_indented_code = true;
987 out.push_str(line);
988 out.push('\n');
989 prev_line_blank = false;
990 continue;
991 }
992
993 match classify_line(line) {
997 LineKind::Fence => {
998 flush(&mut prose_buf, &mut out, &mut trace);
999 out.push_str(line);
1000 out.push('\n');
1001 in_fence = true;
1002 }
1003 LineKind::Blank => {
1004 flush(&mut prose_buf, &mut out, &mut trace);
1005 out.push('\n');
1006 }
1007 LineKind::Heading { prefix, body } => {
1008 flush(&mut prose_buf, &mut out, &mut trace);
1009 let body_c = compress_inline(&body, rs, &mut trace);
1010 out.push_str(&prefix);
1011 out.push_str(&body_c);
1012 out.push('\n');
1013 }
1014 LineKind::ListItem { prefix, body } => {
1015 flush(&mut prose_buf, &mut out, &mut trace);
1016 let body_c = compress_inline(&body, rs, &mut trace);
1017 out.push_str(&prefix);
1018 out.push_str(&body_c);
1019 out.push('\n');
1020 }
1021 LineKind::Blockquote { body } => {
1022 flush(&mut prose_buf, &mut out, &mut trace);
1023 let body_c = compress_inline(&body, rs, &mut trace);
1024 out.push_str("> ");
1025 out.push_str(&body_c);
1026 out.push('\n');
1027 }
1028 LineKind::Table => {
1029 flush(&mut prose_buf, &mut out, &mut trace);
1030 out.push_str(line);
1031 out.push('\n');
1032 }
1033 LineKind::Prose => {
1034 if !prose_buf.is_empty() {
1035 prose_buf.push(' ');
1036 }
1037 prose_buf.push_str(line);
1038 }
1039 }
1040 prev_line_blank = is_blank;
1045 }
1046 flush(&mut prose_buf, &mut out, &mut trace);
1047
1048 if !text.ends_with('\n') && out.ends_with('\n') {
1051 out.pop();
1052 }
1053 (out, trace)
1054}
1055
/// Encoder that rewrites prose into the symbolic shorthand and keeps the result
/// only when it measures as fewer tokens than the input.
pub struct SymbolicEncoder {
    /// Token counter, also used to check whether a model is supported.
    measurer: Arc<dyn Measurer>,
}
1060
1061impl SymbolicEncoder {
1062 #[must_use]
1064 pub fn new(measurer: Arc<dyn Measurer>) -> Self {
1065 Self { measurer }
1066 }
1067
1068 fn hash(text: &str) -> String {
1069 let mut h = Hasher::new();
1070 h.update(text.as_bytes());
1071 h.finalize().to_hex().to_string()
1072 }
1073
1074 fn count_or_zero(&self, text: &str, model: &Model) -> u32 {
1075 self.measurer.tokenize(text, model).unwrap_or(0)
1076 }
1077
1078 fn build(
1079 &self,
1080 original: &str,
1081 compressed: &str,
1082 format: Format,
1083 model: Model,
1084 fallback: Option<FallbackReason>,
1085 ) -> Compressed {
1086 let baseline = self.count_or_zero(original, &model);
1087 let encoded = self.count_or_zero(compressed, &model);
1088 Compressed {
1089 content: compressed.to_owned(),
1090 format,
1091 baseline_tokens: baseline,
1092 compressed_tokens: encoded,
1093 model,
1094 content_hash: Self::hash(original),
1095 fallback,
1096 }
1097 }
1098}
1099
1100impl SymbolicEncoder {
1101 #[must_use]
1110 pub fn compress_traced(&self, input: &str, model: Model) -> (Compressed, EncoderTrace) {
1111 self.compress_traced_with(input, model, &RuleSet::default_v1())
1112 }
1113
1114 #[must_use]
1119 pub fn compress_traced_with(
1120 &self,
1121 input: &str,
1122 model: Model,
1123 rs: &RuleSet,
1124 ) -> (Compressed, EncoderTrace) {
1125 if !self.measurer.supported(&model) {
1126 return (
1127 self.build(
1128 input,
1129 input,
1130 Format::Prose,
1131 model,
1132 Some(FallbackReason::TokenizerMissing),
1133 ),
1134 EncoderTrace::default(),
1135 );
1136 }
1137 let chars = input.chars().count();
1138 if chars < MIN_INPUT_CHARS {
1139 return (
1140 self.build(
1141 input,
1142 input,
1143 Format::Prose,
1144 model,
1145 Some(FallbackReason::Uncompressible),
1146 ),
1147 EncoderTrace::default(),
1148 );
1149 }
1150 if chars > MAX_INPUT_CHARS {
1151 return (
1152 self.build(
1153 input,
1154 input,
1155 Format::Prose,
1156 model,
1157 Some(FallbackReason::OversizedInput),
1158 ),
1159 EncoderTrace::default(),
1160 );
1161 }
1162 let (encoded, trace) = if has_structural_markers(input) {
1169 encode_symbolic_structural_traced_with(input, rs)
1170 } else {
1171 encode_symbolic_traced_with(input, rs)
1172 };
1173 let baseline = self.count_or_zero(input, &model);
1174 let candidate = self.count_or_zero(&encoded, &model);
1175 if candidate >= baseline {
1176 return (
1177 self.build(
1178 input,
1179 input,
1180 Format::Prose,
1181 model,
1182 Some(FallbackReason::Uncompressible),
1183 ),
1184 EncoderTrace::default(),
1185 );
1186 }
1187 (
1188 self.build(input, &encoded, Format::Symbolic, model, None),
1189 trace,
1190 )
1191 }
1192}
1193
1194impl Encoder for SymbolicEncoder {
1195 fn compress(&self, input: &str, model: Model) -> Compressed {
1196 self.compress_traced(input, model).0
1197 }
1198
1199 fn select_format(&self, input: &str, model: Model) -> Format {
1200 if !self.measurer.supported(&model) || input.chars().count() < MIN_INPUT_CHARS {
1201 return Format::Prose;
1202 }
1203 let encoded = encode_symbolic(input);
1204 if self.count_or_zero(&encoded, &model) >= self.count_or_zero(input, &model) {
1205 Format::Prose
1206 } else {
1207 Format::Symbolic
1208 }
1209 }
1210
1211 fn fallback(&self, input: &str, model: Model, reason: FallbackReason) -> Compressed {
1212 self.build(input, input, Format::Prose, model, Some(reason))
1213 }
1214}
1215
1216#[cfg(test)]
1217mod tests {
1218 use super::*;
1219 use crate::tokenizers::LocalMeasurer;
1220
1221 fn enc() -> SymbolicEncoder {
1222 let m = LocalMeasurer::with_defaults().expect("measurer");
1223 SymbolicEncoder::new(Arc::new(m))
1224 }
1225
1226 #[test]
1234 fn structural_gate_detects_paragraph_breaks() {
1235 assert!(has_structural_markers("foo\n\nbar"));
1236 }
1237
1238 #[test]
1239 fn structural_gate_detects_headings() {
1240 assert!(has_structural_markers("# Title\ncontent follows"));
1241 assert!(has_structural_markers("content\n## Subheading\nmore"));
1242 }
1243
1244 #[test]
1245 fn structural_gate_detects_lists() {
1246 assert!(has_structural_markers("intro\n- item one\n- item two"));
1247 assert!(has_structural_markers("intro\n1. first\n2. second"));
1248 }
1249
1250 #[test]
1251 fn structural_gate_detects_fenced_code() {
1252 assert!(has_structural_markers("prose\n```\ncode\n```"));
1253 assert!(has_structural_markers("```rust\nfn main() {}\n```"));
1254 }
1255
1256 #[test]
1257 fn structural_gate_detects_tables_and_blockquotes() {
1258 assert!(has_structural_markers(
1259 "col\n| a | b |\n|---|---|\n| 1 | 2 |"
1260 ));
1261 assert!(has_structural_markers("context\n> quoted line\nafter"));
1262 }
1263
1264 #[test]
1265 fn structural_gate_lets_flat_prose_through() {
1266 assert!(!has_structural_markers(
1267 "one sentence. another sentence. a third. no line breaks here."
1268 ));
1269 }
1270
1271 #[test]
1272 fn markdown_input_preserves_structure_through_compression() {
1273 let md = "# Heading\n\nFirst paragraph with enough body to clear the thirty-two-char floor.\n\n- list item one\n- list item two\n\nSecond paragraph follows here.";
1279 let (out, _trace) = enc().compress_traced(md, Model::ClaudeSonnet47);
1280 let content = &out.content;
1284 assert!(
1286 content.contains("\n\n"),
1287 "expected paragraph break preserved, got: {content:?}"
1288 );
1289 assert!(
1291 content.starts_with("# "),
1292 "expected heading prefix preserved, got: {content:?}"
1293 );
1294 assert!(
1296 content.contains("\n- "),
1297 "expected list-item marker preserved, got: {content:?}"
1298 );
1299 let newlines = content.matches('\n').count();
1301 assert!(
1302 newlines >= 4,
1303 "expected >=4 newlines (paragraph + 2 list + blank), got {newlines} in {content:?}"
1304 );
1305 }
1306
1307 #[test]
1308 fn structural_encoder_preserves_fenced_code_verbatim() {
1309 use crate::RuleSet;
1310 let md =
1311 "Intro paragraph.\n\n```rust\nfn main() {\n println!(\"x\");\n}\n```\n\nEpilogue.";
1312 let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
1313 assert!(
1314 out.contains("```rust\nfn main() {\n println!(\"x\");\n}\n```"),
1315 "fenced code must be preserved byte-for-byte, got: {out:?}"
1316 );
1317 }
1318
1319 #[test]
1323 fn structural_encoder_preserves_four_space_indented_code() {
1324 use crate::RuleSet;
1325 let md = "intro paragraph.\n\n fn check(token: &Token) -> bool {\n token.expires_at <= Utc::now()\n }\n\nepilogue paragraph.";
1326 let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
1327 assert!(
1328 out.contains(" fn check(token: &Token) -> bool {"),
1329 "four-space indented code must be verbatim, got: {out:?}"
1330 );
1331 assert!(
1332 out.contains(" token.expires_at <= Utc::now()"),
1333 "indented-code continuation (8 spaces) must be verbatim, got: {out:?}"
1334 );
1335 assert!(
1336 out.contains(" }"),
1337 "closing brace line must be verbatim, got: {out:?}"
1338 );
1339 }
1340
1341 #[test]
1342 fn structural_encoder_preserves_tab_indented_code() {
1343 use crate::RuleSet;
1344 let md = "intro.\n\n\tlet x = 1;\n\tlet y = 2;\n\nafter.";
1345 let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
1346 assert!(
1347 out.contains("\tlet x = 1;"),
1348 "tab-indented code must be verbatim, got: {out:?}"
1349 );
1350 }
1351
1352 #[test]
1353 fn structural_encoder_compresses_paragraph_body() {
1354 use crate::RuleSet;
1355 let md = "Title line no header.\n\nThe authentication module sends a request to the policy engine and it returns a result.";
1357 let (out, _trace) = encode_symbolic_structural_traced_with(md, &RuleSet::default_v1());
1358 assert!(out.contains("\n\n"), "paragraph break preserved");
1359 assert!(
1361 !out.contains("authentication module"),
1362 "expected term_substitutions to rewrite 'authentication module', got: {out:?}"
1363 );
1364 }
1365
1366 #[test]
1367 fn substitutes_authorization_term() {
1368 let out = encode_symbolic(
1369 "The user authentication module sends the request to the policy engine.",
1370 );
1371 assert!(out.contains("A.mod"), "expected A.mod in {out}");
1372 assert!(out.contains("PE"), "expected PE in {out}");
1373 }
1374
1375 #[test]
1376 fn drops_filler_words() {
1377 let out = encode_symbolic("The user is in the system.");
1378 let lc = out.to_lowercase();
1380 assert!(!lc.split_whitespace().any(|w| w == "the"));
1381 assert!(!lc.split_whitespace().any(|w| w == "is"));
1382 }
1383
1384 #[test]
1385 fn drops_expanded_prepositions_and_intensifiers() {
1386 let out = encode_symbolic(
1392 "The request is just really very important for the handler to actually log during the call.",
1393 );
1394 let lc = out.to_lowercase();
1395 let words: std::collections::HashSet<_> = lc.split_whitespace().collect();
1396 for stripped in ["for", "during", "just", "really", "very", "actually"] {
1397 assert!(
1398 !words.contains(stripped),
1399 "filler `{stripped}` must be stripped from: {out}",
1400 );
1401 }
1402 for kept in ["important", "log", "call"] {
1406 assert!(
1407 words.iter().any(|w| w.contains(kept)),
1408 "content word `{kept}` must survive: {out}",
1409 );
1410 }
1411 }
1412
1413 #[test]
1414 fn polarity_bearing_words_are_not_filler() {
1415 for forbidden in [
1422 "not", "never", "no", "nothing", "maybe", "perhaps", "likely", "possibly",
1423 ] {
1424 assert!(
1425 !crate::encoder::FILLER_WORDS.contains(&forbidden),
1426 "polarity-bearing word `{forbidden}` must NOT be in FILLER_WORDS",
1427 );
1428 }
1429 }
1430
1431 #[test]
1432 fn arrow_replacement() {
1433 let out = encode_symbolic("The handler invokes the policy engine.");
1437 assert!(out.contains('\u{2192}'), "missing arrow in {out}");
1438 }
1439
1440 #[test]
1441 fn success_glyph_replacement_when_rule_is_explicitly_enabled() {
1442 let mut rs = RuleSet::default_v1();
1448 rs.enabled.insert("success".to_owned(), true);
1449 rs.weights.insert("success".to_owned(), 1.0);
1450 let (out, _) =
1451 encode_symbolic_traced_with("If validation succeeds the request continues.", &rs);
1452 assert!(out.contains('\u{2713}'), "missing check in {out}");
1453 }
1454
1455 #[test]
1456 fn failure_glyph_replacement_when_rule_is_explicitly_enabled() {
1457 let mut rs = RuleSet::default_v1();
1459 rs.enabled.insert("failure".to_owned(), true);
1460 rs.weights.insert("failure".to_owned(), 1.0);
1461 let (out, _) =
1462 encode_symbolic_traced_with("If validation fails the request is rejected.", &rs);
1463 assert!(out.contains('\u{2717}'), "missing cross in {out}");
1464 }
1465
1466 #[test]
1467 fn default_v1_disables_success_and_failure_glyphs() {
1468 let rs = RuleSet::default_v1();
1474 assert_eq!(
1475 rs.enabled.get("success").copied(),
1476 Some(false),
1477 "success must be OFF by default",
1478 );
1479 assert_eq!(
1480 rs.enabled.get("failure").copied(),
1481 Some(false),
1482 "failure must be OFF by default",
1483 );
1484 let (out, trace) = encode_symbolic_traced_with(
1485 "If validation succeeds the call fails and the handler logs it.",
1486 &rs,
1487 );
1488 assert!(!out.contains('\u{2713}'));
1489 assert!(!out.contains('\u{2717}'));
1490 assert_eq!(trace.success, 0);
1491 assert_eq!(trace.failure, 0);
1492 }
1493
1494 #[test]
1495 fn longer_term_wins_over_shorter() {
1496 let out = encode_symbolic("The authentication module handles login.");
1497 assert!(out.contains("A.mod"));
1499 assert!(!out.contains("A module"));
1500 }
1501
1502 #[test]
1503 fn idempotent_on_minimal_input() {
1504 let out = encode_symbolic("hi");
1505 assert_eq!(out, "hi");
1506 }
1507
1508 #[test]
1509 fn compress_returns_symbolic_when_net_positive() {
1510 let inp = "The authentication module forwards the request to the policy engine \
1511 for validation against the session store.";
1512 let out = enc().compress(inp, Model::Gpt4);
1513 assert_eq!(out.format, Format::Symbolic);
1514 assert!(out.compressed_tokens < out.baseline_tokens, "{out:?}");
1515 assert!(out.fallback.is_none());
1516 }
1517
1518 #[test]
1519 fn compress_falls_back_when_too_short() {
1520 let out = enc().compress("hello world", Model::Gpt4);
1521 assert_eq!(out.format, Format::Prose);
1522 assert!(matches!(out.fallback, Some(FallbackReason::Uncompressible)));
1523 }
1524
1525 #[test]
1526 fn compress_falls_back_for_unregistered_model() {
1527 let out = enc().compress(
1528 "The authentication module forwards the request to the policy engine.",
1529 Model::Gemini25Pro,
1530 );
1531 assert_eq!(out.format, Format::Prose);
1532 assert!(matches!(
1533 out.fallback,
1534 Some(FallbackReason::TokenizerMissing)
1535 ));
1536 }
1537
1538 #[test]
1539 fn select_format_matches_compress_choice() {
1540 let inp = "The authentication module forwards the request to the policy engine \
1541 for validation against the session store.";
1542 let f = enc().select_format(inp, Model::Gpt4);
1543 let c = enc().compress(inp, Model::Gpt4);
1544 assert_eq!(f, c.format);
1545 }
1546
1547 #[test]
1548 fn explicit_fallback_returns_prose() {
1549 let out = enc().fallback(
1550 "The authentication module forwards the request.",
1551 Model::Gpt4,
1552 FallbackReason::QualityDegraded,
1553 );
1554 assert_eq!(out.format, Format::Prose);
1555 assert!(matches!(
1556 out.fallback,
1557 Some(FallbackReason::QualityDegraded)
1558 ));
1559 }
1560
1561 #[test]
1562 fn content_hash_is_blake3_of_original_not_compressed() {
1563 let inp = "The authentication module forwards the request to the policy engine.";
1564 let out = enc().compress(inp, Model::Gpt4);
1565 let mut h = Hasher::new();
1566 h.update(inp.as_bytes());
1567 assert_eq!(out.content_hash, h.finalize().to_hex().to_string());
1568 }
1569
1570 #[test]
1571 fn trace_records_term_substitution_count() {
1572 let (_, t) = encode_symbolic_traced(
1573 "The authentication module forwards a request to the policy engine \
1574 for validation against the session store.",
1575 );
1576 assert!(t.term_substitutions >= 3, "{t:?}");
1580 }
1581
1582 #[test]
1583 fn trace_records_filler_removal_count() {
1584 let (_, t) = encode_symbolic_traced("The user is in the system and is using the database.");
1585 assert!(t.filler_removed >= 4, "{t:?}");
1587 }
1588
1589 #[test]
1590 fn trace_no_fire_for_neutral_text() {
1591 let (_, t) = encode_symbolic_traced("Lorem ipsum dolor sit amet consectetur");
1592 assert_eq!(t.term_substitutions, 0);
1593 assert_eq!(t.if_prefix, 0);
1594 assert!(!t.any_fired() || t.filler_removed > 0);
1595 }
1596
1597 #[test]
1598 fn step9_bytes_saved_populated_when_multiple_rules_fire() {
1599 let input = "The authentication module forwards a request to the policy \
1607 engine for validation against the session store.";
1608 let (out, trace) = encode_symbolic_traced(input);
1609
1610 assert!(
1612 trace.bytes_saved[EncoderTrace::IDX_TERM_SUBSTITUTIONS] > 0,
1613 "term_substitutions should have saved bytes; trace={trace:?}"
1614 );
1615 assert!(
1616 trace.bytes_saved[EncoderTrace::IDX_FILLER_REMOVED] > 0,
1617 "filler_removed should have saved bytes; trace={trace:?}"
1618 );
1619
1620 assert_eq!(
1623 trace.bytes_saved[EncoderTrace::IDX_ANSI_STRIPPED],
1624 0,
1625 "ansi_stripped cannot fire on plain prose"
1626 );
1627
1628 let sum_deltas: u64 = trace.bytes_saved.iter().sum();
1634 let total_delta = (input.len() as u64).saturating_sub(out.len() as u64);
1635 assert!(sum_deltas > 0, "at least one rule contributed");
1636 assert!(
1637 sum_deltas <= total_delta,
1638 "sum of per-rule deltas ({sum_deltas}) must not exceed \
1639 total shrink ({total_delta}); input={} output={}",
1640 input.len(),
1641 out.len()
1642 );
1643 }
1644
1645 #[test]
1646 fn compress_traced_returns_empty_trace_on_short_input() {
1647 let (_, trace) = enc().compress_traced("hi", Model::Gpt4);
1648 assert_eq!(trace, EncoderTrace::default());
1649 }
1650
1651 #[test]
1652 fn compress_traced_returns_empty_trace_on_unsupported_model() {
1653 let (_, trace) = enc().compress_traced(
1654 "The authentication module forwards the request.",
1655 Model::Gemini25Pro,
1656 );
1657 assert_eq!(trace, EncoderTrace::default());
1658 }
1659
1660 #[test]
1661 fn compress_traced_returns_empty_trace_on_oversized_input() {
1662 let big = "abc ".repeat(MAX_INPUT_CHARS);
1663 let (out, trace) = enc().compress_traced(&big, Model::Gpt4);
1664 assert_eq!(out.format, Format::Prose);
1665 assert!(matches!(out.fallback, Some(FallbackReason::OversizedInput)));
1666 assert_eq!(trace, EncoderTrace::default());
1667 }
1668
1669 #[test]
1670 fn trace_pairs_are_alphabetical() {
1671 let t = EncoderTrace::default();
1672 let pairs = t.as_pairs();
1673 let mut sorted = pairs;
1674 sorted.sort_by_key(|(name, _)| *name);
1675 assert_eq!(pairs, sorted);
1676 }
1677
1678 #[test]
1679 fn ruleset_default_v1_matches_constants_only_encoder() {
1680 let inputs = [
1683 "The authentication module forwards the request to the policy engine.",
1684 "If validation succeeds the request continues. The handler invokes the rate limiter.",
1685 "User is authorized via the bearer token; resource handler validates the operation.",
1686 ];
1687 let rs = RuleSet::default_v1();
1688 for inp in inputs {
1689 let (a, ta) = encode_symbolic_traced(inp);
1690 let (b, tb) = encode_symbolic_traced_with(inp, &rs);
1691 assert_eq!(a, b, "default_v1 must match legacy on `{inp}`");
1692 assert_eq!(ta, tb, "trace must match on `{inp}`");
1693 }
1694 }
1695
1696 #[test]
1697 fn ruleset_disabled_rule_does_not_fire() {
1698 let mut rs = RuleSet::default_v1();
1699 rs.enabled.insert("success".to_owned(), false);
1700 let (out, trace) = encode_symbolic_traced_with(
1701 "If validation succeeds the request continues to the handler.",
1702 &rs,
1703 );
1704 assert!(
1705 !out.contains('\u{2713}'),
1706 "success glyph must not appear: {out}"
1707 );
1708 assert_eq!(trace.success, 0, "success rule trace must be zero");
1709 }
1710
1711 #[test]
1712 fn ruleset_weight_below_threshold_is_treated_as_off() {
1713 let mut rs = RuleSet::default_v1();
1719 rs.weights.insert("arrow".to_owned(), 0.02);
1720 let (out, trace) = encode_symbolic_traced_with(
1721 "The handler invokes the policy engine to validate the request.",
1722 &rs,
1723 );
1724 assert!(
1725 !out.contains('\u{2192}'),
1726 "arrow glyph must not appear: {out}"
1727 );
1728 assert_eq!(trace.arrow, 0);
1729 }
1730
1731 #[test]
1732 fn ruleset_weight_above_threshold_but_below_legacy_half_is_on() {
1733 let mut rs = RuleSet::default_v1();
1739 rs.weights.insert("arrow".to_owned(), 0.2);
1740 let (out, trace) = encode_symbolic_traced_with(
1741 "The handler invokes the policy engine to validate the request.",
1742 &rs,
1743 );
1744 assert!(
1745 out.contains('\u{2192}'),
1746 "arrow glyph must be applied at weight 0.2 under revised threshold: {out}"
1747 );
1748 assert!(trace.arrow > 0);
1749 }
1750
1751 #[test]
1752 fn ruleset_unrecognised_keys_are_dropped_on_load() {
1753 let toml = r"
1754[enabled]
1755success = false
1756made_up_rule = true
1757[weights]
1758arrow = 0.3
1759another_made_up = 0.7
1760";
1761 let rs = RuleSet::from_toml_str(toml).expect("parse");
1762 assert_eq!(rs.enabled.get("success").copied(), Some(false));
1763 assert!(!rs.enabled.contains_key("made_up_rule"));
1764 assert!(!rs.weights.contains_key("another_made_up"));
1765 }
1766
1767 #[test]
1768 fn ruleset_round_trip_through_toml() {
1769 let mut rs = RuleSet::default_v1();
1770 rs.enabled.insert("success".to_owned(), false);
1771 rs.weights.insert("arrow".to_owned(), 0.42);
1772 rs.version = Some("test-r1".to_owned());
1773 let s = rs.to_toml_string().expect("serialize");
1774 let rs2 = RuleSet::from_toml_str(&s).expect("parse");
1775 assert_eq!(rs2.enabled.get("success").copied(), Some(false));
1776 assert!((rs2.weight("arrow") - 0.42).abs() < 1e-6);
1777 assert_eq!(rs2.version.as_deref(), Some("test-r1"));
1778 }
1779
1780 #[test]
1781 fn safe_canary_preserves_success_failure_words() {
1782 let rs = RuleSet::safe_canary_v1();
1783 let (out, trace) = encode_symbolic_traced_with(
1784 "If validation succeeds the request continues. If validation fails the request is rejected.",
1785 &rs,
1786 );
1787 assert!(
1788 out.to_lowercase().contains("succeeds"),
1789 "success word should remain: {out}"
1790 );
1791 assert!(
1792 out.to_lowercase().contains("fails"),
1793 "failure word should remain: {out}"
1794 );
1795 assert_eq!(trace.success, 0);
1796 assert_eq!(trace.failure, 0);
1797 assert_eq!(
1798 rs.version.as_deref(),
1799 Some("safe-canary-v1-no-success-failure")
1800 );
1801 }
1802
1803 #[test]
1804 fn agentic_canary_v2_enables_quality_ready_tool_codecs() {
1805 let rs = RuleSet::agentic_canary_v2();
1806 for name in [
1807 "json_records_table",
1808 "numeric_range_lines",
1809 "repeated_chunk_dict",
1810 "tool_schema_semantic_table",
1811 ] {
1812 assert!(rs.is_enabled(name), "{name} should be enabled");
1813 assert!((rs.weight(name) - 1.0).abs() < f32::EPSILON);
1814 }
1815 assert!(!rs.is_enabled("success"), "success glyphs stay disabled");
1816 assert!(!rs.is_enabled("failure"), "failure glyphs stay disabled");
1817 assert_eq!(
1818 rs.version.as_deref(),
1819 Some("agentic-canary-v2-quality-ready-codecs")
1820 );
1821 }
1822
1823 #[test]
1824 fn compress_traced_with_respects_ruleset_toggle() {
1825 let mut rs = RuleSet::default_v1();
1826 rs.enabled.insert("term_substitutions".to_owned(), false);
1827 let inp = "The authentication module forwards the request to the policy engine \
1828 for validation against the session store.";
1829 let (out, _) = enc().compress_traced_with(inp, Model::Gpt4, &rs);
1830 assert!(!out.content.contains("A.mod"), "{:?}", out.content);
1833 assert!(!out.content.contains("PE"), "{:?}", out.content);
1834 }
1835
1836 #[test]
1838 fn compress_meets_section_10() {
1839 use std::time::Instant;
1840 let e = enc();
1841 let inp = "The authentication module forwards the request to the policy engine \
1842 for validation against the session store and then the response \
1843 pipeline returns the result. "
1844 .repeat(20);
1845 let mut samples = Vec::with_capacity(100);
1846 for _ in 0..100 {
1847 let t = Instant::now();
1848 let _ = e.compress(&inp, Model::Gpt4);
1849 samples.push(t.elapsed().as_micros());
1850 }
1851 samples.sort_unstable();
1852 let p50 = samples[50];
1853 let p95 = samples[94];
1854 let p99 = samples[98];
1855 eprintln!(
1856 "compress {} bytes -> p50={p50}us p95={p95}us p99={p99}us",
1857 inp.len()
1858 );
1859 assert!(p95 < 50_000, "p95 {p95}us breaches debug ceiling");
1861 }
1862
1863 #[test]
1864 fn idx_constants_match_as_pairs_order() {
1865 let names = EncoderTrace::default().as_pairs().map(|(n, _)| n);
1870 assert_eq!(names[EncoderTrace::IDX_AND], "and");
1871 assert_eq!(names[EncoderTrace::IDX_ANSI_STRIPPED], "ansi_stripped");
1872 assert_eq!(names[EncoderTrace::IDX_ARROW], "arrow");
1873 assert_eq!(names[EncoderTrace::IDX_BLANK_LINES], "blank_lines");
1874 assert_eq!(names[EncoderTrace::IDX_FAILURE], "failure");
1875 assert_eq!(names[EncoderTrace::IDX_FILLER_REMOVED], "filler_removed");
1876 assert_eq!(names[EncoderTrace::IDX_IF_PREFIX], "if_prefix");
1877 assert_eq!(names[EncoderTrace::IDX_JSON_MINIFIED], "json_minified");
1878 assert_eq!(
1879 names[EncoderTrace::IDX_JSON_RECORDS_TABLE],
1880 "json_records_table"
1881 );
1882 assert_eq!(
1883 names[EncoderTrace::IDX_NUMERIC_RANGE_LINES],
1884 "numeric_range_lines"
1885 );
1886 assert_eq!(
1887 names[EncoderTrace::IDX_REPEATED_CHUNK_DICT],
1888 "repeated_chunk_dict"
1889 );
1890 assert_eq!(names[EncoderTrace::IDX_REPEATED_LINES], "repeated_lines");
1891 assert_eq!(names[EncoderTrace::IDX_SUCCESS], "success");
1892 assert_eq!(
1893 names[EncoderTrace::IDX_TERM_SUBSTITUTIONS],
1894 "term_substitutions"
1895 );
1896 assert_eq!(
1897 names[EncoderTrace::IDX_TOOL_SCHEMA_SEMANTIC_TABLE],
1898 "tool_schema_semantic_table"
1899 );
1900 assert_eq!(names[EncoderTrace::IDX_TRAILING_WS], "trailing_ws");
1901 assert_eq!(names[EncoderTrace::IDX_VS], "vs");
1902 }
1903
1904 #[test]
1905 fn bytes_saved_pairs_parallel_to_as_pairs() {
1906 let mut t = EncoderTrace::default();
1910 t.bytes_saved[0] = 7; t.bytes_saved[13] = 42; let counts = t.as_pairs();
1914 let bytes = t.bytes_saved_pairs();
1915 assert_eq!(counts.len(), bytes.len());
1916 for i in 0..counts.len() {
1917 assert_eq!(counts[i].0, bytes[i].0, "name at index {i} diverges");
1918 }
1919 assert_eq!(bytes[0], ("and", 7));
1920 assert_eq!(bytes[13], ("term_substitutions", 42));
1921
1922 let d = EncoderTrace::default();
1926 assert!(d.bytes_saved_pairs().iter().all(|(_, b)| *b == 0));
1927 }
1928
1929 #[test]
1930 fn bytes_saved_merge_is_saturating_sum() {
1931 let mut a = EncoderTrace::default();
1932 a.bytes_saved[5] = 100;
1933 let mut b = EncoderTrace::default();
1934 b.bytes_saved[5] = 50;
1935 b.bytes_saved[9] = u64::MAX; a.merge(b);
1937 assert_eq!(a.bytes_saved[5], 150);
1938 assert_eq!(a.bytes_saved[9], u64::MAX);
1939
1940 let mut c = EncoderTrace::default();
1941 c.bytes_saved[9] = 1;
1942 a.merge(c);
1943 assert_eq!(a.bytes_saved[9], u64::MAX);
1945 }
1946}