1use crate::ansi_strip::AnsiStripper;
2use crate::dict_compressor::DictCompressor;
3use crate::entropy_truncator::EntropyTruncator;
4use crate::error::{Result, SqzError};
5use crate::preset::Preset;
6use crate::prompt_cache::PromptCacheDetector;
7use crate::stages::{
8 CollapseArraysStage, CondenseStage, CustomTransformsStage, FlattenStage, GitDiffFoldStage,
9 KeepFieldsStage, StripFieldsStage, StripNullsStage, TruncateStringsStage,
10};
11use crate::token_counter::TokenCounter;
12use crate::token_pruner::TokenPruner;
13use crate::toon::ToonEncoder;
14use crate::types::{CompressedContent, Content, ContentType, ModelFamily, StageConfig};
15
/// Per-session state passed into `CompressionPipeline::compress`.
///
/// Currently only carries an identifier; the pipeline does not yet read
/// any of it (the parameter is `_ctx` in `compress`).
pub struct SessionContext {
    /// Identifier of the session this compression run belongs to.
    pub session_id: String,
}
20
/// Multi-stage content compressor: ordered structured stages followed by
/// plain-text reducers and JSON/TOON encoding (see `compress`).
pub struct CompressionPipeline {
    /// Structured transform stages, kept sorted by ascending `priority()`.
    stages: Vec<Box<dyn crate::stages::CompressionStage>>,
    /// Encoder for the TOON compact JSON representation.
    toon_encoder: ToonEncoder,
    /// Model-aware token counter used for before/after metrics.
    token_counter: TokenCounter,
    /// Removes low-value tokens from prose-like text.
    token_pruner: TokenPruner,
    /// Adaptive dictionary compressor. Wrapped in `RefCell` so the
    /// `&self` `compress` method can call `observe`/`compress` mutably.
    dict_compressor: std::cell::RefCell<DictCompressor>,
    /// Drops low-information segments from long plain text.
    entropy_truncator: EntropyTruncator,
    // Reserved for prompt-cache-aware behavior; not yet wired in.
    #[allow(dead_code)]
    prompt_cache_detector: PromptCacheDetector,
}
33
34impl CompressionPipeline {
35 pub fn new(_preset: &Preset) -> Self {
39 let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
40 Box::new(AnsiStripper),
41 Box::new(KeepFieldsStage),
42 Box::new(StripFieldsStage),
43 Box::new(CondenseStage),
44 Box::new(GitDiffFoldStage),
45 Box::new(StripNullsStage),
46 Box::new(FlattenStage),
47 Box::new(TruncateStringsStage),
48 Box::new(CollapseArraysStage),
49 Box::new(CustomTransformsStage),
50 ];
51 stages.sort_by_key(|s| s.priority());
52
53 Self {
54 stages,
55 toon_encoder: ToonEncoder,
56 token_counter: TokenCounter::new(),
57 token_pruner: TokenPruner::new(),
58 dict_compressor: std::cell::RefCell::new(DictCompressor::new()),
59 entropy_truncator: EntropyTruncator::new(),
60 prompt_cache_detector: PromptCacheDetector,
61 }
62 }
63
64 pub fn compress(
69 &self,
70 input: &str,
71 _ctx: &SessionContext,
72 preset: &Preset,
73 ) -> Result<CompressedContent> {
74 let model_family = model_family_from_preset(preset);
75 let tokens_original = self.token_counter.count(input, &model_family);
76
77 let mut content = Content {
78 raw: input.to_owned(),
79 content_type: ContentType::PlainText,
80 metadata: crate::types::ContentMetadata {
81 source: None,
82 path: None,
83 language: None,
84 },
85 tokens_original,
86 };
87
88 let mut stages_applied: Vec<String> = Vec::new();
89
90 for stage in &self.stages {
91 let config = stage_config_from_preset(stage.name(), preset);
92 if config.enabled {
93 stage.process(&mut content, &config)?;
94 stages_applied.push(stage.name().to_owned());
95 }
96 }
97
98 let is_json = ToonEncoder::is_json(&content.raw);
101
102 if !is_json && content.raw.len() > 200 {
105 if let Ok(rle_result) = crate::rle_compressor::rle_compress(&content.raw, 3) {
106 if rle_result.runs_collapsed > 0 {
107 let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
109 let rle_has_error = rle_result.text.contains("ERROR") || rle_result.text.contains("error:");
110 if !has_error || rle_has_error {
111 content.raw = rle_result.text;
112 stages_applied.push("rle".to_owned());
113 }
114 }
115 }
116 }
117
118 if !is_json && content.raw.len() > 300 {
120 if let Ok(sw_result) = crate::rle_compressor::sliding_window_dedup(&content.raw, 4) {
121 if sw_result.dedup_count > 0 {
122 let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
124 let sw_has_error = sw_result.text.contains("ERROR") || sw_result.text.contains("error:");
125 if !has_error || sw_has_error {
126 content.raw = sw_result.text;
127 stages_applied.push("sliding_window_dedup".to_owned());
128 }
129 }
130 }
131 }
132
133 if !is_json && content.raw.len() > 500 {
135 if let Ok(trunc_result) = self.entropy_truncator.truncate_string(&content.raw) {
136 if trunc_result.segments_dropped > 0 {
137 content.raw = trunc_result.text;
138 stages_applied.push("entropy_truncate".to_owned());
139 }
140 }
141 }
142
143 if !is_json && content.raw.len() > 100 && looks_like_prose(&content.raw) {
145 if let Ok(prune_result) = self.token_pruner.prune(&content.raw) {
146 if prune_result.tokens_removed > 0 {
147 content.raw = prune_result.text;
148 stages_applied.push("token_prune".to_owned());
149 }
150 }
151 }
152
153 let data = if ToonEncoder::is_json(&content.raw) {
155 self.dict_compressor.borrow_mut().observe(&content.raw);
159 let dict_result = self.dict_compressor.borrow().compress(&content.raw)?;
160
161 if dict_result.substitutions > 0 && dict_result.bytes_saved > 50 {
162 stages_applied.push("dict_compress".to_owned());
164 if let Some(newline_pos) = dict_result.data.find("\n{") {
165 let header = &dict_result.data[..newline_pos + 1];
166 let json_body = &dict_result.data[newline_pos + 1..];
167 if let Ok(json) = serde_json::from_str::<serde_json::Value>(json_body) {
168 let encoded = self.toon_encoder.encode(&json)?;
169 stages_applied.push("toon_encode".to_owned());
170 format!("{header}{encoded}")
171 } else {
172 dict_result.data
173 }
174 } else {
175 dict_result.data
176 }
177 } else {
178 let json: serde_json::Value = serde_json::from_str(&content.raw)
180 .map_err(|e| SqzError::Other(format!("pipeline: JSON parse error: {e}")))?;
181 let encoded = self.toon_encoder.encode(&json)?;
182 stages_applied.push("toon_encode".to_owned());
183 encoded
184 }
185 } else {
186 content.raw
187 };
188
189 let tokens_compressed = self.token_counter.count(&data, &model_family);
190 let compression_ratio = if tokens_original == 0 {
191 1.0
192 } else {
193 tokens_compressed as f64 / tokens_original as f64
194 };
195
196 Ok(CompressedContent {
197 data,
198 tokens_compressed,
199 tokens_original,
200 stages_applied,
201 compression_ratio,
202 provenance: crate::types::Provenance::default(),
203 verify: None,
204 })
205 }
206
207 pub fn insert_stage(&mut self, stage: Box<dyn crate::stages::CompressionStage>) {
209 self.stages.push(stage);
210 self.stages.sort_by_key(|s| s.priority());
211 }
212
213 pub fn reload_preset(&mut self, _preset: &Preset) -> Result<()> {
217 let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
218 Box::new(AnsiStripper),
219 Box::new(KeepFieldsStage),
220 Box::new(StripFieldsStage),
221 Box::new(CondenseStage),
222 Box::new(GitDiffFoldStage),
223 Box::new(StripNullsStage),
224 Box::new(FlattenStage),
225 Box::new(TruncateStringsStage),
226 Box::new(CollapseArraysStage),
227 Box::new(CustomTransformsStage),
228 ];
229 stages.sort_by_key(|s| s.priority());
230 self.stages = stages;
231 self.dict_compressor.borrow_mut().reset();
233 Ok(())
234 }
235}
236
237fn model_family_from_preset(preset: &Preset) -> ModelFamily {
239 match preset.model.family.to_lowercase().as_str() {
240 "anthropic" | "claude" => ModelFamily::AnthropicClaude,
241 "openai" | "gpt" => ModelFamily::OpenAiGpt,
242 "google" | "gemini" => ModelFamily::GoogleGemini,
243 other => ModelFamily::Local(other.to_string()),
244 }
245}
246
247fn stage_config_from_preset(name: &str, preset: &Preset) -> StageConfig {
249 let c = &preset.compression;
250 match name {
251 "ansi_strip" => StageConfig {
252 enabled: true,
253 options: serde_json::Value::Object(Default::default()),
254 },
255 "keep_fields" => {
256 if let Some(cfg) = &c.keep_fields {
257 StageConfig {
258 enabled: cfg.enabled,
259 options: serde_json::json!({ "fields": cfg.fields }),
260 }
261 } else {
262 StageConfig::default()
263 }
264 }
265 "strip_fields" => {
266 if let Some(cfg) = &c.strip_fields {
267 StageConfig {
268 enabled: cfg.enabled,
269 options: serde_json::json!({ "fields": cfg.fields }),
270 }
271 } else {
272 StageConfig::default()
273 }
274 }
275 "condense" => {
276 if let Some(cfg) = &c.condense {
277 StageConfig {
278 enabled: cfg.enabled,
279 options: serde_json::json!({
280 "max_repeated_lines": cfg.max_repeated_lines
281 }),
282 }
283 } else {
284 StageConfig::default()
285 }
286 }
287 "git_diff_fold" => {
288 if let Some(cfg) = &c.git_diff_fold {
289 StageConfig {
290 enabled: cfg.enabled,
291 options: serde_json::json!({
292 "max_context_lines": cfg.max_context_lines
293 }),
294 }
295 } else {
296 StageConfig {
298 enabled: true,
299 options: serde_json::json!({ "max_context_lines": 2 }),
300 }
301 }
302 }
303 "strip_nulls" => {
304 if let Some(cfg) = &c.strip_nulls {
305 StageConfig {
306 enabled: cfg.enabled,
307 options: serde_json::Value::Object(Default::default()),
308 }
309 } else {
310 StageConfig::default()
311 }
312 }
313 "flatten" => {
314 if let Some(cfg) = &c.flatten {
315 StageConfig {
316 enabled: cfg.enabled,
317 options: serde_json::json!({ "max_depth": cfg.max_depth }),
318 }
319 } else {
320 StageConfig::default()
321 }
322 }
323 "truncate_strings" => {
324 if let Some(cfg) = &c.truncate_strings {
325 StageConfig {
326 enabled: cfg.enabled,
327 options: serde_json::json!({ "max_length": cfg.max_length }),
328 }
329 } else {
330 StageConfig::default()
331 }
332 }
333 "collapse_arrays" => {
334 if let Some(cfg) = &c.collapse_arrays {
335 StageConfig {
336 enabled: cfg.enabled,
337 options: serde_json::json!({
338 "max_items": cfg.max_items,
339 "summary_template": cfg.summary_template
340 }),
341 }
342 } else {
343 StageConfig::default()
344 }
345 }
346 "custom_transforms" => {
347 if let Some(cfg) = &c.custom_transforms {
348 StageConfig {
349 enabled: cfg.enabled,
350 options: serde_json::Value::Object(Default::default()),
351 }
352 } else {
353 StageConfig::default()
354 }
355 }
356 _ => StageConfig::default(),
357 }
358}
359
/// Heuristically decides whether `text` reads as prose rather than code.
///
/// Inspects at most the first 20 lines, ignoring blank ones. A line is
/// counted as code when it ends with `{`, `}` or `;`, starts with a common
/// declaration keyword, or contains `::`, `->` or `=>`; otherwise it counts
/// as prose. Returns `true` only when prose lines strictly outnumber code
/// lines (so empty/whitespace-only input yields `false`).
fn looks_like_prose(text: &str) -> bool {
    let is_code_like = |t: &str| {
        ['{', '}', ';'].iter().any(|&suffix| t.ends_with(suffix))
            || ["fn ", "pub ", "let ", "const ", "import ", "def ", "class "]
                .iter()
                .any(|prefix| t.starts_with(prefix))
            || ["::", "->", "=>"].iter().any(|marker| t.contains(marker))
    };

    let (code_lines, prose_lines) = text
        .lines()
        .take(20)
        .map(str::trim)
        .filter(|t| !t.is_empty())
        .fold((0usize, 0usize), |(code, prose), t| {
            if is_code_like(t) {
                (code + 1, prose)
            } else {
                (code, prose + 1)
            }
        });

    prose_lines > code_lines
}
400
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preset::{
        BudgetConfig, CollapseArraysConfig, CompressionConfig, CondenseConfig,
        CustomTransformsConfig, ModelConfig, PresetMeta,
        StripNullsConfig, ToolSelectionConfig, TruncateStringsConfig,
        TerseModeConfig,
    };

    /// Builds a representative preset: condense, strip_nulls,
    /// truncate_strings, collapse_arrays and custom_transforms are enabled;
    /// keep_fields, strip_fields, git_diff_fold and flatten are left
    /// unconfigured.
    fn default_preset() -> Preset {
        Preset {
            preset: PresetMeta {
                name: "test".into(),
                version: "1.0".into(),
                description: String::new(),
            },
            compression: CompressionConfig {
                stages: vec![],
                keep_fields: None,
                strip_fields: None,
                condense: Some(CondenseConfig {
                    enabled: true,
                    max_repeated_lines: 3,
                }),
                git_diff_fold: None,
                strip_nulls: Some(StripNullsConfig { enabled: true }),
                flatten: None,
                truncate_strings: Some(TruncateStringsConfig {
                    enabled: true,
                    max_length: 500,
                }),
                collapse_arrays: Some(CollapseArraysConfig {
                    enabled: true,
                    max_items: 5,
                    summary_template: "... and {remaining} more items".into(),
                }),
                custom_transforms: Some(CustomTransformsConfig { enabled: true }),
            },
            tool_selection: ToolSelectionConfig {
                max_tools: 5,
                similarity_threshold: 0.7,
                default_tools: vec![],
            },
            budget: BudgetConfig {
                warning_threshold: 0.70,
                ceiling_threshold: 0.85,
                default_window_size: 200_000,
                agents: Default::default(),
            },
            terse_mode: TerseModeConfig {
                enabled: false,
                level: crate::preset::TerseLevel::Moderate,
            },
            model: ModelConfig {
                family: "anthropic".into(),
                primary: "claude-sonnet-4-20250514".into(),
                local: String::new(),
                complexity_threshold: 0.4,
                pricing: None,
            },
        }
    }

    /// Fresh session context; the pipeline does not read it beyond the id.
    fn ctx() -> SessionContext {
        SessionContext {
            session_id: "test-session".into(),
        }
    }

    // Constructor must leave stages sorted by ascending priority.
    #[test]
    fn new_creates_pipeline_with_sorted_stages() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
    }

    // Short non-JSON text should pass through unchanged (no TOON encoding).
    #[test]
    fn compress_plain_text_passthrough() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello world", &ctx(), &preset).unwrap();
        assert_eq!(result.data, "hello world");
        assert!(!result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    // JSON input must come out TOON-encoded and flagged as such.
    #[test]
    fn compress_json_applies_toon() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"name":"Alice","age":30}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        assert!(result.data.starts_with("TOON:"), "data: {}", result.data);
        assert!(result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    // strip_nulls is enabled in the default preset; null fields must vanish
    // from the round-tripped (decoded) output.
    #[test]
    fn compress_strips_nulls_from_json() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"a":1,"b":null}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        assert!(result.data.starts_with("TOON:"));
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("b").is_none());
        assert_eq!(decoded["a"], serde_json::json!(1));
    }

    #[test]
    fn compress_returns_token_counts() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let input = "a".repeat(100);
        let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
        assert!(result.tokens_original > 0);
        assert!(result.tokens_compressed > 0);
    }

    #[test]
    fn compress_ratio_is_reasonable() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello", &ctx(), &preset).unwrap();
        assert!(result.compression_ratio > 0.0);
    }

    // A custom stage with priority 5 should slot in right after ansi_strip.
    #[test]
    fn insert_stage_re_sorts_by_priority() {
        use crate::stages::CompressionStage;
        use crate::types::StageConfig;

        struct LowPriorityStage;
        impl CompressionStage for LowPriorityStage {
            fn name(&self) -> &str {
                "low_priority"
            }
            fn priority(&self) -> u32 {
                5
            }
            fn process(
                &self,
                _content: &mut Content,
                _config: &StageConfig,
            ) -> crate::error::Result<()> {
                Ok(())
            }
        }

        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        pipeline.insert_stage(Box::new(LowPriorityStage));

        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
        assert_eq!(pipeline.stages[0].name(), "ansi_strip");
        assert_eq!(pipeline.stages[1].name(), "low_priority");
    }

    #[test]
    fn reload_preset_rebuilds_stages() {
        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        let original_count = pipeline.stages.len();
        pipeline.reload_preset(&preset).unwrap();
        assert_eq!(pipeline.stages.len(), original_count);
    }

    // keep_fields whitelist should drop unlisted keys before encoding.
    #[test]
    fn compress_keep_fields_filters_json() {
        use crate::preset::KeepFieldsConfig;
        let mut preset = default_preset();
        preset.compression.keep_fields = Some(KeepFieldsConfig {
            enabled: true,
            fields: vec!["id".into(), "name".into()],
        });
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"id":1,"name":"Bob","debug":"x"}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("debug").is_none());
        assert_eq!(decoded["id"], serde_json::json!(1));
    }

    // Empty input is the tokens_original == 0 edge case.
    #[test]
    fn compress_empty_string() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("", &ctx(), &preset).unwrap();
        assert_eq!(result.data, "");
        assert_eq!(result.tokens_original, 0);
    }

    #[test]
    fn stage_config_from_preset_unknown_stage() {
        let preset = default_preset();
        let config = stage_config_from_preset("nonexistent", &preset);
        assert!(!config.enabled);
    }

    use proptest::prelude::*;

    /// Lines that must survive compression (error/warning/status markers).
    fn significant_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("error: connection refused".to_owned()),
            Just("warning: deprecated API usage".to_owned()),
            Just("failed: build step exited with code 1".to_owned()),
            Just("success: deployment complete".to_owned()),
            Just("status: all checks passed".to_owned()),
            Just("error: file not found".to_owned()),
            Just("warning: unused variable detected".to_owned()),
        ]
    }

    /// Low-information filler lines that the pipeline may collapse.
    fn noise_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("---".to_owned()),
            Just("Loading...".to_owned()),
            Just("================".to_owned()),
            Just("...".to_owned()),
        ]
    }

    /// Arbitrary JSON values up to depth 4 (finite floats only, since
    /// NaN/inf are not representable in JSON).
    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
        let leaf = prop_oneof![
            Just(serde_json::Value::Null),
            any::<bool>().prop_map(serde_json::Value::Bool),
            any::<i64>().prop_map(|n| serde_json::json!(n)),
            any::<f64>()
                .prop_filter("must be finite", |f| f.is_finite())
                .prop_map(|f| serde_json::json!(f)),
            ".*".prop_map(serde_json::Value::String),
        ];

        leaf.prop_recursive(4, 64, 8, |inner| {
            prop_oneof![
                prop::collection::vec(inner.clone(), 0..8)
                    .prop_map(serde_json::Value::Array),
                prop::collection::hash_map(".*", inner, 0..8).prop_map(|m| {
                    serde_json::Value::Object(m.into_iter().collect())
                }),
            ]
        })
    }

    proptest! {
        // Compressed output of arbitrary JSON must stay within printable
        // ASCII plus tab/LF/CR.
        #[test]
        fn prop_pipeline_ascii_safe_output(v in arb_json_value()) {
            let preset = default_preset();
            let pipeline = CompressionPipeline::new(&preset);

            let json_input = serde_json::to_string(&v).expect("serialize should not fail");
            let result = pipeline.compress(&json_input, &ctx(), &preset)
                .expect("compress should not fail");

            for ch in result.data.chars() {
                let cp = ch as u32;
                let is_printable_ascii = cp >= 0x20 && cp <= 0x7E;
                let is_standard_whitespace = cp == 0x09 || cp == 0x0A || cp == 0x0D;
                prop_assert!(
                    is_printable_ascii || is_standard_whitespace,
                    "non-ASCII-safe character in output: U+{:04X} ({:?})\noutput: {:?}",
                    cp, ch, result.data
                );
            }
        }
    }

    proptest! {
        // Interleaves significant lines with repeated noise and checks that
        // (a) every significant line survives, and (b) no noise line keeps a
        // consecutive run longer than 3.
        #[test]
        fn prop_compression_preserves_significant_content(
            significant_lines in prop::collection::vec(significant_line_strategy(), 1..=5),
            noise_line in noise_line_strategy(),
            noise_repeat in 5u32..=10u32,
        ) {
            let preset = default_preset();
            let pipeline = CompressionPipeline::new(&preset);

            let mut lines: Vec<String> = Vec::new();
            for sig in &significant_lines {
                for _ in 0..noise_repeat {
                    lines.push(noise_line.clone());
                }
                lines.push(sig.clone());
            }
            for _ in 0..noise_repeat {
                lines.push(noise_line.clone());
            }

            let input = lines.join("\n");
            let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
            let output = &result.data;

            for sig in &significant_lines {
                prop_assert!(
                    output.contains(sig.as_str()),
                    "significant line missing from output: {:?}\noutput: {:?}",
                    sig,
                    output
                );
            }

            let mut max_run = 0usize;
            let mut current_run = 0usize;
            for line in output.lines() {
                if line == noise_line.as_str() {
                    current_run += 1;
                    max_run = max_run.max(current_run);
                } else {
                    current_run = 0;
                }
            }
            prop_assert!(
                max_run <= 3,
                "noise line {:?} has a consecutive run of {} (max 3)\noutput: {:?}",
                noise_line,
                max_run,
                output
            );
        }
    }
}