1use crate::ansi_strip::AnsiStripper;
2use crate::dict_compressor::DictCompressor;
3use crate::entropy_truncator::EntropyTruncator;
4use crate::error::{Result, SqzError};
5use crate::preset::Preset;
6use crate::prompt_cache::PromptCacheDetector;
7use crate::stages::{
8 CollapseArraysStage, CondenseStage, CustomTransformsStage, FlattenStage, GitDiffFoldStage,
9 KeepFieldsStage, StripFieldsStage, StripNullsStage, TruncateStringsStage,
10};
11use crate::token_counter::TokenCounter;
12use crate::token_pruner::TokenPruner;
13use crate::toon::ToonEncoder;
14use crate::types::{CompressedContent, Content, ContentType, ModelFamily, StageConfig};
15
/// Per-session state handed to [`CompressionPipeline::compress`].
pub struct SessionContext {
    /// Opaque identifier for the calling session. The pipeline itself does
    /// not read it today (`compress` binds it as `_ctx`); it is carried for
    /// callers and future session-scoped behavior.
    pub session_id: String,
}
21
/// Multi-stage text/JSON compressor: runs priority-ordered stages, then a set
/// of size-gated heuristics (RLE, dedup, entropy truncation, token pruning),
/// and finally dictionary + TOON encoding for JSON payloads.
pub struct CompressionPipeline {
    /// Stages kept sorted ascending by `priority()`; run in order in `compress`.
    stages: Vec<Box<dyn crate::stages::CompressionStage>>,
    toon_encoder: ToonEncoder,
    token_counter: TokenCounter,
    token_pruner: TokenPruner,
    /// RefCell: `compress` takes `&self` but the dictionary learns (mutates)
    /// via `observe` on every JSON input.
    dict_compressor: std::cell::RefCell<DictCompressor>,
    entropy_truncator: EntropyTruncator,
    // Constructed but not yet wired into compress(); kept for upcoming use.
    #[allow(dead_code)]
    prompt_cache_detector: PromptCacheDetector,
}
41
42impl CompressionPipeline {
43 pub fn new(_preset: &Preset) -> Self {
47 let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
48 Box::new(AnsiStripper),
49 Box::new(KeepFieldsStage),
50 Box::new(StripFieldsStage),
51 Box::new(CondenseStage),
52 Box::new(GitDiffFoldStage),
53 Box::new(StripNullsStage),
54 Box::new(FlattenStage),
55 Box::new(TruncateStringsStage),
56 Box::new(CollapseArraysStage),
57 Box::new(CustomTransformsStage),
58 ];
59 stages.sort_by_key(|s| s.priority());
60
61 Self {
62 stages,
63 toon_encoder: ToonEncoder,
64 token_counter: TokenCounter::new(),
65 token_pruner: TokenPruner::new(),
66 dict_compressor: std::cell::RefCell::new(DictCompressor::new()),
67 entropy_truncator: EntropyTruncator::new(),
68 prompt_cache_detector: PromptCacheDetector,
69 }
70 }
71
72 pub fn compress(
77 &self,
78 input: &str,
79 _ctx: &SessionContext,
80 preset: &Preset,
81 ) -> Result<CompressedContent> {
82 let model_family = model_family_from_preset(preset);
83 let tokens_original = self.token_counter.count(input, &model_family);
84
85 let mut content = Content {
86 raw: input.to_owned(),
87 content_type: ContentType::PlainText,
88 metadata: crate::types::ContentMetadata {
89 source: None,
90 path: None,
91 language: None,
92 },
93 tokens_original,
94 };
95
96 let mut stages_applied: Vec<String> = Vec::new();
97
98 for stage in &self.stages {
99 let config = stage_config_from_preset(stage.name(), preset);
100 if config.enabled {
101 stage.process(&mut content, &config)?;
102 stages_applied.push(stage.name().to_owned());
103 }
104 }
105
106 let is_json = ToonEncoder::is_json(&content.raw);
109
110 if is_json && content.raw.len() > 100 {
113 let proj_config = crate::json_projection::ProjectionConfig::default();
114 if let Ok(proj_result) = crate::json_projection::project_json(&content.raw, &proj_config) {
115 if proj_result.fields_removed > 0 {
116 content.raw = proj_result.data;
117 stages_applied.push("json_projection".to_owned());
118 }
119 }
120 }
121
122 if !is_json && content.raw.len() > 200 {
125 if let Ok(rle_result) = crate::rle_compressor::rle_compress(&content.raw, 3) {
126 if rle_result.runs_collapsed > 0 {
127 let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
129 let rle_has_error = rle_result.text.contains("ERROR") || rle_result.text.contains("error:");
130 if !has_error || rle_has_error {
131 content.raw = rle_result.text;
132 stages_applied.push("rle".to_owned());
133 }
134 }
135 }
136 }
137
138 if !is_json && content.raw.len() > 300 {
140 if let Ok(sw_result) = crate::rle_compressor::sliding_window_dedup(&content.raw, 4) {
141 if sw_result.dedup_count > 0 {
142 let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
144 let sw_has_error = sw_result.text.contains("ERROR") || sw_result.text.contains("error:");
145 if !has_error || sw_has_error {
146 content.raw = sw_result.text;
147 stages_applied.push("sliding_window_dedup".to_owned());
148 }
149 }
150 }
151 }
152
153 if !is_json && content.raw.len() > 500 {
155 if let Ok(trunc_result) = self.entropy_truncator.truncate_string(&content.raw) {
156 if trunc_result.segments_dropped > 0 {
157 content.raw = trunc_result.text;
158 stages_applied.push("entropy_truncate".to_owned());
159 }
160 }
161 }
162
163 if !is_json && content.raw.len() > 100 && looks_like_prose(&content.raw) {
165 if let Ok(prune_result) = self.token_pruner.prune(&content.raw) {
166 if prune_result.tokens_removed > 0 {
167 content.raw = prune_result.text;
168 stages_applied.push("token_prune".to_owned());
169 }
170 }
171 }
172
173 let data = if ToonEncoder::is_json(&content.raw) {
175 self.dict_compressor.borrow_mut().observe(&content.raw);
179 let dict_result = self.dict_compressor.borrow().compress(&content.raw)?;
180
181 if dict_result.substitutions > 0 && dict_result.bytes_saved > 50 {
182 stages_applied.push("dict_compress".to_owned());
184 if let Some(newline_pos) = dict_result.data.find("\n{") {
185 let header = &dict_result.data[..newline_pos + 1];
186 let json_body = &dict_result.data[newline_pos + 1..];
187 if let Ok(json) = serde_json::from_str::<serde_json::Value>(json_body) {
188 let encoded = self.toon_encoder.encode(&json)?;
189 stages_applied.push("toon_encode".to_owned());
190 format!("{header}{encoded}")
191 } else {
192 dict_result.data
193 }
194 } else {
195 dict_result.data
196 }
197 } else {
198 let json: serde_json::Value = serde_json::from_str(&content.raw)
200 .map_err(|e| SqzError::Other(format!("pipeline: JSON parse error: {e}")))?;
201 let encoded = self.toon_encoder.encode(&json)?;
202 stages_applied.push("toon_encode".to_owned());
203 encoded
204 }
205 } else {
206 content.raw
207 };
208
209 let tokens_compressed = self.token_counter.count(&data, &model_family);
210 let compression_ratio = if tokens_original == 0 {
211 1.0
212 } else {
213 tokens_compressed as f64 / tokens_original as f64
214 };
215
216 Ok(CompressedContent {
217 data,
218 tokens_compressed,
219 tokens_original,
220 stages_applied,
221 compression_ratio,
222 provenance: crate::types::Provenance::default(),
223 verify: None,
224 })
225 }
226
227 pub fn insert_stage(&mut self, stage: Box<dyn crate::stages::CompressionStage>) {
229 self.stages.push(stage);
230 self.stages.sort_by_key(|s| s.priority());
231 }
232
233 pub fn reload_preset(&mut self, _preset: &Preset) -> Result<()> {
237 let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
238 Box::new(AnsiStripper),
239 Box::new(KeepFieldsStage),
240 Box::new(StripFieldsStage),
241 Box::new(CondenseStage),
242 Box::new(GitDiffFoldStage),
243 Box::new(StripNullsStage),
244 Box::new(FlattenStage),
245 Box::new(TruncateStringsStage),
246 Box::new(CollapseArraysStage),
247 Box::new(CustomTransformsStage),
248 ];
249 stages.sort_by_key(|s| s.priority());
250 self.stages = stages;
251 self.dict_compressor.borrow_mut().reset();
253 Ok(())
254 }
255}
256
257fn model_family_from_preset(preset: &Preset) -> ModelFamily {
259 match preset.model.family.to_lowercase().as_str() {
260 "anthropic" | "claude" => ModelFamily::AnthropicClaude,
261 "openai" | "gpt" => ModelFamily::OpenAiGpt,
262 "google" | "gemini" => ModelFamily::GoogleGemini,
263 other => ModelFamily::Local(other.to_string()),
264 }
265}
266
267fn stage_config_from_preset(name: &str, preset: &Preset) -> StageConfig {
269 let c = &preset.compression;
270 match name {
271 "ansi_strip" => StageConfig {
272 enabled: true,
273 options: serde_json::Value::Object(Default::default()),
274 },
275 "keep_fields" => {
276 if let Some(cfg) = &c.keep_fields {
277 StageConfig {
278 enabled: cfg.enabled,
279 options: serde_json::json!({ "fields": cfg.fields }),
280 }
281 } else {
282 StageConfig::default()
283 }
284 }
285 "strip_fields" => {
286 if let Some(cfg) = &c.strip_fields {
287 StageConfig {
288 enabled: cfg.enabled,
289 options: serde_json::json!({ "fields": cfg.fields }),
290 }
291 } else {
292 StageConfig::default()
293 }
294 }
295 "condense" => {
296 if let Some(cfg) = &c.condense {
297 StageConfig {
298 enabled: cfg.enabled,
299 options: serde_json::json!({
300 "max_repeated_lines": cfg.max_repeated_lines
301 }),
302 }
303 } else {
304 StageConfig::default()
305 }
306 }
307 "git_diff_fold" => {
308 if let Some(cfg) = &c.git_diff_fold {
309 StageConfig {
310 enabled: cfg.enabled,
311 options: serde_json::json!({
312 "max_context_lines": cfg.max_context_lines
313 }),
314 }
315 } else {
316 StageConfig {
318 enabled: true,
319 options: serde_json::json!({ "max_context_lines": 2 }),
320 }
321 }
322 }
323 "strip_nulls" => {
324 if let Some(cfg) = &c.strip_nulls {
325 StageConfig {
326 enabled: cfg.enabled,
327 options: serde_json::Value::Object(Default::default()),
328 }
329 } else {
330 StageConfig::default()
331 }
332 }
333 "flatten" => {
334 if let Some(cfg) = &c.flatten {
335 StageConfig {
336 enabled: cfg.enabled,
337 options: serde_json::json!({ "max_depth": cfg.max_depth }),
338 }
339 } else {
340 StageConfig::default()
341 }
342 }
343 "truncate_strings" => {
344 if let Some(cfg) = &c.truncate_strings {
345 StageConfig {
346 enabled: cfg.enabled,
347 options: serde_json::json!({ "max_length": cfg.max_length }),
348 }
349 } else {
350 StageConfig::default()
351 }
352 }
353 "collapse_arrays" => {
354 if let Some(cfg) = &c.collapse_arrays {
355 StageConfig {
356 enabled: cfg.enabled,
357 options: serde_json::json!({
358 "max_items": cfg.max_items,
359 "summary_template": cfg.summary_template
360 }),
361 }
362 } else {
363 StageConfig::default()
364 }
365 }
366 "custom_transforms" => {
367 if let Some(cfg) = &c.custom_transforms {
368 StageConfig {
369 enabled: cfg.enabled,
370 options: serde_json::Value::Object(Default::default()),
371 }
372 } else {
373 StageConfig::default()
374 }
375 }
376 _ => StageConfig::default(),
377 }
378}
379
/// Heuristically decides whether `text` reads as prose rather than code, by
/// classifying up to its first 20 non-empty lines and requiring a strict
/// majority of prose lines. Empty input yields `false` (0 > 0).
fn looks_like_prose(text: &str) -> bool {
    const CODE_PREFIXES: [&str; 7] = [
        "fn ", "pub ", "let ", "const ", "import ", "def ", "class ",
    ];

    let mut prose = 0usize;
    let mut code = 0usize;

    for raw in text.lines().take(20) {
        let line = raw.trim();
        if line.is_empty() {
            continue;
        }
        // Code markers: statement/block terminators, declaration keywords,
        // and operator tokens common to Rust/Python/JS-style sources.
        let code_suffix = ['{', '}', ';'].iter().any(|&c| line.ends_with(c));
        let code_prefix = CODE_PREFIXES.iter().any(|p| line.starts_with(p));
        let code_infix =
            line.contains("::") || line.contains("->") || line.contains("=>");

        if code_suffix || code_prefix || code_infix {
            code += 1;
        } else {
            prose += 1;
        }
    }

    prose > code
}
420
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preset::{
        BudgetConfig, CollapseArraysConfig, CompressionConfig, CondenseConfig,
        CustomTransformsConfig, ModelConfig, PresetMeta,
        StripNullsConfig, ToolSelectionConfig, TruncateStringsConfig,
        TerseModeConfig,
    };

    /// Minimal fixture preset: anthropic model family with condense,
    /// strip_nulls, truncate_strings, collapse_arrays and custom_transforms
    /// enabled; every other stage left unset (None).
    fn default_preset() -> Preset {
        Preset {
            preset: PresetMeta {
                name: "test".into(),
                version: "1.0".into(),
                description: String::new(),
            },
            compression: CompressionConfig {
                stages: vec![],
                keep_fields: None,
                strip_fields: None,
                condense: Some(CondenseConfig {
                    enabled: true,
                    max_repeated_lines: 3,
                }),
                git_diff_fold: None,
                strip_nulls: Some(StripNullsConfig { enabled: true }),
                flatten: None,
                truncate_strings: Some(TruncateStringsConfig {
                    enabled: true,
                    max_length: 500,
                }),
                collapse_arrays: Some(CollapseArraysConfig {
                    enabled: true,
                    max_items: 5,
                    summary_template: "... and {remaining} more items".into(),
                }),
                custom_transforms: Some(CustomTransformsConfig { enabled: true }),
            },
            tool_selection: ToolSelectionConfig {
                max_tools: 5,
                similarity_threshold: 0.7,
                default_tools: vec![],
            },
            budget: BudgetConfig {
                warning_threshold: 0.70,
                ceiling_threshold: 0.85,
                default_window_size: 200_000,
                agents: Default::default(),
            },
            terse_mode: TerseModeConfig {
                enabled: false,
                level: crate::preset::TerseLevel::Moderate,
            },
            model: ModelConfig {
                family: "anthropic".into(),
                primary: "claude-sonnet-4-20250514".into(),
                local: String::new(),
                complexity_threshold: 0.4,
                pricing: None,
            },
        }
    }

    /// Session fixture; the pipeline currently ignores the session id.
    fn ctx() -> SessionContext {
        SessionContext {
            session_id: "test-session".into(),
        }
    }

    // Construction sorts stages ascending by priority.
    #[test]
    fn new_creates_pipeline_with_sorted_stages() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
    }

    // Short plain text passes through unchanged and never hits TOON encoding.
    #[test]
    fn compress_plain_text_passthrough() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello world", &ctx(), &preset).unwrap();
        assert_eq!(result.data, "hello world");
        assert!(!result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    // JSON input is TOON-encoded and the stage is recorded.
    #[test]
    fn compress_json_applies_toon() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"name":"Alice","age":30}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        assert!(result.data.starts_with("TOON:"), "data: {}", result.data);
        assert!(result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    // strip_nulls is enabled in the fixture, so null-valued keys vanish
    // from the decoded round-trip.
    #[test]
    fn compress_strips_nulls_from_json() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"a":1,"b":null}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        assert!(result.data.starts_with("TOON:"));
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("b").is_none());
        assert_eq!(decoded["a"], serde_json::json!(1));
    }

    // Non-empty input yields non-zero token counts on both sides.
    #[test]
    fn compress_returns_token_counts() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let input = "a".repeat(100);
        let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
        assert!(result.tokens_original > 0);
        assert!(result.tokens_compressed > 0);
    }

    // The ratio is always strictly positive (1.0 for empty input).
    #[test]
    fn compress_ratio_is_reasonable() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello", &ctx(), &preset).unwrap();
        assert!(result.compression_ratio > 0.0);
    }

    // Inserting a stage keeps the list sorted; a priority-5 stage lands
    // right after ansi_strip.
    #[test]
    fn insert_stage_re_sorts_by_priority() {
        use crate::stages::CompressionStage;
        use crate::types::StageConfig;

        // No-op stage with a low (early) priority used only for ordering.
        struct LowPriorityStage;
        impl CompressionStage for LowPriorityStage {
            fn name(&self) -> &str {
                "low_priority"
            }
            fn priority(&self) -> u32 {
                5
            }
            fn process(
                &self,
                _content: &mut Content,
                _config: &StageConfig,
            ) -> crate::error::Result<()> {
                Ok(())
            }
        }

        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        pipeline.insert_stage(Box::new(LowPriorityStage));

        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
        assert_eq!(pipeline.stages[0].name(), "ansi_strip");
        assert_eq!(pipeline.stages[1].name(), "low_priority");
    }

    // Reloading the preset rebuilds the same default stage set.
    #[test]
    fn reload_preset_rebuilds_stages() {
        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        let original_count = pipeline.stages.len();
        pipeline.reload_preset(&preset).unwrap();
        assert_eq!(pipeline.stages.len(), original_count);
    }

    // keep_fields whitelist drops every key not listed.
    #[test]
    fn compress_keep_fields_filters_json() {
        use crate::preset::KeepFieldsConfig;
        let mut preset = default_preset();
        preset.compression.keep_fields = Some(KeepFieldsConfig {
            enabled: true,
            fields: vec!["id".into(), "name".into()],
        });
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"id":1,"name":"Bob","debug":"x"}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("debug").is_none());
        assert_eq!(decoded["id"], serde_json::json!(1));
    }

    // Empty input: empty output, zero original tokens, no crash.
    #[test]
    fn compress_empty_string() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("", &ctx(), &preset).unwrap();
        assert_eq!(result.data, "");
        assert_eq!(result.tokens_original, 0);
    }

    // Unknown stage names fall through to the disabled default config.
    #[test]
    fn stage_config_from_preset_unknown_stage() {
        let preset = default_preset();
        let config = stage_config_from_preset("nonexistent", &preset);
        assert!(!config.enabled);
    }

    use proptest::prelude::*;

    /// Lines the pipeline must never drop (errors, warnings, statuses).
    fn significant_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("error: connection refused".to_owned()),
            Just("warning: deprecated API usage".to_owned()),
            Just("failed: build step exited with code 1".to_owned()),
            Just("success: deployment complete".to_owned()),
            Just("status: all checks passed".to_owned()),
            Just("error: file not found".to_owned()),
            Just("warning: unused variable detected".to_owned()),
        ]
    }

    /// Filler lines the pipeline is expected to collapse.
    fn noise_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("---".to_owned()),
            Just("Loading...".to_owned()),
            Just("================".to_owned()),
            Just("...".to_owned()),
        ]
    }

    /// Arbitrary JSON values: scalar leaves recursively nested into arrays
    /// and objects (depth <= 4, <= 64 total nodes, <= 8 children per level).
    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
        let leaf = prop_oneof![
            Just(serde_json::Value::Null),
            any::<bool>().prop_map(serde_json::Value::Bool),
            any::<i64>().prop_map(|n| serde_json::json!(n)),
            any::<f64>()
                .prop_filter("must be finite", |f| f.is_finite())
                .prop_map(|f| serde_json::json!(f)),
            ".*".prop_map(serde_json::Value::String),
        ];

        leaf.prop_recursive(4, 64, 8, |inner| {
            prop_oneof![
                prop::collection::vec(inner.clone(), 0..8)
                    .prop_map(serde_json::Value::Array),
                prop::collection::hash_map(".*", inner, 0..8).prop_map(|m| {
                    serde_json::Value::Object(m.into_iter().collect())
                }),
            ]
        })
    }

    proptest! {
        // Property: for any JSON input, the compressed output contains only
        // printable ASCII plus tab/newline/carriage return.
        #[test]
        fn prop_pipeline_ascii_safe_output(v in arb_json_value()) {
            let preset = default_preset();
            let pipeline = CompressionPipeline::new(&preset);

            let json_input = serde_json::to_string(&v).expect("serialize should not fail");
            let result = pipeline.compress(&json_input, &ctx(), &preset)
                .expect("compress should not fail");

            for ch in result.data.chars() {
                let cp = ch as u32;
                let is_printable_ascii = cp >= 0x20 && cp <= 0x7E;
                let is_standard_whitespace = cp == 0x09 || cp == 0x0A || cp == 0x0D;
                prop_assert!(
                    is_printable_ascii || is_standard_whitespace,
                    "non-ASCII-safe character in output: U+{:04X} ({:?})\noutput: {:?}",
                    cp, ch, result.data
                );
            }
        }
    }

    proptest! {
        // Property: significant lines interleaved with repeated noise survive
        // compression, while any run of identical noise lines is collapsed to
        // at most 3 consecutive occurrences.
        #[test]
        fn prop_compression_preserves_significant_content(
            significant_lines in prop::collection::vec(significant_line_strategy(), 1..=5),
            noise_line in noise_line_strategy(),
            noise_repeat in 5u32..=10u32,
        ) {
            let preset = default_preset();
            let pipeline = CompressionPipeline::new(&preset);

            // Build: N noise lines before each significant line, plus a
            // trailing noise block.
            let mut lines: Vec<String> = Vec::new();
            for sig in &significant_lines {
                for _ in 0..noise_repeat {
                    lines.push(noise_line.clone());
                }
                lines.push(sig.clone());
            }
            for _ in 0..noise_repeat {
                lines.push(noise_line.clone());
            }

            let input = lines.join("\n");
            let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
            let output = &result.data;

            // Every significant line must survive verbatim.
            for sig in &significant_lines {
                prop_assert!(
                    output.contains(sig.as_str()),
                    "significant line missing from output: {:?}\noutput: {:?}",
                    sig,
                    output
                );
            }

            // Longest consecutive run of the noise line in the output.
            let mut max_run = 0usize;
            let mut current_run = 0usize;
            for line in output.lines() {
                if line == noise_line.as_str() {
                    current_run += 1;
                    max_run = max_run.max(current_run);
                } else {
                    current_run = 0;
                }
            }
            prop_assert!(
                max_run <= 3,
                "noise line {:?} has a consecutive run of {} (max 3)\noutput: {:?}",
                noise_line,
                max_run,
                output
            );
        }
    }
}