//! sqz_engine/pipeline.rs — compression pipeline orchestrator: runs the
//! configured compression stages, then token pruning, dictionary
//! compression, entropy-weighted truncation, and TOON encoding.

1use crate::ansi_strip::AnsiStripper;
2use crate::dict_compressor::DictCompressor;
3use crate::entropy_truncator::EntropyTruncator;
4use crate::error::{Result, SqzError};
5use crate::preset::Preset;
6use crate::prompt_cache::PromptCacheDetector;
7use crate::stages::{
8    CollapseArraysStage, CondenseStage, CustomTransformsStage, FlattenStage, GitDiffFoldStage,
9    KeepFieldsStage, StripFieldsStage, StripNullsStage, TruncateStringsStage,
10};
11use crate::token_counter::TokenCounter;
12use crate::token_pruner::TokenPruner;
13use crate::toon::ToonEncoder;
14use crate::types::{CompressedContent, Content, ContentType, ModelFamily, StageConfig};
15
16/// Minimal session context passed to the pipeline.
17pub struct SessionContext {
18    pub session_id: String,
19}
20
21/// The compression pipeline orchestrator with integrated token pruning,
22/// dictionary compression, and entropy-weighted truncation.
23pub struct CompressionPipeline {
24    stages: Vec<Box<dyn crate::stages::CompressionStage>>,
25    toon_encoder: ToonEncoder,
26    token_counter: TokenCounter,
27    token_pruner: TokenPruner,
28    dict_compressor: std::cell::RefCell<DictCompressor>,
29    entropy_truncator: EntropyTruncator,
30    #[allow(dead_code)]
31    prompt_cache_detector: PromptCacheDetector,
32}
33
34impl CompressionPipeline {
35    /// Construct the pipeline from a preset, creating all built-in stages
36    /// sorted by priority, plus the token pruner, dictionary compressor,
37    /// and entropy truncator.
38    pub fn new(_preset: &Preset) -> Self {
39        let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
40            Box::new(AnsiStripper),
41            Box::new(KeepFieldsStage),
42            Box::new(StripFieldsStage),
43            Box::new(CondenseStage),
44            Box::new(GitDiffFoldStage),
45            Box::new(StripNullsStage),
46            Box::new(FlattenStage),
47            Box::new(TruncateStringsStage),
48            Box::new(CollapseArraysStage),
49            Box::new(CustomTransformsStage),
50        ];
51        stages.sort_by_key(|s| s.priority());
52
53        Self {
54            stages,
55            toon_encoder: ToonEncoder,
56            token_counter: TokenCounter::new(),
57            token_pruner: TokenPruner::new(),
58            dict_compressor: std::cell::RefCell::new(DictCompressor::new()),
59            entropy_truncator: EntropyTruncator::new(),
60            prompt_cache_detector: PromptCacheDetector,
61        }
62    }
63
64    /// Run content through all enabled stages, then apply:
65    /// 1. Dictionary compression + TOON encoding for JSON
66    /// 2. Token pruning for prose/plain text
67    /// 3. Entropy-weighted truncation for long content
68    pub fn compress(
69        &self,
70        input: &str,
71        _ctx: &SessionContext,
72        preset: &Preset,
73    ) -> Result<CompressedContent> {
74        let model_family = model_family_from_preset(preset);
75        let tokens_original = self.token_counter.count(input, &model_family);
76
77        let mut content = Content {
78            raw: input.to_owned(),
79            content_type: ContentType::PlainText,
80            metadata: crate::types::ContentMetadata {
81                source: None,
82                path: None,
83                language: None,
84            },
85            tokens_original,
86        };
87
88        let mut stages_applied: Vec<String> = Vec::new();
89
90        for stage in &self.stages {
91            let config = stage_config_from_preset(stage.name(), preset);
92            if config.enabled {
93                stage.process(&mut content, &config)?;
94                stages_applied.push(stage.name().to_owned());
95            }
96        }
97
98        // Post-stage processing: apply advanced compression techniques
99
100        let is_json = ToonEncoder::is_json(&content.raw);
101
102        // RLE: collapse repeated patterns in non-JSON content (generalizes condense)
103        // Only apply when content is long enough to benefit
104        if !is_json && content.raw.len() > 200 {
105            if let Ok(rle_result) = crate::rle_compressor::rle_compress(&content.raw, 3) {
106                if rle_result.runs_collapsed > 0 {
107                    // Safety check: verify critical markers are preserved
108                    let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
109                    let rle_has_error = rle_result.text.contains("ERROR") || rle_result.text.contains("error:");
110                    if !has_error || rle_has_error {
111                        content.raw = rle_result.text;
112                        stages_applied.push("rle".to_owned());
113                    }
114                }
115            }
116        }
117
118        // Sliding window dedup: catch repeated substrings across non-adjacent lines
119        if !is_json && content.raw.len() > 300 {
120            if let Ok(sw_result) = crate::rle_compressor::sliding_window_dedup(&content.raw, 4) {
121                if sw_result.dedup_count > 0 {
122                    // Safety check: verify critical markers are preserved
123                    let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
124                    let sw_has_error = sw_result.text.contains("ERROR") || sw_result.text.contains("error:");
125                    if !has_error || sw_has_error {
126                        content.raw = sw_result.text;
127                        stages_applied.push("sliding_window_dedup".to_owned());
128                    }
129                }
130            }
131        }
132
133        // Entropy-weighted truncation for long non-JSON content
134        if !is_json && content.raw.len() > 500 {
135            if let Ok(trunc_result) = self.entropy_truncator.truncate_string(&content.raw) {
136                if trunc_result.segments_dropped > 0 {
137                    content.raw = trunc_result.text;
138                    stages_applied.push("entropy_truncate".to_owned());
139                }
140            }
141        }
142
143        // Token pruning for prose content (non-JSON, non-code)
144        if !is_json && content.raw.len() > 100 && looks_like_prose(&content.raw) {
145            if let Ok(prune_result) = self.token_pruner.prune(&content.raw) {
146                if prune_result.tokens_removed > 0 {
147                    content.raw = prune_result.text;
148                    stages_applied.push("token_prune".to_owned());
149                }
150            }
151        }
152
153        // Apply dictionary compression + TOON encoding for JSON
154        let data = if ToonEncoder::is_json(&content.raw) {
155            // Dictionary compression — observe and try to compress.
156            // Only use dict compression if it actually saves bytes (header overhead
157            // can make small payloads larger).
158            self.dict_compressor.borrow_mut().observe(&content.raw);
159            let dict_result = self.dict_compressor.borrow().compress(&content.raw)?;
160
161            if dict_result.substitutions > 0 && dict_result.bytes_saved > 50 {
162                // Dict compression was beneficial — encode the JSON body with TOON
163                stages_applied.push("dict_compress".to_owned());
164                if let Some(newline_pos) = dict_result.data.find("\n{") {
165                    let header = &dict_result.data[..newline_pos + 1];
166                    let json_body = &dict_result.data[newline_pos + 1..];
167                    if let Ok(json) = serde_json::from_str::<serde_json::Value>(json_body) {
168                        let encoded = self.toon_encoder.encode(&json)?;
169                        stages_applied.push("toon_encode".to_owned());
170                        format!("{header}{encoded}")
171                    } else {
172                        dict_result.data
173                    }
174                } else {
175                    dict_result.data
176                }
177            } else {
178                // No beneficial dict substitutions — standard TOON encoding
179                let json: serde_json::Value = serde_json::from_str(&content.raw)
180                    .map_err(|e| SqzError::Other(format!("pipeline: JSON parse error: {e}")))?;
181                let encoded = self.toon_encoder.encode(&json)?;
182                stages_applied.push("toon_encode".to_owned());
183                encoded
184            }
185        } else {
186            content.raw
187        };
188
189        let tokens_compressed = self.token_counter.count(&data, &model_family);
190        let compression_ratio = if tokens_original == 0 {
191            1.0
192        } else {
193            tokens_compressed as f64 / tokens_original as f64
194        };
195
196        Ok(CompressedContent {
197            data,
198            tokens_compressed,
199            tokens_original,
200            stages_applied,
201            compression_ratio,
202            provenance: crate::types::Provenance::default(),
203            verify: None,
204        })
205    }
206
207    /// Insert a plugin stage and re-sort by priority.
208    pub fn insert_stage(&mut self, stage: Box<dyn crate::stages::CompressionStage>) {
209        self.stages.push(stage);
210        self.stages.sort_by_key(|s| s.priority());
211    }
212
213    /// Rebuild stage list from a new preset (hot-reload support).
214    /// Built-in stages are recreated; plugin stages are dropped and must be
215    /// re-inserted by the caller.
216    pub fn reload_preset(&mut self, _preset: &Preset) -> Result<()> {
217        let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
218            Box::new(AnsiStripper),
219            Box::new(KeepFieldsStage),
220            Box::new(StripFieldsStage),
221            Box::new(CondenseStage),
222            Box::new(GitDiffFoldStage),
223            Box::new(StripNullsStage),
224            Box::new(FlattenStage),
225            Box::new(TruncateStringsStage),
226            Box::new(CollapseArraysStage),
227            Box::new(CustomTransformsStage),
228        ];
229        stages.sort_by_key(|s| s.priority());
230        self.stages = stages;
231        // Reset session-level state in dict compressor
232        self.dict_compressor.borrow_mut().reset();
233        Ok(())
234    }
235}
236
237/// Derive the `ModelFamily` from the preset's model configuration.
238fn model_family_from_preset(preset: &Preset) -> ModelFamily {
239    match preset.model.family.to_lowercase().as_str() {
240        "anthropic" | "claude" => ModelFamily::AnthropicClaude,
241        "openai" | "gpt" => ModelFamily::OpenAiGpt,
242        "google" | "gemini" => ModelFamily::GoogleGemini,
243        other => ModelFamily::Local(other.to_string()),
244    }
245}
246
247/// Build a `StageConfig` for a named stage from the preset's compression config.
248fn stage_config_from_preset(name: &str, preset: &Preset) -> StageConfig {
249    let c = &preset.compression;
250    match name {
251        "ansi_strip" => StageConfig {
252            enabled: true,
253            options: serde_json::Value::Object(Default::default()),
254        },
255        "keep_fields" => {
256            if let Some(cfg) = &c.keep_fields {
257                StageConfig {
258                    enabled: cfg.enabled,
259                    options: serde_json::json!({ "fields": cfg.fields }),
260                }
261            } else {
262                StageConfig::default()
263            }
264        }
265        "strip_fields" => {
266            if let Some(cfg) = &c.strip_fields {
267                StageConfig {
268                    enabled: cfg.enabled,
269                    options: serde_json::json!({ "fields": cfg.fields }),
270                }
271            } else {
272                StageConfig::default()
273            }
274        }
275        "condense" => {
276            if let Some(cfg) = &c.condense {
277                StageConfig {
278                    enabled: cfg.enabled,
279                    options: serde_json::json!({
280                        "max_repeated_lines": cfg.max_repeated_lines
281                    }),
282                }
283            } else {
284                StageConfig::default()
285            }
286        }
287        "git_diff_fold" => {
288            if let Some(cfg) = &c.git_diff_fold {
289                StageConfig {
290                    enabled: cfg.enabled,
291                    options: serde_json::json!({
292                        "max_context_lines": cfg.max_context_lines
293                    }),
294                }
295            } else {
296                // Default: enabled with 2 context lines
297                StageConfig {
298                    enabled: true,
299                    options: serde_json::json!({ "max_context_lines": 2 }),
300                }
301            }
302        }
303        "strip_nulls" => {
304            if let Some(cfg) = &c.strip_nulls {
305                StageConfig {
306                    enabled: cfg.enabled,
307                    options: serde_json::Value::Object(Default::default()),
308                }
309            } else {
310                StageConfig::default()
311            }
312        }
313        "flatten" => {
314            if let Some(cfg) = &c.flatten {
315                StageConfig {
316                    enabled: cfg.enabled,
317                    options: serde_json::json!({ "max_depth": cfg.max_depth }),
318                }
319            } else {
320                StageConfig::default()
321            }
322        }
323        "truncate_strings" => {
324            if let Some(cfg) = &c.truncate_strings {
325                StageConfig {
326                    enabled: cfg.enabled,
327                    options: serde_json::json!({ "max_length": cfg.max_length }),
328                }
329            } else {
330                StageConfig::default()
331            }
332        }
333        "collapse_arrays" => {
334            if let Some(cfg) = &c.collapse_arrays {
335                StageConfig {
336                    enabled: cfg.enabled,
337                    options: serde_json::json!({
338                        "max_items": cfg.max_items,
339                        "summary_template": cfg.summary_template
340                    }),
341                }
342            } else {
343                StageConfig::default()
344            }
345        }
346        "custom_transforms" => {
347            if let Some(cfg) = &c.custom_transforms {
348                StageConfig {
349                    enabled: cfg.enabled,
350                    options: serde_json::Value::Object(Default::default()),
351                }
352            } else {
353                StageConfig::default()
354            }
355        }
356        _ => StageConfig::default(),
357    }
358}
359
360/// Heuristic: does this text look like prose (documentation, error messages,
361/// README content) rather than code or structured data?
362fn looks_like_prose(text: &str) -> bool {
363    let lines: Vec<&str> = text.lines().take(20).collect();
364    if lines.is_empty() {
365        return false;
366    }
367
368    let mut prose_lines = 0;
369    let mut code_lines = 0;
370
371    for line in &lines {
372        let trimmed = line.trim();
373        if trimmed.is_empty() {
374            continue;
375        }
376        // Code indicators
377        if trimmed.ends_with('{')
378            || trimmed.ends_with('}')
379            || trimmed.ends_with(';')
380            || trimmed.starts_with("fn ")
381            || trimmed.starts_with("pub ")
382            || trimmed.starts_with("let ")
383            || trimmed.starts_with("const ")
384            || trimmed.starts_with("import ")
385            || trimmed.starts_with("def ")
386            || trimmed.starts_with("class ")
387            || trimmed.contains("::")
388            || trimmed.contains("->")
389            || trimmed.contains("=>")
390        {
391            code_lines += 1;
392        } else {
393            prose_lines += 1;
394        }
395    }
396
397    // Consider it prose if more than half the lines look like prose
398    prose_lines > code_lines
399}
400
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

405#[cfg(test)]
406mod tests {
407    use super::*;
408    use crate::preset::{
409        BudgetConfig, CollapseArraysConfig, CompressionConfig, CondenseConfig,
410        CustomTransformsConfig, ModelConfig, PresetMeta,
411        StripNullsConfig, ToolSelectionConfig, TruncateStringsConfig,
412        TerseModeConfig,
413    };
414
415    fn default_preset() -> Preset {
416        Preset {
417            preset: PresetMeta {
418                name: "test".into(),
419                version: "1.0".into(),
420                description: String::new(),
421            },
422            compression: CompressionConfig {
423                stages: vec![],
424                keep_fields: None,
425                strip_fields: None,
426                condense: Some(CondenseConfig {
427                    enabled: true,
428                    max_repeated_lines: 3,
429                }),
430                git_diff_fold: None,
431                strip_nulls: Some(StripNullsConfig { enabled: true }),
432                flatten: None,
433                truncate_strings: Some(TruncateStringsConfig {
434                    enabled: true,
435                    max_length: 500,
436                }),
437                collapse_arrays: Some(CollapseArraysConfig {
438                    enabled: true,
439                    max_items: 5,
440                    summary_template: "... and {remaining} more items".into(),
441                }),
442                custom_transforms: Some(CustomTransformsConfig { enabled: true }),
443            },
444            tool_selection: ToolSelectionConfig {
445                max_tools: 5,
446                similarity_threshold: 0.7,
447                default_tools: vec![],
448            },
449            budget: BudgetConfig {
450                warning_threshold: 0.70,
451                ceiling_threshold: 0.85,
452                default_window_size: 200_000,
453                agents: Default::default(),
454            },
455            terse_mode: TerseModeConfig {
456                enabled: false,
457                level: crate::preset::TerseLevel::Moderate,
458            },
459            model: ModelConfig {
460                family: "anthropic".into(),
461                primary: "claude-sonnet-4-20250514".into(),
462                local: String::new(),
463                complexity_threshold: 0.4,
464                pricing: None,
465            },
466        }
467    }
468
469    fn ctx() -> SessionContext {
470        SessionContext {
471            session_id: "test-session".into(),
472        }
473    }
474
475    #[test]
476    fn new_creates_pipeline_with_sorted_stages() {
477        let preset = default_preset();
478        let pipeline = CompressionPipeline::new(&preset);
479        // Verify stages are sorted by priority
480        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
481        let mut sorted = priorities.clone();
482        sorted.sort();
483        assert_eq!(priorities, sorted);
484    }
485
486    #[test]
487    fn compress_plain_text_passthrough() {
488        let preset = default_preset();
489        let pipeline = CompressionPipeline::new(&preset);
490        let result = pipeline.compress("hello world", &ctx(), &preset).unwrap();
491        assert_eq!(result.data, "hello world");
492        assert!(!result.stages_applied.contains(&"toon_encode".to_owned()));
493    }
494
495    #[test]
496    fn compress_json_applies_toon() {
497        let preset = default_preset();
498        let pipeline = CompressionPipeline::new(&preset);
499        let json = r#"{"name":"Alice","age":30}"#;
500        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
501        assert!(result.data.starts_with("TOON:"), "data: {}", result.data);
502        assert!(result.stages_applied.contains(&"toon_encode".to_owned()));
503    }
504
505    #[test]
506    fn compress_strips_nulls_from_json() {
507        let preset = default_preset();
508        let pipeline = CompressionPipeline::new(&preset);
509        let json = r#"{"a":1,"b":null}"#;
510        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
511        // After strip_nulls, "b" is gone; TOON encodes the result
512        assert!(result.data.starts_with("TOON:"));
513        // Decode and verify null is gone
514        let decoded = ToonEncoder.decode(&result.data).unwrap();
515        assert!(decoded.get("b").is_none());
516        assert_eq!(decoded["a"], serde_json::json!(1));
517    }
518
519    #[test]
520    fn compress_returns_token_counts() {
521        let preset = default_preset();
522        let pipeline = CompressionPipeline::new(&preset);
523        let input = "a".repeat(100);
524        let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
525        assert!(result.tokens_original > 0);
526        assert!(result.tokens_compressed > 0);
527    }
528
529    #[test]
530    fn compress_ratio_is_reasonable() {
531        let preset = default_preset();
532        let pipeline = CompressionPipeline::new(&preset);
533        let result = pipeline.compress("hello", &ctx(), &preset).unwrap();
534        assert!(result.compression_ratio > 0.0);
535    }
536
537    #[test]
538    fn insert_stage_re_sorts_by_priority() {
539        use crate::stages::CompressionStage;
540        use crate::types::StageConfig;
541
542        struct LowPriorityStage;
543        impl CompressionStage for LowPriorityStage {
544            fn name(&self) -> &str {
545                "low_priority"
546            }
547            fn priority(&self) -> u32 {
548                5 // lower than all built-in stages
549            }
550            fn process(
551                &self,
552                _content: &mut Content,
553                _config: &StageConfig,
554            ) -> crate::error::Result<()> {
555                Ok(())
556            }
557        }
558
559        let preset = default_preset();
560        let mut pipeline = CompressionPipeline::new(&preset);
561        pipeline.insert_stage(Box::new(LowPriorityStage));
562
563        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
564        let mut sorted = priorities.clone();
565        sorted.sort();
566        assert_eq!(priorities, sorted);
567        assert_eq!(pipeline.stages[0].name(), "ansi_strip");
568        assert_eq!(pipeline.stages[1].name(), "low_priority");
569    }
570
571    #[test]
572    fn reload_preset_rebuilds_stages() {
573        let preset = default_preset();
574        let mut pipeline = CompressionPipeline::new(&preset);
575        let original_count = pipeline.stages.len();
576        pipeline.reload_preset(&preset).unwrap();
577        assert_eq!(pipeline.stages.len(), original_count);
578    }
579
580    #[test]
581    fn compress_keep_fields_filters_json() {
582        use crate::preset::KeepFieldsConfig;
583        let mut preset = default_preset();
584        preset.compression.keep_fields = Some(KeepFieldsConfig {
585            enabled: true,
586            fields: vec!["id".into(), "name".into()],
587        });
588        let pipeline = CompressionPipeline::new(&preset);
589        let json = r#"{"id":1,"name":"Bob","debug":"x"}"#;
590        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
591        let decoded = ToonEncoder.decode(&result.data).unwrap();
592        assert!(decoded.get("debug").is_none());
593        assert_eq!(decoded["id"], serde_json::json!(1));
594    }
595
596    #[test]
597    fn compress_empty_string() {
598        let preset = default_preset();
599        let pipeline = CompressionPipeline::new(&preset);
600        let result = pipeline.compress("", &ctx(), &preset).unwrap();
601        assert_eq!(result.data, "");
602        assert_eq!(result.tokens_original, 0);
603    }
604
605    #[test]
606    fn stage_config_from_preset_unknown_stage() {
607        let preset = default_preset();
608        let config = stage_config_from_preset("nonexistent", &preset);
609        assert!(!config.enabled);
610    }
611
612    // ---------------------------------------------------------------------------
613    // Property tests
614    // ---------------------------------------------------------------------------
615
616    use proptest::prelude::*;
617
618    /// Generate a significant line from a fixed set of meaningful tokens.
619    fn significant_line_strategy() -> impl Strategy<Value = String> {
620        prop_oneof![
621            Just("error: connection refused".to_owned()),
622            Just("warning: deprecated API usage".to_owned()),
623            Just("failed: build step exited with code 1".to_owned()),
624            Just("success: deployment complete".to_owned()),
625            Just("status: all checks passed".to_owned()),
626            Just("error: file not found".to_owned()),
627            Just("warning: unused variable detected".to_owned()),
628        ]
629    }
630
631    /// Generate a noise line (repeated decorative content).
632    fn noise_line_strategy() -> impl Strategy<Value = String> {
633        prop_oneof![
634            Just("---".to_owned()),
635            Just("Loading...".to_owned()),
636            Just("================".to_owned()),
637            Just("...".to_owned()),
638        ]
639    }
640
641    /// Recursive strategy that generates arbitrary serde_json::Value instances.
642    /// Mirrors the strategy in toon.rs tests.
643    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
644        let leaf = prop_oneof![
645            Just(serde_json::Value::Null),
646            any::<bool>().prop_map(serde_json::Value::Bool),
647            any::<i64>().prop_map(|n| serde_json::json!(n)),
648            any::<f64>()
649                .prop_filter("must be finite", |f| f.is_finite())
650                .prop_map(|f| serde_json::json!(f)),
651            ".*".prop_map(serde_json::Value::String),
652        ];
653
654        leaf.prop_recursive(4, 64, 8, |inner| {
655            prop_oneof![
656                prop::collection::vec(inner.clone(), 0..8)
657                    .prop_map(serde_json::Value::Array),
658                prop::collection::hash_map(".*", inner, 0..8).prop_map(|m| {
659                    serde_json::Value::Object(m.into_iter().collect())
660                }),
661            ]
662        })
663    }
664
665    proptest! {
666        /// **Validates: Requirements 17.1, 17.2, 13.2**
667        ///
668        /// Property 22: ASCII-safe output.
669        ///
670        /// For any JSON input, the Compression_Pipeline SHALL produce output
671        /// using only ASCII-safe characters: printable ASCII (0x20–0x7E) plus
672        /// standard whitespace (\t = 0x09, \n = 0x0A, \r = 0x0D).
673        #[test]
674        fn prop_pipeline_ascii_safe_output(v in arb_json_value()) {
675            let preset = default_preset();
676            let pipeline = CompressionPipeline::new(&preset);
677
678            let json_input = serde_json::to_string(&v).expect("serialize should not fail");
679            let result = pipeline.compress(&json_input, &ctx(), &preset)
680                .expect("compress should not fail");
681
682            for ch in result.data.chars() {
683                let cp = ch as u32;
684                let is_printable_ascii = cp >= 0x20 && cp <= 0x7E;
685                let is_standard_whitespace = cp == 0x09 || cp == 0x0A || cp == 0x0D;
686                prop_assert!(
687                    is_printable_ascii || is_standard_whitespace,
688                    "non-ASCII-safe character in output: U+{:04X} ({:?})\noutput: {:?}",
689                    cp, ch, result.data
690                );
691            }
692        }
693    }
694
695    proptest! {
696        /// **Validates: Requirements 1.3**
697        ///
698        /// Property 1: Compression preserves semantically significant content.
699        ///
700        /// For any CLI output containing significant tokens (errors, warnings,
701        /// status messages) mixed with noise (repeated identical lines), the
702        /// Compression_Pipeline SHALL produce output that:
703        ///   1. Contains all significant lines.
704        ///   2. Contains each noise line at most `max_repeated_lines` times.
705        #[test]
706        fn prop_compression_preserves_significant_content(
707            significant_lines in prop::collection::vec(significant_line_strategy(), 1..=5),
708            noise_line in noise_line_strategy(),
709            noise_repeat in 5u32..=10u32,
710        ) {
711            let preset = default_preset(); // condense enabled, max_repeated_lines=3
712            let pipeline = CompressionPipeline::new(&preset);
713
714            // Build interleaved input: noise, significant, noise, significant, ...
715            let mut lines: Vec<String> = Vec::new();
716            for sig in &significant_lines {
717                for _ in 0..noise_repeat {
718                    lines.push(noise_line.clone());
719                }
720                lines.push(sig.clone());
721            }
722            // Trailing noise block
723            for _ in 0..noise_repeat {
724                lines.push(noise_line.clone());
725            }
726
727            let input = lines.join("\n");
728            let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
729            let output = &result.data;
730
731            // 1. All significant lines must appear in the output.
732            for sig in &significant_lines {
733                prop_assert!(
734                    output.contains(sig.as_str()),
735                    "significant line missing from output: {:?}\noutput: {:?}",
736                    sig,
737                    output
738                );
739            }
740
741            // 2. No consecutive run of the noise line exceeds max_repeated_lines (3).
742            let mut max_run = 0usize;
743            let mut current_run = 0usize;
744            for line in output.lines() {
745                if line == noise_line.as_str() {
746                    current_run += 1;
747                    max_run = max_run.max(current_run);
748                } else {
749                    current_run = 0;
750                }
751            }
752            prop_assert!(
753                max_run <= 3,
754                "noise line {:?} has a consecutive run of {} (max 3)\noutput: {:?}",
755                noise_line,
756                max_run,
757                output
758            );
759        }
760    }
761}