//! `sqz_engine/pipeline.rs` — the compression pipeline orchestrator.

1use crate::ansi_strip::AnsiStripper;
2use crate::dict_compressor::DictCompressor;
3use crate::entropy_truncator::EntropyTruncator;
4use crate::error::{Result, SqzError};
5use crate::preset::Preset;
6use crate::prompt_cache::PromptCacheDetector;
7use crate::stages::{
8    CollapseArraysStage, CondenseStage, CustomTransformsStage, FlattenStage, GitDiffFoldStage,
9    KeepFieldsStage, StripFieldsStage, StripNullsStage, TruncateStringsStage,
10};
11use crate::token_counter::TokenCounter;
12use crate::token_pruner::TokenPruner;
13use crate::toon::ToonEncoder;
14use crate::types::{CompressedContent, Content, ContentType, ModelFamily, StageConfig};
15
/// Minimal session context passed to the pipeline. Just carries the session
/// ID so stages can log or cache per-session.
pub struct SessionContext {
    // Opaque identifier for the current session; not interpreted here.
    pub session_id: String,
}
21
/// The compression pipeline orchestrator.
///
/// Runs content through a sequence of configurable stages (condense, strip
/// nulls, collapse arrays, etc.), then applies post-processing: RLE
/// compression, sliding-window dedup, entropy truncation, token pruning,
/// dictionary compression, and TOON encoding for JSON.
///
/// Stages are sorted by priority — lower numbers run first. You can insert
/// custom stages with [`insert_stage`](CompressionPipeline::insert_stage).
pub struct CompressionPipeline {
    // Ordered stage list; kept sorted by ascending `priority()`.
    stages: Vec<Box<dyn crate::stages::CompressionStage>>,
    // Encodes/decodes the TOON representation used for JSON payloads.
    toon_encoder: ToonEncoder,
    // Model-family-aware token counter used for before/after metrics.
    token_counter: TokenCounter,
    // Prunes tokens from prose-like content (used in `compress`).
    token_pruner: TokenPruner,
    // RefCell because `compress` takes `&self` but must mutate the
    // dictionary (observe + compress) — single-threaded interior mutability.
    dict_compressor: std::cell::RefCell<DictCompressor>,
    // Drops low-value segments from long non-JSON content.
    entropy_truncator: EntropyTruncator,
    // Not yet consulted anywhere in this impl; kept for future
    // prompt-cache-aware decisions.
    #[allow(dead_code)]
    prompt_cache_detector: PromptCacheDetector,
}
41
42impl CompressionPipeline {
43    /// Construct the pipeline from a preset, creating all built-in stages
44    /// sorted by priority, plus the token pruner, dictionary compressor,
45    /// and entropy truncator.
46    pub fn new(_preset: &Preset) -> Self {
47        let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
48            Box::new(AnsiStripper),
49            Box::new(KeepFieldsStage),
50            Box::new(StripFieldsStage),
51            Box::new(CondenseStage),
52            Box::new(GitDiffFoldStage),
53            Box::new(StripNullsStage),
54            Box::new(FlattenStage),
55            Box::new(TruncateStringsStage),
56            Box::new(CollapseArraysStage),
57            Box::new(CustomTransformsStage),
58        ];
59        stages.sort_by_key(|s| s.priority());
60
61        Self {
62            stages,
63            toon_encoder: ToonEncoder,
64            token_counter: TokenCounter::new(),
65            token_pruner: TokenPruner::new(),
66            dict_compressor: std::cell::RefCell::new(DictCompressor::new()),
67            entropy_truncator: EntropyTruncator::new(),
68            prompt_cache_detector: PromptCacheDetector,
69        }
70    }
71
72    /// Run content through all enabled stages, then apply:
73    /// 1. Dictionary compression + TOON encoding for JSON
74    /// 2. Token pruning for prose/plain text
75    /// 3. Entropy-weighted truncation for long content
76    pub fn compress(
77        &self,
78        input: &str,
79        _ctx: &SessionContext,
80        preset: &Preset,
81    ) -> Result<CompressedContent> {
82        let model_family = model_family_from_preset(preset);
83        let tokens_original = self.token_counter.count(input, &model_family);
84
85        let mut content = Content {
86            raw: input.to_owned(),
87            content_type: ContentType::PlainText,
88            metadata: crate::types::ContentMetadata {
89                source: None,
90                path: None,
91                language: None,
92            },
93            tokens_original,
94        };
95
96        let mut stages_applied: Vec<String> = Vec::new();
97
98        for stage in &self.stages {
99            let config = stage_config_from_preset(stage.name(), preset);
100            if config.enabled {
101                stage.process(&mut content, &config)?;
102                stages_applied.push(stage.name().to_owned());
103            }
104        }
105
106        // Post-stage processing: apply advanced compression techniques
107
108        let is_json = ToonEncoder::is_json(&content.raw);
109
110        // JSON projection: strip internal/debug fields, empty collections,
111        // deep nesting, and redundant timestamps before other JSON processing
112        if is_json && content.raw.len() > 100 {
113            let proj_config = crate::json_projection::ProjectionConfig::default();
114            if let Ok(proj_result) = crate::json_projection::project_json(&content.raw, &proj_config) {
115                if proj_result.fields_removed > 0 {
116                    content.raw = proj_result.data;
117                    stages_applied.push("json_projection".to_owned());
118                }
119            }
120        }
121
122        // RLE: collapse repeated patterns in non-JSON content (generalizes condense)
123        // Only apply when content is long enough to benefit
124        if !is_json && content.raw.len() > 200 {
125            if let Ok(rle_result) = crate::rle_compressor::rle_compress(&content.raw, 3) {
126                if rle_result.runs_collapsed > 0 {
127                    // Safety check: verify critical markers are preserved
128                    let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
129                    let rle_has_error = rle_result.text.contains("ERROR") || rle_result.text.contains("error:");
130                    if !has_error || rle_has_error {
131                        content.raw = rle_result.text;
132                        stages_applied.push("rle".to_owned());
133                    }
134                }
135            }
136        }
137
138        // Sliding window dedup: catch repeated substrings across non-adjacent lines
139        if !is_json && content.raw.len() > 300 {
140            if let Ok(sw_result) = crate::rle_compressor::sliding_window_dedup(&content.raw, 4) {
141                if sw_result.dedup_count > 0 {
142                    // Safety check: verify critical markers are preserved
143                    let has_error = content.raw.contains("ERROR") || content.raw.contains("error:");
144                    let sw_has_error = sw_result.text.contains("ERROR") || sw_result.text.contains("error:");
145                    if !has_error || sw_has_error {
146                        content.raw = sw_result.text;
147                        stages_applied.push("sliding_window_dedup".to_owned());
148                    }
149                }
150            }
151        }
152
153        // Entropy-weighted truncation for long non-JSON content
154        if !is_json && content.raw.len() > 500 {
155            if let Ok(trunc_result) = self.entropy_truncator.truncate_string(&content.raw) {
156                if trunc_result.segments_dropped > 0 {
157                    content.raw = trunc_result.text;
158                    stages_applied.push("entropy_truncate".to_owned());
159                }
160            }
161        }
162
163        // Token pruning for prose content (non-JSON, non-code)
164        if !is_json && content.raw.len() > 100 && looks_like_prose(&content.raw) {
165            if let Ok(prune_result) = self.token_pruner.prune(&content.raw) {
166                if prune_result.tokens_removed > 0 {
167                    content.raw = prune_result.text;
168                    stages_applied.push("token_prune".to_owned());
169                }
170            }
171        }
172
173        // Apply dictionary compression + TOON encoding for JSON
174        let data = if ToonEncoder::is_json(&content.raw) {
175            // Dictionary compression — observe and try to compress.
176            // Only use dict compression if it actually saves bytes (header overhead
177            // can make small payloads larger).
178            self.dict_compressor.borrow_mut().observe(&content.raw);
179            let dict_result = self.dict_compressor.borrow().compress(&content.raw)?;
180
181            if dict_result.substitutions > 0 && dict_result.bytes_saved > 50 {
182                // Dict compression was beneficial — encode the JSON body with TOON
183                stages_applied.push("dict_compress".to_owned());
184                if let Some(newline_pos) = dict_result.data.find("\n{") {
185                    let header = &dict_result.data[..newline_pos + 1];
186                    let json_body = &dict_result.data[newline_pos + 1..];
187                    if let Ok(json) = serde_json::from_str::<serde_json::Value>(json_body) {
188                        let encoded = self.toon_encoder.encode(&json)?;
189                        stages_applied.push("toon_encode".to_owned());
190                        format!("{header}{encoded}")
191                    } else {
192                        dict_result.data
193                    }
194                } else {
195                    dict_result.data
196                }
197            } else {
198                // No beneficial dict substitutions — standard TOON encoding
199                let json: serde_json::Value = serde_json::from_str(&content.raw)
200                    .map_err(|e| SqzError::Other(format!("pipeline: JSON parse error: {e}")))?;
201                let encoded = self.toon_encoder.encode(&json)?;
202                stages_applied.push("toon_encode".to_owned());
203                encoded
204            }
205        } else {
206            content.raw
207        };
208
209        let tokens_compressed = self.token_counter.count(&data, &model_family);
210        let compression_ratio = if tokens_original == 0 {
211            1.0
212        } else {
213            tokens_compressed as f64 / tokens_original as f64
214        };
215
216        Ok(CompressedContent {
217            data,
218            tokens_compressed,
219            tokens_original,
220            stages_applied,
221            compression_ratio,
222            provenance: crate::types::Provenance::default(),
223            verify: None,
224        })
225    }
226
227    /// Insert a plugin stage and re-sort by priority.
228    pub fn insert_stage(&mut self, stage: Box<dyn crate::stages::CompressionStage>) {
229        self.stages.push(stage);
230        self.stages.sort_by_key(|s| s.priority());
231    }
232
233    /// Rebuild stage list from a new preset (hot-reload support).
234    /// Built-in stages are recreated; plugin stages are dropped and must be
235    /// re-inserted by the caller.
236    pub fn reload_preset(&mut self, _preset: &Preset) -> Result<()> {
237        let mut stages: Vec<Box<dyn crate::stages::CompressionStage>> = vec![
238            Box::new(AnsiStripper),
239            Box::new(KeepFieldsStage),
240            Box::new(StripFieldsStage),
241            Box::new(CondenseStage),
242            Box::new(GitDiffFoldStage),
243            Box::new(StripNullsStage),
244            Box::new(FlattenStage),
245            Box::new(TruncateStringsStage),
246            Box::new(CollapseArraysStage),
247            Box::new(CustomTransformsStage),
248        ];
249        stages.sort_by_key(|s| s.priority());
250        self.stages = stages;
251        // Reset session-level state in dict compressor
252        self.dict_compressor.borrow_mut().reset();
253        Ok(())
254    }
255}
256
257/// Derive the `ModelFamily` from the preset's model configuration.
258fn model_family_from_preset(preset: &Preset) -> ModelFamily {
259    match preset.model.family.to_lowercase().as_str() {
260        "anthropic" | "claude" => ModelFamily::AnthropicClaude,
261        "openai" | "gpt" => ModelFamily::OpenAiGpt,
262        "google" | "gemini" => ModelFamily::GoogleGemini,
263        other => ModelFamily::Local(other.to_string()),
264    }
265}
266
267/// Build a `StageConfig` for a named stage from the preset's compression config.
268fn stage_config_from_preset(name: &str, preset: &Preset) -> StageConfig {
269    let c = &preset.compression;
270    match name {
271        "ansi_strip" => StageConfig {
272            enabled: true,
273            options: serde_json::Value::Object(Default::default()),
274        },
275        "keep_fields" => {
276            if let Some(cfg) = &c.keep_fields {
277                StageConfig {
278                    enabled: cfg.enabled,
279                    options: serde_json::json!({ "fields": cfg.fields }),
280                }
281            } else {
282                StageConfig::default()
283            }
284        }
285        "strip_fields" => {
286            if let Some(cfg) = &c.strip_fields {
287                StageConfig {
288                    enabled: cfg.enabled,
289                    options: serde_json::json!({ "fields": cfg.fields }),
290                }
291            } else {
292                StageConfig::default()
293            }
294        }
295        "condense" => {
296            if let Some(cfg) = &c.condense {
297                StageConfig {
298                    enabled: cfg.enabled,
299                    options: serde_json::json!({
300                        "max_repeated_lines": cfg.max_repeated_lines
301                    }),
302                }
303            } else {
304                StageConfig::default()
305            }
306        }
307        "git_diff_fold" => {
308            if let Some(cfg) = &c.git_diff_fold {
309                StageConfig {
310                    enabled: cfg.enabled,
311                    options: serde_json::json!({
312                        "max_context_lines": cfg.max_context_lines
313                    }),
314                }
315            } else {
316                // Default: enabled with 2 context lines
317                StageConfig {
318                    enabled: true,
319                    options: serde_json::json!({ "max_context_lines": 2 }),
320                }
321            }
322        }
323        "strip_nulls" => {
324            if let Some(cfg) = &c.strip_nulls {
325                StageConfig {
326                    enabled: cfg.enabled,
327                    options: serde_json::Value::Object(Default::default()),
328                }
329            } else {
330                StageConfig::default()
331            }
332        }
333        "flatten" => {
334            if let Some(cfg) = &c.flatten {
335                StageConfig {
336                    enabled: cfg.enabled,
337                    options: serde_json::json!({ "max_depth": cfg.max_depth }),
338                }
339            } else {
340                StageConfig::default()
341            }
342        }
343        "truncate_strings" => {
344            if let Some(cfg) = &c.truncate_strings {
345                StageConfig {
346                    enabled: cfg.enabled,
347                    options: serde_json::json!({ "max_length": cfg.max_length }),
348                }
349            } else {
350                StageConfig::default()
351            }
352        }
353        "collapse_arrays" => {
354            if let Some(cfg) = &c.collapse_arrays {
355                StageConfig {
356                    enabled: cfg.enabled,
357                    options: serde_json::json!({
358                        "max_items": cfg.max_items,
359                        "summary_template": cfg.summary_template
360                    }),
361                }
362            } else {
363                StageConfig::default()
364            }
365        }
366        "custom_transforms" => {
367            if let Some(cfg) = &c.custom_transforms {
368                StageConfig {
369                    enabled: cfg.enabled,
370                    options: serde_json::Value::Object(Default::default()),
371                }
372            } else {
373                StageConfig::default()
374            }
375        }
376        _ => StageConfig::default(),
377    }
378}
379
380/// Heuristic: does this text look like prose (documentation, error messages,
381/// README content) rather than code or structured data?
/// Heuristic: does this text look like prose (documentation, error messages,
/// README content) rather than code or structured data?
///
/// Inspects at most the first 20 lines; a non-blank line counts as "code"
/// when it ends in `{`/`}`/`;`, starts with a common declaration keyword,
/// or contains `::`, `->`, or `=>`. Returns true when prose lines strictly
/// outnumber code lines (empty input yields false).
fn looks_like_prose(text: &str) -> bool {
    // Markers that flag a line as code rather than prose.
    const ENDINGS: [char; 3] = ['{', '}', ';'];
    const PREFIXES: [&str; 7] = ["fn ", "pub ", "let ", "const ", "import ", "def ", "class "];
    const INFIXES: [&str; 3] = ["::", "->", "=>"];

    let mut prose_lines = 0usize;
    let mut code_lines = 0usize;

    for line in text.lines().take(20) {
        let trimmed = line.trim();
        if trimmed.is_empty() {
            continue; // blank lines vote for neither side
        }
        let is_code = ENDINGS.iter().any(|&c| trimmed.ends_with(c))
            || PREFIXES.iter().any(|p| trimmed.starts_with(p))
            || INFIXES.iter().any(|s| trimmed.contains(s));
        if is_code {
            code_lines += 1;
        } else {
            prose_lines += 1;
        }
    }

    // Consider it prose if prose lines strictly outnumber code lines.
    // (Empty or all-blank input gives 0 > 0 == false, matching the original.)
    prose_lines > code_lines
}
420
421// ---------------------------------------------------------------------------
422// Tests
423// ---------------------------------------------------------------------------
424
#[cfg(test)]
mod tests {
    use super::*;
    use crate::preset::{
        BudgetConfig, CollapseArraysConfig, CompressionConfig, CondenseConfig,
        CustomTransformsConfig, ModelConfig, PresetMeta,
        StripNullsConfig, ToolSelectionConfig, TruncateStringsConfig,
        TerseModeConfig,
    };

    /// Baseline preset: condense (max 3 repeats), strip_nulls,
    /// truncate_strings (500), collapse_arrays (5) and custom_transforms
    /// enabled; keep/strip fields, git_diff_fold and flatten unset.
    fn default_preset() -> Preset {
        Preset {
            preset: PresetMeta {
                name: "test".into(),
                version: "1.0".into(),
                description: String::new(),
            },
            compression: CompressionConfig {
                stages: vec![],
                keep_fields: None,
                strip_fields: None,
                condense: Some(CondenseConfig {
                    enabled: true,
                    max_repeated_lines: 3,
                }),
                git_diff_fold: None,
                strip_nulls: Some(StripNullsConfig { enabled: true }),
                flatten: None,
                truncate_strings: Some(TruncateStringsConfig {
                    enabled: true,
                    max_length: 500,
                }),
                collapse_arrays: Some(CollapseArraysConfig {
                    enabled: true,
                    max_items: 5,
                    summary_template: "... and {remaining} more items".into(),
                }),
                custom_transforms: Some(CustomTransformsConfig { enabled: true }),
            },
            tool_selection: ToolSelectionConfig {
                max_tools: 5,
                similarity_threshold: 0.7,
                default_tools: vec![],
            },
            budget: BudgetConfig {
                warning_threshold: 0.70,
                ceiling_threshold: 0.85,
                default_window_size: 200_000,
                agents: Default::default(),
            },
            terse_mode: TerseModeConfig {
                enabled: false,
                level: crate::preset::TerseLevel::Moderate,
            },
            model: ModelConfig {
                family: "anthropic".into(),
                primary: "claude-sonnet-4-20250514".into(),
                local: String::new(),
                complexity_threshold: 0.4,
                pricing: None,
            },
        }
    }

    /// Minimal session context for pipeline calls.
    fn ctx() -> SessionContext {
        SessionContext {
            session_id: "test-session".into(),
        }
    }

    #[test]
    fn new_creates_pipeline_with_sorted_stages() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        // Verify stages are sorted by priority
        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
    }

    #[test]
    fn compress_plain_text_passthrough() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello world", &ctx(), &preset).unwrap();
        // Short non-JSON input should pass through unchanged, without TOON.
        assert_eq!(result.data, "hello world");
        assert!(!result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    #[test]
    fn compress_json_applies_toon() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"name":"Alice","age":30}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        assert!(result.data.starts_with("TOON:"), "data: {}", result.data);
        assert!(result.stages_applied.contains(&"toon_encode".to_owned()));
    }

    #[test]
    fn compress_strips_nulls_from_json() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"a":1,"b":null}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        // After strip_nulls, "b" is gone; TOON encodes the result
        assert!(result.data.starts_with("TOON:"));
        // Decode and verify null is gone
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("b").is_none());
        assert_eq!(decoded["a"], serde_json::json!(1));
    }

    #[test]
    fn compress_returns_token_counts() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let input = "a".repeat(100);
        let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
        assert!(result.tokens_original > 0);
        assert!(result.tokens_compressed > 0);
    }

    #[test]
    fn compress_ratio_is_reasonable() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("hello", &ctx(), &preset).unwrap();
        assert!(result.compression_ratio > 0.0);
    }

    #[test]
    fn insert_stage_re_sorts_by_priority() {
        use crate::stages::CompressionStage;
        use crate::types::StageConfig;

        // A no-op stage with priority lower than every built-in stage,
        // so after insertion it must sort to just after ansi_strip.
        struct LowPriorityStage;
        impl CompressionStage for LowPriorityStage {
            fn name(&self) -> &str {
                "low_priority"
            }
            fn priority(&self) -> u32 {
                5 // lower than all built-in stages
            }
            fn process(
                &self,
                _content: &mut Content,
                _config: &StageConfig,
            ) -> crate::error::Result<()> {
                Ok(())
            }
        }

        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        pipeline.insert_stage(Box::new(LowPriorityStage));

        let priorities: Vec<u32> = pipeline.stages.iter().map(|s| s.priority()).collect();
        let mut sorted = priorities.clone();
        sorted.sort();
        assert_eq!(priorities, sorted);
        // NOTE(review): these two asserts imply ansi_strip's priority < 5 —
        // confirm against the stage definitions if priorities ever change.
        assert_eq!(pipeline.stages[0].name(), "ansi_strip");
        assert_eq!(pipeline.stages[1].name(), "low_priority");
    }

    #[test]
    fn reload_preset_rebuilds_stages() {
        let preset = default_preset();
        let mut pipeline = CompressionPipeline::new(&preset);
        let original_count = pipeline.stages.len();
        pipeline.reload_preset(&preset).unwrap();
        assert_eq!(pipeline.stages.len(), original_count);
    }

    #[test]
    fn compress_keep_fields_filters_json() {
        use crate::preset::KeepFieldsConfig;
        let mut preset = default_preset();
        preset.compression.keep_fields = Some(KeepFieldsConfig {
            enabled: true,
            fields: vec!["id".into(), "name".into()],
        });
        let pipeline = CompressionPipeline::new(&preset);
        let json = r#"{"id":1,"name":"Bob","debug":"x"}"#;
        let result = pipeline.compress(json, &ctx(), &preset).unwrap();
        let decoded = ToonEncoder.decode(&result.data).unwrap();
        assert!(decoded.get("debug").is_none());
        assert_eq!(decoded["id"], serde_json::json!(1));
    }

    #[test]
    fn compress_empty_string() {
        let preset = default_preset();
        let pipeline = CompressionPipeline::new(&preset);
        let result = pipeline.compress("", &ctx(), &preset).unwrap();
        assert_eq!(result.data, "");
        assert_eq!(result.tokens_original, 0);
    }

    #[test]
    fn stage_config_from_preset_unknown_stage() {
        let preset = default_preset();
        let config = stage_config_from_preset("nonexistent", &preset);
        // Unknown stage names must resolve to the disabled default.
        assert!(!config.enabled);
    }

    // ---------------------------------------------------------------------------
    // Property tests
    // ---------------------------------------------------------------------------

    use proptest::prelude::*;

    /// Generate a significant line from a fixed set of meaningful tokens.
    fn significant_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("error: connection refused".to_owned()),
            Just("warning: deprecated API usage".to_owned()),
            Just("failed: build step exited with code 1".to_owned()),
            Just("success: deployment complete".to_owned()),
            Just("status: all checks passed".to_owned()),
            Just("error: file not found".to_owned()),
            Just("warning: unused variable detected".to_owned()),
        ]
    }

    /// Generate a noise line (repeated decorative content).
    fn noise_line_strategy() -> impl Strategy<Value = String> {
        prop_oneof![
            Just("---".to_owned()),
            Just("Loading...".to_owned()),
            Just("================".to_owned()),
            Just("...".to_owned()),
        ]
    }

    /// Recursive strategy that generates arbitrary serde_json::Value instances.
    /// Mirrors the strategy in toon.rs tests.
    fn arb_json_value() -> impl Strategy<Value = serde_json::Value> {
        let leaf = prop_oneof![
            Just(serde_json::Value::Null),
            any::<bool>().prop_map(serde_json::Value::Bool),
            any::<i64>().prop_map(|n| serde_json::json!(n)),
            // Non-finite floats are not representable in JSON, so filter them.
            any::<f64>()
                .prop_filter("must be finite", |f| f.is_finite())
                .prop_map(|f| serde_json::json!(f)),
            ".*".prop_map(serde_json::Value::String),
        ];

        // Up to 4 levels deep, ~64 total nodes, up to 8 items per collection.
        leaf.prop_recursive(4, 64, 8, |inner| {
            prop_oneof![
                prop::collection::vec(inner.clone(), 0..8)
                    .prop_map(serde_json::Value::Array),
                prop::collection::hash_map(".*", inner, 0..8).prop_map(|m| {
                    serde_json::Value::Object(m.into_iter().collect())
                }),
            ]
        })
    }

    proptest! {
        /// **Validates: Requirements 17.1, 17.2, 13.2**
        ///
        /// Property 22: ASCII-safe output.
        ///
        /// For any JSON input, the Compression_Pipeline SHALL produce output
        /// using only ASCII-safe characters: printable ASCII (0x20–0x7E) plus
        /// standard whitespace (\t = 0x09, \n = 0x0A, \r = 0x0D).
        #[test]
        fn prop_pipeline_ascii_safe_output(v in arb_json_value()) {
            let preset = default_preset();
            let pipeline = CompressionPipeline::new(&preset);

            let json_input = serde_json::to_string(&v).expect("serialize should not fail");
            let result = pipeline.compress(&json_input, &ctx(), &preset)
                .expect("compress should not fail");

            for ch in result.data.chars() {
                let cp = ch as u32;
                let is_printable_ascii = cp >= 0x20 && cp <= 0x7E;
                let is_standard_whitespace = cp == 0x09 || cp == 0x0A || cp == 0x0D;
                prop_assert!(
                    is_printable_ascii || is_standard_whitespace,
                    "non-ASCII-safe character in output: U+{:04X} ({:?})\noutput: {:?}",
                    cp, ch, result.data
                );
            }
        }
    }

    proptest! {
        /// **Validates: Requirements 1.3**
        ///
        /// Property 1: Compression preserves semantically significant content.
        ///
        /// For any CLI output containing significant tokens (errors, warnings,
        /// status messages) mixed with noise (repeated identical lines), the
        /// Compression_Pipeline SHALL produce output that:
        ///   1. Contains all significant lines.
        ///   2. Contains each noise line at most `max_repeated_lines` times.
        #[test]
        fn prop_compression_preserves_significant_content(
            significant_lines in prop::collection::vec(significant_line_strategy(), 1..=5),
            noise_line in noise_line_strategy(),
            noise_repeat in 5u32..=10u32,
        ) {
            let preset = default_preset(); // condense enabled, max_repeated_lines=3
            let pipeline = CompressionPipeline::new(&preset);

            // Build interleaved input: noise, significant, noise, significant, ...
            let mut lines: Vec<String> = Vec::new();
            for sig in &significant_lines {
                for _ in 0..noise_repeat {
                    lines.push(noise_line.clone());
                }
                lines.push(sig.clone());
            }
            // Trailing noise block
            for _ in 0..noise_repeat {
                lines.push(noise_line.clone());
            }

            let input = lines.join("\n");
            let result = pipeline.compress(&input, &ctx(), &preset).unwrap();
            let output = &result.data;

            // 1. All significant lines must appear in the output.
            for sig in &significant_lines {
                prop_assert!(
                    output.contains(sig.as_str()),
                    "significant line missing from output: {:?}\noutput: {:?}",
                    sig,
                    output
                );
            }

            // 2. No consecutive run of the noise line exceeds max_repeated_lines (3).
            let mut max_run = 0usize;
            let mut current_run = 0usize;
            for line in output.lines() {
                if line == noise_line.as_str() {
                    current_run += 1;
                    max_run = max_run.max(current_run);
                } else {
                    current_run = 0;
                }
            }
            prop_assert!(
                max_run <= 3,
                "noise line {:?} has a consecutive run of {} (max 3)\noutput: {:?}",
                noise_line,
                max_run,
                output
            );
        }
    }
}