Skip to main content

sqz_engine/
stages.rs

1use crate::error::{Result, SqzError};
2use crate::toon::ToonEncoder;
3use crate::types::{Content, ContentType, StageConfig};
4
/// A single compression stage in the pipeline.
///
/// Each stage transforms `Content` in place according to its `StageConfig`.
/// Stages must check `config.enabled` and return early (no-op) when disabled.
///
/// Implementors must be `Send + Sync`.
pub trait CompressionStage: Send + Sync {
    /// Identifier of the stage (e.g. `"keep_fields"`).
    fn name(&self) -> &str;

    /// Numeric priority of the stage.
    ///
    /// NOTE(review): presumably lower values run earlier in the pipeline —
    /// confirm against the pipeline code.
    fn priority(&self) -> u32;

    /// Transforms `content` in place using the options in `config`.
    ///
    /// Returns an error when the stage cannot apply its transform.
    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()>;
}
14
15// ---------------------------------------------------------------------------
16// Helper: parse raw as JSON, apply a transform, serialize back
17// ---------------------------------------------------------------------------
18
19fn with_json<F>(content: &mut Content, f: F) -> Result<()>
20where
21    F: FnOnce(&mut serde_json::Value) -> Result<()>,
22{
23    if !ToonEncoder::is_json(&content.raw) {
24        return Ok(());
25    }
26    let mut value: serde_json::Value = serde_json::from_str(&content.raw)?;
27    f(&mut value)?;
28    content.raw = serde_json::to_string(&value)?;
29    Ok(())
30}
31
32// ---------------------------------------------------------------------------
33// Stage 1: keep_fields
34// ---------------------------------------------------------------------------
35
36/// For JSON content, keep only the specified top-level fields; drop all others.
37/// Config options: `fields` — array of field name strings.
38/// Non-JSON content passes through unchanged.
39pub struct KeepFieldsStage;
40
41impl CompressionStage for KeepFieldsStage {
42    fn name(&self) -> &str {
43        "keep_fields"
44    }
45
46    fn priority(&self) -> u32 {
47        10
48    }
49
50    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
51        if !config.enabled {
52            return Ok(());
53        }
54        let fields: Vec<String> = match config.options.get("fields") {
55            Some(v) => serde_json::from_value(v.clone())
56                .map_err(|e| SqzError::Other(format!("keep_fields: invalid fields option: {e}")))?,
57            None => return Ok(()),
58        };
59        if fields.is_empty() {
60            return Ok(());
61        }
62        with_json(content, |value| {
63            if let serde_json::Value::Object(map) = value {
64                map.retain(|k, _| fields.contains(k));
65            }
66            Ok(())
67        })
68    }
69}
70
71// ---------------------------------------------------------------------------
72// Stage 2: strip_fields
73// ---------------------------------------------------------------------------
74
/// For JSON content, remove specified fields by key name.
/// Supports dot-notation for nested fields (e.g. "metadata.internal_id");
/// each path is split on `.` and followed through nested objects only.
/// Config options: `fields` — array of field path strings.
/// Non-JSON content passes through unchanged.
pub struct StripFieldsStage;
80
81fn strip_field_path(value: &mut serde_json::Value, path: &[&str]) {
82    if path.is_empty() {
83        return;
84    }
85    if let serde_json::Value::Object(map) = value {
86        if path.len() == 1 {
87            map.remove(path[0]);
88        } else {
89            if let Some(child) = map.get_mut(path[0]) {
90                strip_field_path(child, &path[1..]);
91            }
92        }
93    }
94}
95
96impl CompressionStage for StripFieldsStage {
97    fn name(&self) -> &str {
98        "strip_fields"
99    }
100
101    fn priority(&self) -> u32 {
102        20
103    }
104
105    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
106        if !config.enabled {
107            return Ok(());
108        }
109        let fields: Vec<String> = match config.options.get("fields") {
110            Some(v) => serde_json::from_value(v.clone())
111                .map_err(|e| SqzError::Other(format!("strip_fields: invalid fields option: {e}")))?,
112            None => return Ok(()),
113        };
114        if fields.is_empty() {
115            return Ok(());
116        }
117        with_json(content, |value| {
118            for field in &fields {
119                let parts: Vec<&str> = field.split('.').collect();
120                strip_field_path(value, &parts);
121            }
122            Ok(())
123        })
124    }
125}
126
127// ---------------------------------------------------------------------------
128// Stage 3: condense
129// ---------------------------------------------------------------------------
130
131/// For plain text / CLI output, collapse runs of repeated identical lines
132/// down to at most `max_repeated_lines`.
133/// Config options: `max_repeated_lines` (u32, default 3).
134/// Non-plain-text content passes through unchanged.
135pub struct CondenseStage;
136
137impl CompressionStage for CondenseStage {
138    fn name(&self) -> &str {
139        "condense"
140    }
141
142    fn priority(&self) -> u32 {
143        30
144    }
145
146    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
147        if !config.enabled {
148            return Ok(());
149        }
150        // Only apply to plain text and CLI output
151        match &content.content_type {
152            ContentType::PlainText | ContentType::CliOutput { .. } => {}
153            _ => return Ok(()),
154        }
155
156        let max_repeated: u32 = config
157            .options
158            .get("max_repeated_lines")
159            .and_then(|v| v.as_u64())
160            .map(|v| v as u32)
161            .unwrap_or(3);
162
163        let mut result = Vec::new();
164        let mut current_line: Option<&str> = None;
165        let mut run_count: u32 = 0;
166
167        for line in content.raw.lines() {
168            match current_line {
169                Some(prev) if prev == line => {
170                    run_count += 1;
171                    if run_count <= max_repeated {
172                        result.push(line);
173                    }
174                }
175                _ => {
176                    current_line = Some(line);
177                    run_count = 1;
178                    result.push(line);
179                }
180            }
181        }
182
183        // Preserve trailing newline if original had one
184        let trailing_newline = content.raw.ends_with('\n');
185        content.raw = result.join("\n");
186        if trailing_newline {
187            content.raw.push('\n');
188        }
189        Ok(())
190    }
191}
192
193// ---------------------------------------------------------------------------
194// Stage 4: strip_nulls
195// ---------------------------------------------------------------------------
196
/// For JSON content, recursively remove all null-valued fields from objects.
/// Arrays keep their null elements, but objects nested inside arrays are
/// still cleaned.
/// Config options: `enabled` (bool).
/// Non-JSON content passes through unchanged.
pub struct StripNullsStage;
201
202fn strip_nulls_recursive(value: &mut serde_json::Value) {
203    match value {
204        serde_json::Value::Object(map) => {
205            map.retain(|_, v| !v.is_null());
206            for v in map.values_mut() {
207                strip_nulls_recursive(v);
208            }
209        }
210        serde_json::Value::Array(arr) => {
211            for item in arr.iter_mut() {
212                strip_nulls_recursive(item);
213            }
214        }
215        _ => {}
216    }
217}
218
219impl CompressionStage for StripNullsStage {
220    fn name(&self) -> &str {
221        "strip_nulls"
222    }
223
224    fn priority(&self) -> u32 {
225        40
226    }
227
228    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
229        if !config.enabled {
230            return Ok(());
231        }
232        with_json(content, |value| {
233            strip_nulls_recursive(value);
234            Ok(())
235        })
236    }
237}
238
239// ---------------------------------------------------------------------------
240// Stage 5: flatten
241// ---------------------------------------------------------------------------
242
/// For JSON content, flatten nested objects up to `max_depth` levels using
/// dot-notation for flattened keys (e.g. `{"a":{"b":1}}` → `{"a.b":1}`).
/// Only a top-level JSON object is flattened; arrays are kept as values.
/// Config options: `max_depth` (u32, default 3).
/// Non-JSON content passes through unchanged.
pub struct FlattenStage;
248
249fn flatten_value(
250    value: &serde_json::Value,
251    prefix: &str,
252    depth: u32,
253    max_depth: u32,
254    out: &mut serde_json::Map<String, serde_json::Value>,
255) {
256    if let serde_json::Value::Object(map) = value {
257        if depth < max_depth {
258            for (k, v) in map {
259                let new_key = if prefix.is_empty() {
260                    k.clone()
261                } else {
262                    format!("{prefix}.{k}")
263                };
264                flatten_value(v, &new_key, depth + 1, max_depth, out);
265            }
266            return;
267        }
268    }
269    out.insert(prefix.to_owned(), value.clone());
270}
271
272impl CompressionStage for FlattenStage {
273    fn name(&self) -> &str {
274        "flatten"
275    }
276
277    fn priority(&self) -> u32 {
278        50
279    }
280
281    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
282        if !config.enabled {
283            return Ok(());
284        }
285        let max_depth: u32 = config
286            .options
287            .get("max_depth")
288            .and_then(|v| v.as_u64())
289            .map(|v| v as u32)
290            .unwrap_or(3);
291
292        with_json(content, |value| {
293            if let serde_json::Value::Object(map) = value {
294                let mut out = serde_json::Map::new();
295                for (k, v) in map.iter() {
296                    flatten_value(v, k, 1, max_depth, &mut out);
297                }
298                *map = out;
299            }
300            Ok(())
301        })
302    }
303}
304
305// ---------------------------------------------------------------------------
306// Stage 6: truncate_strings
307// ---------------------------------------------------------------------------
308
/// For JSON content, truncate string values longer than `max_length` chars,
/// appending "..." to indicate truncation (the marker is added on top of the
/// `max_length` kept characters). Object keys are not truncated.
/// Config options: `max_length` (u32, default 500).
/// Non-JSON content passes through unchanged.
pub struct TruncateStringsStage;
314
315fn truncate_strings_recursive(value: &mut serde_json::Value, max_length: usize) {
316    match value {
317        serde_json::Value::String(s) => {
318            if s.chars().count() > max_length {
319                let truncated: String = s.chars().take(max_length).collect();
320                *s = format!("{truncated}...");
321            }
322        }
323        serde_json::Value::Object(map) => {
324            for v in map.values_mut() {
325                truncate_strings_recursive(v, max_length);
326            }
327        }
328        serde_json::Value::Array(arr) => {
329            for item in arr.iter_mut() {
330                truncate_strings_recursive(item, max_length);
331            }
332        }
333        _ => {}
334    }
335}
336
337impl CompressionStage for TruncateStringsStage {
338    fn name(&self) -> &str {
339        "truncate_strings"
340    }
341
342    fn priority(&self) -> u32 {
343        60
344    }
345
346    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
347        if !config.enabled {
348            return Ok(());
349        }
350        let max_length: usize = config
351            .options
352            .get("max_length")
353            .and_then(|v| v.as_u64())
354            .map(|v| v as usize)
355            .unwrap_or(500);
356
357        with_json(content, |value| {
358            truncate_strings_recursive(value, max_length);
359            Ok(())
360        })
361    }
362}
363
364// ---------------------------------------------------------------------------
365// Stage 7: collapse_arrays
366// ---------------------------------------------------------------------------
367
/// For JSON content, if an array has more than `max_items` elements:
/// 1. First, try tabular encoding: if all elements are objects with the same
///    keys, replace the whole array with a single string element holding a
///    header row + data rows for maximum compression.
/// 2. Otherwise, keep the first `max_items` and replace the rest with a
///    summary string element.
///
/// Nested arrays and arrays inside objects are processed as well.
///
/// Config options:
///   - `max_items` (u32, default 5)
///   - `summary_template` (string, default "... and {remaining} more items");
///     `{remaining}` is replaced with the number of dropped elements.
/// Non-JSON content passes through unchanged.
pub struct CollapseArraysStage;
379
380/// Check if all elements in an array are objects with the same set of keys.
381/// Returns the shared keys in a stable order if uniform, None otherwise.
382fn detect_uniform_array(arr: &[serde_json::Value]) -> Option<Vec<String>> {
383    if arr.len() < 2 {
384        return None;
385    }
386
387    let first_keys: Vec<String> = match &arr[0] {
388        serde_json::Value::Object(map) => {
389            if map.is_empty() {
390                return None;
391            }
392            map.keys().cloned().collect()
393        }
394        _ => return None,
395    };
396
397    // Check that every element is an object with exactly the same keys
398    for item in &arr[1..] {
399        match item {
400            serde_json::Value::Object(map) => {
401                if map.len() != first_keys.len() {
402                    return None;
403                }
404                for key in &first_keys {
405                    if !map.contains_key(key) {
406                        return None;
407                    }
408                }
409            }
410            _ => return None,
411        }
412    }
413
414    Some(first_keys)
415}
416
417/// Encode a uniform array of objects as a compact tabular string:
418/// `[headers] | col1 | col2 | ... \n val1 | val2 | ...`
419fn encode_tabular(arr: &[serde_json::Value], keys: &[String]) -> String {
420    let mut lines = Vec::with_capacity(arr.len() + 1);
421
422    // Header row
423    lines.push(keys.join(" | "));
424
425    // Data rows
426    for item in arr {
427        if let serde_json::Value::Object(map) = item {
428            let row: Vec<String> = keys
429                .iter()
430                .map(|k| value_to_compact_string(map.get(k).unwrap_or(&serde_json::Value::Null)))
431                .collect();
432            lines.push(row.join(" | "));
433        }
434    }
435
436    lines.join("\n")
437}
438
439/// Convert a JSON value to a compact single-line string for tabular display.
440fn value_to_compact_string(v: &serde_json::Value) -> String {
441    match v {
442        serde_json::Value::Null => "null".to_string(),
443        serde_json::Value::Bool(b) => b.to_string(),
444        serde_json::Value::Number(n) => n.to_string(),
445        serde_json::Value::String(s) => {
446            if s.len() > 50 {
447                format!("{}...", &s[..47])
448            } else {
449                s.clone()
450            }
451        }
452        serde_json::Value::Array(a) => format!("[{} items]", a.len()),
453        serde_json::Value::Object(m) => format!("{{{} keys}}", m.len()),
454    }
455}
456
457fn collapse_arrays_recursive(
458    value: &mut serde_json::Value,
459    max_items: usize,
460    summary_template: &str,
461) {
462    match value {
463        serde_json::Value::Array(arr) => {
464            // First recurse into existing items
465            for item in arr.iter_mut() {
466                collapse_arrays_recursive(item, max_items, summary_template);
467            }
468
469            // Try tabular encoding for uniform arrays
470            if arr.len() > max_items {
471                if let Some(keys) = detect_uniform_array(arr) {
472                    let table = encode_tabular(arr, &keys);
473                    let count = arr.len();
474                    arr.clear();
475                    arr.push(serde_json::Value::String(
476                        format!("[table: {count} rows]\n{table}"),
477                    ));
478                    return;
479                }
480
481                // Fallback: simple truncation with summary
482                let remaining = arr.len() - max_items;
483                arr.truncate(max_items);
484                let summary = summary_template.replace("{remaining}", &remaining.to_string());
485                arr.push(serde_json::Value::String(summary));
486            }
487        }
488        serde_json::Value::Object(map) => {
489            for v in map.values_mut() {
490                collapse_arrays_recursive(v, max_items, summary_template);
491            }
492        }
493        _ => {}
494    }
495}
496
497impl CompressionStage for CollapseArraysStage {
498    fn name(&self) -> &str {
499        "collapse_arrays"
500    }
501
502    fn priority(&self) -> u32 {
503        70
504    }
505
506    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
507        if !config.enabled {
508            return Ok(());
509        }
510        let max_items: usize = config
511            .options
512            .get("max_items")
513            .and_then(|v| v.as_u64())
514            .map(|v| v as usize)
515            .unwrap_or(5);
516        let summary_template = config
517            .options
518            .get("summary_template")
519            .and_then(|v| v.as_str())
520            .unwrap_or("... and {remaining} more items")
521            .to_owned();
522
523        with_json(content, |value| {
524            collapse_arrays_recursive(value, max_items, &summary_template);
525            Ok(())
526        })
527    }
528}
529
530// ---------------------------------------------------------------------------
531// Stage 7a: word_abbreviate
532// ---------------------------------------------------------------------------
533
/// For plain text / CLI output, replace common long words with standard
/// abbreviations (e.g. "implementation" → "impl", "configuration" → "config").
///
/// Only replaces whole words (not substrings). The stage runs only for
/// `PlainText` and `CliOutput` content; other content types pass through
/// unchanged.
/// Config options: `enabled` (bool).
pub struct WordAbbreviateStage;
540
/// Built-in abbreviation table: (long_form, short_form).
/// Only includes unambiguous, widely-understood abbreviations.
///
/// Entries are applied in the order listed. Singular and plural forms are
/// separate entries because replacement only happens on whole words. All
/// entries are lowercase ASCII.
const WORD_ABBREVIATIONS: &[(&str, &str)] = &[
    ("implementation", "impl"),
    ("implementations", "impls"),
    ("configuration", "config"),
    ("configurations", "configs"),
    ("authentication", "auth"),
    ("authorization", "authz"),
    ("application", "app"),
    ("applications", "apps"),
    ("environment", "env"),
    ("environments", "envs"),
    ("development", "dev"),
    ("production", "prod"),
    ("repository", "repo"),
    ("repositories", "repos"),
    ("dependency", "dep"),
    ("dependencies", "deps"),
    ("documentation", "docs"),
    ("information", "info"),
    ("directory", "dir"),
    ("directories", "dirs"),
    ("parameter", "param"),
    ("parameters", "params"),
    ("argument", "arg"),
    ("arguments", "args"),
    ("function", "fn"),
    ("functions", "fns"),
    ("reference", "ref"),
    ("references", "refs"),
    ("specification", "spec"),
    ("specifications", "specs"),
    ("temporary", "tmp"),
    ("administrator", "admin"),
    ("administrators", "admins"),
    ("database", "db"),
    ("databases", "dbs"),
    ("message", "msg"),
    ("messages", "msgs"),
    ("response", "resp"),
    ("request", "req"),
    ("requests", "reqs"),
    ("attribute", "attr"),
    ("attributes", "attrs"),
    ("expression", "expr"),
    ("expressions", "exprs"),
    ("operation", "op"),
    ("operations", "ops"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("number", "num"),
    ("string", "str"),
    ("boolean", "bool"),
    ("integer", "int"),
    ("previous", "prev"),
    ("current", "curr"),
    ("original", "orig"),
    ("source", "src"),
    ("destination", "dest"),
    ("package", "pkg"),
    ("packages", "pkgs"),
    ("library", "lib"),
    ("libraries", "libs"),
    ("executable", "exec"),
    ("executables", "execs"),
    ("command", "cmd"),
    ("commands", "cmds"),
    ("variable", "var"),
    ("variables", "vars"),
    ("certificate", "cert"),
    ("certificates", "certs"),
    ("synchronize", "sync"),
    ("asynchronous", "async"),
    ("initialize", "init"),
    ("allocation", "alloc"),
    ("allocations", "allocs"),
    ("generation", "gen"),
    ("miscellaneous", "misc"),
    ("utility", "util"),
    ("utilities", "utils"),
    ("statistics", "stats"),
    ("connection", "conn"),
    ("connections", "conns"),
    ("transaction", "txn"),
    ("transactions", "txns"),
    ("management", "mgmt"),
    ("notification", "notif"),
    ("notifications", "notifs"),
    ("permission", "perm"),
    ("permissions", "perms"),
    ("distribution", "distro"),
    ("distributions", "distros"),
    ("architecture", "arch"),
    ("infrastructure", "infra"),
    ("kubernetes", "k8s"),
    ("namespace", "ns"),
    ("namespaces", "nses"),
    ("container", "ctr"),
    ("containers", "ctrs"),
    ("microservice", "svc"),
    ("microservices", "svcs"),
];
644
645impl CompressionStage for WordAbbreviateStage {
646    fn name(&self) -> &str {
647        "word_abbreviate"
648    }
649
650    fn priority(&self) -> u32 {
651        25 // After strip_fields (20), before condense (30)
652    }
653
654    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
655        if !config.enabled {
656            return Ok(());
657        }
658        // Only apply to plain text and CLI output
659        match &content.content_type {
660            ContentType::PlainText | ContentType::CliOutput { .. } => {}
661            _ => return Ok(()),
662        }
663
664        let mut result = content.raw.clone();
665        for &(long, short) in WORD_ABBREVIATIONS {
666            // Replace whole words only (case-insensitive for the check,
667            // but preserve surrounding context)
668            result = replace_whole_word(&result, long, short);
669        }
670
671        content.raw = result;
672        Ok(())
673    }
674}
675
676/// Apply word abbreviation to a plain text string.
677///
678/// This is a convenience function for callers that want to abbreviate
679/// outside the pipeline stage system (e.g. CLI proxy post-processing).
680pub fn abbreviate_words(text: &str) -> String {
681    let mut result = text.to_string();
682    for &(long, short) in WORD_ABBREVIATIONS {
683        result = replace_whole_word(&result, long, short);
684    }
685    result
686}
687
/// Replace whole-word occurrences of `word` with `replacement`.
///
/// Matching ignores ASCII case. A "whole word" is bounded by string edges or
/// by bytes that are not ASCII alphanumeric (so `_`, punctuation, whitespace
/// and non-ASCII characters all count as boundaries). The replacement is
/// inserted verbatim; the casing of the matched text is not carried over.
///
/// Returns `text` unchanged when either `text` or `word` is empty.
fn replace_whole_word(text: &str, word: &str, replacement: &str) -> String {
    if text.is_empty() || word.is_empty() {
        return text.to_string();
    }

    // Fold ASCII only. Full Unicode lowercasing can change a string's byte
    // length (e.g. 'İ' becomes two code points), which would make offsets
    // found in the folded copy point at the wrong bytes of `text`. ASCII
    // folding keeps every offset and char boundary identical.
    let haystack = text.to_ascii_lowercase();
    let needle = word.to_ascii_lowercase();
    let bytes = text.as_bytes();

    let mut result = String::with_capacity(text.len());
    let mut last_end = 0;

    for (start, matched) in haystack.match_indices(needle.as_str()) {
        let end = start + matched.len();

        // Check word boundary before
        let before_ok = start == 0 || !bytes[start - 1].is_ascii_alphanumeric();
        // Check word boundary after (end of text counts as a boundary)
        let after_ok = bytes
            .get(end)
            .map_or(true, |next| !next.is_ascii_alphanumeric());

        if before_ok && after_ok {
            result.push_str(&text[last_end..start]);
            result.push_str(replacement);
            last_end = end;
        }
    }

    result.push_str(&text[last_end..]);
    result
}
723
724// ---------------------------------------------------------------------------
725// Stage 7b: git_diff_fold
726// ---------------------------------------------------------------------------
727
728/// For git diff output, fold consecutive unchanged context lines (lines
729/// starting with a space) into a compact `[N unchanged lines]` marker.
730/// This preserves all changed lines (+/-) and hunk headers (@@) while
731/// dramatically reducing noise from context lines.
732///
733/// Config options:
734///   - `max_context_lines` (u32, default 2) — keep this many context lines
735///     before/after each changed block before folding the rest.
736pub struct GitDiffFoldStage;
737
738impl CompressionStage for GitDiffFoldStage {
739    fn name(&self) -> &str {
740        "git_diff_fold"
741    }
742
743    fn priority(&self) -> u32 {
744        35
745    }
746
747    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
748        if !config.enabled {
749            return Ok(());
750        }
751        // Only apply to plain text / CLI output that looks like a diff
752        match &content.content_type {
753            ContentType::PlainText | ContentType::CliOutput { .. } => {}
754            _ => return Ok(()),
755        }
756        // Quick check: must contain diff markers
757        if !content.raw.contains("\n+") && !content.raw.contains("\n-") {
758            return Ok(());
759        }
760
761        let max_ctx: usize = config
762            .options
763            .get("max_context_lines")
764            .and_then(|v| v.as_u64())
765            .map(|v| v as usize)
766            .unwrap_or(2);
767
768        let lines: Vec<&str> = content.raw.lines().collect();
769        let n = lines.len();
770
771        // Mark which lines are "changed" (added, removed, or hunk headers)
772        let is_changed: Vec<bool> = lines
773            .iter()
774            .map(|l| {
775                l.starts_with('+')
776                    || l.starts_with('-')
777                    || l.starts_with("@@")
778                    || l.starts_with("diff ")
779                    || l.starts_with("index ")
780                    || l.starts_with("--- ")
781                    || l.starts_with("+++ ")
782            })
783            .collect();
784
785        // For each context line, determine if it's within max_ctx of a changed line
786        let mut keep = vec![false; n];
787        for i in 0..n {
788            if is_changed[i] {
789                keep[i] = true;
790                // Keep max_ctx lines before
791                for j in i.saturating_sub(max_ctx)..i {
792                    keep[j] = true;
793                }
794                // Keep max_ctx lines after
795                for j in (i + 1)..n.min(i + 1 + max_ctx) {
796                    keep[j] = true;
797                }
798            }
799        }
800
801        // Build output, folding consecutive non-kept lines
802        let mut result = Vec::new();
803        let mut fold_count = 0usize;
804
805        for i in 0..n {
806            if keep[i] {
807                if fold_count > 0 {
808                    result.push(format!("[{fold_count} unchanged lines]"));
809                    fold_count = 0;
810                }
811                result.push(lines[i].to_owned());
812            } else {
813                fold_count += 1;
814            }
815        }
816        if fold_count > 0 {
817            result.push(format!("[{fold_count} unchanged lines]"));
818        }
819
820        let trailing_newline = content.raw.ends_with('\n');
821        content.raw = result.join("\n");
822        if trailing_newline {
823            content.raw.push('\n');
824        }
825        Ok(())
826    }
827}
828
829// ---------------------------------------------------------------------------
830// Stage 8: custom_transforms
831// ---------------------------------------------------------------------------
832
833/// No-op stage that serves as the insertion point for plugin stages.
834/// Passes content through unchanged.
835pub struct CustomTransformsStage;
836
837impl CompressionStage for CustomTransformsStage {
838    fn name(&self) -> &str {
839        "custom_transforms"
840    }
841
842    fn priority(&self) -> u32 {
843        80
844    }
845
846    fn process(&self, _content: &mut Content, config: &StageConfig) -> Result<()> {
847        if !config.enabled {
848            return Ok(());
849        }
850        // No-op: plugin stages are inserted here by the pipeline
851        Ok(())
852    }
853}
854
855// ---------------------------------------------------------------------------
856// Tests
857// ---------------------------------------------------------------------------
858
859#[cfg(test)]
860mod tests {
861    use super::*;
862    use crate::types::{ContentMetadata, ContentType};
863    use serde_json::json;
864
865    fn json_content(raw: &str) -> Content {
866        Content {
867            raw: raw.to_owned(),
868            content_type: ContentType::Json,
869            metadata: ContentMetadata {
870                source: None,
871                path: None,
872                language: None,
873            },
874            tokens_original: 0,
875        }
876    }
877
878    fn text_content(raw: &str) -> Content {
879        Content {
880            raw: raw.to_owned(),
881            content_type: ContentType::PlainText,
882            metadata: ContentMetadata {
883                source: None,
884                path: None,
885                language: None,
886            },
887            tokens_original: 0,
888        }
889    }
890
891    fn enabled_config(options: serde_json::Value) -> StageConfig {
892        StageConfig {
893            enabled: true,
894            options,
895        }
896    }
897
898    fn disabled_config() -> StageConfig {
899        StageConfig {
900            enabled: false,
901            options: json!({}),
902        }
903    }
904
905    // --- keep_fields ---
906
907    #[test]
908    fn keep_fields_retains_specified() {
909        let mut c = json_content(r#"{"id":1,"name":"Alice","debug":"x"}"#);
910        let cfg = enabled_config(json!({"fields": ["id", "name"]}));
911        KeepFieldsStage.process(&mut c, &cfg).unwrap();
912        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
913        assert_eq!(v, json!({"id":1,"name":"Alice"}));
914    }
915
916    #[test]
917    fn keep_fields_disabled_passthrough() {
918        let raw = r#"{"id":1,"name":"Alice"}"#;
919        let mut c = json_content(raw);
920        KeepFieldsStage.process(&mut c, &disabled_config()).unwrap();
921        assert_eq!(c.raw, raw);
922    }
923
924    #[test]
925    fn keep_fields_non_json_passthrough() {
926        let raw = "not json at all";
927        let mut c = text_content(raw);
928        let cfg = enabled_config(json!({"fields": ["id"]}));
929        KeepFieldsStage.process(&mut c, &cfg).unwrap();
930        assert_eq!(c.raw, raw);
931    }
932
933    // --- strip_fields ---
934
935    #[test]
936    fn strip_fields_removes_top_level() {
937        let mut c = json_content(r#"{"id":1,"debug":"x","name":"Bob"}"#);
938        let cfg = enabled_config(json!({"fields": ["debug"]}));
939        StripFieldsStage.process(&mut c, &cfg).unwrap();
940        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
941        assert_eq!(v, json!({"id":1,"name":"Bob"}));
942    }
943
944    #[test]
945    fn strip_fields_dot_notation() {
946        let mut c = json_content(r#"{"metadata":{"internal_id":"x","public":"y"},"name":"Bob"}"#);
947        let cfg = enabled_config(json!({"fields": ["metadata.internal_id"]}));
948        StripFieldsStage.process(&mut c, &cfg).unwrap();
949        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
950        assert_eq!(v, json!({"metadata":{"public":"y"},"name":"Bob"}));
951    }
952
953    #[test]
954    fn strip_fields_disabled_passthrough() {
955        let raw = r#"{"id":1}"#;
956        let mut c = json_content(raw);
957        StripFieldsStage.process(&mut c, &disabled_config()).unwrap();
958        assert_eq!(c.raw, raw);
959    }
960
961    // --- condense ---
962
963    #[test]
964    fn condense_collapses_repeated_lines() {
965        let raw = "a\na\na\na\na\nb\n";
966        let mut c = text_content(raw);
967        let cfg = enabled_config(json!({"max_repeated_lines": 3}));
968        CondenseStage.process(&mut c, &cfg).unwrap();
969        assert_eq!(c.raw, "a\na\na\nb\n");
970    }
971
972    #[test]
973    fn condense_keeps_up_to_max() {
974        let raw = "x\nx\nx\n";
975        let mut c = text_content(raw);
976        let cfg = enabled_config(json!({"max_repeated_lines": 3}));
977        CondenseStage.process(&mut c, &cfg).unwrap();
978        assert_eq!(c.raw, "x\nx\nx\n");
979    }
980
981    #[test]
982    fn condense_disabled_passthrough() {
983        let raw = "a\na\na\na\n";
984        let mut c = text_content(raw);
985        CondenseStage.process(&mut c, &disabled_config()).unwrap();
986        assert_eq!(c.raw, raw);
987    }
988
989    #[test]
990    fn condense_skips_json() {
991        let raw = r#"{"a":1}"#;
992        let mut c = json_content(raw);
993        let cfg = enabled_config(json!({"max_repeated_lines": 1}));
994        CondenseStage.process(&mut c, &cfg).unwrap();
995        assert_eq!(c.raw, raw);
996    }
997
998    // --- strip_nulls ---
999
1000    #[test]
1001    fn strip_nulls_removes_null_fields() {
1002        let mut c = json_content(r#"{"a":1,"b":null,"c":"x"}"#);
1003        let cfg = enabled_config(json!({}));
1004        StripNullsStage.process(&mut c, &cfg).unwrap();
1005        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1006        assert_eq!(v, json!({"a":1,"c":"x"}));
1007    }
1008
1009    #[test]
1010    fn strip_nulls_recursive() {
1011        let mut c = json_content(r#"{"a":{"b":null,"c":1}}"#);
1012        let cfg = enabled_config(json!({}));
1013        StripNullsStage.process(&mut c, &cfg).unwrap();
1014        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1015        assert_eq!(v, json!({"a":{"c":1}}));
1016    }
1017
1018    #[test]
1019    fn strip_nulls_keeps_null_in_arrays() {
1020        let mut c = json_content(r#"{"arr":[1,null,2]}"#);
1021        let cfg = enabled_config(json!({}));
1022        StripNullsStage.process(&mut c, &cfg).unwrap();
1023        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1024        assert_eq!(v, json!({"arr":[1,null,2]}));
1025    }
1026
1027    #[test]
1028    fn strip_nulls_disabled_passthrough() {
1029        let raw = r#"{"a":null}"#;
1030        let mut c = json_content(raw);
1031        StripNullsStage.process(&mut c, &disabled_config()).unwrap();
1032        assert_eq!(c.raw, raw);
1033    }
1034
1035    // --- flatten ---
1036
1037    #[test]
1038    fn flatten_nested_object() {
1039        let mut c = json_content(r#"{"a":{"b":{"c":1}}}"#);
1040        let cfg = enabled_config(json!({"max_depth": 3}));
1041        FlattenStage.process(&mut c, &cfg).unwrap();
1042        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1043        assert_eq!(v, json!({"a.b.c":1}));
1044    }
1045
1046    #[test]
1047    fn flatten_respects_max_depth() {
1048        let mut c = json_content(r#"{"a":{"b":{"c":1}}}"#);
1049        let cfg = enabled_config(json!({"max_depth": 1}));
1050        FlattenStage.process(&mut c, &cfg).unwrap();
1051        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1052        // At max_depth=1, top-level values are not descended into
1053        assert_eq!(v, json!({"a":{"b":{"c":1}}}));
1054    }
1055
1056    #[test]
1057    fn flatten_disabled_passthrough() {
1058        let raw = r#"{"a":{"b":1}}"#;
1059        let mut c = json_content(raw);
1060        FlattenStage.process(&mut c, &disabled_config()).unwrap();
1061        assert_eq!(c.raw, raw);
1062    }
1063
1064    // --- truncate_strings ---
1065
1066    #[test]
1067    fn truncate_strings_long_value() {
1068        let long = "a".repeat(600);
1069        let raw = format!(r#"{{"key":"{}"}}"#, long);
1070        let mut c = json_content(&raw);
1071        let cfg = enabled_config(json!({"max_length": 500}));
1072        TruncateStringsStage.process(&mut c, &cfg).unwrap();
1073        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1074        let s = v["key"].as_str().unwrap();
1075        assert!(s.ends_with("..."));
1076        assert_eq!(s.chars().count(), 503); // 500 + "..."
1077    }
1078
1079    #[test]
1080    fn truncate_strings_short_value_unchanged() {
1081        let raw = r#"{"key":"hello"}"#;
1082        let mut c = json_content(raw);
1083        let cfg = enabled_config(json!({"max_length": 500}));
1084        TruncateStringsStage.process(&mut c, &cfg).unwrap();
1085        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1086        assert_eq!(v["key"].as_str().unwrap(), "hello");
1087    }
1088
1089    #[test]
1090    fn truncate_strings_disabled_passthrough() {
1091        let long = "a".repeat(600);
1092        let raw = format!(r#"{{"key":"{}"}}"#, long);
1093        let mut c = json_content(&raw);
1094        TruncateStringsStage.process(&mut c, &disabled_config()).unwrap();
1095        assert_eq!(c.raw, raw);
1096    }
1097
1098    // --- collapse_arrays ---
1099
1100    #[test]
1101    fn collapse_arrays_truncates_long_array() {
1102        let mut c = json_content(r#"{"items":[1,2,3,4,5,6,7]}"#);
1103        let cfg = enabled_config(json!({
1104            "max_items": 5,
1105            "summary_template": "... and {remaining} more items"
1106        }));
1107        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1108        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1109        let arr = v["items"].as_array().unwrap();
1110        assert_eq!(arr.len(), 6); // 5 kept + 1 summary
1111        assert_eq!(arr[5].as_str().unwrap(), "... and 2 more items");
1112    }
1113
1114    #[test]
1115    fn collapse_arrays_short_array_unchanged() {
1116        let raw = r#"{"items":[1,2,3]}"#;
1117        let mut c = json_content(raw);
1118        let cfg = enabled_config(json!({"max_items": 5}));
1119        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1120        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1121        assert_eq!(v["items"].as_array().unwrap().len(), 3);
1122    }
1123
1124    #[test]
1125    fn collapse_arrays_disabled_passthrough() {
1126        let raw = r#"{"items":[1,2,3,4,5,6,7]}"#;
1127        let mut c = json_content(raw);
1128        CollapseArraysStage.process(&mut c, &disabled_config()).unwrap();
1129        assert_eq!(c.raw, raw);
1130    }
1131
1132    // --- git_diff_fold ---
1133
1134    #[test]
1135    fn git_diff_fold_folds_unchanged_lines() {
1136        // Use a realistic diff with many unchanged context lines
1137        let diff = concat!(
1138            "diff --git a/src/main.rs b/src/main.rs\n",
1139            "--- a/src/main.rs\n",
1140            "+++ b/src/main.rs\n",
1141            "@@ -1,12 +1,12 @@\n",
1142            " line1\n",
1143            " line2\n",
1144            " line3\n",
1145            " line4\n",
1146            " line5\n",
1147            " line6\n",
1148            "-old line\n",
1149            "+new line\n",
1150            " line7\n",
1151            " line8\n",
1152            " line9\n",
1153            " line10\n",
1154            " line11\n",
1155            " line12\n",
1156        );
1157        let mut c = text_content(diff);
1158        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1159        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1160        // Changed lines must be preserved
1161        assert!(c.raw.contains("-old line"), "output: {}", c.raw);
1162        assert!(c.raw.contains("+new line"), "output: {}", c.raw);
1163        // Hunk header must be preserved
1164        assert!(c.raw.contains("@@ -1,12"), "output: {}", c.raw);
1165        // Output should be shorter (folded lines 1-4 and 9-12)
1166        assert!(c.raw.len() < diff.len(), "output should be shorter, got:\n{}", c.raw);
1167        // Should contain fold markers
1168        assert!(c.raw.contains("unchanged lines"), "expected fold markers in:\n{}", c.raw);
1169    }
1170
1171    #[test]
1172    fn git_diff_fold_preserves_hunk_headers() {
1173        let diff = "@@ -1,5 +1,5 @@\n unchanged\n-old\n+new\n unchanged\n";
1174        let mut c = text_content(diff);
1175        let cfg = enabled_config(serde_json::json!({"max_context_lines": 1}));
1176        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1177        assert!(c.raw.contains("@@ -1,5 +1,5 @@"), "output: {}", c.raw);
1178    }
1179
1180    #[test]
1181    fn git_diff_fold_skips_non_diff_text() {
1182        let raw = "just some plain text\nno diff markers here\n";
1183        let mut c = text_content(raw);
1184        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1185        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1186        assert_eq!(c.raw, raw);
1187    }
1188
1189    #[test]
1190    fn git_diff_fold_disabled_passthrough() {
1191        let diff = "diff --git a/f b/f\n-old\n+new\n unchanged\n unchanged\n unchanged\n";
1192        let mut c = text_content(diff);
1193        GitDiffFoldStage.process(&mut c, &disabled_config()).unwrap();
1194        assert_eq!(c.raw, diff);
1195    }
1196
1197    // --- custom_transforms ---
1198
1199    #[test]
1200    fn custom_transforms_is_noop() {
1201        let raw = r#"{"a":1}"#;
1202        let mut c = json_content(raw);
1203        let cfg = enabled_config(json!({}));
1204        CustomTransformsStage.process(&mut c, &cfg).unwrap();
1205        assert_eq!(c.raw, raw);
1206    }
1207
1208    #[test]
1209    fn custom_transforms_disabled_passthrough() {
1210        let raw = "some text";
1211        let mut c = text_content(raw);
1212        CustomTransformsStage.process(&mut c, &disabled_config()).unwrap();
1213        assert_eq!(c.raw, raw);
1214    }
1215
1216    // --- tabular encoding (in collapse_arrays) ---
1217
1218    #[test]
1219    fn collapse_arrays_tabular_encoding_uniform_objects() {
1220        // Array of objects with identical keys → should produce tabular output
1221        let raw = r#"{"users":[
1222            {"id":1,"name":"Alice","role":"admin"},
1223            {"id":2,"name":"Bob","role":"user"},
1224            {"id":3,"name":"Carol","role":"user"},
1225            {"id":4,"name":"Dave","role":"admin"},
1226            {"id":5,"name":"Eve","role":"user"},
1227            {"id":6,"name":"Frank","role":"user"}
1228        ]}"#;
1229        let mut c = json_content(raw);
1230        let cfg = enabled_config(json!({
1231            "max_items": 3,
1232            "summary_template": "... and {remaining} more items"
1233        }));
1234        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1235        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1236        let arr = v["users"].as_array().unwrap();
1237        // Should be collapsed to a single tabular string element
1238        assert_eq!(arr.len(), 1, "uniform array should be encoded as single table element");
1239        let table_str = arr[0].as_str().unwrap();
1240        assert!(table_str.contains("[table: 6 rows]"), "should contain row count: {}", table_str);
1241        assert!(table_str.contains("Alice"), "should contain data: {}", table_str);
1242        assert!(table_str.contains("Frank"), "should contain all rows: {}", table_str);
1243    }
1244
1245    #[test]
1246    fn collapse_arrays_mixed_objects_falls_back_to_truncation() {
1247        // Array of objects with DIFFERENT keys → should fall back to truncation
1248        let raw = r#"{"items":[
1249            {"id":1,"name":"Alice"},
1250            {"x":2,"y":3},
1251            {"id":3,"name":"Carol"},
1252            {"x":4,"y":5},
1253            {"id":5,"name":"Eve"},
1254            {"x":6,"y":7}
1255        ]}"#;
1256        let mut c = json_content(raw);
1257        let cfg = enabled_config(json!({
1258            "max_items": 3,
1259            "summary_template": "... and {remaining} more items"
1260        }));
1261        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1262        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1263        let arr = v["items"].as_array().unwrap();
1264        // Should fall back to truncation: 3 kept + 1 summary
1265        assert_eq!(arr.len(), 4);
1266        assert!(arr[3].as_str().unwrap().contains("3 more items"));
1267    }
1268
1269    #[test]
1270    fn collapse_arrays_small_uniform_array_unchanged() {
1271        // Uniform array but under max_items → no collapse
1272        let raw = r#"{"users":[{"id":1,"name":"Alice"},{"id":2,"name":"Bob"}]}"#;
1273        let mut c = json_content(raw);
1274        let cfg = enabled_config(json!({"max_items": 5}));
1275        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1276        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1277        assert_eq!(v["users"].as_array().unwrap().len(), 2);
1278    }
1279
1280    #[test]
1281    fn detect_uniform_array_returns_keys_for_uniform() {
1282        let arr = vec![
1283            json!({"a": 1, "b": 2}),
1284            json!({"a": 3, "b": 4}),
1285        ];
1286        let keys = detect_uniform_array(&arr);
1287        assert!(keys.is_some());
1288        let keys = keys.unwrap();
1289        assert!(keys.contains(&"a".to_string()));
1290        assert!(keys.contains(&"b".to_string()));
1291    }
1292
1293    #[test]
1294    fn detect_uniform_array_returns_none_for_mixed() {
1295        let arr = vec![
1296            json!({"a": 1, "b": 2}),
1297            json!({"x": 3, "y": 4}),
1298        ];
1299        assert!(detect_uniform_array(&arr).is_none());
1300    }
1301
1302    #[test]
1303    fn detect_uniform_array_returns_none_for_non_objects() {
1304        let arr = vec![json!(1), json!(2), json!(3)];
1305        assert!(detect_uniform_array(&arr).is_none());
1306    }
1307
1308    #[test]
1309    fn detect_uniform_array_returns_none_for_single_element() {
1310        let arr = vec![json!({"a": 1})];
1311        assert!(detect_uniform_array(&arr).is_none());
1312    }
1313
1314    #[test]
1315    fn value_to_compact_string_truncates_long_strings() {
1316        let long = "a".repeat(100);
1317        let v = serde_json::Value::String(long);
1318        let s = value_to_compact_string(&v);
1319        assert!(s.len() <= 53); // 47 chars + "..."
1320        assert!(s.ends_with("..."));
1321    }
1322
1323    #[test]
1324    fn value_to_compact_string_short_string_unchanged() {
1325        let v = serde_json::Value::String("hello".to_string());
1326        assert_eq!(value_to_compact_string(&v), "hello");
1327    }
1328
1329    #[test]
1330    fn value_to_compact_string_nested_types() {
1331        assert_eq!(value_to_compact_string(&json!(null)), "null");
1332        assert_eq!(value_to_compact_string(&json!(true)), "true");
1333        assert_eq!(value_to_compact_string(&json!(42)), "42");
1334        assert_eq!(value_to_compact_string(&json!([1, 2, 3])), "[3 items]");
1335        assert_eq!(value_to_compact_string(&json!({"a": 1})), "{1 keys}");
1336    }
1337
1338    // --- word_abbreviate ---
1339
1340    #[test]
1341    fn word_abbreviate_replaces_known_words() {
1342        let raw = "The implementation of the configuration is complete.";
1343        let mut c = text_content(raw);
1344        let cfg = enabled_config(json!({}));
1345        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1346        assert!(c.raw.contains("impl"), "should abbreviate 'implementation': {}", c.raw);
1347        assert!(c.raw.contains("config"), "should abbreviate 'configuration': {}", c.raw);
1348        assert!(!c.raw.contains("implementation"), "original word should be gone: {}", c.raw);
1349    }
1350
1351    #[test]
1352    fn word_abbreviate_preserves_partial_matches() {
1353        // "implement" should NOT be abbreviated — only "implementation" is in the table
1354        let raw = "We need to implement this feature.";
1355        let mut c = text_content(raw);
1356        let cfg = enabled_config(json!({}));
1357        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1358        assert!(c.raw.contains("implement"), "partial match should be preserved: {}", c.raw);
1359    }
1360
1361    #[test]
1362    fn word_abbreviate_disabled_passthrough() {
1363        let raw = "The implementation is complete.";
1364        let mut c = text_content(raw);
1365        WordAbbreviateStage.process(&mut c, &disabled_config()).unwrap();
1366        assert_eq!(c.raw, raw);
1367    }
1368
1369    #[test]
1370    fn word_abbreviate_skips_json() {
1371        let raw = r#"{"implementation":"value"}"#;
1372        let mut c = json_content(raw);
1373        let cfg = enabled_config(json!({}));
1374        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1375        assert_eq!(c.raw, raw, "JSON content should pass through unchanged");
1376    }
1377
1378    #[test]
1379    fn word_abbreviate_case_insensitive() {
1380        let raw = "The Implementation and CONFIGURATION are ready.";
1381        let mut c = text_content(raw);
1382        let cfg = enabled_config(json!({}));
1383        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1384        assert!(c.raw.contains("impl"), "should handle mixed case: {}", c.raw);
1385        assert!(c.raw.contains("config"), "should handle uppercase: {}", c.raw);
1386    }
1387
1388    #[test]
1389    fn replace_whole_word_basic() {
1390        assert_eq!(
1391            replace_whole_word("the implementation is done", "implementation", "impl"),
1392            "the impl is done"
1393        );
1394    }
1395
1396    #[test]
1397    fn replace_whole_word_no_partial() {
1398        // "implementations" contains "implementation" but shouldn't match
1399        // because the 's' after makes it not a word boundary
1400        let result = replace_whole_word("multiple implementations exist", "implementation", "impl");
1401        // The word "implementations" has "implementation" followed by 's' which is alphanumeric,
1402        // so it should NOT be replaced
1403        assert_eq!(result, "multiple implementations exist");
1404    }
1405
1406    #[test]
1407    fn replace_whole_word_at_boundaries() {
1408        assert_eq!(
1409            replace_whole_word("implementation", "implementation", "impl"),
1410            "impl"
1411        );
1412        assert_eq!(
1413            replace_whole_word("(implementation)", "implementation", "impl"),
1414            "(impl)"
1415        );
1416    }
1417
1418    #[test]
1419    fn replace_whole_word_empty_inputs() {
1420        assert_eq!(replace_whole_word("", "word", "w"), "");
1421        assert_eq!(replace_whole_word("text", "", "w"), "text");
1422    }
1423}