Skip to main content

sqz_engine/
stages.rs

1use crate::error::{Result, SqzError};
2use crate::toon::ToonEncoder;
3use crate::types::{Content, ContentType, StageConfig};
4
/// A single compression stage in the pipeline.
///
/// Each stage transforms `Content` in place according to its `StageConfig`.
/// Stages must check `config.enabled` and return early (no-op) when disabled.
///
/// Implementors must be `Send + Sync`.
pub trait CompressionStage: Send + Sync {
    /// Short identifier of the stage (for example `"keep_fields"`).
    fn name(&self) -> &str;
    /// Ordering key of the stage. The stages in this file use values from 10
    /// to 80, and a stage with priority 25 is described as running after 20
    /// and before 30, i.e. lower values run earlier. The ordering itself is
    /// applied by the pipeline, not by this trait.
    fn priority(&self) -> u32;
    /// Transform `content` in place using the options in `config`.
    ///
    /// Returns `Ok(())` on success, including the no-op case when the stage
    /// is disabled or the content is not applicable to the stage.
    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()>;
}
14
15// ---------------------------------------------------------------------------
16// Helper: parse raw as JSON, apply a transform, serialize back
17// ---------------------------------------------------------------------------
18
19fn with_json<F>(content: &mut Content, f: F) -> Result<()>
20where
21    F: FnOnce(&mut serde_json::Value) -> Result<()>,
22{
23    if !ToonEncoder::is_json(&content.raw) {
24        return Ok(());
25    }
26    let mut value: serde_json::Value = serde_json::from_str(&content.raw)?;
27    f(&mut value)?;
28    content.raw = serde_json::to_string(&value)?;
29    Ok(())
30}
31
32// ---------------------------------------------------------------------------
33// Stage 1: keep_fields
34// ---------------------------------------------------------------------------
35
36/// For JSON content, keep only the specified top-level fields; drop all others.
37/// Config options: `fields` — array of field name strings.
38/// Non-JSON content passes through unchanged.
39pub struct KeepFieldsStage;
40
41impl CompressionStage for KeepFieldsStage {
42    fn name(&self) -> &str {
43        "keep_fields"
44    }
45
46    fn priority(&self) -> u32 {
47        10
48    }
49
50    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
51        if !config.enabled {
52            return Ok(());
53        }
54        let fields: Vec<String> = match config.options.get("fields") {
55            Some(v) => serde_json::from_value(v.clone())
56                .map_err(|e| SqzError::Other(format!("keep_fields: invalid fields option: {e}")))?,
57            None => return Ok(()),
58        };
59        if fields.is_empty() {
60            return Ok(());
61        }
62        with_json(content, |value| {
63            if let serde_json::Value::Object(map) = value {
64                map.retain(|k, _| fields.contains(k));
65            }
66            Ok(())
67        })
68    }
69}
70
71// ---------------------------------------------------------------------------
72// Stage 2: strip_fields
73// ---------------------------------------------------------------------------
74
/// For JSON content, remove specified fields by key name.
/// Supports dot-notation for nested fields (e.g. "metadata.internal_id").
/// Each path is split on `.`, so a key that itself contains a dot cannot be
/// addressed.
/// Config options: `fields` — array of field path strings.
/// Non-JSON content passes through unchanged.
pub struct StripFieldsStage;
80
81fn strip_field_path(value: &mut serde_json::Value, path: &[&str]) {
82    if path.is_empty() {
83        return;
84    }
85    if let serde_json::Value::Object(map) = value {
86        if path.len() == 1 {
87            map.remove(path[0]);
88        } else {
89            if let Some(child) = map.get_mut(path[0]) {
90                strip_field_path(child, &path[1..]);
91            }
92        }
93    }
94}
95
96impl CompressionStage for StripFieldsStage {
97    fn name(&self) -> &str {
98        "strip_fields"
99    }
100
101    fn priority(&self) -> u32 {
102        20
103    }
104
105    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
106        if !config.enabled {
107            return Ok(());
108        }
109        let fields: Vec<String> = match config.options.get("fields") {
110            Some(v) => serde_json::from_value(v.clone())
111                .map_err(|e| SqzError::Other(format!("strip_fields: invalid fields option: {e}")))?,
112            None => return Ok(()),
113        };
114        if fields.is_empty() {
115            return Ok(());
116        }
117        with_json(content, |value| {
118            for field in &fields {
119                let parts: Vec<&str> = field.split('.').collect();
120                strip_field_path(value, &parts);
121            }
122            Ok(())
123        })
124    }
125}
126
127// ---------------------------------------------------------------------------
128// Stage 3: condense
129// ---------------------------------------------------------------------------
130
131/// For plain text / CLI output, collapse runs of repeated identical lines
132/// down to at most `max_repeated_lines`.
133/// Config options: `max_repeated_lines` (u32, default 3).
134/// Non-plain-text content passes through unchanged.
135pub struct CondenseStage;
136
137impl CompressionStage for CondenseStage {
138    fn name(&self) -> &str {
139        "condense"
140    }
141
142    fn priority(&self) -> u32 {
143        30
144    }
145
146    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
147        if !config.enabled {
148            return Ok(());
149        }
150        // Only apply to plain text and CLI output
151        match &content.content_type {
152            ContentType::PlainText | ContentType::CliOutput { .. } => {}
153            _ => return Ok(()),
154        }
155
156        let max_repeated: u32 = config
157            .options
158            .get("max_repeated_lines")
159            .and_then(|v| v.as_u64())
160            .map(|v| v as u32)
161            .unwrap_or(3);
162
163        let mut result = Vec::new();
164        let mut current_line: Option<&str> = None;
165        let mut run_count: u32 = 0;
166
167        for line in content.raw.lines() {
168            match current_line {
169                Some(prev) if prev == line => {
170                    run_count += 1;
171                    if run_count <= max_repeated {
172                        result.push(line);
173                    }
174                }
175                _ => {
176                    current_line = Some(line);
177                    run_count = 1;
178                    result.push(line);
179                }
180            }
181        }
182
183        // Preserve trailing newline if original had one
184        let trailing_newline = content.raw.ends_with('\n');
185        content.raw = result.join("\n");
186        if trailing_newline {
187            content.raw.push('\n');
188        }
189        Ok(())
190    }
191}
192
193// ---------------------------------------------------------------------------
194// Stage 4: strip_nulls
195// ---------------------------------------------------------------------------
196
/// For JSON content, recursively remove all null-valued fields from objects.
/// Arrays keep their null elements, but objects nested inside arrays are
/// still cleaned.
/// Config options: `enabled` (bool).
/// Non-JSON content passes through unchanged.
pub struct StripNullsStage;
201
202fn strip_nulls_recursive(value: &mut serde_json::Value) {
203    match value {
204        serde_json::Value::Object(map) => {
205            map.retain(|_, v| !v.is_null());
206            for v in map.values_mut() {
207                strip_nulls_recursive(v);
208            }
209        }
210        serde_json::Value::Array(arr) => {
211            for item in arr.iter_mut() {
212                strip_nulls_recursive(item);
213            }
214        }
215        _ => {}
216    }
217}
218
219impl CompressionStage for StripNullsStage {
220    fn name(&self) -> &str {
221        "strip_nulls"
222    }
223
224    fn priority(&self) -> u32 {
225        40
226    }
227
228    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
229        if !config.enabled {
230            return Ok(());
231        }
232        with_json(content, |value| {
233            strip_nulls_recursive(value);
234            Ok(())
235        })
236    }
237}
238
239// ---------------------------------------------------------------------------
240// Stage 5: flatten
241// ---------------------------------------------------------------------------
242
/// For JSON content, flatten nested objects up to `max_depth` levels using
/// dot-notation for flattened keys (e.g. `{"a":{"b":1}}` → `{"a.b":1}`).
/// Only objects are flattened; arrays are kept as values.
/// Config options: `max_depth` (u32, default 3).
/// Non-JSON content passes through unchanged.
pub struct FlattenStage;
248
249fn flatten_value(
250    value: &serde_json::Value,
251    prefix: &str,
252    depth: u32,
253    max_depth: u32,
254    out: &mut serde_json::Map<String, serde_json::Value>,
255) {
256    if let serde_json::Value::Object(map) = value {
257        if depth < max_depth {
258            for (k, v) in map {
259                let new_key = if prefix.is_empty() {
260                    k.clone()
261                } else {
262                    format!("{prefix}.{k}")
263                };
264                flatten_value(v, &new_key, depth + 1, max_depth, out);
265            }
266            return;
267        }
268    }
269    out.insert(prefix.to_owned(), value.clone());
270}
271
272impl CompressionStage for FlattenStage {
273    fn name(&self) -> &str {
274        "flatten"
275    }
276
277    fn priority(&self) -> u32 {
278        50
279    }
280
281    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
282        if !config.enabled {
283            return Ok(());
284        }
285        let max_depth: u32 = config
286            .options
287            .get("max_depth")
288            .and_then(|v| v.as_u64())
289            .map(|v| v as u32)
290            .unwrap_or(3);
291
292        with_json(content, |value| {
293            if let serde_json::Value::Object(map) = value {
294                let mut out = serde_json::Map::new();
295                for (k, v) in map.iter() {
296                    flatten_value(v, k, 1, max_depth, &mut out);
297                }
298                *map = out;
299            }
300            Ok(())
301        })
302    }
303}
304
305// ---------------------------------------------------------------------------
306// Stage 6: truncate_strings
307// ---------------------------------------------------------------------------
308
/// For JSON content, truncate string values longer than `max_length` chars,
/// appending "..." to indicate truncation.
/// Length is counted in Unicode scalar values (`char`s), not bytes; the
/// appended "..." is not counted against `max_length`.
/// Config options: `max_length` (u32, default 500).
/// Non-JSON content passes through unchanged.
pub struct TruncateStringsStage;
314
315fn truncate_strings_recursive(value: &mut serde_json::Value, max_length: usize) {
316    match value {
317        serde_json::Value::String(s) => {
318            if s.chars().count() > max_length {
319                let truncated: String = s.chars().take(max_length).collect();
320                *s = format!("{truncated}...");
321            }
322        }
323        serde_json::Value::Object(map) => {
324            for v in map.values_mut() {
325                truncate_strings_recursive(v, max_length);
326            }
327        }
328        serde_json::Value::Array(arr) => {
329            for item in arr.iter_mut() {
330                truncate_strings_recursive(item, max_length);
331            }
332        }
333        _ => {}
334    }
335}
336
337impl CompressionStage for TruncateStringsStage {
338    fn name(&self) -> &str {
339        "truncate_strings"
340    }
341
342    fn priority(&self) -> u32 {
343        60
344    }
345
346    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
347        if !config.enabled {
348            return Ok(());
349        }
350        let max_length: usize = config
351            .options
352            .get("max_length")
353            .and_then(|v| v.as_u64())
354            .map(|v| v as usize)
355            .unwrap_or(500);
356
357        with_json(content, |value| {
358            truncate_strings_recursive(value, max_length);
359            Ok(())
360        })
361    }
362}
363
364// ---------------------------------------------------------------------------
365// Stage 7: collapse_arrays
366// ---------------------------------------------------------------------------
367
/// For JSON content, if an array has more than `max_items` elements:
/// 1. First, try tabular encoding: if all elements are objects with the same
///    (non-empty) set of keys, the whole array is replaced by a single
///    string element of the form `[table: N rows]` followed by a header row
///    and one row per element.
/// 2. Otherwise, keep the first `max_items` and replace the rest with a
///    summary string element.
///
/// Nested arrays and arrays inside objects are processed as well.
///
/// Config options:
///   - `max_items` (u32, default 5)
///   - `summary_template` (string, default "... and {remaining} more items");
///     `{remaining}` is replaced with the number of dropped elements.
/// Non-JSON content passes through unchanged.
pub struct CollapseArraysStage;
379
380/// Check if all elements in an array are objects with the same set of keys.
381/// Returns the shared keys in a stable order if uniform, None otherwise.
382fn detect_uniform_array(arr: &[serde_json::Value]) -> Option<Vec<String>> {
383    if arr.len() < 2 {
384        return None;
385    }
386
387    let first_keys: Vec<String> = match &arr[0] {
388        serde_json::Value::Object(map) => {
389            if map.is_empty() {
390                return None;
391            }
392            map.keys().cloned().collect()
393        }
394        _ => return None,
395    };
396
397    // Check that every element is an object with exactly the same keys
398    for item in &arr[1..] {
399        match item {
400            serde_json::Value::Object(map) => {
401                if map.len() != first_keys.len() {
402                    return None;
403                }
404                for key in &first_keys {
405                    if !map.contains_key(key) {
406                        return None;
407                    }
408                }
409            }
410            _ => return None,
411        }
412    }
413
414    Some(first_keys)
415}
416
417/// Encode a uniform array of objects as a compact tabular string:
418/// `[headers] | col1 | col2 | ... \n val1 | val2 | ...`
419fn encode_tabular(arr: &[serde_json::Value], keys: &[String]) -> String {
420    let mut lines = Vec::with_capacity(arr.len() + 1);
421
422    // Header row
423    lines.push(keys.join(" | "));
424
425    // Data rows
426    for item in arr {
427        if let serde_json::Value::Object(map) = item {
428            let row: Vec<String> = keys
429                .iter()
430                .map(|k| value_to_compact_string(map.get(k).unwrap_or(&serde_json::Value::Null)))
431                .collect();
432            lines.push(row.join(" | "));
433        }
434    }
435
436    lines.join("\n")
437}
438
439/// Convert a JSON value to a compact single-line string for tabular display.
440fn value_to_compact_string(v: &serde_json::Value) -> String {
441    match v {
442        serde_json::Value::Null => "null".to_string(),
443        serde_json::Value::Bool(b) => b.to_string(),
444        serde_json::Value::Number(n) => n.to_string(),
445        serde_json::Value::String(s) => {
446            if s.len() > 50 {
447                format!("{}...", &s[..47])
448            } else {
449                s.clone()
450            }
451        }
452        serde_json::Value::Array(a) => format!("[{} items]", a.len()),
453        serde_json::Value::Object(m) => format!("{{{} keys}}", m.len()),
454    }
455}
456
457fn collapse_arrays_recursive(
458    value: &mut serde_json::Value,
459    max_items: usize,
460    summary_template: &str,
461) {
462    match value {
463        serde_json::Value::Array(arr) => {
464            // First recurse into existing items
465            for item in arr.iter_mut() {
466                collapse_arrays_recursive(item, max_items, summary_template);
467            }
468
469            // Try tabular encoding for uniform arrays
470            if arr.len() > max_items {
471                if let Some(keys) = detect_uniform_array(arr) {
472                    let table = encode_tabular(arr, &keys);
473                    let count = arr.len();
474                    arr.clear();
475                    arr.push(serde_json::Value::String(
476                        format!("[table: {count} rows]\n{table}"),
477                    ));
478                    return;
479                }
480
481                // Fallback: simple truncation with summary
482                let remaining = arr.len() - max_items;
483                arr.truncate(max_items);
484                let summary = summary_template.replace("{remaining}", &remaining.to_string());
485                arr.push(serde_json::Value::String(summary));
486            }
487        }
488        serde_json::Value::Object(map) => {
489            for v in map.values_mut() {
490                collapse_arrays_recursive(v, max_items, summary_template);
491            }
492        }
493        _ => {}
494    }
495}
496
497impl CompressionStage for CollapseArraysStage {
498    fn name(&self) -> &str {
499        "collapse_arrays"
500    }
501
502    fn priority(&self) -> u32 {
503        70
504    }
505
506    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
507        if !config.enabled {
508            return Ok(());
509        }
510        let max_items: usize = config
511            .options
512            .get("max_items")
513            .and_then(|v| v.as_u64())
514            .map(|v| v as usize)
515            .unwrap_or(5);
516        let summary_template = config
517            .options
518            .get("summary_template")
519            .and_then(|v| v.as_str())
520            .unwrap_or("... and {remaining} more items")
521            .to_owned();
522
523        with_json(content, |value| {
524            collapse_arrays_recursive(value, max_items, &summary_template);
525            Ok(())
526        })
527    }
528}
529
530// ---------------------------------------------------------------------------
531// Stage 7a: word_abbreviate
532// ---------------------------------------------------------------------------
533
/// For plain text / CLI output, replace common long words with standard
/// abbreviations (e.g. "implementation" → "impl", "configuration" → "config").
///
/// Only replaces whole words (not substrings) and only for content typed as
/// plain text or CLI output. Matching ignores letter case; the abbreviation
/// is inserted exactly as written in the table.
/// Config options: `enabled` (bool).
pub struct WordAbbreviateStage;
540
/// Built-in abbreviation table: (long_form, short_form).
/// Only includes unambiguous, widely-understood abbreviations.
///
/// Entries are applied one after another in table order. Because matching
/// is whole-word, singular and plural forms are listed as separate entries.
const WORD_ABBREVIATIONS: &[(&str, &str)] = &[
    ("implementation", "impl"),
    ("implementations", "impls"),
    ("configuration", "config"),
    ("configurations", "configs"),
    ("authentication", "auth"),
    ("authorization", "authz"),
    ("application", "app"),
    ("applications", "apps"),
    ("environment", "env"),
    ("environments", "envs"),
    ("development", "dev"),
    ("production", "prod"),
    ("repository", "repo"),
    ("repositories", "repos"),
    ("dependency", "dep"),
    ("dependencies", "deps"),
    ("documentation", "docs"),
    ("information", "info"),
    ("directory", "dir"),
    ("directories", "dirs"),
    ("parameter", "param"),
    ("parameters", "params"),
    ("argument", "arg"),
    ("arguments", "args"),
    ("function", "fn"),
    ("functions", "fns"),
    ("reference", "ref"),
    ("references", "refs"),
    ("specification", "spec"),
    ("specifications", "specs"),
    ("temporary", "tmp"),
    ("administrator", "admin"),
    ("administrators", "admins"),
    ("database", "db"),
    ("databases", "dbs"),
    ("message", "msg"),
    ("messages", "msgs"),
    ("response", "resp"),
    ("request", "req"),
    ("requests", "reqs"),
    ("attribute", "attr"),
    ("attributes", "attrs"),
    ("expression", "expr"),
    ("expressions", "exprs"),
    ("operation", "op"),
    ("operations", "ops"),
    ("maximum", "max"),
    ("minimum", "min"),
    ("boolean", "bool"),
    ("integer", "int"),
    ("previous", "prev"),
    ("current", "curr"),
    ("original", "orig"),
    ("synchronize", "sync"),
    ("asynchronous", "async"),
    ("initialize", "init"),
    ("allocation", "alloc"),
    ("allocations", "allocs"),
    ("generation", "gen"),
    ("miscellaneous", "misc"),
    ("statistics", "stats"),
    ("connection", "conn"),
    ("connections", "conns"),
    ("transaction", "txn"),
    ("transactions", "txns"),
    ("management", "mgmt"),
    ("notification", "notif"),
    ("notifications", "notifs"),
    ("permission", "perm"),
    ("permissions", "perms"),
    ("distribution", "distro"),
    ("distributions", "distros"),
    ("architecture", "arch"),
    ("infrastructure", "infra"),
    ("kubernetes", "k8s"),
    ("namespace", "ns"),
    ("namespaces", "nses"),
    ("container", "ctr"),
    ("containers", "ctrs"),
    ("microservice", "svc"),
    ("microservices", "svcs"),
];
626
627impl CompressionStage for WordAbbreviateStage {
628    fn name(&self) -> &str {
629        "word_abbreviate"
630    }
631
632    fn priority(&self) -> u32 {
633        25 // After strip_fields (20), before condense (30)
634    }
635
636    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
637        if !config.enabled {
638            return Ok(());
639        }
640        // Only apply to plain text and CLI output
641        match &content.content_type {
642            ContentType::PlainText | ContentType::CliOutput { .. } => {}
643            _ => return Ok(()),
644        }
645
646        let mut result = content.raw.clone();
647        for &(long, short) in WORD_ABBREVIATIONS {
648            // Replace whole words only (case-insensitive for the check,
649            // but preserve surrounding context)
650            result = replace_whole_word(&result, long, short);
651        }
652
653        content.raw = result;
654        Ok(())
655    }
656}
657
658/// Apply word abbreviation to a plain text string.
659///
660/// This is a convenience function for callers that want to abbreviate
661/// outside the pipeline stage system (e.g. CLI proxy post-processing).
662pub fn abbreviate_words(text: &str) -> String {
663    let mut result = text.to_string();
664    for &(long, short) in WORD_ABBREVIATIONS {
665        result = replace_whole_word(&result, long, short);
666    }
667    result
668}
669
/// Replace whole-word occurrences of `word` with `replacement`.
///
/// Matching ignores ASCII letter case. A "whole word" is bounded by
/// non-ASCII-alphanumeric bytes or by the string edges, so `_`, punctuation
/// and non-ASCII characters all count as boundaries. `replacement` is
/// inserted verbatim; the case of the matched text is not preserved.
///
/// Returns `text` unchanged when `text` or `word` is empty.
///
/// The search runs on an ASCII-lowercased copy of `text`. ASCII lowercasing
/// keeps every byte offset identical to the original, so offsets found in
/// the copy are valid in `text`. Full Unicode lowercasing can change byte
/// lengths, which would shift the offsets and make matches miss or land
/// inside multi-byte characters.
fn replace_whole_word(text: &str, word: &str, replacement: &str) -> String {
    if text.is_empty() || word.is_empty() {
        return text.to_string();
    }

    let haystack = text.to_ascii_lowercase();
    let needle = word.to_ascii_lowercase();
    let bytes = text.as_bytes();

    let mut result = String::with_capacity(text.len());
    let mut last_end = 0;

    for (start, matched) in haystack.match_indices(needle.as_str()) {
        let end = start + matched.len();

        // Check word boundary before
        let before_ok = start == 0 || !bytes[start - 1].is_ascii_alphanumeric();
        // Check word boundary after
        let after_ok = end >= bytes.len() || !bytes[end].is_ascii_alphanumeric();

        if before_ok && after_ok {
            result.push_str(&text[last_end..start]);
            result.push_str(replacement);
            last_end = end;
        }
    }

    result.push_str(&text[last_end..]);
    result
}
705
706// ---------------------------------------------------------------------------
707// Stage 7b: git_diff_fold
708// ---------------------------------------------------------------------------
709
710/// For git diff output, fold consecutive unchanged context lines (lines
711/// starting with a space) into a compact `[N unchanged lines]` marker.
712/// This preserves all changed lines (+/-) and hunk headers (@@) while
713/// dramatically reducing noise from context lines.
714///
715/// Config options:
716///   - `max_context_lines` (u32, default 2) — keep this many context lines
717///     before/after each changed block before folding the rest.
718pub struct GitDiffFoldStage;
719
720impl CompressionStage for GitDiffFoldStage {
721    fn name(&self) -> &str {
722        "git_diff_fold"
723    }
724
725    fn priority(&self) -> u32 {
726        35
727    }
728
729    fn process(&self, content: &mut Content, config: &StageConfig) -> Result<()> {
730        if !config.enabled {
731            return Ok(());
732        }
733        // Only apply to plain text / CLI output that looks like a diff
734        match &content.content_type {
735            ContentType::PlainText | ContentType::CliOutput { .. } => {}
736            _ => return Ok(()),
737        }
738
739        // Real diff detection: require strong structural signals, not just
740        // lines starting with +/-. The old check (`contains("\n+") || contains("\n-")`)
741        // false-positived on ls -l output (regular files start with -rw-),
742        // Markdown bullet lists, CSV with negative numbers, etc.
743        let looks_like_diff = content.raw.starts_with("diff --git ")
744            || content.raw.starts_with("diff -")
745            || content.raw.contains("\n@@ ")       // hunk header
746            || content.raw.contains("\n--- a/")    // unified diff file header
747            || content.raw.contains("\n+++ b/");   // unified diff file header
748
749        if !looks_like_diff {
750            return Ok(());
751        }
752
753        let max_ctx: usize = config
754            .options
755            .get("max_context_lines")
756            .and_then(|v| v.as_u64())
757            .map(|v| v as usize)
758            .unwrap_or(2);
759
760        let lines: Vec<&str> = content.raw.lines().collect();
761        let n = lines.len();
762
763        // Mark which lines are "changed" (added, removed, or hunk headers)
764        let is_changed: Vec<bool> = lines
765            .iter()
766            .map(|l| {
767                l.starts_with('+')
768                    || l.starts_with('-')
769                    || l.starts_with("@@")
770                    || l.starts_with("diff ")
771                    || l.starts_with("index ")
772                    || l.starts_with("--- ")
773                    || l.starts_with("+++ ")
774            })
775            .collect();
776
777        // For each context line, determine if it's within max_ctx of a changed line
778        let mut keep = vec![false; n];
779        for i in 0..n {
780            if is_changed[i] {
781                keep[i] = true;
782                // Keep max_ctx lines before
783                for j in i.saturating_sub(max_ctx)..i {
784                    keep[j] = true;
785                }
786                // Keep max_ctx lines after
787                for j in (i + 1)..n.min(i + 1 + max_ctx) {
788                    keep[j] = true;
789                }
790            }
791        }
792
793        // Build output, folding consecutive non-kept lines
794        let mut result = Vec::new();
795        let mut fold_count = 0usize;
796
797        for i in 0..n {
798            if keep[i] {
799                if fold_count > 0 {
800                    result.push(format!("[{fold_count} unchanged lines]"));
801                    fold_count = 0;
802                }
803                result.push(lines[i].to_owned());
804            } else {
805                fold_count += 1;
806            }
807        }
808        if fold_count > 0 {
809            result.push(format!("[{fold_count} unchanged lines]"));
810        }
811
812        let trailing_newline = content.raw.ends_with('\n');
813        content.raw = result.join("\n");
814        if trailing_newline {
815            content.raw.push('\n');
816        }
817        Ok(())
818    }
819}
820
821// ---------------------------------------------------------------------------
822// Stage 8: custom_transforms
823// ---------------------------------------------------------------------------
824
/// No-op stage that serves as the insertion point for plugin stages.
/// Passes content through unchanged.
pub struct CustomTransformsStage;

impl CompressionStage for CustomTransformsStage {
    fn name(&self) -> &str {
        "custom_transforms"
    }

    fn priority(&self) -> u32 {
        80
    }

    /// Always returns `Ok(())` and never touches the content, whether the
    /// stage is enabled or not. The `enabled` check is kept so the stage
    /// follows the same shape as the other stages.
    fn process(&self, _content: &mut Content, config: &StageConfig) -> Result<()> {
        if !config.enabled {
            return Ok(());
        }
        // No-op: plugin stages are inserted here by the pipeline
        Ok(())
    }
}
846
847// ---------------------------------------------------------------------------
848// Tests
849// ---------------------------------------------------------------------------
850
851#[cfg(test)]
852mod tests {
853    use super::*;
854    use crate::types::{ContentMetadata, ContentType};
855    use serde_json::json;
856
857    fn json_content(raw: &str) -> Content {
858        Content {
859            raw: raw.to_owned(),
860            content_type: ContentType::Json,
861            metadata: ContentMetadata {
862                source: None,
863                path: None,
864                language: None,
865            },
866            tokens_original: 0,
867        }
868    }
869
870    fn text_content(raw: &str) -> Content {
871        Content {
872            raw: raw.to_owned(),
873            content_type: ContentType::PlainText,
874            metadata: ContentMetadata {
875                source: None,
876                path: None,
877                language: None,
878            },
879            tokens_original: 0,
880        }
881    }
882
883    fn enabled_config(options: serde_json::Value) -> StageConfig {
884        StageConfig {
885            enabled: true,
886            options,
887        }
888    }
889
890    fn disabled_config() -> StageConfig {
891        StageConfig {
892            enabled: false,
893            options: json!({}),
894        }
895    }
896
897    // --- keep_fields ---
898
899    #[test]
900    fn keep_fields_retains_specified() {
901        let mut c = json_content(r#"{"id":1,"name":"Alice","debug":"x"}"#);
902        let cfg = enabled_config(json!({"fields": ["id", "name"]}));
903        KeepFieldsStage.process(&mut c, &cfg).unwrap();
904        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
905        assert_eq!(v, json!({"id":1,"name":"Alice"}));
906    }
907
908    #[test]
909    fn keep_fields_disabled_passthrough() {
910        let raw = r#"{"id":1,"name":"Alice"}"#;
911        let mut c = json_content(raw);
912        KeepFieldsStage.process(&mut c, &disabled_config()).unwrap();
913        assert_eq!(c.raw, raw);
914    }
915
916    #[test]
917    fn keep_fields_non_json_passthrough() {
918        let raw = "not json at all";
919        let mut c = text_content(raw);
920        let cfg = enabled_config(json!({"fields": ["id"]}));
921        KeepFieldsStage.process(&mut c, &cfg).unwrap();
922        assert_eq!(c.raw, raw);
923    }
924
925    // --- strip_fields ---
926
927    #[test]
928    fn strip_fields_removes_top_level() {
929        let mut c = json_content(r#"{"id":1,"debug":"x","name":"Bob"}"#);
930        let cfg = enabled_config(json!({"fields": ["debug"]}));
931        StripFieldsStage.process(&mut c, &cfg).unwrap();
932        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
933        assert_eq!(v, json!({"id":1,"name":"Bob"}));
934    }
935
936    #[test]
937    fn strip_fields_dot_notation() {
938        let mut c = json_content(r#"{"metadata":{"internal_id":"x","public":"y"},"name":"Bob"}"#);
939        let cfg = enabled_config(json!({"fields": ["metadata.internal_id"]}));
940        StripFieldsStage.process(&mut c, &cfg).unwrap();
941        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
942        assert_eq!(v, json!({"metadata":{"public":"y"},"name":"Bob"}));
943    }
944
945    #[test]
946    fn strip_fields_disabled_passthrough() {
947        let raw = r#"{"id":1}"#;
948        let mut c = json_content(raw);
949        StripFieldsStage.process(&mut c, &disabled_config()).unwrap();
950        assert_eq!(c.raw, raw);
951    }
952
953    // --- condense ---
954
955    #[test]
956    fn condense_collapses_repeated_lines() {
957        let raw = "a\na\na\na\na\nb\n";
958        let mut c = text_content(raw);
959        let cfg = enabled_config(json!({"max_repeated_lines": 3}));
960        CondenseStage.process(&mut c, &cfg).unwrap();
961        assert_eq!(c.raw, "a\na\na\nb\n");
962    }
963
964    #[test]
965    fn condense_keeps_up_to_max() {
966        let raw = "x\nx\nx\n";
967        let mut c = text_content(raw);
968        let cfg = enabled_config(json!({"max_repeated_lines": 3}));
969        CondenseStage.process(&mut c, &cfg).unwrap();
970        assert_eq!(c.raw, "x\nx\nx\n");
971    }
972
973    #[test]
974    fn condense_disabled_passthrough() {
975        let raw = "a\na\na\na\n";
976        let mut c = text_content(raw);
977        CondenseStage.process(&mut c, &disabled_config()).unwrap();
978        assert_eq!(c.raw, raw);
979    }
980
981    #[test]
982    fn condense_skips_json() {
983        let raw = r#"{"a":1}"#;
984        let mut c = json_content(raw);
985        let cfg = enabled_config(json!({"max_repeated_lines": 1}));
986        CondenseStage.process(&mut c, &cfg).unwrap();
987        assert_eq!(c.raw, raw);
988    }
989
990    // --- strip_nulls ---
991
992    #[test]
993    fn strip_nulls_removes_null_fields() {
994        let mut c = json_content(r#"{"a":1,"b":null,"c":"x"}"#);
995        let cfg = enabled_config(json!({}));
996        StripNullsStage.process(&mut c, &cfg).unwrap();
997        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
998        assert_eq!(v, json!({"a":1,"c":"x"}));
999    }
1000
1001    #[test]
1002    fn strip_nulls_recursive() {
1003        let mut c = json_content(r#"{"a":{"b":null,"c":1}}"#);
1004        let cfg = enabled_config(json!({}));
1005        StripNullsStage.process(&mut c, &cfg).unwrap();
1006        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1007        assert_eq!(v, json!({"a":{"c":1}}));
1008    }
1009
1010    #[test]
1011    fn strip_nulls_keeps_null_in_arrays() {
1012        let mut c = json_content(r#"{"arr":[1,null,2]}"#);
1013        let cfg = enabled_config(json!({}));
1014        StripNullsStage.process(&mut c, &cfg).unwrap();
1015        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1016        assert_eq!(v, json!({"arr":[1,null,2]}));
1017    }
1018
1019    #[test]
1020    fn strip_nulls_disabled_passthrough() {
1021        let raw = r#"{"a":null}"#;
1022        let mut c = json_content(raw);
1023        StripNullsStage.process(&mut c, &disabled_config()).unwrap();
1024        assert_eq!(c.raw, raw);
1025    }
1026
1027    // --- flatten ---
1028
1029    #[test]
1030    fn flatten_nested_object() {
1031        let mut c = json_content(r#"{"a":{"b":{"c":1}}}"#);
1032        let cfg = enabled_config(json!({"max_depth": 3}));
1033        FlattenStage.process(&mut c, &cfg).unwrap();
1034        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1035        assert_eq!(v, json!({"a.b.c":1}));
1036    }
1037
1038    #[test]
1039    fn flatten_respects_max_depth() {
1040        let mut c = json_content(r#"{"a":{"b":{"c":1}}}"#);
1041        let cfg = enabled_config(json!({"max_depth": 1}));
1042        FlattenStage.process(&mut c, &cfg).unwrap();
1043        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1044        // At max_depth=1, top-level values are not descended into
1045        assert_eq!(v, json!({"a":{"b":{"c":1}}}));
1046    }
1047
1048    #[test]
1049    fn flatten_disabled_passthrough() {
1050        let raw = r#"{"a":{"b":1}}"#;
1051        let mut c = json_content(raw);
1052        FlattenStage.process(&mut c, &disabled_config()).unwrap();
1053        assert_eq!(c.raw, raw);
1054    }
1055
1056    // --- truncate_strings ---
1057
1058    #[test]
1059    fn truncate_strings_long_value() {
1060        let long = "a".repeat(600);
1061        let raw = format!(r#"{{"key":"{}"}}"#, long);
1062        let mut c = json_content(&raw);
1063        let cfg = enabled_config(json!({"max_length": 500}));
1064        TruncateStringsStage.process(&mut c, &cfg).unwrap();
1065        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1066        let s = v["key"].as_str().unwrap();
1067        assert!(s.ends_with("..."));
1068        assert_eq!(s.chars().count(), 503); // 500 + "..."
1069    }
1070
1071    #[test]
1072    fn truncate_strings_short_value_unchanged() {
1073        let raw = r#"{"key":"hello"}"#;
1074        let mut c = json_content(raw);
1075        let cfg = enabled_config(json!({"max_length": 500}));
1076        TruncateStringsStage.process(&mut c, &cfg).unwrap();
1077        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1078        assert_eq!(v["key"].as_str().unwrap(), "hello");
1079    }
1080
1081    #[test]
1082    fn truncate_strings_disabled_passthrough() {
1083        let long = "a".repeat(600);
1084        let raw = format!(r#"{{"key":"{}"}}"#, long);
1085        let mut c = json_content(&raw);
1086        TruncateStringsStage.process(&mut c, &disabled_config()).unwrap();
1087        assert_eq!(c.raw, raw);
1088    }
1089
1090    // --- collapse_arrays ---
1091
1092    #[test]
1093    fn collapse_arrays_truncates_long_array() {
1094        let mut c = json_content(r#"{"items":[1,2,3,4,5,6,7]}"#);
1095        let cfg = enabled_config(json!({
1096            "max_items": 5,
1097            "summary_template": "... and {remaining} more items"
1098        }));
1099        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1100        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1101        let arr = v["items"].as_array().unwrap();
1102        assert_eq!(arr.len(), 6); // 5 kept + 1 summary
1103        assert_eq!(arr[5].as_str().unwrap(), "... and 2 more items");
1104    }
1105
1106    #[test]
1107    fn collapse_arrays_short_array_unchanged() {
1108        let raw = r#"{"items":[1,2,3]}"#;
1109        let mut c = json_content(raw);
1110        let cfg = enabled_config(json!({"max_items": 5}));
1111        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1112        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1113        assert_eq!(v["items"].as_array().unwrap().len(), 3);
1114    }
1115
1116    #[test]
1117    fn collapse_arrays_disabled_passthrough() {
1118        let raw = r#"{"items":[1,2,3,4,5,6,7]}"#;
1119        let mut c = json_content(raw);
1120        CollapseArraysStage.process(&mut c, &disabled_config()).unwrap();
1121        assert_eq!(c.raw, raw);
1122    }
1123
1124    // --- git_diff_fold ---
1125
1126    #[test]
1127    fn git_diff_fold_folds_unchanged_lines() {
1128        // Use a realistic diff with many unchanged context lines
1129        let diff = concat!(
1130            "diff --git a/src/main.rs b/src/main.rs\n",
1131            "--- a/src/main.rs\n",
1132            "+++ b/src/main.rs\n",
1133            "@@ -1,12 +1,12 @@\n",
1134            " line1\n",
1135            " line2\n",
1136            " line3\n",
1137            " line4\n",
1138            " line5\n",
1139            " line6\n",
1140            "-old line\n",
1141            "+new line\n",
1142            " line7\n",
1143            " line8\n",
1144            " line9\n",
1145            " line10\n",
1146            " line11\n",
1147            " line12\n",
1148        );
1149        let mut c = text_content(diff);
1150        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1151        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1152        // Changed lines must be preserved
1153        assert!(c.raw.contains("-old line"), "output: {}", c.raw);
1154        assert!(c.raw.contains("+new line"), "output: {}", c.raw);
1155        // Hunk header must be preserved
1156        assert!(c.raw.contains("@@ -1,12"), "output: {}", c.raw);
1157        // Output should be shorter (folded lines 1-4 and 9-12)
1158        assert!(c.raw.len() < diff.len(), "output should be shorter, got:\n{}", c.raw);
1159        // Should contain fold markers
1160        assert!(c.raw.contains("unchanged lines"), "expected fold markers in:\n{}", c.raw);
1161    }
1162
1163    #[test]
1164    fn git_diff_fold_preserves_hunk_headers() {
1165        let diff = "@@ -1,5 +1,5 @@\n unchanged\n-old\n+new\n unchanged\n";
1166        let mut c = text_content(diff);
1167        let cfg = enabled_config(serde_json::json!({"max_context_lines": 1}));
1168        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1169        assert!(c.raw.contains("@@ -1,5 +1,5 @@"), "output: {}", c.raw);
1170    }
1171
1172    #[test]
1173    fn git_diff_fold_skips_non_diff_text() {
1174        let raw = "just some plain text\nno diff markers here\n";
1175        let mut c = text_content(raw);
1176        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1177        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1178        assert_eq!(c.raw, raw);
1179    }
1180
1181    #[test]
1182    fn git_diff_fold_disabled_passthrough() {
1183        let diff = "diff --git a/f b/f\n-old\n+new\n unchanged\n unchanged\n unchanged\n";
1184        let mut c = text_content(diff);
1185        GitDiffFoldStage.process(&mut c, &disabled_config()).unwrap();
1186        assert_eq!(c.raw, diff);
1187    }
1188
1189    // --- git_diff_fold false-positive regression tests ---
1190    // https://github.com/ojuschugh1/sqz/issues/1 (Reddit report)
1191    //
1192    // ls -l output contains lines starting with - (regular file permissions:
1193    // -rw-r--r--) which the old diff detector treated as diff deletions.
1194    // Directory entries were silently dropped from the model's view.
1195
1196    #[test]
1197    fn git_diff_fold_does_not_fold_ls_output() {
1198        let ls_output = concat!(
1199            "total 24\n",
1200            "drwxr-xr-x  6 user user  192 Apr 18 10:00 packages\n",
1201            "drwxr-xr-x  3 user user   96 Apr 18 10:00 configuration\n",
1202            "drwxr-xr-x  4 user user  128 Apr 18 10:00 documentation\n",
1203            "drwxr-xr-x  2 user user   64 Apr 18 10:00 environment\n",
1204            "-rw-r--r--  1 user user 1024 Apr 18 10:00 README.md\n",
1205        );
1206        let mut c = text_content(ls_output);
1207        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1208        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1209        // ALL directory entries must be preserved — none should be folded
1210        assert!(c.raw.contains("packages"), "packages must survive: {}", c.raw);
1211        assert!(c.raw.contains("configuration"), "configuration must survive: {}", c.raw);
1212        assert!(c.raw.contains("documentation"), "documentation must survive: {}", c.raw);
1213        assert!(c.raw.contains("environment"), "environment must survive: {}", c.raw);
1214        assert!(c.raw.contains("README.md"), "README.md must survive: {}", c.raw);
1215        assert!(!c.raw.contains("unchanged lines"), "no folding should occur: {}", c.raw);
1216    }
1217
1218    #[test]
1219    fn git_diff_fold_does_not_fold_markdown_bullets() {
1220        let markdown = concat!(
1221            "# Features\n",
1222            "\n",
1223            "- First feature\n",
1224            "- Second feature\n",
1225            "- Third feature\n",
1226            "+ Added bonus\n",
1227            "\n",
1228            "## Details\n",
1229        );
1230        let mut c = text_content(markdown);
1231        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1232        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1233        assert_eq!(c.raw, markdown, "markdown should pass through unchanged");
1234    }
1235
1236    #[test]
1237    fn git_diff_fold_still_works_on_real_diffs() {
1238        // Verify the fix didn't break actual diff folding
1239        let diff = concat!(
1240            "diff --git a/src/main.rs b/src/main.rs\n",
1241            "--- a/src/main.rs\n",
1242            "+++ b/src/main.rs\n",
1243            "@@ -1,10 +1,10 @@\n",
1244            " line1\n",
1245            " line2\n",
1246            " line3\n",
1247            " line4\n",
1248            " line5\n",
1249            "-old line\n",
1250            "+new line\n",
1251            " line6\n",
1252            " line7\n",
1253            " line8\n",
1254            " line9\n",
1255            " line10\n",
1256        );
1257        let mut c = text_content(diff);
1258        let cfg = enabled_config(serde_json::json!({"max_context_lines": 2}));
1259        GitDiffFoldStage.process(&mut c, &cfg).unwrap();
1260        // Changed lines must be preserved
1261        assert!(c.raw.contains("-old line"), "removed line preserved: {}", c.raw);
1262        assert!(c.raw.contains("+new line"), "added line preserved: {}", c.raw);
1263        // Should fold some context lines
1264        assert!(c.raw.contains("unchanged lines"), "should fold context: {}", c.raw);
1265        // Output should have fewer lines (fold markers replace multiple lines)
1266        assert!(
1267            c.raw.lines().count() < diff.lines().count(),
1268            "output should have fewer lines: {} vs {}",
1269            c.raw.lines().count(), diff.lines().count()
1270        );
1271    }
1272
1273    // --- custom_transforms ---
1274
1275    #[test]
1276    fn custom_transforms_is_noop() {
1277        let raw = r#"{"a":1}"#;
1278        let mut c = json_content(raw);
1279        let cfg = enabled_config(json!({}));
1280        CustomTransformsStage.process(&mut c, &cfg).unwrap();
1281        assert_eq!(c.raw, raw);
1282    }
1283
1284    #[test]
1285    fn custom_transforms_disabled_passthrough() {
1286        let raw = "some text";
1287        let mut c = text_content(raw);
1288        CustomTransformsStage.process(&mut c, &disabled_config()).unwrap();
1289        assert_eq!(c.raw, raw);
1290    }
1291
1292    // --- tabular encoding (in collapse_arrays) ---
1293
1294    #[test]
1295    fn collapse_arrays_tabular_encoding_uniform_objects() {
1296        // Array of objects with identical keys → should produce tabular output
1297        let raw = r#"{"users":[
1298            {"id":1,"name":"Alice","role":"admin"},
1299            {"id":2,"name":"Bob","role":"user"},
1300            {"id":3,"name":"Carol","role":"user"},
1301            {"id":4,"name":"Dave","role":"admin"},
1302            {"id":5,"name":"Eve","role":"user"},
1303            {"id":6,"name":"Frank","role":"user"}
1304        ]}"#;
1305        let mut c = json_content(raw);
1306        let cfg = enabled_config(json!({
1307            "max_items": 3,
1308            "summary_template": "... and {remaining} more items"
1309        }));
1310        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1311        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1312        let arr = v["users"].as_array().unwrap();
1313        // Should be collapsed to a single tabular string element
1314        assert_eq!(arr.len(), 1, "uniform array should be encoded as single table element");
1315        let table_str = arr[0].as_str().unwrap();
1316        assert!(table_str.contains("[table: 6 rows]"), "should contain row count: {}", table_str);
1317        assert!(table_str.contains("Alice"), "should contain data: {}", table_str);
1318        assert!(table_str.contains("Frank"), "should contain all rows: {}", table_str);
1319    }
1320
1321    #[test]
1322    fn collapse_arrays_mixed_objects_falls_back_to_truncation() {
1323        // Array of objects with DIFFERENT keys → should fall back to truncation
1324        let raw = r#"{"items":[
1325            {"id":1,"name":"Alice"},
1326            {"x":2,"y":3},
1327            {"id":3,"name":"Carol"},
1328            {"x":4,"y":5},
1329            {"id":5,"name":"Eve"},
1330            {"x":6,"y":7}
1331        ]}"#;
1332        let mut c = json_content(raw);
1333        let cfg = enabled_config(json!({
1334            "max_items": 3,
1335            "summary_template": "... and {remaining} more items"
1336        }));
1337        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1338        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1339        let arr = v["items"].as_array().unwrap();
1340        // Should fall back to truncation: 3 kept + 1 summary
1341        assert_eq!(arr.len(), 4);
1342        assert!(arr[3].as_str().unwrap().contains("3 more items"));
1343    }
1344
1345    #[test]
1346    fn collapse_arrays_small_uniform_array_unchanged() {
1347        // Uniform array but under max_items → no collapse
1348        let raw = r#"{"users":[{"id":1,"name":"Alice"},{"id":2,"name":"Bob"}]}"#;
1349        let mut c = json_content(raw);
1350        let cfg = enabled_config(json!({"max_items": 5}));
1351        CollapseArraysStage.process(&mut c, &cfg).unwrap();
1352        let v: serde_json::Value = serde_json::from_str(&c.raw).unwrap();
1353        assert_eq!(v["users"].as_array().unwrap().len(), 2);
1354    }
1355
1356    #[test]
1357    fn detect_uniform_array_returns_keys_for_uniform() {
1358        let arr = vec![
1359            json!({"a": 1, "b": 2}),
1360            json!({"a": 3, "b": 4}),
1361        ];
1362        let keys = detect_uniform_array(&arr);
1363        assert!(keys.is_some());
1364        let keys = keys.unwrap();
1365        assert!(keys.contains(&"a".to_string()));
1366        assert!(keys.contains(&"b".to_string()));
1367    }
1368
1369    #[test]
1370    fn detect_uniform_array_returns_none_for_mixed() {
1371        let arr = vec![
1372            json!({"a": 1, "b": 2}),
1373            json!({"x": 3, "y": 4}),
1374        ];
1375        assert!(detect_uniform_array(&arr).is_none());
1376    }
1377
1378    #[test]
1379    fn detect_uniform_array_returns_none_for_non_objects() {
1380        let arr = vec![json!(1), json!(2), json!(3)];
1381        assert!(detect_uniform_array(&arr).is_none());
1382    }
1383
1384    #[test]
1385    fn detect_uniform_array_returns_none_for_single_element() {
1386        let arr = vec![json!({"a": 1})];
1387        assert!(detect_uniform_array(&arr).is_none());
1388    }
1389
1390    #[test]
1391    fn value_to_compact_string_truncates_long_strings() {
1392        let long = "a".repeat(100);
1393        let v = serde_json::Value::String(long);
1394        let s = value_to_compact_string(&v);
1395        assert!(s.len() <= 53); // 47 chars + "..."
1396        assert!(s.ends_with("..."));
1397    }
1398
1399    #[test]
1400    fn value_to_compact_string_short_string_unchanged() {
1401        let v = serde_json::Value::String("hello".to_string());
1402        assert_eq!(value_to_compact_string(&v), "hello");
1403    }
1404
1405    #[test]
1406    fn value_to_compact_string_nested_types() {
1407        assert_eq!(value_to_compact_string(&json!(null)), "null");
1408        assert_eq!(value_to_compact_string(&json!(true)), "true");
1409        assert_eq!(value_to_compact_string(&json!(42)), "42");
1410        assert_eq!(value_to_compact_string(&json!([1, 2, 3])), "[3 items]");
1411        assert_eq!(value_to_compact_string(&json!({"a": 1})), "{1 keys}");
1412    }
1413
1414    // --- word_abbreviate ---
1415
1416    #[test]
1417    fn word_abbreviate_replaces_known_words() {
1418        let raw = "The implementation of the configuration is complete.";
1419        let mut c = text_content(raw);
1420        let cfg = enabled_config(json!({}));
1421        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1422        assert!(c.raw.contains("impl"), "should abbreviate 'implementation': {}", c.raw);
1423        assert!(c.raw.contains("config"), "should abbreviate 'configuration': {}", c.raw);
1424        assert!(!c.raw.contains("implementation"), "original word should be gone: {}", c.raw);
1425    }
1426
1427    #[test]
1428    fn word_abbreviate_preserves_partial_matches() {
1429        // "implement" should NOT be abbreviated — only "implementation" is in the table
1430        let raw = "We need to implement this feature.";
1431        let mut c = text_content(raw);
1432        let cfg = enabled_config(json!({}));
1433        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1434        assert!(c.raw.contains("implement"), "partial match should be preserved: {}", c.raw);
1435    }
1436
1437    #[test]
1438    fn word_abbreviate_disabled_passthrough() {
1439        let raw = "The implementation is complete.";
1440        let mut c = text_content(raw);
1441        WordAbbreviateStage.process(&mut c, &disabled_config()).unwrap();
1442        assert_eq!(c.raw, raw);
1443    }
1444
1445    #[test]
1446    fn word_abbreviate_skips_json() {
1447        let raw = r#"{"implementation":"value"}"#;
1448        let mut c = json_content(raw);
1449        let cfg = enabled_config(json!({}));
1450        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1451        assert_eq!(c.raw, raw, "JSON content should pass through unchanged");
1452    }
1453
1454    #[test]
1455    fn word_abbreviate_case_insensitive() {
1456        let raw = "The Implementation and CONFIGURATION are ready.";
1457        let mut c = text_content(raw);
1458        let cfg = enabled_config(json!({}));
1459        WordAbbreviateStage.process(&mut c, &cfg).unwrap();
1460        assert!(c.raw.contains("impl"), "should handle mixed case: {}", c.raw);
1461        assert!(c.raw.contains("config"), "should handle uppercase: {}", c.raw);
1462    }
1463
1464    #[test]
1465    fn replace_whole_word_basic() {
1466        assert_eq!(
1467            replace_whole_word("the implementation is done", "implementation", "impl"),
1468            "the impl is done"
1469        );
1470    }
1471
1472    #[test]
1473    fn replace_whole_word_no_partial() {
1474        // "implementations" contains "implementation" but shouldn't match
1475        // because the 's' after makes it not a word boundary
1476        let result = replace_whole_word("multiple implementations exist", "implementation", "impl");
1477        // The word "implementations" has "implementation" followed by 's' which is alphanumeric,
1478        // so it should NOT be replaced
1479        assert_eq!(result, "multiple implementations exist");
1480    }
1481
1482    #[test]
1483    fn replace_whole_word_at_boundaries() {
1484        assert_eq!(
1485            replace_whole_word("implementation", "implementation", "impl"),
1486            "impl"
1487        );
1488        assert_eq!(
1489            replace_whole_word("(implementation)", "implementation", "impl"),
1490            "(impl)"
1491        );
1492    }
1493
1494    #[test]
1495    fn replace_whole_word_empty_inputs() {
1496        assert_eq!(replace_whole_word("", "word", "w"), "");
1497        assert_eq!(replace_whole_word("text", "", "w"), "text");
1498    }
1499}