Skip to main content

lean_ctx/core/
structured_compact.rs

1//! Lossless compaction of structured data (JSON / JSON Lines).
2//!
3//! Pretty-printed JSON is whitespace-heavy: indentation, spaces after `:` and
4//! `,`, and newlines can be 20-50% of the bytes. Code read modes (`map`,
5//! `signatures`) don't apply to data files, so JSON historically fell through to
6//! the line-based `aggressive` path and saved ~0% (measured).
7//!
8//! This module strips only *insignificant* whitespace — the bytes that sit
9//! **outside** string literals. It is genuinely lossless:
10//!   * key order is preserved (we operate on the original text, not a parsed
11//!     `serde_json::Value`, which would re-sort keys);
12//!   * number formatting is preserved (e.g. `1.0`, `1e3`, trailing zeros);
13//!   * string contents (including any whitespace inside them) are untouched.
14//!
15//! We validate that the input parses as JSON before touching it, so malformed
16//! data is never altered, and we only return output that is strictly smaller.
17
/// Upper bound, in bytes, on the input size we are willing to compact (4 MiB).
///
/// Anything larger is uncommon for reads, and the validation parse would
/// dominate the cost, so larger inputs are rejected up front to keep the hot
/// path bounded.
const MAX_INPUT_BYTES: usize = 4 << 20;
21
/// Removes every space, tab, line feed and carriage return that sits outside a
/// JSON string literal, leaving all other characters untouched.
///
/// The input is expected to be syntactically valid JSON (callers validate it
/// beforehand). Inside a string, a backslash shields the character that
/// follows it, so `\"` does not close the string and `\\` does not shield the
/// quote after it.
fn strip_insignificant_ws(input: &str) -> String {
    // Scanner position relative to JSON string literals.
    #[derive(Clone, Copy)]
    enum Scan {
        // Between tokens: whitespace here is insignificant.
        Outside,
        // Inside a string literal: everything is kept verbatim.
        InString,
        // Inside a string, right after a backslash: the next char is literal.
        AfterBackslash,
    }

    let mut state = Scan::Outside;
    let mut compacted = String::with_capacity(input.len());

    compacted.extend(input.chars().filter(|&ch| {
        let (next, keep) = match (state, ch) {
            (Scan::Outside, '"') => (Scan::InString, true),
            (Scan::Outside, ' ' | '\t' | '\n' | '\r') => (Scan::Outside, false),
            (Scan::Outside, _) => (Scan::Outside, true),
            (Scan::InString, '\\') => (Scan::AfterBackslash, true),
            (Scan::InString, '"') => (Scan::Outside, true),
            (Scan::InString, _) => (Scan::InString, true),
            (Scan::AfterBackslash, _) => (Scan::InString, true),
        };
        state = next;
        keep
    }));

    compacted
}
57
58/// Compacts a single JSON document by removing insignificant whitespace.
59///
60/// Returns `Some(compacted)` only when `input` is valid JSON **and** the
61/// compacted form is strictly smaller. The result is value-identical to the
62/// input (only formatting bytes are removed).
63#[must_use]
64pub fn compact_json(input: &str) -> Option<String> {
65    if input.len() > MAX_INPUT_BYTES {
66        return None;
67    }
68    let trimmed = input.trim_start();
69    // Cheap pre-filter: only objects/arrays carry enough whitespace to be worth
70    // compacting (scalars have none to strip).
71    if !trimmed.starts_with('{') && !trimmed.starts_with('[') {
72        return None;
73    }
74    // Validate before mutating: never reshape malformed JSON.
75    serde_json::from_str::<serde_json::Value>(input).ok()?;
76
77    let compact = strip_insignificant_ws(input);
78    (compact.len() < input.len()).then_some(compact)
79}
80
81/// Compacts JSON Lines (one JSON value per line). Returns `Some` only when every
82/// non-empty line is valid JSON and the joined result is strictly smaller.
83#[must_use]
84pub fn compact_jsonl(input: &str) -> Option<String> {
85    if input.len() > MAX_INPUT_BYTES {
86        return None;
87    }
88    let mut out = String::with_capacity(input.len());
89    let mut any = false;
90
91    for line in input.lines() {
92        let t = line.trim();
93        if t.is_empty() {
94            continue;
95        }
96        serde_json::from_str::<serde_json::Value>(t).ok()?;
97        if any {
98            out.push('\n');
99        }
100        out.push_str(&strip_insignificant_ws(t));
101        any = true;
102    }
103
104    if !any {
105        return None;
106    }
107    (out.len() < input.len()).then_some(out)
108}
109
110/// Best-effort lossless compaction selected by file extension.
111///
112/// `.json`/`.geojson` → single document; `.jsonl`/`.ndjson` → line-delimited.
113/// With no (or an unknown) extension, attempts single-document JSON when the
114/// content looks like JSON. Returns `None` when nothing smaller applies.
115#[must_use]
116pub fn compact_structured(content: &str, ext: Option<&str>) -> Option<String> {
117    if matches!(ext, Some("jsonl" | "ndjson")) {
118        return compact_jsonl(content);
119    }
120    // `.json`/`.geojson`/`.webmanifest` and unknown extensions fall back to
121    // single-document JSON compaction, which no-ops when the content isn't JSON.
122    compact_json(content)
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `text` into a JSON value, panicking if it is not valid JSON.
    fn value_of(text: &str) -> serde_json::Value {
        serde_json::from_str::<serde_json::Value>(text).expect("valid json")
    }

    /// A pretty-printed object shrinks, loses its newlines, and keeps its value.
    #[test]
    fn compacts_pretty_object_losslessly() {
        let source = "{\n  \"name\": \"lean-ctx\",\n  \"version\": 3,\n  \"tags\": [\n    \"a\",\n    \"b\"\n  ]\n}";
        let compacted = compact_json(source).expect("should compact");
        assert!(compacted.len() < source.len());
        assert_eq!(value_of(&compacted), value_of(source), "value must be identical");
        assert!(!compacted.contains('\n'));
    }

    /// Keys come out in source order, not sorted.
    #[test]
    fn preserves_key_order() {
        // serde_json's default Map sorts keys; the text-based strip must not.
        let source = "{\n  \"zebra\": 1,\n  \"alpha\": 2,\n  \"mike\": 3\n}";
        let compacted = compact_json(source).expect("should compact");
        assert_eq!(compacted, r#"{"zebra":1,"alpha":2,"mike":3}"#);
    }

    /// Number literals are copied verbatim (trailing zeros, exponents).
    #[test]
    fn preserves_number_formatting() {
        let source = "{\n  \"a\": 1.0,\n  \"b\": 1e3,\n  \"c\": 0.50\n}";
        let compacted = compact_json(source).expect("should compact");
        assert_eq!(compacted, r#"{"a":1.0,"b":1e3,"c":0.50}"#);
    }

    /// Whitespace and escape sequences inside string literals survive.
    #[test]
    fn whitespace_inside_strings_is_kept() {
        let source = "{\n  \"msg\": \"hello   world\\n\\ttab\"\n}";
        let compacted = compact_json(source).expect("should compact");
        assert_eq!(value_of(&compacted), value_of(source));
        assert!(compacted.contains("hello   world"), "inner spaces preserved");
        assert!(compacted.contains("\\n\\ttab"), "escapes preserved");
    }

    /// An escaped quote must not terminate the surrounding string.
    #[test]
    fn escaped_quote_does_not_end_string() {
        let source = "{\n  \"q\": \"a \\\" b : c\"\n}";
        let compacted = compact_json(source).expect("should compact");
        assert_eq!(value_of(&compacted), value_of(source));
        assert_eq!(compacted, r#"{"q":"a \" b : c"}"#);
    }

    /// Input with nothing to strip yields `None`.
    #[test]
    fn already_minified_returns_none() {
        let minified = r#"{"a":1,"b":[2,3]}"#;
        assert_eq!(compact_json(minified), None, "no smaller form available");
    }

    /// Malformed input is rejected rather than reshaped.
    #[test]
    fn invalid_json_is_never_touched() {
        for malformed in ["{not valid json", "{\"a\": }", "just text  with spaces"] {
            assert!(compact_json(malformed).is_none());
        }
    }

    /// Scalars and blank input are skipped by the pre-filter.
    #[test]
    fn scalars_and_non_json_skipped() {
        for skipped in ["42", "\"a string\"", "   "] {
            assert!(compact_json(skipped).is_none());
        }
    }

    /// Every JSONL line is compacted and blank lines are dropped.
    #[test]
    fn jsonl_compacts_each_line() {
        let source = "{ \"a\": 1 }\n{ \"b\": 2 }\n\n{ \"c\": 3 }";
        let compacted = compact_jsonl(source).expect("should compact");
        assert_eq!(compacted, "{\"a\":1}\n{\"b\":2}\n{\"c\":3}");
    }

    /// A single invalid line rejects the whole JSONL input.
    #[test]
    fn jsonl_with_invalid_line_returns_none() {
        let source = "{\"a\":1}\nnot json\n{\"b\":2}";
        assert_eq!(compact_jsonl(source), None);
    }

    /// The extension selects between single-document and line-delimited modes.
    #[test]
    fn compact_structured_dispatches_by_ext() {
        let document = "{\n  \"x\": 1\n}";
        let lines = "{ \"x\": 1 }\n{ \"y\": 2 }";
        assert!(compact_structured(document, Some("json")).is_some());
        assert!(compact_structured(lines, Some("jsonl")).is_some());
        assert!(compact_structured(document, None).is_some());
        assert!(compact_structured("def f(): pass", Some("py")).is_none());
    }

    /// Compacting already-compacted output finds nothing more to remove.
    #[test]
    fn idempotent_on_compacted_output() {
        let source = "{\n  \"a\": [1, 2, 3],\n  \"b\": { \"c\": 4 }\n}";
        let first_pass = compact_json(source).expect("compact once");
        assert!(compact_json(&first_pass).is_none(), "second pass finds nothing");
    }

    /// Inputs above the size cap are rejected without being parsed.
    #[test]
    fn oversized_input_bails() {
        let padding = " ".repeat(MAX_INPUT_BYTES);
        let oversized = format!("{{\"a\":\"{}\"}}", padding);
        assert!(compact_json(&oversized).is_none());
    }
}