Skip to main content

sqz_engine/
json_projection.rs

//! Schema-Aware JSON Projection — strips JSON down to the fields that are
//! relevant to the current context.
//!
//! Unlike `strip_nulls` (removes null values) or `keep_fields` (requires
//! an explicit field list), projection automatically identifies and removes
//! low-value fields based on naming and content patterns:
//!
//! - Internal/debug fields: `_id`, `__v`, `debug_*`, `internal_*`, `trace_*`, `x_*`
//! - Metadata bloat: `etag`, `_links`, `_embedded`, `cursor`, `request_id`,
//!   `correlation_id`
//! - Redundant timestamps: keeps `created_at`, drops `modified_at` if same day
//! - Empty values: `[]`, `{}`, `""`
//! - Deeply nested objects: anything at or beyond the configured depth
//!   threshold is collapsed into a one-line key-count summary
//!
//! The projection is conservative — it only removes fields that are
//! demonstrably low-value for LLM comprehension.
16
17use crate::error::Result;
18
/// Configuration for JSON projection.
///
/// All fields are public, so callers can start from
/// [`ProjectionConfig::default`] and override individual knobs with
/// struct-update syntax (`ProjectionConfig { max_depth: 3, ..Default::default() }`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProjectionConfig {
    /// Remove fields whose name starts with one of these prefixes
    /// (case-insensitive). Prefix matching alone never removes a field whose
    /// name is exactly equal to the prefix; only strictly longer names match.
    pub strip_prefixes: Vec<String>,
    /// Remove fields matching these exact names (case-insensitive).
    pub strip_names: Vec<String>,
    /// Maximum nesting depth to preserve. A non-empty object found at this
    /// depth or deeper has its contents replaced by a single
    /// `"_sqz_summary": "{...N keys}"` entry. Default: 5.
    pub max_depth: usize,
    /// Remove fields whose value is an empty array, an empty object, or an
    /// empty string. Default: true.
    pub strip_empty: bool,
    /// Collapse timestamp-like fields (`*_at`, `*_date`, `*_time`) that fall
    /// on the same calendar day, preferring to keep a field whose name
    /// contains `created`. Default: true.
    pub dedup_timestamps: bool,
}

impl Default for ProjectionConfig {
    /// Build the stock configuration: strip internal/debug prefixes and a
    /// short list of well-known metadata names, keep five levels of nesting,
    /// and enable both empty-value stripping and timestamp deduplication.
    fn default() -> Self {
        fn owned(items: &[&str]) -> Vec<String> {
            items.iter().map(|item| (*item).to_owned()).collect()
        }

        Self {
            strip_prefixes: owned(&["_", "debug", "internal", "trace", "x_"]),
            // Some of these names are also covered by the default prefixes
            // (`_`, `x_`); listing them explicitly means they are still
            // removed when a caller replaces `strip_prefixes`.
            strip_names: owned(&[
                "__v",
                "__typename",
                "etag",
                "_links",
                "_embedded",
                "cursor",
                "request_id",
                "x_request_id",
                "correlation_id",
            ]),
            max_depth: 5,
            strip_empty: true,
            dedup_timestamps: true,
        }
    }
}
62
/// Result of JSON projection.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProjectionResult {
    /// The projected JSON string, or the caller's input verbatim when the
    /// input was not valid JSON or projection changed nothing.
    pub data: String,
    /// Number of fields removed. Collapsing a too-deep object counts every
    /// key it contained.
    pub fields_removed: usize,
    /// Estimated tokens saved (a size-based heuristic, not a tokenizer count).
    pub tokens_saved: u32,
}
73
74/// Apply schema-aware projection to a JSON string.
75///
76/// Returns the projected JSON and stats, or the original string unchanged
77/// if it's not valid JSON or projection doesn't help.
78pub fn project_json(input: &str, config: &ProjectionConfig) -> Result<ProjectionResult> {
79    let trimmed = input.trim();
80    let mut value: serde_json::Value = match serde_json::from_str(trimmed) {
81        Ok(v) => v,
82        Err(_) => {
83            return Ok(ProjectionResult {
84                data: input.to_string(),
85                fields_removed: 0,
86                tokens_saved: 0,
87            });
88        }
89    };
90
91    let original_tokens = estimate_tokens(input);
92    let mut fields_removed = 0;
93
94    project_value(&mut value, config, 0, &mut fields_removed);
95
96    let projected = serde_json::to_string(&value)
97        .unwrap_or_else(|_| input.to_string());
98
99    let projected_tokens = estimate_tokens(&projected);
100    let tokens_saved = original_tokens.saturating_sub(projected_tokens);
101
102    // Only return projected version if it's actually smaller or fields were removed
103    if projected.len() < input.len() || fields_removed > 0 {
104        Ok(ProjectionResult {
105            data: projected,
106            fields_removed,
107            tokens_saved,
108        })
109    } else {
110        Ok(ProjectionResult {
111            data: input.to_string(),
112            fields_removed: 0,
113            tokens_saved: 0,
114        })
115    }
116}
117
118/// Recursively project a JSON value, removing low-value fields.
119fn project_value(
120    value: &mut serde_json::Value,
121    config: &ProjectionConfig,
122    depth: usize,
123    removed: &mut usize,
124) {
125    match value {
126        serde_json::Value::Object(map) => {
127            // At max depth, replace deep objects with a summary
128            if depth >= config.max_depth {
129                let key_count = map.len();
130                if key_count > 0 {
131                    map.clear();
132                    map.insert(
133                        "_sqz_summary".to_string(),
134                        serde_json::Value::String(format!("{{...{key_count} keys}}")),
135                    );
136                    *removed += key_count;
137                }
138                return;
139            }
140
141            let keys_to_remove: Vec<String> = map
142                .keys()
143                .filter(|k| should_strip_field(k, config))
144                .cloned()
145                .collect();
146
147            for key in &keys_to_remove {
148                map.remove(key);
149                *removed += 1;
150            }
151
152            // Strip empty collections
153            if config.strip_empty {
154                let empty_keys: Vec<String> = map
155                    .iter()
156                    .filter(|(_, v)| is_empty_collection(v))
157                    .map(|(k, _)| k.clone())
158                    .collect();
159                for key in &empty_keys {
160                    map.remove(key);
161                    *removed += 1;
162                }
163            }
164
165            // Dedup timestamps: if multiple timestamp fields exist on the same
166            // day, keep only the most descriptive one
167            if config.dedup_timestamps {
168                dedup_timestamps(map, removed);
169            }
170
171            // Recurse into remaining values
172            for v in map.values_mut() {
173                project_value(v, config, depth + 1, removed);
174            }
175        }
176        serde_json::Value::Array(arr) => {
177            for item in arr.iter_mut() {
178                project_value(item, config, depth + 1, removed);
179            }
180        }
181        _ => {}
182    }
183}
184
185/// Check if a field name should be stripped based on the config.
186fn should_strip_field(name: &str, config: &ProjectionConfig) -> bool {
187    let lower = name.to_lowercase();
188
189    // Check exact name matches
190    for strip_name in &config.strip_names {
191        if lower == strip_name.to_lowercase() {
192            return true;
193        }
194    }
195
196    // Check prefix matches
197    for prefix in &config.strip_prefixes {
198        let prefix_lower = prefix.to_lowercase();
199        if lower.starts_with(&prefix_lower) && lower != prefix_lower {
200            // Don't strip if the field IS the prefix (e.g., don't strip "id" for prefix "_")
201            // But DO strip "_id", "debug_info", etc.
202            return true;
203        }
204    }
205
206    false
207}
208
209/// Check if a value is an empty collection.
210fn is_empty_collection(value: &serde_json::Value) -> bool {
211    match value {
212        serde_json::Value::Array(arr) => arr.is_empty(),
213        serde_json::Value::Object(map) => map.is_empty(),
214        serde_json::Value::String(s) => s.is_empty(),
215        _ => false,
216    }
217}
218
219/// Remove redundant timestamp fields. If multiple fields end in `_at` or `_date`
220/// and have the same date prefix (YYYY-MM-DD), keep only the first one.
221fn dedup_timestamps(
222    map: &mut serde_json::Map<String, serde_json::Value>,
223    removed: &mut usize,
224) {
225    let timestamp_fields: Vec<(String, String)> = map
226        .iter()
227        .filter_map(|(k, v)| {
228            if (k.ends_with("_at") || k.ends_with("_date") || k.ends_with("_time"))
229                && v.is_string()
230            {
231                let date_prefix = v.as_str().unwrap_or("").chars().take(10).collect::<String>();
232                if date_prefix.len() == 10 && date_prefix.contains('-') {
233                    return Some((k.clone(), date_prefix));
234                }
235            }
236            None
237        })
238        .collect();
239
240    if timestamp_fields.len() <= 1 {
241        return;
242    }
243
244    // Group by date prefix
245    let mut seen_dates: std::collections::HashSet<String> = std::collections::HashSet::new();
246    let mut first_field_per_date: std::collections::HashMap<String, String> = std::collections::HashMap::new();
247    let mut to_remove = Vec::new();
248
249    for (field, date) in &timestamp_fields {
250        if seen_dates.contains(date) {
251            // This date was already seen — remove this field unless it's the "primary" one
252            let primary = first_field_per_date.get(date).unwrap();
253            // Keep the more descriptive field (created_at > updated_at > modified_at)
254            let dominated = if field.contains("created") {
255                // created_at is more important — remove the previous one
256                to_remove.push(primary.clone());
257                false
258            } else {
259                true
260            };
261            if dominated {
262                to_remove.push(field.clone());
263            }
264        } else {
265            seen_dates.insert(date.clone());
266            first_field_per_date.insert(date.clone(), field.clone());
267        }
268    }
269
270    for field in &to_remove {
271        map.remove(field);
272        *removed += 1;
273    }
274}
275
/// Rough token estimate: one token per four bytes of UTF-8 text, rounded up
/// and saturating at `u32::MAX`.
fn estimate_tokens(text: &str) -> u32 {
    let bytes = text.len();
    // Ceiling division without going through floating point.
    let quads = bytes / 4 + usize::from(bytes % 4 != 0);
    quads.min(u32::MAX as usize) as u32
}
279
280// ── Tests ─────────────────────────────────────────────────────────────────
281
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;
    use serde_json::json;

    /// Serialize `input`, project it with `config`, and return both the raw
    /// result and the re-parsed projected document.
    fn run(input: &serde_json::Value, config: &ProjectionConfig) -> (ProjectionResult, serde_json::Value) {
        let text = serde_json::to_string(input).unwrap();
        let result = project_json(&text, config).unwrap();
        let parsed = serde_json::from_str(&result.data).unwrap();
        (result, parsed)
    }

    #[test]
    fn test_strips_internal_fields() {
        let input = json!({
            "id": 1,
            "name": "Alice",
            "_id": "abc123",
            "__v": 3,
            "debug_info": "verbose stuff",
            "internal_state": "hidden"
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert!(parsed.get("id").is_some(), "id should be kept");
        assert!(parsed.get("name").is_some(), "name should be kept");
        assert!(parsed.get("_id").is_none(), "_id should be stripped");
        assert!(parsed.get("__v").is_none(), "__v should be stripped");
        assert!(parsed.get("debug_info").is_none(), "debug_info should be stripped");
        assert!(parsed.get("internal_state").is_none(), "internal_state should be stripped");
        assert_eq!(result.fields_removed, 4);
    }

    #[test]
    fn test_field_matching_is_case_insensitive() {
        let input = json!({
            "name": "Alice",
            "ETag": "abc",
            "Debug_Info": "verbose stuff"
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert!(parsed.get("name").is_some());
        assert!(parsed.get("ETag").is_none(), "exact names match case-insensitively");
        assert!(parsed.get("Debug_Info").is_none(), "prefixes match case-insensitively");
        assert_eq!(result.fields_removed, 2);
    }

    #[test]
    fn test_field_equal_to_prefix_is_kept() {
        let input = json!({
            "name": "Alice",
            "debug": true,
            "trace": "abc"
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert_eq!(parsed["debug"], true);
        assert_eq!(parsed["trace"], "abc");
        assert_eq!(result.fields_removed, 0);
    }

    #[test]
    fn test_strips_empty_collections() {
        let input = json!({
            "name": "Bob",
            "tags": [],
            "metadata": {},
            "bio": ""
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert!(parsed.get("name").is_some());
        assert!(parsed.get("tags").is_none(), "empty array should be stripped");
        assert!(parsed.get("metadata").is_none(), "empty object should be stripped");
        assert!(parsed.get("bio").is_none(), "empty string should be stripped");
        assert_eq!(result.fields_removed, 3);
    }

    #[test]
    fn test_max_depth_truncation() {
        let input = json!({
            "a": {"b": {"c": {"d": {"e": {"f": "deep"}}}}}
        });
        let config = ProjectionConfig {
            max_depth: 3,
            ..Default::default()
        };
        let (result, parsed) = run(&input, &config);
        // The root is depth 0, so the object stored under a -> b -> c sits at
        // depth 3 and must be replaced by a one-key summary.
        let at_depth = &parsed["a"]["b"]["c"];
        assert_eq!(
            at_depth["_sqz_summary"], "{...1 keys}",
            "deep nesting should be truncated at max_depth: {:?}", parsed
        );
        assert!(at_depth.get("d").is_none(), "original contents should be gone");
        assert_eq!(result.fields_removed, 1);
    }

    #[test]
    fn test_dedup_timestamps() {
        let input = json!({
            "name": "Alice",
            "created_at": "2024-01-15T10:00:00Z",
            "updated_at": "2024-01-15T14:30:00Z",
            "modified_at": "2024-01-15T14:30:00Z"
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        // All three fall on the same day, so only created_at survives.
        assert_eq!(parsed["created_at"], "2024-01-15T10:00:00Z");
        assert!(parsed.get("updated_at").is_none(), "same-day updated_at should be dropped");
        assert!(parsed.get("modified_at").is_none(), "same-day modified_at should be dropped");
        assert_eq!(result.fields_removed, 2);
    }

    #[test]
    fn test_keeps_timestamps_on_different_days() {
        let input = json!({
            "created_at": "2024-01-15T10:00:00Z",
            "updated_at": "2024-03-02T08:00:00Z"
        });
        let text = serde_json::to_string(&input).unwrap();
        let result = project_json(&text, &ProjectionConfig::default()).unwrap();
        assert_eq!(result.data, text, "nothing to project, input is returned as-is");
        assert_eq!(result.fields_removed, 0);
        assert_eq!(result.tokens_saved, 0);
    }

    #[test]
    fn test_non_json_passthrough() {
        let input = "not json at all";
        let config = ProjectionConfig::default();
        let result = project_json(input, &config).unwrap();
        assert_eq!(result.data, input);
        assert_eq!(result.fields_removed, 0);
        assert_eq!(result.tokens_saved, 0);
    }

    #[test]
    fn test_preserves_important_fields() {
        let input = json!({
            "id": 42,
            "name": "Alice",
            "email": "alice@example.com",
            "role": "admin",
            "status": "active"
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert_eq!(parsed["id"], 42);
        assert_eq!(parsed["name"], "Alice");
        assert_eq!(parsed["email"], "alice@example.com");
        assert_eq!(parsed["role"], "admin");
        assert_eq!(parsed["status"], "active");
        assert_eq!(result.fields_removed, 0);
    }

    #[test]
    fn test_custom_strip_prefixes() {
        let input = json!({
            "name": "Alice",
            "tmp_cache": "data",
            "tmp_buffer": "more data"
        });
        let config = ProjectionConfig {
            strip_prefixes: vec!["tmp_".to_string()],
            ..Default::default()
        };
        let (result, parsed) = run(&input, &config);
        assert!(parsed.get("name").is_some());
        assert!(parsed.get("tmp_cache").is_none());
        assert!(parsed.get("tmp_buffer").is_none());
        assert_eq!(result.fields_removed, 2);
    }

    #[test]
    fn test_nested_projection() {
        let input = json!({
            "user": {
                "id": 1,
                "name": "Alice",
                "_internal_id": "xyz",
                "debug_flags": [1, 2, 3]
            }
        });
        let (result, parsed) = run(&input, &ProjectionConfig::default());
        assert_eq!(parsed["user"]["id"], 1);
        assert_eq!(parsed["user"]["name"], "Alice");
        assert!(parsed["user"].get("_internal_id").is_none());
        assert!(parsed["user"].get("debug_flags").is_none());
        assert_eq!(result.fields_removed, 2);
    }

    proptest! {
        /// Projection never produces invalid JSON from valid JSON input.
        #[test]
        fn prop_projection_produces_valid_json(
            key1 in "[a-z]{3,10}",
            key2 in "[a-z]{3,10}",
            val in "[a-z0-9 ]{1,50}",
        ) {
            let input = format!(r#"{{"{key1}":"{val}","{key2}":42}}"#);
            let config = ProjectionConfig::default();
            let result = project_json(&input, &config).unwrap();
            let parsed: std::result::Result<serde_json::Value, _> = serde_json::from_str(&result.data);
            prop_assert!(parsed.is_ok(), "projection output must be valid JSON");
        }

        /// Internal fields are removed and counted regardless of the payload
        /// stored next to them, and the payload itself survives unchanged.
        #[test]
        fn prop_internal_fields_are_always_removed(
            val in "[a-z]{1,20}",
        ) {
            let input = format!(r#"{{"name":"{val}","_debug":"x","__v":1}}"#);
            let config = ProjectionConfig::default();
            let result = project_json(&input, &config).unwrap();
            let parsed: serde_json::Value = serde_json::from_str(&result.data).unwrap();
            prop_assert_eq!(result.fields_removed, 2);
            prop_assert_eq!(parsed["name"].as_str(), Some(val.as_str()));
            prop_assert!(parsed.get("_debug").is_none());
            prop_assert!(parsed.get("__v").is_none());
        }
    }
}