// cognis_rag/splitters/json_splitter.rs

//! JSON-aware splitter — splits on top-level array elements / object pairs.
//!
//! Operates on the *string form* of the JSON. Parses, then walks the
//! structure top-level-first and emits chunks that fit `chunk_size`.

use crate::document::Document;

use super::{child_doc, TextSplitter};

/// Splits a JSON document along its top-level structure.
///
/// Behaviour:
/// - Top-level array → one chunk per element (small ones are grouped).
/// - Top-level object → one chunk per key/value pair (small ones are grouped).
/// - Any other value → a single chunk for the whole thing.
pub struct JsonSplitter {
    // Soft cap (in characters) for a packed chunk; a single element longer
    // than this is still emitted whole rather than mangled.
    chunk_size: usize,
}

18impl Default for JsonSplitter {
19    fn default() -> Self {
20        Self { chunk_size: 1000 }
21    }
22}
23
24impl JsonSplitter {
25    /// Construct.
26    pub fn new() -> Self {
27        Self::default()
28    }
29    /// Cap chunk size.
30    pub fn with_chunk_size(mut self, n: usize) -> Self {
31        self.chunk_size = n;
32        self
33    }
34}
35
36impl TextSplitter for JsonSplitter {
37    fn split(&self, doc: &Document) -> Vec<Document> {
38        let value: serde_json::Value = match serde_json::from_str(&doc.content) {
39            Ok(v) => v,
40            Err(_) => {
41                // Not JSON — emit one chunk and let downstream cope.
42                return vec![child_doc(doc, doc.content.clone(), 0)];
43            }
44        };
45
46        let pieces = match value {
47            serde_json::Value::Array(items) => items
48                .into_iter()
49                .map(|v| serde_json::to_string(&v).unwrap_or_default())
50                .collect::<Vec<_>>(),
51            serde_json::Value::Object(map) => map
52                .into_iter()
53                .map(|(k, v)| {
54                    serde_json::to_string(&serde_json::json!({ k: v })).unwrap_or_default()
55                })
56                .collect::<Vec<_>>(),
57            other => vec![serde_json::to_string(&other).unwrap_or_default()],
58        };
59
60        // Pack pieces into chunks up to chunk_size. A single element
61        // larger than chunk_size still gets its own chunk (we'd rather
62        // emit one oversized chunk than mangle the JSON to hit the cap).
63        let mut chunks: Vec<String> = Vec::new();
64        let mut buf = String::new();
65        for p in pieces {
66            let plen = p.chars().count();
67            // Oversized element on its own: flush buf, emit p alone.
68            if plen > self.chunk_size {
69                if !buf.is_empty() {
70                    chunks.push(std::mem::take(&mut buf));
71                }
72                chunks.push(p);
73                continue;
74            }
75            if !buf.is_empty() && buf.chars().count() + plen + 1 > self.chunk_size {
76                chunks.push(std::mem::take(&mut buf));
77            }
78            if !buf.is_empty() {
79                buf.push('\n');
80            }
81            buf.push_str(&p);
82        }
83        if !buf.is_empty() {
84            chunks.push(buf);
85        }
86
87        chunks
88            .into_iter()
89            .enumerate()
90            .map(|(i, c)| child_doc(doc, c, i))
91            .collect()
92    }
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn array_root_one_chunk_per_element() {
        // Three ~7-char elements can't all share one 15-char chunk.
        let input = Document::new(r#"[{"a":1}, {"b":2}, {"c":3}]"#);
        let out = JsonSplitter::new().with_chunk_size(15).split(&input);
        assert!(out.len() >= 2);
    }

    #[test]
    fn object_root_one_chunk_per_pair() {
        let input = Document::new(r#"{"a":1, "b":2, "c":3}"#);
        let out = JsonSplitter::new().with_chunk_size(10).split(&input);
        assert!(out.len() >= 2);
    }

    #[test]
    fn invalid_json_falls_through() {
        let input = Document::new("not json");
        let out = JsonSplitter::new().split(&input);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].content, "not json");
    }
}