cognis_rag/splitters/
json_splitter.rs1use crate::document::Document;
7
8use super::{child_doc, TextSplitter};
9
/// Splitter that breaks a JSON document into chunks along its top-level
/// structure (array elements or object entries).
#[derive(Debug, Clone)]
pub struct JsonSplitter {
    /// Target maximum chunk length, measured in characters.
    ///
    /// A single serialized element longer than this is not cut; it is emitted
    /// whole as its own chunk.
    chunk_size: usize,
}
17
18impl Default for JsonSplitter {
19 fn default() -> Self {
20 Self { chunk_size: 1000 }
21 }
22}
23
24impl JsonSplitter {
25 pub fn new() -> Self {
27 Self::default()
28 }
29 pub fn with_chunk_size(mut self, n: usize) -> Self {
31 self.chunk_size = n;
32 self
33 }
34}
35
36impl TextSplitter for JsonSplitter {
37 fn split(&self, doc: &Document) -> Vec<Document> {
38 let value: serde_json::Value = match serde_json::from_str(&doc.content) {
39 Ok(v) => v,
40 Err(_) => {
41 return vec![child_doc(doc, doc.content.clone(), 0)];
43 }
44 };
45
46 let pieces = match value {
47 serde_json::Value::Array(items) => items
48 .into_iter()
49 .map(|v| serde_json::to_string(&v).unwrap_or_default())
50 .collect::<Vec<_>>(),
51 serde_json::Value::Object(map) => map
52 .into_iter()
53 .map(|(k, v)| {
54 serde_json::to_string(&serde_json::json!({ k: v })).unwrap_or_default()
55 })
56 .collect::<Vec<_>>(),
57 other => vec![serde_json::to_string(&other).unwrap_or_default()],
58 };
59
60 let mut chunks: Vec<String> = Vec::new();
64 let mut buf = String::new();
65 for p in pieces {
66 let plen = p.chars().count();
67 if plen > self.chunk_size {
69 if !buf.is_empty() {
70 chunks.push(std::mem::take(&mut buf));
71 }
72 chunks.push(p);
73 continue;
74 }
75 if !buf.is_empty() && buf.chars().count() + plen + 1 > self.chunk_size {
76 chunks.push(std::mem::take(&mut buf));
77 }
78 if !buf.is_empty() {
79 buf.push('\n');
80 }
81 buf.push_str(&p);
82 }
83 if !buf.is_empty() {
84 chunks.push(buf);
85 }
86
87 chunks
88 .into_iter()
89 .enumerate()
90 .map(|(i, c)| child_doc(doc, c, i))
91 .collect()
92 }
93}
94
#[cfg(test)]
mod tests {
    use super::*;

    /// Array elements are the unit of splitting; adjacent elements share a
    /// chunk only while they fit (with a newline separator) in `chunk_size`.
    #[test]
    fn array_root_one_chunk_per_element() {
        let doc = Document::new(r#"[{"a":1}, {"b":2}, {"c":3}]"#);
        let chunks = JsonSplitter::new().with_chunk_size(15).split(&doc);
        // Each element serializes to 7 chars; two plus the separator are
        // exactly 15, so the first two pack together and the third spills over.
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].content, "{\"a\":1}\n{\"b\":2}");
        assert_eq!(chunks[1].content, "{\"c\":3}");
    }

    /// Every object entry becomes its own single-key object; with a budget of
    /// 10 no two entries fit together.
    #[test]
    fn object_root_one_chunk_per_pair() {
        let doc = Document::new(r#"{"a":1, "b":2, "c":3}"#);
        let chunks = JsonSplitter::new().with_chunk_size(10).split(&doc);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].content, r#"{"a":1}"#);
        assert_eq!(chunks[1].content, r#"{"b":2}"#);
        assert_eq!(chunks[2].content, r#"{"c":3}"#);
    }

    /// An element longer than the budget is emitted whole, and the elements
    /// around it keep their original order.
    #[test]
    fn oversized_element_gets_its_own_chunk() {
        let doc = Document::new(r#"[1, "aaaaaaaaaa", 2]"#);
        let chunks = JsonSplitter::new().with_chunk_size(5).split(&doc);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].content, "1");
        assert_eq!(chunks[1].content, r#""aaaaaaaaaa""#);
        assert_eq!(chunks[2].content, "2");
    }

    /// A root that is neither an array nor an object is a single piece.
    #[test]
    fn scalar_root_is_single_chunk() {
        let doc = Document::new("42");
        let chunks = JsonSplitter::new().split(&doc);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "42");
    }

    /// Content that does not parse as JSON is passed through unchanged.
    #[test]
    fn invalid_json_falls_through() {
        let doc = Document::new("not json");
        let chunks = JsonSplitter::new().split(&doc);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "not json");
    }
}