Skip to main content

zeph_memory/document/
splitter.rs

1use super::types::{Chunk, Document};
2
/// Tuning knobs for [`TextSplitter`].
///
/// `PartialEq`/`Eq` are derived so configurations can be compared directly
/// (for example in tests or when detecting a config change).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitterConfig {
    /// Target upper bound for the length of one chunk.
    ///
    /// The character splitter counts `char`s; the sentence merger compares
    /// against `String::len`, i.e. UTF-8 bytes.
    pub chunk_size: usize,
    /// How much trailing content of one chunk may be repeated at the start of
    /// the next chunk.
    pub chunk_overlap: usize,
    /// When `true`, text is cut at sentence/paragraph boundaries and the
    /// sentences are merged into chunks; when `false`, fixed-size character
    /// windows are used instead.
    pub sentence_aware: bool,
}
9
10impl Default for SplitterConfig {
11    fn default() -> Self {
12        Self {
13            chunk_size: 1000,
14            chunk_overlap: 200,
15            sentence_aware: true,
16        }
17    }
18}
19
20pub struct TextSplitter {
21    config: SplitterConfig,
22}
23
24impl TextSplitter {
25    #[must_use]
26    pub fn new(config: SplitterConfig) -> Self {
27        Self { config }
28    }
29
30    #[must_use]
31    pub fn split(&self, document: &Document) -> Vec<Chunk> {
32        let text = &document.content;
33        if text.is_empty() {
34            return Vec::new();
35        }
36
37        let pieces = if self.config.sentence_aware {
38            split_sentences(text)
39        } else {
40            split_chars(text, self.config.chunk_size, self.config.chunk_overlap)
41        };
42
43        if self.config.sentence_aware {
44            let chunks =
45                merge_sentences(&pieces, self.config.chunk_size, self.config.chunk_overlap);
46            chunks
47                .into_iter()
48                .enumerate()
49                .map(|(i, content)| Chunk {
50                    content,
51                    metadata: document.metadata.clone(),
52                    chunk_index: i,
53                })
54                .collect()
55        } else {
56            pieces
57                .into_iter()
58                .enumerate()
59                .map(|(i, content)| Chunk {
60                    content,
61                    metadata: document.metadata.clone(),
62                    chunk_index: i,
63                })
64                .collect()
65        }
66    }
67}
68
/// Breaks `text` into sentence-like pieces without discarding any non-blank text.
///
/// A piece ends either
/// * right after a pair of consecutive newlines (both newlines stay in the
///   piece), or
/// * right after `.`, `?` or `!` when the next character is a space (the space
///   becomes the first character of the following piece).
///
/// Pieces that contain only whitespace are never emitted on their own: a blank
/// run before a paragraph break is carried into the next piece, and a blank
/// tail at the end of the text is dropped.
fn split_sentences(text: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut buf = String::new();
    let mut stream = text.chars().peekable();

    while let Some(ch) = stream.next() {
        buf.push(ch);

        match ch {
            // Paragraph break: swallow the second newline into this piece.
            '\n' if stream.peek() == Some(&'\n') => {
                stream.next();
                buf.push('\n');
                if !buf.trim().is_empty() {
                    pieces.push(std::mem::take(&mut buf));
                }
            }
            // Sentence terminator followed by a space. `buf` ends with the
            // terminator here, so it can never be blank.
            '.' | '?' | '!' if stream.peek() == Some(&' ') => {
                pieces.push(std::mem::take(&mut buf));
            }
            _ => {}
        }
    }

    if !buf.trim().is_empty() {
        pieces.push(buf);
    }

    pieces
}
105
/// Greedily packs consecutive sentences into chunks with sentence-level overlap.
///
/// Lengths are compared using `String::len`, i.e. UTF-8 bytes. A chunk is
/// closed as soon as appending the next sentence would push a non-empty
/// buffer past `chunk_size`. The next chunk is then seeded with as many of the
/// most recent sentences of the closed chunk as fit into `chunk_overlap`
/// (whole sentences only), followed by the sentence that triggered the flush.
/// A single sentence longer than `chunk_size` is emitted as-is.
fn merge_sentences(sentences: &[String], chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
    let mut merged = Vec::new();
    let mut buf = String::new();
    // Index of the first sentence currently contained in `buf`.
    let mut first_in_buf = 0;

    for (idx, sentence) in sentences.iter().enumerate() {
        let would_overflow = !buf.is_empty() && buf.len() + sentence.len() > chunk_size;

        if would_overflow {
            // Hand the finished chunk over and start again with an empty buffer.
            merged.push(std::mem::take(&mut buf));

            // Walk backwards through the sentences of the finished chunk and
            // keep the longest tail that fits into the overlap budget.
            let mut carried = 0;
            let mut keep_from = idx;
            while keep_from > first_in_buf {
                let candidate = sentences[keep_from - 1].len();
                if carried + candidate > chunk_overlap {
                    break;
                }
                carried += candidate;
                keep_from -= 1;
            }

            buf.extend(sentences[keep_from..idx].iter().map(String::as_str));
            first_in_buf = keep_from;
        }

        buf.push_str(sentence);
    }

    if !buf.is_empty() {
        merged.push(buf);
    }

    merged
}
143
/// Cuts `text` into windows of at most `chunk_size` characters.
///
/// Consecutive windows start `chunk_size - overlap` characters apart, so each
/// window repeats the last `overlap` characters of its predecessor. Sizes are
/// counted in `char`s, never bytes, so multi-byte characters are not split.
///
/// Degenerate settings are clamped instead of misbehaving:
/// * `chunk_size == 0` is treated as `1`, so no empty chunks are produced;
/// * `overlap >= chunk_size` still advances by one character per window.
///
/// Splitting stops as soon as a window reaches the end of the text. This
/// avoids emitting trailing chunks that are merely suffixes of the previous
/// chunk (e.g. a text that already fits in one window yields exactly one chunk).
fn split_chars(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chunk_size = chunk_size.max(1);
    let step = chunk_size.saturating_sub(overlap).max(1);
    let chars: Vec<char> = text.chars().collect();

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < chars.len() {
        // Saturating: a huge `chunk_size` must not overflow the index math.
        let end = start.saturating_add(chunk_size).min(chars.len());
        chunks.push(chars[start..end].iter().collect());

        if end == chars.len() {
            // Everything up to the end is covered; further windows would only
            // repeat the tail of this one.
            break;
        }
        // Cannot overflow: `step <= chunk_size` and `start + chunk_size < len`.
        start += step;
    }

    chunks
}
158
// Unit and property tests for the splitter: example-based checks for each
// splitting mode followed by proptest invariants.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use super::*;
    use crate::document::types::DocumentMetadata;

    /// Builds a plain-text test document with fixed metadata around `content`.
    fn make_doc(content: &str) -> Document {
        Document {
            content: content.to_owned(),
            metadata: DocumentMetadata {
                source: "test".to_owned(),
                content_type: "text/plain".to_owned(),
                extra: HashMap::new(),
            },
        }
    }

    #[test]
    fn empty_document() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc(""));
        assert!(chunks.is_empty());
    }

    #[test]
    fn single_small_chunk() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc("Hello world."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_index, 0);
    }

    #[test]
    fn sentence_aware_splitting() {
        let text = "First sentence. Second sentence. Third sentence.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 20,
            chunk_overlap: 5,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i);
        }
    }

    #[test]
    fn char_splitting_with_overlap() {
        let text = "abcdefghijklmnopqrstuvwxyz";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 10,
            chunk_overlap: 3,
            sentence_aware: false,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        // Verify overlap: end of chunk N overlaps with start of chunk N+1
        assert_eq!(&chunks[0].content[7..10], &chunks[1].content[..3]);
    }

    #[test]
    fn metadata_preserved() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let doc = make_doc("Some content.");
        let chunks = splitter.split(&doc);
        assert_eq!(chunks[0].metadata.source, "test");
    }

    #[test]
    fn paragraph_break_splitting() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let sentences = super::split_sentences(text);
        assert_eq!(sentences.len(), 2);
    }

    #[test]
    fn document_smaller_than_chunk_size() {
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 1000,
            chunk_overlap: 100,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc("Short text."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text.");
    }

    #[test]
    fn single_sentence_no_trailing_space() {
        let sentences = super::split_sentences("Hello world");
        assert_eq!(sentences.len(), 1);
        assert_eq!(sentences[0], "Hello world");
    }

    #[test]
    fn char_split_no_overlap() {
        let chunks = super::split_chars("abcdefghij", 5, 0);
        assert_eq!(chunks, vec!["abcde", "fghij"]);
    }

    #[test]
    fn char_split_full_overlap_makes_progress() {
        // overlap >= chunk_size should still make progress (step = max(1, 0))
        let chunks = super::split_chars("abcde", 3, 3);
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0], "abc");
    }

    #[test]
    fn sentence_aware_overlap_includes_previous() {
        let text = "A. B. C. D. E.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 5,
            chunk_overlap: 3,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        // Pin the exact chunks so the overlap itself is verified: every chunk
        // after the first starts with the last sentence of its predecessor.
        let contents: Vec<&str> = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(contents, vec!["A. B.", " B. C.", " C. D.", " D. E."]);
    }

    #[test]
    fn question_mark_splits_sentence() {
        let sentences = super::split_sentences("Is this a question? Yes it is.");
        assert_eq!(sentences.len(), 2);
    }

    #[test]
    fn exclamation_splits_sentence() {
        let sentences = super::split_sentences("Wow! Amazing.");
        assert_eq!(sentences.len(), 2);
    }

    mod proptest_splitter {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            #[test]
            fn split_never_panics(
                content in "\\PC{0,5000}",
                chunk_size in 1usize..2000,
                chunk_overlap in 0usize..500,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let _ = splitter.split(&doc);
            }

            #[test]
            fn chunks_cover_all_content(
                content in "[a-z ]{10,500}",
                chunk_size in 10usize..200,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware: false,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                if !content.is_empty() {
                    prop_assert!(!chunks.is_empty());
                }

                let total_chars: usize = chunks.iter().map(|c| c.content.len()).sum();
                prop_assert!(total_chars >= content.len());
            }

            #[test]
            fn chunk_indices_sequential(
                content in "[a-z. ]{10,1000}",
                chunk_size in 5usize..100,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for (i, chunk) in chunks.iter().enumerate() {
                    prop_assert_eq!(chunk.chunk_index, i);
                }
            }

            #[test]
            fn no_empty_chunks(
                content in "[a-z. !?]{1,500}",
                chunk_size in 1usize..200,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for chunk in &chunks {
                    prop_assert!(!chunk.content.is_empty());
                }
            }
        }
    }
}