1use super::types::{Chunk, Document};
2
/// Settings that control how a document is cut into chunks.
///
/// The [`Default`] configuration uses a chunk size of 1000, an overlap of
/// 200 and sentence-aware splitting.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitterConfig {
    /// Target maximum length of a single chunk.
    ///
    /// In sentence-aware mode this is a soft limit: sentences are never cut,
    /// so a chunk may exceed it.
    pub chunk_size: usize,
    /// How much trailing content of one chunk may be repeated at the start of
    /// the next chunk.
    pub chunk_overlap: usize,
    /// When `true`, chunks are assembled from whole sentences; when `false`,
    /// the text is cut into fixed-size character windows.
    pub sentence_aware: bool,
}

impl Default for SplitterConfig {
    /// Returns the default configuration: `chunk_size` 1000,
    /// `chunk_overlap` 200, `sentence_aware` enabled.
    fn default() -> Self {
        Self {
            chunk_size: 1000,
            chunk_overlap: 200,
            sentence_aware: true,
        }
    }
}
19
20pub struct TextSplitter {
21 config: SplitterConfig,
22}
23
24impl TextSplitter {
25 #[must_use]
26 pub fn new(config: SplitterConfig) -> Self {
27 Self { config }
28 }
29
30 #[must_use]
31 pub fn split(&self, document: &Document) -> Vec<Chunk> {
32 let text = &document.content;
33 if text.is_empty() {
34 return Vec::new();
35 }
36
37 let pieces = if self.config.sentence_aware {
38 split_sentences(text)
39 } else {
40 split_chars(text, self.config.chunk_size, self.config.chunk_overlap)
41 };
42
43 if self.config.sentence_aware {
44 let chunks =
45 merge_sentences(&pieces, self.config.chunk_size, self.config.chunk_overlap);
46 chunks
47 .into_iter()
48 .enumerate()
49 .map(|(i, content)| Chunk {
50 content,
51 metadata: document.metadata.clone(),
52 chunk_index: i,
53 })
54 .collect()
55 } else {
56 pieces
57 .into_iter()
58 .enumerate()
59 .map(|(i, content)| Chunk {
60 content,
61 metadata: document.metadata.clone(),
62 chunk_index: i,
63 })
64 .collect()
65 }
66 }
67}
68
/// Breaks `text` into sentence-like pieces without dropping any characters
/// from the pieces it emits.
///
/// A piece ends:
/// * after a blank line (two consecutive `'\n'`), with both newlines kept at
///   the end of the piece, or
/// * after `.`, `?` or `!` when the next character is a space; the space
///   becomes the first character of the following piece.
///
/// Pieces consisting only of whitespace are never emitted on their own; the
/// whitespace stays in the buffer and is prepended to whatever follows.
/// Trailing text that is only whitespace is discarded.
fn split_sentences(text: &str) -> Vec<String> {
    let mut pieces = Vec::new();
    let mut buf = String::new();
    let mut stream = text.chars().peekable();

    while let Some(ch) = stream.next() {
        buf.push(ch);
        let upcoming = stream.peek().copied();

        if ch == '\n' && upcoming == Some('\n') {
            // Paragraph break: swallow the second newline into this piece.
            stream.next();
            buf.push('\n');
            if !buf.trim().is_empty() {
                pieces.push(std::mem::take(&mut buf));
            }
        } else if matches!(ch, '.' | '?' | '!')
            && upcoming == Some(' ')
            && !buf.trim().is_empty()
        {
            pieces.push(std::mem::take(&mut buf));
        }
    }

    // Whatever is left after the last boundary forms the final piece.
    if !buf.trim().is_empty() {
        pieces.push(buf);
    }

    pieces
}
105
/// Greedily packs `sentences` into chunks of roughly `chunk_size` characters.
///
/// Sentences are appended to the current chunk until adding the next one
/// would exceed `chunk_size`; the chunk is then emitted and a new one is
/// started. The new chunk is seeded with as many whole trailing sentences of
/// the previous chunk as fit within `chunk_overlap` characters.
///
/// Lengths are measured in characters (Unicode scalar values), not bytes, so
/// the limits mean the same thing here as in `split_chars`.
///
/// `chunk_size` is a soft limit: sentences are never cut, so a single long
/// sentence, or the carried-over overlap plus the next sentence, can produce
/// a chunk longer than `chunk_size`.
///
/// Returns the chunk texts in order; an empty input yields an empty vector.
fn merge_sentences(sentences: &[String], chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
    // Character count of every sentence, computed once up front.
    let lengths: Vec<usize> = sentences.iter().map(|s| s.chars().count()).collect();

    let mut chunks = Vec::new();
    let mut current = String::new();
    // Character length of `current`, tracked incrementally.
    let mut current_len = 0usize;
    // Index of the first sentence contained in `current`.
    let mut window_start = 0;

    for (idx, sentence) in sentences.iter().enumerate() {
        if !current.is_empty() && current_len + lengths[idx] > chunk_size {
            // Emit the full chunk without copying it.
            chunks.push(std::mem::take(&mut current));

            // Walk backwards through the chunk just emitted, collecting whole
            // sentences while they still fit in the overlap budget.
            let mut overlap_len = 0;
            let mut overlap_start = idx;
            for i in (window_start..idx).rev() {
                if overlap_len + lengths[i] > chunk_overlap {
                    break;
                }
                overlap_len += lengths[i];
                overlap_start = i;
            }

            for carried in &sentences[overlap_start..idx] {
                current.push_str(carried);
            }
            current_len = overlap_len;
            window_start = overlap_start;
        }

        current.push_str(sentence);
        current_len += lengths[idx];
    }

    if !current.is_empty() {
        chunks.push(current);
    }

    chunks
}
143
/// Cuts `text` into windows of at most `chunk_size` characters, where
/// consecutive windows share `overlap` characters.
///
/// Sizes are measured in characters (Unicode scalar values), so a window
/// never cuts through a multi-byte character.
///
/// * A `chunk_size` of 0 is treated as 1, so no empty chunks are produced.
/// * The window advances by `chunk_size - overlap` characters, but always by
///   at least one, so an overlap greater than or equal to `chunk_size` still
///   makes progress.
/// * Splitting stops as soon as a window reaches the end of the text; no
///   further chunks consisting only of already-covered overlap are emitted.
///
/// Returns the windows in order; an empty `text` yields an empty vector.
fn split_chars(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let size = chunk_size.max(1);
    let step = size.saturating_sub(overlap).max(1);
    let chars: Vec<char> = text.chars().collect();

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < chars.len() {
        // Saturate so a huge `chunk_size` cannot overflow the index arithmetic.
        let end = start.saturating_add(size).min(chars.len());
        chunks.push(chars[start..end].iter().collect());

        // The text is fully covered; any later window would only repeat
        // the tail of this one.
        if end == chars.len() {
            break;
        }
        start += step;
    }

    chunks
}
158
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use super::*;
    use crate::document::types::DocumentMetadata;

    /// Builds a plain-text test document with fixed metadata.
    fn make_doc(content: &str) -> Document {
        Document {
            content: content.to_owned(),
            metadata: DocumentMetadata {
                source: "test".to_owned(),
                content_type: "text/plain".to_owned(),
                extra: HashMap::new(),
            },
        }
    }

    #[test]
    fn empty_document() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc(""));
        assert!(chunks.is_empty());
    }

    #[test]
    fn single_small_chunk() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc("Hello world."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_index, 0);
    }

    #[test]
    fn sentence_aware_splitting() {
        let text = "First sentence. Second sentence. Third sentence.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 20,
            chunk_overlap: 5,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i);
        }
    }

    #[test]
    fn char_splitting_with_overlap() {
        let text = "abcdefghijklmnopqrstuvwxyz";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 10,
            chunk_overlap: 3,
            sentence_aware: false,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        // The last three characters of one window open the next window.
        assert_eq!(&chunks[0].content[7..10], &chunks[1].content[..3]);
    }

    #[test]
    fn metadata_preserved() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let doc = make_doc("Some content.");
        let chunks = splitter.split(&doc);
        assert_eq!(chunks[0].metadata.source, "test");
    }

    #[test]
    fn paragraph_break_splitting() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let sentences = split_sentences(text);
        assert_eq!(sentences.len(), 2);
    }

    #[test]
    fn document_smaller_than_chunk_size() {
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 1000,
            chunk_overlap: 100,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc("Short text."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text.");
    }

    #[test]
    fn single_sentence_no_trailing_space() {
        let sentences = split_sentences("Hello world");
        assert_eq!(sentences.len(), 1);
        assert_eq!(sentences[0], "Hello world");
    }

    #[test]
    fn char_split_no_overlap() {
        let chunks = split_chars("abcdefghij", 5, 0);
        assert_eq!(chunks, vec!["abcde", "fghij"]);
    }

    #[test]
    fn char_split_full_overlap_makes_progress() {
        // Overlap equal to the chunk size must not stall the window.
        let chunks = split_chars("abcde", 3, 3);
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0], "abc");
    }

    #[test]
    fn sentence_aware_overlap_includes_previous() {
        let text = "A. B. C. D. E.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 5,
            chunk_overlap: 3,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        // The last sentence of the first chunk fits the overlap budget, so it
        // must be repeated at the start of the second chunk.
        assert!(chunks[0].content.ends_with(" B."));
        assert!(chunks[1].content.starts_with(" B."));
    }

    #[test]
    fn question_mark_splits_sentence() {
        let sentences = split_sentences("Is this a question? Yes it is.");
        assert_eq!(sentences.len(), 2);
    }

    #[test]
    fn exclamation_splits_sentence() {
        let sentences = split_sentences("Wow! Amazing.");
        assert_eq!(sentences.len(), 2);
    }

    /// Property-based checks over randomly generated documents and settings.
    mod proptest_splitter {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            #[test]
            fn split_never_panics(
                content in "\\PC{0,5000}",
                chunk_size in 1usize..2000,
                chunk_overlap in 0usize..500,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let _ = splitter.split(&doc);
            }

            #[test]
            fn chunks_cover_all_content(
                content in "[a-z ]{10,500}",
                chunk_size in 10usize..200,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware: false,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                if !content.is_empty() {
                    prop_assert!(!chunks.is_empty());
                }

                // The generated content is ASCII, so byte length equals
                // character count here.
                let total_chars: usize = chunks.iter().map(|c| c.content.len()).sum();
                prop_assert!(total_chars >= content.len());
            }

            #[test]
            fn chunk_indices_sequential(
                content in "[a-z. ]{10,1000}",
                chunk_size in 5usize..100,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for (i, chunk) in chunks.iter().enumerate() {
                    prop_assert_eq!(chunk.chunk_index, i);
                }
            }

            #[test]
            fn no_empty_chunks(
                content in "[a-z. !?]{1,500}",
                chunk_size in 1usize..200,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for chunk in &chunks {
                    prop_assert!(!chunk.content.is_empty());
                }
            }
        }
    }
}