1use super::types::{Chunk, Document};
5
/// Settings that control how [`TextSplitter`] cuts a document into chunks.
#[derive(Debug, Clone)]
pub struct SplitterConfig {
    /// Target chunk length.
    ///
    /// In character mode every chunk holds at most this many characters. In
    /// sentence-aware mode it is the threshold at which the chunk being built
    /// is closed before the next sentence is appended.
    pub chunk_size: usize,
    /// How much of the end of one chunk is repeated at the start of the next.
    ///
    /// In character mode the window start advances by `chunk_size - chunk_overlap`
    /// (never less than 1). In sentence-aware mode only whole trailing sentences
    /// whose combined length fits within this value are repeated.
    pub chunk_overlap: usize,
    /// `true` splits on sentence and paragraph boundaries and then merges the
    /// sentences into chunks; `false` uses fixed-size character windows.
    pub sentence_aware: bool,
}
12
13impl Default for SplitterConfig {
14 fn default() -> Self {
15 Self {
16 chunk_size: 1000,
17 chunk_overlap: 200,
18 sentence_aware: true,
19 }
20 }
21}
22
/// Splits a [`Document`] into [`Chunk`]s according to a [`SplitterConfig`].
pub struct TextSplitter {
    // Settings supplied at construction time and consulted on every `split` call.
    config: SplitterConfig,
}
26
27impl TextSplitter {
28 #[must_use]
29 pub fn new(config: SplitterConfig) -> Self {
30 Self { config }
31 }
32
33 #[must_use]
34 pub fn split(&self, document: &Document) -> Vec<Chunk> {
35 let text = &document.content;
36 if text.is_empty() {
37 return Vec::new();
38 }
39
40 let pieces = if self.config.sentence_aware {
41 split_sentences(text)
42 } else {
43 split_chars(text, self.config.chunk_size, self.config.chunk_overlap)
44 };
45
46 if self.config.sentence_aware {
47 let chunks =
48 merge_sentences(&pieces, self.config.chunk_size, self.config.chunk_overlap);
49 chunks
50 .into_iter()
51 .enumerate()
52 .map(|(i, content)| Chunk {
53 content,
54 metadata: document.metadata.clone(),
55 chunk_index: i,
56 })
57 .collect()
58 } else {
59 pieces
60 .into_iter()
61 .enumerate()
62 .map(|(i, content)| Chunk {
63 content,
64 metadata: document.metadata.clone(),
65 chunk_index: i,
66 })
67 .collect()
68 }
69 }
70}
71
/// Breaks `text` into sentence-like pieces without discarding any interior text.
///
/// A piece ends in one of two places:
/// * after `.`, `?` or `!` when the very next character is a space (the space
///   itself becomes the first character of the following piece);
/// * after a pair of consecutive newlines, both of which stay at the end of
///   the piece.
///
/// A piece is only emitted when it contains something other than whitespace;
/// otherwise the whitespace stays buffered and ends up at the front of the next
/// piece. A whitespace-only remainder at the end of `text` is dropped.
fn split_sentences(text: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    let mut buf = String::new();
    let mut stream = text.chars().peekable();

    while let Some(ch) = stream.next() {
        buf.push(ch);
        let upcoming = stream.peek().copied();

        match (ch, upcoming) {
            // Paragraph break: swallow the second newline into this piece too.
            ('\n', Some('\n')) => {
                stream.next();
                buf.push('\n');
                if !buf.trim().is_empty() {
                    pieces.push(std::mem::take(&mut buf));
                }
            }
            // Sentence terminator directly followed by a space.
            (mark, Some(' ')) if matches!(mark, '.' | '?' | '!') && !buf.trim().is_empty() => {
                pieces.push(std::mem::take(&mut buf));
            }
            _ => {}
        }
    }

    // Whatever is left is the final piece, unless it is only whitespace.
    if !buf.trim().is_empty() {
        pieces.push(buf);
    }

    pieces
}
108
/// Greedily packs consecutive `sentences` into chunks.
///
/// Sentences are appended to the chunk being built until adding the next one
/// would push it past `chunk_size`; the chunk is then emitted and a new one is
/// started. The new chunk is seeded with as many whole trailing sentences of
/// the emitted chunk as fit within `chunk_overlap`, taken from the end
/// backwards, so neighbouring chunks share context.
///
/// All lengths are counted in characters (`char`s), not bytes, so the limits
/// mean the same thing here as in the fixed-size character splitter.
///
/// A chunk can still exceed `chunk_size` when a single sentence is longer than
/// the limit, or when the carried-over overlap plus the next sentence is.
///
/// Returns the chunks in document order; an empty input yields an empty vector.
fn merge_sentences(sentences: &[String], chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
    // Count each sentence once up front; `chars().count()` is O(len).
    let lens: Vec<usize> = sentences.iter().map(|s| s.chars().count()).collect();

    let mut chunks = Vec::new();
    let mut current = String::new();
    // Character length of `current`, kept in sync so it is never recounted.
    let mut current_len = 0usize;
    // Index of the first sentence contained in `current`; the overlap search
    // never reaches back past it.
    let mut window_start = 0;

    for (idx, sentence) in sentences.iter().enumerate() {
        if !current.is_empty() && current_len + lens[idx] > chunk_size {
            // Hand the finished buffer over without cloning it.
            chunks.push(std::mem::take(&mut current));

            // Walk backwards from the sentence just before `idx`, collecting
            // whole sentences while they still fit in the overlap budget.
            let mut overlap_len = 0;
            let mut overlap_start = idx;
            for i in (window_start..idx).rev() {
                if overlap_len + lens[i] > chunk_overlap {
                    break;
                }
                overlap_len += lens[i];
                overlap_start = i;
            }

            for carried in &sentences[overlap_start..idx] {
                current.push_str(carried);
            }
            current_len = overlap_len;
            window_start = overlap_start;
        }

        current.push_str(sentence);
        current_len += lens[idx];
    }

    if !current.is_empty() {
        chunks.push(current);
    }

    chunks
}
146
/// Cuts `text` into sliding windows of at most `chunk_size` characters.
///
/// Consecutive windows start `chunk_size - overlap` characters apart, so each
/// window repeats the last `overlap` characters of the previous one. Windows
/// are measured in `char`s, which keeps multi-byte text intact.
///
/// Degenerate settings are tolerated rather than rejected:
/// * a `chunk_size` of 0 is treated as 1, so no chunk is ever empty;
/// * an `overlap` greater than or equal to the window size still advances by
///   one character per window;
/// * a very large `chunk_size` is clamped to the text length instead of
///   overflowing.
///
/// Splitting stops as soon as a window reaches the end of `text`; later windows
/// would only be suffixes of that last chunk.
///
/// Returns the windows in order; an empty `text` yields an empty vector.
fn split_chars(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let window = chunk_size.max(1);
    // `max(1)` guarantees forward progress when the overlap swallows the window.
    let step = window.saturating_sub(overlap).max(1);

    let mut chunks = Vec::new();
    let mut start = 0;

    while start < chars.len() {
        // Saturate so `start + window` cannot overflow for huge window sizes.
        let end = start.saturating_add(window).min(chars.len());
        chunks.push(chars[start..end].iter().collect::<String>());

        if end == chars.len() {
            // Everything up to the end of the text is already covered.
            break;
        }
        start += step;
    }

    chunks
}
161
// Unit and property tests for the splitter and its private helpers.
#[cfg(test)]
mod tests {
    use std::collections::HashMap;

    use super::*;
    use crate::document::types::DocumentMetadata;

    /// Builds a document holding `content` with fixed placeholder metadata
    /// (source "test", content type "text/plain", no extra entries).
    fn make_doc(content: &str) -> Document {
        Document {
            content: content.to_owned(),
            metadata: DocumentMetadata {
                source: "test".to_owned(),
                content_type: "text/plain".to_owned(),
                extra: HashMap::new(),
            },
        }
    }

    /// An empty document produces no chunks at all.
    #[test]
    fn empty_document() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc(""));
        assert!(chunks.is_empty());
    }

    /// Text well under the default chunk size comes back as a single chunk with index 0.
    #[test]
    fn single_small_chunk() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let chunks = splitter.split(&make_doc("Hello world."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_index, 0);
    }

    /// Sentence-aware mode with a small chunk size yields several chunks whose
    /// indices match their position.
    #[test]
    fn sentence_aware_splitting() {
        let text = "First sentence. Second sentence. Third sentence.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 20,
            chunk_overlap: 5,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i);
        }
    }

    /// Character mode: the last three characters of the first chunk are the
    /// first three of the second, matching the configured overlap of 3.
    #[test]
    fn char_splitting_with_overlap() {
        let text = "abcdefghijklmnopqrstuvwxyz";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 10,
            chunk_overlap: 3,
            sentence_aware: false,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        assert_eq!(&chunks[0].content[7..10], &chunks[1].content[..3]);
    }

    /// The document's metadata is copied onto the chunks.
    #[test]
    fn metadata_preserved() {
        let splitter = TextSplitter::new(SplitterConfig::default());
        let doc = make_doc("Some content.");
        let chunks = splitter.split(&doc);
        assert_eq!(chunks[0].metadata.source, "test");
    }

    /// A blank line (two newlines) separates the text into two pieces.
    #[test]
    fn paragraph_break_splitting() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let sentences = super::split_sentences(text);
        assert_eq!(sentences.len(), 2);
    }

    /// A document shorter than the chunk size is returned unchanged as one chunk.
    #[test]
    fn document_smaller_than_chunk_size() {
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 1000,
            chunk_overlap: 100,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc("Short text."));
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text.");
    }

    /// Text without any terminator is still returned as a single piece.
    #[test]
    fn single_sentence_no_trailing_space() {
        let sentences = super::split_sentences("Hello world");
        assert_eq!(sentences.len(), 1);
        assert_eq!(sentences[0], "Hello world");
    }

    /// With zero overlap the windows tile the input exactly.
    #[test]
    fn char_split_no_overlap() {
        let chunks = super::split_chars("abcdefghij", 5, 0);
        assert_eq!(chunks, vec!["abcde", "fghij"]);
    }

    /// An overlap equal to the chunk size must still terminate and start with
    /// the first full window.
    #[test]
    fn char_split_full_overlap_makes_progress() {
        let chunks = super::split_chars("abcde", 3, 3);
        assert!(!chunks.is_empty());
        assert_eq!(chunks[0], "abc");
    }

    /// Sentence-aware splitting with overlap yields more than one chunk.
    /// NOTE: this only checks that the second chunk is non-empty; it does not
    /// inspect the overlapping text itself.
    #[test]
    fn sentence_aware_overlap_includes_previous() {
        let text = "A. B. C. D. E.";
        let splitter = TextSplitter::new(SplitterConfig {
            chunk_size: 5,
            chunk_overlap: 3,
            sentence_aware: true,
        });
        let chunks = splitter.split(&make_doc(text));
        assert!(chunks.len() > 1);
        if chunks.len() >= 2 {
            assert!(!chunks[1].content.is_empty());
        }
    }

    /// `?` followed by a space ends a sentence.
    #[test]
    fn question_mark_splits_sentence() {
        let sentences = super::split_sentences("Is this a question? Yes it is.");
        assert_eq!(sentences.len(), 2);
    }

    /// `!` followed by a space ends a sentence.
    #[test]
    fn exclamation_splits_sentence() {
        let sentences = super::split_sentences("Wow! Amazing.");
        assert_eq!(sentences.len(), 2);
    }

    // Property-based tests driven by proptest, 1000 cases per property.
    mod proptest_splitter {
        use super::*;
        use proptest::prelude::*;

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(1000))]

            // Splitting arbitrary generated content with arbitrary settings in
            // either mode must not panic.
            #[test]
            fn split_never_panics(
                content in "\\PC{0,5000}",
                chunk_size in 1usize..2000,
                chunk_overlap in 0usize..500,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let _ = splitter.split(&doc);
            }

            // Character mode without overlap: the chunks together are at least
            // as long as the input, so no content is lost.
            #[test]
            fn chunks_cover_all_content(
                content in "[a-z ]{10,500}",
                chunk_size in 10usize..200,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware: false,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                if !content.is_empty() {
                    prop_assert!(!chunks.is_empty());
                }

                let total_chars: usize = chunks.iter().map(|c| c.content.len()).sum();
                prop_assert!(total_chars >= content.len());
            }

            // In both modes `chunk_index` counts up from zero without gaps.
            #[test]
            fn chunk_indices_sequential(
                content in "[a-z. ]{10,1000}",
                chunk_size in 5usize..100,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for (i, chunk) in chunks.iter().enumerate() {
                    prop_assert_eq!(chunk.chunk_index, i);
                }
            }

            // In both modes, with a chunk size of at least 1, no chunk is empty.
            #[test]
            fn no_empty_chunks(
                content in "[a-z. !?]{1,500}",
                chunk_size in 1usize..200,
                sentence_aware in proptest::bool::ANY,
            ) {
                let splitter = TextSplitter::new(SplitterConfig {
                    chunk_size,
                    chunk_overlap: 0,
                    sentence_aware,
                });
                let doc = make_doc(&content);
                let chunks = splitter.split(&doc);

                for chunk in &chunks {
                    prop_assert!(!chunk.content.is_empty());
                }
            }
        }
    }
}