1use crate::errors::Result;
2
3pub trait TextChunker: Send + Sync {
4 fn chunk(&self, text: &str) -> Result<Vec<String>>;
5}
6
/// Chunker configuration based on a fixed number of words per chunk.
///
/// The sizes are counted in whitespace-separated words, not bytes or
/// characters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FixedSizeChunker {
    /// Maximum number of words placed in one chunk.
    chunk_size: usize,
    /// Number of words shared between one chunk and the next.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a chunker producing chunks of `chunk_size` words, with
    /// `overlap` words repeated between neighbouring chunks.
    ///
    /// The values are stored as given; no validation happens here. An
    /// `overlap` that is not smaller than `chunk_size` is only reported when
    /// the chunker is used.
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self { chunk_size, overlap }
    }
}

impl Default for FixedSizeChunker {
    /// Returns a chunker with 500-word chunks and a 50-word overlap.
    fn default() -> Self {
        Self::new(500, 50)
    }
}
23
24impl TextChunker for FixedSizeChunker {
25 fn chunk(&self, text: &str) -> Result<Vec<String>> {
26 if self.overlap >= self.chunk_size {
27 return Err(crate::errors::RagError::InvalidConfig(
28 "Overlap must be less than chunk size".to_string(),
29 ));
30 }
31
32 let words: Vec<&str> = text.split_whitespace().collect();
33 let mut chunks = Vec::new();
34
35 if words.is_empty() {
36 return Ok(chunks);
37 }
38
39 let mut start = 0;
40 while start < words.len() {
41 let end = (start + self.chunk_size).min(words.len());
42 let chunk = words[start..end].join(" ");
43 chunks.push(chunk);
44
45 start += self.chunk_size - self.overlap;
46 if start >= words.len() {
47 break;
48 }
49 }
50
51 Ok(chunks)
52 }
53}
54
/// Chunker that produces one chunk per paragraph of the input text.
///
/// The type carries no configuration, so it is a unit struct and all values
/// of it are interchangeable.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ParagraphChunker;
62
63impl TextChunker for ParagraphChunker {
64 fn chunk(&self, text: &str) -> Result<Vec<String>> {
65 let chunks: Vec<String> = text
66 .split("\n\n")
67 .map(|s| s.trim().to_string())
68 .filter(|s| !s.is_empty())
69 .collect();
70
71 if chunks.is_empty() && !text.trim().is_empty() {
72 Ok(vec![text.trim().to_string()])
73 } else {
74 Ok(chunks)
75 }
76 }
77}
78
/// Chunker configuration that groups a fixed number of sentences per chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SentenceChunker {
    /// Maximum number of sentences placed in one chunk.
    max_sentences: usize,
}

impl SentenceChunker {
    /// Creates a chunker that puts up to `max_sentences` sentences into each
    /// chunk.
    ///
    /// The value is stored as given without validation; zero is not a
    /// meaningful setting, so pass at least 1.
    pub fn new(max_sentences: usize) -> Self {
        Self { max_sentences }
    }
}

impl Default for SentenceChunker {
    /// Returns a chunker that groups five sentences per chunk.
    fn default() -> Self {
        Self::new(5)
    }
}
94
95impl TextChunker for SentenceChunker {
96 fn chunk(&self, text: &str) -> Result<Vec<String>> {
97 let sentences: Vec<String> = text
98 .split_inclusive(&['.', '!', '?', '\n'][..])
99 .map(|s| s.trim().to_string())
100 .filter(|s| !s.is_empty())
101 .collect();
102
103 let mut chunks = Vec::new();
104 for chunk in sentences.chunks(self.max_sentences) {
105 let chunk_text = chunk.join(" ");
106 chunks.push(chunk_text);
107 }
108
109 Ok(chunks)
110 }
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    /// Chunks `text` with `chunker`; an error from the chunker panics and so
    /// fails the calling test.
    fn split(chunker: &dyn TextChunker, text: &str) -> Vec<String> {
        chunker.chunk(text).unwrap()
    }

    // ----- FixedSizeChunker -----

    #[test]
    fn test_fixed_size_chunker_basic() {
        let pieces = split(
            &FixedSizeChunker::new(3, 0),
            "one two three four five six seven",
        );
        assert_eq!(pieces, vec!["one two three", "four five six", "seven"]);
    }

    #[test]
    fn test_fixed_size_chunker_with_overlap() {
        // Chunks start every chunk_size - overlap = 2 words, which also
        // produces the short tail chunk at the end.
        let pieces = split(&FixedSizeChunker::new(4, 2), "a b c d e f g h");
        assert_eq!(pieces, vec!["a b c d", "c d e f", "e f g h", "g h"]);
    }

    #[test]
    fn test_fixed_size_chunker_empty() {
        assert!(split(&FixedSizeChunker::new(5, 1), "").is_empty());
    }

    #[test]
    fn test_fixed_size_chunker_invalid_config() {
        // An overlap of 10 is not smaller than the chunk size of 5.
        let outcome = FixedSizeChunker::new(5, 10).chunk("test text here");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_fixed_size_chunker_single_word() {
        assert_eq!(split(&FixedSizeChunker::new(5, 0), "hello"), vec!["hello"]);
    }

    #[test]
    fn test_fixed_size_chunker_default() {
        let mut text = String::new();
        for i in 0..1000 {
            text.push_str(&format!("word{} ", i));
        }
        assert!(split(&FixedSizeChunker::default(), &text).len() > 1);
    }

    // ----- ParagraphChunker -----

    #[test]
    fn test_paragraph_chunker_basic() {
        let pieces = split(
            &ParagraphChunker,
            "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.",
        );
        assert_eq!(
            pieces,
            vec!["First paragraph.", "Second paragraph.", "Third paragraph."]
        );
    }

    #[test]
    fn test_paragraph_chunker_single_paragraph() {
        assert_eq!(
            split(&ParagraphChunker, "Only one paragraph."),
            vec!["Only one paragraph."]
        );
    }

    #[test]
    fn test_paragraph_chunker_empty() {
        assert!(split(&ParagraphChunker, "").is_empty());
    }

    #[test]
    fn test_paragraph_chunker_whitespace_only() {
        assert!(split(&ParagraphChunker, " \n\n ").is_empty());
    }

    #[test]
    fn test_paragraph_chunker_no_double_newline() {
        let line = "Just a single line with no paragraph breaks";
        assert_eq!(split(&ParagraphChunker, line), vec![line]);
    }

    // ----- SentenceChunker -----

    #[test]
    fn test_sentence_chunker_basic() {
        let pieces = split(
            &SentenceChunker::new(2),
            "First sentence. Second sentence. Third sentence. Fourth.",
        );
        assert_eq!(
            pieces,
            vec!["First sentence. Second sentence.", "Third sentence. Fourth."]
        );
    }

    #[test]
    fn test_sentence_chunker_single_sentence() {
        assert_eq!(
            split(&SentenceChunker::new(3), "Only one sentence."),
            vec!["Only one sentence."]
        );
    }

    #[test]
    fn test_sentence_chunker_exclamation() {
        let pieces = split(&SentenceChunker::new(2), "Hello! How are you? I am fine.");
        assert_eq!(pieces, vec!["Hello! How are you?", "I am fine."]);
    }

    #[test]
    fn test_sentence_chunker_empty() {
        assert!(split(&SentenceChunker::new(5), "").is_empty());
    }

    #[test]
    fn test_sentence_chunker_default() {
        // Ten sentences at the default of five per chunk.
        let pieces = split(&SentenceChunker::default(), "A. B. C. D. E. F. G. H. I. J.");
        assert_eq!(pieces.len(), 2);
    }

    #[test]
    fn test_sentence_chunker_newline_separator() {
        let pieces = split(
            &SentenceChunker::new(2),
            "Line one\nLine two\nLine three\nLine four",
        );
        assert_eq!(pieces, vec!["Line one Line two", "Line three Line four"]);
    }
}