Skip to main content

rag/
chunker.rs

1use crate::errors::Result;
2
3pub trait TextChunker: Send + Sync {
4    fn chunk(&self, text: &str) -> Result<Vec<String>>;
5}
6
7pub struct FixedSizeChunker {
8    chunk_size: usize,
9    overlap: usize,
10}
11
12impl FixedSizeChunker {
13    pub fn new(chunk_size: usize, overlap: usize) -> Self {
14        Self { chunk_size, overlap }
15    }
16}
17
18impl Default for FixedSizeChunker {
19    fn default() -> Self {
20        Self::new(500, 50)
21    }
22}
23
24impl TextChunker for FixedSizeChunker {
25    fn chunk(&self, text: &str) -> Result<Vec<String>> {
26        if self.overlap >= self.chunk_size {
27            return Err(crate::errors::RagError::InvalidConfig(
28                "Overlap must be less than chunk size".to_string(),
29            ));
30        }
31
32        let words: Vec<&str> = text.split_whitespace().collect();
33        let mut chunks = Vec::new();
34
35        if words.is_empty() {
36            return Ok(chunks);
37        }
38
39        let mut start = 0;
40        while start < words.len() {
41            let end = (start + self.chunk_size).min(words.len());
42            let chunk = words[start..end].join(" ");
43            chunks.push(chunk);
44
45            start += self.chunk_size - self.overlap;
46            if start >= words.len() {
47                break;
48            }
49        }
50
51        Ok(chunks)
52    }
53}
54
55pub struct ParagraphChunker;
56
57impl Default for ParagraphChunker {
58    fn default() -> Self {
59        Self
60    }
61}
62
63impl TextChunker for ParagraphChunker {
64    fn chunk(&self, text: &str) -> Result<Vec<String>> {
65        let chunks: Vec<String> = text
66            .split("\n\n")
67            .map(|s| s.trim().to_string())
68            .filter(|s| !s.is_empty())
69            .collect();
70
71        if chunks.is_empty() && !text.trim().is_empty() {
72            Ok(vec![text.trim().to_string()])
73        } else {
74            Ok(chunks)
75        }
76    }
77}
78
79pub struct SentenceChunker {
80    max_sentences: usize,
81}
82
83impl SentenceChunker {
84    pub fn new(max_sentences: usize) -> Self {
85        Self { max_sentences }
86    }
87}
88
89impl Default for SentenceChunker {
90    fn default() -> Self {
91        Self::new(5)
92    }
93}
94
95impl TextChunker for SentenceChunker {
96    fn chunk(&self, text: &str) -> Result<Vec<String>> {
97        let sentences: Vec<String> = text
98            .split_inclusive(&['.', '!', '?', '\n'][..])
99            .map(|s| s.trim().to_string())
100            .filter(|s| !s.is_empty())
101            .collect();
102
103        let mut chunks = Vec::new();
104        for chunk in sentences.chunks(self.max_sentences) {
105            let chunk_text = chunk.join(" ");
106            chunks.push(chunk_text);
107        }
108
109        Ok(chunks)
110    }
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    // ----- FixedSizeChunker -------------------------------------------------

    /// Without overlap the words are partitioned into consecutive groups.
    #[test]
    fn test_fixed_size_chunker_basic() {
        let pieces = FixedSizeChunker::new(3, 0)
            .chunk("one two three four five six seven")
            .unwrap();
        assert_eq!(pieces, ["one two three", "four five six", "seven"]);
    }

    /// With overlap each window repeats the tail of the previous one, and the
    /// last window may be a short remainder.
    #[test]
    fn test_fixed_size_chunker_with_overlap() {
        let pieces = FixedSizeChunker::new(4, 2)
            .chunk("a b c d e f g h")
            .unwrap();
        assert_eq!(pieces, ["a b c d", "c d e f", "e f g h", "g h"]);
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_fixed_size_chunker_empty() {
        let pieces = FixedSizeChunker::new(5, 1).chunk("").unwrap();
        assert_eq!(pieces.len(), 0);
    }

    /// An overlap larger than the chunk size is rejected.
    #[test]
    fn test_fixed_size_chunker_invalid_config() {
        let outcome = FixedSizeChunker::new(5, 10).chunk("test text here");
        assert!(matches!(outcome, Err(_)));
    }

    /// Input shorter than one window comes back as a single chunk.
    #[test]
    fn test_fixed_size_chunker_single_word() {
        let pieces = FixedSizeChunker::new(5, 0).chunk("hello").unwrap();
        assert_eq!(pieces, ["hello"]);
    }

    /// The default configuration splits a 1000-word text into several chunks.
    #[test]
    fn test_fixed_size_chunker_default() {
        let mut text = String::new();
        for n in 0..1000 {
            text.push_str(&format!("word{} ", n));
        }
        let pieces = FixedSizeChunker::default().chunk(&text).unwrap();
        assert!(pieces.len() > 1);
    }

    // ----- ParagraphChunker -------------------------------------------------

    /// Blank lines separate paragraphs.
    #[test]
    fn test_paragraph_chunker_basic() {
        let input = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let paragraphs = ParagraphChunker.chunk(input).unwrap();
        assert_eq!(
            paragraphs,
            ["First paragraph.", "Second paragraph.", "Third paragraph."]
        );
    }

    /// A text with a single paragraph yields exactly that paragraph.
    #[test]
    fn test_paragraph_chunker_single_paragraph() {
        let paragraphs = ParagraphChunker.chunk("Only one paragraph.").unwrap();
        assert_eq!(paragraphs, ["Only one paragraph."]);
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_paragraph_chunker_empty() {
        let paragraphs = ParagraphChunker.chunk("").unwrap();
        assert_eq!(paragraphs.len(), 0);
    }

    /// Input made only of whitespace produces no chunks.
    #[test]
    fn test_paragraph_chunker_whitespace_only() {
        let paragraphs = ParagraphChunker.chunk("   \n\n   ").unwrap();
        assert_eq!(paragraphs.len(), 0);
    }

    /// Text without any paragraph break is returned as one chunk.
    #[test]
    fn test_paragraph_chunker_no_double_newline() {
        let input = "Just a single line with no paragraph breaks";
        let paragraphs = ParagraphChunker.chunk(input).unwrap();
        assert_eq!(paragraphs, [input]);
    }

    // ----- SentenceChunker --------------------------------------------------

    /// Sentences are grouped in pairs when `max_sentences` is 2.
    #[test]
    fn test_sentence_chunker_basic() {
        let groups = SentenceChunker::new(2)
            .chunk("First sentence. Second sentence. Third sentence. Fourth.")
            .unwrap();
        assert_eq!(
            groups,
            ["First sentence. Second sentence.", "Third sentence. Fourth."]
        );
    }

    /// Fewer sentences than the limit end up in a single chunk.
    #[test]
    fn test_sentence_chunker_single_sentence() {
        let groups = SentenceChunker::new(3).chunk("Only one sentence.").unwrap();
        assert_eq!(groups, ["Only one sentence."]);
    }

    /// `!` and `?` terminate sentences just like `.` does.
    #[test]
    fn test_sentence_chunker_exclamation() {
        let groups = SentenceChunker::new(2)
            .chunk("Hello! How are you? I am fine.")
            .unwrap();
        assert_eq!(groups, ["Hello! How are you?", "I am fine."]);
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_sentence_chunker_empty() {
        let groups = SentenceChunker::new(5).chunk("").unwrap();
        assert_eq!(groups.len(), 0);
    }

    /// The default limit of five sentences turns ten sentences into two chunks.
    #[test]
    fn test_sentence_chunker_default() {
        let groups = SentenceChunker::default()
            .chunk("A. B. C. D. E. F. G. H. I. J.")
            .unwrap();
        assert_eq!(groups.len(), 2);
    }

    /// A newline ends a sentence as well; it is trimmed away, so the lines
    /// inside one chunk are joined by a single space.
    #[test]
    fn test_sentence_chunker_newline_separator() {
        let groups = SentenceChunker::new(2)
            .chunk("Line one\nLine two\nLine three\nLine four")
            .unwrap();
        assert_eq!(groups, ["Line one Line two", "Line three Line four"]);
    }
}
267}