Skip to main content

traitclaw_rag/
chunker.rs

1//! Document chunking strategies for RAG pipelines.
2//!
3//! Chunkers split large documents into smaller segments suitable for embedding
4//! and retrieval. Each chunker implements the [`Chunker`] trait.
5//!
6//! # Chunkers
7//!
8//! - [`FixedSizeChunker`] — splits by character count with configurable overlap
9//! - [`SentenceChunker`] — splits on sentence boundaries (`.`, `!`, `?`)
10//! - [`RecursiveChunker`] — hierarchical: paragraph → sentence → character fallback
11
/// Common interface implemented by every chunking strategy in this module.
///
/// A chunker receives one document as a string slice and hands back the
/// pieces it was cut into as owned `String`s. Implementors must be
/// `Send + Sync + 'static`.
///
/// # Example
///
/// ```rust
/// use traitclaw_rag::chunker::{Chunker, FixedSizeChunker};
///
/// let chunker = FixedSizeChunker::new(100, 20);
/// let chunks = chunker.chunk("A long piece of text...");
/// assert!(!chunks.is_empty());
/// ```
pub trait Chunker: Send + Sync + 'static {
    /// Cut `text` into chunks.
    ///
    /// An empty `text` yields an empty `Vec`.
    fn chunk(&self, text: &str) -> Vec<String>;
}
29
30// ─────────────────────────────────────────────────────────────────────────────
31// FixedSizeChunker
32// ─────────────────────────────────────────────────────────────────────────────
33
/// Splits text into windows of a fixed number of characters, with a
/// configurable overlap between consecutive windows.
///
/// Sizes are measured in `char`s (Unicode scalar values), not bytes.
///
/// # Example
///
/// ```rust
/// use traitclaw_rag::chunker::{Chunker, FixedSizeChunker};
///
/// let chunker = FixedSizeChunker::new(200, 50);
/// let text = "a".repeat(500);
/// let chunks = chunker.chunk(&text);
/// assert!(chunks.len() >= 3);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FixedSizeChunker {
    /// Maximum number of characters in a single chunk.
    chunk_size: usize,
    /// Number of characters repeated at the start of the following chunk.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Create a new `FixedSizeChunker`.
    ///
    /// `chunk_size` is the maximum length of one chunk; `overlap` is how many
    /// trailing characters of a chunk are repeated at the start of the next.
    ///
    /// # Panics
    ///
    /// Panics if `overlap >= chunk_size`. Because `overlap` is unsigned, this
    /// also rejects `chunk_size == 0`.
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        // A window must advance by at least one character per step, otherwise
        // chunking could never reach the end of the text.
        assert!(
            overlap < chunk_size,
            "overlap ({overlap}) must be less than chunk_size ({chunk_size})"
        );
        Self {
            chunk_size,
            overlap,
        }
    }
}
69
70impl Chunker for FixedSizeChunker {
71    fn chunk(&self, text: &str) -> Vec<String> {
72        if text.is_empty() {
73            return Vec::new();
74        }
75
76        let chars: Vec<char> = text.chars().collect();
77        let total = chars.len();
78
79        if total <= self.chunk_size {
80            return vec![text.to_string()];
81        }
82
83        let step = self.chunk_size - self.overlap;
84        let mut chunks = Vec::new();
85        let mut start = 0;
86
87        while start < total {
88            let end = (start + self.chunk_size).min(total);
89            let chunk: String = chars[start..end].iter().collect();
90            chunks.push(chunk);
91            if end == total {
92                break;
93            }
94            start += step;
95        }
96
97        chunks
98    }
99}
100
101// ─────────────────────────────────────────────────────────────────────────────
102// SentenceChunker
103// ─────────────────────────────────────────────────────────────────────────────
104
/// Splits text on sentence boundaries and groups a fixed number of sentences
/// into each chunk.
///
/// Sentence delimiters: `.`, `!`, `?`. A run of consecutive delimiters
/// (for example `...` or `?!`) closes a single sentence instead of producing
/// punctuation-only fragments. Leading and trailing whitespace is trimmed
/// from every sentence.
///
/// # Example
///
/// ```rust
/// use traitclaw_rag::chunker::{Chunker, SentenceChunker};
///
/// let chunker = SentenceChunker::new(3);
/// let text = "Sentence one. Sentence two. Sentence three. Sentence four.";
/// let chunks = chunker.chunk(text);
/// assert_eq!(chunks.len(), 2);
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SentenceChunker {
    /// Number of sentences joined into one chunk (the last chunk may hold fewer).
    sentences_per_chunk: usize,
}

impl SentenceChunker {
    /// Create a new `SentenceChunker` with `sentences_per_chunk` sentences per chunk.
    ///
    /// # Panics
    ///
    /// Panics if `sentences_per_chunk == 0`.
    #[must_use]
    pub fn new(sentences_per_chunk: usize) -> Self {
        assert!(sentences_per_chunk > 0, "sentences_per_chunk must be > 0");
        Self {
            sentences_per_chunk,
        }
    }

    /// Returns `true` for the characters that end a sentence.
    fn is_terminator(ch: char) -> bool {
        matches!(ch, '.' | '!' | '?')
    }

    /// Split text into individual sentences using `.`, `!`, `?` as delimiters.
    ///
    /// Each returned sentence keeps its terminating punctuation and is
    /// trimmed of surrounding whitespace. Consecutive delimiters are kept
    /// together with the sentence they end, so `"Wait... Really?!"` yields
    /// `["Wait...", "Really?!"]`. Text after the last delimiter is returned
    /// as a final sentence; whitespace-only fragments are dropped.
    fn split_sentences(text: &str) -> Vec<String> {
        let mut sentences = Vec::new();
        let mut current = String::new();
        let mut chars = text.chars().peekable();

        while let Some(ch) = chars.next() {
            current.push(ch);
            if !Self::is_terminator(ch) {
                continue;
            }

            // Absorb the rest of a delimiter run ("...", "?!") so that it
            // stays attached to the sentence it terminates.
            while let Some(extra) = chars.next_if(|&c| Self::is_terminator(c)) {
                current.push(extra);
            }

            let sentence = current.trim();
            if !sentence.is_empty() {
                sentences.push(sentence.to_string());
            }
            current.clear();
        }

        // Any remainder that doesn't end with punctuation
        let remainder = current.trim();
        if !remainder.is_empty() {
            sentences.push(remainder.to_string());
        }

        sentences
    }
}
162
163impl Chunker for SentenceChunker {
164    fn chunk(&self, text: &str) -> Vec<String> {
165        if text.is_empty() {
166            return Vec::new();
167        }
168
169        let sentences = Self::split_sentences(text);
170        if sentences.is_empty() {
171            return Vec::new();
172        }
173
174        sentences
175            .chunks(self.sentences_per_chunk)
176            .map(|window| window.join(" "))
177            .collect()
178    }
179}
180
181// ─────────────────────────────────────────────────────────────────────────────
182// RecursiveChunker
183// ─────────────────────────────────────────────────────────────────────────────
184
185/// Hierarchical chunker: tries paragraphs → sentences → fixed-char fallback.
186///
187/// - First splits by `\n\n` (paragraphs)
188/// - If a paragraph is still larger than `max_chunk_size`, splits by sentences
189/// - If a sentence is still too large, uses fixed-size character splitting
190///
191/// # Example
192///
193/// ```rust
194/// use traitclaw_rag::chunker::{Chunker, RecursiveChunker};
195///
196/// let chunker = RecursiveChunker::new(200);
197/// let text = "Para 1.\n\nPara 2 sentence one. Para 2 sentence two.";
198/// let chunks = chunker.chunk(text);
199/// assert!(!chunks.is_empty());
200/// ```
201pub struct RecursiveChunker {
202    max_chunk_size: usize,
203}
204
205impl RecursiveChunker {
206    /// Create a new `RecursiveChunker` with the given max chunk size in characters.
207    ///
208    /// # Panics
209    ///
210    /// Panics if `max_chunk_size == 0`.
211    #[must_use]
212    pub fn new(max_chunk_size: usize) -> Self {
213        assert!(max_chunk_size > 0, "max_chunk_size must be > 0");
214        Self { max_chunk_size }
215    }
216
217    fn split_by_level(text: &str, max: usize) -> Vec<String> {
218        let mut result = Vec::new();
219
220        // Level 1: split by paragraph
221        for para in text.split("\n\n") {
222            let para = para.trim();
223            if para.is_empty() {
224                continue;
225            }
226
227            if para.chars().count() <= max {
228                result.push(para.to_string());
229            } else {
230                // Level 2: split by sentence
231                let mut sentence_buf = String::new();
232                for sentence in SentenceChunker::split_sentences(para) {
233                    if sentence_buf.chars().count() + sentence.chars().count() + 1 <= max {
234                        if !sentence_buf.is_empty() {
235                            sentence_buf.push(' ');
236                        }
237                        sentence_buf.push_str(&sentence);
238                    } else {
239                        if !sentence_buf.is_empty() {
240                            // Flush existing buffer
241                            if sentence_buf.chars().count() <= max {
242                                result.push(sentence_buf.clone());
243                            } else {
244                                // Level 3: fixed-char fallback
245                                let fixed = FixedSizeChunker::new(max, 0);
246                                result.extend(fixed.chunk(&sentence_buf));
247                            }
248                            sentence_buf.clear();
249                        }
250                        // Start new buffer with current sentence
251                        sentence_buf = sentence;
252                    }
253                }
254                if !sentence_buf.is_empty() {
255                    if sentence_buf.chars().count() <= max {
256                        result.push(sentence_buf);
257                    } else {
258                        let fixed = FixedSizeChunker::new(max, 0);
259                        result.extend(fixed.chunk(&sentence_buf));
260                    }
261                }
262            }
263        }
264
265        result
266    }
267}
268
269impl Chunker for RecursiveChunker {
270    fn chunk(&self, text: &str) -> Vec<String> {
271        if text.is_empty() {
272            return Vec::new();
273        }
274        Self::split_by_level(text, self.max_chunk_size)
275    }
276}
277
278// ─────────────────────────────────────────────────────────────────────────────
279// Tests
280// ─────────────────────────────────────────────────────────────────────────────
281
#[cfg(test)]
mod tests {
    //! Unit tests for the three chunkers. "AC #n" comments refer to the
    //! acceptance criterion each test covers.

    use super::*;

    // ── FixedSizeChunker ──────────────────────────────────────────────────────

    #[test]
    fn test_fixed_size_empty_input() {
        // AC #8: empty input → all chunkers return empty Vec
        let c = FixedSizeChunker::new(200, 50);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_fixed_size_short_text() {
        let c = FixedSizeChunker::new(200, 50);
        let chunks = c.chunk("short");
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "short");
    }

    #[test]
    fn test_fixed_size_produces_overlap() {
        // AC #6: 1000-char text → FixedSizeChunker(200, 50) produces ≥ 5 chunks with overlap
        //
        // Every character in the text is distinct (and multi-byte), so the
        // overlap comparison below can only succeed when the chunks really
        // share those 50 characters. A text made of one repeated character
        // would satisfy it for any overlap.
        let text: String = (0..1000_u32)
            .filter_map(|i| char::from_u32(0x4E00 + i))
            .collect();
        assert_eq!(text.chars().count(), 1000);

        let c = FixedSizeChunker::new(200, 50);
        let chunks = c.chunk(&text);
        assert!(
            chunks.len() >= 5,
            "expected >= 5 chunks, got {}",
            chunks.len()
        );

        // Verify overlap for every adjacent pair: the last 50 chars of a
        // chunk must equal the first 50 chars of the next one.
        for pair in chunks.windows(2) {
            let previous: Vec<char> = pair[0].chars().collect();
            let end_of_previous: String = previous[previous.len() - 50..].iter().collect();
            let start_of_next: String = pair[1].chars().take(50).collect();
            assert_eq!(end_of_previous, start_of_next, "overlap not maintained");
        }
    }

    #[test]
    fn test_fixed_size_each_chunk_not_exceeds_size() {
        let text = "x".repeat(500);
        let c = FixedSizeChunker::new(100, 25);
        for chunk in c.chunk(&text) {
            assert!(chunk.chars().count() <= 100);
        }
    }

    // ── SentenceChunker ───────────────────────────────────────────────────────

    #[test]
    fn test_sentence_chunker_empty() {
        // AC #8
        let c = SentenceChunker::new(3);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_sentence_chunker_10_sentences_gives_4_chunks() {
        // AC #7: 10-sentence text → SentenceChunker(3) produces 4 chunks (3+3+3+1)
        let sents: Vec<String> = (1..=10).map(|i| format!("Sentence {i}.")).collect();
        let text = sents.join(" ");
        let c = SentenceChunker::new(3);
        let chunks = c.chunk(&text);
        // 10 sentences / 3 per chunk = ceil(10/3) = 4 chunks
        assert_eq!(
            chunks.len(),
            4,
            "expected 4 chunks, got {}: {:?}",
            chunks.len(),
            chunks
        );
    }

    #[test]
    fn test_sentence_chunker_single() {
        let c = SentenceChunker::new(3);
        let chunks = c.chunk("One sentence.");
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn test_sentence_chunker_exclamation_question() {
        let c = SentenceChunker::new(2);
        let chunks = c.chunk("Hello! How are you? I'm fine.");
        assert_eq!(chunks.len(), 2);
    }

    // ── RecursiveChunker ─────────────────────────────────────────────────────

    #[test]
    fn test_recursive_chunker_empty() {
        // AC #8
        let c = RecursiveChunker::new(200);
        assert!(c.chunk("").is_empty());
    }

    #[test]
    fn test_recursive_chunker_paragraph_split() {
        let text = "Short paragraph one.\n\nShort paragraph two.";
        let c = RecursiveChunker::new(200);
        let chunks = c.chunk(text);
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn test_recursive_chunker_long_paragraph_splits_to_sentences() {
        // A paragraph larger than max_chunk_size → falls back to sentence splitting
        let long_sentence = "word ".repeat(20); // 100 chars
        let text = format!(
            "{}. {}. {}.",
            long_sentence.trim(),
            long_sentence.trim(),
            long_sentence.trim()
        );
        let c = RecursiveChunker::new(110); // max 110 chars, sentences are ~100 chars
        let chunks = c.chunk(&text);
        assert!(
            chunks.len() >= 2,
            "expected multiple chunks for long paragraph"
        );
    }

    #[test]
    fn test_recursive_chunker_each_chunk_within_limit() {
        let long_text = format!("word. {}", "sentence text here. ".repeat(50));
        let c = RecursiveChunker::new(100);
        for chunk in c.chunk(&long_text) {
            assert!(
                chunk.chars().count() <= 100,
                "chunk exceeds max_chunk_size: {} chars",
                chunk.chars().count()
            );
        }
    }
}