ares/rag/
chunker.rs

//! Text chunking for document processing.
//!
//! This module provides several chunking strategies for splitting documents
//! into manageable pieces for embedding and retrieval:
//! - **Word-based**: Simple word count chunking with overlap
//! - **Semantic**: Sentence/paragraph aware chunking using text-splitter
//! - **Character-based**: Fixed-size character chunking with overlap
8
9use std::str::FromStr;
10
11use serde::{Deserialize, Serialize};
12use text_splitter::TextSplitter;
13
14use crate::types::{AppError, Result};
15
16// ============================================================================
17// Chunking Strategy Types
18// ============================================================================
19
20/// Available chunking strategies
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
22#[serde(rename_all = "kebab-case")]
23pub enum ChunkingStrategy {
24    /// Simple word-based chunking with overlap
25    #[default]
26    Word,
27    /// Semantic chunking using sentence/paragraph boundaries
28    Semantic,
29    /// Character-based chunking
30    Character,
31}
32
33impl FromStr for ChunkingStrategy {
34    type Err = AppError;
35
36    fn from_str(s: &str) -> Result<Self> {
37        match s.to_lowercase().as_str() {
38            "word" | "words" => Ok(Self::Word),
39            "semantic" | "sentence" | "paragraph" => Ok(Self::Semantic),
40            "character" | "char" | "chars" => Ok(Self::Character),
41            _ => Err(AppError::Internal(format!(
42                "Unknown chunking strategy: {}. Use: word, semantic, character",
43                s
44            ))),
45        }
46    }
47}
48
49impl std::fmt::Display for ChunkingStrategy {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        let name = match self {
52            Self::Word => "word",
53            Self::Semantic => "semantic",
54            Self::Character => "character",
55        };
56        write!(f, "{}", name)
57    }
58}
59
60// ============================================================================
61// Chunker Configuration
62// ============================================================================
63
64/// Configuration for the text chunker
65#[derive(Debug, Clone, Serialize, Deserialize)]
66pub struct ChunkerConfig {
67    /// Chunking strategy to use
68    #[serde(default)]
69    pub strategy: ChunkingStrategy,
70    /// Target chunk size (in words for word strategy, characters for others)
71    #[serde(default = "default_chunk_size")]
72    pub chunk_size: usize,
73    /// Overlap between chunks (for word strategy)
74    #[serde(default = "default_chunk_overlap")]
75    pub chunk_overlap: usize,
76    /// Minimum chunk size to keep
77    #[serde(default = "default_min_chunk_size")]
78    pub min_chunk_size: usize,
79}
80
/// Fallback for `ChunkerConfig::chunk_size` when the field is not supplied.
fn default_chunk_size() -> usize {
    const CHUNK_SIZE: usize = 512;
    CHUNK_SIZE
}
84
/// Fallback for `ChunkerConfig::chunk_overlap` when the field is not supplied.
fn default_chunk_overlap() -> usize {
    const CHUNK_OVERLAP: usize = 50;
    CHUNK_OVERLAP
}
88
/// Fallback for `ChunkerConfig::min_chunk_size` when the field is not supplied.
fn default_min_chunk_size() -> usize {
    const MIN_CHUNK_SIZE: usize = 20;
    MIN_CHUNK_SIZE
}
92
93impl Default for ChunkerConfig {
94    fn default() -> Self {
95        Self {
96            strategy: ChunkingStrategy::default(),
97            chunk_size: default_chunk_size(),
98            chunk_overlap: default_chunk_overlap(),
99            min_chunk_size: default_min_chunk_size(),
100        }
101    }
102}
103
104// ============================================================================
105// Chunk Result
106// ============================================================================
107
108/// A single chunk with metadata
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct Chunk {
111    /// Chunk index (0-based)
112    pub index: usize,
113    /// Chunk content
114    pub content: String,
115    /// Start position in original text (character offset)
116    pub start_offset: usize,
117    /// End position in original text (character offset)
118    pub end_offset: usize,
119}
120
121// ============================================================================
122// Text Chunker
123// ============================================================================
124
125/// Text chunker for splitting documents
126#[derive(Debug, Clone)]
127pub struct TextChunker {
128    config: ChunkerConfig,
129}
130
131impl TextChunker {
132    /// Create a new text chunker with the given configuration
133    pub fn new(config: ChunkerConfig) -> Self {
134        Self { config }
135    }
136
137    /// Create with word-based chunking (backward compatible)
138    pub fn with_word_chunking(chunk_size: usize, chunk_overlap: usize) -> Self {
139        Self::new(ChunkerConfig {
140            strategy: ChunkingStrategy::Word,
141            chunk_size,
142            chunk_overlap,
143            min_chunk_size: default_min_chunk_size(),
144        })
145    }
146
147    /// Create with semantic chunking
148    pub fn with_semantic_chunking(max_chunk_size: usize) -> Self {
149        Self::new(ChunkerConfig {
150            strategy: ChunkingStrategy::Semantic,
151            chunk_size: max_chunk_size,
152            chunk_overlap: 0, // Not used for semantic
153            min_chunk_size: default_min_chunk_size(),
154        })
155    }
156
157    /// Create with character-based chunking
158    pub fn with_character_chunking(chunk_size: usize, chunk_overlap: usize) -> Self {
159        Self::new(ChunkerConfig {
160            strategy: ChunkingStrategy::Character,
161            chunk_size,
162            chunk_overlap,
163            min_chunk_size: default_min_chunk_size(),
164        })
165    }
166
167    /// Chunk text and return simple string vector (backward compatible)
168    pub fn chunk(&self, text: &str) -> Vec<String> {
169        self.chunk_with_metadata(text)
170            .into_iter()
171            .map(|c| c.content)
172            .collect()
173    }
174
175    /// Chunk text with full metadata
176    pub fn chunk_with_metadata(&self, text: &str) -> Vec<Chunk> {
177        match self.config.strategy {
178            ChunkingStrategy::Word => self.chunk_by_words(text),
179            ChunkingStrategy::Semantic => self.chunk_semantically(text),
180            ChunkingStrategy::Character => self.chunk_by_characters(text),
181        }
182    }
183
184    /// Word-based chunking with overlap
185    fn chunk_by_words(&self, text: &str) -> Vec<Chunk> {
186        let words: Vec<&str> = text.split_whitespace().collect();
187        let mut chunks = Vec::new();
188        let step = self
189            .config
190            .chunk_size
191            .saturating_sub(self.config.chunk_overlap)
192            .max(1);
193
194        let mut chunk_index = 0;
195        let mut word_index = 0;
196
197        while word_index < words.len() {
198            let end = (word_index + self.config.chunk_size).min(words.len());
199            let chunk_words = &words[word_index..end];
200            let content = chunk_words.join(" ");
201
202            if content.len() >= self.config.min_chunk_size {
203                // Calculate approximate character offsets
204                let start_offset = if word_index == 0 {
205                    0
206                } else {
207                    words[..word_index]
208                        .iter()
209                        .map(|w| w.len() + 1)
210                        .sum::<usize>()
211                };
212                let end_offset = start_offset + content.len();
213
214                chunks.push(Chunk {
215                    index: chunk_index,
216                    content,
217                    start_offset,
218                    end_offset,
219                });
220                chunk_index += 1;
221            }
222
223            word_index += step;
224        }
225
226        chunks
227    }
228
229    /// Semantic chunking using text-splitter
230    fn chunk_semantically(&self, text: &str) -> Vec<Chunk> {
231        let splitter = TextSplitter::new(self.config.chunk_size);
232
233        let mut chunks = Vec::new();
234        let mut current_offset = 0;
235
236        for (index, chunk_text) in splitter.chunks(text).enumerate() {
237            // Find the actual position in the original text
238            let start_offset = text[current_offset..]
239                .find(chunk_text)
240                .map(|pos| current_offset + pos)
241                .unwrap_or(current_offset);
242            let end_offset = start_offset + chunk_text.len();
243
244            if chunk_text.len() >= self.config.min_chunk_size {
245                chunks.push(Chunk {
246                    index,
247                    content: chunk_text.to_string(),
248                    start_offset,
249                    end_offset,
250                });
251            }
252
253            current_offset = end_offset;
254        }
255
256        chunks
257    }
258
259    /// Character-based chunking with overlap
260    fn chunk_by_characters(&self, text: &str) -> Vec<Chunk> {
261        let chars: Vec<char> = text.chars().collect();
262        let mut chunks = Vec::new();
263        let step = self
264            .config
265            .chunk_size
266            .saturating_sub(self.config.chunk_overlap)
267            .max(1);
268
269        let mut char_index = 0;
270        let mut chunk_index = 0;
271
272        while char_index < chars.len() {
273            let end = (char_index + self.config.chunk_size).min(chars.len());
274            let content: String = chars[char_index..end].iter().collect();
275
276            if content.len() >= self.config.min_chunk_size {
277                chunks.push(Chunk {
278                    index: chunk_index,
279                    content,
280                    start_offset: char_index,
281                    end_offset: end,
282                });
283                chunk_index += 1;
284            }
285
286            char_index += step;
287        }
288
289        chunks
290    }
291
292    /// Get the current configuration
293    pub fn config(&self) -> &ChunkerConfig {
294        &self.config
295    }
296}
297
298impl Default for TextChunker {
299    fn default() -> Self {
300        Self::new(ChunkerConfig::default())
301    }
302}
303
304// ============================================================================
305// Tests
306// ============================================================================
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a chunker from explicit settings so a test states its whole
    /// configuration in one call.
    fn chunker_with(
        strategy: ChunkingStrategy,
        chunk_size: usize,
        chunk_overlap: usize,
        min_chunk_size: usize,
    ) -> TextChunker {
        TextChunker::new(ChunkerConfig {
            strategy,
            chunk_size,
            chunk_overlap,
            min_chunk_size,
        })
    }

    #[test]
    fn test_chunking_strategy_from_str() {
        let expectations = [
            ("word", ChunkingStrategy::Word),
            ("semantic", ChunkingStrategy::Semantic),
            ("character", ChunkingStrategy::Character),
        ];

        for (name, strategy) in expectations {
            assert_eq!(name.parse::<ChunkingStrategy>().unwrap(), strategy);
        }
    }

    #[test]
    fn test_word_chunking_basic() {
        let pieces = TextChunker::with_word_chunking(5, 2)
            .chunk("one two three four five six seven eight nine ten");

        assert!(!pieces.is_empty());
        let first_word_count = pieces[0].split_whitespace().count();
        assert!(first_word_count <= 5);
    }

    #[test]
    fn test_word_chunking_overlap() {
        // The minimum length is lowered to 5 so the short four-word chunks
        // are kept instead of being filtered out.
        let splitter = chunker_with(ChunkingStrategy::Word, 4, 2, 5);
        let pieces =
            splitter.chunk("alpha bravo charlie delta echo foxtrot golf hotel india juliet");

        // Ten words in windows of four must produce more than one chunk.
        assert!(
            pieces.len() > 1,
            "Expected multiple chunks, got: {:?}",
            pieces
        );
    }

    #[test]
    fn test_semantic_chunking() {
        let input = "This is the first sentence. This is the second sentence. \
                     And here is a third one that is a bit longer.";
        let pieces = TextChunker::with_semantic_chunking(100).chunk(input);

        // At least one chunk must come back for non-trivial input.
        assert!(!pieces.is_empty());
    }

    #[test]
    fn test_character_chunking() {
        let splitter = chunker_with(ChunkingStrategy::Character, 20, 5, 10);
        let pieces = splitter
            .chunk_with_metadata("This is a test string that should be chunked by characters.");

        assert!(!pieces.is_empty());
        assert!(pieces.iter().all(|piece| piece.content.len() <= 20));
    }

    #[test]
    fn test_chunk_metadata() {
        let pieces = TextChunker::with_semantic_chunking(50)
            .chunk_with_metadata("Hello world. This is a test.");

        assert!(!pieces.is_empty());
        let first = &pieces[0];
        assert_eq!(first.index, 0);
        assert!(first.start_offset < first.end_offset);
    }

    #[test]
    fn test_default_config() {
        let defaults = ChunkerConfig::default();

        assert_eq!(defaults.strategy, ChunkingStrategy::Word);
        assert_eq!(defaults.chunk_size, 512);
        assert_eq!(defaults.chunk_overlap, 50);
    }

    #[test]
    fn test_backward_compatible_api() {
        // The two-argument word constructor plus `chunk` is the legacy API.
        let pieces = TextChunker::with_word_chunking(100, 10)
            .chunk("Hello world. This is a test with multiple words.");

        assert!(!pieces.is_empty());
    }

    #[test]
    fn test_empty_text() {
        let pieces = TextChunker::default().chunk("");

        assert!(pieces.is_empty());
    }

    #[test]
    fn test_small_text() {
        let splitter = chunker_with(ChunkingStrategy::Word, 100, 10, 5);
        let pieces = splitter.chunk("Short text");

        assert_eq!(pieces.len(), 1);
        assert_eq!(pieces[0], "Short text");
    }
}
ares/rag/chunker.rs

ares/rag/
chunker.rs