Skip to main content

graphrag_core/text/
chunking.rs

1// Advanced text chunking with hierarchical boundary preservation (2024 best practices)
2// Shared utility module implementing state-of-the-art chunking strategies
3
4/// Hierarchical text chunking following LangChain RecursiveCharacterTextSplitter approach
5/// with semantic boundary preservation for optimal RAG performance
6pub struct HierarchicalChunker {
7    /// Hierarchical separators in order of preference
8    separators: Vec<String>,
9    /// Minimum chunk size to maintain
10    min_chunk_size: usize,
11}
12
13impl HierarchicalChunker {
14    /// Create a new hierarchical chunker with default separators
15    pub fn new() -> Self {
16        Self {
17            // Following 2024 research best practices - hierarchical separators
18            separators: vec![
19                "\n\n".to_string(), // Paragraph breaks (highest priority)
20                "\n".to_string(),   // Line breaks
21                ". ".to_string(),   // Sentence endings with space
22                "! ".to_string(),   // Exclamation sentences
23                "? ".to_string(),   // Question sentences
24                "; ".to_string(),   // Semicolon clauses
25                ": ".to_string(),   // Colon clauses
26                " ".to_string(),    // Word boundaries
27                "".to_string(),     // Character level (fallback)
28            ],
29            min_chunk_size: 50,
30        }
31    }
32
33    /// Create chunker with custom separators
34    pub fn with_separators(separators: Vec<String>) -> Self {
35        Self {
36            separators,
37            min_chunk_size: 50,
38        }
39    }
40
41    /// Set minimum chunk size
42    pub fn with_min_size(mut self, min_size: usize) -> Self {
43        self.min_chunk_size = min_size;
44        self
45    }
46
47    /// Split text into semantically coherent chunks
48    pub fn chunk_text(&self, text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
49        let mut chunks = Vec::new();
50        let mut start = 0;
51
52        while start < text.len() {
53            let mut end = (start + chunk_size).min(text.len());
54
55            // Ensure we're on a UTF-8 character boundary first
56            while end > start && !text.is_char_boundary(end) {
57                end -= 1;
58            }
59
60            // If we're at the exact end, no need to adjust
61            if end >= text.len() {
62                let chunk = &text[start..];
63                if chunk.trim().len() >= self.min_chunk_size {
64                    chunks.push(chunk.to_string());
65                }
66                break;
67            }
68
69            // Find the best boundary to avoid semantic truncation
70            let optimal_end = self.find_optimal_boundary(text, start, end);
71
72            // If we found a good boundary, use it
73            if optimal_end > start {
74                end = optimal_end;
75            }
76
77            let chunk = &text[start..end];
78
79            if chunk.trim().len() >= self.min_chunk_size {
80                chunks.push(chunk.to_string());
81            }
82
83            if end >= text.len() {
84                break;
85            }
86
87            // Calculate next start with overlap, preserving semantic boundaries
88            let mut next_start = end.saturating_sub(overlap);
89
90            // Ensure next start is on a UTF-8 boundary
91            while next_start > 0 && !text.is_char_boundary(next_start) {
92                next_start -= 1;
93            }
94
95            // Try to align next start with word boundary
96            next_start = self.find_word_boundary_backward(text, next_start);
97
98            start = next_start;
99        }
100
101        chunks
102    }
103
104    /// Find optimal boundary using hierarchical separators
105    fn find_optimal_boundary(&self, text: &str, start: usize, max_end: usize) -> usize {
106        let search_text = &text[start..max_end];
107
108        // Try each separator in order of preference
109        for separator in &self.separators {
110            if separator.is_empty() {
111                continue;
112            }
113
114            // Find the last occurrence of this separator within our range
115            if let Some(sep_pos) = search_text.rfind(separator) {
116                let boundary = start + sep_pos + separator.len();
117
118                // Make sure we're not too close to the start (maintain minimum chunk size)
119                if boundary > start + (max_end - start) / 4 {
120                    return boundary;
121                }
122            }
123        }
124
125        // If no good separator found, try to at least end at a word boundary
126        self.find_word_boundary_backward(text, max_end)
127    }
128
129    /// Find the nearest word boundary going backward from the given position
130    fn find_word_boundary_backward(&self, text: &str, mut pos: usize) -> usize {
131        // Ensure we're on a UTF-8 boundary
132        while pos > 0 && !text.is_char_boundary(pos) {
133            pos -= 1;
134        }
135
136        // Look for whitespace (word boundary) going backward
137        while pos > 0 {
138            if let Some(ch) = text.chars().nth(pos.saturating_sub(1)) {
139                if ch.is_whitespace() {
140                    return pos;
141                }
142            }
143            pos = pos.saturating_sub(1);
144
145            // Ensure we stay on UTF-8 boundaries
146            while pos > 0 && !text.is_char_boundary(pos) {
147                pos -= 1;
148            }
149        }
150
151        pos
152    }
153
154    /// Advanced sentence boundary detection
155    pub fn find_sentence_boundary(
156        &self,
157        text: &str,
158        start: usize,
159        preferred_end: usize,
160    ) -> Option<usize> {
161        let safe_start = self.find_char_boundary(text, start);
162        let safe_end = self.find_char_boundary(text, preferred_end);
163
164        if safe_start >= safe_end {
165            return None;
166        }
167
168        let search_window = &text[safe_start..safe_end];
169
170        // Look for sentence boundaries in the last part of the chunk
171        let search_start = search_window.len().saturating_sub(300); // Larger window for better context
172        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
173        let search_text = &search_window[safe_search_start..];
174
175        // Enhanced sentence boundary detection
176        let sentence_endings = ['.', '!', '?'];
177        let mut last_boundary = None;
178
179        for (i, ch) in search_text.char_indices() {
180            if sentence_endings.contains(&ch) {
181                // Check if next character is whitespace or end of text
182                let next_pos = i + ch.len_utf8();
183                if next_pos >= search_text.len() {
184                    last_boundary = Some(safe_start + safe_search_start + next_pos);
185                } else if let Some(next_char) = search_text.chars().nth(next_pos) {
186                    // More sophisticated sentence boundary detection
187                    if next_char.is_whitespace() && (next_char == '\n' || next_char == ' ') {
188                        // Make sure this isn't an abbreviation or decimal
189                        if !self.is_likely_abbreviation(search_text, i) {
190                            last_boundary = Some(safe_start + safe_search_start + next_pos);
191                        }
192                    }
193                } else {
194                    // Character at next_pos does not exist
195                }
196            }
197        }
198
199        last_boundary
200    }
201
202    /// Check if a period is likely part of an abbreviation
203    fn is_likely_abbreviation(&self, text: &str, period_pos: usize) -> bool {
204        // Simple heuristics for common abbreviations
205        if period_pos == 0 {
206            return false;
207        }
208
209        // Check for common abbreviation patterns
210        let before_period = &text[..period_pos];
211        if let Some(word_start) = before_period.rfind(' ') {
212            let potential_abbrev = &before_period[word_start + 1..];
213
214            // Common abbreviations
215            let abbreviations = [
216                "Dr", "Mr", "Mrs", "Ms", "Prof", "Jr", "Sr", "Inc", "Corp", "Ltd", "Co", "etc",
217                "vs", "e.g", "i.e", "cf", "pp",
218            ];
219
220            return abbreviations
221                .iter()
222                .any(|&abbrev| potential_abbrev.eq_ignore_ascii_case(abbrev));
223        }
224
225        // Single letter followed by period (likely initial)
226        if period_pos == 1
227            && before_period
228                .chars()
229                .next()
230                .unwrap_or(' ')
231                .is_ascii_uppercase()
232        {
233            return true;
234        }
235
236        false
237    }
238
239    /// Find a safe character boundary at or before the given position
240    fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
241        pos = pos.min(text.len());
242        while pos > 0 && !text.is_char_boundary(pos) {
243            pos -= 1;
244        }
245        pos
246    }
247
248    /// Find a safe character boundary within a slice at or before the given position
249    fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
250        pos = pos.min(text.len());
251        while pos > 0 && !text.is_char_boundary(pos) {
252            pos -= 1;
253        }
254        pos
255    }
256}
257
258impl Default for HierarchicalChunker {
259    fn default() -> Self {
260        Self::new()
261    }
262}
263
264#[cfg(test)]
265mod tests {
266    use super::*;
267
268    #[test]
269    fn test_hierarchical_chunking() {
270        let chunker = HierarchicalChunker::new();
271        let text = "This is a test document.\n\nIt has multiple paragraphs. Each paragraph should be preserved as much as possible. This helps maintain semantic coherence in the chunks.";
272
273        let chunks = chunker.chunk_text(text, 100, 20);
274
275        assert!(!chunks.is_empty(), "Chunks should not be empty");
276
277        // The chunker respects \n\n as highest priority separator
278        // With min_chunk_size=50, first paragraph (26 chars: "This is a test document.")
279        // is too short and will be filtered out
280        // The second paragraph is long enough (128 chars) and will be chunked
281
282        // Verify that we got meaningful chunks from the second paragraph
283        assert!(chunks.len() >= 1, "Should have at least one chunk");
284
285        // First chunk should start from second paragraph
286        assert!(
287            chunks[0].contains("multiple paragraphs")
288                || chunks[0].contains("preserved")
289                || chunks[0].contains("coherence"),
290            "Chunks should contain content from second paragraph. Got: {:?}",
291            chunks
292        );
293
294        // Verify chunks respect semantic boundaries (don't split in middle of words)
295        for (i, chunk) in chunks.iter().enumerate() {
296            let trimmed = chunk.trim();
297            if !trimmed.is_empty() {
298                // Should have substantial content (above min_chunk_size)
299                assert!(
300                    trimmed.len() >= 50,
301                    "Chunk {} should be >= min_chunk_size (50): length={}",
302                    i,
303                    trimmed.len()
304                );
305
306                let last_char = trimmed.chars().last().unwrap();
307                assert!(
308                    last_char.is_whitespace()
309                        || last_char.is_ascii_punctuation()
310                        || trimmed == text.trim(),
311                    "Chunk {} should end at word/sentence boundary",
312                    i
313                );
314            }
315        }
316    }
317
318    #[test]
319    fn test_sentence_boundary_detection() {
320        let chunker = HierarchicalChunker::new();
321        let text = "Dr. Smith went to the store. He bought some milk. Then he went home.";
322
323        // Should not break on "Dr." abbreviation
324        if let Some(boundary) = chunker.find_sentence_boundary(text, 0, 30) {
325            let chunk = &text[0..boundary];
326            assert!(!chunk.ends_with("Dr."));
327        }
328    }
329
330    #[test]
331    fn test_word_boundary_preservation() {
332        let chunker = HierarchicalChunker::new();
333        let text = "This is a very long sentence that should be split at word boundaries rather than in the middle of words.";
334
335        let chunks = chunker.chunk_text(text, 50, 10);
336
337        // No chunk should end with a partial word
338        for chunk in &chunks {
339            let trimmed = chunk.trim();
340            if !trimmed.is_empty() {
341                let last_char = trimmed.chars().last().unwrap();
342                // Should end with whitespace, punctuation, or be the complete text
343                assert!(
344                    last_char.is_whitespace()
345                        || last_char.is_ascii_punctuation()
346                        || chunk.trim() == text.trim()
347                );
348            }
349        }
350    }
351}