graphrag_core/text/
chunking.rs

1// Advanced text chunking with hierarchical boundary preservation (2024 best practices)
2// Shared utility module implementing state-of-the-art chunking strategies
3
4/// Hierarchical text chunking following LangChain RecursiveCharacterTextSplitter approach
5/// with semantic boundary preservation for optimal RAG performance
6pub struct HierarchicalChunker {
7    /// Hierarchical separators in order of preference
8    separators: Vec<String>,
9    /// Minimum chunk size to maintain
10    min_chunk_size: usize,
11}
12
13impl HierarchicalChunker {
14    /// Create a new hierarchical chunker with default separators
15    pub fn new() -> Self {
16        Self {
17            // Following 2024 research best practices - hierarchical separators
18            separators: vec![
19                "\n\n".to_string(),     // Paragraph breaks (highest priority)
20                "\n".to_string(),       // Line breaks
21                ". ".to_string(),       // Sentence endings with space
22                "! ".to_string(),       // Exclamation sentences
23                "? ".to_string(),       // Question sentences
24                "; ".to_string(),       // Semicolon clauses
25                ": ".to_string(),       // Colon clauses
26                " ".to_string(),        // Word boundaries
27                "".to_string(),         // Character level (fallback)
28            ],
29            min_chunk_size: 50,
30        }
31    }
32
33    /// Create chunker with custom separators
34    pub fn with_separators(separators: Vec<String>) -> Self {
35        Self {
36            separators,
37            min_chunk_size: 50,
38        }
39    }
40
41    /// Set minimum chunk size
42    pub fn with_min_size(mut self, min_size: usize) -> Self {
43        self.min_chunk_size = min_size;
44        self
45    }
46
47    /// Split text into semantically coherent chunks
48    pub fn chunk_text(&self, text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
49        let mut chunks = Vec::new();
50        let mut start = 0;
51
52        while start < text.len() {
53            let mut end = (start + chunk_size).min(text.len());
54
55            // Ensure we're on a UTF-8 character boundary first
56            while end > start && !text.is_char_boundary(end) {
57                end -= 1;
58            }
59
60            // If we're at the exact end, no need to adjust
61            if end >= text.len() {
62                let chunk = &text[start..];
63                if chunk.trim().len() >= self.min_chunk_size {
64                    chunks.push(chunk.to_string());
65                }
66                break;
67            }
68
69            // Find the best boundary to avoid semantic truncation
70            let optimal_end = self.find_optimal_boundary(text, start, end);
71
72            // If we found a good boundary, use it
73            if optimal_end > start {
74                end = optimal_end;
75            }
76
77            let chunk = &text[start..end];
78
79            if chunk.trim().len() >= self.min_chunk_size {
80                chunks.push(chunk.to_string());
81            }
82
83            if end >= text.len() {
84                break;
85            }
86
87            // Calculate next start with overlap, preserving semantic boundaries
88            let mut next_start = end.saturating_sub(overlap);
89
90            // Ensure next start is on a UTF-8 boundary
91            while next_start > 0 && !text.is_char_boundary(next_start) {
92                next_start -= 1;
93            }
94
95            // Try to align next start with word boundary
96            next_start = self.find_word_boundary_backward(text, next_start);
97
98            start = next_start;
99        }
100
101        chunks
102    }
103
104    /// Find optimal boundary using hierarchical separators
105    fn find_optimal_boundary(&self, text: &str, start: usize, max_end: usize) -> usize {
106        let search_text = &text[start..max_end];
107
108        // Try each separator in order of preference
109        for separator in &self.separators {
110            if separator.is_empty() {
111                continue;
112            }
113
114            // Find the last occurrence of this separator within our range
115            if let Some(sep_pos) = search_text.rfind(separator) {
116                let boundary = start + sep_pos + separator.len();
117
118                // Make sure we're not too close to the start (maintain minimum chunk size)
119                if boundary > start + (max_end - start) / 4 {
120                    return boundary;
121                }
122            }
123        }
124
125        // If no good separator found, try to at least end at a word boundary
126        self.find_word_boundary_backward(text, max_end)
127    }
128
129    /// Find the nearest word boundary going backward from the given position
130    fn find_word_boundary_backward(&self, text: &str, mut pos: usize) -> usize {
131        // Ensure we're on a UTF-8 boundary
132        while pos > 0 && !text.is_char_boundary(pos) {
133            pos -= 1;
134        }
135
136        // Look for whitespace (word boundary) going backward
137        while pos > 0 {
138            if let Some(ch) = text.chars().nth(pos.saturating_sub(1)) {
139                if ch.is_whitespace() {
140                    return pos;
141                }
142            }
143            pos = pos.saturating_sub(1);
144
145            // Ensure we stay on UTF-8 boundaries
146            while pos > 0 && !text.is_char_boundary(pos) {
147                pos -= 1;
148            }
149        }
150
151        pos
152    }
153
154    /// Advanced sentence boundary detection
155    pub fn find_sentence_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
156        let safe_start = self.find_char_boundary(text, start);
157        let safe_end = self.find_char_boundary(text, preferred_end);
158
159        if safe_start >= safe_end {
160            return None;
161        }
162
163        let search_window = &text[safe_start..safe_end];
164
165        // Look for sentence boundaries in the last part of the chunk
166        let search_start = search_window.len().saturating_sub(300); // Larger window for better context
167        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
168        let search_text = &search_window[safe_search_start..];
169
170        // Enhanced sentence boundary detection
171        let sentence_endings = ['.', '!', '?'];
172        let mut last_boundary = None;
173
174        for (i, ch) in search_text.char_indices() {
175            if sentence_endings.contains(&ch) {
176                // Check if next character is whitespace or end of text
177                let next_pos = i + ch.len_utf8();
178                if next_pos >= search_text.len() {
179                    last_boundary = Some(safe_start + safe_search_start + next_pos);
180                } else if let Some(next_char) = search_text.chars().nth(next_pos) {
181                    // More sophisticated sentence boundary detection
182                    if next_char.is_whitespace() &&
183                       (next_char == '\n' || next_char == ' ') {
184                        // Make sure this isn't an abbreviation or decimal
185                        if !self.is_likely_abbreviation(search_text, i) {
186                            last_boundary = Some(safe_start + safe_search_start + next_pos);
187                        }
188                    }
189                } else {
190                    // Character at next_pos does not exist
191                }
192            }
193        }
194
195        last_boundary
196    }
197
198    /// Check if a period is likely part of an abbreviation
199    fn is_likely_abbreviation(&self, text: &str, period_pos: usize) -> bool {
200        // Simple heuristics for common abbreviations
201        if period_pos == 0 {
202            return false;
203        }
204
205        // Check for common abbreviation patterns
206        let before_period = &text[..period_pos];
207        if let Some(word_start) = before_period.rfind(' ') {
208            let potential_abbrev = &before_period[word_start + 1..];
209
210            // Common abbreviations
211            let abbreviations = [
212                "Dr", "Mr", "Mrs", "Ms", "Prof", "Jr", "Sr", "Inc", "Corp",
213                "Ltd", "Co", "etc", "vs", "e.g", "i.e", "cf", "pp"
214            ];
215
216            return abbreviations.iter().any(|&abbrev|
217                potential_abbrev.eq_ignore_ascii_case(abbrev)
218            );
219        }
220
221        // Single letter followed by period (likely initial)
222        if period_pos == 1 && before_period.chars().next().unwrap_or(' ').is_ascii_uppercase() {
223            return true;
224        }
225
226        false
227    }
228
229    /// Find a safe character boundary at or before the given position
230    fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
231        pos = pos.min(text.len());
232        while pos > 0 && !text.is_char_boundary(pos) {
233            pos -= 1;
234        }
235        pos
236    }
237
238    /// Find a safe character boundary within a slice at or before the given position
239    fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
240        pos = pos.min(text.len());
241        while pos > 0 && !text.is_char_boundary(pos) {
242            pos -= 1;
243        }
244        pos
245    }
246}
247
248impl Default for HierarchicalChunker {
249    fn default() -> Self {
250        Self::new()
251    }
252}
253
254#[cfg(test)]
255mod tests {
256    use super::*;
257
258    #[test]
259    fn test_hierarchical_chunking() {
260        let chunker = HierarchicalChunker::new();
261        let text = "This is a test document.\n\nIt has multiple paragraphs. Each paragraph should be preserved as much as possible. This helps maintain semantic coherence in the chunks.";
262
263        let chunks = chunker.chunk_text(text, 100, 20);
264
265        assert!(!chunks.is_empty(), "Chunks should not be empty");
266
267        // The chunker respects \n\n as highest priority separator
268        // With min_chunk_size=50, first paragraph (26 chars: "This is a test document.")
269        // is too short and will be filtered out
270        // The second paragraph is long enough (128 chars) and will be chunked
271
272        // Verify that we got meaningful chunks from the second paragraph
273        assert!(chunks.len() >= 1, "Should have at least one chunk");
274
275        // First chunk should start from second paragraph
276        assert!(chunks[0].contains("multiple paragraphs") ||
277                chunks[0].contains("preserved") ||
278                chunks[0].contains("coherence"),
279                "Chunks should contain content from second paragraph. Got: {:?}", chunks);
280
281        // Verify chunks respect semantic boundaries (don't split in middle of words)
282        for (i, chunk) in chunks.iter().enumerate() {
283            let trimmed = chunk.trim();
284            if !trimmed.is_empty() {
285                // Should have substantial content (above min_chunk_size)
286                assert!(trimmed.len() >= 50,
287                       "Chunk {} should be >= min_chunk_size (50): length={}", i, trimmed.len());
288
289                let last_char = trimmed.chars().last().unwrap();
290                assert!(last_char.is_whitespace() ||
291                       last_char.is_ascii_punctuation() ||
292                       trimmed == text.trim(),
293                       "Chunk {} should end at word/sentence boundary", i);
294            }
295        }
296    }
297
298    #[test]
299    fn test_sentence_boundary_detection() {
300        let chunker = HierarchicalChunker::new();
301        let text = "Dr. Smith went to the store. He bought some milk. Then he went home.";
302
303        // Should not break on "Dr." abbreviation
304        if let Some(boundary) = chunker.find_sentence_boundary(text, 0, 30) {
305            let chunk = &text[0..boundary];
306            assert!(!chunk.ends_with("Dr."));
307        }
308    }
309
310    #[test]
311    fn test_word_boundary_preservation() {
312        let chunker = HierarchicalChunker::new();
313        let text = "This is a very long sentence that should be split at word boundaries rather than in the middle of words.";
314
315        let chunks = chunker.chunk_text(text, 50, 10);
316
317        // No chunk should end with a partial word
318        for chunk in &chunks {
319            let trimmed = chunk.trim();
320            if !trimmed.is_empty() {
321                let last_char = trimmed.chars().last().unwrap();
322                // Should end with whitespace, punctuation, or be the complete text
323                assert!(last_char.is_whitespace() ||
324                       last_char.is_ascii_punctuation() ||
325                       chunk.trim() == text.trim());
326            }
327        }
328    }
329}
graphrag_core/text/chunking.rs

graphrag_core/text/
chunking.rs