graphrag_core/text/
chunking.rs1pub struct HierarchicalChunker {
7 separators: Vec<String>,
9 min_chunk_size: usize,
11}
12
13impl HierarchicalChunker {
14 pub fn new() -> Self {
16 Self {
17 separators: vec![
19 "\n\n".to_string(), "\n".to_string(), ". ".to_string(), "! ".to_string(), "? ".to_string(), "; ".to_string(), ": ".to_string(), " ".to_string(), "".to_string(), ],
29 min_chunk_size: 50,
30 }
31 }
32
33 pub fn with_separators(separators: Vec<String>) -> Self {
35 Self {
36 separators,
37 min_chunk_size: 50,
38 }
39 }
40
41 pub fn with_min_size(mut self, min_size: usize) -> Self {
43 self.min_chunk_size = min_size;
44 self
45 }
46
47 pub fn chunk_text(&self, text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
49 let mut chunks = Vec::new();
50 let mut start = 0;
51
52 while start < text.len() {
53 let mut end = (start + chunk_size).min(text.len());
54
55 while end > start && !text.is_char_boundary(end) {
57 end -= 1;
58 }
59
60 if end >= text.len() {
62 let chunk = &text[start..];
63 if chunk.trim().len() >= self.min_chunk_size {
64 chunks.push(chunk.to_string());
65 }
66 break;
67 }
68
69 let optimal_end = self.find_optimal_boundary(text, start, end);
71
72 if optimal_end > start {
74 end = optimal_end;
75 }
76
77 let chunk = &text[start..end];
78
79 if chunk.trim().len() >= self.min_chunk_size {
80 chunks.push(chunk.to_string());
81 }
82
83 if end >= text.len() {
84 break;
85 }
86
87 let mut next_start = end.saturating_sub(overlap);
89
90 while next_start > 0 && !text.is_char_boundary(next_start) {
92 next_start -= 1;
93 }
94
95 next_start = self.find_word_boundary_backward(text, next_start);
97
98 start = next_start;
99 }
100
101 chunks
102 }
103
104 fn find_optimal_boundary(&self, text: &str, start: usize, max_end: usize) -> usize {
106 let search_text = &text[start..max_end];
107
108 for separator in &self.separators {
110 if separator.is_empty() {
111 continue;
112 }
113
114 if let Some(sep_pos) = search_text.rfind(separator) {
116 let boundary = start + sep_pos + separator.len();
117
118 if boundary > start + (max_end - start) / 4 {
120 return boundary;
121 }
122 }
123 }
124
125 self.find_word_boundary_backward(text, max_end)
127 }
128
129 fn find_word_boundary_backward(&self, text: &str, mut pos: usize) -> usize {
131 while pos > 0 && !text.is_char_boundary(pos) {
133 pos -= 1;
134 }
135
136 while pos > 0 {
138 if let Some(ch) = text.chars().nth(pos.saturating_sub(1)) {
139 if ch.is_whitespace() {
140 return pos;
141 }
142 }
143 pos = pos.saturating_sub(1);
144
145 while pos > 0 && !text.is_char_boundary(pos) {
147 pos -= 1;
148 }
149 }
150
151 pos
152 }
153
154 pub fn find_sentence_boundary(
156 &self,
157 text: &str,
158 start: usize,
159 preferred_end: usize,
160 ) -> Option<usize> {
161 let safe_start = self.find_char_boundary(text, start);
162 let safe_end = self.find_char_boundary(text, preferred_end);
163
164 if safe_start >= safe_end {
165 return None;
166 }
167
168 let search_window = &text[safe_start..safe_end];
169
170 let search_start = search_window.len().saturating_sub(300); let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
173 let search_text = &search_window[safe_search_start..];
174
175 let sentence_endings = ['.', '!', '?'];
177 let mut last_boundary = None;
178
179 for (i, ch) in search_text.char_indices() {
180 if sentence_endings.contains(&ch) {
181 let next_pos = i + ch.len_utf8();
183 if next_pos >= search_text.len() {
184 last_boundary = Some(safe_start + safe_search_start + next_pos);
185 } else if let Some(next_char) = search_text.chars().nth(next_pos) {
186 if next_char.is_whitespace() && (next_char == '\n' || next_char == ' ') {
188 if !self.is_likely_abbreviation(search_text, i) {
190 last_boundary = Some(safe_start + safe_search_start + next_pos);
191 }
192 }
193 } else {
194 }
196 }
197 }
198
199 last_boundary
200 }
201
202 fn is_likely_abbreviation(&self, text: &str, period_pos: usize) -> bool {
204 if period_pos == 0 {
206 return false;
207 }
208
209 let before_period = &text[..period_pos];
211 if let Some(word_start) = before_period.rfind(' ') {
212 let potential_abbrev = &before_period[word_start + 1..];
213
214 let abbreviations = [
216 "Dr", "Mr", "Mrs", "Ms", "Prof", "Jr", "Sr", "Inc", "Corp", "Ltd", "Co", "etc",
217 "vs", "e.g", "i.e", "cf", "pp",
218 ];
219
220 return abbreviations
221 .iter()
222 .any(|&abbrev| potential_abbrev.eq_ignore_ascii_case(abbrev));
223 }
224
225 if period_pos == 1
227 && before_period
228 .chars()
229 .next()
230 .unwrap_or(' ')
231 .is_ascii_uppercase()
232 {
233 return true;
234 }
235
236 false
237 }
238
239 fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
241 pos = pos.min(text.len());
242 while pos > 0 && !text.is_char_boundary(pos) {
243 pos -= 1;
244 }
245 pos
246 }
247
248 fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
250 pos = pos.min(text.len());
251 while pos > 0 && !text.is_char_boundary(pos) {
252 pos -= 1;
253 }
254 pos
255 }
256}
257
258impl Default for HierarchicalChunker {
259 fn default() -> Self {
260 Self::new()
261 }
262}
263
264#[cfg(test)]
265mod tests {
266 use super::*;
267
268 #[test]
269 fn test_hierarchical_chunking() {
270 let chunker = HierarchicalChunker::new();
271 let text = "This is a test document.\n\nIt has multiple paragraphs. Each paragraph should be preserved as much as possible. This helps maintain semantic coherence in the chunks.";
272
273 let chunks = chunker.chunk_text(text, 100, 20);
274
275 assert!(!chunks.is_empty(), "Chunks should not be empty");
276
277 assert!(chunks.len() >= 1, "Should have at least one chunk");
284
285 assert!(
287 chunks[0].contains("multiple paragraphs")
288 || chunks[0].contains("preserved")
289 || chunks[0].contains("coherence"),
290 "Chunks should contain content from second paragraph. Got: {:?}",
291 chunks
292 );
293
294 for (i, chunk) in chunks.iter().enumerate() {
296 let trimmed = chunk.trim();
297 if !trimmed.is_empty() {
298 assert!(
300 trimmed.len() >= 50,
301 "Chunk {} should be >= min_chunk_size (50): length={}",
302 i,
303 trimmed.len()
304 );
305
306 let last_char = trimmed.chars().last().unwrap();
307 assert!(
308 last_char.is_whitespace()
309 || last_char.is_ascii_punctuation()
310 || trimmed == text.trim(),
311 "Chunk {} should end at word/sentence boundary",
312 i
313 );
314 }
315 }
316 }
317
318 #[test]
319 fn test_sentence_boundary_detection() {
320 let chunker = HierarchicalChunker::new();
321 let text = "Dr. Smith went to the store. He bought some milk. Then he went home.";
322
323 if let Some(boundary) = chunker.find_sentence_boundary(text, 0, 30) {
325 let chunk = &text[0..boundary];
326 assert!(!chunk.ends_with("Dr."));
327 }
328 }
329
330 #[test]
331 fn test_word_boundary_preservation() {
332 let chunker = HierarchicalChunker::new();
333 let text = "This is a very long sentence that should be split at word boundaries rather than in the middle of words.";
334
335 let chunks = chunker.chunk_text(text, 50, 10);
336
337 for chunk in &chunks {
339 let trimmed = chunk.trim();
340 if !trimmed.is_empty() {
341 let last_char = trimmed.chars().last().unwrap();
342 assert!(
344 last_char.is_whitespace()
345 || last_char.is_ascii_punctuation()
346 || chunk.trim() == text.trim()
347 );
348 }
349 }
350 }
351}