graphrag_core/text/
chunking.rs1pub struct HierarchicalChunker {
7 separators: Vec<String>,
9 min_chunk_size: usize,
11}
12
13impl HierarchicalChunker {
14 pub fn new() -> Self {
16 Self {
17 separators: vec![
19 "\n\n".to_string(), "\n".to_string(), ". ".to_string(), "! ".to_string(), "? ".to_string(), "; ".to_string(), ": ".to_string(), " ".to_string(), "".to_string(), ],
29 min_chunk_size: 50,
30 }
31 }
32
33 pub fn with_separators(separators: Vec<String>) -> Self {
35 Self {
36 separators,
37 min_chunk_size: 50,
38 }
39 }
40
41 pub fn with_min_size(mut self, min_size: usize) -> Self {
43 self.min_chunk_size = min_size;
44 self
45 }
46
47 pub fn chunk_text(&self, text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
49 let mut chunks = Vec::new();
50 let mut start = 0;
51
52 while start < text.len() {
53 let mut end = (start + chunk_size).min(text.len());
54
55 while end > start && !text.is_char_boundary(end) {
57 end -= 1;
58 }
59
60 if end >= text.len() {
62 let chunk = &text[start..];
63 if chunk.trim().len() >= self.min_chunk_size {
64 chunks.push(chunk.to_string());
65 }
66 break;
67 }
68
69 let optimal_end = self.find_optimal_boundary(text, start, end);
71
72 if optimal_end > start {
74 end = optimal_end;
75 }
76
77 let chunk = &text[start..end];
78
79 if chunk.trim().len() >= self.min_chunk_size {
80 chunks.push(chunk.to_string());
81 }
82
83 if end >= text.len() {
84 break;
85 }
86
87 let mut next_start = end.saturating_sub(overlap);
89
90 while next_start > 0 && !text.is_char_boundary(next_start) {
92 next_start -= 1;
93 }
94
95 next_start = self.find_word_boundary_backward(text, next_start);
97
98 start = next_start;
99 }
100
101 chunks
102 }
103
104 fn find_optimal_boundary(&self, text: &str, start: usize, max_end: usize) -> usize {
106 let search_text = &text[start..max_end];
107
108 for separator in &self.separators {
110 if separator.is_empty() {
111 continue;
112 }
113
114 if let Some(sep_pos) = search_text.rfind(separator) {
116 let boundary = start + sep_pos + separator.len();
117
118 if boundary > start + (max_end - start) / 4 {
120 return boundary;
121 }
122 }
123 }
124
125 self.find_word_boundary_backward(text, max_end)
127 }
128
129 fn find_word_boundary_backward(&self, text: &str, mut pos: usize) -> usize {
131 while pos > 0 && !text.is_char_boundary(pos) {
133 pos -= 1;
134 }
135
136 while pos > 0 {
138 if let Some(ch) = text.chars().nth(pos.saturating_sub(1)) {
139 if ch.is_whitespace() {
140 return pos;
141 }
142 }
143 pos = pos.saturating_sub(1);
144
145 while pos > 0 && !text.is_char_boundary(pos) {
147 pos -= 1;
148 }
149 }
150
151 pos
152 }
153
154 pub fn find_sentence_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
156 let safe_start = self.find_char_boundary(text, start);
157 let safe_end = self.find_char_boundary(text, preferred_end);
158
159 if safe_start >= safe_end {
160 return None;
161 }
162
163 let search_window = &text[safe_start..safe_end];
164
165 let search_start = search_window.len().saturating_sub(300); let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
168 let search_text = &search_window[safe_search_start..];
169
170 let sentence_endings = ['.', '!', '?'];
172 let mut last_boundary = None;
173
174 for (i, ch) in search_text.char_indices() {
175 if sentence_endings.contains(&ch) {
176 let next_pos = i + ch.len_utf8();
178 if next_pos >= search_text.len() {
179 last_boundary = Some(safe_start + safe_search_start + next_pos);
180 } else if let Some(next_char) = search_text.chars().nth(next_pos) {
181 if next_char.is_whitespace() &&
183 (next_char == '\n' || next_char == ' ') {
184 if !self.is_likely_abbreviation(search_text, i) {
186 last_boundary = Some(safe_start + safe_search_start + next_pos);
187 }
188 }
189 } else {
190 }
192 }
193 }
194
195 last_boundary
196 }
197
198 fn is_likely_abbreviation(&self, text: &str, period_pos: usize) -> bool {
200 if period_pos == 0 {
202 return false;
203 }
204
205 let before_period = &text[..period_pos];
207 if let Some(word_start) = before_period.rfind(' ') {
208 let potential_abbrev = &before_period[word_start + 1..];
209
210 let abbreviations = [
212 "Dr", "Mr", "Mrs", "Ms", "Prof", "Jr", "Sr", "Inc", "Corp",
213 "Ltd", "Co", "etc", "vs", "e.g", "i.e", "cf", "pp"
214 ];
215
216 return abbreviations.iter().any(|&abbrev|
217 potential_abbrev.eq_ignore_ascii_case(abbrev)
218 );
219 }
220
221 if period_pos == 1 && before_period.chars().next().unwrap_or(' ').is_ascii_uppercase() {
223 return true;
224 }
225
226 false
227 }
228
229 fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
231 pos = pos.min(text.len());
232 while pos > 0 && !text.is_char_boundary(pos) {
233 pos -= 1;
234 }
235 pos
236 }
237
238 fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
240 pos = pos.min(text.len());
241 while pos > 0 && !text.is_char_boundary(pos) {
242 pos -= 1;
243 }
244 pos
245 }
246}
247
248impl Default for HierarchicalChunker {
249 fn default() -> Self {
250 Self::new()
251 }
252}
253
254#[cfg(test)]
255mod tests {
256 use super::*;
257
258 #[test]
259 fn test_hierarchical_chunking() {
260 let chunker = HierarchicalChunker::new();
261 let text = "This is a test document.\n\nIt has multiple paragraphs. Each paragraph should be preserved as much as possible. This helps maintain semantic coherence in the chunks.";
262
263 let chunks = chunker.chunk_text(text, 100, 20);
264
265 assert!(!chunks.is_empty(), "Chunks should not be empty");
266
267 assert!(chunks.len() >= 1, "Should have at least one chunk");
274
275 assert!(chunks[0].contains("multiple paragraphs") ||
277 chunks[0].contains("preserved") ||
278 chunks[0].contains("coherence"),
279 "Chunks should contain content from second paragraph. Got: {:?}", chunks);
280
281 for (i, chunk) in chunks.iter().enumerate() {
283 let trimmed = chunk.trim();
284 if !trimmed.is_empty() {
285 assert!(trimmed.len() >= 50,
287 "Chunk {} should be >= min_chunk_size (50): length={}", i, trimmed.len());
288
289 let last_char = trimmed.chars().last().unwrap();
290 assert!(last_char.is_whitespace() ||
291 last_char.is_ascii_punctuation() ||
292 trimmed == text.trim(),
293 "Chunk {} should end at word/sentence boundary", i);
294 }
295 }
296 }
297
298 #[test]
299 fn test_sentence_boundary_detection() {
300 let chunker = HierarchicalChunker::new();
301 let text = "Dr. Smith went to the store. He bought some milk. Then he went home.";
302
303 if let Some(boundary) = chunker.find_sentence_boundary(text, 0, 30) {
305 let chunk = &text[0..boundary];
306 assert!(!chunk.ends_with("Dr."));
307 }
308 }
309
310 #[test]
311 fn test_word_boundary_preservation() {
312 let chunker = HierarchicalChunker::new();
313 let text = "This is a very long sentence that should be split at word boundaries rather than in the middle of words.";
314
315 let chunks = chunker.chunk_text(text, 50, 10);
316
317 for chunk in &chunks {
319 let trimmed = chunk.trim();
320 if !trimmed.is_empty() {
321 let last_char = trimmed.chars().last().unwrap();
322 assert!(last_char.is_whitespace() ||
324 last_char.is_ascii_punctuation() ||
325 chunk.trim() == text.trim());
326 }
327 }
328 }
329}