1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
// Advanced text chunking with hierarchical boundary preservation (2024 best practices)
// Shared utility module implementing state-of-the-art chunking strategies
/// Hierarchical text chunker in the spirit of LangChain's
/// `RecursiveCharacterTextSplitter`.
///
/// Chunk ends are placed on the most significant separator available
/// (paragraph break, line break, sentence end, clause, word) so that chunks
/// stay semantically coherent. All sizes and positions are byte offsets into
/// the UTF-8 text and are always snapped to character boundaries.
pub struct HierarchicalChunker {
    /// Separators tried in order of preference when choosing a chunk end.
    separators: Vec<String>,
    /// Chunks whose trimmed byte length is below this value are dropped.
    min_chunk_size: usize,
}

impl HierarchicalChunker {
    /// Minimum chunk size applied by the constructors.
    const DEFAULT_MIN_CHUNK_SIZE: usize = 50;

    /// Creates a chunker with the default separator hierarchy and a minimum
    /// chunk size of 50 bytes.
    pub fn new() -> Self {
        let separators = [
            "\n\n", // Paragraph breaks (highest priority)
            "\n",   // Line breaks
            ". ",   // Sentence endings with space
            "! ",   // Exclamation sentences
            "? ",   // Question sentences
            "; ",   // Semicolon clauses
            ": ",   // Colon clauses
            " ",    // Word boundaries
            "",     // Character level (fallback, skipped during the search)
        ]
        .iter()
        .map(|separator| separator.to_string())
        .collect();
        Self::with_separators(separators)
    }

    /// Creates a chunker that uses `separators` (most preferred first) and a
    /// minimum chunk size of 50 bytes.
    pub fn with_separators(separators: Vec<String>) -> Self {
        Self {
            separators,
            min_chunk_size: Self::DEFAULT_MIN_CHUNK_SIZE,
        }
    }

    /// Sets the minimum trimmed length (in bytes) a chunk needs to be kept.
    pub fn with_min_size(mut self, min_size: usize) -> Self {
        self.min_chunk_size = min_size;
        self
    }

    /// Splits `text` into chunks of at most roughly `chunk_size` bytes.
    ///
    /// Each chunk ends on the best separator found inside its window, and the
    /// next chunk starts up to `overlap` bytes before that end, moved back to
    /// the start of a word. Chunks whose trimmed length is below the
    /// configured minimum are dropped (their text is not merged elsewhere).
    ///
    /// The start position is strictly increasing, so the call terminates for
    /// every input, including text without whitespace and `overlap` values
    /// that are as large as the chunk itself. A `chunk_size` of zero yields
    /// no chunks.
    pub fn chunk_text(&self, text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
        let mut chunks = Vec::new();
        if chunk_size == 0 {
            return chunks;
        }

        let mut start = 0;
        while start < text.len() {
            let mut end = self.find_char_boundary(text, start.saturating_add(chunk_size));
            if end <= start {
                // The character at `start` is wider than `chunk_size`; take it
                // whole rather than producing an empty window.
                end = start + text[start..].chars().next().map_or(1, char::len_utf8);
            }

            // The window reaches the end of the text: emit the tail and stop.
            if end >= text.len() {
                self.push_if_large_enough(&mut chunks, &text[start..]);
                break;
            }

            // Prefer a semantic boundary inside the window over a hard cut.
            let optimal_end = self.find_optimal_boundary(text, start, end);
            if optimal_end > start {
                end = optimal_end;
            }
            self.push_if_large_enough(&mut chunks, &text[start..end]);

            // Next start: step back by `overlap`, then align to a word start.
            // Fall back to the unaligned overlap position, and finally to
            // `end`, whenever the preferred position would not advance.
            let overlap_start = self.find_char_boundary(text, end.saturating_sub(overlap));
            let aligned_start = self.find_word_boundary_backward(text, overlap_start);
            start = if aligned_start > start {
                aligned_start
            } else if overlap_start > start {
                overlap_start
            } else {
                end
            };
        }
        chunks
    }

    /// Appends `chunk` unchanged if its trimmed length meets the minimum size.
    fn push_if_large_enough(&self, chunks: &mut Vec<String>, chunk: &str) {
        if chunk.trim().len() >= self.min_chunk_size {
            chunks.push(chunk.to_string());
        }
    }

    /// Returns the best chunk end inside `text[start..max_end]`.
    ///
    /// Separators are tried in order of preference; for each, the position
    /// just after its last occurrence is accepted if it lies beyond the first
    /// quarter of the window. If none qualifies, the nearest word boundary at
    /// or before `max_end` is returned, which may be at or before `start`.
    fn find_optimal_boundary(&self, text: &str, start: usize, max_end: usize) -> usize {
        let window = &text[start..max_end];
        // Boundaries in the first quarter would leave a uselessly short chunk.
        let min_boundary = start + window.len() / 4;

        self.separators
            .iter()
            .filter(|separator| !separator.is_empty())
            .filter_map(|separator| {
                window
                    .rfind(separator.as_str())
                    .map(|sep_pos| start + sep_pos + separator.len())
            })
            .find(|&boundary| boundary > min_boundary)
            .unwrap_or_else(|| self.find_word_boundary_backward(text, max_end))
    }

    /// Returns the byte offset just after the last whitespace character that
    /// ends at or before `pos`, or 0 if there is none.
    fn find_word_boundary_backward(&self, text: &str, pos: usize) -> usize {
        let pos = self.find_char_boundary(text, pos);
        // Walk characters (not bytes) backward so multi-byte text is handled.
        text[..pos]
            .char_indices()
            .rev()
            .find(|&(_, ch)| ch.is_whitespace())
            .map_or(0, |(idx, ch)| idx + ch.len_utf8())
    }

    /// Finds the last sentence end inside `text[start..preferred_end]`.
    ///
    /// Only the final 300 bytes of that range are searched. A `.`, `!` or `?`
    /// counts as a sentence end when it is the last character of the range,
    /// or when it is followed by a space or newline and the word before it
    /// does not look like an abbreviation or initial. Returns the byte offset
    /// in `text` just after the terminator, or `None` if the range is empty
    /// or holds no sentence end.
    pub fn find_sentence_boundary(
        &self,
        text: &str,
        start: usize,
        preferred_end: usize,
    ) -> Option<usize> {
        let safe_start = self.find_char_boundary(text, start);
        let safe_end = self.find_char_boundary(text, preferred_end);
        if safe_start >= safe_end {
            return None;
        }

        let window = &text[safe_start..safe_end];
        let tail_start =
            self.find_char_boundary_in_slice(window, window.len().saturating_sub(300));
        let tail = &window[tail_start..];
        let tail_offset = safe_start + tail_start;

        // Scan from the back: the first hit is the last boundary in the range.
        tail.char_indices()
            .rev()
            .filter(|&(_, ch)| matches!(ch, '.' | '!' | '?'))
            .find(|&(idx, ch)| {
                // Look at the character that follows, by byte offset.
                match tail[idx + ch.len_utf8()..].chars().next() {
                    None => true,
                    Some(' ') | Some('\n') => !self.is_likely_abbreviation(tail, idx),
                    Some(_) => false,
                }
            })
            .map(|(idx, ch)| tail_offset + idx + ch.len_utf8())
    }

    /// Heuristically decides whether the terminator at byte `period_pos` of
    /// `text` belongs to an abbreviation rather than ending a sentence.
    ///
    /// The word directly before the terminator (back to the previous
    /// whitespace or the start of `text`, ignoring leading punctuation such
    /// as an opening bracket) is treated as an abbreviation when it is a
    /// known abbreviation, compared case-insensitively, or a single ASCII
    /// uppercase letter (an initial).
    fn is_likely_abbreviation(&self, text: &str, period_pos: usize) -> bool {
        const ABBREVIATIONS: [&str; 17] = [
            "Dr", "Mr", "Mrs", "Ms", "Prof", "Jr", "Sr", "Inc", "Corp", "Ltd", "Co", "etc",
            "vs", "e.g", "i.e", "cf", "pp",
        ];

        let before_period = match text.get(..period_pos) {
            Some(prefix) => prefix,
            None => return false,
        };
        let word = before_period
            .rsplit(char::is_whitespace)
            .next()
            .unwrap_or(before_period)
            .trim_start_matches(|c: char| !c.is_alphanumeric());
        if word.is_empty() {
            return false;
        }

        let mut letters = word.chars();
        let is_initial = matches!(
            (letters.next(), letters.next()),
            (Some(letter), None) if letter.is_ascii_uppercase()
        );

        is_initial
            || ABBREVIATIONS
                .iter()
                .any(|&abbrev| word.eq_ignore_ascii_case(abbrev))
    }

    /// Returns the largest character boundary of `text` that is `<= pos`,
    /// clamping `pos` to the length of `text` first.
    fn find_char_boundary(&self, text: &str, pos: usize) -> usize {
        let mut pos = pos.min(text.len());
        while pos > 0 && !text.is_char_boundary(pos) {
            pos -= 1;
        }
        pos
    }

    /// Same as `find_char_boundary`; kept as a separate name for call sites
    /// that operate on a sub-slice rather than the full text.
    fn find_char_boundary_in_slice(&self, text: &str, pos: usize) -> usize {
        self.find_char_boundary(text, pos)
    }
}
impl Default for HierarchicalChunker {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Chunks of multi-paragraph text keep the minimum size and end on
    /// sentence punctuation.
    #[test]
    fn test_hierarchical_chunking() {
        let chunker = HierarchicalChunker::new();
        let text = "This is a test document.\n\nIt has multiple paragraphs. Each paragraph should be preserved as much as possible. This helps maintain semantic coherence in the chunks.";
        let chunks = chunker.chunk_text(text, 100, 20);

        // Chunks below the default min_chunk_size (50) are dropped, so the
        // short first paragraph does not survive as a chunk of its own and
        // the first kept chunk comes from the second paragraph.
        assert!(!chunks.is_empty(), "Chunks should not be empty");
        assert!(
            chunks[0].contains("multiple paragraphs")
                || chunks[0].contains("preserved")
                || chunks[0].contains("coherence"),
            "Chunks should contain content from second paragraph. Got: {:?}",
            chunks
        );

        for (i, chunk) in chunks.iter().enumerate() {
            let trimmed = chunk.trim();
            // Every kept chunk has substantial content.
            assert!(
                trimmed.len() >= 50,
                "Chunk {} should be >= min_chunk_size (50): length={}",
                i,
                trimmed.len()
            );
            // Every sentence in this text ends with '.', so a chunk that
            // respects sentence boundaries ends with punctuation once trimmed.
            let ends_on_punctuation = trimmed
                .chars()
                .last()
                .map_or(false, |last_char| last_char.is_ascii_punctuation());
            assert!(
                ends_on_punctuation || trimmed == text.trim(),
                "Chunk {} should end at word/sentence boundary",
                i
            );
        }
    }

    /// The boundary lands after the first real sentence, not after "Dr.".
    #[test]
    fn test_sentence_boundary_detection() {
        let chunker = HierarchicalChunker::new();
        let text = "Dr. Smith went to the store. He bought some milk. Then he went home.";

        // Assert on the value itself so a missing boundary fails the test
        // instead of silently skipping the check.
        let boundary = chunker.find_sentence_boundary(text, 0, 30);
        assert_eq!(
            boundary,
            Some(28),
            "Boundary should follow the first full sentence"
        );
        let chunk = &text[..28];
        assert_eq!(chunk, "Dr. Smith went to the store.");
        assert!(!chunk.ends_with("Dr."));
    }

    /// Chunks of a single long sentence never end in the middle of a word.
    #[test]
    fn test_word_boundary_preservation() {
        // Lower the minimum size: with the default of 50 every chunk of this
        // text would be dropped and the assertions below would never run.
        let chunker = HierarchicalChunker::new().with_min_size(1);
        let text = "This is a very long sentence that should be split at word boundaries rather than in the middle of words.";
        let chunks = chunker.chunk_text(text, 50, 10);
        assert!(!chunks.is_empty(), "Expected at least one chunk");

        for (i, chunk) in chunks.iter().enumerate() {
            // Inspect the untrimmed chunk: a chunk cut at a word boundary
            // ends with the separating whitespace or with punctuation, and
            // the final chunk runs to the end of the text.
            let ends_cleanly = chunk
                .chars()
                .last()
                .map_or(false, |last_char| {
                    last_char.is_whitespace() || last_char.is_ascii_punctuation()
                });
            assert!(
                ends_cleanly || text.ends_with(chunk.as_str()),
                "Chunk {} ends with a partial word: {:?}",
                i,
                chunk
            );
        }
    }
}