1use crate::chunking::traits::{ChunkMetadata, Chunker};
7use crate::chunking::{DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, MAX_CHUNK_SIZE};
8use crate::core::Chunk;
9use crate::error::{ChunkingError, Result};
10use crate::io::find_char_boundary;
11use unicode_segmentation::UnicodeSegmentation;
12
/// Chunker that prefers semantic boundaries (paragraphs, lines, sentences,
/// words) over hard byte cuts when splitting text.
#[derive(Debug, Clone)]
pub struct SemanticChunker {
    /// Target chunk size in bytes; actual chunks may end earlier so they
    /// finish on a semantic boundary.
    chunk_size: usize,
    /// Bytes of overlap carried from the end of one chunk into the start of
    /// the next (0 = disjoint chunks).
    overlap: usize,
    /// A final chunk smaller than this (in bytes) is merged into its
    /// predecessor.
    min_chunk_size: usize,
}
37
impl Default for SemanticChunker {
    /// Equivalent to [`SemanticChunker::new`]: default chunk size and overlap.
    fn default() -> Self {
        Self::new()
    }
}
43
44impl SemanticChunker {
45 #[must_use]
47 pub const fn new() -> Self {
48 Self {
49 chunk_size: DEFAULT_CHUNK_SIZE,
50 overlap: DEFAULT_OVERLAP,
51 min_chunk_size: 100,
52 }
53 }
54
55 #[must_use]
57 pub const fn with_size(chunk_size: usize) -> Self {
58 Self {
59 chunk_size,
60 overlap: 0,
61 min_chunk_size: 100,
62 }
63 }
64
65 #[must_use]
67 pub const fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
68 Self {
69 chunk_size,
70 overlap,
71 min_chunk_size: 100,
72 }
73 }
74
75 #[must_use]
77 pub const fn min_chunk_size(mut self, size: usize) -> Self {
78 self.min_chunk_size = size;
79 self
80 }
81
82 fn find_best_boundary(&self, text: &str, target_pos: usize) -> usize {
86 if target_pos >= text.len() {
87 return text.len();
88 }
89
90 let search_start = find_char_boundary(text, target_pos.saturating_sub(self.chunk_size / 5));
93 let search_end = find_char_boundary(text, target_pos.min(text.len()));
94
95 if search_start >= search_end {
96 return find_char_boundary(text, target_pos);
97 }
98
99 let search_region = &text[search_start..search_end];
100
101 if let Some(pos) = search_region.rfind("\n\n") {
103 let boundary = search_start + pos + 2;
104 if boundary > search_start {
105 return boundary;
106 }
107 }
108
109 if let Some(pos) = search_region.rfind('\n') {
111 let boundary = search_start + pos + 1;
112 if boundary > search_start {
113 return boundary;
114 }
115 }
116
117 for (i, c) in search_region.char_indices().rev() {
119 if matches!(c, '.' | '!' | '?') {
120 let next_pos = search_start + i + c.len_utf8();
121 if next_pos >= text.len()
122 || text[next_pos..].starts_with(' ')
123 || text[next_pos..].starts_with('\n')
124 {
125 return next_pos;
126 }
127 }
128 }
129
130 if let Some(pos) = search_region.rfind(' ') {
132 let boundary = search_start + pos + 1;
133 if boundary > search_start {
134 return boundary;
135 }
136 }
137
138 find_char_boundary(text, target_pos)
140 }
141
142 #[allow(dead_code)]
144 fn sentence_boundaries(text: &str) -> Vec<usize> {
145 let mut boundaries = vec![0];
146 let mut pos = 0;
147
148 for sentence in text.split_sentence_bounds() {
149 pos += sentence.len();
150 boundaries.push(pos);
151 }
152
153 boundaries
154 }
155}
156
157impl Chunker for SemanticChunker {
158 #[allow(clippy::too_many_lines)]
159 fn chunk(
160 &self,
161 buffer_id: i64,
162 text: &str,
163 metadata: Option<&ChunkMetadata>,
164 ) -> Result<Vec<Chunk>> {
165 let (chunk_size, overlap) = metadata.map_or((self.chunk_size, self.overlap), |meta| {
167 (meta.chunk_size, meta.overlap)
168 });
169
170 if chunk_size == 0 {
172 return Err(ChunkingError::InvalidConfig {
173 reason: "chunk_size must be > 0".to_string(),
174 }
175 .into());
176 }
177 if chunk_size > MAX_CHUNK_SIZE {
178 return Err(ChunkingError::ChunkTooLarge {
179 size: chunk_size,
180 max: MAX_CHUNK_SIZE,
181 }
182 .into());
183 }
184 if overlap >= chunk_size {
185 return Err(ChunkingError::OverlapTooLarge {
186 overlap,
187 size: chunk_size,
188 }
189 .into());
190 }
191
192 if text.is_empty() {
194 return Ok(vec![]);
195 }
196
197 if text.len() <= chunk_size {
199 return Ok(vec![Chunk::with_strategy(
200 buffer_id,
201 text.to_string(),
202 0..text.len(),
203 0,
204 self.name(),
205 )]);
206 }
207
208 let mut chunks = Vec::new();
209 let mut start = 0;
210 let mut index = 0;
211
212 while start < text.len() {
213 let target_end = (start + chunk_size).min(text.len());
214 let end = if target_end >= text.len() {
215 text.len()
216 } else {
217 self.find_best_boundary(text, target_end)
218 };
219
220 let end = if end <= start {
222 find_char_boundary(text, (start + chunk_size).min(text.len()))
223 } else {
224 end
225 };
226
227 let content = text[start..end].to_string();
228 let mut chunk =
229 Chunk::with_strategy(buffer_id, content, start..end, index, self.name());
230
231 if index > 0 && overlap > 0 {
232 chunk.set_has_overlap(true);
233 }
234
235 chunk.set_token_count(chunk.estimate_tokens());
237
238 chunks.push(chunk);
239
240 if let Some(meta) = metadata
242 && meta.max_chunks > 0
243 && chunks.len() >= meta.max_chunks
244 {
245 break;
246 }
247
248 if end >= text.len() {
250 break;
251 }
252
253 let next_start = if overlap > 0 {
255 let overlap_start = end.saturating_sub(overlap);
257 self.find_best_boundary(text, overlap_start)
258 } else {
259 end
260 };
261
262 start = if next_start <= start { end } else { next_start };
264
265 index += 1;
266 }
267
268 if chunks.len() > 1
270 && let Some(last) = chunks.last()
271 && last.size() < self.min_chunk_size
272 && let Some(second_last) = chunks.get(chunks.len() - 2)
273 {
274 let merged_content = format!(
276 "{}{}",
277 second_last.content,
278 &text[second_last.byte_range.end..last.byte_range.end]
279 );
280 let merged_range = second_last.byte_range.start..last.byte_range.end;
281
282 chunks.pop(); chunks.pop(); let mut merged = Chunk::with_strategy(
286 buffer_id,
287 merged_content,
288 merged_range,
289 chunks.len(),
290 self.name(),
291 );
292 merged.set_token_count(merged.estimate_tokens());
293 chunks.push(merged);
294 }
295
296 Ok(chunks)
297 }
298
299 fn name(&self) -> &'static str {
300 "semantic"
301 }
302
303 fn supports_parallel(&self) -> bool {
304 true
305 }
306
307 fn description(&self) -> &'static str {
308 "Semantic chunking respecting sentence and paragraph boundaries"
309 }
310}
311
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_semantic_chunker_default() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
    }

    #[test]
    fn test_semantic_chunker_empty_text() {
        let chunker = SemanticChunker::new();
        let chunks = chunker.chunk(1, "", None).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_small_text() {
        // Text shorter than chunk_size comes back as a single chunk.
        let chunker = SemanticChunker::new();
        let text = "Hello, world!";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_semantic_chunker_sentence_boundary() {
        let chunker = SemanticChunker::with_size(30);
        let text = "First sentence. Second sentence. Third sentence.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        // Every non-final chunk should end at a sentence terminator.
        for chunk in &chunks {
            let content = chunk.content.trim();
            if !content.is_empty() && chunk.end() < text.len() {
                assert!(
                    content.ends_with('.') || content.ends_with('!') || content.ends_with('?'),
                    "Chunk '{content}' should end at sentence boundary"
                );
            }
        }
    }

    #[test]
    fn test_semantic_chunker_paragraph_boundary() {
        let chunker = SemanticChunker::with_size(50);
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_unicode() {
        let chunker = SemanticChunker::with_size(20);
        let text = "Hello 世界! This is a test. Another sentence.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Byte ranges must land on char boundaries and round-trip the text.
        for chunk in &chunks {
            assert!(chunk.content.is_char_boundary(0));
            assert_eq!(&text[chunk.byte_range.clone()], chunk.content);
        }
    }

    #[test]
    fn test_semantic_chunker_token_estimation() {
        let chunker = SemanticChunker::with_size(50);
        let text = "Hello, world! This is a test sentence for token estimation.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        for chunk in &chunks {
            assert!(chunk.metadata.token_count.is_some());
        }
    }

    #[test]
    fn test_semantic_chunker_strategy_name() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.name(), "semantic");

        let chunks = chunker.chunk(1, "Hello!", None).unwrap();
        assert_eq!(chunks[0].metadata.strategy, Some("semantic".to_string()));
    }

    #[test]
    fn test_semantic_chunker_invalid_config() {
        // chunk_size == 0 is rejected.
        let chunker = SemanticChunker::with_size(0);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_overlap_too_large() {
        // overlap >= chunk_size is rejected.
        let chunker = SemanticChunker::with_size_and_overlap(10, 15);
        let result = chunker.chunk(1, "test content here", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_with_metadata() {
        let chunker = SemanticChunker::new();
        let text = "Hello, world! ".repeat(100);
        let meta = ChunkMetadata::with_size_and_overlap(100, 10)
            .preserve_sentences(true)
            .max_chunks(5);
        let chunks = chunker.chunk(1, &text, Some(&meta)).unwrap();

        assert!(chunks.len() <= 5);
    }

    #[test]
    fn test_semantic_chunker_supports_parallel() {
        let chunker = SemanticChunker::new();
        assert!(chunker.supports_parallel());
    }

    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        // '世' starts at byte 6 and occupies bytes 6..9, so offsets inside it
        // snap back to 6.
        assert_eq!(find_char_boundary(s, 6), 6);
        assert_eq!(find_char_boundary(s, 7), 6);
        assert_eq!(find_char_boundary(s, 8), 6);
        assert_eq!(find_char_boundary(s, 9), 9);
    }

    #[test]
    fn test_semantic_chunker_default_impl() {
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
        assert_eq!(chunker.min_chunk_size, 100);
    }

    #[test]
    fn test_semantic_chunker_min_chunk_size() {
        let chunker = SemanticChunker::new().min_chunk_size(200);
        assert_eq!(chunker.min_chunk_size, 200);
    }

    #[test]
    fn test_semantic_chunker_description() {
        let chunker = SemanticChunker::new();
        let desc = chunker.description();
        assert!(desc.contains("Semantic"));
        assert!(!desc.is_empty());
    }

    #[test]
    fn test_find_char_boundary_at_end() {
        let s = "hello";
        // Offsets past the end clamp to the text length.
        assert_eq!(find_char_boundary(s, 10), 5);
        assert_eq!(find_char_boundary(s, 5), 5);
    }

    #[test]
    fn test_semantic_chunker_large_text() {
        let chunker = SemanticChunker::with_size(100);
        let text = "This is a sentence. ".repeat(50);
        let chunks = chunker.chunk(1, &text, None).unwrap();
        assert!(!chunks.is_empty());

        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_semantic_chunker_word_boundary() {
        let chunker = SemanticChunker::with_size(15);
        let text = "hello world test content here";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_with_overlap() {
        let chunker = SemanticChunker::with_size_and_overlap(50, 10);
        let text = "Word ".repeat(30);
        let chunks = chunker.chunk(1, &text, None).unwrap();

        assert!(chunks.len() > 1);
    }

    #[test]
    fn test_find_best_boundary_target_beyond_text() {
        let chunker = SemanticChunker::with_size(100);
        let text = "Short text";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_find_best_boundary_search_region_empty() {
        // No spaces, newlines, or terminators: forces the fallback hard cut.
        let chunker = SemanticChunker::with_size(5).min_chunk_size(1);
        let text = "ABCDEFGHIJKLMNOP";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_find_best_boundary_single_newline() {
        let chunker = SemanticChunker::with_size(20);
        let text = "First line here\nSecond line here\nThird line";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_semantic_chunker_chunk_too_large() {
        let chunker = SemanticChunker::with_size(MAX_CHUNK_SIZE + 1);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_force_progress() {
        // Boundary-free text must still advance chunk by chunk.
        let chunker = SemanticChunker::with_size(5).min_chunk_size(1);
        let text = "AAAAAAAAAA";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        // With no overlap, the chunks exactly partition the input.
        let total_content: String = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(total_content.len(), text.len());
    }

    #[test]
    fn test_semantic_chunker_merge_tiny_final_chunk() {
        let chunker = SemanticChunker::with_size(50).min_chunk_size(20);
        let text = "This is a longer sentence that will be chunked. X";
        let chunks = chunker.chunk(1, text, None).unwrap();

        if chunks.len() > 1 {
            let last = chunks.last().unwrap();
            assert!(last.size() >= 20 || chunks.len() == 1);
        }
    }

    #[test]
    fn test_semantic_chunker_sentence_boundary_detection() {
        let chunker = SemanticChunker::with_size(25);
        let text = "Question? Exclamation! Statement.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_multibyte_utf8_boundaries() {
        let chunker = SemanticChunker::with_size(50).min_chunk_size(10);

        let text = "This is \u{201C}quoted text\u{201D} with smart quotes. \
             And more \u{201C}content\u{201D} here. \
             Plus some emoji \u{1F389} and Japanese \u{65E5}\u{672C}\u{8A9E} for good measure.";

        let result = chunker.chunk(1, text, None);
        assert!(result.is_ok(), "Should not panic on multi-byte UTF-8 chars");

        let chunks = result.unwrap();
        assert!(!chunks.is_empty());

        // Ranges must slice cleanly (i.e. land on char boundaries).
        for chunk in &chunks {
            assert_eq!(&text[chunk.byte_range.clone()], chunk.content);
        }
    }

    #[test]
    fn test_semantic_chunker_large_multibyte_document() {
        use std::fmt::Write;

        let chunker = SemanticChunker::with_size(100).min_chunk_size(20);

        let mut text = String::new();
        for i in 0..50 {
            let _ = write!(
                text,
                "Section {i}: \u{201C}This is quoted content\u{201D} with data. "
            );
        }

        let result = chunker.chunk(1, &text, None);
        assert!(
            result.is_ok(),
            "Should handle large docs with multi-byte chars"
        );

        let chunks = result.unwrap();
        for chunk in &chunks {
            assert!(text.is_char_boundary(chunk.byte_range.start));
            assert!(text.is_char_boundary(chunk.byte_range.end));
        }
    }
}