// rlm_rs/chunking/semantic.rs
//! Semantic chunking strategy.
//!
//! Provides Unicode-aware chunking that respects sentence and paragraph
//! boundaries using the `unicode-segmentation` crate.

use crate::chunking::traits::{ChunkMetadata, Chunker};
use crate::chunking::{DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, MAX_CHUNK_SIZE};
use crate::core::Chunk;
use crate::error::{ChunkingError, Result};
use crate::io::find_char_boundary;
use unicode_segmentation::UnicodeSegmentation;

/// Semantic chunker that respects sentence and paragraph boundaries.
///
/// This chunker produces more coherent chunks by avoiding splits in the
/// middle of sentences or words. It uses Unicode segmentation rules
/// for proper international text handling, and every chunk boundary it
/// emits is a valid UTF-8 character boundary.
///
/// # Examples
///
/// ```
/// use rlm_rs::chunking::{Chunker, SemanticChunker};
///
/// let chunker = SemanticChunker::new();
/// let text = "Hello, world! This is a test. Another sentence here.";
/// let chunks = chunker.chunk(1, text, None).unwrap();
/// ```
#[derive(Debug, Clone)]
pub struct SemanticChunker {
    /// Target chunk size. Compared against byte lengths (`text.len()`),
    /// not character counts, though boundaries stay on char boundaries.
    chunk_size: usize,
    /// Overlap between consecutive chunks, in bytes; must be strictly
    /// smaller than `chunk_size`.
    overlap: usize,
    /// Minimum chunk size; a trailing chunk smaller than this is merged
    /// into its predecessor to avoid tiny final fragments.
    min_chunk_size: usize,
}

38impl Default for SemanticChunker {
39    fn default() -> Self {
40        Self::new()
41    }
42}
43
impl SemanticChunker {
    /// Creates a new semantic chunker with default settings
    /// ([`DEFAULT_CHUNK_SIZE`], [`DEFAULT_OVERLAP`], minimum chunk size 100).
    #[must_use]
    pub const fn new() -> Self {
        Self {
            chunk_size: DEFAULT_CHUNK_SIZE,
            overlap: DEFAULT_OVERLAP,
            min_chunk_size: 100,
        }
    }

    /// Creates a semantic chunker with custom chunk size and no overlap.
    #[must_use]
    pub const fn with_size(chunk_size: usize) -> Self {
        Self {
            chunk_size,
            overlap: 0,
            min_chunk_size: 100,
        }
    }

    /// Creates a semantic chunker with custom size and overlap.
    #[must_use]
    pub const fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
        Self {
            chunk_size,
            overlap,
            min_chunk_size: 100,
        }
    }

    /// Sets the minimum chunk size (builder-style; consumes and returns `self`).
    #[must_use]
    pub const fn min_chunk_size(mut self, size: usize) -> Self {
        self.min_chunk_size = size;
        self
    }

    /// Finds the best boundary near the target position.
    ///
    /// Prefers paragraph breaks > line breaks > sentence breaks > word
    /// breaks > plain character boundaries. The returned offset is always
    /// a valid UTF-8 character boundary and never exceeds `target_pos`
    /// (clamped to `text.len()`).
    fn find_best_boundary(&self, text: &str, target_pos: usize) -> usize {
        if target_pos >= text.len() {
            return text.len();
        }

        // Search window: look back up to 20% of chunk size for a good boundary.
        // Both ends are snapped down to valid UTF-8 character boundaries.
        let search_start = find_char_boundary(text, target_pos.saturating_sub(self.chunk_size / 5));
        let search_end = find_char_boundary(text, target_pos.min(text.len()));

        // Degenerate window (e.g. very small chunk sizes): fall back to a
        // plain character boundary at the target.
        if search_start >= search_end {
            return find_char_boundary(text, target_pos);
        }

        let search_region = &text[search_start..search_end];

        // Priority 1: Paragraph break (double newline); cut after the break.
        if let Some(pos) = search_region.rfind("\n\n") {
            let boundary = search_start + pos + 2;
            if boundary > search_start {
                return boundary;
            }
        }

        // Priority 2: Single newline; cut after it.
        if let Some(pos) = search_region.rfind('\n') {
            let boundary = search_start + pos + 1;
            if boundary > search_start {
                return boundary;
            }
        }

        // Priority 3: Sentence boundary — '.', '!' or '?' followed by a
        // space, a newline, or end of text (the follow-up check avoids
        // splitting inside e.g. "3.14"). Scans backwards from the window end.
        for (i, c) in search_region.char_indices().rev() {
            if matches!(c, '.' | '!' | '?') {
                let next_pos = search_start + i + c.len_utf8();
                if next_pos >= text.len()
                    || text[next_pos..].starts_with(' ')
                    || text[next_pos..].starts_with('\n')
                {
                    return next_pos;
                }
            }
        }

        // Priority 4: Word boundary (space); cut after the space.
        if let Some(pos) = search_region.rfind(' ') {
            let boundary = search_start + pos + 1;
            if boundary > search_start {
                return boundary;
            }
        }

        // Fallback: nearest character boundary at or before the target.
        find_char_boundary(text, target_pos)
    }

    /// Finds sentence boundaries in the text.
    ///
    /// Returns cumulative byte offsets, starting with 0 and appending one
    /// offset per Unicode sentence (per `split_sentence_bounds`).
    /// Currently unused; kept for future strategies.
    #[allow(dead_code)]
    fn sentence_boundaries(text: &str) -> Vec<usize> {
        let mut boundaries = vec![0];
        let mut pos = 0;

        for sentence in text.split_sentence_bounds() {
            pos += sentence.len();
            boundaries.push(pos);
        }

        boundaries
    }
}

157impl Chunker for SemanticChunker {
158    #[allow(clippy::too_many_lines)]
159    fn chunk(
160        &self,
161        buffer_id: i64,
162        text: &str,
163        metadata: Option<&ChunkMetadata>,
164    ) -> Result<Vec<Chunk>> {
165        // Get effective chunk size and overlap
166        let (chunk_size, overlap) = metadata.map_or((self.chunk_size, self.overlap), |meta| {
167            (meta.chunk_size, meta.overlap)
168        });
169
170        // Validate configuration
171        if chunk_size == 0 {
172            return Err(ChunkingError::InvalidConfig {
173                reason: "chunk_size must be > 0".to_string(),
174            }
175            .into());
176        }
177        if chunk_size > MAX_CHUNK_SIZE {
178            return Err(ChunkingError::ChunkTooLarge {
179                size: chunk_size,
180                max: MAX_CHUNK_SIZE,
181            }
182            .into());
183        }
184        if overlap >= chunk_size {
185            return Err(ChunkingError::OverlapTooLarge {
186                overlap,
187                size: chunk_size,
188            }
189            .into());
190        }
191
192        // Handle empty text
193        if text.is_empty() {
194            return Ok(vec![]);
195        }
196
197        // Handle text smaller than chunk size
198        if text.len() <= chunk_size {
199            return Ok(vec![Chunk::with_strategy(
200                buffer_id,
201                text.to_string(),
202                0..text.len(),
203                0,
204                self.name(),
205            )]);
206        }
207
208        let mut chunks = Vec::new();
209        let mut start = 0;
210        let mut index = 0;
211
212        while start < text.len() {
213            let target_end = (start + chunk_size).min(text.len());
214            let end = if target_end >= text.len() {
215                text.len()
216            } else {
217                self.find_best_boundary(text, target_end)
218            };
219
220            // Ensure we make progress
221            let end = if end <= start {
222                find_char_boundary(text, (start + chunk_size).min(text.len()))
223            } else {
224                end
225            };
226
227            let content = text[start..end].to_string();
228            let mut chunk =
229                Chunk::with_strategy(buffer_id, content, start..end, index, self.name());
230
231            if index > 0 && overlap > 0 {
232                chunk.set_has_overlap(true);
233            }
234
235            // Estimate token count
236            chunk.set_token_count(chunk.estimate_tokens());
237
238            chunks.push(chunk);
239
240            // Check max chunks limit
241            if let Some(meta) = metadata
242                && meta.max_chunks > 0
243                && chunks.len() >= meta.max_chunks
244            {
245                break;
246            }
247
248            // Move to next chunk
249            if end >= text.len() {
250                break;
251            }
252
253            // Calculate next start position
254            let next_start = if overlap > 0 {
255                // For overlap, we need to find a good boundary before the overlap point
256                let overlap_start = end.saturating_sub(overlap);
257                self.find_best_boundary(text, overlap_start)
258            } else {
259                end
260            };
261
262            // Ensure we don't go backwards
263            start = if next_start <= start { end } else { next_start };
264
265            index += 1;
266        }
267
268        // Merge tiny final chunk if it's too small
269        if chunks.len() > 1
270            && let Some(last) = chunks.last()
271            && last.size() < self.min_chunk_size
272            && let Some(second_last) = chunks.get(chunks.len() - 2)
273        {
274            // Merge into previous chunk
275            let merged_content = format!(
276                "{}{}",
277                second_last.content,
278                &text[second_last.byte_range.end..last.byte_range.end]
279            );
280            let merged_range = second_last.byte_range.start..last.byte_range.end;
281
282            chunks.pop(); // Remove last
283            chunks.pop(); // Remove second last
284
285            let mut merged = Chunk::with_strategy(
286                buffer_id,
287                merged_content,
288                merged_range,
289                chunks.len(),
290                self.name(),
291            );
292            merged.set_token_count(merged.estimate_tokens());
293            chunks.push(merged);
294        }
295
296        Ok(chunks)
297    }
298
299    fn name(&self) -> &'static str {
300        "semantic"
301    }
302
303    fn supports_parallel(&self) -> bool {
304        true
305    }
306
307    fn description(&self) -> &'static str {
308        "Semantic chunking respecting sentence and paragraph boundaries"
309    }
310}
311
#[cfg(test)]
mod tests {
    // Unit tests covering configuration, boundary selection, UTF-8
    // safety, error cases, and the tiny-final-chunk merge behavior.
    use super::*;

    #[test]
    fn test_semantic_chunker_default() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
    }

    #[test]
    fn test_semantic_chunker_empty_text() {
        let chunker = SemanticChunker::new();
        let chunks = chunker.chunk(1, "", None).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_small_text() {
        let chunker = SemanticChunker::new();
        let text = "Hello, world!";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_semantic_chunker_sentence_boundary() {
        let chunker = SemanticChunker::with_size(30);
        let text = "First sentence. Second sentence. Third sentence.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should prefer breaking at sentence boundaries
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            // Chunks should generally end at sentence boundaries
            let content = chunk.content.trim();
            if !content.is_empty() && chunk.end() < text.len() {
                // Non-final chunks should try to end at sentence boundary
                assert!(
                    content.ends_with('.') || content.ends_with('!') || content.ends_with('?'),
                    "Chunk '{content}' should end at sentence boundary"
                );
            }
        }
    }

    #[test]
    fn test_semantic_chunker_paragraph_boundary() {
        let chunker = SemanticChunker::with_size(50);
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should prefer breaking at paragraph boundaries
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_unicode() {
        let chunker = SemanticChunker::with_size(20);
        let text = "Hello 世界! This is a test. Another sentence.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // All chunks should be valid UTF-8
        for chunk in &chunks {
            assert!(chunk.content.is_char_boundary(0));
            // Verify content matches original
            assert_eq!(&text[chunk.byte_range.clone()], chunk.content);
        }
    }

    #[test]
    fn test_semantic_chunker_token_estimation() {
        let chunker = SemanticChunker::with_size(50);
        let text = "Hello, world! This is a test sentence for token estimation.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        for chunk in &chunks {
            assert!(chunk.metadata.token_count.is_some());
        }
    }

    #[test]
    fn test_semantic_chunker_strategy_name() {
        let chunker = SemanticChunker::new();
        assert_eq!(chunker.name(), "semantic");

        let chunks = chunker.chunk(1, "Hello!", None).unwrap();
        assert_eq!(chunks[0].metadata.strategy, Some("semantic".to_string()));
    }

    #[test]
    fn test_semantic_chunker_invalid_config() {
        // chunk_size == 0 must be rejected
        let chunker = SemanticChunker::with_size(0);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_overlap_too_large() {
        // overlap >= chunk_size must be rejected
        let chunker = SemanticChunker::with_size_and_overlap(10, 15);
        let result = chunker.chunk(1, "test content here", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_with_metadata() {
        let chunker = SemanticChunker::new();
        let text = "Hello, world! ".repeat(100);
        let meta = ChunkMetadata::with_size_and_overlap(100, 10)
            .preserve_sentences(true)
            .max_chunks(5);
        let chunks = chunker.chunk(1, &text, Some(&meta)).unwrap();

        assert!(chunks.len() <= 5);
    }

    #[test]
    fn test_semantic_chunker_supports_parallel() {
        let chunker = SemanticChunker::new();
        assert!(chunker.supports_parallel());
    }

    #[test]
    fn test_find_char_boundary() {
        let s = "Hello 世界!";
        assert_eq!(find_char_boundary(s, 6), 6); // Before multi-byte char
        assert_eq!(find_char_boundary(s, 7), 6); // Middle of '世'
        assert_eq!(find_char_boundary(s, 8), 6); // Middle of '世'
        assert_eq!(find_char_boundary(s, 9), 9); // After '世'
    }

    #[test]
    fn test_semantic_chunker_default_impl() {
        // Default must match SemanticChunker::new()
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
        assert_eq!(chunker.min_chunk_size, 100);
    }

    #[test]
    fn test_semantic_chunker_min_chunk_size() {
        // Builder method overrides the default minimum chunk size
        let chunker = SemanticChunker::new().min_chunk_size(200);
        assert_eq!(chunker.min_chunk_size, 200);
    }

    #[test]
    fn test_semantic_chunker_description() {
        let chunker = SemanticChunker::new();
        let desc = chunker.description();
        assert!(desc.contains("Semantic"));
        assert!(!desc.is_empty());
    }

    #[test]
    fn test_find_char_boundary_at_end() {
        // Positions at or past the end clamp to s.len()
        let s = "hello";
        assert_eq!(find_char_boundary(s, 10), 5);
        assert_eq!(find_char_boundary(s, 5), 5);
    }

    #[test]
    fn test_semantic_chunker_large_text() {
        // Test with larger text to trigger more boundary finding
        let chunker = SemanticChunker::with_size(100);
        let text = "This is a sentence. ".repeat(50);
        let chunks = chunker.chunk(1, &text, None).unwrap();
        assert!(!chunks.is_empty());

        // Verify chunks have reasonable sizes
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_semantic_chunker_word_boundary() {
        // Test word boundary detection
        let chunker = SemanticChunker::with_size(15);
        let text = "hello world test content here";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should break at word boundaries where possible
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_with_overlap() {
        // Test chunking with overlap
        let chunker = SemanticChunker::with_size_and_overlap(50, 10);
        let text = "Word ".repeat(30);
        let chunks = chunker.chunk(1, &text, None).unwrap();

        assert!(chunks.len() > 1);
    }

    #[test]
    fn test_find_best_boundary_target_beyond_text() {
        // find_best_boundary returns text.len() when target_pos >= text.len()
        let chunker = SemanticChunker::with_size(100);
        let text = "Short text";
        // Call chunk with small text and large chunk size to exercise the boundary
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_find_best_boundary_search_region_empty() {
        // Exercises the search_start >= search_end fallback.
        // This happens with very small chunk sizes where the search window is minimal
        let chunker = SemanticChunker::with_size(5).min_chunk_size(1);
        let text = "ABCDEFGHIJKLMNOP";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert!(!chunks.is_empty());
        // All chunks should be valid UTF-8
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_find_best_boundary_single_newline() {
        // Single-newline boundaries beat mid-line splits when no
        // paragraph breaks are present
        let chunker = SemanticChunker::with_size(20);
        let text = "First line here\nSecond line here\nThird line";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should prefer breaking at single newlines when no paragraph breaks
        assert!(!chunks.is_empty());
        // Verify chunks are valid and cover the text
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_semantic_chunker_chunk_too_large() {
        // Chunk sizes above MAX_CHUNK_SIZE are rejected
        let chunker = SemanticChunker::with_size(MAX_CHUNK_SIZE + 1);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_semantic_chunker_force_progress() {
        // Exercises the forced-progress path when boundary finding stalls.
        // This can happen with pathological input where boundary finding fails
        let chunker = SemanticChunker::with_size(5).min_chunk_size(1);
        let text = "AAAAAAAAAA"; // No natural boundaries
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should still make progress and produce chunks
        assert!(!chunks.is_empty());
        // Verify all content is covered
        let total_content: String = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(total_content.len(), text.len());
    }

    #[test]
    fn test_semantic_chunker_merge_tiny_final_chunk() {
        // A trailing chunk below min_chunk_size is merged into its predecessor.
        // Create text where the final chunk would be tiny
        let chunker = SemanticChunker::with_size(50).min_chunk_size(20);
        let text = "This is a longer sentence that will be chunked. X";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Final chunk should be merged if it's too small
        if chunks.len() > 1 {
            let last = chunks.last().unwrap();
            assert!(last.size() >= 20 || chunks.len() == 1);
        }
    }

    #[test]
    fn test_semantic_chunker_sentence_boundary_detection() {
        // Sentence-ending punctuation ('.', '!', '?') is detected
        let chunker = SemanticChunker::with_size(25);
        let text = "Question? Exclamation! Statement.";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Should detect sentence boundaries
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunker_multibyte_utf8_boundaries() {
        // Test that multi-byte UTF-8 characters don't cause panics.
        // Smart quotes are 3 bytes each: U+201C (0xE2 0x80 0x9C) and U+201D (0xE2 0x80 0x9D)
        let chunker = SemanticChunker::with_size(50).min_chunk_size(10);

        // Text with smart quotes and other multi-byte chars
        let text = "This is \u{201C}quoted text\u{201D} with smart quotes. \
                    And more \u{201C}content\u{201D} here. \
                    Plus some emoji \u{1F389} and Japanese \u{65E5}\u{672C}\u{8A9E} for good measure.";

        let result = chunker.chunk(1, text, None);
        assert!(result.is_ok(), "Should not panic on multi-byte UTF-8 chars");

        let chunks = result.unwrap();
        assert!(!chunks.is_empty());

        // Verify all chunks are valid UTF-8 and match the source
        for chunk in &chunks {
            assert_eq!(&text[chunk.byte_range.clone()], chunk.content);
        }
    }

    #[test]
    fn test_semantic_chunker_large_multibyte_document() {
        use std::fmt::Write;

        // Simulate a document with many multi-byte characters throughout
        let chunker = SemanticChunker::with_size(100).min_chunk_size(20);

        // Build text with multi-byte chars at various positions
        let mut text = String::new();
        for i in 0..50 {
            let _ = write!(
                text,
                "Section {i}: \u{201C}This is quoted content\u{201D} with data. "
            );
        }

        let result = chunker.chunk(1, &text, None);
        assert!(
            result.is_ok(),
            "Should handle large docs with multi-byte chars"
        );

        let chunks = result.unwrap();
        // Verify chunk byte ranges are valid
        for chunk in &chunks {
            assert!(text.is_char_boundary(chunk.byte_range.start));
            assert!(text.is_char_boundary(chunk.byte_range.end));
        }
    }
}