// shodh_memory/embeddings/chunking.rs
//! Text Chunking for Long-Content Embeddings
//!
//! MiniLM has a 256 token limit. Content beyond this is silently dropped,
//! making long memories unsearchable by their later content.
//!
//! This module implements two chunking strategies:
//!
//! ## 1. Fixed-Size Overlapping Chunking (`chunk_text`)
//! - Split text into ~200 token chunks (leaving room for special tokens)
//! - 50 token overlap between chunks for context continuity
//! - Good for general prose and documents
//!
//! ## 2. Semantic Chunking (`semantic_chunk_text`)
//! - Splits on natural boundaries (paragraphs, dialogue turns, sections)
//! - Preserves conversational context - never splits mid-turn
//! - Better for dialogue, structured text, logs, and multi-speaker content
//!
//! # Example (Fixed-Size)
//!
//! A 1000-token memory becomes ~6 chunks:
//! ```text
//! [0-200] [150-350] [300-500] [450-650] [600-800] [750-1000]
//! ```
//!
//! # Example (Semantic)
//!
//! A dialogue becomes natural chunks preserving speaker turns:
//! ```text
//! [Alice: Hi / Bob: Hello] [Alice: How are you? / Bob: Great!]
//! ```

use regex::Regex;
use std::sync::LazyLock;

/// Pattern to detect dialogue turns (e.g., "Alice:", "User:", "Speaker 1:")
///
/// Anchored to line starts (`(?m)^`): a label beginning with an uppercase
/// letter, up to 31 chars of letters/digits/`_`/`-`/spaces, then a colon.
static DIALOGUE_TURN_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^([A-Z][a-zA-Z0-9_\- ]{0,30})\s*:").unwrap());
38
/// Pattern to detect section headers or timestamps
///
/// Anchored to line starts: `[...]` brackets (e.g. timestamps), markdown
/// headings (`#` to `###` plus a word), `Session N`, or 3+ dashes.
static SECTION_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^(?:\[.*?\]|#{1,3}\s+\w|Session \d+|---+)").unwrap());
42
/// Chunk configuration for fixed-size chunking
///
/// Derives `Debug`/`Clone` (matching `ChunkResult`) so configs can be
/// logged and reused without reconstructing them.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Target chunk size in characters (approximate tokens * 4)
    pub chunk_size: usize,
    /// Overlap between chunks in characters
    pub overlap: usize,
    /// Minimum chunk size (don't create tiny trailing chunks)
    pub min_chunk_size: usize,
}

impl Default for ChunkConfig {
    fn default() -> Self {
        Self {
            // ~200 tokens * 4 chars/token = 800 chars
            // Leave headroom for tokenizer differences
            chunk_size: 800,
            // ~50 tokens overlap for context continuity
            overlap: 200,
            // Don't create chunks smaller than ~50 tokens
            min_chunk_size: 200,
        }
    }
}
66
/// Result of chunking a text
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// The chunked text segments
    pub chunks: Vec<String>,
    /// Original text length
    pub original_length: usize,
    /// Whether chunking was needed (content exceeded single chunk)
    pub was_chunked: bool,
}

impl ChunkResult {
    /// Calculate content coverage ratio (1.0 = all content in single chunk)
    ///
    /// Chunking covers the whole input, so any non-empty result reports
    /// full coverage; only an empty result reports zero.
    pub fn coverage_ratio(&self) -> f32 {
        if self.chunks.is_empty() { 0.0 } else { 1.0 }
    }
}
89
/// Find the nearest valid char boundary at or before the given byte index
#[inline]
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Scan backwards; byte 0 is always a valid boundary, so a match exists.
    (0..=index)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(0)
}
103
/// Find the nearest valid char boundary at or after the given byte index
#[inline]
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Scan forwards; s.len() is always a valid boundary, so a match exists.
    (index..=s.len())
        .find(|&i| s.is_char_boundary(i))
        .unwrap_or(s.len())
}
117
118/// Chunk text into overlapping segments for embedding
119///
120/// Uses sentence-aware splitting to avoid breaking mid-sentence when possible.
121pub fn chunk_text(text: &str, config: &ChunkConfig) -> ChunkResult {
122    let text = text.trim();
123    let original_length = text.len();
124
125    // If text fits in a single chunk, no need to split
126    if original_length <= config.chunk_size {
127        return ChunkResult {
128            chunks: vec![text.to_string()],
129            original_length,
130            was_chunked: false,
131        };
132    }
133
134    let mut chunks = Vec::new();
135    let mut start = 0;
136
137    while start < original_length {
138        // Calculate end position for this chunk, ensuring valid char boundary
139        let mut end = floor_char_boundary(text, (start + config.chunk_size).min(original_length));
140
141        // If we're not at the end, try to break at a sentence boundary
142        if end < original_length {
143            end = find_break_point(text, start, end, config.min_chunk_size);
144            // Ensure the break point is on a valid char boundary
145            end = floor_char_boundary(text, end);
146        }
147
148        // Ensure start is on a valid char boundary
149        start = ceil_char_boundary(text, start);
150
151        // Safety check: ensure we don't have start >= end
152        if start >= end {
153            break;
154        }
155
156        // Extract chunk (now safe - both start and end are valid char boundaries)
157        let chunk = text[start..end].trim();
158        if chunk.len() >= config.min_chunk_size || chunks.is_empty() {
159            chunks.push(chunk.to_string());
160        } else if let Some(last) = chunks.last_mut() {
161            // Append tiny trailing chunk to previous
162            last.push(' ');
163            last.push_str(chunk);
164        }
165
166        // Move start position, accounting for overlap
167        if end >= original_length {
168            break;
169        }
170        // Ensure new start is on a valid char boundary
171        start = ceil_char_boundary(text, end.saturating_sub(config.overlap));
172
173        // Ensure we make progress
174        if start <= chunks.len().saturating_sub(1) * (config.chunk_size - config.overlap) {
175            start = ceil_char_boundary(text, end);
176        }
177    }
178
179    ChunkResult {
180        chunks,
181        original_length,
182        was_chunked: true,
183    }
184}
185
/// Find a good break point for chunking (sentence or word boundary)
///
/// Scans `text[start..ideal_end]` for the last sentence terminator
/// (`.`/`!`/`?` followed by whitespace or the end of the window) at least
/// `min_size` bytes into the window; falls back to the last whitespace
/// position, and finally to `ideal_end` itself. Returns an absolute byte
/// index into `text`.
fn find_break_point(text: &str, start: usize, ideal_end: usize, min_size: usize) -> usize {
    let chunk = &text[start..ideal_end];

    // Last sentence boundary (. ! ?) followed by space or end of window.
    // Iterating lazily and keeping only `.last()` avoids collecting every
    // candidate into a temporary Vec as the previous version did.
    let sentence_boundary = chunk
        .char_indices()
        .filter_map(|(byte_offset, c)| {
            if (c == '.' || c == '!' || c == '?') && byte_offset >= min_size {
                // Check if followed by space or end of chunk
                let after = byte_offset + c.len_utf8();
                if chunk[after..].chars().next().is_none_or(|nc| nc.is_whitespace()) {
                    return Some(start + after);
                }
            }
            None
        })
        .last();

    if let Some(boundary) = sentence_boundary {
        return boundary;
    }

    // Fall back to the last word boundary at or past min_size
    let word_boundary = chunk
        .char_indices()
        .filter_map(|(i, c)| (c.is_whitespace() && i >= min_size).then_some(start + i))
        .last();

    // No good boundary found, use ideal_end
    word_boundary.unwrap_or(ideal_end)
}
230
/// Estimate token count for text (improved approximation)
///
/// Uses word-based estimation with adjustment for BPE subword tokenization.
/// More accurate than simple character division for mixed content (prose,
/// code, numbers).
///
/// Accuracy: ~85-90% for English prose, ~75-85% for code/mixed content.
/// For exact counts, use a proper tokenizer like tiktoken.
pub fn estimate_tokens(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    let word_count = text.split_whitespace().count();
    if word_count == 0 {
        // All-whitespace input (no words at all): fall back to a
        // character-based estimate of ~4 chars per token.
        return text.chars().count().div_ceil(4);
    }

    // BPE tokenization typically splits words into ~1.3 subword tokens on
    // average; code and technical content split more (camelCase, snake_case).
    let subword_tokens = (word_count as f64 * 1.3).ceil() as usize;

    // Punctuation and newlines often become their own tokens
    // (~3 such chars per extra token on average).
    let punctuation = text
        .chars()
        .filter(|&c| c.is_ascii_punctuation() || c == '\n')
        .count();

    subword_tokens + punctuation / 3
}
264
/// Configuration for semantic chunking
///
/// Derives `Debug`/`Clone` (matching `ChunkResult`) so configs can be
/// logged and reused without reconstructing them.
#[derive(Debug, Clone)]
pub struct SemanticChunkConfig {
    /// Target chunk size in characters
    pub target_size: usize,
    /// Maximum chunk size (hard limit)
    pub max_size: usize,
    /// Minimum chunk size (merge smaller segments)
    pub min_size: usize,
    /// Whether to preserve dialogue turns intact
    pub preserve_dialogue_turns: bool,
    /// Whether to split on paragraph boundaries
    pub split_on_paragraphs: bool,
}

impl Default for SemanticChunkConfig {
    fn default() -> Self {
        Self {
            target_size: 800,
            max_size: 1200,
            min_size: 100,
            preserve_dialogue_turns: true,
            split_on_paragraphs: true,
        }
    }
}
290
/// A semantic segment with metadata
#[derive(Debug, Clone)]
struct SemanticSegment {
    /// Segment text, trimmed of surrounding whitespace by the splitters.
    text: String,
    /// How the segment was recognized. Not read anywhere yet (hence the
    /// `allow(dead_code)`); kept for future routing/debugging.
    #[allow(dead_code)]
    segment_type: SegmentType,
}

/// Classification assigned to a segment during splitting.
#[derive(Debug, Clone, PartialEq)]
enum SegmentType {
    /// A single speaker turn (e.g. "Alice: ...")
    DialogueTurn,
    /// A paragraph delimited by blank lines
    Paragraph,
    /// A paragraph that starts with a section marker ([..], #, Session N, ---)
    Section,
    /// Unstructured text or a sentence-level fallback split
    Text,
}
306
307/// Semantic chunking: splits text on natural boundaries (dialogue turns, paragraphs, sections)
308/// and groups related content together.
309///
310/// This is better for conversational content, logs, and structured text than fixed-size chunking.
311pub fn semantic_chunk_text(text: &str, config: &SemanticChunkConfig) -> ChunkResult {
312    let text = text.trim();
313    let original_length = text.len();
314
315    // If text fits in a single chunk, no need to split
316    if original_length <= config.target_size {
317        return ChunkResult {
318            chunks: vec![text.to_string()],
319            original_length,
320            was_chunked: false,
321        };
322    }
323
324    // Step 1: Split into semantic segments
325    let segments = split_into_segments(text, config);
326
327    // Step 2: Group segments into chunks respecting size constraints
328    let chunks = group_segments_into_chunks(segments, config);
329
330    ChunkResult {
331        chunks,
332        original_length,
333        was_chunked: true,
334    }
335}
336
337/// Split text into semantic segments based on structure
338fn split_into_segments(text: &str, config: &SemanticChunkConfig) -> Vec<SemanticSegment> {
339    let mut segments = Vec::new();
340
341    // Check if this looks like dialogue (has speaker patterns)
342    let is_dialogue = config.preserve_dialogue_turns && DIALOGUE_TURN_PATTERN.is_match(text);
343
344    if is_dialogue {
345        // Split by dialogue turns
346        let turn_starts: Vec<usize> = DIALOGUE_TURN_PATTERN
347            .find_iter(text)
348            .map(|m| m.start())
349            .collect();
350
351        // Add any text before the first turn
352        if !turn_starts.is_empty() && turn_starts[0] > 0 {
353            let pre_text = text[..turn_starts[0]].trim();
354            if !pre_text.is_empty() {
355                segments.push(SemanticSegment {
356                    text: pre_text.to_string(),
357                    segment_type: SegmentType::Text,
358                });
359            }
360        }
361
362        for (i, &start) in turn_starts.iter().enumerate() {
363            let end = if i + 1 < turn_starts.len() {
364                turn_starts[i + 1]
365            } else {
366                text.len()
367            };
368
369            let turn_text = text[start..end].trim();
370            if !turn_text.is_empty() {
371                segments.push(SemanticSegment {
372                    text: turn_text.to_string(),
373                    segment_type: SegmentType::DialogueTurn,
374                });
375            }
376        }
377    } else if config.split_on_paragraphs {
378        // Split by paragraphs (double newlines) or section markers
379        let paragraph_pattern = Regex::new(r"\n\s*\n").unwrap();
380        let mut last_end = 0;
381
382        for mat in paragraph_pattern.find_iter(text) {
383            if mat.start() > last_end {
384                let para_text = text[last_end..mat.start()].trim();
385                if !para_text.is_empty() {
386                    let seg_type = if SECTION_PATTERN.is_match(para_text) {
387                        SegmentType::Section
388                    } else {
389                        SegmentType::Paragraph
390                    };
391                    segments.push(SemanticSegment {
392                        text: para_text.to_string(),
393                        segment_type: seg_type,
394                    });
395                }
396            }
397            last_end = mat.end();
398        }
399
400        // Add remaining text
401        if last_end < text.len() {
402            let remaining = text[last_end..].trim();
403            if !remaining.is_empty() {
404                segments.push(SemanticSegment {
405                    text: remaining.to_string(),
406                    segment_type: SegmentType::Paragraph,
407                });
408            }
409        }
410    } else {
411        // Fall back to sentence-based splitting
412        segments = split_by_sentences(text);
413    }
414
415    // If no segments found, treat entire text as one segment
416    if segments.is_empty() {
417        segments.push(SemanticSegment {
418            text: text.to_string(),
419            segment_type: SegmentType::Text,
420        });
421    }
422
423    segments
424}
425
426/// Split text by sentences for fallback
427fn split_by_sentences(text: &str) -> Vec<SemanticSegment> {
428    let sentence_pattern = Regex::new(r"[.!?]+\s+").unwrap();
429    let mut segments = Vec::new();
430    let mut last_end = 0;
431
432    for mat in sentence_pattern.find_iter(text) {
433        let sentence = text[last_end..mat.end()].trim();
434        if !sentence.is_empty() {
435            segments.push(SemanticSegment {
436                text: sentence.to_string(),
437                segment_type: SegmentType::Text,
438            });
439        }
440        last_end = mat.end();
441    }
442
443    // Add remaining text
444    if last_end < text.len() {
445        let remaining = text[last_end..].trim();
446        if !remaining.is_empty() {
447            segments.push(SemanticSegment {
448                text: remaining.to_string(),
449                segment_type: SegmentType::Text,
450            });
451        }
452    }
453
454    segments
455}
456
457/// Group segments into chunks respecting size constraints
458fn group_segments_into_chunks(
459    segments: Vec<SemanticSegment>,
460    config: &SemanticChunkConfig,
461) -> Vec<String> {
462    let mut chunks = Vec::new();
463    let mut current_chunk = String::new();
464
465    for segment in segments {
466        let segment_len = segment.text.len();
467
468        // If segment alone exceeds max size, we need to split it
469        if segment_len > config.max_size {
470            // Flush current chunk first
471            if !current_chunk.is_empty() {
472                chunks.push(current_chunk.trim().to_string());
473                current_chunk = String::new();
474            }
475
476            // Split the large segment using fixed-size chunking
477            let fixed_config = ChunkConfig {
478                chunk_size: config.target_size,
479                overlap: config.min_size / 2,
480                min_chunk_size: config.min_size,
481            };
482            let sub_chunks = chunk_text(&segment.text, &fixed_config);
483            chunks.extend(sub_chunks.chunks);
484            continue;
485        }
486
487        // Check if adding this segment would exceed target
488        let new_len = current_chunk.len() + segment_len + 1; // +1 for newline
489
490        if new_len > config.target_size && !current_chunk.is_empty() {
491            // Flush current chunk
492            chunks.push(current_chunk.trim().to_string());
493            current_chunk = String::new();
494        }
495
496        // Add segment to current chunk
497        if !current_chunk.is_empty() {
498            current_chunk.push('\n');
499        }
500        current_chunk.push_str(&segment.text);
501    }
502
503    // Flush remaining chunk
504    if !current_chunk.is_empty() {
505        let trimmed = current_chunk.trim().to_string();
506        // Merge tiny trailing chunk with previous if too small
507        if trimmed.len() < config.min_size && !chunks.is_empty() {
508            let last = chunks.pop().unwrap_or_default();
509            chunks.push(format!("{last}\n{trimmed}"));
510        } else {
511            chunks.push(trimmed);
512        }
513    }
514
515    chunks
516}
517
/// Detect if text appears to be dialogue/conversation format
///
/// True when any line starts with a speaker label such as "Alice:" or
/// "Speaker 1:" (see `DIALOGUE_TURN_PATTERN`).
pub fn is_dialogue_format(text: &str) -> bool {
    DIALOGUE_TURN_PATTERN.is_match(text)
}
522
523/// Auto-select the best chunking strategy based on content
524pub fn auto_chunk_text(text: &str) -> ChunkResult {
525    if is_dialogue_format(text) {
526        semantic_chunk_text(text, &SemanticChunkConfig::default())
527    } else {
528        chunk_text(text, &ChunkConfig::default())
529    }
530}
531
#[cfg(test)]
mod tests {
    use super::*;

    // Text shorter than chunk_size must pass through untouched as one chunk.
    #[test]
    fn test_short_text_no_chunking() {
        let config = ChunkConfig::default();
        let result = chunk_text("This is a short text.", &config);

        assert_eq!(result.chunks.len(), 1);
        assert!(!result.was_chunked);
        assert_eq!(result.chunks[0], "This is a short text.");
    }

    // Oversized text must split into multiple chunks, each at least
    // min_chunk_size, with overlaps making the total >= the original.
    #[test]
    fn test_long_text_chunking() {
        let config = ChunkConfig {
            chunk_size: 100,
            overlap: 20,
            min_chunk_size: 30,
        };

        // Create text longer than chunk_size
        let text = "This is sentence one. This is sentence two. This is sentence three. \
                   This is sentence four. This is sentence five. This is sentence six. \
                   This is sentence seven. This is sentence eight.";

        let result = chunk_text(text, &config);

        assert!(result.was_chunked);
        assert!(result.chunks.len() > 1);

        // Verify each chunk has meaningful content
        for chunk in &result.chunks {
            assert!(
                chunk.len() >= config.min_chunk_size,
                "Chunk too small: '{}' (len={})",
                chunk,
                chunk.len()
            );
        }

        // Verify total chunked content is at least as long as original (with overlaps)
        let total_len: usize = result.chunks.iter().map(|c| c.len()).sum();
        assert!(
            total_len >= result.original_length,
            "Total chunk length {} < original {}",
            total_len,
            result.original_length
        );
    }

    // find_break_point should prefer sentence terminators, so every chunk
    // except possibly the last ends with . ! or ?.
    #[test]
    fn test_sentence_boundary_respected() {
        let config = ChunkConfig {
            chunk_size: 50,
            overlap: 10,
            min_chunk_size: 20,
        };

        let text = "First sentence here. Second sentence follows. Third sentence ends.";
        let result = chunk_text(text, &config);

        // Chunks should end at sentence boundaries when possible
        for chunk in &result.chunks {
            let trimmed = chunk.trim();
            if !trimmed.is_empty() && result.chunks.len() > 1 {
                // Most chunks should end with sentence-ending punctuation
                let last_char = trimmed.chars().last().unwrap();
                // Allow some flexibility - not all chunks will end at sentence boundary
                assert!(
                    last_char == '.'
                        || last_char == '!'
                        || last_char == '?'
                        || chunk == result.chunks.last().unwrap(),
                    "Chunk '{chunk}' doesn't end at sentence boundary"
                );
            }
        }
    }

    // Consecutive chunks should share words due to the configured overlap.
    #[test]
    fn test_overlap_exists() {
        let config = ChunkConfig {
            chunk_size: 60,
            overlap: 20,
            min_chunk_size: 20,
        };

        let text = "AAAA BBBB CCCC DDDD EEEE FFFF GGGG HHHH IIII JJJJ KKKK LLLL MMMM";
        let result = chunk_text(text, &config);

        if result.chunks.len() >= 2 {
            // Check that consecutive chunks have overlapping content
            for i in 0..result.chunks.len() - 1 {
                let chunk1 = &result.chunks[i];
                let chunk2 = &result.chunks[i + 1];

                // Find common words
                let words1: std::collections::HashSet<_> = chunk1.split_whitespace().collect();
                let words2: std::collections::HashSet<_> = chunk2.split_whitespace().collect();
                let common: Vec<_> = words1.intersection(&words2).collect();

                // Should have some overlap
                assert!(
                    !common.is_empty() || chunk1.len() < config.overlap,
                    "No overlap between chunks {} and {}",
                    i,
                    i + 1
                );
            }
        }
    }

    // Pins the word * 1.3 + punctuation / 3 estimation formula.
    #[test]
    fn test_token_estimation() {
        // Empty string
        assert_eq!(estimate_tokens(""), 0);

        // Single word: 1 * 1.3 = 2 (ceil)
        assert_eq!(estimate_tokens("test"), 2);

        // Two words: 2 * 1.3 = 3 (ceil)
        assert_eq!(estimate_tokens("hello world"), 3);

        // Sentence with punctuation: 5 words * 1.3 = 7 (ceil) + 1 punct token (3 punct chars / 3)
        assert_eq!(estimate_tokens("Hello, world! How are you?"), 8);

        // Code-like content with more punctuation
        let code = "fn main() { println!(\"hello\"); }";
        let tokens = estimate_tokens(code);
        assert!(tokens >= 5 && tokens <= 15, "Code tokens: {}", tokens);

        // No whitespace (falls back to char-based)
        assert_eq!(estimate_tokens("abcdefgh"), 2); // 8 chars / 4 = 2
    }

    // Large input should produce many chunks, none far above chunk_size.
    #[test]
    fn test_very_long_content() {
        let config = ChunkConfig::default();

        // Create 10KB of content
        let long_text = "This is a test sentence. ".repeat(400);
        let result = chunk_text(&long_text, &config);

        assert!(result.was_chunked);
        assert!(result.chunks.len() > 10); // Should have many chunks
        assert_eq!(result.coverage_ratio(), 1.0);

        // Verify no chunk exceeds config size significantly
        for chunk in &result.chunks {
            assert!(
                chunk.len() <= config.chunk_size + 100,
                "Chunk too large: {} chars",
                chunk.len()
            );
        }
    }

    // Unique markers planted at the start, middle, and end of a long text
    // must each survive into some chunk (no region silently dropped).
    #[test]
    fn test_chunking_quality_unique_content_searchable() {
        let config = ChunkConfig::default();

        // Create content with UNIQUE markers at beginning, middle, and end
        // These markers should each appear in at least one chunk
        let beginning = "ALPHA_BEGINNING_MARKER is a unique identifier at the start.";
        let middle_padding = "This is filler content to push things apart. ".repeat(30);
        let middle = "BETA_MIDDLE_MARKER represents content in the center of the document.";
        let end_padding = "More filler content for separation between sections. ".repeat(30);
        let end = "GAMMA_END_MARKER signifies the conclusion of this memory content.";

        let full_text = format!("{beginning} {middle_padding} {middle} {end_padding} {end}");

        let result = chunk_text(&full_text, &config);

        // Content should be chunked
        assert!(result.was_chunked, "Content should require chunking");
        assert!(result.chunks.len() >= 3, "Should have multiple chunks");

        // Verify each unique marker appears in at least one chunk
        let has_alpha = result.chunks.iter().any(|c| c.contains("ALPHA_BEGINNING"));
        let has_beta = result.chunks.iter().any(|c| c.contains("BETA_MIDDLE"));
        let has_gamma = result.chunks.iter().any(|c| c.contains("GAMMA_END"));

        assert!(has_alpha, "ALPHA marker (beginning) not found in any chunk");
        assert!(has_beta, "BETA marker (middle) not found in any chunk");
        assert!(has_gamma, "GAMMA marker (end) not found in any chunk");

        // Log chunk info for debugging
        println!("Total chunks: {}", result.chunks.len());
        println!("Original length: {} chars", result.original_length);
        for (i, chunk) in result.chunks.iter().enumerate() {
            let markers: Vec<&str> = vec![
                if chunk.contains("ALPHA") { "ALPHA" } else { "" },
                if chunk.contains("BETA") { "BETA" } else { "" },
                if chunk.contains("GAMMA") { "GAMMA" } else { "" },
            ]
            .into_iter()
            .filter(|m| !m.is_empty())
            .collect();
            println!(
                "  Chunk {}: {} chars {}",
                i,
                chunk.len(),
                if markers.is_empty() {
                    String::new()
                } else {
                    format!("[contains: {}]", markers.join(", "))
                }
            );
        }
    }

    // Every numbered sentence in the input must land in at least one chunk —
    // guards against coverage gaps between consecutive chunks.
    #[test]
    fn test_chunking_coverage_no_content_lost() {
        let config = ChunkConfig {
            chunk_size: 200,
            overlap: 50,
            min_chunk_size: 50,
        };

        // Create text with numbered sentences for easy tracking
        let sentences: Vec<String> = (1..=20)
            .map(|i| format!("Sentence number {i} contains unique information. "))
            .collect();
        let text = sentences.join("");

        let result = chunk_text(&text, &config);

        // Every sentence number should appear in at least one chunk
        for i in 1..=20 {
            let marker = format!("number {i}");
            let found = result.chunks.iter().any(|c| c.contains(&marker));
            assert!(
                found,
                "Sentence {i} not found in any chunk! Coverage gap detected."
            );
        }
    }
}
771}