langextract_rust/
chunking.rs

//! Text chunking functionality for processing large documents.
//!
//! This module provides text chunking capabilities for handling documents that
//! exceed the language model's context window. It supports multiple chunking
//! strategies and overlap management to reduce the risk of losing information
//! at chunk boundaries.
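//!
//! A minimal usage sketch (assuming the `chunking` module and its types are
//! reachable from the crate root; adjust paths to match the crate layout):
//!
//! ```ignore
//! use langextract_rust::chunking::{ChunkingConfig, ChunkingStrategy, TextChunker};
//!
//! let long_document_text = String::from("..."); // placeholder for real input
//! let chunker = TextChunker::with_config(ChunkingConfig {
//!     max_chunk_size: 1000,
//!     overlap_size: 100,
//!     strategy: ChunkingStrategy::Adaptive,
//!     ..Default::default()
//! });
//! let chunks = chunker.chunk_text(&long_document_text, None)?;
//! for chunk in &chunks {
//!     println!("chunk {} at byte offset {}", chunk.id, chunk.char_offset);
//! }
//! ```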

use crate::{
    data::{AnnotatedDocument, CharInterval, Document, Extraction},
    exceptions::LangExtractResult,
};
use regex::Regex;

/// Different strategies for chunking text
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Fixed character-based chunking
    FixedSize,
    /// Split at sentence boundaries
    Sentence,
    /// Split at paragraph boundaries
    Paragraph,
    /// Adaptive chunking based on content structure
    Adaptive,
}

/// A chunk of text with metadata
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// The chunk ID
    pub id: usize,
    /// Text content of the chunk
    pub text: String,
    /// Byte offset of the chunk from the beginning of the original document
    pub char_offset: usize,
    /// Length of the chunk in bytes (via `str::len`, i.e. UTF-8 bytes, not characters)
    pub char_length: usize,
    /// Original document this chunk belongs to
    pub document_id: Option<String>,
    /// Whether this chunk overlaps with adjacent chunks
    pub has_overlap: bool,
    /// Overlap information (start and end overlap lengths)
    pub overlap_info: Option<(usize, usize)>,
}

impl TextChunk {
    /// Create a new text chunk
    pub fn new(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: false,
            overlap_info: None,
        }
    }

    /// Create a chunk with overlap information
    pub fn with_overlap(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
        overlap_start: usize,
        overlap_end: usize,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: overlap_start > 0 || overlap_end > 0,
            overlap_info: Some((overlap_start, overlap_end)),
        }
    }

    /// Get the character interval for this chunk in the original document
    pub fn char_interval(&self) -> CharInterval {
        CharInterval::new(
            Some(self.char_offset),
            Some(self.char_offset + self.char_length),
        )
    }

    /// Get the core text without overlaps
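    ///
    /// A short sketch of the overlap trimming (this mirrors
    /// `test_chunk_with_overlap` in the tests below):
    ///
    /// ```ignore
    /// let chunk = TextChunk::with_overlap(0, "overlap test text".to_string(), 0, None, 3, 4);
    /// assert_eq!(chunk.core_text(), "rlap test ");
    /// ```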
    pub fn core_text(&self) -> &str {
        if let Some((start_overlap, end_overlap)) = self.overlap_info {
            let start = start_overlap;
            let end = self.text.len().saturating_sub(end_overlap);
            // Fall back to the full text if the overlap bounds are inverted
            // or do not land on UTF-8 character boundaries.
            self.text.get(start..end).unwrap_or(&self.text)
        } else {
            &self.text
        }
    }
}

/// Configuration for text chunking
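///
/// Individual fields can be overridden with struct-update syntax; a small
/// sketch:
///
/// ```ignore
/// let config = ChunkingConfig {
///     max_chunk_size: 4000,
///     strategy: ChunkingStrategy::Sentence,
///     ..Default::default()
/// };
/// ```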
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Maximum characters per chunk
    pub max_chunk_size: usize,
    /// Overlap size in characters
    pub overlap_size: usize,
    /// Chunking strategy to use
    pub strategy: ChunkingStrategy,
    /// Minimum chunk size (chunks smaller than this will be merged)
    pub min_chunk_size: usize,
    /// Whether to respect paragraph boundaries
    pub respect_paragraphs: bool,
    /// Whether to respect sentence boundaries
    pub respect_sentences: bool,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            max_chunk_size: 2000,
            overlap_size: 200,
            strategy: ChunkingStrategy::Adaptive,
            min_chunk_size: 100,
            respect_paragraphs: true,
            respect_sentences: true,
        }
    }
}

/// Text chunker for processing large documents
pub struct TextChunker {
    config: ChunkingConfig,
    sentence_regex: Regex,
    paragraph_regex: Regex,
}

impl TextChunker {
    /// Create a new text chunker with default configuration
    pub fn new() -> Self {
        Self::with_config(ChunkingConfig::default())
    }

    /// Create a new text chunker with custom configuration
    pub fn with_config(config: ChunkingConfig) -> Self {
        // Regex for sentence boundaries (basic implementation)
        let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();

        // Regex for paragraph boundaries
        let paragraph_regex = Regex::new(r"\n\s*\n").unwrap();

        Self {
            config,
            sentence_regex,
            paragraph_regex,
        }
    }

    /// Chunk a document into smaller pieces
    pub fn chunk_document(&self, document: &Document) -> LangExtractResult<Vec<TextChunk>> {
        self.chunk_text(&document.text, document.document_id.clone())
    }

    /// Chunk text into smaller pieces
    pub fn chunk_text(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        if text.len() <= self.config.max_chunk_size {
            // Text is small enough, return as single chunk
            return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
        }

        match self.config.strategy {
            ChunkingStrategy::FixedSize => self.chunk_fixed_size(text, document_id),
            ChunkingStrategy::Sentence => self.chunk_by_sentences(text, document_id),
            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text, document_id),
            ChunkingStrategy::Adaptive => self.chunk_adaptive(text, document_id),
        }
    }

    /// Fixed-size chunking with overlap
    fn chunk_fixed_size(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_pos = 0;

        while current_pos < text.len() {
            // Snap the chunk end back to a UTF-8 character boundary so the
            // slice below cannot panic on multi-byte characters.
            let mut chunk_end = std::cmp::min(
                current_pos + self.config.max_chunk_size,
                text.len(),
            );
            while !text.is_char_boundary(chunk_end) {
                chunk_end -= 1;
            }

            let chunk_text = text[current_pos..chunk_end].to_string();

            let overlap_start = if chunk_id > 0 { self.config.overlap_size } else { 0 };
            let overlap_end = if chunk_end < text.len() { self.config.overlap_size } else { 0 };

            let chunk = TextChunk::with_overlap(
                chunk_id,
                chunk_text,
                current_pos,
                document_id.clone(),
                overlap_start,
                overlap_end,
            );

            chunks.push(chunk);
            chunk_id += 1;

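            // Worked example with the defaults: max_chunk_size = 2000 and
            // overlap_size = 200 give a step of 1800, so chunk 0 covers bytes
            // 0..2000, chunk 1 covers bytes 1800..3800, and adjacent chunks
            // share a 200-byte overlap region.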
            // Move forward, accounting for overlap
            let step_size = self.config.max_chunk_size
                .saturating_sub(self.config.overlap_size)
                .max(1); // guard: overlap_size >= max_chunk_size would otherwise stall the loop
            current_pos += step_size;
            // Snap forward to the next character boundary so the next slice
            // starts on valid UTF-8.
            while current_pos < text.len() && !text.is_char_boundary(current_pos) {
                current_pos += 1;
            }
        }

        Ok(chunks)
    }

    /// Chunk by sentence boundaries
    fn chunk_by_sentences(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let sentence_boundaries = self.find_sentence_boundaries(text);
        self.chunk_by_boundaries(text, &sentence_boundaries, document_id)
    }

    /// Chunk by paragraph boundaries
    fn chunk_by_paragraphs(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let paragraph_boundaries = self.find_paragraph_boundaries(text);
        self.chunk_by_boundaries(text, &paragraph_boundaries, document_id)
    }

    /// Adaptive chunking that respects natural boundaries
    fn chunk_adaptive(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        // First try paragraph boundaries
        let paragraph_boundaries = self.find_paragraph_boundaries(text);
        if !paragraph_boundaries.is_empty() && self.config.respect_paragraphs {
            if let Ok(chunks) = self.chunk_by_boundaries(text, &paragraph_boundaries, document_id.clone()) {
                // Check if any chunks are too large
                let oversized_chunks: Vec<_> = chunks.iter()
                    .filter(|c| c.char_length > self.config.max_chunk_size)
                    .collect();

                if oversized_chunks.is_empty() {
                    return Ok(chunks);
                }
            }
        }

        // Fall back to sentence boundaries
        if self.config.respect_sentences {
            let sentence_boundaries = self.find_sentence_boundaries(text);
            if let Ok(chunks) = self.chunk_by_boundaries(text, &sentence_boundaries, document_id.clone()) {
                let oversized_chunks: Vec<_> = chunks.iter()
                    .filter(|c| c.char_length > self.config.max_chunk_size)
                    .collect();

                if oversized_chunks.is_empty() {
                    return Ok(chunks);
                }
            }
        }

        // Final fallback to fixed-size chunking
        self.chunk_fixed_size(text, document_id)
    }

    /// Find sentence boundaries in text
    fn find_sentence_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0]; // Start of text

        for mat in self.sentence_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len()); // End of text
        }

        boundaries
    }

    /// Find paragraph boundaries in text
    fn find_paragraph_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0]; // Start of text

        for mat in self.paragraph_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len()); // End of text
        }

        boundaries
    }

    /// Chunk text based on provided boundaries
    fn chunk_by_boundaries(
        &self,
        text: &str,
        boundaries: &[usize],
        document_id: Option<String>,
    ) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_start = 0;

        for &boundary in boundaries.iter().skip(1) {
            let potential_chunk_size = boundary - current_start;

            // If the potential chunk is within size limits, use it
            if potential_chunk_size <= self.config.max_chunk_size {
                if potential_chunk_size >= self.config.min_chunk_size || chunks.is_empty() {
                    let chunk_text = text[current_start..boundary].to_string();
                    let chunk = TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone());
                    chunks.push(chunk);
                    chunk_id += 1;
                    current_start = boundary;
                }
            } else {
                // Chunk is too large, need to split it further
                // For now, fall back to fixed-size chunking for this section
                let section = &text[current_start..boundary];
                let mut section_chunks = self.chunk_fixed_size(section, document_id.clone())?;

                // Adjust offsets
                for chunk in &mut section_chunks {
                    chunk.id = chunk_id;
                    chunk.char_offset += current_start;
                    chunk_id += 1;
                }

                chunks.extend(section_chunks);
                current_start = boundary;
            }
        }

        // Append any trailing text the boundary loop left behind (for example,
        // a final segment shorter than min_chunk_size), so no input is
        // silently dropped.
        if current_start < text.len() {
            let chunk_text = text[current_start..].to_string();
            chunks.push(TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone()));
        }

        if chunks.is_empty() {
            // Fallback: create a single chunk with the entire text
            chunks.push(TextChunk::new(0, text.to_string(), 0, document_id));
        }

        Ok(chunks)
    }

    /// Get chunking configuration
    pub fn config(&self) -> &ChunkingConfig {
        &self.config
    }
}

impl Default for TextChunker {
    fn default() -> Self {
        Self::new()
    }
}

/// Result aggregator for combining extractions from multiple chunks
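///
/// A minimal usage sketch (construction of `Extraction` values is elided here,
/// since it depends on the `data` module's API):
///
/// ```ignore
/// let aggregator = ResultAggregator::new();
/// let results = vec![ChunkResult::success(0, vec![], 0, 18)];
/// let doc = aggregator
///     .aggregate_chunk_results(results, "some original text".to_string(), None)?;
/// assert!(doc.document_id.is_none());
/// ```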
pub struct ResultAggregator {
    /// Similarity threshold for duplicate detection
    similarity_threshold: f32,
    /// Whether to merge overlapping extractions
    merge_overlaps: bool,
}

impl ResultAggregator {
    /// Create a new result aggregator
    pub fn new() -> Self {
        Self {
            similarity_threshold: 0.8,
            merge_overlaps: true,
        }
    }

    /// Create a result aggregator with custom settings
    pub fn with_settings(similarity_threshold: f32, merge_overlaps: bool) -> Self {
        Self {
            similarity_threshold,
            merge_overlaps,
        }
    }

    /// Aggregate results from multiple chunks into a single annotated document
    pub fn aggregate_chunk_results(
        &self,
        chunk_results: Vec<ChunkResult>,
        original_text: String,
        document_id: Option<String>,
    ) -> LangExtractResult<AnnotatedDocument> {
        let mut all_extractions = Vec::new();

        // Collect all extractions from chunks
        for chunk_result in chunk_results {
            if let Some(extractions) = chunk_result.extractions {
                // Character positions should already be adjusted by the alignment process
                // during chunk processing, so we don't need to add the offset again here
                all_extractions.extend(extractions);
            }
        }

        // Deduplicate and merge overlapping extractions
        let deduplicated_extractions = if self.merge_overlaps {
            self.deduplicate_extractions(all_extractions)?
        } else {
            all_extractions
        };

        // Create the aggregated document
        let mut annotated_doc = AnnotatedDocument::with_extractions(deduplicated_extractions, original_text);
        annotated_doc.document_id = document_id;

        Ok(annotated_doc)
    }

    /// Remove duplicate extractions based on similarity
    fn deduplicate_extractions(&self, extractions: Vec<Extraction>) -> LangExtractResult<Vec<Extraction>> {
        let mut unique_extractions = Vec::new();

        for extraction in extractions {
            let mut is_duplicate = false;

            // Check against existing extractions
            for existing in &unique_extractions {
                if self.are_similar_extractions(&extraction, existing) {
                    is_duplicate = true;
                    break;
                }
            }

            if !is_duplicate {
                unique_extractions.push(extraction);
            }
        }

        Ok(unique_extractions)
    }

    /// Check if two extractions are similar enough to be considered duplicates
    fn are_similar_extractions(&self, e1: &Extraction, e2: &Extraction) -> bool {
        // Same extraction class and similar text
        if e1.extraction_class == e2.extraction_class {
            let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
            return similarity >= self.similarity_threshold;
        }

        // Check for overlapping character positions
        if let (Some(interval1), Some(interval2)) = (&e1.char_interval, &e2.char_interval) {
            if interval1.overlaps_with(interval2) {
                let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
                return similarity >= self.similarity_threshold;
            }
        }

        false
    }

    /// Calculate simple text similarity (Jaccard similarity on words)
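    ///
    /// Jaccard(A, B) = |A ∩ B| / |A ∪ B|, computed over the sets of
    /// whitespace-separated words. For example, "patient has a fever" vs
    /// "patient has fever" shares 3 words out of a union of 4, giving 0.75.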
    fn text_similarity(&self, text1: &str, text2: &str) -> f32 {
        if text1 == text2 {
            return 1.0;
        }

        let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
        let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();

        if words1.is_empty() && words2.is_empty() {
            return 1.0;
        }

        let intersection = words1.intersection(&words2).count();
        let union = words1.union(&words2).count();

        if union == 0 {
            0.0
        } else {
            intersection as f32 / union as f32
        }
    }
}

impl Default for ResultAggregator {
    fn default() -> Self {
        Self::new()
    }
}

/// Result from processing a single chunk
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// ID of the chunk that was processed
    pub chunk_id: usize,
    /// Extractions found in this chunk
    pub extractions: Option<Vec<Extraction>>,
    /// Character offset of this chunk in the original document
    pub char_offset: usize,
    /// Length of the chunk
    pub char_length: usize,
    /// Whether processing was successful
    pub success: bool,
    /// Error message if processing failed
    pub error: Option<String>,
    /// Processing time for this chunk
    pub processing_time: Option<std::time::Duration>,
}

impl ChunkResult {
    /// Create a successful chunk result
    pub fn success(
        chunk_id: usize,
        extractions: Vec<Extraction>,
        char_offset: usize,
        char_length: usize,
    ) -> Self {
        Self {
            chunk_id,
            extractions: Some(extractions),
            char_offset,
            char_length,
            success: true,
            error: None,
            processing_time: None,
        }
    }

    /// Create a failed chunk result
    pub fn failure(
        chunk_id: usize,
        char_offset: usize,
        char_length: usize,
        error: String,
    ) -> Self {
        Self {
            chunk_id,
            extractions: None,
            char_offset,
            char_length,
            success: false,
            error: Some(error),
            processing_time: None,
        }
    }

    /// Set processing time
    pub fn with_processing_time(mut self, duration: std::time::Duration) -> Self {
        self.processing_time = Some(duration);
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fixed_size_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 20,
            overlap_size: 5,
            strategy: ChunkingStrategy::FixedSize,
            ..Default::default()
        });

        let text = "This is a test document with some text that needs to be chunked into smaller pieces.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.char_length <= 20);
        }
    }

    #[test]
    fn test_sentence_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 50,
            strategy: ChunkingStrategy::Sentence,
            ..Default::default()
        });

        let text = "First sentence. Second sentence! Third sentence? Fourth sentence.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        // Should produce multiple chunks based on sentence boundaries
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            println!("Chunk: '{}'", chunk.text);
        }
    }

    #[test]
    fn test_small_text_no_chunking() {
        let chunker = TextChunker::new();
        let text = "Short text.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, text);
    }

    #[test]
    fn test_chunk_char_interval() {
        let chunk = TextChunk::new(0, "test".to_string(), 10, None);
        let interval = chunk.char_interval();

        assert_eq!(interval.start_pos, Some(10));
        assert_eq!(interval.end_pos, Some(14));
    }

    #[test]
    fn test_chunk_with_overlap() {
        let chunk = TextChunk::with_overlap(
            0,
            "overlap test text".to_string(),
            0,
            None,
            3,
            4,
        );

        assert!(chunk.has_overlap);
        assert_eq!(chunk.overlap_info, Some((3, 4)));
        assert_eq!(chunk.core_text(), "rlap test ");
    }
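
    #[test]
    fn test_paragraph_chunking() {
        // A sketch exercising the paragraph strategy: three short paragraphs
        // separated by blank lines, with a small min_chunk_size so each
        // paragraph becomes its own chunk.
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 60,
            min_chunk_size: 10,
            strategy: ChunkingStrategy::Paragraph,
            ..Default::default()
        });

        let text = "First paragraph text.\n\nSecond paragraph text.\n\nThird paragraph text.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 3);
        // The non-overlapping paragraph chunks should cover the whole input.
        let total: usize = chunks.iter().map(|c| c.char_length).sum();
        assert_eq!(total, text.len());
    }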
626}