graphrag_core/text/
mod.rs

1//! Text processing pipeline: chunking, enrichment, and analysis.
2//!
3//! Splits documents into [`TextChunk`]s via pluggable `ChunkingStrategy` implementations
4//! (including late chunking), and optionally enriches each chunk with contextual metadata.
5
6/// Text analysis utilities
7pub mod analysis;
8/// Semantic boundary detection for BAR-RAG
9pub mod boundary_detection;
10/// Chunk enrichment pipeline
11pub mod chunk_enricher;
12/// Text chunking utilities module
13pub mod chunking;
14/// Trait-based chunking strategies
15pub mod chunking_strategies;
16/// LLM-based contextual chunk enrichment (Anthropic Contextual Retrieval pattern)
17pub mod contextual_enricher;
18/// Document structure representation
19pub mod document_structure;
20/// Extractive summarization
21pub mod extractive_summarizer;
22/// TF-IDF keyword extraction
23pub mod keyword_extraction;
24/// Late Chunking for context-preserving embeddings (Jina AI technique)
25pub mod late_chunking;
26/// Layout parser trait
27pub mod layout_parser;
28/// Document layout parsers
29pub mod parsers;
30/// Semantic chunking based on embedding similarity
31pub mod semantic_chunking;
32/// Semantic coherence scoring for BAR-RAG
33pub mod semantic_coherence;
34
35pub use analysis::{TextAnalyzer, TextStats};
36pub use boundary_detection::{Boundary, BoundaryDetectionConfig, BoundaryDetector, BoundaryType};
37pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
38pub use chunking_strategies::{
39    BoundaryAwareChunkingStrategy, HierarchicalChunkingStrategy, SemanticChunkingStrategy,
40};
41pub use contextual_enricher::{ContextualEnricher, ContextualEnricherConfig};
42pub use document_structure::{
43    DocumentStructure, Heading, HeadingHierarchy, Section, SectionNumber, SectionNumberFormat,
44    StructureStatistics,
45};
46pub use extractive_summarizer::ExtractiveSummarizer;
47pub use keyword_extraction::TfIdfKeywordExtractor;
48pub use late_chunking::{JinaLateChunkingClient, LateChunkingConfig, LateChunkingStrategy};
49pub use layout_parser::{LayoutParser, LayoutParserFactory};
50pub use semantic_chunking::{
51    BreakpointStrategy, SemanticChunk, SemanticChunker, SemanticChunkerConfig,
52};
53pub use semantic_coherence::{CoherenceConfig, OptimalSplit, ScoredChunk, SemanticCoherenceScorer};
54
55#[cfg(feature = "code-chunking")]
56pub use chunking_strategies::RustCodeChunkingStrategy;
57
58#[cfg(feature = "parallel-processing")]
59use crate::parallel::{ParallelProcessor, PerformanceMonitor};
60use crate::{
61    core::{ChunkId, ChunkingStrategy, Document, TextChunk},
62    Result,
63};
64use chunking::HierarchicalChunker;
65
/// Text processing utilities for chunking and preprocessing.
///
/// Stores the chunking parameters shared by the chunking methods and, when the
/// `parallel-processing` feature is enabled, an optional parallel executor plus
/// a performance monitor.
#[derive(Debug)]
pub struct TextProcessor {
    /// Maximum chunk length; `chunk_text` treats it as a byte count.
    chunk_size: usize,
    /// How far the next chunk reaches back into the previous one;
    /// `chunk_text` treats it as a byte count.
    chunk_overlap: usize,
    /// Executor consulted by the `batch_*` methods; `None` means sequential.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Source of the figures returned by the performance-statistics getters.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
76
77impl TextProcessor {
78    /// Create a new text processor
79    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
80        Ok(Self {
81            chunk_size,
82            chunk_overlap,
83            #[cfg(feature = "parallel-processing")]
84            parallel_processor: None,
85            #[cfg(feature = "parallel-processing")]
86            performance_monitor: PerformanceMonitor::new(),
87        })
88    }
89
90    /// Create a new text processor with parallel processing support
91    #[cfg(feature = "parallel-processing")]
92    pub fn with_parallel_processing(
93        chunk_size: usize,
94        chunk_overlap: usize,
95        parallel_processor: ParallelProcessor,
96    ) -> Result<Self> {
97        Ok(Self {
98            chunk_size,
99            chunk_overlap,
100            parallel_processor: Some(parallel_processor),
101            performance_monitor: PerformanceMonitor::new(),
102        })
103    }
104
105    /// Split text into chunks with overlap using hierarchical boundary preservation
106    pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
107        let chunker = HierarchicalChunker::new().with_min_size(50);
108        let chunks_text =
109            chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
110
111        let mut chunks = Vec::new();
112        let mut chunk_counter = 0;
113        let mut current_pos = 0;
114
115        for chunk_content in chunks_text {
116            if !chunk_content.trim().is_empty() {
117                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
118                let chunk_start = current_pos;
119                let chunk_end = chunk_start + chunk_content.len();
120
121                current_pos += chunk_content.len();
122
123                let chunk = TextChunk::new(
124                    chunk_id,
125                    document.id.clone(),
126                    chunk_content,
127                    chunk_start,
128                    chunk_end,
129                );
130                chunks.push(chunk);
131                chunk_counter += 1;
132            } else {
133                current_pos += chunk_content.len();
134            }
135        }
136
137        Ok(chunks)
138    }
139
140    /// Split text into chunks with overlap (legacy method)
141    pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
142        let text = &document.content;
143        let mut chunks = Vec::new();
144        let mut start = 0;
145        let mut chunk_counter = 0;
146
147        while start < text.len() {
148            let end = std::cmp::min(start + self.chunk_size, text.len());
149
150            // Try to find a good breaking point (sentence boundary)
151            let actual_end = if end < text.len() {
152                self.find_sentence_boundary(text, start, end)
153                    .unwrap_or_else(|| self.find_char_boundary(text, end))
154            } else {
155                end
156            };
157
158            let chunk_content = text[start..actual_end].to_string();
159
160            if !chunk_content.trim().is_empty() {
161                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
162                let chunk = TextChunk::new(
163                    chunk_id,
164                    document.id.clone(),
165                    chunk_content,
166                    start,
167                    actual_end,
168                );
169                chunks.push(chunk);
170                chunk_counter += 1;
171            }
172
173            // Calculate next start position with overlap
174            let next_start = if actual_end >= text.len() {
175                break;
176            } else {
177                let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
178                let safe_overlap = self.find_char_boundary(text, overlap_start);
179                std::cmp::max(start + 1, safe_overlap)
180            };
181
182            start = next_start;
183        }
184
185        Ok(chunks)
186    }
187
188    /// Chunk text and enrich with semantic metadata
189    pub fn chunk_text_with_enrichment(
190        &self,
191        document: &Document,
192        enricher: &mut ChunkEnricher,
193    ) -> Result<Vec<TextChunk>> {
194        // First, chunk the document
195        let mut chunks = self.chunk_text(document)?;
196
197        // Then enrich the chunks with metadata
198        enricher.enrich_chunks(&mut chunks, document)?;
199
200        Ok(chunks)
201    }
202
203    /// Chunk text hierarchically and enrich with semantic metadata
204    pub fn chunk_text_hierarchical_with_enrichment(
205        &self,
206        document: &Document,
207        enricher: &mut ChunkEnricher,
208    ) -> Result<Vec<TextChunk>> {
209        // First, chunk the document hierarchically
210        let mut chunks = self.chunk_text_hierarchical(document)?;
211
212        // Then enrich the chunks with metadata
213        enricher.enrich_chunks(&mut chunks, document)?;
214
215        Ok(chunks)
216    }
217
218    /// Create a default enricher for document processing
219    pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
220        let parser = LayoutParserFactory::create_for_document(document);
221        ChunkEnricher::new_default(parser)
222    }
223
224    /// Convenience method: chunk and enrich with auto-detected format
225    pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
226        let mut enricher = Self::create_default_enricher(document);
227        self.chunk_text_with_enrichment(document, &mut enricher)
228    }
229
230    /// Convenience method: chunk hierarchically and enrich with auto-detected format
231    pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
232        let mut enricher = Self::create_default_enricher(document);
233        self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
234    }
235
236    /// Chunk text using any strategy that implements ChunkingStrategy trait
237    ///
238    /// This method provides a flexible way to use different chunking approaches
239    /// while maintaining the same interface.
240    ///
241    /// # Arguments
242    /// * `document` - The document to chunk
243    /// * `strategy` - Any type implementing ChunkingStrategy
244    ///
245    /// # Returns
246    /// A vector of TextChunk objects
247    ///
248    /// # Examples
249    ///
250    /// ```ignore
251    /// use graphrag_core::text::{TextProcessor, HierarchicalChunkingStrategy};
252    ///
253    /// let processor = TextProcessor::new(1000, 100)?;
254    /// let strategy = HierarchicalChunkingStrategy::new(500, 50, document.id.clone());
255    /// let chunks = processor.chunk_with_strategy(&document, &strategy)?;
256    /// ```
257    pub fn chunk_with_strategy(
258        &self,
259        document: &Document,
260        strategy: &dyn ChunkingStrategy,
261    ) -> Result<Vec<TextChunk>> {
262        let chunks = strategy.chunk(&document.content);
263        Ok(chunks)
264    }
265
266    /// Find a safe character boundary at or before the given position
267    fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
268        pos = pos.min(text.len());
269        while pos > 0 && !text.is_char_boundary(pos) {
270            pos -= 1;
271        }
272        pos
273    }
274
275    /// Find a safe character boundary within a slice at or before the given position
276    fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
277        pos = pos.min(text.len());
278        while pos > 0 && !text.is_char_boundary(pos) {
279            pos -= 1;
280        }
281        pos
282    }
283
284    /// Find a good sentence boundary for chunking
285    fn find_sentence_boundary(
286        &self,
287        text: &str,
288        start: usize,
289        preferred_end: usize,
290    ) -> Option<usize> {
291        // Ensure we're at character boundaries
292        let safe_start = self.find_char_boundary(text, start);
293        let safe_end = self.find_char_boundary(text, preferred_end);
294
295        if safe_start >= safe_end {
296            return None;
297        }
298
299        let search_window = &text[safe_start..safe_end];
300
301        // Look for sentence boundaries in the last part of the chunk
302        let search_start = search_window.len().saturating_sub(200);
303        // Find character boundary within the search window
304        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
305        let search_text = &search_window[safe_search_start..];
306
307        // Simple sentence boundary detection
308        let sentence_endings = ['.', '!', '?'];
309        let mut last_boundary = None;
310
311        for (i, ch) in search_text.char_indices() {
312            if sentence_endings.contains(&ch) {
313                // Check if next character is whitespace or end of text
314                let next_pos = i + ch.len_utf8();
315                if next_pos >= search_text.len()
316                    || search_text
317                        .chars()
318                        .nth(next_pos)
319                        .map_or(true, |c| c.is_whitespace())
320                {
321                    last_boundary = Some(safe_start + safe_search_start + next_pos);
322                }
323            }
324        }
325
326        last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
327    }
328
329    /// Find a word boundary for chunking
330    fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
331        // These should already be safe boundaries from the caller
332        if start >= preferred_end {
333            return None;
334        }
335
336        let search_window = &text[start..preferred_end];
337
338        // Find the last whitespace in the last 50 characters
339        let search_start = search_window.len().saturating_sub(50);
340        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
341        let search_text = &search_window[safe_search_start..];
342
343        search_text
344            .rfind(' ')
345            .map(|pos| start + safe_search_start + pos)
346    }
347
348    /// Clean and normalize text
349    pub fn clean_text(&self, text: &str) -> String {
350        text
351            // Normalize whitespace
352            .split_whitespace()
353            .collect::<Vec<_>>()
354            .join(" ")
355            // Remove excessive punctuation
356            .chars()
357            .collect::<String>()
358    }
359
360    /// Extract sentences from text
361    pub fn extract_sentences(&self, text: &str) -> Vec<String> {
362        let sentence_endings = ['.', '!', '?'];
363        let mut sentences = Vec::new();
364        let mut current_sentence = String::new();
365
366        for ch in text.chars() {
367            if sentence_endings.contains(&ch) {
368                let trimmed = current_sentence.trim().to_string();
369                if !trimmed.is_empty() {
370                    sentences.push(trimmed);
371                }
372                current_sentence.clear();
373            } else {
374                current_sentence.push(ch);
375            }
376        }
377
378        // Add any remaining text as a sentence
379        let trimmed = current_sentence.trim().to_string();
380        if !trimmed.is_empty() {
381            sentences.push(trimmed);
382        }
383
384        sentences
385    }
386
387    /// Count words in text
388    pub fn word_count(&self, text: &str) -> usize {
389        text.split_whitespace().count()
390    }
391
392    /// Process multiple documents in parallel
393    pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
394        #[cfg(feature = "parallel-processing")]
395        {
396            if let Some(processor) = &self.parallel_processor {
397                if processor.should_use_parallel(documents.len()) {
398                    use rayon::prelude::*;
399                    let results: Result<Vec<Vec<TextChunk>>> = documents
400                        .par_iter()
401                        .map(|doc| self.chunk_text(doc))
402                        .collect();
403                    return results;
404                }
405            }
406        }
407
408        // Sequential fallback
409        documents.iter().map(|doc| self.chunk_text(doc)).collect()
410    }
411
412    /// Parallel extraction of keywords from multiple texts
413    pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
414        #[cfg(feature = "parallel-processing")]
415        {
416            if let Some(processor) = &self.parallel_processor {
417                if processor.should_use_parallel(texts.len()) {
418                    use rayon::prelude::*;
419                    return texts
420                        .par_iter()
421                        .map(|&text| self.extract_keywords(text, max_keywords))
422                        .collect();
423                }
424            }
425        }
426
427        // Sequential fallback
428        texts
429            .iter()
430            .map(|&text| self.extract_keywords(text, max_keywords))
431            .collect()
432    }
433
434    /// Parallel sentence extraction from multiple texts
435    pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
436        #[cfg(feature = "parallel-processing")]
437        {
438            if let Some(processor) = &self.parallel_processor {
439                if processor.should_use_parallel(texts.len()) {
440                    use rayon::prelude::*;
441                    return texts
442                        .par_iter()
443                        .map(|&text| self.extract_sentences(text))
444                        .collect();
445                }
446            }
447        }
448
449        // Sequential fallback
450        texts
451            .iter()
452            .map(|&text| self.extract_sentences(text))
453            .collect()
454    }
455
456    /// Parallel text cleaning for multiple texts
457    pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
458        #[cfg(feature = "parallel-processing")]
459        {
460            if let Some(processor) = &self.parallel_processor {
461                if processor.should_use_parallel(texts.len()) {
462                    use rayon::prelude::*;
463                    return texts
464                        .par_iter()
465                        .map(|&text| self.clean_text(text))
466                        .collect();
467                }
468            }
469        }
470
471        // Sequential fallback
472        texts.iter().map(|&text| self.clean_text(text)).collect()
473    }
474
475    /// Extract keywords using simple frequency analysis
476    pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
477        use std::collections::HashMap;
478
479        let words: Vec<String> = text
480            .split_whitespace()
481            .map(|w| w.to_lowercase())
482            .filter(|w| w.len() > 3) // Filter out short words
483            .filter(|w| !self.is_stop_word(w))
484            .collect();
485
486        let mut word_counts = HashMap::new();
487        for word in words {
488            *word_counts.entry(word).or_insert(0) += 1;
489        }
490
491        let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
492        sorted_words.sort_by_key(|item| std::cmp::Reverse(item.1));
493
494        sorted_words
495            .into_iter()
496            .take(max_keywords)
497            .map(|(word, _)| word)
498            .collect()
499    }
500
501    /// Simple stop word detection (English)
502    fn is_stop_word(&self, word: &str) -> bool {
503        const STOP_WORDS: &[&str] = &[
504            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
505            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
506            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
507            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
508            "go", "me",
509        ];
510        STOP_WORDS.contains(&word)
511    }
512
513    /// Get performance statistics
514    #[cfg(feature = "parallel-processing")]
515    pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
516        let stats = self.performance_monitor.get_stats();
517        (
518            stats.tasks_processed,
519            std::time::Duration::from_millis(stats.total_time_ms),
520        )
521    }
522
523    /// Get average processing time per operation
524    #[cfg(feature = "parallel-processing")]
525    pub fn average_processing_time(&self) -> std::time::Duration {
526        let avg_ms = self.performance_monitor.average_duration();
527        std::time::Duration::from_millis(avg_ms as u64)
528    }
529
530    /// Reset performance monitoring statistics
531    #[cfg(feature = "parallel-processing")]
532    pub fn reset_performance_stats(&mut self) {
533        self.performance_monitor.reset();
534    }
535
536    /// Get parallel processing statistics if available
537    #[cfg(feature = "parallel-processing")]
538    pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
539        self.parallel_processor.as_ref().map(|p| p.get_statistics())
540    }
541}
542
/// Language detection utilities
pub struct LanguageDetector;

impl LanguageDetector {
    /// Guess a two-letter language code from accented characters in `text`.
    ///
    /// The checks run in a fixed order and the first match wins: Spanish
    /// markers (`ñ ó é í á ú`) give `"es"`, then Portuguese markers (`ç ã õ`)
    /// give `"pt"`, then French markers (`à è ù ò`) give `"fr"`. Anything
    /// else, including empty input, is reported as `"en"`.
    ///
    /// This is a very rough heuristic; use a dedicated language-detection
    /// library when accuracy matters.
    pub fn detect_language(text: &str) -> String {
        let has_any = |markers: &[char]| text.chars().any(|c| markers.contains(&c));

        let code = if has_any(&['ñ', 'ó', 'é', 'í', 'á', 'ú']) {
            "es"
        } else if has_any(&['ç', 'ã', 'õ']) {
            "pt"
        } else if has_any(&['à', 'è', 'ù', 'ò']) {
            "fr"
        } else {
            // Default to English
            "en"
        };

        code.to_string()
    }
}
565
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    /// Build a document with id `"test"` from a title and body.
    fn sample_document(title: &str, content: &str) -> Document {
        Document::new(
            DocumentId::new("test".to_string()),
            title.to_string(),
            content.to_string(),
        )
    }

    /// Legacy chunking yields at least one chunk and respects the size limit.
    #[test]
    fn test_text_chunking() {
        let document = sample_document(
            "Test Document",
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.",
        );
        let processor = TextProcessor::new(100, 20).unwrap();

        let chunks = processor.chunk_text(&document).unwrap();

        assert!(!chunks.is_empty());
        assert!(chunks[0].content.len() <= 100);
    }

    /// Keyword extraction returns something and honours `max_keywords`.
    #[test]
    fn test_keyword_extraction() {
        let text = "machine learning artificial intelligence data science computer vision natural language processing";
        let processor = TextProcessor::new(1000, 100).unwrap();

        let keywords = processor.extract_keywords(text, 3);

        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    /// Sentences are split on `.`, `!`, `?` with the terminator removed.
    #[test]
    fn test_sentence_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();

        let sentences =
            processor.extract_sentences("First sentence. Second sentence! Third sentence?");

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    /// The auto-detected enricher attaches metadata to at least one chunk.
    #[test]
    fn test_enriched_chunking() {
        let document = sample_document(
            "test.md",
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.",
        );
        let processor = TextProcessor::new(100, 20).unwrap();

        let chunks = processor.chunk_and_enrich(&document).unwrap();

        assert!(!chunks.is_empty());
        // At least some chunks should have enriched metadata
        let has_metadata = chunks
            .iter()
            .any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    /// An explicitly constructed Markdown enricher produces keywords.
    #[test]
    fn test_custom_enricher() {
        let document = sample_document(
            "test.md",
            "# Test Chapter\n\nContent about machine learning here.",
        );
        let processor = TextProcessor::new(100, 20).unwrap();

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = processor
            .chunk_text_with_enrichment(&document, &mut enricher)
            .unwrap();

        assert!(!chunks.is_empty());
        // Verify metadata is present
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}
graphrag_core/text/mod.rs

graphrag_core/text/
mod.rs