//! Text processing module: chunking, enrichment, keyword extraction, and
//! language detection utilities (`graphrag_core::text`).
1/// Text analysis utilities
2pub mod analysis;
3/// Semantic boundary detection for BAR-RAG
4pub mod boundary_detection;
5/// Chunk enrichment pipeline
6pub mod chunk_enricher;
7/// Text chunking utilities module
8pub mod chunking;
9/// Trait-based chunking strategies
10pub mod chunking_strategies;
11/// LLM-based contextual chunk enrichment (Anthropic Contextual Retrieval pattern)
12pub mod contextual_enricher;
13/// Document structure representation
14pub mod document_structure;
15/// Extractive summarization
16pub mod extractive_summarizer;
17/// TF-IDF keyword extraction
18pub mod keyword_extraction;
19/// Late Chunking for context-preserving embeddings (Jina AI technique)
20pub mod late_chunking;
21/// Layout parser trait
22pub mod layout_parser;
23/// Document layout parsers
24pub mod parsers;
25/// Semantic chunking based on embedding similarity
26pub mod semantic_chunking;
27/// Semantic coherence scoring for BAR-RAG
28pub mod semantic_coherence;
29
30pub use analysis::{TextAnalyzer, TextStats};
31pub use boundary_detection::{Boundary, BoundaryDetectionConfig, BoundaryDetector, BoundaryType};
32pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
33pub use chunking_strategies::{
34    BoundaryAwareChunkingStrategy, HierarchicalChunkingStrategy, SemanticChunkingStrategy,
35};
36pub use contextual_enricher::{ContextualEnricher, ContextualEnricherConfig};
37pub use document_structure::{
38    DocumentStructure, Heading, HeadingHierarchy, Section, SectionNumber, SectionNumberFormat,
39    StructureStatistics,
40};
41pub use extractive_summarizer::ExtractiveSummarizer;
42pub use keyword_extraction::TfIdfKeywordExtractor;
43pub use late_chunking::{JinaLateChunkingClient, LateChunkingConfig, LateChunkingStrategy};
44pub use layout_parser::{LayoutParser, LayoutParserFactory};
45pub use semantic_chunking::{
46    BreakpointStrategy, SemanticChunk, SemanticChunker, SemanticChunkerConfig,
47};
48pub use semantic_coherence::{CoherenceConfig, OptimalSplit, ScoredChunk, SemanticCoherenceScorer};
49
50#[cfg(feature = "code-chunking")]
51pub use chunking_strategies::RustCodeChunkingStrategy;
52
53#[cfg(feature = "parallel-processing")]
54use crate::parallel::{ParallelProcessor, PerformanceMonitor};
55use crate::{
56    core::{ChunkId, ChunkingStrategy, Document, TextChunk},
57    Result,
58};
59use chunking::HierarchicalChunker;
60
/// Text processing utilities for chunking and preprocessing
///
/// Holds the target chunk size and overlap (both byte lengths) used by the
/// chunking methods. With the `parallel-processing` feature enabled it may
/// also carry a `ParallelProcessor` and a `PerformanceMonitor`.
#[derive(Debug)]
pub struct TextProcessor {
    // Target maximum chunk length, in bytes.
    chunk_size: usize,
    // Bytes of overlap between consecutive chunks.
    chunk_overlap: usize,
    // When set, batch methods may fan work out across rayon threads.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    // Collects timing statistics for processed tasks.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
71
72impl TextProcessor {
73    /// Create a new text processor
74    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
75        Ok(Self {
76            chunk_size,
77            chunk_overlap,
78            #[cfg(feature = "parallel-processing")]
79            parallel_processor: None,
80            #[cfg(feature = "parallel-processing")]
81            performance_monitor: PerformanceMonitor::new(),
82        })
83    }
84
85    /// Create a new text processor with parallel processing support
86    #[cfg(feature = "parallel-processing")]
87    pub fn with_parallel_processing(
88        chunk_size: usize,
89        chunk_overlap: usize,
90        parallel_processor: ParallelProcessor,
91    ) -> Result<Self> {
92        Ok(Self {
93            chunk_size,
94            chunk_overlap,
95            parallel_processor: Some(parallel_processor),
96            performance_monitor: PerformanceMonitor::new(),
97        })
98    }
99
100    /// Split text into chunks with overlap using hierarchical boundary preservation
101    pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
102        let chunker = HierarchicalChunker::new().with_min_size(50);
103        let chunks_text =
104            chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
105
106        let mut chunks = Vec::new();
107        let mut chunk_counter = 0;
108        let mut current_pos = 0;
109
110        for chunk_content in chunks_text {
111            if !chunk_content.trim().is_empty() {
112                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
113                let chunk_start = current_pos;
114                let chunk_end = chunk_start + chunk_content.len();
115
116                current_pos += chunk_content.len();
117
118                let chunk = TextChunk::new(
119                    chunk_id,
120                    document.id.clone(),
121                    chunk_content,
122                    chunk_start,
123                    chunk_end,
124                );
125                chunks.push(chunk);
126                chunk_counter += 1;
127            } else {
128                current_pos += chunk_content.len();
129            }
130        }
131
132        Ok(chunks)
133    }
134
135    /// Split text into chunks with overlap (legacy method)
136    pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
137        let text = &document.content;
138        let mut chunks = Vec::new();
139        let mut start = 0;
140        let mut chunk_counter = 0;
141
142        while start < text.len() {
143            let end = std::cmp::min(start + self.chunk_size, text.len());
144
145            // Try to find a good breaking point (sentence boundary)
146            let actual_end = if end < text.len() {
147                self.find_sentence_boundary(text, start, end)
148                    .unwrap_or_else(|| self.find_char_boundary(text, end))
149            } else {
150                end
151            };
152
153            let chunk_content = text[start..actual_end].to_string();
154
155            if !chunk_content.trim().is_empty() {
156                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
157                let chunk = TextChunk::new(
158                    chunk_id,
159                    document.id.clone(),
160                    chunk_content,
161                    start,
162                    actual_end,
163                );
164                chunks.push(chunk);
165                chunk_counter += 1;
166            }
167
168            // Calculate next start position with overlap
169            let next_start = if actual_end >= text.len() {
170                break;
171            } else {
172                let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
173                let safe_overlap = self.find_char_boundary(text, overlap_start);
174                std::cmp::max(start + 1, safe_overlap)
175            };
176
177            start = next_start;
178        }
179
180        Ok(chunks)
181    }
182
183    /// Chunk text and enrich with semantic metadata
184    pub fn chunk_text_with_enrichment(
185        &self,
186        document: &Document,
187        enricher: &mut ChunkEnricher,
188    ) -> Result<Vec<TextChunk>> {
189        // First, chunk the document
190        let mut chunks = self.chunk_text(document)?;
191
192        // Then enrich the chunks with metadata
193        enricher.enrich_chunks(&mut chunks, document)?;
194
195        Ok(chunks)
196    }
197
198    /// Chunk text hierarchically and enrich with semantic metadata
199    pub fn chunk_text_hierarchical_with_enrichment(
200        &self,
201        document: &Document,
202        enricher: &mut ChunkEnricher,
203    ) -> Result<Vec<TextChunk>> {
204        // First, chunk the document hierarchically
205        let mut chunks = self.chunk_text_hierarchical(document)?;
206
207        // Then enrich the chunks with metadata
208        enricher.enrich_chunks(&mut chunks, document)?;
209
210        Ok(chunks)
211    }
212
213    /// Create a default enricher for document processing
214    pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
215        let parser = LayoutParserFactory::create_for_document(document);
216        ChunkEnricher::new_default(parser)
217    }
218
219    /// Convenience method: chunk and enrich with auto-detected format
220    pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
221        let mut enricher = Self::create_default_enricher(document);
222        self.chunk_text_with_enrichment(document, &mut enricher)
223    }
224
225    /// Convenience method: chunk hierarchically and enrich with auto-detected format
226    pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
227        let mut enricher = Self::create_default_enricher(document);
228        self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
229    }
230
231    /// Chunk text using any strategy that implements ChunkingStrategy trait
232    ///
233    /// This method provides a flexible way to use different chunking approaches
234    /// while maintaining the same interface.
235    ///
236    /// # Arguments
237    /// * `document` - The document to chunk
238    /// * `strategy` - Any type implementing ChunkingStrategy
239    ///
240    /// # Returns
241    /// A vector of TextChunk objects
242    ///
243    /// # Examples
244    ///
245    /// ```rust
246    /// use graphrag_core::text::{TextProcessor, HierarchicalChunkingStrategy};
247    ///
248    /// let processor = TextProcessor::new(1000, 100)?;
249    /// let strategy = HierarchicalChunkingStrategy::new(500, 50, document.id.clone());
250    /// let chunks = processor.chunk_with_strategy(&document, &strategy)?;
251    /// ```
252    pub fn chunk_with_strategy(
253        &self,
254        document: &Document,
255        strategy: &dyn ChunkingStrategy,
256    ) -> Result<Vec<TextChunk>> {
257        let chunks = strategy.chunk(&document.content);
258        Ok(chunks)
259    }
260
261    /// Find a safe character boundary at or before the given position
262    fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
263        pos = pos.min(text.len());
264        while pos > 0 && !text.is_char_boundary(pos) {
265            pos -= 1;
266        }
267        pos
268    }
269
270    /// Find a safe character boundary within a slice at or before the given position
271    fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
272        pos = pos.min(text.len());
273        while pos > 0 && !text.is_char_boundary(pos) {
274            pos -= 1;
275        }
276        pos
277    }
278
279    /// Find a good sentence boundary for chunking
280    fn find_sentence_boundary(
281        &self,
282        text: &str,
283        start: usize,
284        preferred_end: usize,
285    ) -> Option<usize> {
286        // Ensure we're at character boundaries
287        let safe_start = self.find_char_boundary(text, start);
288        let safe_end = self.find_char_boundary(text, preferred_end);
289
290        if safe_start >= safe_end {
291            return None;
292        }
293
294        let search_window = &text[safe_start..safe_end];
295
296        // Look for sentence boundaries in the last part of the chunk
297        let search_start = search_window.len().saturating_sub(200);
298        // Find character boundary within the search window
299        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
300        let search_text = &search_window[safe_search_start..];
301
302        // Simple sentence boundary detection
303        let sentence_endings = ['.', '!', '?'];
304        let mut last_boundary = None;
305
306        for (i, ch) in search_text.char_indices() {
307            if sentence_endings.contains(&ch) {
308                // Check if next character is whitespace or end of text
309                let next_pos = i + ch.len_utf8();
310                if next_pos >= search_text.len()
311                    || search_text
312                        .chars()
313                        .nth(next_pos)
314                        .map_or(true, |c| c.is_whitespace())
315                {
316                    last_boundary = Some(safe_start + safe_search_start + next_pos);
317                }
318            }
319        }
320
321        last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
322    }
323
324    /// Find a word boundary for chunking
325    fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
326        // These should already be safe boundaries from the caller
327        if start >= preferred_end {
328            return None;
329        }
330
331        let search_window = &text[start..preferred_end];
332
333        // Find the last whitespace in the last 50 characters
334        let search_start = search_window.len().saturating_sub(50);
335        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
336        let search_text = &search_window[safe_search_start..];
337
338        search_text
339            .rfind(' ')
340            .map(|pos| start + safe_search_start + pos)
341    }
342
343    /// Clean and normalize text
344    pub fn clean_text(&self, text: &str) -> String {
345        text
346            // Normalize whitespace
347            .split_whitespace()
348            .collect::<Vec<_>>()
349            .join(" ")
350            // Remove excessive punctuation
351            .chars()
352            .collect::<String>()
353    }
354
355    /// Extract sentences from text
356    pub fn extract_sentences(&self, text: &str) -> Vec<String> {
357        let sentence_endings = ['.', '!', '?'];
358        let mut sentences = Vec::new();
359        let mut current_sentence = String::new();
360
361        for ch in text.chars() {
362            if sentence_endings.contains(&ch) {
363                let trimmed = current_sentence.trim().to_string();
364                if !trimmed.is_empty() {
365                    sentences.push(trimmed);
366                }
367                current_sentence.clear();
368            } else {
369                current_sentence.push(ch);
370            }
371        }
372
373        // Add any remaining text as a sentence
374        let trimmed = current_sentence.trim().to_string();
375        if !trimmed.is_empty() {
376            sentences.push(trimmed);
377        }
378
379        sentences
380    }
381
    /// Count words in text
    ///
    /// A "word" is any maximal run of non-whitespace characters, as defined
    /// by `str::split_whitespace`.
    pub fn word_count(&self, text: &str) -> usize {
        text.split_whitespace().count()
    }
386
    /// Process multiple documents in parallel
    ///
    /// With the `parallel-processing` feature enabled, a configured
    /// `ParallelProcessor`, and a batch it deems large enough, chunking runs
    /// on rayon; otherwise each document is chunked sequentially. Output
    /// order matches input order in both paths.
    ///
    /// # Errors
    /// Returns the first error produced by `chunk_text` on any document.
    pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(documents.len()) {
                    use rayon::prelude::*;
                    // Collecting a par_iter of Results short-circuits on the
                    // first Err and preserves input order on success.
                    let results: Result<Vec<Vec<TextChunk>>> = documents
                        .par_iter()
                        .map(|doc| self.chunk_text(doc))
                        .collect();
                    return results;
                }
            }
        }

        // Sequential fallback
        documents.iter().map(|doc| self.chunk_text(doc)).collect()
    }
406
    /// Parallel extraction of keywords from multiple texts
    ///
    /// Runs [`Self::extract_keywords`] over each text, on rayon when the
    /// `parallel-processing` feature is enabled and the configured processor
    /// approves the batch size, sequentially otherwise. Output order matches
    /// input order in both paths.
    pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_keywords(text, max_keywords))
                        .collect();
                }
            }
        }

        // Sequential fallback
        texts
            .iter()
            .map(|&text| self.extract_keywords(text, max_keywords))
            .collect()
    }
428
    /// Parallel sentence extraction from multiple texts
    ///
    /// Runs [`Self::extract_sentences`] over each text, on rayon when the
    /// `parallel-processing` feature is enabled and the configured processor
    /// approves the batch size, sequentially otherwise. Output order matches
    /// input order in both paths.
    pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_sentences(text))
                        .collect();
                }
            }
        }

        // Sequential fallback
        texts
            .iter()
            .map(|&text| self.extract_sentences(text))
            .collect()
    }
450
    /// Parallel text cleaning for multiple texts
    ///
    /// Runs [`Self::clean_text`] over each text, on rayon when the
    /// `parallel-processing` feature is enabled and the configured processor
    /// approves the batch size, sequentially otherwise. Output order matches
    /// input order in both paths.
    pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.clean_text(text))
                        .collect();
                }
            }
        }

        // Sequential fallback
        texts.iter().map(|&text| self.clean_text(text)).collect()
    }
469
470    /// Extract keywords using simple frequency analysis
471    pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
472        use std::collections::HashMap;
473
474        let words: Vec<String> = text
475            .split_whitespace()
476            .map(|w| w.to_lowercase())
477            .filter(|w| w.len() > 3) // Filter out short words
478            .filter(|w| !self.is_stop_word(w))
479            .collect();
480
481        let mut word_counts = HashMap::new();
482        for word in words {
483            *word_counts.entry(word).or_insert(0) += 1;
484        }
485
486        let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
487        sorted_words.sort_by(|a, b| b.1.cmp(&a.1));
488
489        sorted_words
490            .into_iter()
491            .take(max_keywords)
492            .map(|(word, _)| word)
493            .collect()
494    }
495
496    /// Simple stop word detection (English)
497    fn is_stop_word(&self, word: &str) -> bool {
498        const STOP_WORDS: &[&str] = &[
499            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
500            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
501            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
502            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
503            "go", "me",
504        ];
505        STOP_WORDS.contains(&word)
506    }
507
508    /// Get performance statistics
509    #[cfg(feature = "parallel-processing")]
510    pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
511        let stats = self.performance_monitor.get_stats();
512        (
513            stats.tasks_processed,
514            std::time::Duration::from_millis(stats.total_time_ms),
515        )
516    }
517
    /// Get average processing time per operation
    ///
    /// Converts the monitor's average (milliseconds) into a `Duration`.
    #[cfg(feature = "parallel-processing")]
    pub fn average_processing_time(&self) -> std::time::Duration {
        let avg_ms = self.performance_monitor.average_duration();
        // NOTE(review): `as u64` truncates fractional milliseconds; consider
        // `Duration::from_secs_f64(avg_ms / 1000.0)` if sub-ms precision matters.
        std::time::Duration::from_millis(avg_ms as u64)
    }
524
    /// Reset performance monitoring statistics
    ///
    /// Clears all counters and timings accumulated so far.
    #[cfg(feature = "parallel-processing")]
    pub fn reset_performance_stats(&mut self) {
        self.performance_monitor.reset();
    }
530
    /// Get parallel processing statistics if available
    ///
    /// Returns `None` when no `ParallelProcessor` was configured.
    #[cfg(feature = "parallel-processing")]
    pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
        self.parallel_processor.as_ref().map(|p| p.get_statistics())
    }
536}
537
/// Language detection utilities
pub struct LanguageDetector;

impl LanguageDetector {
    /// Simple language detection based on character patterns
    /// This is a very basic implementation - in practice you'd want a proper library
    ///
    /// Probes for diacritics characteristic of Spanish, then Portuguese,
    /// then French, in that priority order; the first match wins and
    /// English is the fallback.
    pub fn detect_language(text: &str) -> String {
        let has = |pred: fn(char) -> bool| text.chars().any(pred);

        let lang = if has(|c| matches!(c, 'ñ' | 'ó' | 'é' | 'í' | 'á' | 'ú')) {
            "es"
        } else if has(|c| matches!(c, 'ç' | 'ã' | 'õ')) {
            "pt"
        } else if has(|c| matches!(c, 'à' | 'è' | 'ù' | 'ò')) {
            "fr"
        } else {
            // Default to English
            "en"
        };
        lang.to_string()
    }
}
560
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    // Legacy chunking should produce at least one chunk and respect the
    // configured 100-byte maximum on the first chunk.
    #[test]
    fn test_text_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "Test Document".to_string(),
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.".to_string(),
        );

        let chunks = processor.chunk_text(&document).unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks[0].content.len() <= 100);
    }

    // Frequency-based keyword extraction must honor the max_keywords cap.
    #[test]
    fn test_keyword_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "machine learning artificial intelligence data science computer vision natural language processing";
        let keywords = processor.extract_keywords(text, 3);

        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    // Sentences split on '.', '!' and '?' with terminators stripped.
    #[test]
    fn test_sentence_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "First sentence. Second sentence! Third sentence?";
        let sentences = processor.extract_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    // End-to-end: auto-detected (markdown) enrichment should populate
    // chapter and/or keyword metadata on at least one chunk.
    #[test]
    fn test_enriched_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.".to_string(),
        );

        let chunks = processor.chunk_and_enrich(&document).unwrap();

        assert!(!chunks.is_empty());
        // At least some chunks should have enriched metadata
        let has_metadata = chunks
            .iter()
            .any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    // Enrichment with an explicitly supplied markdown parser, bypassing
    // format auto-detection.
    #[test]
    fn test_custom_enricher() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Test Chapter\n\nContent about machine learning here.".to_string(),
        );

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = processor
            .chunk_text_with_enrichment(&document, &mut enricher)
            .unwrap();

        assert!(!chunks.is_empty());
        // Verify metadata is present
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}