// graphrag_core/text/mod.rs
1/// Text chunking utilities module
2pub mod chunking;
3/// Semantic chunking based on embedding similarity
4pub mod semantic_chunking;
5/// Document structure representation
6pub mod document_structure;
7/// Text analysis utilities
8pub mod analysis;
9/// TF-IDF keyword extraction
10pub mod keyword_extraction;
11/// Extractive summarization
12pub mod extractive_summarizer;
13/// Layout parser trait
14pub mod layout_parser;
15/// Document layout parsers
16pub mod parsers;
17/// Chunk enrichment pipeline
18pub mod chunk_enricher;
19/// Trait-based chunking strategies
20pub mod chunking_strategies;
21
22pub use semantic_chunking::{
23    SemanticChunk, SemanticChunker, SemanticChunkerConfig, BreakpointStrategy,
24};
25pub use document_structure::{
26    DocumentStructure, Heading, Section, HeadingHierarchy, SectionNumber,
27    SectionNumberFormat, StructureStatistics,
28};
29pub use analysis::{TextAnalyzer, TextStats};
30pub use keyword_extraction::TfIdfKeywordExtractor;
31pub use extractive_summarizer::ExtractiveSummarizer;
32pub use layout_parser::{LayoutParser, LayoutParserFactory};
33pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
34pub use chunking_strategies::{
35    HierarchicalChunkingStrategy, SemanticChunkingStrategy,
36};
37
38#[cfg(feature = "code-chunking")]
39pub use chunking_strategies::RustCodeChunkingStrategy;
40
41use crate::{
42    core::{ChunkId, Document, TextChunk, ChunkingStrategy},
43    Result,
44};
45#[cfg(feature = "parallel-processing")]
46use crate::parallel::{ParallelProcessor, PerformanceMonitor};
47use chunking::HierarchicalChunker;
48
/// Text processing utilities for chunking and preprocessing
#[derive(Debug)]
pub struct TextProcessor {
    /// Target maximum chunk length in bytes (used as a byte offset in `chunk_text`).
    chunk_size: usize,
    /// Number of bytes of overlap carried between consecutive chunks.
    chunk_overlap: usize,
    /// Optional parallel executor; `None` means batch methods run sequentially.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Accumulates timing/throughput statistics for processed tasks.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
59
60impl TextProcessor {
61    /// Create a new text processor
62    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
63        Ok(Self {
64            chunk_size,
65            chunk_overlap,
66            #[cfg(feature = "parallel-processing")]
67            parallel_processor: None,
68            #[cfg(feature = "parallel-processing")]
69            performance_monitor: PerformanceMonitor::new(),
70        })
71    }
72
73    /// Create a new text processor with parallel processing support
74    #[cfg(feature = "parallel-processing")]
75    pub fn with_parallel_processing(
76        chunk_size: usize,
77        chunk_overlap: usize,
78        parallel_processor: ParallelProcessor,
79    ) -> Result<Self> {
80        Ok(Self {
81            chunk_size,
82            chunk_overlap,
83            parallel_processor: Some(parallel_processor),
84            performance_monitor: PerformanceMonitor::new(),
85        })
86    }
87
88    /// Split text into chunks with overlap using hierarchical boundary preservation
89    pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
90        let chunker = HierarchicalChunker::new().with_min_size(50);
91        let chunks_text = chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
92
93        let mut chunks = Vec::new();
94        let mut chunk_counter = 0;
95        let mut current_pos = 0;
96
97        for chunk_content in chunks_text {
98            if !chunk_content.trim().is_empty() {
99                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
100                let chunk_start = current_pos;
101                let chunk_end = chunk_start + chunk_content.len();
102
103                current_pos += chunk_content.len();
104
105                let chunk = TextChunk::new(
106                    chunk_id,
107                    document.id.clone(),
108                    chunk_content,
109                    chunk_start,
110                    chunk_end,
111                );
112                chunks.push(chunk);
113                chunk_counter += 1;
114            } else {
115                current_pos += chunk_content.len();
116            }
117        }
118
119        Ok(chunks)
120    }
121
122    /// Split text into chunks with overlap (legacy method)
123    pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
124        let text = &document.content;
125        let mut chunks = Vec::new();
126        let mut start = 0;
127        let mut chunk_counter = 0;
128
129        while start < text.len() {
130            let end = std::cmp::min(start + self.chunk_size, text.len());
131
132            // Try to find a good breaking point (sentence boundary)
133            let actual_end = if end < text.len() {
134                self.find_sentence_boundary(text, start, end)
135                    .unwrap_or_else(|| self.find_char_boundary(text, end))
136            } else {
137                end
138            };
139
140            let chunk_content = text[start..actual_end].to_string();
141
142            if !chunk_content.trim().is_empty() {
143                let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
144                let chunk = TextChunk::new(
145                    chunk_id,
146                    document.id.clone(),
147                    chunk_content,
148                    start,
149                    actual_end,
150                );
151                chunks.push(chunk);
152                chunk_counter += 1;
153            }
154
155            // Calculate next start position with overlap
156            let next_start = if actual_end >= text.len() {
157                break;
158            } else {
159                let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
160                let safe_overlap = self.find_char_boundary(text, overlap_start);
161                std::cmp::max(start + 1, safe_overlap)
162            };
163
164            start = next_start;
165        }
166
167        Ok(chunks)
168    }
169
170    /// Chunk text and enrich with semantic metadata
171    pub fn chunk_text_with_enrichment(
172        &self,
173        document: &Document,
174        enricher: &mut ChunkEnricher,
175    ) -> Result<Vec<TextChunk>> {
176        // First, chunk the document
177        let mut chunks = self.chunk_text(document)?;
178
179        // Then enrich the chunks with metadata
180        enricher.enrich_chunks(&mut chunks, document)?;
181
182        Ok(chunks)
183    }
184
185    /// Chunk text hierarchically and enrich with semantic metadata
186    pub fn chunk_text_hierarchical_with_enrichment(
187        &self,
188        document: &Document,
189        enricher: &mut ChunkEnricher,
190    ) -> Result<Vec<TextChunk>> {
191        // First, chunk the document hierarchically
192        let mut chunks = self.chunk_text_hierarchical(document)?;
193
194        // Then enrich the chunks with metadata
195        enricher.enrich_chunks(&mut chunks, document)?;
196
197        Ok(chunks)
198    }
199
200    /// Create a default enricher for document processing
201    pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
202        let parser = LayoutParserFactory::create_for_document(document);
203        ChunkEnricher::new_default(parser)
204    }
205
206    /// Convenience method: chunk and enrich with auto-detected format
207    pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
208        let mut enricher = Self::create_default_enricher(document);
209        self.chunk_text_with_enrichment(document, &mut enricher)
210    }
211
212    /// Convenience method: chunk hierarchically and enrich with auto-detected format
213    pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
214        let mut enricher = Self::create_default_enricher(document);
215        self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
216    }
217
218    /// Chunk text using any strategy that implements ChunkingStrategy trait
219    ///
220    /// This method provides a flexible way to use different chunking approaches
221    /// while maintaining the same interface.
222    ///
223    /// # Arguments
224    /// * `document` - The document to chunk
225    /// * `strategy` - Any type implementing ChunkingStrategy
226    ///
227    /// # Returns
228    /// A vector of TextChunk objects
229    ///
230    /// # Examples
231    ///
232    /// ```rust
233    /// use graphrag_core::text::{TextProcessor, HierarchicalChunkingStrategy};
234    ///
235    /// let processor = TextProcessor::new(1000, 100)?;
236    /// let strategy = HierarchicalChunkingStrategy::new(500, 50, document.id.clone());
237    /// let chunks = processor.chunk_with_strategy(&document, &strategy)?;
238    /// ```
239    pub fn chunk_with_strategy(&self, document: &Document, strategy: &dyn ChunkingStrategy) -> Result<Vec<TextChunk>> {
240        let chunks = strategy.chunk(&document.content);
241        Ok(chunks)
242    }
243
244    /// Find a safe character boundary at or before the given position
245    fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
246        pos = pos.min(text.len());
247        while pos > 0 && !text.is_char_boundary(pos) {
248            pos -= 1;
249        }
250        pos
251    }
252
253    /// Find a safe character boundary within a slice at or before the given position
254    fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
255        pos = pos.min(text.len());
256        while pos > 0 && !text.is_char_boundary(pos) {
257            pos -= 1;
258        }
259        pos
260    }
261
262    /// Find a good sentence boundary for chunking
263    fn find_sentence_boundary(
264        &self,
265        text: &str,
266        start: usize,
267        preferred_end: usize,
268    ) -> Option<usize> {
269        // Ensure we're at character boundaries
270        let safe_start = self.find_char_boundary(text, start);
271        let safe_end = self.find_char_boundary(text, preferred_end);
272
273        if safe_start >= safe_end {
274            return None;
275        }
276
277        let search_window = &text[safe_start..safe_end];
278
279        // Look for sentence boundaries in the last part of the chunk
280        let search_start = search_window.len().saturating_sub(200);
281        // Find character boundary within the search window
282        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
283        let search_text = &search_window[safe_search_start..];
284
285        // Simple sentence boundary detection
286        let sentence_endings = ['.', '!', '?'];
287        let mut last_boundary = None;
288
289        for (i, ch) in search_text.char_indices() {
290            if sentence_endings.contains(&ch) {
291                // Check if next character is whitespace or end of text
292                let next_pos = i + ch.len_utf8();
293                if next_pos >= search_text.len()
294                    || search_text
295                        .chars()
296                        .nth(next_pos)
297                        .map_or(true, |c| c.is_whitespace())
298                {
299                    last_boundary = Some(safe_start + safe_search_start + next_pos);
300                }
301            }
302        }
303
304        last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
305    }
306
307    /// Find a word boundary for chunking
308    fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
309        // These should already be safe boundaries from the caller
310        if start >= preferred_end {
311            return None;
312        }
313
314        let search_window = &text[start..preferred_end];
315
316        // Find the last whitespace in the last 50 characters
317        let search_start = search_window.len().saturating_sub(50);
318        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
319        let search_text = &search_window[safe_search_start..];
320
321        search_text
322            .rfind(' ')
323            .map(|pos| start + safe_search_start + pos)
324    }
325
326    /// Clean and normalize text
327    pub fn clean_text(&self, text: &str) -> String {
328        text
329            // Normalize whitespace
330            .split_whitespace()
331            .collect::<Vec<_>>()
332            .join(" ")
333            // Remove excessive punctuation
334            .chars()
335            .collect::<String>()
336    }
337
338    /// Extract sentences from text
339    pub fn extract_sentences(&self, text: &str) -> Vec<String> {
340        let sentence_endings = ['.', '!', '?'];
341        let mut sentences = Vec::new();
342        let mut current_sentence = String::new();
343
344        for ch in text.chars() {
345            if sentence_endings.contains(&ch) {
346                let trimmed = current_sentence.trim().to_string();
347                if !trimmed.is_empty() {
348                    sentences.push(trimmed);
349                }
350                current_sentence.clear();
351            } else {
352                current_sentence.push(ch);
353            }
354        }
355
356        // Add any remaining text as a sentence
357        let trimmed = current_sentence.trim().to_string();
358        if !trimmed.is_empty() {
359            sentences.push(trimmed);
360        }
361
362        sentences
363    }
364
365    /// Count words in text
366    pub fn word_count(&self, text: &str) -> usize {
367        text.split_whitespace().count()
368    }
369
370    /// Process multiple documents in parallel
371    pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
372        #[cfg(feature = "parallel-processing")]
373        {
374            if let Some(processor) = &self.parallel_processor {
375                if processor.should_use_parallel(documents.len()) {
376                    use rayon::prelude::*;
377                    let results: Result<Vec<Vec<TextChunk>>> = documents
378                        .par_iter()
379                        .map(|doc| self.chunk_text(doc))
380                        .collect();
381                    return results;
382                }
383            }
384        }
385
386        // Sequential fallback
387        documents
388            .iter()
389            .map(|doc| self.chunk_text(doc))
390            .collect()
391    }
392
393    /// Parallel extraction of keywords from multiple texts
394    pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
395        #[cfg(feature = "parallel-processing")]
396        {
397            if let Some(processor) = &self.parallel_processor {
398                if processor.should_use_parallel(texts.len()) {
399                    use rayon::prelude::*;
400                    return texts
401                        .par_iter()
402                        .map(|&text| self.extract_keywords(text, max_keywords))
403                        .collect();
404                }
405            }
406        }
407
408        // Sequential fallback
409        texts
410            .iter()
411            .map(|&text| self.extract_keywords(text, max_keywords))
412            .collect()
413    }
414
415    /// Parallel sentence extraction from multiple texts
416    pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
417        #[cfg(feature = "parallel-processing")]
418        {
419            if let Some(processor) = &self.parallel_processor {
420                if processor.should_use_parallel(texts.len()) {
421                    use rayon::prelude::*;
422                    return texts
423                        .par_iter()
424                        .map(|&text| self.extract_sentences(text))
425                        .collect();
426                }
427            }
428        }
429
430        // Sequential fallback
431        texts
432            .iter()
433            .map(|&text| self.extract_sentences(text))
434            .collect()
435    }
436
437    /// Parallel text cleaning for multiple texts
438    pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
439        #[cfg(feature = "parallel-processing")]
440        {
441            if let Some(processor) = &self.parallel_processor {
442                if processor.should_use_parallel(texts.len()) {
443                    use rayon::prelude::*;
444                    return texts
445                        .par_iter()
446                        .map(|&text| self.clean_text(text))
447                        .collect();
448                }
449            }
450        }
451
452        // Sequential fallback
453        texts.iter().map(|&text| self.clean_text(text)).collect()
454    }
455
456    /// Extract keywords using simple frequency analysis
457    pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
458        use std::collections::HashMap;
459
460        let words: Vec<String> = text
461            .split_whitespace()
462            .map(|w| w.to_lowercase())
463            .filter(|w| w.len() > 3) // Filter out short words
464            .filter(|w| !self.is_stop_word(w))
465            .collect();
466
467        let mut word_counts = HashMap::new();
468        for word in words {
469            *word_counts.entry(word).or_insert(0) += 1;
470        }
471
472        let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
473        sorted_words.sort_by(|a, b| b.1.cmp(&a.1));
474
475        sorted_words
476            .into_iter()
477            .take(max_keywords)
478            .map(|(word, _)| word)
479            .collect()
480    }
481
482    /// Simple stop word detection (English)
483    fn is_stop_word(&self, word: &str) -> bool {
484        const STOP_WORDS: &[&str] = &[
485            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
486            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
487            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
488            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
489            "go", "me",
490        ];
491        STOP_WORDS.contains(&word)
492    }
493
494    /// Get performance statistics
495    #[cfg(feature = "parallel-processing")]
496    pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
497        let stats = self.performance_monitor.get_stats();
498        (
499            stats.tasks_processed,
500            std::time::Duration::from_millis(stats.total_time_ms),
501        )
502    }
503
504    /// Get average processing time per operation
505    #[cfg(feature = "parallel-processing")]
506    pub fn average_processing_time(&self) -> std::time::Duration {
507        let avg_ms = self.performance_monitor.average_duration();
508        std::time::Duration::from_millis(avg_ms as u64)
509    }
510
    /// Reset performance monitoring statistics
    ///
    /// Clears the monitor's accumulated state so a fresh measurement window
    /// can begin; takes `&mut self` because the monitor is mutated in place.
    #[cfg(feature = "parallel-processing")]
    pub fn reset_performance_stats(&mut self) {
        self.performance_monitor.reset();
    }
516
517    /// Get parallel processing statistics if available
518    #[cfg(feature = "parallel-processing")]
519    pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
520        self.parallel_processor.as_ref().map(|p| p.get_statistics())
521    }
522}
523
/// Language detection utilities
pub struct LanguageDetector;

impl LanguageDetector {
    /// Guess the language of `text` from accented-character patterns.
    ///
    /// Extremely rough heuristic: checks for Spanish, then Portuguese, then
    /// French diacritics, defaulting to English. Accents shared between
    /// languages resolve to whichever check runs first — use a proper
    /// language-detection library for anything serious.
    pub fn detect_language(text: &str) -> String {
        let has_any = |marks: &[char]| text.chars().any(|c| marks.contains(&c));

        let code = if has_any(&['ñ', 'ó', 'é', 'í', 'á', 'ú']) {
            "es"
        } else if has_any(&['ç', 'ã', 'õ']) {
            "pt"
        } else if has_any(&['à', 'è', 'ù', 'ò']) {
            "fr"
        } else {
            // Default to English
            "en"
        };
        code.to_string()
    }
}
546
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    // Legacy chunking: output is non-empty and the first chunk respects the
    // configured 100-byte size limit.
    #[test]
    fn test_text_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "Test Document".to_string(),
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.".to_string(),
        );

        let chunks = processor.chunk_text(&document).unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks[0].content.len() <= 100);
    }

    // Frequency-based keyword extraction honours the `max_keywords` cap.
    #[test]
    fn test_keyword_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "machine learning artificial intelligence data science computer vision natural language processing";
        let keywords = processor.extract_keywords(text, 3);

        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    // Sentence splitting on '.', '!' and '?' drops the terminators.
    #[test]
    fn test_sentence_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "First sentence. Second sentence! Third sentence?";
        let sentences = processor.extract_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    // End-to-end chunk + enrich with an auto-detected (markdown) layout:
    // at least one chunk must carry chapter or keyword metadata.
    #[test]
    fn test_enriched_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.".to_string(),
        );

        let chunks = processor.chunk_and_enrich(&document).unwrap();

        assert!(!chunks.is_empty());
        // At least some chunks should have enriched metadata
        let has_metadata = chunks.iter().any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    // Enrichment with an explicitly supplied markdown parser instead of the
    // auto-detected default.
    #[test]
    fn test_custom_enricher() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Test Chapter\n\nContent about machine learning here.".to_string(),
        );

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = processor.chunk_text_with_enrichment(&document, &mut enricher).unwrap();

        assert!(!chunks.is_empty());
        // Verify metadata is present
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}