1pub mod analysis;
8pub mod boundary_detection;
10pub mod chunk_enricher;
12pub mod chunking;
14pub mod chunking_strategies;
16pub mod contextual_enricher;
18pub mod document_structure;
20pub mod extractive_summarizer;
22pub mod keyword_extraction;
24pub mod late_chunking;
26pub mod layout_parser;
28pub mod parsers;
30pub mod semantic_chunking;
32pub mod semantic_coherence;
34
35pub use analysis::{TextAnalyzer, TextStats};
36pub use boundary_detection::{Boundary, BoundaryDetectionConfig, BoundaryDetector, BoundaryType};
37pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
38pub use chunking_strategies::{
39 BoundaryAwareChunkingStrategy, HierarchicalChunkingStrategy, SemanticChunkingStrategy,
40};
41pub use contextual_enricher::{ContextualEnricher, ContextualEnricherConfig};
42pub use document_structure::{
43 DocumentStructure, Heading, HeadingHierarchy, Section, SectionNumber, SectionNumberFormat,
44 StructureStatistics,
45};
46pub use extractive_summarizer::ExtractiveSummarizer;
47pub use keyword_extraction::TfIdfKeywordExtractor;
48pub use late_chunking::{JinaLateChunkingClient, LateChunkingConfig, LateChunkingStrategy};
49pub use layout_parser::{LayoutParser, LayoutParserFactory};
50pub use semantic_chunking::{
51 BreakpointStrategy, SemanticChunk, SemanticChunker, SemanticChunkerConfig,
52};
53pub use semantic_coherence::{CoherenceConfig, OptimalSplit, ScoredChunk, SemanticCoherenceScorer};
54
55#[cfg(feature = "code-chunking")]
56pub use chunking_strategies::RustCodeChunkingStrategy;
57
58#[cfg(feature = "parallel-processing")]
59use crate::parallel::{ParallelProcessor, PerformanceMonitor};
60use crate::{
61 core::{ChunkId, ChunkingStrategy, Document, TextChunk},
62 Result,
63};
64use chunking::HierarchicalChunker;
65
/// Splits documents into [`TextChunk`]s and provides small text utilities
/// (whitespace cleaning, sentence splitting, keyword extraction).
#[derive(Debug)]
pub struct TextProcessor {
    /// Size of the window considered for each chunk; `chunk_text` uses it as a
    /// byte count when computing the preferred end of a chunk.
    chunk_size: usize,
    /// How far the next chunk's start is moved back from the previous chunk's
    /// end; `chunk_text` subtracts it as a byte count.
    chunk_overlap: usize,
    /// Optional parallel backend consulted by the `batch_*` methods.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Monitor read by the performance-statistics accessors.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
76
77impl TextProcessor {
78 pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
80 Ok(Self {
81 chunk_size,
82 chunk_overlap,
83 #[cfg(feature = "parallel-processing")]
84 parallel_processor: None,
85 #[cfg(feature = "parallel-processing")]
86 performance_monitor: PerformanceMonitor::new(),
87 })
88 }
89
/// Creates a processor that can hand batch operations to `parallel_processor`.
///
/// Like [`TextProcessor::new`], the sizes are stored without validation, so
/// this currently always returns `Ok`.
#[cfg(feature = "parallel-processing")]
pub fn with_parallel_processing(
    chunk_size: usize,
    chunk_overlap: usize,
    parallel_processor: ParallelProcessor,
) -> Result<Self> {
    let parallel_processor = Some(parallel_processor);
    let performance_monitor = PerformanceMonitor::new();

    Ok(Self {
        chunk_size,
        chunk_overlap,
        parallel_processor,
        performance_monitor,
    })
}
104
105 pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
107 let chunker = HierarchicalChunker::new().with_min_size(50);
108 let chunks_text =
109 chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
110
111 let mut chunks = Vec::new();
112 let mut chunk_counter = 0;
113 let mut current_pos = 0;
114
115 for chunk_content in chunks_text {
116 if !chunk_content.trim().is_empty() {
117 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
118 let chunk_start = current_pos;
119 let chunk_end = chunk_start + chunk_content.len();
120
121 current_pos += chunk_content.len();
122
123 let chunk = TextChunk::new(
124 chunk_id,
125 document.id.clone(),
126 chunk_content,
127 chunk_start,
128 chunk_end,
129 );
130 chunks.push(chunk);
131 chunk_counter += 1;
132 } else {
133 current_pos += chunk_content.len();
134 }
135 }
136
137 Ok(chunks)
138 }
139
140 pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
142 let text = &document.content;
143 let mut chunks = Vec::new();
144 let mut start = 0;
145 let mut chunk_counter = 0;
146
147 while start < text.len() {
148 let end = std::cmp::min(start + self.chunk_size, text.len());
149
150 let actual_end = if end < text.len() {
152 self.find_sentence_boundary(text, start, end)
153 .unwrap_or_else(|| self.find_char_boundary(text, end))
154 } else {
155 end
156 };
157
158 let chunk_content = text[start..actual_end].to_string();
159
160 if !chunk_content.trim().is_empty() {
161 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
162 let chunk = TextChunk::new(
163 chunk_id,
164 document.id.clone(),
165 chunk_content,
166 start,
167 actual_end,
168 );
169 chunks.push(chunk);
170 chunk_counter += 1;
171 }
172
173 let next_start = if actual_end >= text.len() {
175 break;
176 } else {
177 let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
178 let safe_overlap = self.find_char_boundary(text, overlap_start);
179 std::cmp::max(start + 1, safe_overlap)
180 };
181
182 start = next_start;
183 }
184
185 Ok(chunks)
186 }
187
188 pub fn chunk_text_with_enrichment(
190 &self,
191 document: &Document,
192 enricher: &mut ChunkEnricher,
193 ) -> Result<Vec<TextChunk>> {
194 let mut chunks = self.chunk_text(document)?;
196
197 enricher.enrich_chunks(&mut chunks, document)?;
199
200 Ok(chunks)
201 }
202
203 pub fn chunk_text_hierarchical_with_enrichment(
205 &self,
206 document: &Document,
207 enricher: &mut ChunkEnricher,
208 ) -> Result<Vec<TextChunk>> {
209 let mut chunks = self.chunk_text_hierarchical(document)?;
211
212 enricher.enrich_chunks(&mut chunks, document)?;
214
215 Ok(chunks)
216 }
217
218 pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
220 let parser = LayoutParserFactory::create_for_document(document);
221 ChunkEnricher::new_default(parser)
222 }
223
224 pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
226 let mut enricher = Self::create_default_enricher(document);
227 self.chunk_text_with_enrichment(document, &mut enricher)
228 }
229
230 pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
232 let mut enricher = Self::create_default_enricher(document);
233 self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
234 }
235
236 pub fn chunk_with_strategy(
258 &self,
259 document: &Document,
260 strategy: &dyn ChunkingStrategy,
261 ) -> Result<Vec<TextChunk>> {
262 let chunks = strategy.chunk(&document.content);
263 Ok(chunks)
264 }
265
266 fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
268 pos = pos.min(text.len());
269 while pos > 0 && !text.is_char_boundary(pos) {
270 pos -= 1;
271 }
272 pos
273 }
274
275 fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
277 pos = pos.min(text.len());
278 while pos > 0 && !text.is_char_boundary(pos) {
279 pos -= 1;
280 }
281 pos
282 }
283
284 fn find_sentence_boundary(
286 &self,
287 text: &str,
288 start: usize,
289 preferred_end: usize,
290 ) -> Option<usize> {
291 let safe_start = self.find_char_boundary(text, start);
293 let safe_end = self.find_char_boundary(text, preferred_end);
294
295 if safe_start >= safe_end {
296 return None;
297 }
298
299 let search_window = &text[safe_start..safe_end];
300
301 let search_start = search_window.len().saturating_sub(200);
303 let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
305 let search_text = &search_window[safe_search_start..];
306
307 let sentence_endings = ['.', '!', '?'];
309 let mut last_boundary = None;
310
311 for (i, ch) in search_text.char_indices() {
312 if sentence_endings.contains(&ch) {
313 let next_pos = i + ch.len_utf8();
315 if next_pos >= search_text.len()
316 || search_text
317 .chars()
318 .nth(next_pos)
319 .map_or(true, |c| c.is_whitespace())
320 {
321 last_boundary = Some(safe_start + safe_search_start + next_pos);
322 }
323 }
324 }
325
326 last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
327 }
328
329 fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
331 if start >= preferred_end {
333 return None;
334 }
335
336 let search_window = &text[start..preferred_end];
337
338 let search_start = search_window.len().saturating_sub(50);
340 let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
341 let search_text = &search_window[safe_search_start..];
342
343 search_text
344 .rfind(' ')
345 .map(|pos| start + safe_search_start + pos)
346 }
347
348 pub fn clean_text(&self, text: &str) -> String {
350 text
351 .split_whitespace()
353 .collect::<Vec<_>>()
354 .join(" ")
355 .chars()
357 .collect::<String>()
358 }
359
360 pub fn extract_sentences(&self, text: &str) -> Vec<String> {
362 let sentence_endings = ['.', '!', '?'];
363 let mut sentences = Vec::new();
364 let mut current_sentence = String::new();
365
366 for ch in text.chars() {
367 if sentence_endings.contains(&ch) {
368 let trimmed = current_sentence.trim().to_string();
369 if !trimmed.is_empty() {
370 sentences.push(trimmed);
371 }
372 current_sentence.clear();
373 } else {
374 current_sentence.push(ch);
375 }
376 }
377
378 let trimmed = current_sentence.trim().to_string();
380 if !trimmed.is_empty() {
381 sentences.push(trimmed);
382 }
383
384 sentences
385 }
386
387 pub fn word_count(&self, text: &str) -> usize {
389 text.split_whitespace().count()
390 }
391
392 pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
394 #[cfg(feature = "parallel-processing")]
395 {
396 if let Some(processor) = &self.parallel_processor {
397 if processor.should_use_parallel(documents.len()) {
398 use rayon::prelude::*;
399 let results: Result<Vec<Vec<TextChunk>>> = documents
400 .par_iter()
401 .map(|doc| self.chunk_text(doc))
402 .collect();
403 return results;
404 }
405 }
406 }
407
408 documents.iter().map(|doc| self.chunk_text(doc)).collect()
410 }
411
412 pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
414 #[cfg(feature = "parallel-processing")]
415 {
416 if let Some(processor) = &self.parallel_processor {
417 if processor.should_use_parallel(texts.len()) {
418 use rayon::prelude::*;
419 return texts
420 .par_iter()
421 .map(|&text| self.extract_keywords(text, max_keywords))
422 .collect();
423 }
424 }
425 }
426
427 texts
429 .iter()
430 .map(|&text| self.extract_keywords(text, max_keywords))
431 .collect()
432 }
433
434 pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
436 #[cfg(feature = "parallel-processing")]
437 {
438 if let Some(processor) = &self.parallel_processor {
439 if processor.should_use_parallel(texts.len()) {
440 use rayon::prelude::*;
441 return texts
442 .par_iter()
443 .map(|&text| self.extract_sentences(text))
444 .collect();
445 }
446 }
447 }
448
449 texts
451 .iter()
452 .map(|&text| self.extract_sentences(text))
453 .collect()
454 }
455
456 pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
458 #[cfg(feature = "parallel-processing")]
459 {
460 if let Some(processor) = &self.parallel_processor {
461 if processor.should_use_parallel(texts.len()) {
462 use rayon::prelude::*;
463 return texts
464 .par_iter()
465 .map(|&text| self.clean_text(text))
466 .collect();
467 }
468 }
469 }
470
471 texts.iter().map(|&text| self.clean_text(text)).collect()
473 }
474
475 pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
477 use std::collections::HashMap;
478
479 let words: Vec<String> = text
480 .split_whitespace()
481 .map(|w| w.to_lowercase())
482 .filter(|w| w.len() > 3) .filter(|w| !self.is_stop_word(w))
484 .collect();
485
486 let mut word_counts = HashMap::new();
487 for word in words {
488 *word_counts.entry(word).or_insert(0) += 1;
489 }
490
491 let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
492 sorted_words.sort_by_key(|item| std::cmp::Reverse(item.1));
493
494 sorted_words
495 .into_iter()
496 .take(max_keywords)
497 .map(|(word, _)| word)
498 .collect()
499 }
500
501 fn is_stop_word(&self, word: &str) -> bool {
503 const STOP_WORDS: &[&str] = &[
504 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
505 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
506 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
507 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
508 "go", "me",
509 ];
510 STOP_WORDS.contains(&word)
511 }
512
/// Returns the number of tasks processed and the total processing time, as
/// reported by the performance monitor's `get_stats`.
#[cfg(feature = "parallel-processing")]
pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
    use std::time::Duration;

    let stats = self.performance_monitor.get_stats();
    let tasks = stats.tasks_processed;
    let total = Duration::from_millis(stats.total_time_ms);

    (tasks, total)
}
522
/// Returns the performance monitor's average duration as a `Duration`.
///
/// The monitor's value is treated as milliseconds and converted with an
/// `as u64` cast, so any fractional part is discarded.
#[cfg(feature = "parallel-processing")]
pub fn average_processing_time(&self) -> std::time::Duration {
    let whole_millis = self.performance_monitor.average_duration() as u64;
    std::time::Duration::from_millis(whole_millis)
}
529
/// Resets the statistics held by the performance monitor by calling its
/// `reset` method.
#[cfg(feature = "parallel-processing")]
pub fn reset_performance_stats(&mut self) {
    self.performance_monitor.reset();
}
535
/// Returns the parallel processor's statistics, or `None` when no parallel
/// processor is configured.
#[cfg(feature = "parallel-processing")]
pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
    let processor = self.parallel_processor.as_ref()?;
    Some(processor.get_statistics())
}
541}
542
/// Very small character-based language guesser.
pub struct LanguageDetector;

impl LanguageDetector {
    /// Guesses a language code for `text` from the accented characters it contains.
    ///
    /// The groups are checked in a fixed order and the first one with any
    /// matching character wins:
    ///
    /// 1. `ñ ó é í á ú` → `"es"`
    /// 2. `ç ã õ` → `"pt"`
    /// 3. `à è ù ò` → `"fr"`
    ///
    /// Text with none of these characters (including empty text) yields
    /// `"en"`. Only the lower-case forms listed above are recognized.
    pub fn detect_language(text: &str) -> String {
        // Ordered by priority: an earlier entry shadows later ones.
        const MARKERS: [(&str, &[char]); 3] = [
            ("es", &['ñ', 'ó', 'é', 'í', 'á', 'ú']),
            ("pt", &['ç', 'ã', 'õ']),
            ("fr", &['à', 'è', 'ù', 'ò']),
        ];

        MARKERS
            .iter()
            .find(|(_, marks)| text.chars().any(|c| marks.contains(&c)))
            .map_or("en", |&(code, _)| code)
            .to_string()
    }
}
565
// Unit tests for `TextProcessor`: plain chunking, keyword and sentence
// extraction, and chunking combined with metadata enrichment.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    // Chunking a short multi-sentence document yields at least one chunk and
    // the first chunk respects the configured size of 100 bytes.
    #[test]
    fn test_text_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "Test Document".to_string(),
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.".to_string(),
        );

        let chunks = processor.chunk_text(&document).unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks[0].content.len() <= 100);
    }

    // Keyword extraction returns something and honours the `max_keywords` cap.
    #[test]
    fn test_keyword_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "machine learning artificial intelligence data science computer vision natural language processing";
        let keywords = processor.extract_keywords(text, 3);

        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    // Sentences are split on `.`, `!` and `?`, with the punctuation removed.
    #[test]
    fn test_sentence_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "First sentence. Second sentence! Third sentence?";
        let sentences = processor.extract_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    // The default enricher should attach chapter or keyword metadata to at
    // least one chunk of a small Markdown document.
    #[test]
    fn test_enriched_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.".to_string(),
        );

        let chunks = processor.chunk_and_enrich(&document).unwrap();

        assert!(!chunks.is_empty());
        let has_metadata = chunks
            .iter()
            .any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    // A caller-constructed enricher (Markdown parser) can be used in place of
    // the default one and still produces keyword metadata.
    #[test]
    fn test_custom_enricher() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Test Chapter\n\nContent about machine learning here.".to_string(),
        );

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = processor
            .chunk_text_with_enrichment(&document, &mut enricher)
            .unwrap();

        assert!(!chunks.is_empty());
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}