1pub mod analysis;
3pub mod boundary_detection;
5pub mod chunk_enricher;
7pub mod chunking;
9pub mod chunking_strategies;
11pub mod contextual_enricher;
13pub mod document_structure;
15pub mod extractive_summarizer;
17pub mod keyword_extraction;
19pub mod late_chunking;
21pub mod layout_parser;
23pub mod parsers;
25pub mod semantic_chunking;
27pub mod semantic_coherence;
29
30pub use analysis::{TextAnalyzer, TextStats};
31pub use boundary_detection::{Boundary, BoundaryDetectionConfig, BoundaryDetector, BoundaryType};
32pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
33pub use chunking_strategies::{
34 BoundaryAwareChunkingStrategy, HierarchicalChunkingStrategy, SemanticChunkingStrategy,
35};
36pub use contextual_enricher::{ContextualEnricher, ContextualEnricherConfig};
37pub use document_structure::{
38 DocumentStructure, Heading, HeadingHierarchy, Section, SectionNumber, SectionNumberFormat,
39 StructureStatistics,
40};
41pub use extractive_summarizer::ExtractiveSummarizer;
42pub use keyword_extraction::TfIdfKeywordExtractor;
43pub use late_chunking::{JinaLateChunkingClient, LateChunkingConfig, LateChunkingStrategy};
44pub use layout_parser::{LayoutParser, LayoutParserFactory};
45pub use semantic_chunking::{
46 BreakpointStrategy, SemanticChunk, SemanticChunker, SemanticChunkerConfig,
47};
48pub use semantic_coherence::{CoherenceConfig, OptimalSplit, ScoredChunk, SemanticCoherenceScorer};
49
50#[cfg(feature = "code-chunking")]
51pub use chunking_strategies::RustCodeChunkingStrategy;
52
53#[cfg(feature = "parallel-processing")]
54use crate::parallel::{ParallelProcessor, PerformanceMonitor};
55use crate::{
56 core::{ChunkId, ChunkingStrategy, Document, TextChunk},
57 Result,
58};
59use chunking::HierarchicalChunker;
60
/// Splits documents into (optionally overlapping) text chunks and provides
/// assorted text utilities: keyword extraction, sentence splitting, cleaning,
/// and batch variants that can run in parallel when the feature is enabled.
#[derive(Debug)]
pub struct TextProcessor {
    // Target maximum chunk length, in bytes of the source text.
    chunk_size: usize,
    // Number of bytes shared between the tail of one chunk and the head of the next.
    chunk_overlap: usize,
    // Optional parallel executor; `None` means batch methods always run sequentially.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    // Accumulates timing statistics for processed tasks.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
71
72impl TextProcessor {
    /// Creates a sequential processor with the given chunk size and overlap
    /// (both in bytes).
    ///
    /// Currently always succeeds; the `Result` return keeps the signature
    /// stable should parameter validation be added later.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
        Ok(Self {
            chunk_size,
            chunk_overlap,
            #[cfg(feature = "parallel-processing")]
            parallel_processor: None,
            #[cfg(feature = "parallel-processing")]
            performance_monitor: PerformanceMonitor::new(),
        })
    }
84
    /// Creates a processor that may fan batch work out through
    /// `parallel_processor`.
    ///
    /// The batch methods consult the processor's `should_use_parallel`
    /// heuristic per call before leaving the sequential path.
    #[cfg(feature = "parallel-processing")]
    pub fn with_parallel_processing(
        chunk_size: usize,
        chunk_overlap: usize,
        parallel_processor: ParallelProcessor,
    ) -> Result<Self> {
        Ok(Self {
            chunk_size,
            chunk_overlap,
            parallel_processor: Some(parallel_processor),
            performance_monitor: PerformanceMonitor::new(),
        })
    }
99
100 pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
102 let chunker = HierarchicalChunker::new().with_min_size(50);
103 let chunks_text =
104 chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
105
106 let mut chunks = Vec::new();
107 let mut chunk_counter = 0;
108 let mut current_pos = 0;
109
110 for chunk_content in chunks_text {
111 if !chunk_content.trim().is_empty() {
112 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
113 let chunk_start = current_pos;
114 let chunk_end = chunk_start + chunk_content.len();
115
116 current_pos += chunk_content.len();
117
118 let chunk = TextChunk::new(
119 chunk_id,
120 document.id.clone(),
121 chunk_content,
122 chunk_start,
123 chunk_end,
124 );
125 chunks.push(chunk);
126 chunk_counter += 1;
127 } else {
128 current_pos += chunk_content.len();
129 }
130 }
131
132 Ok(chunks)
133 }
134
135 pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
137 let text = &document.content;
138 let mut chunks = Vec::new();
139 let mut start = 0;
140 let mut chunk_counter = 0;
141
142 while start < text.len() {
143 let end = std::cmp::min(start + self.chunk_size, text.len());
144
145 let actual_end = if end < text.len() {
147 self.find_sentence_boundary(text, start, end)
148 .unwrap_or_else(|| self.find_char_boundary(text, end))
149 } else {
150 end
151 };
152
153 let chunk_content = text[start..actual_end].to_string();
154
155 if !chunk_content.trim().is_empty() {
156 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
157 let chunk = TextChunk::new(
158 chunk_id,
159 document.id.clone(),
160 chunk_content,
161 start,
162 actual_end,
163 );
164 chunks.push(chunk);
165 chunk_counter += 1;
166 }
167
168 let next_start = if actual_end >= text.len() {
170 break;
171 } else {
172 let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
173 let safe_overlap = self.find_char_boundary(text, overlap_start);
174 std::cmp::max(start + 1, safe_overlap)
175 };
176
177 start = next_start;
178 }
179
180 Ok(chunks)
181 }
182
183 pub fn chunk_text_with_enrichment(
185 &self,
186 document: &Document,
187 enricher: &mut ChunkEnricher,
188 ) -> Result<Vec<TextChunk>> {
189 let mut chunks = self.chunk_text(document)?;
191
192 enricher.enrich_chunks(&mut chunks, document)?;
194
195 Ok(chunks)
196 }
197
198 pub fn chunk_text_hierarchical_with_enrichment(
200 &self,
201 document: &Document,
202 enricher: &mut ChunkEnricher,
203 ) -> Result<Vec<TextChunk>> {
204 let mut chunks = self.chunk_text_hierarchical(document)?;
206
207 enricher.enrich_chunks(&mut chunks, document)?;
209
210 Ok(chunks)
211 }
212
213 pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
215 let parser = LayoutParserFactory::create_for_document(document);
216 ChunkEnricher::new_default(parser)
217 }
218
219 pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
221 let mut enricher = Self::create_default_enricher(document);
222 self.chunk_text_with_enrichment(document, &mut enricher)
223 }
224
225 pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
227 let mut enricher = Self::create_default_enricher(document);
228 self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
229 }
230
231 pub fn chunk_with_strategy(
253 &self,
254 document: &Document,
255 strategy: &dyn ChunkingStrategy,
256 ) -> Result<Vec<TextChunk>> {
257 let chunks = strategy.chunk(&document.content);
258 Ok(chunks)
259 }
260
261 fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
263 pos = pos.min(text.len());
264 while pos > 0 && !text.is_char_boundary(pos) {
265 pos -= 1;
266 }
267 pos
268 }
269
270 fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
272 pos = pos.min(text.len());
273 while pos > 0 && !text.is_char_boundary(pos) {
274 pos -= 1;
275 }
276 pos
277 }
278
279 fn find_sentence_boundary(
281 &self,
282 text: &str,
283 start: usize,
284 preferred_end: usize,
285 ) -> Option<usize> {
286 let safe_start = self.find_char_boundary(text, start);
288 let safe_end = self.find_char_boundary(text, preferred_end);
289
290 if safe_start >= safe_end {
291 return None;
292 }
293
294 let search_window = &text[safe_start..safe_end];
295
296 let search_start = search_window.len().saturating_sub(200);
298 let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
300 let search_text = &search_window[safe_search_start..];
301
302 let sentence_endings = ['.', '!', '?'];
304 let mut last_boundary = None;
305
306 for (i, ch) in search_text.char_indices() {
307 if sentence_endings.contains(&ch) {
308 let next_pos = i + ch.len_utf8();
310 if next_pos >= search_text.len()
311 || search_text
312 .chars()
313 .nth(next_pos)
314 .map_or(true, |c| c.is_whitespace())
315 {
316 last_boundary = Some(safe_start + safe_search_start + next_pos);
317 }
318 }
319 }
320
321 last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
322 }
323
324 fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
326 if start >= preferred_end {
328 return None;
329 }
330
331 let search_window = &text[start..preferred_end];
332
333 let search_start = search_window.len().saturating_sub(50);
335 let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
336 let search_text = &search_window[safe_search_start..];
337
338 search_text
339 .rfind(' ')
340 .map(|pos| start + safe_search_start + pos)
341 }
342
343 pub fn clean_text(&self, text: &str) -> String {
345 text
346 .split_whitespace()
348 .collect::<Vec<_>>()
349 .join(" ")
350 .chars()
352 .collect::<String>()
353 }
354
355 pub fn extract_sentences(&self, text: &str) -> Vec<String> {
357 let sentence_endings = ['.', '!', '?'];
358 let mut sentences = Vec::new();
359 let mut current_sentence = String::new();
360
361 for ch in text.chars() {
362 if sentence_endings.contains(&ch) {
363 let trimmed = current_sentence.trim().to_string();
364 if !trimmed.is_empty() {
365 sentences.push(trimmed);
366 }
367 current_sentence.clear();
368 } else {
369 current_sentence.push(ch);
370 }
371 }
372
373 let trimmed = current_sentence.trim().to_string();
375 if !trimmed.is_empty() {
376 sentences.push(trimmed);
377 }
378
379 sentences
380 }
381
382 pub fn word_count(&self, text: &str) -> usize {
384 text.split_whitespace().count()
385 }
386
    /// Chunks many documents, in parallel when a parallel processor is
    /// configured and its heuristic deems the batch large enough; otherwise
    /// sequentially. Stops at the first error in either path.
    pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(documents.len()) {
                    use rayon::prelude::*;
                    // Collecting into `Result` short-circuits on the first Err.
                    let results: Result<Vec<Vec<TextChunk>>> = documents
                        .par_iter()
                        .map(|doc| self.chunk_text(doc))
                        .collect();
                    return results;
                }
            }
        }

        // Sequential fallback (also the only path when the feature is off).
        documents.iter().map(|doc| self.chunk_text(doc)).collect()
    }
406
    /// Extracts up to `max_keywords` keywords from each text, in parallel
    /// when the configured processor's heuristic approves the batch size.
    pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_keywords(text, max_keywords))
                        .collect();
                }
            }
        }

        // Sequential fallback (also the only path when the feature is off).
        texts
            .iter()
            .map(|&text| self.extract_keywords(text, max_keywords))
            .collect()
    }
428
    /// Splits each text into sentences, in parallel when the configured
    /// processor's heuristic approves the batch size.
    pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_sentences(text))
                        .collect();
                }
            }
        }

        // Sequential fallback (also the only path when the feature is off).
        texts
            .iter()
            .map(|&text| self.extract_sentences(text))
            .collect()
    }
450
    /// Whitespace-normalizes each text, in parallel when the configured
    /// processor's heuristic approves the batch size.
    pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.clean_text(text))
                        .collect();
                }
            }
        }

        // Sequential fallback (also the only path when the feature is off).
        texts.iter().map(|&text| self.clean_text(text)).collect()
    }
469
470 pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
472 use std::collections::HashMap;
473
474 let words: Vec<String> = text
475 .split_whitespace()
476 .map(|w| w.to_lowercase())
477 .filter(|w| w.len() > 3) .filter(|w| !self.is_stop_word(w))
479 .collect();
480
481 let mut word_counts = HashMap::new();
482 for word in words {
483 *word_counts.entry(word).or_insert(0) += 1;
484 }
485
486 let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
487 sorted_words.sort_by(|a, b| b.1.cmp(&a.1));
488
489 sorted_words
490 .into_iter()
491 .take(max_keywords)
492 .map(|(word, _)| word)
493 .collect()
494 }
495
496 fn is_stop_word(&self, word: &str) -> bool {
498 const STOP_WORDS: &[&str] = &[
499 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
500 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
501 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
502 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
503 "go", "me",
504 ];
505 STOP_WORDS.contains(&word)
506 }
507
    /// Returns (tasks processed, total wall time) accumulated by the
    /// performance monitor.
    #[cfg(feature = "parallel-processing")]
    pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
        let stats = self.performance_monitor.get_stats();
        (
            stats.tasks_processed,
            // total_time_ms is interpreted as milliseconds here.
            std::time::Duration::from_millis(stats.total_time_ms),
        )
    }

    /// Mean per-task duration reported by the monitor.
    #[cfg(feature = "parallel-processing")]
    pub fn average_processing_time(&self) -> std::time::Duration {
        let avg_ms = self.performance_monitor.average_duration();
        // NOTE(review): assumes average_duration() returns milliseconds; the
        // `as u64` cast truncates any fractional part — confirm against
        // PerformanceMonitor.
        std::time::Duration::from_millis(avg_ms as u64)
    }

    /// Clears the accumulated performance counters.
    #[cfg(feature = "parallel-processing")]
    pub fn reset_performance_stats(&mut self) {
        self.performance_monitor.reset();
    }

    /// Statistics from the parallel processor, or `None` when the processor
    /// was never configured.
    #[cfg(feature = "parallel-processing")]
    pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
        self.parallel_processor.as_ref().map(|p| p.get_statistics())
    }
536}
537
/// Naive language detector that keys off a handful of accented characters.
/// Checks Spanish, then Portuguese, then French markers; everything else is
/// reported as English. (Note: 'é' is claimed by Spanish even though it also
/// appears in French — this ordering is intentional and preserved.)
pub struct LanguageDetector;

impl LanguageDetector {
    /// Returns an ISO 639-1 code: "es", "pt", "fr", or "en" as the fallback.
    pub fn detect_language(text: &str) -> String {
        const SPANISH_MARKERS: &[char] = &['ñ', 'ó', 'é', 'í', 'á', 'ú'];
        const PORTUGUESE_MARKERS: &[char] = &['ç', 'ã', 'õ'];
        const FRENCH_MARKERS: &[char] = &['à', 'è', 'ù', 'ò'];

        let code = if text.contains(SPANISH_MARKERS) {
            "es"
        } else if text.contains(PORTUGUESE_MARKERS) {
            "pt"
        } else if text.contains(FRENCH_MARKERS) {
            "fr"
        } else {
            "en"
        };
        code.to_string()
    }
}
560
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    // Flat chunking produces non-empty chunks that respect the size limit.
    #[test]
    fn test_text_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "Test Document".to_string(),
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.".to_string(),
        );

        let chunks = processor.chunk_text(&document).unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks[0].content.len() <= 100);
    }

    // Keyword extraction honors the max_keywords cap.
    #[test]
    fn test_keyword_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "machine learning artificial intelligence data science computer vision natural language processing";
        let keywords = processor.extract_keywords(text, 3);

        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    // Sentences are split on '.', '!' and '?' with terminators stripped.
    #[test]
    fn test_sentence_extraction() {
        let processor = TextProcessor::new(1000, 100).unwrap();
        let text = "First sentence. Second sentence! Third sentence?";
        let sentences = processor.extract_sentences(text);

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    // The default enrichment pipeline attaches chapter or keyword metadata.
    #[test]
    fn test_enriched_chunking() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.".to_string(),
        );

        let chunks = processor.chunk_and_enrich(&document).unwrap();

        assert!(!chunks.is_empty());
        let has_metadata = chunks
            .iter()
            .any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    // A caller-supplied enricher (markdown parser) also populates metadata.
    #[test]
    fn test_custom_enricher() {
        let processor = TextProcessor::new(100, 20).unwrap();
        let document = Document::new(
            DocumentId::new("test".to_string()),
            "test.md".to_string(),
            "# Test Chapter\n\nContent about machine learning here.".to_string(),
        );

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = processor
            .chunk_text_with_enrichment(&document, &mut enricher)
            .unwrap();

        assert!(!chunks.is_empty());
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}