1pub mod chunking;
3pub mod semantic_chunking;
5pub mod document_structure;
7pub mod analysis;
9pub mod keyword_extraction;
11pub mod extractive_summarizer;
13pub mod layout_parser;
15pub mod parsers;
17pub mod chunk_enricher;
19pub mod chunking_strategies;
21
22pub use semantic_chunking::{
23 SemanticChunk, SemanticChunker, SemanticChunkerConfig, BreakpointStrategy,
24};
25pub use document_structure::{
26 DocumentStructure, Heading, Section, HeadingHierarchy, SectionNumber,
27 SectionNumberFormat, StructureStatistics,
28};
29pub use analysis::{TextAnalyzer, TextStats};
30pub use keyword_extraction::TfIdfKeywordExtractor;
31pub use extractive_summarizer::ExtractiveSummarizer;
32pub use layout_parser::{LayoutParser, LayoutParserFactory};
33pub use chunk_enricher::{ChunkEnricher, EnrichmentStatistics};
34pub use chunking_strategies::{
35 HierarchicalChunkingStrategy, SemanticChunkingStrategy,
36};
37
38#[cfg(feature = "code-chunking")]
39pub use chunking_strategies::RustCodeChunkingStrategy;
40
41use crate::{
42 core::{ChunkId, Document, TextChunk, ChunkingStrategy},
43 Result,
44};
45#[cfg(feature = "parallel-processing")]
46use crate::parallel::{ParallelProcessor, PerformanceMonitor};
47use chunking::HierarchicalChunker;
48
/// Splits documents into (optionally overlapping) text chunks and provides
/// assorted text utilities: keyword extraction, sentence splitting,
/// whitespace normalization and batch variants of each.
#[derive(Debug)]
pub struct TextProcessor {
    /// Target maximum chunk length, in bytes (see `chunk_text`).
    chunk_size: usize,
    /// Number of bytes shared between consecutive chunks, in bytes.
    chunk_overlap: usize,
    /// When present, batch methods may dispatch work through rayon.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
    /// Accumulates timing stats exposed via `get_performance_stats`.
    #[cfg(feature = "parallel-processing")]
    performance_monitor: PerformanceMonitor,
}
59
60impl TextProcessor {
    /// Creates a processor with the given chunk size and overlap (both in
    /// bytes).
    ///
    /// Currently always succeeds; the `Result` return keeps the signature
    /// uniform with other fallible constructors in this crate.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self> {
        Ok(Self {
            chunk_size,
            chunk_overlap,
            #[cfg(feature = "parallel-processing")]
            parallel_processor: None,
            #[cfg(feature = "parallel-processing")]
            performance_monitor: PerformanceMonitor::new(),
        })
    }
72
    /// Like [`TextProcessor::new`], but wires in a `ParallelProcessor` so
    /// that batch methods can run on rayon when the processor deems a
    /// workload large enough.
    #[cfg(feature = "parallel-processing")]
    pub fn with_parallel_processing(
        chunk_size: usize,
        chunk_overlap: usize,
        parallel_processor: ParallelProcessor,
    ) -> Result<Self> {
        Ok(Self {
            chunk_size,
            chunk_overlap,
            parallel_processor: Some(parallel_processor),
            performance_monitor: PerformanceMonitor::new(),
        })
    }
87
88 pub fn chunk_text_hierarchical(&self, document: &Document) -> Result<Vec<TextChunk>> {
90 let chunker = HierarchicalChunker::new().with_min_size(50);
91 let chunks_text = chunker.chunk_text(&document.content, self.chunk_size, self.chunk_overlap);
92
93 let mut chunks = Vec::new();
94 let mut chunk_counter = 0;
95 let mut current_pos = 0;
96
97 for chunk_content in chunks_text {
98 if !chunk_content.trim().is_empty() {
99 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
100 let chunk_start = current_pos;
101 let chunk_end = chunk_start + chunk_content.len();
102
103 current_pos += chunk_content.len();
104
105 let chunk = TextChunk::new(
106 chunk_id,
107 document.id.clone(),
108 chunk_content,
109 chunk_start,
110 chunk_end,
111 );
112 chunks.push(chunk);
113 chunk_counter += 1;
114 } else {
115 current_pos += chunk_content.len();
116 }
117 }
118
119 Ok(chunks)
120 }
121
122 pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>> {
124 let text = &document.content;
125 let mut chunks = Vec::new();
126 let mut start = 0;
127 let mut chunk_counter = 0;
128
129 while start < text.len() {
130 let end = std::cmp::min(start + self.chunk_size, text.len());
131
132 let actual_end = if end < text.len() {
134 self.find_sentence_boundary(text, start, end)
135 .unwrap_or_else(|| self.find_char_boundary(text, end))
136 } else {
137 end
138 };
139
140 let chunk_content = text[start..actual_end].to_string();
141
142 if !chunk_content.trim().is_empty() {
143 let chunk_id = ChunkId::new(format!("{}_{}", document.id, chunk_counter));
144 let chunk = TextChunk::new(
145 chunk_id,
146 document.id.clone(),
147 chunk_content,
148 start,
149 actual_end,
150 );
151 chunks.push(chunk);
152 chunk_counter += 1;
153 }
154
155 let next_start = if actual_end >= text.len() {
157 break;
158 } else {
159 let overlap_start = actual_end.saturating_sub(self.chunk_overlap);
160 let safe_overlap = self.find_char_boundary(text, overlap_start);
161 std::cmp::max(start + 1, safe_overlap)
162 };
163
164 start = next_start;
165 }
166
167 Ok(chunks)
168 }
169
170 pub fn chunk_text_with_enrichment(
172 &self,
173 document: &Document,
174 enricher: &mut ChunkEnricher,
175 ) -> Result<Vec<TextChunk>> {
176 let mut chunks = self.chunk_text(document)?;
178
179 enricher.enrich_chunks(&mut chunks, document)?;
181
182 Ok(chunks)
183 }
184
185 pub fn chunk_text_hierarchical_with_enrichment(
187 &self,
188 document: &Document,
189 enricher: &mut ChunkEnricher,
190 ) -> Result<Vec<TextChunk>> {
191 let mut chunks = self.chunk_text_hierarchical(document)?;
193
194 enricher.enrich_chunks(&mut chunks, document)?;
196
197 Ok(chunks)
198 }
199
    /// Builds a default `ChunkEnricher` whose layout parser is chosen by
    /// `LayoutParserFactory` based on the document (e.g. its format).
    pub fn create_default_enricher(document: &Document) -> ChunkEnricher {
        let parser = LayoutParserFactory::create_for_document(document);
        ChunkEnricher::new_default(parser)
    }
205
    /// Convenience wrapper: builds the default enricher for `document` and
    /// returns sliding-window chunks with enriched metadata.
    pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
        let mut enricher = Self::create_default_enricher(document);
        self.chunk_text_with_enrichment(document, &mut enricher)
    }
211
    /// Convenience wrapper: builds the default enricher for `document` and
    /// returns hierarchical chunks with enriched metadata.
    pub fn chunk_hierarchical_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>> {
        let mut enricher = Self::create_default_enricher(document);
        self.chunk_text_hierarchical_with_enrichment(document, &mut enricher)
    }
217
218 pub fn chunk_with_strategy(&self, document: &Document, strategy: &dyn ChunkingStrategy) -> Result<Vec<TextChunk>> {
240 let chunks = strategy.chunk(&document.content);
241 Ok(chunks)
242 }
243
244 fn find_char_boundary(&self, text: &str, mut pos: usize) -> usize {
246 pos = pos.min(text.len());
247 while pos > 0 && !text.is_char_boundary(pos) {
248 pos -= 1;
249 }
250 pos
251 }
252
253 fn find_char_boundary_in_slice(&self, text: &str, mut pos: usize) -> usize {
255 pos = pos.min(text.len());
256 while pos > 0 && !text.is_char_boundary(pos) {
257 pos -= 1;
258 }
259 pos
260 }
261
262 fn find_sentence_boundary(
264 &self,
265 text: &str,
266 start: usize,
267 preferred_end: usize,
268 ) -> Option<usize> {
269 let safe_start = self.find_char_boundary(text, start);
271 let safe_end = self.find_char_boundary(text, preferred_end);
272
273 if safe_start >= safe_end {
274 return None;
275 }
276
277 let search_window = &text[safe_start..safe_end];
278
279 let search_start = search_window.len().saturating_sub(200);
281 let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
283 let search_text = &search_window[safe_search_start..];
284
285 let sentence_endings = ['.', '!', '?'];
287 let mut last_boundary = None;
288
289 for (i, ch) in search_text.char_indices() {
290 if sentence_endings.contains(&ch) {
291 let next_pos = i + ch.len_utf8();
293 if next_pos >= search_text.len()
294 || search_text
295 .chars()
296 .nth(next_pos)
297 .map_or(true, |c| c.is_whitespace())
298 {
299 last_boundary = Some(safe_start + safe_search_start + next_pos);
300 }
301 }
302 }
303
304 last_boundary.or_else(|| self.find_word_boundary(text, safe_start, safe_end))
305 }
306
    /// Finds the last ASCII space in (roughly) the final 50 bytes of
    /// `text[start..preferred_end]`, returning its absolute byte offset.
    ///
    /// NOTE(review): `start` and `preferred_end` must lie on char
    /// boundaries or the slice below panics — the callers in this file
    /// guarantee that via `find_char_boundary`; confirm for any new caller.
    fn find_word_boundary(&self, text: &str, start: usize, preferred_end: usize) -> Option<usize> {
        if start >= preferred_end {
            return None;
        }

        let search_window = &text[start..preferred_end];

        // Only look at the trailing 50 bytes of the window.
        let search_start = search_window.len().saturating_sub(50);
        let safe_search_start = self.find_char_boundary_in_slice(search_window, search_start);
        let search_text = &search_window[safe_search_start..];

        // The offset points AT the space, so a chunk ending here excludes it.
        search_text
            .rfind(' ')
            .map(|pos| start + safe_search_start + pos)
    }
325
326 pub fn clean_text(&self, text: &str) -> String {
328 text
329 .split_whitespace()
331 .collect::<Vec<_>>()
332 .join(" ")
333 .chars()
335 .collect::<String>()
336 }
337
338 pub fn extract_sentences(&self, text: &str) -> Vec<String> {
340 let sentence_endings = ['.', '!', '?'];
341 let mut sentences = Vec::new();
342 let mut current_sentence = String::new();
343
344 for ch in text.chars() {
345 if sentence_endings.contains(&ch) {
346 let trimmed = current_sentence.trim().to_string();
347 if !trimmed.is_empty() {
348 sentences.push(trimmed);
349 }
350 current_sentence.clear();
351 } else {
352 current_sentence.push(ch);
353 }
354 }
355
356 let trimmed = current_sentence.trim().to_string();
358 if !trimmed.is_empty() {
359 sentences.push(trimmed);
360 }
361
362 sentences
363 }
364
    /// Counts whitespace-separated tokens in `text`.
    pub fn word_count(&self, text: &str) -> usize {
        text.split_whitespace().count()
    }
369
    /// Chunks each document via [`Self::chunk_text`].
    ///
    /// With the `parallel-processing` feature, work is dispatched through
    /// rayon when the configured processor judges the batch large enough;
    /// otherwise (or without the feature) it falls back to a sequential
    /// pass. Fails fast on the first chunking error.
    pub fn batch_chunk_documents(&self, documents: Vec<Document>) -> Result<Vec<Vec<TextChunk>>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(documents.len()) {
                    use rayon::prelude::*;
                    // `collect` into `Result<Vec<_>>` short-circuits on the
                    // first `Err`.
                    let results: Result<Vec<Vec<TextChunk>>> = documents
                        .par_iter()
                        .map(|doc| self.chunk_text(doc))
                        .collect();
                    return results;
                }
            }
        }

        documents
            .iter()
            .map(|doc| self.chunk_text(doc))
            .collect()
    }
392
    /// Runs [`Self::extract_keywords`] over each text, in parallel when the
    /// `parallel-processing` feature is enabled and the processor judges
    /// the batch large enough; sequentially otherwise.
    pub fn batch_extract_keywords(&self, texts: &[&str], max_keywords: usize) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_keywords(text, max_keywords))
                        .collect();
                }
            }
        }

        texts
            .iter()
            .map(|&text| self.extract_keywords(text, max_keywords))
            .collect()
    }
414
    /// Runs [`Self::extract_sentences`] over each text, in parallel when the
    /// `parallel-processing` feature is enabled and the processor judges
    /// the batch large enough; sequentially otherwise.
    pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.extract_sentences(text))
                        .collect();
                }
            }
        }

        texts
            .iter()
            .map(|&text| self.extract_sentences(text))
            .collect()
    }
436
    /// Runs [`Self::clean_text`] over each text, in parallel when the
    /// `parallel-processing` feature is enabled and the processor judges
    /// the batch large enough; sequentially otherwise.
    pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(texts.len()) {
                    use rayon::prelude::*;
                    return texts
                        .par_iter()
                        .map(|&text| self.clean_text(text))
                        .collect();
                }
            }
        }

        texts.iter().map(|&text| self.clean_text(text)).collect()
    }
455
456 pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String> {
458 use std::collections::HashMap;
459
460 let words: Vec<String> = text
461 .split_whitespace()
462 .map(|w| w.to_lowercase())
463 .filter(|w| w.len() > 3) .filter(|w| !self.is_stop_word(w))
465 .collect();
466
467 let mut word_counts = HashMap::new();
468 for word in words {
469 *word_counts.entry(word).or_insert(0) += 1;
470 }
471
472 let mut sorted_words: Vec<_> = word_counts.into_iter().collect();
473 sorted_words.sort_by(|a, b| b.1.cmp(&a.1));
474
475 sorted_words
476 .into_iter()
477 .take(max_keywords)
478 .map(|(word, _)| word)
479 .collect()
480 }
481
    /// Returns true if `word` (expected lowercase) is one of ~50 common
    /// English stop words. Linear scan is fine at this list size.
    fn is_stop_word(&self, word: &str) -> bool {
        const STOP_WORDS: &[&str] = &[
            "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
            "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
            "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
            "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
            "go", "me",
        ];
        STOP_WORDS.contains(&word)
    }
493
    /// Returns `(tasks_processed, total_time)` as recorded by the
    /// performance monitor.
    #[cfg(feature = "parallel-processing")]
    pub fn get_performance_stats(&self) -> (usize, std::time::Duration) {
        let stats = self.performance_monitor.get_stats();
        (
            stats.tasks_processed,
            std::time::Duration::from_millis(stats.total_time_ms),
        )
    }
503
    /// Average task duration reported by the monitor.
    ///
    /// NOTE(review): the `as u64` cast truncates any fractional
    /// milliseconds in the average — sub-ms precision is dropped.
    #[cfg(feature = "parallel-processing")]
    pub fn average_processing_time(&self) -> std::time::Duration {
        let avg_ms = self.performance_monitor.average_duration();
        std::time::Duration::from_millis(avg_ms as u64)
    }
510
    /// Clears all accumulated performance statistics.
    #[cfg(feature = "parallel-processing")]
    pub fn reset_performance_stats(&mut self) {
        self.performance_monitor.reset();
    }
516
    /// Statistics from the parallel processor, or `None` when this
    /// instance was built without one.
    #[cfg(feature = "parallel-processing")]
    pub fn get_parallel_stats(&self) -> Option<crate::parallel::ParallelStatistics> {
        self.parallel_processor.as_ref().map(|p| p.get_statistics())
    }
522}
523
/// Heuristic, accent-based language detector.
///
/// Looks for a handful of diacritics characteristic of Spanish, Portuguese
/// and French — checked in that order — and defaults to English. Accents
/// shared across languages (e.g. 'é') resolve to the first matching branch.
pub struct LanguageDetector;

impl LanguageDetector {
    /// Returns an ISO-639-1 code: `"es"`, `"pt"`, `"fr"` or `"en"`.
    pub fn detect_language(text: &str) -> String {
        let contains_any = |accents: &[char]| text.chars().any(|c| accents.contains(&c));

        let code = if contains_any(&['ñ', 'ó', 'é', 'í', 'á', 'ú']) {
            "es"
        } else if contains_any(&['ç', 'ã', 'õ']) {
            "pt"
        } else if contains_any(&['à', 'è', 'ù', 'ò']) {
            "fr"
        } else {
            "en"
        };
        code.to_string()
    }
}
546
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::DocumentId;

    /// Shorthand for building a `Document` from string literals.
    fn make_doc(id: &str, title: &str, body: &str) -> Document {
        Document::new(
            DocumentId::new(id.to_string()),
            title.to_string(),
            body.to_string(),
        )
    }

    #[test]
    fn test_text_chunking() {
        let tp = TextProcessor::new(100, 20).unwrap();
        let doc = make_doc(
            "test",
            "Test Document",
            "This is a test document. It has multiple sentences. Each sentence should be processed correctly.",
        );

        let chunks = tp.chunk_text(&doc).unwrap();
        assert!(!chunks.is_empty());
        // Chunks must respect the configured maximum size.
        assert!(chunks[0].content.len() <= 100);
    }

    #[test]
    fn test_keyword_extraction() {
        let tp = TextProcessor::new(1000, 100).unwrap();
        let sample = "machine learning artificial intelligence data science computer vision natural language processing";

        let keywords = tp.extract_keywords(sample, 3);
        assert!(!keywords.is_empty());
        assert!(keywords.len() <= 3);
    }

    #[test]
    fn test_sentence_extraction() {
        let tp = TextProcessor::new(1000, 100).unwrap();

        let sentences = tp.extract_sentences("First sentence. Second sentence! Third sentence?");
        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "First sentence");
        assert_eq!(sentences[1], "Second sentence");
        assert_eq!(sentences[2], "Third sentence");
    }

    #[test]
    fn test_enriched_chunking() {
        let tp = TextProcessor::new(100, 20).unwrap();
        let doc = make_doc(
            "test",
            "test.md",
            "# Chapter 1\n\nThis document discusses machine learning and artificial intelligence.\n\n## Section 1.1\n\nDeep learning is important.",
        );

        let chunks = tp.chunk_and_enrich(&doc).unwrap();
        assert!(!chunks.is_empty());

        // At least one chunk should pick up a chapter or keyword annotation.
        let has_metadata = chunks
            .iter()
            .any(|c| c.metadata.chapter.is_some() || !c.metadata.keywords.is_empty());
        assert!(has_metadata, "Chunks should have enriched metadata");
    }

    #[test]
    fn test_custom_enricher() {
        let tp = TextProcessor::new(100, 20).unwrap();
        let doc = make_doc(
            "test",
            "test.md",
            "# Test Chapter\n\nContent about machine learning here.",
        );

        let parser = Box::new(crate::text::parsers::MarkdownLayoutParser::new());
        let mut enricher = ChunkEnricher::new_default(parser);

        let chunks = tp.chunk_text_with_enrichment(&doc, &mut enricher).unwrap();
        assert!(!chunks.is_empty());
        assert!(chunks.iter().any(|c| !c.metadata.keywords.is_empty()));
    }
}