use crate::{
    data::{AnnotatedDocument, Document, Extraction, CharInterval},
    exceptions::LangExtractResult,
    tokenizer::{TokenInterval, TokenizedText, Tokenizer, SentenceIterator},
};
use regex::Regex;
use semchunk_rs::Chunker;

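/// Strategy used by [`TextChunker`] to split a document into chunks.
///
/// The fixed-size, sentence, and paragraph strategies are kept for backwards
/// compatibility but are deprecated in favor of `Semantic`; `Adaptive`
/// currently delegates to semantic chunking as well.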
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
    #[deprecated(note = "Use Semantic chunking for better results")]
    FixedSize,
    #[deprecated(note = "Use Semantic chunking for better results")]
    Sentence,
    #[deprecated(note = "Use Semantic chunking for better results")]
    Paragraph,
    Adaptive,
    Semantic,
}

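/// A contiguous piece of the source text produced by [`TextChunker`],
/// identified by a sequential `id` and located by `char_offset` and
/// `char_length` in the original document. Overlap with neighboring chunks,
/// if any, is recorded in `overlap_info`.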
#[derive(Debug, Clone)]
pub struct TextChunk {
    pub id: usize,
    pub text: String,
    pub char_offset: usize,
    pub char_length: usize,
    pub document_id: Option<String>,
    pub has_overlap: bool,
    pub overlap_info: Option<(usize, usize)>,
}

impl TextChunk {
    pub fn new(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: false,
            overlap_info: None,
        }
    }

    pub fn with_overlap(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
        overlap_start: usize,
        overlap_end: usize,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: overlap_start > 0 || overlap_end > 0,
            overlap_info: Some((overlap_start, overlap_end)),
        }
    }

    pub fn char_interval(&self) -> CharInterval {
        CharInterval::new(
            Some(self.char_offset),
            Some(self.char_offset + self.char_length),
        )
    }

    pub fn core_text(&self) -> &str {
        if let Some((start_overlap, end_overlap)) = self.overlap_info {
            let start = start_overlap;
            let end = self.text.len().saturating_sub(end_overlap);
            &self.text[start..end]
        } else {
            &self.text
        }
    }
}

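/// A chunk defined as a [`TokenInterval`] over a tokenized document, produced
/// by [`ChunkIterator`]. Chunk text and character positions are resolved on
/// demand through a [`Tokenizer`].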
#[derive(Debug, Clone)]
pub struct TokenChunk {
    pub token_interval: TokenInterval,
    pub document: Option<Document>,
    chunk_text: Option<String>,
    char_interval: Option<CharInterval>,
    custom_char_end: Option<usize>,
}

impl TokenChunk {
    pub fn new(token_interval: TokenInterval, document: Option<Document>) -> Self {
        Self {
            token_interval,
            document,
            chunk_text: None,
            char_interval: None,
            custom_char_end: None,
        }
    }

    pub fn with_char_end(token_interval: TokenInterval, document: Option<Document>, char_end: usize) -> Self {
        Self {
            token_interval,
            document,
            chunk_text: None,
            char_interval: None,
            custom_char_end: Some(char_end),
        }
    }

    pub fn document_id(&self) -> Option<&str> {
        self.document.as_ref()?.document_id.as_deref()
    }

    pub fn document_text(&self) -> Option<&TokenizedText> {
        None
    }

    pub fn chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
        if let Some(ref cached) = self.chunk_text {
            return Ok(cached.clone());
        }

        if let Some(ref document) = self.document {
            let tokenized = tokenizer.tokenize(&document.text)?;

            if let Some(custom_end) = self.custom_char_end {
                if !tokenized.tokens.is_empty() && self.token_interval.start_index < tokenized.tokens.len() {
                    let start_token = &tokenized.tokens[self.token_interval.start_index];
                    let start_char = start_token.char_interval.start_pos;
                    let end_char = std::cmp::min(custom_end, document.text.len());
                    return Ok(document.text[start_char..end_char].to_string());
                }
            }

            let text = tokenizer.tokens_text(&tokenized, &self.token_interval)?;
            Ok(text)
        } else {
            Err(crate::exceptions::LangExtractError::invalid_input(
                "Document text must be set to access chunk text"
            ))
        }
    }

    pub fn sanitized_chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
        let text = self.chunk_text(tokenizer)?;
        Ok(sanitize_text(&text)?)
    }

    pub fn additional_context(&self) -> Option<&str> {
        self.document.as_ref()?.additional_context.as_deref()
    }

    pub fn char_interval(&self, tokenizer: &Tokenizer) -> LangExtractResult<CharInterval> {
        if let Some(ref cached) = self.char_interval {
            return Ok(cached.clone());
        }

        if let Some(ref document) = self.document {
            let tokenized = tokenizer.tokenize(&document.text)?;
            let tokens = &tokenized.tokens;

            if self.token_interval.start_index >= tokens.len()
                || self.token_interval.end_index > tokens.len() {
                return Err(crate::exceptions::LangExtractError::invalid_input(
                    "Token interval is out of bounds for the document"
                ));
            }

            let start_token = &tokens[self.token_interval.start_index];
            let end_token = &tokens[self.token_interval.end_index - 1];

            Ok(CharInterval {
                start_pos: Some(start_token.char_interval.start_pos),
                end_pos: Some(end_token.char_interval.end_pos),
            })
        } else {
            Err(crate::exceptions::LangExtractError::invalid_input(
                "Document text must be set to compute char interval"
            ))
        }
    }
}

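/// Collapses runs of whitespace into single spaces and trims the input,
/// returning an error if nothing remains after sanitization.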
fn sanitize_text(text: &str) -> LangExtractResult<String> {
    let sanitized = regex::Regex::new(r"\s+")
        .map_err(|e| crate::exceptions::LangExtractError::configuration(format!("Regex error: {}", e)))?
        .replace_all(text.trim(), " ")
        .to_string();

    if sanitized.is_empty() {
        return Err(crate::exceptions::LangExtractError::invalid_input("Sanitized text is empty"));
    }

    Ok(sanitized)
}

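/// Configuration for [`TextChunker`]: chunk size limits, overlap, the
/// [`ChunkingStrategy`] to use, and the semantic chunking knobs
/// (`semantic_similarity_threshold`, `semantic_max_chunks`).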
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    pub max_chunk_size: usize,
    pub overlap_size: usize,
    pub strategy: ChunkingStrategy,
    pub min_chunk_size: usize,
    pub respect_paragraphs: bool,
    pub respect_sentences: bool,
    pub semantic_similarity_threshold: f32,
    pub semantic_max_chunks: Option<usize>,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            max_chunk_size: 2000,
            overlap_size: 200,
            strategy: ChunkingStrategy::Adaptive,
            min_chunk_size: 100,
            respect_paragraphs: true,
            respect_sentences: true,
            semantic_similarity_threshold: 0.7,
            semantic_max_chunks: None,
        }
    }
}

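/// Splits documents into [`TextChunk`]s according to a [`ChunkingConfig`].
///
/// A minimal usage sketch (illustrative only; imports and error handling for
/// the surrounding crate are elided):
///
/// ```rust,ignore
/// let chunker = TextChunker::with_config(ChunkingConfig {
///     strategy: ChunkingStrategy::Semantic,
///     max_chunk_size: 500,
///     ..Default::default()
/// });
/// let chunks = chunker.chunk_text("some long document text", None)?;
/// for chunk in &chunks {
///     println!("chunk {} at offset {}: {}", chunk.id, chunk.char_offset, chunk.text);
/// }
/// ```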
pub struct TextChunker {
    config: ChunkingConfig,
    sentence_regex: Regex,
    paragraph_regex: Regex,
}

impl TextChunker {
    pub fn new() -> Self {
        Self::with_config(ChunkingConfig::default())
    }

    pub fn with_config(config: ChunkingConfig) -> Self {
        let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();

        let paragraph_regex = Regex::new(r"\n\s*\n").unwrap();

        Self {
            config,
            sentence_regex,
            paragraph_regex,
        }
    }

    pub fn chunk_document(&self, document: &Document) -> LangExtractResult<Vec<TextChunk>> {
        self.chunk_text(&document.text, document.document_id.clone())
    }

    pub fn chunk_text(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        if text.len() <= self.config.max_chunk_size {
            return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
        }

        match self.config.strategy {
            ChunkingStrategy::FixedSize => self.chunk_fixed_size(text, document_id),
            ChunkingStrategy::Sentence => self.chunk_by_sentences(text, document_id),
            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text, document_id),
            ChunkingStrategy::Adaptive => self.chunk_adaptive(text, document_id),
            ChunkingStrategy::Semantic => self.chunk_semantic(text, document_id),
        }
    }

    fn chunk_fixed_size(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_pos = 0;

        while current_pos < text.len() {
            let chunk_end = std::cmp::min(
                current_pos + self.config.max_chunk_size,
                text.len()
            );

            let chunk_text = text[current_pos..chunk_end].to_string();

            let overlap_start = if chunk_id > 0 { self.config.overlap_size } else { 0 };
            let overlap_end = if chunk_end < text.len() { self.config.overlap_size } else { 0 };

            let chunk = TextChunk::with_overlap(
                chunk_id,
                chunk_text,
                current_pos,
                document_id.clone(),
                overlap_start,
                overlap_end,
            );

            chunks.push(chunk);
            chunk_id += 1;

            let step_size = self.config.max_chunk_size.saturating_sub(self.config.overlap_size);
            current_pos += step_size;
        }

        Ok(chunks)
    }

    fn chunk_by_sentences(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let sentence_boundaries = self.find_sentence_boundaries(text);
        self.chunk_by_boundaries(text, &sentence_boundaries, document_id)
    }

    fn chunk_by_paragraphs(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let paragraph_boundaries = self.find_paragraph_boundaries(text);
        self.chunk_by_boundaries(text, &paragraph_boundaries, document_id)
    }

    fn chunk_adaptive(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        self.chunk_semantic(text, document_id)
    }

    fn find_sentence_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0];
        for mat in self.sentence_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len());
        }

        boundaries
    }

    fn find_paragraph_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0];
        for mat in self.paragraph_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len());
        }

        boundaries
    }

    fn chunk_semantic(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let token_counter = Box::new(|s: &str| s.split_whitespace().count());

        let chunker = Chunker::new(self.config.max_chunk_size, token_counter);

        let semantic_chunks = chunker.chunk(text);

        let mut chunks = Vec::new();
        let mut current_pos = 0;

        for (chunk_id, chunk_text) in semantic_chunks.into_iter().enumerate() {
            let start_pos = if let Some(found_pos) = text[current_pos..].find(&chunk_text) {
                current_pos + found_pos
            } else {
                current_pos
            };

            let end_pos = start_pos + chunk_text.len();

            let text_chunk = TextChunk::new(
                chunk_id,
                chunk_text.clone(),
                start_pos,
                document_id.clone(),
            );

            chunks.push(text_chunk);
            current_pos = end_pos;
        }

        if chunks.is_empty() {
            return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
        }

        let final_chunks = if let Some(max_chunks) = self.config.semantic_max_chunks {
            if chunks.len() > max_chunks {
                let mut merged_chunks = chunks[..max_chunks - 1].to_vec();
                let remaining_chunks = &chunks[max_chunks - 1..];
                let merged_text = remaining_chunks.iter()
                    .map(|c| c.text.as_str())
                    .collect::<Vec<_>>()
                    .join(" ");
                let merged_start = remaining_chunks[0].char_offset;
                let merged_chunk = TextChunk::new(
                    max_chunks - 1,
                    merged_text,
                    merged_start,
                    document_id,
                );
                merged_chunks.push(merged_chunk);
                merged_chunks
            } else {
                chunks
            }
        } else {
            chunks
        };

        Ok(final_chunks)
    }

    fn chunk_by_boundaries(
        &self,
        text: &str,
        boundaries: &[usize],
        document_id: Option<String>,
    ) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_start = 0;

        for &boundary in boundaries.iter().skip(1) {
            let potential_chunk_size = boundary - current_start;

            if potential_chunk_size <= self.config.max_chunk_size {
                if potential_chunk_size >= self.config.min_chunk_size || chunks.is_empty() {
                    let chunk_text = text[current_start..boundary].to_string();
                    let chunk = TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone());
                    chunks.push(chunk);
                    chunk_id += 1;
                    current_start = boundary;
                }
            } else {
                let section = &text[current_start..boundary];
                let mut section_chunks = self.chunk_fixed_size(section, document_id.clone())?;

                for chunk in &mut section_chunks {
                    chunk.id = chunk_id;
                    chunk.char_offset += current_start;
                    chunk_id += 1;
                }

                chunks.extend(section_chunks);
                current_start = boundary;
            }
        }

        if chunks.is_empty() {
            chunks.push(TextChunk::new(0, text.to_string(), 0, document_id));
        }

        Ok(chunks)
    }

    pub fn config(&self) -> &ChunkingConfig {
        &self.config
    }
}

impl Default for TextChunker {
    fn default() -> Self {
        Self::new()
    }
}

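/// Iterator over [`TokenChunk`]s for a tokenized document. Sentences are
/// packed into a chunk until adding another would exceed `max_char_buffer`
/// characters; sentences that are too long on their own are split, preferring
/// newline boundaries.
///
/// A brief illustrative sketch (assumes a `tokenizer`, `text`, and `document`
/// already in scope):
///
/// ```rust,ignore
/// let tokenized = tokenizer.tokenize(text)?;
/// let chunks: Vec<TokenChunk> = ChunkIterator::new(&tokenized, &tokenizer, 200, Some(&document))?
///     .collect::<Result<_, _>>()?;
/// ```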
pub struct ChunkIterator<'a> {
    tokenized_text: &'a TokenizedText,
    tokenizer: &'a Tokenizer,
    max_char_buffer: usize,
    sentence_iter: SentenceIterator<'a>,
    broken_sentence: bool,
    document: Option<&'a Document>,
    next_chunk_start_char: Option<usize>,
}

impl<'a> ChunkIterator<'a> {
    pub fn new(
        text: &'a TokenizedText,
        tokenizer: &'a Tokenizer,
        max_char_buffer: usize,
        document: Option<&'a Document>,
    ) -> LangExtractResult<Self> {
        let sentence_iter = SentenceIterator::new(text, tokenizer, 0)?;

        Ok(Self {
            tokenized_text: text,
            tokenizer,
            max_char_buffer,
            sentence_iter,
            broken_sentence: false,
            document,
            next_chunk_start_char: Some(0),
        })
    }

    fn tokens_exceed_buffer(&self, token_interval: &TokenInterval) -> LangExtractResult<bool> {
        let char_interval = self.get_char_interval_for_tokens(token_interval)?;
        match (char_interval.start_pos, char_interval.end_pos) {
            (Some(start), Some(end)) => Ok((end - start) > self.max_char_buffer),
            _ => Ok(false),
        }
    }

    fn get_char_interval_for_tokens(&self, token_interval: &TokenInterval) -> LangExtractResult<CharInterval> {
        if token_interval.start_index >= self.tokenized_text.tokens.len()
            || token_interval.end_index > self.tokenized_text.tokens.len() {
            return Err(crate::exceptions::LangExtractError::invalid_input(
                "Token interval is out of bounds"
            ));
        }

        let start_token = &self.tokenized_text.tokens[token_interval.start_index];
        let end_token = &self.tokenized_text.tokens[token_interval.end_index - 1];

        Ok(CharInterval {
            start_pos: Some(start_token.char_interval.start_pos),
            end_pos: Some(end_token.char_interval.end_pos),
        })
    }

    fn create_adjacent_chunk(&self, token_interval: TokenInterval, next_chunk_start_token: Option<usize>) -> TokenChunk {
        if let Some(next_start) = next_chunk_start_token {
            if next_start < self.tokenized_text.tokens.len() {
                let next_token = &self.tokenized_text.tokens[next_start];
                let custom_end = next_token.char_interval.start_pos;
                return TokenChunk::with_char_end(token_interval, self.document.cloned(), custom_end);
            }
        }

        TokenChunk::new(token_interval, self.document.cloned())
    }
}

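// Iteration strategy: start a chunk at the next sentence, grow it token by
// token while it still fits in `max_char_buffer`, prefer breaking at the last
// newline token when a sentence must be split, and otherwise keep appending
// whole sentences until the buffer would overflow.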
impl<'a> Iterator for ChunkIterator<'a> {
    type Item = LangExtractResult<TokenChunk>;

    fn next(&mut self) -> Option<Self::Item> {
        let sentence = match self.sentence_iter.next() {
            Some(Ok(sentence)) => sentence,
            Some(Err(e)) => return Some(Err(e)),
            None => return None,
        };

        let curr_chunk = match TokenInterval::new(
            sentence.start_index,
            sentence.start_index + 1
        ) {
            Ok(interval) => interval,
            Err(e) => return Some(Err(e)),
        };

        match self.tokens_exceed_buffer(&curr_chunk) {
            Ok(true) => {
                match SentenceIterator::new(
                    self.tokenized_text,
                    self.tokenizer,
                    sentence.start_index + 1,
                ) {
                    Ok(new_iter) => {
                        self.sentence_iter = new_iter;
                        self.broken_sentence = curr_chunk.end_index < sentence.end_index;
                    }
                    Err(e) => return Some(Err(e)),
                }

                return Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())));
            }
            Ok(false) => {}
            Err(e) => return Some(Err(e)),
        }

        let mut start_of_new_line = None;
        let mut curr_chunk = curr_chunk;

        for token_index in curr_chunk.start_index..sentence.end_index {
            if self.tokenized_text.tokens[token_index].first_token_after_newline {
                start_of_new_line = Some(token_index);
            }

            let test_chunk = match TokenInterval::new(curr_chunk.start_index, token_index + 1) {
                Ok(interval) => interval,
                Err(e) => return Some(Err(e)),
            };

            match self.tokens_exceed_buffer(&test_chunk) {
                Ok(true) => {
                    if let Some(newline_pos) = start_of_new_line {
                        if newline_pos > curr_chunk.start_index {
                            curr_chunk = match TokenInterval::new(curr_chunk.start_index, newline_pos) {
                                Ok(interval) => interval,
                                Err(e) => return Some(Err(e)),
                            };
                        }
                    }

                    match SentenceIterator::new(
                        self.tokenized_text,
                        self.tokenizer,
                        curr_chunk.end_index,
                    ) {
                        Ok(new_iter) => {
                            self.sentence_iter = new_iter;
                            self.broken_sentence = true;
                        }
                        Err(e) => return Some(Err(e)),
                    }

                    return Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())));
                }
                Ok(false) => {
                    curr_chunk = test_chunk;
                }
                Err(e) => return Some(Err(e)),
            }
        }

        if self.broken_sentence {
            self.broken_sentence = false;
        } else {
            while let Some(next_sentence_result) = self.sentence_iter.next() {
                let next_sentence = match next_sentence_result {
                    Ok(sentence) => sentence,
                    Err(e) => return Some(Err(e)),
                };

                let test_chunk = match TokenInterval::new(curr_chunk.start_index, next_sentence.end_index) {
                    Ok(interval) => interval,
                    Err(e) => return Some(Err(e)),
                };

                match self.tokens_exceed_buffer(&test_chunk) {
                    Ok(true) => {
                        match SentenceIterator::new(
                            self.tokenized_text,
                            self.tokenizer,
                            curr_chunk.end_index,
                        ) {
                            Ok(new_iter) => {
                                self.sentence_iter = new_iter;
                            }
                            Err(e) => return Some(Err(e)),
                        }
                        break;
                    }
                    Ok(false) => {
                        curr_chunk = test_chunk;
                    }
                    Err(e) => return Some(Err(e)),
                }
            }
        }

        Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())))
    }
}

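/// Merges per-chunk extraction results back into a single
/// [`AnnotatedDocument`], optionally de-duplicating similar extractions
/// (same class, or overlapping character intervals, with word-level Jaccard
/// similarity at or above the threshold).
///
/// Illustrative call pattern (sketch; `chunk_results` and `original_text`
/// are assumed to come from the extraction pipeline):
///
/// ```rust,ignore
/// let aggregator = ResultAggregator::new();
/// let annotated = aggregator.aggregate_chunk_results(chunk_results, original_text, None)?;
/// ```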
pub struct ResultAggregator {
    similarity_threshold: f32,
    merge_overlaps: bool,
}

impl ResultAggregator {
    pub fn new() -> Self {
        Self {
            similarity_threshold: 0.8,
            merge_overlaps: true,
        }
    }

    pub fn with_settings(similarity_threshold: f32, merge_overlaps: bool) -> Self {
        Self {
            similarity_threshold,
            merge_overlaps,
        }
    }

    pub fn aggregate_chunk_results(
        &self,
        chunk_results: Vec<ChunkResult>,
        original_text: String,
        document_id: Option<String>,
    ) -> LangExtractResult<AnnotatedDocument> {
        let mut all_extractions = Vec::new();

        for chunk_result in chunk_results {
            if let Some(extractions) = chunk_result.extractions {
                all_extractions.extend(extractions);
            }
        }

        let deduplicated_extractions = if self.merge_overlaps {
            self.deduplicate_extractions(all_extractions)?
        } else {
            all_extractions
        };

        let mut annotated_doc = AnnotatedDocument::with_extractions(deduplicated_extractions, original_text);
        annotated_doc.document_id = document_id;

        Ok(annotated_doc)
    }

    fn deduplicate_extractions(&self, extractions: Vec<Extraction>) -> LangExtractResult<Vec<Extraction>> {
        let mut unique_extractions = Vec::new();

        for extraction in extractions {
            let mut is_duplicate = false;

            for existing in &unique_extractions {
                if self.are_similar_extractions(&extraction, existing) {
                    is_duplicate = true;
                    break;
                }
            }

            if !is_duplicate {
                unique_extractions.push(extraction);
            }
        }

        Ok(unique_extractions)
    }

    fn are_similar_extractions(&self, e1: &Extraction, e2: &Extraction) -> bool {
        if e1.extraction_class == e2.extraction_class {
            let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
            return similarity >= self.similarity_threshold;
        }

        if let (Some(interval1), Some(interval2)) = (&e1.char_interval, &e2.char_interval) {
            if interval1.overlaps_with(interval2) {
                let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
                return similarity >= self.similarity_threshold;
            }
        }

        false
    }

    fn text_similarity(&self, text1: &str, text2: &str) -> f32 {
        if text1 == text2 {
            return 1.0;
        }

        let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
        let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();

        if words1.is_empty() && words2.is_empty() {
            return 1.0;
        }

        let intersection = words1.intersection(&words2).count();
        let union = words1.union(&words2).count();

        if union == 0 {
            0.0
        } else {
            intersection as f32 / union as f32
        }
    }
}

impl Default for ResultAggregator {
    fn default() -> Self {
        Self::new()
    }
}

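/// The outcome of processing a single chunk: any extractions found, the
/// chunk's position in the source text, and success/failure metadata plus
/// optional timing.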
#[derive(Debug, Clone)]
pub struct ChunkResult {
    pub chunk_id: usize,
    pub extractions: Option<Vec<Extraction>>,
    pub char_offset: usize,
    pub char_length: usize,
    pub success: bool,
    pub error: Option<String>,
    pub processing_time: Option<std::time::Duration>,
}

impl ChunkResult {
    pub fn success(
        chunk_id: usize,
        extractions: Vec<Extraction>,
        char_offset: usize,
        char_length: usize,
    ) -> Self {
        Self {
            chunk_id,
            extractions: Some(extractions),
            char_offset,
            char_length,
            success: true,
            error: None,
            processing_time: None,
        }
    }

    pub fn failure(
        chunk_id: usize,
        char_offset: usize,
        char_length: usize,
        error: String,
    ) -> Self {
        Self {
            chunk_id,
            extractions: None,
            char_offset,
            char_length,
            success: false,
            error: Some(error),
            processing_time: None,
        }
    }

    pub fn with_processing_time(mut self, duration: std::time::Duration) -> Self {
        self.processing_time = Some(duration);
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::Tokenizer;

    fn create_tokenizer() -> Tokenizer {
        Tokenizer::new().expect("Failed to create tokenizer")
    }

    fn create_document(text: &str) -> Document {
        Document::new(text.to_string())
    }

    #[test]
    fn test_fixed_size_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 20,
            overlap_size: 5,
            strategy: ChunkingStrategy::FixedSize,
            ..Default::default()
        });

        let text = "This is a test document with some text that needs to be chunked into smaller pieces.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.char_length <= 20);
        }
    }

    #[test]
    fn test_sentence_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 50,
            strategy: ChunkingStrategy::Sentence,
            ..Default::default()
        });

        let text = "First sentence. Second sentence! Third sentence? Fourth sentence.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 0);
        for chunk in &chunks {
            println!("Chunk: '{}'", chunk.text);
        }
    }

    #[test]
    fn test_small_text_no_chunking() {
        let chunker = TextChunker::new();
        let text = "Short text.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, text);
    }

    #[test]
    fn test_chunk_char_interval() {
        let chunk = TextChunk::new(0, "test".to_string(), 10, None);
        let interval = chunk.char_interval();

        assert_eq!(interval.start_pos, Some(10));
        assert_eq!(interval.end_pos, Some(14));
    }

    #[test]
    fn test_chunk_with_overlap() {
        let chunk = TextChunk::with_overlap(
            0,
            "overlap test text".to_string(),
            0,
            None,
            3,
            4,
        );

        assert!(chunk.has_overlap);
        assert_eq!(chunk.overlap_info, Some((3, 4)));
        assert_eq!(chunk.core_text(), "rlap test ");
    }

    #[test]
    fn test_multi_sentence_chunk() {
        let tokenizer = create_tokenizer();
        let text = "This is a sentence. This is a longer sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 50, Some(&document))
            .expect("Failed to create chunk iterator");

        let first_chunk = chunk_iter.next()
            .expect("Should have a chunk")
            .expect("Chunk creation should succeed");

        let chunk_text = first_chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");

        assert!(chunk_text.contains("This is a sentence."));
        assert!(chunk_text.contains("This is a longer sentence."));
    }

    #[test]
    fn test_sentence_breaking() {
        let tokenizer = create_tokenizer();
        let text = "This is a very long sentence that definitely exceeds the buffer.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 20, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break long sentence into multiple chunks");

        for chunk in &chunks {
            let chunk_text = chunk.chunk_text(&tokenizer)
                .expect("Failed to get chunk text");
            assert!(chunk_text.len() <= 25, "Chunk should not vastly exceed buffer: '{}'", chunk_text);
        }
    }

    #[test]
    fn test_oversized_token() {
        let tokenizer = create_tokenizer();
        let text = "Short antidisestablishmentarianism word.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 10, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break into multiple chunks");

        let long_word_chunk = chunks.iter().find(|chunk| {
            chunk.chunk_text(&tokenizer)
                .map(|text| text.contains("antidisestablishmentarianism"))
                .unwrap_or(false)
        });

        assert!(long_word_chunk.is_some(), "Should find chunk containing the long word");
    }

    #[test]
    fn test_newline_preference_for_breaking() {
        let tokenizer = create_tokenizer();
        let text = "First part of sentence\nSecond part of sentence continues here";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 25, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break into multiple chunks");

        let first_chunk_text = chunks[0].chunk_text(&tokenizer)
            .expect("Failed to get first chunk text");

        assert!(!first_chunk_text.contains("continues"),
            "First chunk should not contain text after newline: '{}'", first_chunk_text);
    }

    #[test]
    fn test_empty_text_handling() {
        let tokenizer = create_tokenizer();
        let text = "";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
            .expect("Failed to create chunk iterator");

        let result = chunk_iter.next();
        assert!(result.is_none(), "Empty text should produce no chunks");
    }

    #[test]
    fn test_single_sentence_chunk() {
        let tokenizer = create_tokenizer();
        let text = "Short sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunk = chunk_iter.next()
            .expect("Should have a chunk")
            .expect("Chunk creation should succeed");

        let chunk_text = chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");

        assert_eq!(chunk_text, text);

        assert!(chunk_iter.next().is_none(), "Should have only one chunk");
    }

    #[test]
    fn test_token_chunk_properties() {
        let tokenizer = create_tokenizer();
        let text = "Test sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let token_interval = crate::tokenizer::TokenInterval::new(0, tokenized.tokens.len())
            .expect("Failed to create token interval");
        let chunk = TokenChunk::new(token_interval, Some(document));

        let chunk_text = chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");
        assert_eq!(chunk_text, text);

        let sanitized = chunk.sanitized_chunk_text(&tokenizer)
            .expect("Failed to get sanitized text");
        assert_eq!(sanitized, text);

        let char_interval = chunk.char_interval(&tokenizer)
            .expect("Failed to get char interval");
        assert_eq!(char_interval.start_pos, Some(0));
        assert_eq!(char_interval.end_pos, Some(text.len()));
    }

    #[test]
    fn test_progressive_chunking() {
        let tokenizer = create_tokenizer();
        let text = "Short. Medium length sentence here. Very long sentence that might need to be broken up depending on buffer size.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 40, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should produce multiple chunks");

        println!("Debug: {} chunks created", chunks.len());
        for (i, chunk) in chunks.iter().enumerate() {
            let chunk_text = chunk.chunk_text(&tokenizer).expect("Failed to get chunk text");
            println!("Chunk {}: {:?} (interval: {:?})", i, chunk_text, chunk.token_interval);
        }

        let mut reconstructed = String::new();
        for chunk in &chunks {
            let chunk_text = chunk.chunk_text(&tokenizer)
                .expect("Failed to get chunk text");
            reconstructed.push_str(&chunk_text);
        }

        println!("Original: {:?}", text);
        println!("Reconstructed: {:?}", reconstructed);

        assert!(chunks.len() >= 2, "Should produce multiple chunks for long text");
    }

    #[test]
    fn test_chunk_without_document() {
        let tokenizer = create_tokenizer();
        let token_interval = crate::tokenizer::TokenInterval::new(0, 1)
            .expect("Failed to create token interval");
        let chunk = TokenChunk::new(token_interval, None);

        let result = chunk.chunk_text(&tokenizer);
        assert!(result.is_err(), "Should return error when no document is set");

        assert!(chunk.document_id().is_none());
        assert!(chunk.additional_context().is_none());
    }

    #[test]
    fn test_semantic_chunking_basic() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 1000,
            semantic_similarity_threshold: 0.7,
            ..Default::default()
        });

        let text = "Machine learning is a subset of artificial intelligence. It involves training algorithms on data to make predictions. Deep learning uses neural networks with multiple layers. Natural language processing helps computers understand human language.";
        let chunks = chunker.chunk_text(text, Some("test_doc".to_string())).unwrap();

        assert!(chunks.len() > 0, "Should create at least one chunk");
        assert!(chunks.len() <= 10, "Should not create too many chunks");

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.id, i);
            assert!(!chunk.text.is_empty());
            assert!(chunk.char_length > 0);
            assert_eq!(chunk.document_id, Some("test_doc".to_string()));
        }

        for i in 0..chunks.len() - 1 {
            let current_end = chunks[i].char_offset + chunks[i].char_length;
            let next_start = chunks[i + 1].char_offset;
            assert!(current_end <= next_start, "Chunks should not overlap");
        }
    }

    #[test]
    fn test_semantic_chunking_empty_text() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            ..Default::default()
        });

        let text = "";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "");
        assert_eq!(chunks[0].char_length, 0);
        assert_eq!(chunks[0].char_offset, 0);
    }

    #[test]
    fn test_semantic_chunking_small_text() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let text = "Short text that fits in one chunk.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, text);
        assert_eq!(chunks[0].char_offset, 0);
        assert_eq!(chunks[0].char_length, text.len());
    }

    #[test]
    fn test_semantic_chunking_with_max_chunks() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 500,
            semantic_similarity_threshold: 0.5,
            semantic_max_chunks: Some(3),
            ..Default::default()
        });

        let text = "This is a very long text about artificial intelligence and machine learning. It contains multiple paragraphs with different topics. The first paragraph discusses AI fundamentals. The second paragraph covers machine learning techniques. The third paragraph explores deep learning applications. The fourth paragraph examines natural language processing. This should create multiple semantic chunks that will need to be merged due to the max_chunks limit.";

        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() <= 3, "Should not exceed max_chunks limit: got {}, limit is 3", chunks.len());
        assert!(!chunks.is_empty(), "Should create at least one chunk");
    }

    #[test]
    fn test_semantic_chunking_similarity_threshold() {
        let text = "Python is a programming language. Java is also a programming language. The weather is nice today. I like to eat pizza. Programming involves writing code. Food is essential for life.";

        let low_threshold_chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 200,
            semantic_similarity_threshold: 0.3,
            ..Default::default()
        });

        let high_threshold_chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 200,
            semantic_similarity_threshold: 0.9,
            ..Default::default()
        });

        let low_threshold_chunks = low_threshold_chunker.chunk_text(text, None).unwrap();
        let high_threshold_chunks = high_threshold_chunker.chunk_text(text, None).unwrap();

        println!("Low threshold chunks: {}, High threshold chunks: {}",
            low_threshold_chunks.len(), high_threshold_chunks.len());

        assert!(!low_threshold_chunks.is_empty());
        assert!(!high_threshold_chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunking_preserves_text() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 100,
            semantic_similarity_threshold: 0.7,
            ..Default::default()
        });

        let text = "The quick brown fox jumps over the lazy dog. This is a test sentence. Machine learning is fascinating.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        let mut reconstructed = String::new();
        for chunk in &chunks {
            reconstructed.push_str(&chunk.text);
        }

        assert_eq!(reconstructed.trim(), text.trim());
    }

    #[test]
    fn test_semantic_chunking_error_handling() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 10,
            semantic_similarity_threshold: 2.0,
            ..Default::default()
        });

        let text = "This is a test text for semantic chunking error handling.";
        let result = chunker.chunk_text(text, None);

        match result {
            Ok(chunks) => {
                assert!(!chunks.is_empty());
                for chunk in chunks {
                    assert!(!chunk.text.is_empty());
                }
            }
            Err(e) => {
                println!("Expected error occurred: {}", e);
            }
        }
    }

    #[test]
    fn test_semantic_vs_fixed_size_chunking() {
        let text = "Natural language processing is a field of artificial intelligence. It focuses on the interaction between computers and human language. Machine learning algorithms power many NLP applications. Deep learning has revolutionized computer vision and NLP.";

        let semantic_chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 150,
            semantic_similarity_threshold: 0.7,
            ..Default::default()
        });

        #[allow(deprecated)]
        let fixed_chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::FixedSize,
            max_chunk_size: 150,
            ..Default::default()
        });

        let semantic_chunks = semantic_chunker.chunk_text(text, None).unwrap();
        let fixed_chunks = fixed_chunker.chunk_text(text, None).unwrap();

        println!("Semantic chunks: {}, Fixed chunks: {}", semantic_chunks.len(), fixed_chunks.len());
        println!("Text length: {}", text.len());

        assert!(!semantic_chunks.is_empty());
        assert!(!fixed_chunks.is_empty());
    }

    #[test]
    fn test_semantic_chunking_integration() {
        let mut config = ChunkingConfig::default();
        config.strategy = ChunkingStrategy::Semantic;
        config.max_chunk_size = 100;

        let chunker = TextChunker::with_config(config);
        let text = "This is a test document. It has multiple sentences with different topics. The first sentence introduces the topic. The second sentence provides more details. The third sentence concludes the discussion.";

        let chunks = chunker.chunk_text(text, Some("integration_test".to_string())).unwrap();

        assert!(!chunks.is_empty());
        assert!(chunks.len() <= 10);

        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            assert!(chunk.char_length > 0);
            assert_eq!(chunk.document_id, Some("integration_test".to_string()));
        }

        for i in 0..chunks.len() - 1 {
            let current_end = chunks[i].char_offset + chunks[i].char_length;
            let next_start = chunks[i + 1].char_offset;
            assert!(current_end <= next_start, "Chunks should not overlap");
        }

        println!("✅ Semantic chunking integration test passed with {} chunks", chunks.len());
    }

    #[test]
    fn test_semantic_chunking_with_document_id() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            strategy: ChunkingStrategy::Semantic,
            max_chunk_size: 100,
            ..Default::default()
        });

        let text = "This is a test document with multiple sentences. Each sentence should be processed correctly. The document ID should be preserved.";
        let document_id = Some("doc_123".to_string());
        let chunks = chunker.chunk_text(text, document_id.clone()).unwrap();

        for chunk in &chunks {
            assert_eq!(chunk.document_id, document_id);
        }
    }
}