use crate::{
    data::{AnnotatedDocument, Document, Extraction, CharInterval},
    exceptions::LangExtractResult,
    tokenizer::{TokenInterval, TokenizedText, Tokenizer, SentenceIterator},
};
use regex::Regex;

/// Strategy used to split a document into chunks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Fixed-size chunks with optional overlap between neighbors.
    FixedSize,
    /// Chunks aligned to sentence boundaries.
    Sentence,
    /// Chunks aligned to paragraph boundaries.
    Paragraph,
    /// Try paragraphs first, then sentences, then fall back to fixed-size.
    Adaptive,
}

/// A chunk of text with position metadata for extraction processing.
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// Sequential identifier of this chunk within the document.
    pub id: usize,
    /// The chunk text, including any overlap regions.
    pub text: String,
    /// Offset of the chunk within the original text.
    pub char_offset: usize,
    /// Length of the chunk text.
    pub char_length: usize,
    /// Identifier of the source document, if any.
    pub document_id: Option<String>,
    /// Whether this chunk overlaps with its neighbors.
    pub has_overlap: bool,
    /// Overlap sizes as `(start_overlap, end_overlap)`, if any.
    pub overlap_info: Option<(usize, usize)>,
}

impl TextChunk {
    /// Creates a chunk with no overlap.
    pub fn new(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: false,
            overlap_info: None,
        }
    }

    /// Creates a chunk that records how much of its start and end overlaps
    /// with the neighboring chunks.
    pub fn with_overlap(
        id: usize,
        text: String,
        char_offset: usize,
        document_id: Option<String>,
        overlap_start: usize,
        overlap_end: usize,
    ) -> Self {
        let char_length = text.len();
        Self {
            id,
            text,
            char_offset,
            char_length,
            document_id,
            has_overlap: overlap_start > 0 || overlap_end > 0,
            overlap_info: Some((overlap_start, overlap_end)),
        }
    }

    /// Returns the interval this chunk covers in the original text.
    pub fn char_interval(&self) -> CharInterval {
        CharInterval::new(
            Some(self.char_offset),
            Some(self.char_offset + self.char_length),
        )
    }

    /// Returns the chunk text with any overlap regions trimmed off.
    pub fn core_text(&self) -> &str {
        if let Some((start_overlap, end_overlap)) = self.overlap_info {
            let start = start_overlap;
            let end = self.text.len().saturating_sub(end_overlap);
            &self.text[start..end]
        } else {
            &self.text
        }
    }
}
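
// Illustrative sketch of the overlap bookkeeping above (not additional API):
// `core_text()` strips the declared overlap from both ends of `text`, so
//
//     let chunk = TextChunk::with_overlap(0, "abcdefgh".to_string(), 0, None, 2, 3);
//     assert!(chunk.has_overlap);
//     assert_eq!(chunk.core_text(), "cde");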

/// A chunk expressed as an interval over the tokens of a tokenized document.
#[derive(Debug, Clone)]
pub struct TokenChunk {
    /// Token interval covered by this chunk.
    pub token_interval: TokenInterval,
    /// Source document, if available.
    pub document: Option<Document>,
    // Lazily cached values (currently recomputed on each call).
    chunk_text: Option<String>,
    sanitized_chunk_text: Option<String>,
    char_interval: Option<CharInterval>,
    /// Optional custom character end position, used for adjacent chunks.
    custom_char_end: Option<usize>,
}

impl TokenChunk {
    /// Creates a new token chunk for the given interval.
    pub fn new(token_interval: TokenInterval, document: Option<Document>) -> Self {
        Self {
            token_interval,
            document,
            chunk_text: None,
            sanitized_chunk_text: None,
            char_interval: None,
            custom_char_end: None,
        }
    }

    /// Creates a token chunk whose text extends to a custom character end position.
    pub fn with_char_end(token_interval: TokenInterval, document: Option<Document>, char_end: usize) -> Self {
        Self {
            token_interval,
            document,
            chunk_text: None,
            sanitized_chunk_text: None,
            char_interval: None,
            custom_char_end: Some(char_end),
        }
    }

    /// Returns the identifier of the source document, if any.
    pub fn document_id(&self) -> Option<&str> {
        self.document.as_ref()?.document_id.as_deref()
    }

    /// Returns the tokenized document text. The tokenization is not cached on
    /// the chunk, so this currently always returns `None`; callers should
    /// re-tokenize via a `Tokenizer` when they need token data.
    pub fn document_text(&self) -> Option<&TokenizedText> {
        None
    }

    /// Returns the text covered by this chunk.
    pub fn chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
        if let Some(ref cached) = self.chunk_text {
            return Ok(cached.clone());
        }

        if let Some(ref document) = self.document {
            let tokenized = tokenizer.tokenize(&document.text)?;

            // If a custom end position was set, slice from the first token's
            // start up to that position instead of the last token's end.
            if let Some(custom_end) = self.custom_char_end {
                if !tokenized.tokens.is_empty() && self.token_interval.start_index < tokenized.tokens.len() {
                    let start_token = &tokenized.tokens[self.token_interval.start_index];
                    let start_char = start_token.char_interval.start_pos;
                    let end_char = std::cmp::min(custom_end, document.text.len());
                    return Ok(document.text[start_char..end_char].to_string());
                }
            }

            let text = tokenizer.tokens_text(&tokenized, &self.token_interval)?;
            Ok(text)
        } else {
            Err(crate::exceptions::LangExtractError::invalid_input(
                "Document text must be set to access chunk text"
            ))
        }
    }

    /// Returns the chunk text with whitespace collapsed and trimmed.
    pub fn sanitized_chunk_text(&self, tokenizer: &Tokenizer) -> LangExtractResult<String> {
        let text = self.chunk_text(tokenizer)?;
        sanitize_text(&text)
    }

    /// Returns additional context attached to the source document, if any.
    pub fn additional_context(&self) -> Option<&str> {
        self.document.as_ref()?.additional_context.as_deref()
    }

    /// Computes the character interval this chunk covers in the document text.
    pub fn char_interval(&self, tokenizer: &Tokenizer) -> LangExtractResult<CharInterval> {
        if let Some(ref cached) = self.char_interval {
            return Ok(cached.clone());
        }

        if let Some(ref document) = self.document {
            let tokenized = tokenizer.tokenize(&document.text)?;
            let tokens = &tokenized.tokens;

            if self.token_interval.start_index >= tokens.len()
                || self.token_interval.end_index > tokens.len() {
                return Err(crate::exceptions::LangExtractError::invalid_input(
                    "Token interval is out of bounds for the document"
                ));
            }

            let start_token = &tokens[self.token_interval.start_index];
            let end_token = &tokens[self.token_interval.end_index - 1];

            Ok(CharInterval {
                start_pos: Some(start_token.char_interval.start_pos),
                end_pos: Some(end_token.char_interval.end_pos),
            })
        } else {
            Err(crate::exceptions::LangExtractError::invalid_input(
                "Document text must be set to compute char interval"
            ))
        }
    }
}

/// Collapses runs of whitespace into single spaces and trims the ends.
/// Returns an error if the result is empty.
fn sanitize_text(text: &str) -> LangExtractResult<String> {
    let sanitized = Regex::new(r"\s+")
        .map_err(|e| crate::exceptions::LangExtractError::configuration(format!("Regex error: {}", e)))?
        .replace_all(text.trim(), " ")
        .to_string();

    if sanitized.is_empty() {
        return Err(crate::exceptions::LangExtractError::invalid_input("Sanitized text is empty"));
    }

    Ok(sanitized)
}

/// Configuration for the [`TextChunker`].
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Maximum allowed chunk size.
    pub max_chunk_size: usize,
    /// Amount of text shared between adjacent fixed-size chunks.
    pub overlap_size: usize,
    /// Chunking strategy to apply.
    pub strategy: ChunkingStrategy,
    /// Minimum chunk size; smaller pieces are accumulated toward the next boundary.
    pub min_chunk_size: usize,
    /// Whether adaptive chunking should try paragraph boundaries first.
    pub respect_paragraphs: bool,
    /// Whether adaptive chunking should fall back to sentence boundaries.
    pub respect_sentences: bool,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            max_chunk_size: 2000,
            overlap_size: 200,
            strategy: ChunkingStrategy::Adaptive,
            min_chunk_size: 100,
            respect_paragraphs: true,
            respect_sentences: true,
        }
    }
}
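
// Illustrative configuration sketch: override a few fields and keep the rest
// of the defaults via struct-update syntax (the values shown are arbitrary).
//
//     let config = ChunkingConfig {
//         max_chunk_size: 500,
//         overlap_size: 50,
//         strategy: ChunkingStrategy::Sentence,
//         ..Default::default()
//     };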

/// Splits text into [`TextChunk`]s according to a [`ChunkingConfig`].
pub struct TextChunker {
    config: ChunkingConfig,
    sentence_regex: Regex,
    paragraph_regex: Regex,
}

impl TextChunker {
    /// Creates a chunker with the default configuration.
    pub fn new() -> Self {
        Self::with_config(ChunkingConfig::default())
    }

    /// Creates a chunker with the given configuration.
    pub fn with_config(config: ChunkingConfig) -> Self {
        // Known-valid patterns; `unwrap` cannot panic here.
        let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
        let paragraph_regex = Regex::new(r"\n\s*\n").unwrap();

        Self {
            config,
            sentence_regex,
            paragraph_regex,
        }
    }

    /// Chunks a document's text.
    pub fn chunk_document(&self, document: &Document) -> LangExtractResult<Vec<TextChunk>> {
        self.chunk_text(&document.text, document.document_id.clone())
    }

    /// Chunks raw text, returning a single chunk if it already fits within the limit.
    pub fn chunk_text(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        if text.len() <= self.config.max_chunk_size {
            return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
        }

        match self.config.strategy {
            ChunkingStrategy::FixedSize => self.chunk_fixed_size(text, document_id),
            ChunkingStrategy::Sentence => self.chunk_by_sentences(text, document_id),
            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text, document_id),
            ChunkingStrategy::Adaptive => self.chunk_adaptive(text, document_id),
        }
    }

    fn chunk_fixed_size(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_pos = 0;

        while current_pos < text.len() {
            let chunk_end = std::cmp::min(
                current_pos + self.config.max_chunk_size,
                text.len()
            );

            let chunk_text = text[current_pos..chunk_end].to_string();

            let overlap_start = if chunk_id > 0 { self.config.overlap_size } else { 0 };
            let overlap_end = if chunk_end < text.len() { self.config.overlap_size } else { 0 };

            let chunk = TextChunk::with_overlap(
                chunk_id,
                chunk_text,
                current_pos,
                document_id.clone(),
                overlap_start,
                overlap_end,
            );

            chunks.push(chunk);
            chunk_id += 1;

            // Advance by the chunk size minus the overlap so neighbors share text.
            let step_size = self.config.max_chunk_size.saturating_sub(self.config.overlap_size);
            current_pos += step_size;
        }

        Ok(chunks)
    }

    fn chunk_by_sentences(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let sentence_boundaries = self.find_sentence_boundaries(text);
        self.chunk_by_boundaries(text, &sentence_boundaries, document_id)
    }

    fn chunk_by_paragraphs(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        let paragraph_boundaries = self.find_paragraph_boundaries(text);
        self.chunk_by_boundaries(text, &paragraph_boundaries, document_id)
    }

    fn chunk_adaptive(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
        // Try paragraph boundaries first.
        let paragraph_boundaries = self.find_paragraph_boundaries(text);
        if !paragraph_boundaries.is_empty() && self.config.respect_paragraphs {
            if let Ok(chunks) = self.chunk_by_boundaries(text, &paragraph_boundaries, document_id.clone()) {
                let oversized_chunks: Vec<_> = chunks.iter()
                    .filter(|c| c.char_length > self.config.max_chunk_size)
                    .collect();

                if oversized_chunks.is_empty() {
                    return Ok(chunks);
                }
            }
        }

        // Fall back to sentence boundaries.
        if self.config.respect_sentences {
            let sentence_boundaries = self.find_sentence_boundaries(text);
            if let Ok(chunks) = self.chunk_by_boundaries(text, &sentence_boundaries, document_id.clone()) {
                let oversized_chunks: Vec<_> = chunks.iter()
                    .filter(|c| c.char_length > self.config.max_chunk_size)
                    .collect();

                if oversized_chunks.is_empty() {
                    return Ok(chunks);
                }
            }
        }

        // Last resort: fixed-size chunking.
        self.chunk_fixed_size(text, document_id)
    }

    fn find_sentence_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0];

        for mat in self.sentence_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len());
        }

        boundaries
    }

    fn find_paragraph_boundaries(&self, text: &str) -> Vec<usize> {
        let mut boundaries = vec![0];

        for mat in self.paragraph_regex.find_iter(text) {
            boundaries.push(mat.end());
        }

        if boundaries.last() != Some(&text.len()) {
            boundaries.push(text.len());
        }

        boundaries
    }

    fn chunk_by_boundaries(
        &self,
        text: &str,
        boundaries: &[usize],
        document_id: Option<String>,
    ) -> LangExtractResult<Vec<TextChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_id = 0;
        let mut current_start = 0;

        for &boundary in boundaries.iter().skip(1) {
            let potential_chunk_size = boundary - current_start;

            if potential_chunk_size <= self.config.max_chunk_size {
                // Emit the piece once it is large enough (or it is the first chunk);
                // otherwise keep accumulating toward the next boundary.
                if potential_chunk_size >= self.config.min_chunk_size || chunks.is_empty() {
                    let chunk_text = text[current_start..boundary].to_string();
                    let chunk = TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone());
                    chunks.push(chunk);
                    chunk_id += 1;
                    current_start = boundary;
                }
            } else {
                // The section between boundaries is too large: fall back to
                // fixed-size chunking within it.
                let section = &text[current_start..boundary];
                let mut section_chunks = self.chunk_fixed_size(section, document_id.clone())?;

                for chunk in &mut section_chunks {
                    chunk.id = chunk_id;
                    chunk.char_offset += current_start;
                    chunk_id += 1;
                }

                chunks.extend(section_chunks);
                current_start = boundary;
            }
        }

        if chunks.is_empty() {
            chunks.push(TextChunk::new(0, text.to_string(), 0, document_id));
        }

        Ok(chunks)
    }

    /// Returns the chunker's configuration.
    pub fn config(&self) -> &ChunkingConfig {
        &self.config
    }
}

impl Default for TextChunker {
    fn default() -> Self {
        Self::new()
    }
}
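
// Illustrative end-to-end sketch (error handling elided); `long_text` stands
// for any input longer than `max_chunk_size`:
//
//     let chunker = TextChunker::with_config(ChunkingConfig {
//         max_chunk_size: 1000,
//         strategy: ChunkingStrategy::Adaptive,
//         ..Default::default()
//     });
//     let chunks = chunker.chunk_text(long_text, Some("doc-1".to_string()))?;
//     for chunk in &chunks {
//         println!("{} @ {}..{}", chunk.id, chunk.char_offset, chunk.char_offset + chunk.char_length);
//     }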

/// Iterator that yields [`TokenChunk`]s from tokenized text, packing as many
/// whole sentences as fit within `max_char_buffer` and breaking long
/// sentences when necessary.
pub struct ChunkIterator<'a> {
    tokenized_text: &'a TokenizedText,
    tokenizer: &'a Tokenizer,
    max_char_buffer: usize,
    sentence_iter: SentenceIterator<'a>,
    broken_sentence: bool,
    document: Option<&'a Document>,
    next_chunk_start_char: Option<usize>,
}

impl<'a> ChunkIterator<'a> {
    /// Creates a chunk iterator over the given tokenized text.
    pub fn new(
        text: &'a TokenizedText,
        tokenizer: &'a Tokenizer,
        max_char_buffer: usize,
        document: Option<&'a Document>,
    ) -> LangExtractResult<Self> {
        let sentence_iter = SentenceIterator::new(text, tokenizer, 0)?;

        Ok(Self {
            tokenized_text: text,
            tokenizer,
            max_char_buffer,
            sentence_iter,
            broken_sentence: false,
            document,
            next_chunk_start_char: Some(0),
        })
    }

    /// Returns whether the characters spanned by `token_interval` exceed the buffer.
    fn tokens_exceed_buffer(&self, token_interval: &TokenInterval) -> LangExtractResult<bool> {
        let char_interval = self.get_char_interval_for_tokens(token_interval)?;
        match (char_interval.start_pos, char_interval.end_pos) {
            (Some(start), Some(end)) => Ok((end - start) > self.max_char_buffer),
            _ => Ok(false),
        }
    }

    fn get_char_interval_for_tokens(&self, token_interval: &TokenInterval) -> LangExtractResult<CharInterval> {
        if token_interval.start_index >= self.tokenized_text.tokens.len()
            || token_interval.end_index > self.tokenized_text.tokens.len() {
            return Err(crate::exceptions::LangExtractError::invalid_input(
                "Token interval is out of bounds"
            ));
        }

        let start_token = &self.tokenized_text.tokens[token_interval.start_index];
        let end_token = &self.tokenized_text.tokens[token_interval.end_index - 1];

        Ok(CharInterval {
            start_pos: Some(start_token.char_interval.start_pos),
            end_pos: Some(end_token.char_interval.end_pos),
        })
    }

    /// Creates a chunk whose text extends to the start of the next chunk's
    /// first token, so adjacent chunks cover the text without gaps.
    fn create_adjacent_chunk(&self, token_interval: TokenInterval, next_chunk_start_token: Option<usize>) -> TokenChunk {
        if let Some(next_start) = next_chunk_start_token {
            if next_start < self.tokenized_text.tokens.len() {
                let next_token = &self.tokenized_text.tokens[next_start];
                let custom_end = next_token.char_interval.start_pos;
                return TokenChunk::with_char_end(token_interval, self.document.cloned(), custom_end);
            }
        }

        TokenChunk::new(token_interval, self.document.cloned())
    }
}

impl<'a> Iterator for ChunkIterator<'a> {
    type Item = LangExtractResult<TokenChunk>;

    fn next(&mut self) -> Option<Self::Item> {
        let sentence = match self.sentence_iter.next() {
            Some(Ok(sentence)) => sentence,
            Some(Err(e)) => return Some(Err(e)),
            None => return None,
        };

        // Start with a single-token chunk at the beginning of the sentence.
        let curr_chunk = match TokenInterval::new(
            sentence.start_index,
            sentence.start_index + 1
        ) {
            Ok(interval) => interval,
            Err(e) => return Some(Err(e)),
        };

        // If even one token exceeds the buffer, emit it as its own chunk.
        match self.tokens_exceed_buffer(&curr_chunk) {
            Ok(true) => {
                match SentenceIterator::new(
                    self.tokenized_text,
                    self.tokenizer,
                    sentence.start_index + 1,
                ) {
                    Ok(new_iter) => {
                        self.sentence_iter = new_iter;
                        self.broken_sentence = curr_chunk.end_index < sentence.end_index;
                    }
                    Err(e) => return Some(Err(e)),
                }

                return Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())));
            }
            Ok(false) => {}
            Err(e) => return Some(Err(e)),
        }

        // Grow the chunk token by token within the current sentence.
        let mut start_of_new_line = None;
        let mut curr_chunk = curr_chunk;

        for token_index in curr_chunk.start_index..sentence.end_index {
            if self.tokenized_text.tokens[token_index].first_token_after_newline {
                start_of_new_line = Some(token_index);
            }

            let test_chunk = match TokenInterval::new(curr_chunk.start_index, token_index + 1) {
                Ok(interval) => interval,
                Err(e) => return Some(Err(e)),
            };

            match self.tokens_exceed_buffer(&test_chunk) {
                Ok(true) => {
                    // Prefer to break at the most recent newline, if one was seen.
                    if let Some(newline_pos) = start_of_new_line {
                        if newline_pos > curr_chunk.start_index {
                            curr_chunk = match TokenInterval::new(curr_chunk.start_index, newline_pos) {
                                Ok(interval) => interval,
                                Err(e) => return Some(Err(e)),
                            };
                        }
                    }

                    match SentenceIterator::new(
                        self.tokenized_text,
                        self.tokenizer,
                        curr_chunk.end_index,
                    ) {
                        Ok(new_iter) => {
                            self.sentence_iter = new_iter;
                            self.broken_sentence = true;
                        }
                        Err(e) => return Some(Err(e)),
                    }

                    return Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())));
                }
                Ok(false) => {
                    curr_chunk = test_chunk;
                }
                Err(e) => return Some(Err(e)),
            }
        }

        // If the previous sentence was broken, stop here; otherwise greedily
        // append whole sentences while they still fit in the buffer.
        if self.broken_sentence {
            self.broken_sentence = false;
        } else {
            while let Some(next_sentence_result) = self.sentence_iter.next() {
                let next_sentence = match next_sentence_result {
                    Ok(sentence) => sentence,
                    Err(e) => return Some(Err(e)),
                };

                let test_chunk = match TokenInterval::new(curr_chunk.start_index, next_sentence.end_index) {
                    Ok(interval) => interval,
                    Err(e) => return Some(Err(e)),
                };

                match self.tokens_exceed_buffer(&test_chunk) {
                    Ok(true) => {
                        match SentenceIterator::new(
                            self.tokenized_text,
                            self.tokenizer,
                            curr_chunk.end_index,
                        ) {
                            Ok(new_iter) => {
                                self.sentence_iter = new_iter;
                            }
                            Err(e) => return Some(Err(e)),
                        }
                        break;
                    }
                    Ok(false) => {
                        curr_chunk = test_chunk;
                    }
                    Err(e) => return Some(Err(e)),
                }
            }
        }

        Some(Ok(TokenChunk::new(curr_chunk, self.document.cloned())))
    }
}
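
// Illustrative sketch of driving the iterator (mirrors the unit tests below);
// `document` stands for any `Document` value and 200 is an arbitrary buffer size:
//
//     let tokenizer = Tokenizer::new()?;
//     let tokenized = tokenizer.tokenize(&document.text)?;
//     let chunks: Result<Vec<_>, _> =
//         ChunkIterator::new(&tokenized, &tokenizer, 200, Some(&document))?.collect();
//     for chunk in chunks? {
//         let text = chunk.chunk_text(&tokenizer)?;
//         // feed `text` to the extraction step...
//     }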

/// Aggregates per-chunk extraction results back into a single annotated document.
pub struct ResultAggregator {
    /// Minimum word-level Jaccard similarity for two extractions to count as duplicates.
    similarity_threshold: f32,
    /// Whether to deduplicate similar extractions from overlapping chunks.
    merge_overlaps: bool,
}

impl ResultAggregator {
    /// Creates an aggregator with default settings.
    pub fn new() -> Self {
        Self {
            similarity_threshold: 0.8,
            merge_overlaps: true,
        }
    }

    /// Creates an aggregator with custom settings.
    pub fn with_settings(similarity_threshold: f32, merge_overlaps: bool) -> Self {
        Self {
            similarity_threshold,
            merge_overlaps,
        }
    }

    /// Combines chunk results into an [`AnnotatedDocument`], optionally
    /// deduplicating similar extractions.
    pub fn aggregate_chunk_results(
        &self,
        chunk_results: Vec<ChunkResult>,
        original_text: String,
        document_id: Option<String>,
    ) -> LangExtractResult<AnnotatedDocument> {
        let mut all_extractions = Vec::new();

        for chunk_result in chunk_results {
            if let Some(extractions) = chunk_result.extractions {
                all_extractions.extend(extractions);
            }
        }

        let deduplicated_extractions = if self.merge_overlaps {
            self.deduplicate_extractions(all_extractions)?
        } else {
            all_extractions
        };

        let mut annotated_doc = AnnotatedDocument::with_extractions(deduplicated_extractions, original_text);
        annotated_doc.document_id = document_id;

        Ok(annotated_doc)
    }

    fn deduplicate_extractions(&self, extractions: Vec<Extraction>) -> LangExtractResult<Vec<Extraction>> {
        let mut unique_extractions = Vec::new();

        for extraction in extractions {
            let mut is_duplicate = false;

            for existing in &unique_extractions {
                if self.are_similar_extractions(&extraction, existing) {
                    is_duplicate = true;
                    break;
                }
            }

            if !is_duplicate {
                unique_extractions.push(extraction);
            }
        }

        Ok(unique_extractions)
    }

    fn are_similar_extractions(&self, e1: &Extraction, e2: &Extraction) -> bool {
        // Same class: compare the extracted text directly.
        if e1.extraction_class == e2.extraction_class {
            let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
            return similarity >= self.similarity_threshold;
        }

        // Different classes: only treat them as duplicates if their character
        // intervals overlap and the text is similar.
        if let (Some(interval1), Some(interval2)) = (&e1.char_interval, &e2.char_interval) {
            if interval1.overlaps_with(interval2) {
                let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
                return similarity >= self.similarity_threshold;
            }
        }

        false
    }

    /// Jaccard similarity over whitespace-separated words.
    fn text_similarity(&self, text1: &str, text2: &str) -> f32 {
        if text1 == text2 {
            return 1.0;
        }

        let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
        let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();

        if words1.is_empty() && words2.is_empty() {
            return 1.0;
        }

        let intersection = words1.intersection(&words2).count();
        let union = words1.union(&words2).count();

        if union == 0 {
            0.0
        } else {
            intersection as f32 / union as f32
        }
    }
}

impl Default for ResultAggregator {
    fn default() -> Self {
        Self::new()
    }
}

/// The result of processing a single chunk.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// Identifier of the chunk this result came from.
    pub chunk_id: usize,
    /// Extractions produced for the chunk, if processing succeeded.
    pub extractions: Option<Vec<Extraction>>,
    /// Offset of the chunk in the original text.
    pub char_offset: usize,
    /// Length of the chunk text.
    pub char_length: usize,
    /// Whether processing succeeded.
    pub success: bool,
    /// Error message if processing failed.
    pub error: Option<String>,
    /// Time spent processing the chunk, if measured.
    pub processing_time: Option<std::time::Duration>,
}

impl ChunkResult {
    /// Creates a successful chunk result.
    pub fn success(
        chunk_id: usize,
        extractions: Vec<Extraction>,
        char_offset: usize,
        char_length: usize,
    ) -> Self {
        Self {
            chunk_id,
            extractions: Some(extractions),
            char_offset,
            char_length,
            success: true,
            error: None,
            processing_time: None,
        }
    }

    /// Creates a failed chunk result.
    pub fn failure(
        chunk_id: usize,
        char_offset: usize,
        char_length: usize,
        error: String,
    ) -> Self {
        Self {
            chunk_id,
            extractions: None,
            char_offset,
            char_length,
            success: false,
            error: Some(error),
            processing_time: None,
        }
    }

    /// Attaches the processing duration to this result.
    pub fn with_processing_time(mut self, duration: std::time::Duration) -> Self {
        self.processing_time = Some(duration);
        self
    }
}
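
// Illustrative aggregation sketch (error handling elided); `per_chunk_extractions`
// and `full_text` are placeholders for values produced by the extraction step:
//
//     let results: Vec<ChunkResult> = chunks.iter()
//         .zip(per_chunk_extractions)
//         .map(|(chunk, extractions)| {
//             ChunkResult::success(chunk.id, extractions, chunk.char_offset, chunk.char_length)
//         })
//         .collect();
//     let aggregator = ResultAggregator::new();
//     let annotated = aggregator.aggregate_chunk_results(results, full_text, None)?;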

#[cfg(test)]
mod tests {
    use super::*;
    use crate::tokenizer::Tokenizer;

    fn create_tokenizer() -> Tokenizer {
        Tokenizer::new().expect("Failed to create tokenizer")
    }

    fn create_document(text: &str) -> Document {
        Document::new(text.to_string())
    }

    #[test]
    fn test_fixed_size_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 20,
            overlap_size: 5,
            strategy: ChunkingStrategy::FixedSize,
            ..Default::default()
        });

        let text = "This is a test document with some text that needs to be chunked into smaller pieces.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.char_length <= 20);
        }
    }

    #[test]
    fn test_sentence_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 50,
            strategy: ChunkingStrategy::Sentence,
            ..Default::default()
        });

        let text = "First sentence. Second sentence! Third sentence? Fourth sentence.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 0);
        for chunk in &chunks {
            println!("Chunk: '{}'", chunk.text);
        }
    }

    #[test]
    fn test_small_text_no_chunking() {
        let chunker = TextChunker::new();
        let text = "Short text.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, text);
    }

    #[test]
    fn test_chunk_char_interval() {
        let chunk = TextChunk::new(0, "test".to_string(), 10, None);
        let interval = chunk.char_interval();

        assert_eq!(interval.start_pos, Some(10));
        assert_eq!(interval.end_pos, Some(14));
    }

    #[test]
    fn test_chunk_with_overlap() {
        let chunk = TextChunk::with_overlap(
            0,
            "overlap test text".to_string(),
            0,
            None,
            3,
            4,
        );

        assert!(chunk.has_overlap);
        assert_eq!(chunk.overlap_info, Some((3, 4)));
        assert_eq!(chunk.core_text(), "rlap test ");
    }
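
    // Minimal check of the `sanitize_text` helper defined above: runs of
    // whitespace collapse to single spaces and whitespace-only input is rejected.
    #[test]
    fn test_sanitize_text_collapses_whitespace() {
        assert_eq!(sanitize_text("  hello \n\t world ").unwrap(), "hello world");
        assert!(sanitize_text("   \n\t ").is_err());
    }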

    #[test]
    fn test_multi_sentence_chunk() {
        // Two short sentences should fit together within a 50-character buffer.
        let tokenizer = create_tokenizer();
        let text = "This is a sentence. This is a longer sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 50, Some(&document))
            .expect("Failed to create chunk iterator");

        let first_chunk = chunk_iter.next()
            .expect("Should have a chunk")
            .expect("Chunk creation should succeed");

        let chunk_text = first_chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");

        assert!(chunk_text.contains("This is a sentence."));
        assert!(chunk_text.contains("This is a longer sentence."));
    }

    #[test]
    fn test_sentence_breaking() {
        // A single sentence longer than the buffer must be split across chunks.
        let tokenizer = create_tokenizer();
        let text = "This is a very long sentence that definitely exceeds the buffer.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 20, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break long sentence into multiple chunks");

        for chunk in &chunks {
            let chunk_text = chunk.chunk_text(&tokenizer)
                .expect("Failed to get chunk text");
            assert!(chunk_text.len() <= 25, "Chunk should not vastly exceed buffer: '{}'", chunk_text);
        }
    }

    #[test]
    fn test_oversized_token() {
        // A token longer than the buffer is emitted as its own chunk.
        let tokenizer = create_tokenizer();
        let text = "Short antidisestablishmentarianism word.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 10, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break into multiple chunks");

        let long_word_chunk = chunks.iter().find(|chunk| {
            chunk.chunk_text(&tokenizer)
                .map(|text| text.contains("antidisestablishmentarianism"))
                .unwrap_or(false)
        });

        assert!(long_word_chunk.is_some(), "Should find chunk containing the long word");
    }

    #[test]
    fn test_newline_preference_for_breaking() {
        // When a sentence must be broken, the break should favor a newline boundary.
        let tokenizer = create_tokenizer();
        let text = "First part of sentence\nSecond part of sentence continues here";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 25, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should break into multiple chunks");

        let first_chunk_text = chunks[0].chunk_text(&tokenizer)
            .expect("Failed to get first chunk text");

        assert!(!first_chunk_text.contains("continues"),
            "First chunk should not contain text after newline: '{}'", first_chunk_text);
    }

    #[test]
    fn test_empty_text_handling() {
        let tokenizer = create_tokenizer();
        let text = "";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
            .expect("Failed to create chunk iterator");

        let result = chunk_iter.next();
        assert!(result.is_none(), "Empty text should produce no chunks");
    }

    #[test]
    fn test_single_sentence_chunk() {
        let tokenizer = create_tokenizer();
        let text = "Short sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let mut chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 100, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunk = chunk_iter.next()
            .expect("Should have a chunk")
            .expect("Chunk creation should succeed");

        let chunk_text = chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");

        assert_eq!(chunk_text, text);

        assert!(chunk_iter.next().is_none(), "Should have only one chunk");
    }

    #[test]
    fn test_token_chunk_properties() {
        let tokenizer = create_tokenizer();
        let text = "Test sentence.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let token_interval = crate::tokenizer::TokenInterval::new(0, tokenized.tokens.len())
            .expect("Failed to create token interval");
        let chunk = TokenChunk::new(token_interval, Some(document));

        let chunk_text = chunk.chunk_text(&tokenizer)
            .expect("Failed to get chunk text");
        assert_eq!(chunk_text, text);

        let sanitized = chunk.sanitized_chunk_text(&tokenizer)
            .expect("Failed to get sanitized text");
        assert_eq!(sanitized, text);

        let char_interval = chunk.char_interval(&tokenizer)
            .expect("Failed to get char interval");
        assert_eq!(char_interval.start_pos, Some(0));
        assert_eq!(char_interval.end_pos, Some(text.len()));
    }

    #[test]
    fn test_progressive_chunking() {
        let tokenizer = create_tokenizer();
        let text = "Short. Medium length sentence here. Very long sentence that might need to be broken up depending on buffer size.";
        let tokenized = tokenizer.tokenize(text).expect("Tokenization failed");
        let document = create_document(text);

        let chunk_iter = ChunkIterator::new(&tokenized, &tokenizer, 40, Some(&document))
            .expect("Failed to create chunk iterator");

        let chunks: Result<Vec<_>, _> = chunk_iter.collect();
        let chunks = chunks.expect("Chunk iteration should succeed");

        assert!(chunks.len() > 1, "Should produce multiple chunks");

        println!("Debug: {} chunks created", chunks.len());
        for (i, chunk) in chunks.iter().enumerate() {
            let chunk_text = chunk.chunk_text(&tokenizer).expect("Failed to get chunk text");
            println!("Chunk {}: {:?} (interval: {:?})", i, chunk_text, chunk.token_interval);
        }

        // Reconstruct the text from the chunks for manual inspection.
        let mut reconstructed = String::new();
        for chunk in &chunks {
            let chunk_text = chunk.chunk_text(&tokenizer)
                .expect("Failed to get chunk text");
            reconstructed.push_str(&chunk_text);
        }

        println!("Original: {:?}", text);
        println!("Reconstructed: {:?}", reconstructed);

        assert!(chunks.len() >= 2, "Should produce multiple chunks for long text");
    }

    #[test]
    fn test_chunk_without_document() {
        let tokenizer = create_tokenizer();
        let token_interval = crate::tokenizer::TokenInterval::new(0, 1)
            .expect("Failed to create token interval");
        let chunk = TokenChunk::new(token_interval, None);

        let result = chunk.chunk_text(&tokenizer);
        assert!(result.is_err(), "Should return error when no document is set");

        assert!(chunk.document_id().is_none());
        assert!(chunk.additional_context().is_none());
    }
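
    // Illustrative additional test: paragraph-based chunking should record
    // offsets that map each chunk back onto the original text. Sizes are
    // chosen so each paragraph fits within `max_chunk_size` on its own.
    #[test]
    fn test_paragraph_chunk_offsets() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 30,
            min_chunk_size: 5,
            strategy: ChunkingStrategy::Paragraph,
            ..Default::default()
        });

        let text = "First paragraph here.\n\nSecond paragraph here.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() >= 2, "Should split on the paragraph break");
        for chunk in &chunks {
            let slice = &text[chunk.char_offset..chunk.char_offset + chunk.char_length];
            assert_eq!(slice, chunk.text);
        }
    }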
}