1use crate::{
9 data::{AnnotatedDocument, CharInterval, Document, Extraction},
10 exceptions::LangExtractResult,
11};
12use regex::Regex;
13
/// Strategy used to split a long text into chunks.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Split into fixed-size windows (with configurable overlap).
    FixedSize,
    /// Split at sentence boundaries.
    Sentence,
    /// Split at paragraph boundaries (blank lines).
    Paragraph,
    /// Try paragraph boundaries first, then sentence boundaries, and fall
    /// back to fixed-size windows when chunks would exceed the size limit.
    Adaptive,
}
26
/// A contiguous piece of a larger text, with the bookkeeping needed to map
/// results found inside the chunk back onto the original document.
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// Sequential chunk identifier (0-based within one chunking run).
    pub id: usize,
    /// The chunk's text content.
    pub text: String,
    /// Offset of this chunk within the original text, as reported by the
    /// chunker (byte-based: derived from `str::len` arithmetic despite the
    /// `char_` name).
    pub char_offset: usize,
    /// Length of `text` in bytes (`text.len()`), despite the `char_` name.
    pub char_length: usize,
    /// Identifier of the source document, if any.
    pub document_id: Option<String>,
    /// Whether this chunk shares text with a neighboring chunk.
    pub has_overlap: bool,
    /// `(leading, trailing)` overlap sizes recorded by `with_overlap`.
    pub overlap_info: Option<(usize, usize)>,
}
45
46impl TextChunk {
47 pub fn new(
49 id: usize,
50 text: String,
51 char_offset: usize,
52 document_id: Option<String>,
53 ) -> Self {
54 let char_length = text.len();
55 Self {
56 id,
57 text,
58 char_offset,
59 char_length,
60 document_id,
61 has_overlap: false,
62 overlap_info: None,
63 }
64 }
65
66 pub fn with_overlap(
68 id: usize,
69 text: String,
70 char_offset: usize,
71 document_id: Option<String>,
72 overlap_start: usize,
73 overlap_end: usize,
74 ) -> Self {
75 let char_length = text.len();
76 Self {
77 id,
78 text,
79 char_offset,
80 char_length,
81 document_id,
82 has_overlap: overlap_start > 0 || overlap_end > 0,
83 overlap_info: Some((overlap_start, overlap_end)),
84 }
85 }
86
87 pub fn char_interval(&self) -> CharInterval {
89 CharInterval::new(
90 Some(self.char_offset),
91 Some(self.char_offset + self.char_length),
92 )
93 }
94
95 pub fn core_text(&self) -> &str {
97 if let Some((start_overlap, end_overlap)) = self.overlap_info {
98 let start = start_overlap;
99 let end = self.text.len().saturating_sub(end_overlap);
100 &self.text[start..end]
101 } else {
102 &self.text
103 }
104 }
105}
106
/// Tuning knobs for [`TextChunker`].
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Maximum chunk size; texts larger than this are split.
    pub max_chunk_size: usize,
    /// Overlap between consecutive fixed-size chunks.
    pub overlap_size: usize,
    /// Which chunking strategy to apply.
    pub strategy: ChunkingStrategy,
    /// Boundary-based sections below this size are accumulated with the
    /// following section rather than emitted on their own.
    pub min_chunk_size: usize,
    /// Prefer keeping paragraphs intact (consulted by the adaptive strategy).
    pub respect_paragraphs: bool,
    /// Prefer keeping sentences intact (consulted by the adaptive strategy).
    pub respect_sentences: bool,
}
123
124impl Default for ChunkingConfig {
125 fn default() -> Self {
126 Self {
127 max_chunk_size: 2000,
128 overlap_size: 200,
129 strategy: ChunkingStrategy::Adaptive,
130 min_chunk_size: 100,
131 respect_paragraphs: true,
132 respect_sentences: true,
133 }
134 }
135}
136
/// Splits text into [`TextChunk`]s according to a [`ChunkingConfig`].
pub struct TextChunker {
    /// Active chunking configuration.
    config: ChunkingConfig,
    /// Matches sentence-ending punctuation followed by whitespace.
    sentence_regex: Regex,
    /// Matches blank-line paragraph separators.
    paragraph_regex: Regex,
}
143
144impl TextChunker {
145 pub fn new() -> Self {
147 Self::with_config(ChunkingConfig::default())
148 }
149
150 pub fn with_config(config: ChunkingConfig) -> Self {
152 let sentence_regex = Regex::new(r"[.!?]+\s+").unwrap();
154
155 let paragraph_regex = Regex::new(r"\n\s*\n").unwrap();
157
158 Self {
159 config,
160 sentence_regex,
161 paragraph_regex,
162 }
163 }
164
165 pub fn chunk_document(&self, document: &Document) -> LangExtractResult<Vec<TextChunk>> {
167 self.chunk_text(&document.text, document.document_id.clone())
168 }
169
170 pub fn chunk_text(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
172 if text.len() <= self.config.max_chunk_size {
173 return Ok(vec![TextChunk::new(0, text.to_string(), 0, document_id)]);
175 }
176
177 match self.config.strategy {
178 ChunkingStrategy::FixedSize => self.chunk_fixed_size(text, document_id),
179 ChunkingStrategy::Sentence => self.chunk_by_sentences(text, document_id),
180 ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text, document_id),
181 ChunkingStrategy::Adaptive => self.chunk_adaptive(text, document_id),
182 }
183 }
184
185 fn chunk_fixed_size(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
187 let mut chunks = Vec::new();
188 let mut chunk_id = 0;
189 let mut current_pos = 0;
190
191 while current_pos < text.len() {
192 let chunk_end = std::cmp::min(
193 current_pos + self.config.max_chunk_size,
194 text.len()
195 );
196
197 let chunk_text = text[current_pos..chunk_end].to_string();
198
199 let overlap_start = if chunk_id > 0 { self.config.overlap_size } else { 0 };
200 let overlap_end = if chunk_end < text.len() { self.config.overlap_size } else { 0 };
201
202 let chunk = TextChunk::with_overlap(
203 chunk_id,
204 chunk_text,
205 current_pos,
206 document_id.clone(),
207 overlap_start,
208 overlap_end,
209 );
210
211 chunks.push(chunk);
212 chunk_id += 1;
213
214 let step_size = self.config.max_chunk_size.saturating_sub(self.config.overlap_size);
216 current_pos += step_size;
217 }
218
219 Ok(chunks)
220 }
221
222 fn chunk_by_sentences(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
224 let sentence_boundaries = self.find_sentence_boundaries(text);
225 self.chunk_by_boundaries(text, &sentence_boundaries, document_id)
226 }
227
228 fn chunk_by_paragraphs(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
230 let paragraph_boundaries = self.find_paragraph_boundaries(text);
231 self.chunk_by_boundaries(text, ¶graph_boundaries, document_id)
232 }
233
234 fn chunk_adaptive(&self, text: &str, document_id: Option<String>) -> LangExtractResult<Vec<TextChunk>> {
236 let paragraph_boundaries = self.find_paragraph_boundaries(text);
238 if !paragraph_boundaries.is_empty() && self.config.respect_paragraphs {
239 if let Ok(chunks) = self.chunk_by_boundaries(text, ¶graph_boundaries, document_id.clone()) {
240 let oversized_chunks: Vec<_> = chunks.iter()
242 .filter(|c| c.char_length > self.config.max_chunk_size)
243 .collect();
244
245 if oversized_chunks.is_empty() {
246 return Ok(chunks);
247 }
248 }
249 }
250
251 if self.config.respect_sentences {
253 let sentence_boundaries = self.find_sentence_boundaries(text);
254 if let Ok(chunks) = self.chunk_by_boundaries(text, &sentence_boundaries, document_id.clone()) {
255 let oversized_chunks: Vec<_> = chunks.iter()
256 .filter(|c| c.char_length > self.config.max_chunk_size)
257 .collect();
258
259 if oversized_chunks.is_empty() {
260 return Ok(chunks);
261 }
262 }
263 }
264
265 self.chunk_fixed_size(text, document_id)
267 }
268
269 fn find_sentence_boundaries(&self, text: &str) -> Vec<usize> {
271 let mut boundaries = vec![0]; for mat in self.sentence_regex.find_iter(text) {
274 boundaries.push(mat.end());
275 }
276
277 if boundaries.last() != Some(&text.len()) {
278 boundaries.push(text.len()); }
280
281 boundaries
282 }
283
284 fn find_paragraph_boundaries(&self, text: &str) -> Vec<usize> {
286 let mut boundaries = vec![0]; for mat in self.paragraph_regex.find_iter(text) {
289 boundaries.push(mat.end());
290 }
291
292 if boundaries.last() != Some(&text.len()) {
293 boundaries.push(text.len()); }
295
296 boundaries
297 }
298
299 fn chunk_by_boundaries(
301 &self,
302 text: &str,
303 boundaries: &[usize],
304 document_id: Option<String>,
305 ) -> LangExtractResult<Vec<TextChunk>> {
306 let mut chunks = Vec::new();
307 let mut chunk_id = 0;
308 let mut current_start = 0;
309
310 for &boundary in boundaries.iter().skip(1) {
311 let potential_chunk_size = boundary - current_start;
312
313 if potential_chunk_size <= self.config.max_chunk_size {
315 if potential_chunk_size >= self.config.min_chunk_size || chunks.is_empty() {
316 let chunk_text = text[current_start..boundary].to_string();
317 let chunk = TextChunk::new(chunk_id, chunk_text, current_start, document_id.clone());
318 chunks.push(chunk);
319 chunk_id += 1;
320 current_start = boundary;
321 }
322 } else {
323 let section = &text[current_start..boundary];
326 let mut section_chunks = self.chunk_fixed_size(section, document_id.clone())?;
327
328 for chunk in &mut section_chunks {
330 chunk.id = chunk_id;
331 chunk.char_offset += current_start;
332 chunk_id += 1;
333 }
334
335 chunks.extend(section_chunks);
336 current_start = boundary;
337 }
338 }
339
340 if chunks.is_empty() {
341 chunks.push(TextChunk::new(0, text.to_string(), 0, document_id));
343 }
344
345 Ok(chunks)
346 }
347
348 pub fn config(&self) -> &ChunkingConfig {
350 &self.config
351 }
352}
353
354impl Default for TextChunker {
355 fn default() -> Self {
356 Self::new()
357 }
358}
359
/// Combines per-chunk extraction results into a single annotated document,
/// optionally de-duplicating near-identical extractions.
pub struct ResultAggregator {
    /// Word-overlap (Jaccard) score in [0.0, 1.0] at or above which two
    /// extractions are treated as duplicates.
    similarity_threshold: f32,
    /// When true, duplicate extractions across chunks are merged away.
    merge_overlaps: bool,
}
367
368impl ResultAggregator {
369 pub fn new() -> Self {
371 Self {
372 similarity_threshold: 0.8,
373 merge_overlaps: true,
374 }
375 }
376
377 pub fn with_settings(similarity_threshold: f32, merge_overlaps: bool) -> Self {
379 Self {
380 similarity_threshold,
381 merge_overlaps,
382 }
383 }
384
385 pub fn aggregate_chunk_results(
387 &self,
388 chunk_results: Vec<ChunkResult>,
389 original_text: String,
390 document_id: Option<String>,
391 ) -> LangExtractResult<AnnotatedDocument> {
392 let mut all_extractions = Vec::new();
393
394 for chunk_result in chunk_results {
396 if let Some(extractions) = chunk_result.extractions {
397 all_extractions.extend(extractions);
400 }
401 }
402
403 let deduplicated_extractions = if self.merge_overlaps {
405 self.deduplicate_extractions(all_extractions)?
406 } else {
407 all_extractions
408 };
409
410 let mut annotated_doc = AnnotatedDocument::with_extractions(deduplicated_extractions, original_text);
412 annotated_doc.document_id = document_id;
413
414 Ok(annotated_doc)
415 }
416
417 fn deduplicate_extractions(&self, extractions: Vec<Extraction>) -> LangExtractResult<Vec<Extraction>> {
419 let mut unique_extractions = Vec::new();
420
421 for extraction in extractions {
422 let mut is_duplicate = false;
423
424 for existing in &unique_extractions {
426 if self.are_similar_extractions(&extraction, existing) {
427 is_duplicate = true;
428 break;
429 }
430 }
431
432 if !is_duplicate {
433 unique_extractions.push(extraction);
434 }
435 }
436
437 Ok(unique_extractions)
438 }
439
440 fn are_similar_extractions(&self, e1: &Extraction, e2: &Extraction) -> bool {
442 if e1.extraction_class == e2.extraction_class {
444 let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
445 return similarity >= self.similarity_threshold;
446 }
447
448 if let (Some(interval1), Some(interval2)) = (&e1.char_interval, &e2.char_interval) {
450 if interval1.overlaps_with(interval2) {
451 let similarity = self.text_similarity(&e1.extraction_text, &e2.extraction_text);
452 return similarity >= self.similarity_threshold;
453 }
454 }
455
456 false
457 }
458
459 fn text_similarity(&self, text1: &str, text2: &str) -> f32 {
461 if text1 == text2 {
462 return 1.0;
463 }
464
465 let words1: std::collections::HashSet<&str> = text1.split_whitespace().collect();
466 let words2: std::collections::HashSet<&str> = text2.split_whitespace().collect();
467
468 if words1.is_empty() && words2.is_empty() {
469 return 1.0;
470 }
471
472 let intersection = words1.intersection(&words2).count();
473 let union = words1.union(&words2).count();
474
475 if union == 0 {
476 0.0
477 } else {
478 intersection as f32 / union as f32
479 }
480 }
481}
482
483impl Default for ResultAggregator {
484 fn default() -> Self {
485 Self::new()
486 }
487}
488
/// Outcome of processing a single [`TextChunk`].
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// Id of the chunk this result belongs to.
    pub chunk_id: usize,
    /// Extractions produced for the chunk; `None` when processing failed.
    pub extractions: Option<Vec<Extraction>>,
    /// Offset of the chunk within the original text.
    pub char_offset: usize,
    /// Length of the chunk's text.
    pub char_length: usize,
    /// Whether processing succeeded.
    pub success: bool,
    /// Error message when processing failed.
    pub error: Option<String>,
    /// Optional wall-clock duration spent processing the chunk.
    pub processing_time: Option<std::time::Duration>,
}
507
508impl ChunkResult {
509 pub fn success(
511 chunk_id: usize,
512 extractions: Vec<Extraction>,
513 char_offset: usize,
514 char_length: usize,
515 ) -> Self {
516 Self {
517 chunk_id,
518 extractions: Some(extractions),
519 char_offset,
520 char_length,
521 success: true,
522 error: None,
523 processing_time: None,
524 }
525 }
526
527 pub fn failure(
529 chunk_id: usize,
530 char_offset: usize,
531 char_length: usize,
532 error: String,
533 ) -> Self {
534 Self {
535 chunk_id,
536 extractions: None,
537 char_offset,
538 char_length,
539 success: false,
540 error: Some(error),
541 processing_time: None,
542 }
543 }
544
545 pub fn with_processing_time(mut self, duration: std::time::Duration) -> Self {
547 self.processing_time = Some(duration);
548 self
549 }
550}
551
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed-size strategy must split long text and respect the size cap.
    #[test]
    fn test_fixed_size_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 20,
            overlap_size: 5,
            strategy: ChunkingStrategy::FixedSize,
            ..Default::default()
        });

        let text = "This is a test document with some text that needs to be chunked into smaller pieces.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.char_length <= 20);
        }
    }

    /// Sentence strategy should produce at least one chunk.
    #[test]
    fn test_sentence_chunking() {
        let chunker = TextChunker::with_config(ChunkingConfig {
            max_chunk_size: 50,
            strategy: ChunkingStrategy::Sentence,
            ..Default::default()
        });

        let text = "First sentence. Second sentence! Third sentence? Fourth sentence.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        // Idiomatic emptiness check (clippy::len_zero); the leftover debug
        // `println!` loop was removed.
        assert!(!chunks.is_empty());
    }

    /// Text already within the limit is returned as a single unsplit chunk.
    #[test]
    fn test_small_text_no_chunking() {
        let chunker = TextChunker::new();
        let text = "Short text.";
        let chunks = chunker.chunk_text(text, None).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, text);
    }

    /// `char_interval` reflects offset and offset + length.
    #[test]
    fn test_chunk_char_interval() {
        let chunk = TextChunk::new(0, "test".to_string(), 10, None);
        let interval = chunk.char_interval();

        assert_eq!(interval.start_pos, Some(10));
        assert_eq!(interval.end_pos, Some(14));
    }

    /// `core_text` strips the recorded leading/trailing overlap bytes.
    #[test]
    fn test_chunk_with_overlap() {
        let chunk = TextChunk::with_overlap(
            0,
            "overlap test text".to_string(),
            0,
            None,
            3,
            4,
        );

        assert!(chunk.has_overlap);
        assert_eq!(chunk.overlap_info, Some((3, 4)));
        assert_eq!(chunk.core_text(), "rlap test ");
    }
}