1use serde::{Deserialize, Serialize};
18
19#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
21pub enum ChunkingStrategy {
22 FixedSize,
24 Sentence,
26 Paragraph,
28 Topic,
30 Semantic,
32 Hybrid,
34}
35
/// Tuning parameters for `SemanticChunker`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingConfig {
    /// Which splitting algorithm `SemanticChunker::chunk` dispatches to.
    pub strategy: ChunkingStrategy,
    /// Preferred chunk length in characters.
    pub target_size: usize,
    /// Chunks shorter than this are dropped or merged (strategy-dependent).
    pub min_size: usize,
    /// Hard upper bound on chunk length in characters.
    pub max_size: usize,
    /// Overlap carried between consecutive chunks: characters for
    /// `FixedSize`, the trailing sentence for `Sentence`.
    pub overlap: usize,
    /// Lexical (Jaccard) similarity below which a boundary is inserted
    /// for `Topic` and `Semantic` strategies.
    pub similarity_threshold: f32,
}
52
53impl Default for ChunkingConfig {
54 fn default() -> Self {
55 Self {
56 strategy: ChunkingStrategy::Sentence,
57 target_size: 500,
58 min_size: 100,
59 max_size: 1000,
60 overlap: 50,
61 similarity_threshold: 0.7,
62 }
63 }
64}
65
/// One contiguous piece of the input text, with bookkeeping metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticChunk {
    /// The chunk's text content.
    pub text: String,
    /// Approximate start offset of the chunk within the source text.
    /// NOTE(review): offsets are computed from joined-chunk lengths, not
    /// from the original text, so they drift when separators differ.
    pub start: usize,
    /// Approximate end offset (start + text length).
    pub end: usize,
    /// Number of sentences in the chunk (0 for fixed-size chunks).
    pub sentence_count: usize,
    /// Number of blank-line-separated paragraphs in the chunk.
    pub paragraph_count: usize,
    /// Average adjacent-sentence lexical cohesion, in [0, 1].
    pub coherence: f32,
}
82
/// Splits text into `SemanticChunk`s according to a `ChunkingConfig`.
pub struct SemanticChunker {
    /// Strategy and size parameters driving all chunking methods.
    config: ChunkingConfig,
}
87
88impl SemanticChunker {
89 pub fn new(config: ChunkingConfig) -> Self {
91 Self { config }
92 }
93
94 pub fn default_config() -> Self {
96 Self {
97 config: ChunkingConfig::default(),
98 }
99 }
100
101 pub fn chunk(&self, text: &str) -> Vec<SemanticChunk> {
103 match self.config.strategy {
104 ChunkingStrategy::FixedSize => self.chunk_fixed_size(text),
105 ChunkingStrategy::Sentence => self.chunk_by_sentences(text),
106 ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text),
107 ChunkingStrategy::Topic => self.chunk_by_topic(text),
108 ChunkingStrategy::Semantic => self.chunk_by_similarity(text),
109 ChunkingStrategy::Hybrid => self.chunk_hybrid(text),
110 }
111 }
112
113 fn chunk_fixed_size(&self, text: &str) -> Vec<SemanticChunk> {
115 let mut chunks = Vec::new();
116 let chars: Vec<char> = text.chars().collect();
117 let total_len = chars.len();
118 let mut start = 0;
119
120 while start < total_len {
121 let end = (start + self.config.target_size).min(total_len);
122 let chunk_text: String = chars[start..end].iter().collect();
123
124 chunks.push(SemanticChunk {
125 text: chunk_text,
126 start,
127 end,
128 sentence_count: 0, paragraph_count: 0,
130 coherence: 1.0,
131 });
132
133 start += self.config.target_size - self.config.overlap;
134 }
135
136 chunks
137 }
138
139 fn chunk_by_sentences(&self, text: &str) -> Vec<SemanticChunk> {
141 let sentences = self.split_sentences(text);
142 let mut chunks = Vec::new();
143 let mut current_chunk = Vec::new();
144 let mut current_size = 0;
145 let mut chunk_start = 0;
146
147 for sentence in sentences.iter() {
148 let sentence_len = sentence.len();
149
150 if current_size + sentence_len > self.config.max_size && !current_chunk.is_empty() {
152 let chunk_text = current_chunk.join(" ");
154 let chunk_end = chunk_start + chunk_text.len();
155
156 chunks.push(SemanticChunk {
157 text: chunk_text,
158 start: chunk_start,
159 end: chunk_end,
160 sentence_count: current_chunk.len(),
161 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
162 coherence: self.calculate_coherence(¤t_chunk),
163 });
164
165 let overlap_sentences = if current_chunk.len() > 1 {
167 vec![current_chunk.last().unwrap().clone()]
168 } else {
169 Vec::new()
170 };
171
172 chunk_start = chunk_end - overlap_sentences.join(" ").len();
173 current_chunk = overlap_sentences;
174 current_size = current_chunk.iter().map(|s| s.len()).sum();
175 }
176
177 current_chunk.push(sentence.clone());
178 current_size += sentence_len;
179
180 if current_size >= self.config.target_size {
182 let chunk_text = current_chunk.join(" ");
183 let chunk_end = chunk_start + chunk_text.len();
184
185 chunks.push(SemanticChunk {
186 text: chunk_text,
187 start: chunk_start,
188 end: chunk_end,
189 sentence_count: current_chunk.len(),
190 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
191 coherence: self.calculate_coherence(¤t_chunk),
192 });
193
194 let overlap_sentences = if current_chunk.len() > 1 {
196 vec![current_chunk.last().unwrap().clone()]
197 } else {
198 Vec::new()
199 };
200
201 chunk_start = chunk_end - overlap_sentences.join(" ").len();
202 current_chunk = overlap_sentences;
203 current_size = current_chunk.iter().map(|s| s.len()).sum();
204 }
205 }
206
207 if !current_chunk.is_empty() && current_chunk.join(" ").len() >= self.config.min_size {
209 let chunk_text = current_chunk.join(" ");
210 let chunk_end = chunk_start + chunk_text.len();
211
212 chunks.push(SemanticChunk {
213 text: chunk_text,
214 start: chunk_start,
215 end: chunk_end,
216 sentence_count: current_chunk.len(),
217 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
218 coherence: self.calculate_coherence(¤t_chunk),
219 });
220 }
221
222 chunks
223 }
224
225 fn chunk_by_paragraphs(&self, text: &str) -> Vec<SemanticChunk> {
227 let paragraphs: Vec<&str> = text
228 .split("\n\n")
229 .filter(|p| !p.trim().is_empty())
230 .collect();
231
232 let mut chunks = Vec::new();
233 let mut current_chunk = Vec::new();
234 let mut current_size = 0;
235 let mut chunk_start = 0;
236
237 for paragraph in paragraphs {
238 let para_len = paragraph.len();
239
240 if current_size + para_len > self.config.max_size && !current_chunk.is_empty() {
241 let chunk_text = current_chunk.join("\n\n");
243 let chunk_end = chunk_start + chunk_text.len();
244
245 chunks.push(SemanticChunk {
246 text: chunk_text.clone(),
247 start: chunk_start,
248 end: chunk_end,
249 sentence_count: self.count_sentences(&chunk_text),
250 paragraph_count: current_chunk.len(),
251 coherence: self.calculate_coherence(¤t_chunk),
252 });
253
254 chunk_start = chunk_end;
255 current_chunk = Vec::new();
256 current_size = 0;
257 }
258
259 current_chunk.push(paragraph.to_string());
260 current_size += para_len;
261 }
262
263 if !current_chunk.is_empty() {
265 let chunk_text = current_chunk.join("\n\n");
266 let chunk_end = chunk_start + chunk_text.len();
267
268 chunks.push(SemanticChunk {
269 text: chunk_text.clone(),
270 start: chunk_start,
271 end: chunk_end,
272 sentence_count: self.count_sentences(&chunk_text),
273 paragraph_count: current_chunk.len(),
274 coherence: self.calculate_coherence(¤t_chunk),
275 });
276 }
277
278 chunks
279 }
280
281 fn chunk_by_topic(&self, text: &str) -> Vec<SemanticChunk> {
283 let sentences = self.split_sentences(text);
284 let mut chunks = Vec::new();
285
286 let mut boundaries = vec![0]; for i in 1..sentences.len() {
290 let cohesion = self.lexical_cohesion(&sentences[i - 1], &sentences[i]);
291
292 if cohesion < self.config.similarity_threshold {
294 boundaries.push(i);
295 }
296 }
297
298 boundaries.push(sentences.len()); let mut text_pos = 0;
302 for window in boundaries.windows(2) {
303 let start_idx = window[0];
304 let end_idx = window[1];
305
306 let chunk_sentences = &sentences[start_idx..end_idx];
307 let chunk_text = chunk_sentences.join(" ");
308 let chunk_len = chunk_text.len();
309
310 if chunk_len >= self.config.min_size {
311 chunks.push(SemanticChunk {
312 text: chunk_text,
313 start: text_pos,
314 end: text_pos + chunk_len,
315 sentence_count: chunk_sentences.len(),
316 paragraph_count: self.count_paragraphs(&chunk_sentences.join(" ")),
317 coherence: self.calculate_coherence(chunk_sentences),
318 });
319 }
320
321 text_pos += chunk_len;
322 }
323
324 chunks
325 }
326
327 fn chunk_by_similarity(&self, text: &str) -> Vec<SemanticChunk> {
332 let sentences = self.split_sentences(text);
333
334 if sentences.is_empty() {
335 return vec![];
336 }
337
338 if sentences.len() == 1 {
339 let sentence = &sentences[0];
340 return vec![SemanticChunk {
341 text: sentence.clone(),
342 start: 0,
343 end: sentence.len(),
344 sentence_count: 1,
345 paragraph_count: 1,
346 coherence: 1.0,
347 }];
348 }
349
350 let mut similarities = Vec::new();
352 for i in 0..sentences.len() - 1 {
353 let similarity = self.lexical_cohesion(&sentences[i], &sentences[i + 1]);
354 similarities.push(similarity);
355 }
356
357 let mut boundaries = vec![0]; for (i, &similarity) in similarities.iter().enumerate() {
360 if similarity < self.config.similarity_threshold {
361 boundaries.push(i + 1);
363 }
364 }
365 boundaries.push(sentences.len()); let mut chunks: Vec<SemanticChunk> = Vec::new();
369 let mut text_pos = 0;
370
371 for window in boundaries.windows(2) {
372 let start_idx = window[0];
373 let end_idx = window[1];
374
375 let chunk_sentences = &sentences[start_idx..end_idx];
376 let chunk_text = chunk_sentences.join(" ");
377 let chunk_len = chunk_text.len();
378
379 if chunk_text.trim().is_empty() {
381 continue;
382 }
383
384 if chunk_len < self.config.min_size && !chunks.is_empty() {
386 if let Some(last_chunk) = chunks.last_mut() {
388 last_chunk.text.push(' ');
389 last_chunk.text.push_str(&chunk_text);
390 last_chunk.end = text_pos + chunk_len;
391 last_chunk.sentence_count += chunk_sentences.len();
392 last_chunk.paragraph_count = self.count_paragraphs(&last_chunk.text);
393 last_chunk.coherence =
394 self.calculate_coherence(&self.split_sentences(&last_chunk.text));
395 text_pos += chunk_len + 1; continue;
397 }
398 }
399
400 if chunk_len > self.config.max_size {
402 let mut current_text = String::new();
404 let mut current_start = text_pos;
405 let mut current_sentences = Vec::new();
406
407 for sentence in chunk_sentences {
408 if current_text.len() + sentence.len() > self.config.max_size
409 && !current_text.is_empty()
410 {
411 chunks.push(SemanticChunk {
413 text: current_text.trim().to_string(),
414 start: current_start,
415 end: current_start + current_text.len(),
416 sentence_count: current_sentences.len(),
417 paragraph_count: self.count_paragraphs(¤t_text),
418 coherence: self.calculate_coherence(¤t_sentences),
419 });
420
421 current_start += current_text.len() + 1;
422 current_text = String::new();
423 current_sentences.clear();
424 }
425
426 if !current_text.is_empty() {
427 current_text.push(' ');
428 }
429 current_text.push_str(sentence);
430 current_sentences.push(sentence.clone());
431 }
432
433 if !current_text.is_empty() {
435 chunks.push(SemanticChunk {
436 text: current_text.trim().to_string(),
437 start: current_start,
438 end: current_start + current_text.len(),
439 sentence_count: current_sentences.len(),
440 paragraph_count: self.count_paragraphs(¤t_text),
441 coherence: self.calculate_coherence(¤t_sentences),
442 });
443 }
444
445 text_pos += chunk_len + 1;
446 } else {
447 chunks.push(SemanticChunk {
449 text: chunk_text.clone(),
450 start: text_pos,
451 end: text_pos + chunk_len,
452 sentence_count: chunk_sentences.len(),
453 paragraph_count: self.count_paragraphs(&chunk_text),
454 coherence: self.calculate_coherence(chunk_sentences),
455 });
456
457 text_pos += chunk_len + 1; }
459 }
460
461 chunks
462 }
463
464 fn chunk_hybrid(&self, text: &str) -> Vec<SemanticChunk> {
466 let para_chunks = self.chunk_by_paragraphs(text);
468
469 let mut final_chunks = Vec::new();
471
472 for chunk in para_chunks {
473 if chunk.text.len() > self.config.max_size {
474 let mut temp_config = self.config.clone();
476 temp_config.strategy = ChunkingStrategy::Sentence;
477 let sub_chunker = SemanticChunker::new(temp_config);
478 let sub_chunks = sub_chunker.chunk(&chunk.text);
479 final_chunks.extend(sub_chunks);
480 } else {
481 final_chunks.push(chunk);
482 }
483 }
484
485 final_chunks
486 }
487
488 fn split_sentences(&self, text: &str) -> Vec<String> {
490 let mut sentences = Vec::new();
491 let mut current = String::new();
492
493 for c in text.chars() {
494 current.push(c);
495
496 if matches!(c, '.' | '!' | '?') {
498 if let Some(next) = current.chars().last() {
499 if next.is_whitespace() || !current.trim().is_empty() {
500 sentences.push(current.trim().to_string());
501 current = String::new();
502 }
503 }
504 }
505 }
506
507 if !current.trim().is_empty() {
509 sentences.push(current.trim().to_string());
510 }
511
512 sentences
513 }
514
515 fn count_sentences(&self, text: &str) -> usize {
517 text.chars()
518 .filter(|c| matches!(c, '.' | '!' | '?'))
519 .count()
520 }
521
522 fn count_paragraphs(&self, text: &str) -> usize {
524 text.split("\n\n")
525 .filter(|p| !p.trim().is_empty())
526 .count()
527 .max(1)
528 }
529
530 fn lexical_cohesion(&self, text1: &str, text2: &str) -> f32 {
532 let text1_lower = text1.to_lowercase();
533 let words1: std::collections::HashSet<_> = text1_lower.split_whitespace().collect();
534
535 let text2_lower = text2.to_lowercase();
536 let words2: std::collections::HashSet<_> = text2_lower.split_whitespace().collect();
537
538 let intersection = words1.intersection(&words2).count();
539 let union = words1.union(&words2).count();
540
541 if union == 0 {
542 0.0
543 } else {
544 intersection as f32 / union as f32
545 }
546 }
547
548 fn calculate_coherence(&self, sentences: &[String]) -> f32 {
550 if sentences.len() < 2 {
551 return 1.0;
552 }
553
554 let mut total_cohesion = 0.0;
555 for window in sentences.windows(2) {
556 total_cohesion += self.lexical_cohesion(&window[0], &window[1]);
557 }
558
559 total_cohesion / (sentences.len() - 1) as f32
560 }
561}
562
/// Summary statistics computed over a set of `SemanticChunk`s.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingStats {
    /// Number of chunks in the set.
    pub total_chunks: usize,
    /// Mean chunk text length in characters.
    pub avg_chunk_size: f32,
    /// Smallest chunk text length in characters.
    pub min_chunk_size: usize,
    /// Largest chunk text length in characters.
    pub max_chunk_size: usize,
    /// Mean of the chunks' coherence scores.
    pub avg_coherence: f32,
    /// Mean sentence count per chunk.
    pub avg_sentences_per_chunk: f32,
}
579
580impl ChunkingStats {
581 pub fn from_chunks(chunks: &[SemanticChunk]) -> Self {
583 if chunks.is_empty() {
584 return Self {
585 total_chunks: 0,
586 avg_chunk_size: 0.0,
587 min_chunk_size: 0,
588 max_chunk_size: 0,
589 avg_coherence: 0.0,
590 avg_sentences_per_chunk: 0.0,
591 };
592 }
593
594 let total_chunks = chunks.len();
595 let sizes: Vec<usize> = chunks.iter().map(|c| c.text.len()).collect();
596 let avg_chunk_size = sizes.iter().sum::<usize>() as f32 / total_chunks as f32;
597 let min_chunk_size = *sizes.iter().min().unwrap();
598 let max_chunk_size = *sizes.iter().max().unwrap();
599
600 let avg_coherence = chunks.iter().map(|c| c.coherence).sum::<f32>() / total_chunks as f32;
601 let avg_sentences_per_chunk =
602 chunks.iter().map(|c| c.sentence_count).sum::<usize>() as f32 / total_chunks as f32;
603
604 Self {
605 total_chunks,
606 avg_chunk_size,
607 min_chunk_size,
608 max_chunk_size,
609 avg_coherence,
610 avg_sentences_per_chunk,
611 }
612 }
613}
614
#[cfg(test)]
mod tests {
    use super::*;

    // Two paragraphs, six sentences total: exercises both the sentence
    // splitter and the blank-line paragraph splitter. The trailing `\`
    // continuations strip the newline and following indentation from the
    // literal.
    const TEST_TEXT: &str = "This is the first sentence. This is the second sentence. \
        This is the third sentence.\n\n\
        This is a new paragraph with different content. \
        It has multiple sentences too. \
        And here is another one.";

    // Fixed-size chunks must never exceed max_size.
    #[test]
    fn test_fixed_size_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::FixedSize,
            target_size: 50,
            min_size: 10,
            max_size: 100,
            overlap: 10,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.text.len() <= 100);
        }
    }

    // Sentence chunks carry sentence counts and respect min_size.
    #[test]
    fn test_sentence_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Sentence,
            target_size: 100,
            min_size: 20,
            max_size: 200,
            overlap: 20,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.sentence_count > 0);
            assert!(chunk.text.len() >= 20);
        }
    }

    // Every paragraph chunk reports at least one paragraph.
    #[test]
    fn test_paragraph_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Paragraph,
            target_size: 100,
            min_size: 20,
            max_size: 500,
            overlap: 0,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.paragraph_count > 0);
        }
    }

    // Low threshold (0.3) so the test text yields at least one chunk.
    #[test]
    fn test_topic_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Topic,
            target_size: 100,
            min_size: 20,
            max_size: 300,
            overlap: 0,
            similarity_threshold: 0.3,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
    }

    // Small max_size forces the hybrid path to fall back to sentences.
    #[test]
    fn test_hybrid_chunking() {
        let config = ChunkingConfig {
            strategy: ChunkingStrategy::Hybrid,
            target_size: 100,
            min_size: 20,
            max_size: 150,
            overlap: 10,
            similarity_threshold: 0.7,
        };

        let chunker = SemanticChunker::new(config);
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
    }

    // Stats must be consistent with the chunk set they summarize.
    #[test]
    fn test_chunking_stats() {
        let chunker = SemanticChunker::default_config();
        let chunks = chunker.chunk(TEST_TEXT);
        let stats = ChunkingStats::from_chunks(&chunks);

        assert_eq!(stats.total_chunks, chunks.len());
        assert!(stats.avg_chunk_size > 0.0);
        assert!(stats.avg_coherence >= 0.0 && stats.avg_coherence <= 1.0);
    }

    // Splitter breaks on '.', '?' and '!' and keeps the terminator.
    #[test]
    fn test_sentence_splitting() {
        let chunker = SemanticChunker::default_config();
        let sentences = chunker.split_sentences("Hello world. How are you? I am fine!");

        assert_eq!(sentences.len(), 3);
        assert_eq!(sentences[0], "Hello world.");
        assert_eq!(sentences[1], "How are you?");
        assert_eq!(sentences[2], "I am fine!");
    }

    // Overlapping vocabulary scores higher than unrelated vocabulary.
    #[test]
    fn test_lexical_cohesion() {
        let chunker = SemanticChunker::default_config();

        let cohesion1 =
            chunker.lexical_cohesion("The cat sat on the mat", "The cat was very happy");
        assert!(cohesion1 > 0.0);

        let cohesion2 =
            chunker.lexical_cohesion("The cat sat on the mat", "Quantum physics is complex");
        assert!(cohesion2 < cohesion1);
    }
}