1use serde::{Deserialize, Serialize};
18
/// Strategy used to split a document into chunks.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// Fixed-size character windows with configurable overlap.
    FixedSize,
    /// Accumulate sentences until the target size is reached.
    Sentence,
    /// Accumulate blank-line-separated paragraphs.
    Paragraph,
    /// Start a new chunk where lexical cohesion between adjacent sentences drops.
    Topic,
    /// Similarity-based chunking (currently delegates to the `Sentence` strategy).
    Semantic,
    /// Paragraph chunking, re-splitting oversized chunks by sentence.
    Hybrid,
}
35
/// Tuning parameters shared by all chunking strategies.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingConfig {
    /// Which splitting algorithm `SemanticChunker::chunk` dispatches to.
    pub strategy: ChunkingStrategy,
    /// Preferred chunk size in characters (soft limit: a chunk is flushed
    /// once the accumulated size reaches this value).
    pub target_size: usize,
    /// Minimum chunk size; trailing/segment chunks smaller than this are dropped
    /// by the sentence and topic strategies.
    pub min_size: usize,
    /// Hard upper bound on accumulated size before a forced flush.
    pub max_size: usize,
    /// Overlap carried between consecutive chunks: characters for the
    /// fixed-size strategy, the trailing sentence for sentence chunking.
    pub overlap: usize,
    /// Cohesion threshold below which `Topic` chunking inserts a boundary.
    pub similarity_threshold: f32,
}
52
53impl Default for ChunkingConfig {
54 fn default() -> Self {
55 Self {
56 strategy: ChunkingStrategy::Sentence,
57 target_size: 500,
58 min_size: 100,
59 max_size: 1000,
60 overlap: 50,
61 similarity_threshold: 0.7,
62 }
63 }
64}
65
/// A contiguous piece of the input text plus simple structural metrics.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SemanticChunk {
    /// The chunk's text content.
    pub text: String,
    /// Start offset within the source text. NOTE(review): offsets are derived
    /// from the re-joined chunk text, so they are approximate when the
    /// splitter collapses whitespace.
    pub start: usize,
    /// End offset (`start + text.len()`); same approximation caveat as `start`.
    pub end: usize,
    /// Number of sentences in the chunk (0 for fixed-size chunks).
    pub sentence_count: usize,
    /// Number of paragraphs in the chunk (0 for fixed-size chunks).
    pub paragraph_count: usize,
    /// Average pairwise lexical cohesion of adjacent sentences, in [0.0, 1.0].
    pub coherence: f32,
}
82
/// Text chunker that dispatches on the strategy carried in its config.
pub struct SemanticChunker {
    // Configuration driving all chunking methods.
    config: ChunkingConfig,
}
87
88impl SemanticChunker {
89 pub fn new(config: ChunkingConfig) -> Self {
91 Self { config }
92 }
93
94 pub fn default_config() -> Self {
96 Self {
97 config: ChunkingConfig::default(),
98 }
99 }
100
101 pub fn chunk(&self, text: &str) -> Vec<SemanticChunk> {
103 match self.config.strategy {
104 ChunkingStrategy::FixedSize => self.chunk_fixed_size(text),
105 ChunkingStrategy::Sentence => self.chunk_by_sentences(text),
106 ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(text),
107 ChunkingStrategy::Topic => self.chunk_by_topic(text),
108 ChunkingStrategy::Semantic => self.chunk_by_similarity(text),
109 ChunkingStrategy::Hybrid => self.chunk_hybrid(text),
110 }
111 }
112
113 fn chunk_fixed_size(&self, text: &str) -> Vec<SemanticChunk> {
115 let mut chunks = Vec::new();
116 let chars: Vec<char> = text.chars().collect();
117 let total_len = chars.len();
118 let mut start = 0;
119
120 while start < total_len {
121 let end = (start + self.config.target_size).min(total_len);
122 let chunk_text: String = chars[start..end].iter().collect();
123
124 chunks.push(SemanticChunk {
125 text: chunk_text,
126 start,
127 end,
128 sentence_count: 0, paragraph_count: 0,
130 coherence: 1.0,
131 });
132
133 start += self.config.target_size - self.config.overlap;
134 }
135
136 chunks
137 }
138
139 fn chunk_by_sentences(&self, text: &str) -> Vec<SemanticChunk> {
141 let sentences = self.split_sentences(text);
142 let mut chunks = Vec::new();
143 let mut current_chunk = Vec::new();
144 let mut current_size = 0;
145 let mut chunk_start = 0;
146
147 for sentence in sentences.iter() {
148 let sentence_len = sentence.len();
149
150 if current_size + sentence_len > self.config.max_size && !current_chunk.is_empty() {
152 let chunk_text = current_chunk.join(" ");
154 let chunk_end = chunk_start + chunk_text.len();
155
156 chunks.push(SemanticChunk {
157 text: chunk_text,
158 start: chunk_start,
159 end: chunk_end,
160 sentence_count: current_chunk.len(),
161 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
162 coherence: self.calculate_coherence(¤t_chunk),
163 });
164
165 let overlap_sentences = if current_chunk.len() > 1 {
167 vec![current_chunk.last().unwrap().clone()]
168 } else {
169 Vec::new()
170 };
171
172 chunk_start = chunk_end - overlap_sentences.join(" ").len();
173 current_chunk = overlap_sentences;
174 current_size = current_chunk.iter().map(|s| s.len()).sum();
175 }
176
177 current_chunk.push(sentence.clone());
178 current_size += sentence_len;
179
180 if current_size >= self.config.target_size {
182 let chunk_text = current_chunk.join(" ");
183 let chunk_end = chunk_start + chunk_text.len();
184
185 chunks.push(SemanticChunk {
186 text: chunk_text,
187 start: chunk_start,
188 end: chunk_end,
189 sentence_count: current_chunk.len(),
190 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
191 coherence: self.calculate_coherence(¤t_chunk),
192 });
193
194 let overlap_sentences = if current_chunk.len() > 1 {
196 vec![current_chunk.last().unwrap().clone()]
197 } else {
198 Vec::new()
199 };
200
201 chunk_start = chunk_end - overlap_sentences.join(" ").len();
202 current_chunk = overlap_sentences;
203 current_size = current_chunk.iter().map(|s| s.len()).sum();
204 }
205 }
206
207 if !current_chunk.is_empty() && current_chunk.join(" ").len() >= self.config.min_size {
209 let chunk_text = current_chunk.join(" ");
210 let chunk_end = chunk_start + chunk_text.len();
211
212 chunks.push(SemanticChunk {
213 text: chunk_text,
214 start: chunk_start,
215 end: chunk_end,
216 sentence_count: current_chunk.len(),
217 paragraph_count: self.count_paragraphs(¤t_chunk.join(" ")),
218 coherence: self.calculate_coherence(¤t_chunk),
219 });
220 }
221
222 chunks
223 }
224
225 fn chunk_by_paragraphs(&self, text: &str) -> Vec<SemanticChunk> {
227 let paragraphs: Vec<&str> = text.split("\n\n")
228 .filter(|p| !p.trim().is_empty())
229 .collect();
230
231 let mut chunks = Vec::new();
232 let mut current_chunk = Vec::new();
233 let mut current_size = 0;
234 let mut chunk_start = 0;
235
236 for paragraph in paragraphs {
237 let para_len = paragraph.len();
238
239 if current_size + para_len > self.config.max_size && !current_chunk.is_empty() {
240 let chunk_text = current_chunk.join("\n\n");
242 let chunk_end = chunk_start + chunk_text.len();
243
244 chunks.push(SemanticChunk {
245 text: chunk_text.clone(),
246 start: chunk_start,
247 end: chunk_end,
248 sentence_count: self.count_sentences(&chunk_text),
249 paragraph_count: current_chunk.len(),
250 coherence: self.calculate_coherence(¤t_chunk),
251 });
252
253 chunk_start = chunk_end;
254 current_chunk = Vec::new();
255 current_size = 0;
256 }
257
258 current_chunk.push(paragraph.to_string());
259 current_size += para_len;
260 }
261
262 if !current_chunk.is_empty() {
264 let chunk_text = current_chunk.join("\n\n");
265 let chunk_end = chunk_start + chunk_text.len();
266
267 chunks.push(SemanticChunk {
268 text: chunk_text.clone(),
269 start: chunk_start,
270 end: chunk_end,
271 sentence_count: self.count_sentences(&chunk_text),
272 paragraph_count: current_chunk.len(),
273 coherence: self.calculate_coherence(¤t_chunk),
274 });
275 }
276
277 chunks
278 }
279
280 fn chunk_by_topic(&self, text: &str) -> Vec<SemanticChunk> {
282 let sentences = self.split_sentences(text);
283 let mut chunks = Vec::new();
284
285 let mut boundaries = vec![0]; for i in 1..sentences.len() {
289 let cohesion = self.lexical_cohesion(&sentences[i-1], &sentences[i]);
290
291 if cohesion < self.config.similarity_threshold {
293 boundaries.push(i);
294 }
295 }
296
297 boundaries.push(sentences.len()); let mut text_pos = 0;
301 for window in boundaries.windows(2) {
302 let start_idx = window[0];
303 let end_idx = window[1];
304
305 let chunk_sentences = &sentences[start_idx..end_idx];
306 let chunk_text = chunk_sentences.join(" ");
307 let chunk_len = chunk_text.len();
308
309 if chunk_len >= self.config.min_size {
310 chunks.push(SemanticChunk {
311 text: chunk_text,
312 start: text_pos,
313 end: text_pos + chunk_len,
314 sentence_count: chunk_sentences.len(),
315 paragraph_count: self.count_paragraphs(&chunk_sentences.join(" ")),
316 coherence: self.calculate_coherence(chunk_sentences),
317 });
318 }
319
320 text_pos += chunk_len;
321 }
322
323 chunks
324 }
325
326 fn chunk_by_similarity(&self, text: &str) -> Vec<SemanticChunk> {
328 self.chunk_by_sentences(text)
337 }
338
339 fn chunk_hybrid(&self, text: &str) -> Vec<SemanticChunk> {
341 let para_chunks = self.chunk_by_paragraphs(text);
343
344 let mut final_chunks = Vec::new();
346
347 for chunk in para_chunks {
348 if chunk.text.len() > self.config.max_size {
349 let mut temp_config = self.config.clone();
351 temp_config.strategy = ChunkingStrategy::Sentence;
352 let sub_chunker = SemanticChunker::new(temp_config);
353 let sub_chunks = sub_chunker.chunk(&chunk.text);
354 final_chunks.extend(sub_chunks);
355 } else {
356 final_chunks.push(chunk);
357 }
358 }
359
360 final_chunks
361 }
362
363 fn split_sentences(&self, text: &str) -> Vec<String> {
365 let mut sentences = Vec::new();
366 let mut current = String::new();
367
368 for c in text.chars() {
369 current.push(c);
370
371 if matches!(c, '.' | '!' | '?') {
373 if let Some(next) = current.chars().last() {
374 if next.is_whitespace() || !current.trim().is_empty() {
375 sentences.push(current.trim().to_string());
376 current = String::new();
377 }
378 }
379 }
380 }
381
382 if !current.trim().is_empty() {
384 sentences.push(current.trim().to_string());
385 }
386
387 sentences
388 }
389
390 fn count_sentences(&self, text: &str) -> usize {
392 text.chars().filter(|c| matches!(c, '.' | '!' | '?')).count()
393 }
394
395 fn count_paragraphs(&self, text: &str) -> usize {
397 text.split("\n\n").filter(|p| !p.trim().is_empty()).count().max(1)
398 }
399
400 fn lexical_cohesion(&self, text1: &str, text2: &str) -> f32 {
402 let text1_lower = text1.to_lowercase();
403 let words1: std::collections::HashSet<_> = text1_lower
404 .split_whitespace()
405 .collect();
406
407 let text2_lower = text2.to_lowercase();
408 let words2: std::collections::HashSet<_> = text2_lower
409 .split_whitespace()
410 .collect();
411
412 let intersection = words1.intersection(&words2).count();
413 let union = words1.union(&words2).count();
414
415 if union == 0 {
416 0.0
417 } else {
418 intersection as f32 / union as f32
419 }
420 }
421
422 fn calculate_coherence(&self, sentences: &[String]) -> f32 {
424 if sentences.len() < 2 {
425 return 1.0;
426 }
427
428 let mut total_cohesion = 0.0;
429 for window in sentences.windows(2) {
430 total_cohesion += self.lexical_cohesion(&window[0], &window[1]);
431 }
432
433 total_cohesion / (sentences.len() - 1) as f32
434 }
435}
436
/// Aggregate statistics computed over a set of chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingStats {
    /// Number of chunks.
    pub total_chunks: usize,
    /// Mean chunk length in characters.
    pub avg_chunk_size: f32,
    /// Smallest chunk length in characters.
    pub min_chunk_size: usize,
    /// Largest chunk length in characters.
    pub max_chunk_size: usize,
    /// Mean coherence score across chunks.
    pub avg_coherence: f32,
    /// Mean sentence count per chunk.
    pub avg_sentences_per_chunk: f32,
}
453
454impl ChunkingStats {
455 pub fn from_chunks(chunks: &[SemanticChunk]) -> Self {
457 if chunks.is_empty() {
458 return Self {
459 total_chunks: 0,
460 avg_chunk_size: 0.0,
461 min_chunk_size: 0,
462 max_chunk_size: 0,
463 avg_coherence: 0.0,
464 avg_sentences_per_chunk: 0.0,
465 };
466 }
467
468 let total_chunks = chunks.len();
469 let sizes: Vec<usize> = chunks.iter().map(|c| c.text.len()).collect();
470 let avg_chunk_size = sizes.iter().sum::<usize>() as f32 / total_chunks as f32;
471 let min_chunk_size = *sizes.iter().min().unwrap();
472 let max_chunk_size = *sizes.iter().max().unwrap();
473
474 let avg_coherence = chunks.iter().map(|c| c.coherence).sum::<f32>() / total_chunks as f32;
475 let avg_sentences_per_chunk = chunks.iter().map(|c| c.sentence_count).sum::<usize>() as f32
476 / total_chunks as f32;
477
478 Self {
479 total_chunks,
480 avg_chunk_size,
481 min_chunk_size,
482 max_chunk_size,
483 avg_coherence,
484 avg_sentences_per_chunk,
485 }
486 }
487}
488
#[cfg(test)]
mod tests {
    use super::*;

    /// Two paragraphs, six sentences total.
    const TEST_TEXT: &str = "This is the first sentence. This is the second sentence. \
        This is the third sentence.\n\n\
        This is a new paragraph with different content. \
        It has multiple sentences too. \
        And here is another one.";

    /// Shorthand for building a `ChunkingConfig` in tests.
    fn cfg(
        strategy: ChunkingStrategy,
        target_size: usize,
        min_size: usize,
        max_size: usize,
        overlap: usize,
        similarity_threshold: f32,
    ) -> ChunkingConfig {
        ChunkingConfig {
            strategy,
            target_size,
            min_size,
            max_size,
            overlap,
            similarity_threshold,
        }
    }

    #[test]
    fn test_fixed_size_chunking() {
        let chunker = SemanticChunker::new(cfg(ChunkingStrategy::FixedSize, 50, 10, 100, 10, 0.7));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.text.len() <= 100));
    }

    #[test]
    fn test_sentence_chunking() {
        let chunker = SemanticChunker::new(cfg(ChunkingStrategy::Sentence, 100, 20, 200, 20, 0.7));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.sentence_count > 0));
        assert!(chunks.iter().all(|c| c.text.len() >= 20));
    }

    #[test]
    fn test_paragraph_chunking() {
        let chunker = SemanticChunker::new(cfg(ChunkingStrategy::Paragraph, 100, 20, 500, 0, 0.7));
        let chunks = chunker.chunk(TEST_TEXT);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.paragraph_count > 0));
    }

    #[test]
    fn test_topic_chunking() {
        let chunker = SemanticChunker::new(cfg(ChunkingStrategy::Topic, 100, 20, 300, 0, 0.3));
        assert!(!chunker.chunk(TEST_TEXT).is_empty());
    }

    #[test]
    fn test_hybrid_chunking() {
        let chunker = SemanticChunker::new(cfg(ChunkingStrategy::Hybrid, 100, 20, 150, 10, 0.7));
        assert!(!chunker.chunk(TEST_TEXT).is_empty());
    }

    #[test]
    fn test_chunking_stats() {
        let chunks = SemanticChunker::default_config().chunk(TEST_TEXT);
        let stats = ChunkingStats::from_chunks(&chunks);

        assert_eq!(stats.total_chunks, chunks.len());
        assert!(stats.avg_chunk_size > 0.0);
        assert!((0.0..=1.0).contains(&stats.avg_coherence));
    }

    #[test]
    fn test_sentence_splitting() {
        let sentences = SemanticChunker::default_config()
            .split_sentences("Hello world. How are you? I am fine!");

        assert_eq!(sentences, vec!["Hello world.", "How are you?", "I am fine!"]);
    }

    #[test]
    fn test_lexical_cohesion() {
        let chunker = SemanticChunker::default_config();

        // Shared words ("the", "cat") should yield positive cohesion.
        let related = chunker.lexical_cohesion("The cat sat on the mat", "The cat was very happy");
        assert!(related > 0.0);

        // Disjoint vocabularies should score strictly lower.
        let unrelated =
            chunker.lexical_cohesion("The cat sat on the mat", "Quantum physics is complex");
        assert!(unrelated < related);
    }
}