1use regex::Regex;
33use std::sync::LazyLock;
34
/// Matches a speaker label at the start of a line (multiline mode), e.g.
/// `Alice:` or `Agent_2:` — an uppercase-initial token of up to 31 chars
/// (letters, digits, `_`, `-`, spaces) followed by an optional-space colon.
static DIALOGUE_TURN_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^([A-Z][a-zA-Z0-9_\- ]{0,30})\s*:").unwrap());

/// Matches structural section markers at the start of a line: `[bracketed]`
/// headers, `#`–`###` markdown headings, `Session N` labels, or `---` rules.
static SECTION_PATTERN: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^(?:\[.*?\]|#{1,3}\s+\w|Session \d+|---+)").unwrap());
42
/// Configuration for size-based text chunking (`chunk_text`).
///
/// Derives `Debug`/`Clone` for consistency with `ChunkResult`.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Target maximum chunk size in bytes.
    pub chunk_size: usize,
    /// Bytes of trailing context repeated at the start of the next chunk.
    pub overlap: usize,
    /// Chunks shorter than this are merged into the previous chunk.
    pub min_chunk_size: usize,
}
52
53impl Default for ChunkConfig {
54 fn default() -> Self {
55 Self {
56 chunk_size: 800,
59 overlap: 200,
61 min_chunk_size: 200,
63 }
64 }
65}
66
/// Result of chunking one piece of text.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// The produced chunks, in document order.
    pub chunks: Vec<String>,
    /// Byte length of the trimmed input text.
    pub original_length: usize,
    /// True when the input exceeded the configured size and went through the
    /// splitting path (even if that path produced a single chunk).
    pub was_chunked: bool,
}
77
78impl ChunkResult {
79 pub fn coverage_ratio(&self) -> f32 {
81 if self.chunks.is_empty() {
82 return 0.0;
83 }
84 1.0
87 }
88}
89
/// Largest index `<= index` that is a UTF-8 char boundary of `s`.
/// Indices at or past `s.len()` clamp to `s.len()`.
#[inline]
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Scan downward; index 0 is always a boundary, so this cannot fail.
    (0..=index)
        .rev()
        .find(|&i| s.is_char_boundary(i))
        .expect("0 is always a char boundary")
}
103
/// Smallest index `>= index` that is a UTF-8 char boundary of `s`.
/// Indices at or past `s.len()` clamp to `s.len()`.
#[inline]
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Scan upward; s.len() is always a boundary, so this cannot fail.
    (index..=s.len())
        .find(|&i| s.is_char_boundary(i))
        .expect("s.len() is always a char boundary")
}
117
118pub fn chunk_text(text: &str, config: &ChunkConfig) -> ChunkResult {
122 let text = text.trim();
123 let original_length = text.len();
124
125 if original_length <= config.chunk_size {
127 return ChunkResult {
128 chunks: vec![text.to_string()],
129 original_length,
130 was_chunked: false,
131 };
132 }
133
134 let mut chunks = Vec::new();
135 let mut start = 0;
136
137 while start < original_length {
138 let mut end = floor_char_boundary(text, (start + config.chunk_size).min(original_length));
140
141 if end < original_length {
143 end = find_break_point(text, start, end, config.min_chunk_size);
144 end = floor_char_boundary(text, end);
146 }
147
148 start = ceil_char_boundary(text, start);
150
151 if start >= end {
153 break;
154 }
155
156 let chunk = text[start..end].trim();
158 if chunk.len() >= config.min_chunk_size || chunks.is_empty() {
159 chunks.push(chunk.to_string());
160 } else if let Some(last) = chunks.last_mut() {
161 last.push(' ');
163 last.push_str(chunk);
164 }
165
166 if end >= original_length {
168 break;
169 }
170 start = ceil_char_boundary(text, end.saturating_sub(config.overlap));
172
173 if start <= chunks.len().saturating_sub(1) * (config.chunk_size - config.overlap) {
175 start = ceil_char_boundary(text, end);
176 }
177 }
178
179 ChunkResult {
180 chunks,
181 original_length,
182 was_chunked: true,
183 }
184}
185
/// Finds the best byte offset (absolute, within `text`) to end a chunk that
/// spans `text[start..ideal_end]`.
///
/// Preference order: the last sentence terminator (`.`, `!`, `?` followed by
/// whitespace or end-of-chunk) at chunk-relative offset >= `min_size`; then
/// the last whitespace at offset >= `min_size`; otherwise `ideal_end`.
///
/// Fix: the original collected every candidate into a `Vec` only to take the
/// last element (clippy `needless_collect`); `Iterator::last()` avoids the
/// allocations.
fn find_break_point(text: &str, start: usize, ideal_end: usize, min_size: usize) -> usize {
    let chunk = &text[start..ideal_end];

    // Last sentence-ending punctuation followed by whitespace (or chunk end).
    let sentence_break = chunk
        .char_indices()
        .filter_map(|(byte_offset, c)| {
            if (c == '.' || c == '!' || c == '?') && byte_offset >= min_size {
                // `after` is a char boundary: byte_offset + the char's width.
                let after = byte_offset + c.len_utf8();
                let next_char = chunk[after..].chars().next();
                if next_char.is_none_or(|nc| nc.is_whitespace()) {
                    return Some(start + after);
                }
            }
            None
        })
        .last();

    if let Some(boundary) = sentence_break {
        return boundary;
    }

    // Fall back to the last whitespace far enough into the chunk.
    let word_break = chunk
        .char_indices()
        .filter_map(|(i, c)| (c.is_whitespace() && i >= min_size).then_some(start + i))
        .last();

    // No usable boundary: cut at the ideal end.
    word_break.unwrap_or(ideal_end)
}
230
/// Rough token-count estimate without running a real tokenizer.
///
/// Heuristic: ~1.3 tokens per whitespace-separated word (rounded up), plus
/// one extra token per three ASCII punctuation characters or newlines.
/// Whitespace-only input falls back to ~4 characters per token.
pub fn estimate_tokens(text: &str) -> usize {
    if text.is_empty() {
        return 0;
    }

    let word_count = text.split_whitespace().count();
    if word_count == 0 {
        // Non-empty but whitespace-only: character-based fallback.
        return text.chars().count().div_ceil(4);
    }

    // ~1.3 tokens per word, rounded up.
    let word_tokens = (word_count as f64 * 1.3).ceil() as usize;

    // Punctuation and newlines tend to tokenize separately.
    let punct_count = text
        .chars()
        .filter(|&c| c == '\n' || c.is_ascii_punctuation())
        .count();

    word_tokens + punct_count / 3
}
264
/// Configuration for semantic-boundary-aware chunking (`semantic_chunk_text`).
///
/// Derives `Debug`/`Clone` for consistency with `ChunkResult`.
#[derive(Debug, Clone)]
pub struct SemanticChunkConfig {
    /// Preferred chunk size in bytes; segments are packed up to this size.
    pub target_size: usize,
    /// Segments larger than this are hard-split via `chunk_text`.
    pub max_size: usize,
    /// A final chunk shorter than this is merged into the previous chunk.
    pub min_size: usize,
    /// When true, dialogue-formatted text is split on speaker turns.
    pub preserve_dialogue_turns: bool,
    /// When true, non-dialogue text is split on blank-line paragraphs.
    pub split_on_paragraphs: bool,
}
278
279impl Default for SemanticChunkConfig {
280 fn default() -> Self {
281 Self {
282 target_size: 800,
283 max_size: 1200,
284 min_size: 100,
285 preserve_dialogue_turns: true,
286 split_on_paragraphs: true,
287 }
288 }
289}
290
/// A semantically meaningful slice of the input, produced by
/// `split_into_segments` and later packed into chunks.
#[derive(Debug, Clone)]
struct SemanticSegment {
    /// The trimmed segment text.
    text: String,
    /// Classification of the segment; currently informational only.
    #[allow(dead_code)]
    segment_type: SegmentType,
}
298
/// Kind of semantic segment detected while splitting the input.
#[derive(Debug, Clone, PartialEq)]
enum SegmentType {
    /// A single speaker turn (`Name: ...`) in dialogue-formatted text.
    DialogueTurn,
    /// A blank-line-delimited paragraph.
    Paragraph,
    /// A paragraph matching `SECTION_PATTERN` (header, `Session N`, rule).
    Section,
    /// Fallback: pre-dialogue narration, sentence splits, or whole text.
    Text,
}
306
307pub fn semantic_chunk_text(text: &str, config: &SemanticChunkConfig) -> ChunkResult {
312 let text = text.trim();
313 let original_length = text.len();
314
315 if original_length <= config.target_size {
317 return ChunkResult {
318 chunks: vec![text.to_string()],
319 original_length,
320 was_chunked: false,
321 };
322 }
323
324 let segments = split_into_segments(text, config);
326
327 let chunks = group_segments_into_chunks(segments, config);
329
330 ChunkResult {
331 chunks,
332 original_length,
333 was_chunked: true,
334 }
335}
336
337fn split_into_segments(text: &str, config: &SemanticChunkConfig) -> Vec<SemanticSegment> {
339 let mut segments = Vec::new();
340
341 let is_dialogue = config.preserve_dialogue_turns && DIALOGUE_TURN_PATTERN.is_match(text);
343
344 if is_dialogue {
345 let turn_starts: Vec<usize> = DIALOGUE_TURN_PATTERN
347 .find_iter(text)
348 .map(|m| m.start())
349 .collect();
350
351 if !turn_starts.is_empty() && turn_starts[0] > 0 {
353 let pre_text = text[..turn_starts[0]].trim();
354 if !pre_text.is_empty() {
355 segments.push(SemanticSegment {
356 text: pre_text.to_string(),
357 segment_type: SegmentType::Text,
358 });
359 }
360 }
361
362 for (i, &start) in turn_starts.iter().enumerate() {
363 let end = if i + 1 < turn_starts.len() {
364 turn_starts[i + 1]
365 } else {
366 text.len()
367 };
368
369 let turn_text = text[start..end].trim();
370 if !turn_text.is_empty() {
371 segments.push(SemanticSegment {
372 text: turn_text.to_string(),
373 segment_type: SegmentType::DialogueTurn,
374 });
375 }
376 }
377 } else if config.split_on_paragraphs {
378 let paragraph_pattern = Regex::new(r"\n\s*\n").unwrap();
380 let mut last_end = 0;
381
382 for mat in paragraph_pattern.find_iter(text) {
383 if mat.start() > last_end {
384 let para_text = text[last_end..mat.start()].trim();
385 if !para_text.is_empty() {
386 let seg_type = if SECTION_PATTERN.is_match(para_text) {
387 SegmentType::Section
388 } else {
389 SegmentType::Paragraph
390 };
391 segments.push(SemanticSegment {
392 text: para_text.to_string(),
393 segment_type: seg_type,
394 });
395 }
396 }
397 last_end = mat.end();
398 }
399
400 if last_end < text.len() {
402 let remaining = text[last_end..].trim();
403 if !remaining.is_empty() {
404 segments.push(SemanticSegment {
405 text: remaining.to_string(),
406 segment_type: SegmentType::Paragraph,
407 });
408 }
409 }
410 } else {
411 segments = split_by_sentences(text);
413 }
414
415 if segments.is_empty() {
417 segments.push(SemanticSegment {
418 text: text.to_string(),
419 segment_type: SegmentType::Text,
420 });
421 }
422
423 segments
424}
425
426fn split_by_sentences(text: &str) -> Vec<SemanticSegment> {
428 let sentence_pattern = Regex::new(r"[.!?]+\s+").unwrap();
429 let mut segments = Vec::new();
430 let mut last_end = 0;
431
432 for mat in sentence_pattern.find_iter(text) {
433 let sentence = text[last_end..mat.end()].trim();
434 if !sentence.is_empty() {
435 segments.push(SemanticSegment {
436 text: sentence.to_string(),
437 segment_type: SegmentType::Text,
438 });
439 }
440 last_end = mat.end();
441 }
442
443 if last_end < text.len() {
445 let remaining = text[last_end..].trim();
446 if !remaining.is_empty() {
447 segments.push(SemanticSegment {
448 text: remaining.to_string(),
449 segment_type: SegmentType::Text,
450 });
451 }
452 }
453
454 segments
455}
456
457fn group_segments_into_chunks(
459 segments: Vec<SemanticSegment>,
460 config: &SemanticChunkConfig,
461) -> Vec<String> {
462 let mut chunks = Vec::new();
463 let mut current_chunk = String::new();
464
465 for segment in segments {
466 let segment_len = segment.text.len();
467
468 if segment_len > config.max_size {
470 if !current_chunk.is_empty() {
472 chunks.push(current_chunk.trim().to_string());
473 current_chunk = String::new();
474 }
475
476 let fixed_config = ChunkConfig {
478 chunk_size: config.target_size,
479 overlap: config.min_size / 2,
480 min_chunk_size: config.min_size,
481 };
482 let sub_chunks = chunk_text(&segment.text, &fixed_config);
483 chunks.extend(sub_chunks.chunks);
484 continue;
485 }
486
487 let new_len = current_chunk.len() + segment_len + 1; if new_len > config.target_size && !current_chunk.is_empty() {
491 chunks.push(current_chunk.trim().to_string());
493 current_chunk = String::new();
494 }
495
496 if !current_chunk.is_empty() {
498 current_chunk.push('\n');
499 }
500 current_chunk.push_str(&segment.text);
501 }
502
503 if !current_chunk.is_empty() {
505 let trimmed = current_chunk.trim().to_string();
506 if trimmed.len() < config.min_size && !chunks.is_empty() {
508 let last = chunks.pop().unwrap_or_default();
509 chunks.push(format!("{last}\n{trimmed}"));
510 } else {
511 chunks.push(trimmed);
512 }
513 }
514
515 chunks
516}
517
/// Returns true when `text` contains a speaker-labelled line (`Name: ...`),
/// i.e. it matches `DIALOGUE_TURN_PATTERN` somewhere.
pub fn is_dialogue_format(text: &str) -> bool {
    DIALOGUE_TURN_PATTERN.is_match(text)
}
522
523pub fn auto_chunk_text(text: &str) -> ChunkResult {
525 if is_dialogue_format(text) {
526 semantic_chunk_text(text, &SemanticChunkConfig::default())
527 } else {
528 chunk_text(text, &ChunkConfig::default())
529 }
530}
531
#[cfg(test)]
mod tests {
    use super::*;

    // Input shorter than chunk_size must come back as one untouched chunk.
    #[test]
    fn test_short_text_no_chunking() {
        let config = ChunkConfig::default();
        let result = chunk_text("This is a short text.", &config);

        assert_eq!(result.chunks.len(), 1);
        assert!(!result.was_chunked);
        assert_eq!(result.chunks[0], "This is a short text.");
    }

    // Long input splits into several chunks, each at least min_chunk_size;
    // overlap means the chunks together hold at least the original bytes.
    #[test]
    fn test_long_text_chunking() {
        let config = ChunkConfig {
            chunk_size: 100,
            overlap: 20,
            min_chunk_size: 30,
        };

        let text = "This is sentence one. This is sentence two. This is sentence three. \
            This is sentence four. This is sentence five. This is sentence six. \
            This is sentence seven. This is sentence eight.";

        let result = chunk_text(text, &config);

        assert!(result.was_chunked);
        assert!(result.chunks.len() > 1);

        for chunk in &result.chunks {
            assert!(
                chunk.len() >= config.min_chunk_size,
                "Chunk too small: '{}' (len={})",
                chunk,
                chunk.len()
            );
        }

        // With overlap, total chunk bytes should cover the original length.
        let total_len: usize = result.chunks.iter().map(|c| c.len()).sum();
        assert!(
            total_len >= result.original_length,
            "Total chunk length {} < original {}",
            total_len,
            result.original_length
        );
    }

    // Chunks should end on sentence punctuation when a boundary exists;
    // only the final chunk may end mid-sentence.
    #[test]
    fn test_sentence_boundary_respected() {
        let config = ChunkConfig {
            chunk_size: 50,
            overlap: 10,
            min_chunk_size: 20,
        };

        let text = "First sentence here. Second sentence follows. Third sentence ends.";
        let result = chunk_text(text, &config);

        for chunk in &result.chunks {
            let trimmed = chunk.trim();
            if !trimmed.is_empty() && result.chunks.len() > 1 {
                let last_char = trimmed.chars().last().unwrap();
                assert!(
                    last_char == '.'
                        || last_char == '!'
                        || last_char == '?'
                        || chunk == result.chunks.last().unwrap(),
                    "Chunk '{chunk}' doesn't end at sentence boundary"
                );
            }
        }
    }

    // Adjacent chunks should share words thanks to the overlap window.
    #[test]
    fn test_overlap_exists() {
        let config = ChunkConfig {
            chunk_size: 60,
            overlap: 20,
            min_chunk_size: 20,
        };

        let text = "AAAA BBBB CCCC DDDD EEEE FFFF GGGG HHHH IIII JJJJ KKKK LLLL MMMM";
        let result = chunk_text(text, &config);

        if result.chunks.len() >= 2 {
            for i in 0..result.chunks.len() - 1 {
                let chunk1 = &result.chunks[i];
                let chunk2 = &result.chunks[i + 1];

                let words1: std::collections::HashSet<_> = chunk1.split_whitespace().collect();
                let words2: std::collections::HashSet<_> = chunk2.split_whitespace().collect();
                let common: Vec<_> = words1.intersection(&words2).collect();

                assert!(
                    !common.is_empty() || chunk1.len() < config.overlap,
                    "No overlap between chunks {} and {}",
                    i,
                    i + 1
                );
            }
        }
    }

    // Spot-checks of the word/punctuation token heuristic.
    #[test]
    fn test_token_estimation() {
        assert_eq!(estimate_tokens(""), 0);

        assert_eq!(estimate_tokens("test"), 2);

        assert_eq!(estimate_tokens("hello world"), 3);

        assert_eq!(estimate_tokens("Hello, world! How are you?"), 8);

        let code = "fn main() { println!(\"hello\"); }";
        let tokens = estimate_tokens(code);
        assert!(tokens >= 5 && tokens <= 15, "Code tokens: {}", tokens);

        assert_eq!(estimate_tokens("abcdefgh"), 2);
    }

    // A ~10 KB input should yield many chunks, none far above chunk_size.
    #[test]
    fn test_very_long_content() {
        let config = ChunkConfig::default();

        let long_text = "This is a test sentence. ".repeat(400);
        let result = chunk_text(&long_text, &config);

        assert!(result.was_chunked);
        assert!(result.chunks.len() > 10);
        assert_eq!(result.coverage_ratio(), 1.0);

        for chunk in &result.chunks {
            // Allow slack over chunk_size for break-point adjustment.
            assert!(
                chunk.len() <= config.chunk_size + 100,
                "Chunk too large: {} chars",
                chunk.len()
            );
        }
    }

    // Unique markers at the start, middle, and end of a long document must
    // each survive into at least one chunk (i.e. remain searchable).
    #[test]
    fn test_chunking_quality_unique_content_searchable() {
        let config = ChunkConfig::default();

        let beginning = "ALPHA_BEGINNING_MARKER is a unique identifier at the start.";
        let middle_padding = "This is filler content to push things apart. ".repeat(30);
        let middle = "BETA_MIDDLE_MARKER represents content in the center of the document.";
        let end_padding = "More filler content for separation between sections. ".repeat(30);
        let end = "GAMMA_END_MARKER signifies the conclusion of this memory content.";

        let full_text = format!("{beginning} {middle_padding} {middle} {end_padding} {end}");

        let result = chunk_text(&full_text, &config);

        assert!(result.was_chunked, "Content should require chunking");
        assert!(result.chunks.len() >= 3, "Should have multiple chunks");

        let has_alpha = result.chunks.iter().any(|c| c.contains("ALPHA_BEGINNING"));
        let has_beta = result.chunks.iter().any(|c| c.contains("BETA_MIDDLE"));
        let has_gamma = result.chunks.iter().any(|c| c.contains("GAMMA_END"));

        assert!(has_alpha, "ALPHA marker (beginning) not found in any chunk");
        assert!(has_beta, "BETA marker (middle) not found in any chunk");
        assert!(has_gamma, "GAMMA marker (end) not found in any chunk");

        // Diagnostic output (visible with `cargo test -- --nocapture`).
        println!("Total chunks: {}", result.chunks.len());
        println!("Original length: {} chars", result.original_length);
        for (i, chunk) in result.chunks.iter().enumerate() {
            let markers: Vec<&str> = vec![
                if chunk.contains("ALPHA") { "ALPHA" } else { "" },
                if chunk.contains("BETA") { "BETA" } else { "" },
                if chunk.contains("GAMMA") { "GAMMA" } else { "" },
            ]
            .into_iter()
            .filter(|m| !m.is_empty())
            .collect();
            println!(
                " Chunk {}: {} chars {}",
                i,
                chunk.len(),
                if markers.is_empty() {
                    String::new()
                } else {
                    format!("[contains: {}]", markers.join(", "))
                }
            );
        }
    }

    // Every numbered sentence must land in at least one chunk — no gaps.
    #[test]
    fn test_chunking_coverage_no_content_lost() {
        let config = ChunkConfig {
            chunk_size: 200,
            overlap: 50,
            min_chunk_size: 50,
        };

        let sentences: Vec<String> = (1..=20)
            .map(|i| format!("Sentence number {i} contains unique information. "))
            .collect();
        let text = sentences.join("");

        let result = chunk_text(&text, &config);

        for i in 1..=20 {
            let marker = format!("number {i}");
            let found = result.chunks.iter().any(|c| c.contains(&marker));
            assert!(
                found,
                "Sentence {i} not found in any chunk! Coverage gap detected."
            );
        }
    }
}