use crate::document::ContentType;

/// Configuration for splitting document content into chunks.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Content shorter than this (in characters) is returned as a single chunk.
    pub min_chunk_threshold: usize,
    /// Maximum size of an individual chunk, in bytes.
    pub max_chunk_size: usize,
    /// Chunks smaller than this are merged into the preceding chunk.
    pub min_chunk_size: usize,
    /// Overlap carried over between consecutive chunks, in bytes.
    pub chunk_overlap: usize,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            min_chunk_threshold: 1000,
            max_chunk_size: 800,
            min_chunk_size: 200,
            chunk_overlap: 100,
        }
    }
}

impl ChunkingConfig {
    /// Returns the configuration with `max_chunk_size` set to `size`.
    pub fn with_chunk_size(mut self, size: usize) -> Self {
        self.max_chunk_size = size;
        self
    }
}

/// A single chunk of content with its byte offsets into the original document.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    pub content: String,
    pub start_offset: usize,
    pub end_offset: usize,
    /// Optional context, such as a markdown header path or a JSON/YAML key path.
    pub context: Option<String>,
    /// True when the chunk begins at a structural boundary (header, declaration, top-level key).
    pub is_boundary: bool,
}

impl ChunkResult {
    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
        Self {
            content,
            start_offset,
            end_offset,
            context: None,
            is_boundary: false,
        }
    }

    fn with_context(mut self, context: Option<String>) -> Self {
        self.context = context;
        self
    }

    fn with_boundary(mut self, is_boundary: bool) -> Self {
        self.is_boundary = is_boundary;
        self
    }
}

/// Split `content` into chunks according to its [`ContentType`] and the given config.
///
/// Content below `min_chunk_threshold` characters is returned unchanged as a single chunk.
pub fn chunk_content(
    content: &str,
    content_type: ContentType,
    config: &ChunkingConfig,
) -> Vec<ChunkResult> {
    let char_count = content.chars().count();
    if char_count < config.min_chunk_threshold {
        return vec![ChunkResult::new(content.to_string(), 0, content.len())];
    }

    let chunks = match content_type {
        ContentType::Markdown => chunk_markdown(content, config),
        ContentType::Json => chunk_json(content, config),
        ContentType::Yaml => chunk_yaml(content, config),
        ContentType::Code => chunk_code(content, config),
        ContentType::Text => chunk_text(content, config),
    };

    let chunks = enforce_max_size(chunks, config);

    merge_small_chunks(chunks, config.min_chunk_size)
}

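// Usage sketch (illustrative only; `document_text` stands in for the caller's own data):
//
//     let config = ChunkingConfig::default().with_chunk_size(512);
//     let chunks = chunk_content(&document_text, ContentType::Markdown, &config);
//     for chunk in &chunks {
//         // `context` carries e.g. the markdown header path ("# Main > ## Sub").
//         println!("{:?} ({} bytes)", chunk.context, chunk.content.len());
//     }
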
/// Split text at sentence terminators (ASCII and full-width CJK) that are followed
/// by whitespace or the end of input.
fn split_at_sentences(text: &str) -> Vec<&str> {
    let mut sentences = Vec::new();
    let mut start = 0;
    let mut char_indices = text.char_indices().peekable();

    while let Some((i, ch)) = char_indices.next() {
        if matches!(ch, '.' | '?' | '!' | '。' | '?' | '!') {
            let end = i + ch.len_utf8();
            let at_end_or_ws = match char_indices.peek() {
                None => true,
                Some(&(_, next_ch)) => next_ch == ' ' || next_ch == '\n' || next_ch == '\t',
            };
            if at_end_or_ws {
                if start < end {
                    sentences.push(&text[start..end]);
                }
                while let Some(&(_, next_ch)) = char_indices.peek() {
                    if next_ch == ' ' || next_ch == '\n' || next_ch == '\t' {
                        char_indices.next();
                    } else {
                        break;
                    }
                }
                start = match char_indices.peek() {
                    Some(&(idx, _)) => idx,
                    None => text.len(),
                };
            }
        }
    }

    if start < text.len() {
        sentences.push(&text[start..]);
    }

    if sentences.is_empty() && !text.is_empty() {
        sentences.push(text);
    }

    sentences
}

/// Recursively split oversized text, preferring paragraph breaks, then sentence
/// boundaries, then raw character positions.
fn recursive_split(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    if text.len() <= max_size {
        return vec![
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        ];
    }

    let mut chunks = Vec::new();

    let paragraphs: Vec<&str> = text.split("\n\n").collect();
    if paragraphs.len() > 1 {
        let mut current_chunk = String::new();
        let mut chunk_start = offset;
        let mut current_pos = offset;

        for (i, para) in paragraphs.iter().enumerate() {
            let sep = if i > 0 { "\n\n" } else { "" };
            let para_with_sep = format!("{}{}", sep, para);

            if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
                chunks.push(
                    ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                        .with_context(context.clone()),
                );

                let overlap_text = get_overlap_text(&current_chunk, overlap);
                chunk_start = current_pos - overlap_text.len();
                current_chunk = overlap_text;
            }

            current_chunk.push_str(&para_with_sep);
            current_pos += para_with_sep.len();
        }

        if !current_chunk.is_empty() {
            if current_chunk.len() > max_size {
                chunks.extend(split_by_sentences(
                    &current_chunk,
                    max_size,
                    chunk_start,
                    context.clone(),
                    overlap,
                ));
            } else {
                chunks.push(
                    ChunkResult::new(current_chunk, chunk_start, current_pos)
                        .with_context(context.clone()),
                );
            }
        }

        return chunks;
    }

    split_by_sentences(text, max_size, offset, context, overlap)
}

/// Pack whole sentences into chunks of at most `max_size` bytes, carrying overlap
/// between consecutive chunks.
fn split_by_sentences(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let sentences = split_at_sentences(text);

    if sentences.len() <= 1 {
        return split_by_chars(text, max_size, offset, context, overlap);
    }

    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = offset;
    let mut current_pos = offset;

    for sentence in sentences {
        let sep = if !current_chunk.is_empty() { " " } else { "" };
        let sentence_with_sep = format!("{}{}", sep, sentence);

        if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context.clone()),
            );

            let overlap_text = get_overlap_text(&current_chunk, overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&sentence_with_sep);
        current_pos += sentence_with_sep.len();
    }

    if !current_chunk.is_empty() {
        if current_chunk.len() > max_size {
            chunks.extend(split_by_chars(
                &current_chunk,
                max_size,
                chunk_start,
                context.clone(),
                overlap,
            ));
        } else {
            chunks.push(
                ChunkResult::new(current_chunk, chunk_start, current_pos)
                    .with_context(context.clone()),
            );
        }
    }

    chunks
}

/// Last-resort splitter: cut at word boundaries where possible, always respecting
/// UTF-8 character boundaries so slicing never panics.
fn split_by_chars(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let bytes = text.as_bytes();
    let mut start = 0;

    let effective_overlap = overlap.min(max_size / 2);

    while start < text.len() {
        let end = (start + max_size).min(text.len());

        let actual_end = if end < text.len() {
            find_word_boundary_bytes(bytes, start, end)
        } else {
            end
        };

        let actual_end = if actual_end <= start {
            (start + max_size).min(text.len())
        } else {
            actual_end
        };

        let actual_end = {
            let mut e = actual_end;
            while e < text.len() && !text.is_char_boundary(e) {
                e += 1;
            }
            e
        };

        chunks.push(
            ChunkResult::new(
                text[start..actual_end].to_string(),
                offset + start,
                offset + actual_end,
            )
            .with_context(context.clone()),
        );

        let next_start = actual_end.saturating_sub(effective_overlap);
        let next_start = {
            let mut s = next_start;
            while s < text.len() && !text.is_char_boundary(s) {
                s += 1;
            }
            s
        };
        start = if next_start <= start {
            actual_end
        } else {
            next_start
        };
    }

    if chunks.is_empty() && !text.is_empty() {
        chunks.push(
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        );
    }

    chunks
}

/// Scan backwards (up to 50 bytes) from `target` for a space or newline and return
/// the index just after it; fall back to `target` if none is found.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let search_start = target.saturating_sub(50).max(start);
    for i in (search_start..target).rev() {
        if bytes[i] == b' ' || bytes[i] == b'\n' {
            return i + 1;
        }
    }
    target
}

/// Return the tail of `text` to carry into the next chunk as overlap, preferring to
/// start the overlap just after a newline, period, or space.
fn get_overlap_text(text: &str, overlap: usize) -> String {
    if text.len() <= overlap {
        return text.to_string();
    }

    let mut actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
    // The fallback position may land inside a multibyte character; nudge forward to a
    // char boundary so the slice below cannot panic.
    while actual_start < text.len() && !text.is_char_boundary(actual_start) {
        actual_start += 1;
    }
    text[actual_start..].to_string()
}

fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    if bytes.len() <= target_overlap {
        return 0;
    }

    let start_search = bytes.len().saturating_sub(target_overlap + 50);
    let end_search = bytes
        .len()
        .saturating_sub(target_overlap.saturating_sub(50));

    for i in (start_search..end_search).rev() {
        if bytes[i] == b'\n' || bytes[i] == b'.' || bytes[i] == b' ' {
            return i + 1;
        }
    }

    bytes.len().saturating_sub(target_overlap)
}

/// Re-split any chunk that still exceeds `max_chunk_size` after type-specific chunking.
fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut result = Vec::new();

    for chunk in chunks {
        if chunk.content.len() > config.max_chunk_size {
            result.extend(recursive_split(
                &chunk.content,
                config.max_chunk_size,
                chunk.start_offset,
                chunk.context,
                config.chunk_overlap,
            ));
        } else {
            result.push(chunk);
        }
    }

    result
}

/// Merge chunks smaller than `min_size` into the previous chunk, unless either side
/// marks a structural boundary.
fn merge_small_chunks(chunks: Vec<ChunkResult>, min_size: usize) -> Vec<ChunkResult> {
    if chunks.is_empty() {
        return chunks;
    }

    let mut result: Vec<ChunkResult> = Vec::new();

    for chunk in chunks {
        if chunk.content.len() >= min_size || chunk.is_boundary {
            result.push(chunk);
        } else if let Some(last) = result.last_mut() {
            if !last.is_boundary {
                last.content.push_str("\n\n");
                last.content.push_str(&chunk.content);
                last.end_offset = chunk.end_offset;
                if chunk.context.is_some() {
                    last.context = chunk.context;
                }
            } else {
                result.push(chunk);
            }
        } else {
            result.push(chunk);
        }
    }

    result
}

/// A markdown section delimited by headers, with the full header path leading to it.
struct MarkdownSection {
    header_path: Vec<String>,
    content: String,
    start_offset: usize,
    end_offset: usize,
}

fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    let mut current_section = String::new();
    let mut section_start = 0;
    let mut current_pos = 0;
    let mut header_stack: Vec<(usize, String)> = Vec::new();
    let lines: Vec<&str> = content.lines().collect();

    for line in lines.iter() {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        if let Some(level) = get_header_level(line) {
            if !current_section.is_empty() {
                sections.push(MarkdownSection {
                    header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
                    content: current_section.clone(),
                    start_offset: section_start,
                    end_offset: current_pos,
                });
            }

            // Pop headers at the same or deeper level before pushing the new one.
            while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
                header_stack.pop();
            }
            header_stack.push((level, line.to_string()));

            current_section = line_with_newline.trim_start_matches('\n').to_string();
            section_start = current_pos;
        } else {
            current_section.push_str(&line_with_newline);
        }

        current_pos += line_with_newline.len();
    }

    if !current_section.is_empty() {
        sections.push(MarkdownSection {
            header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
            content: current_section,
            start_offset: section_start,
            end_offset: content.len(),
        });
    }

    if sections.is_empty() {
        sections.push(MarkdownSection {
            header_path: vec![],
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
        });
    }

    sections
}

/// Return the ATX header level (1..=6) if the line is a markdown header followed by a space.
fn get_header_level(line: &str) -> Option<usize> {
    let trimmed = line.trim_start();
    if !trimmed.starts_with('#') {
        return None;
    }

    let level = trimmed.chars().take_while(|&c| c == '#').count();
    if level > 0 && level <= 6 {
        if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
            return Some(level);
        }
    }
    None
}

fn format_header_path(path: &[String]) -> Option<String> {
    if path.is_empty() {
        return None;
    }
    Some(path.join(" > "))
}

/// Chunk markdown by header sections; oversized sections are re-split later by
/// `enforce_max_size`.
fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let sections = parse_markdown_sections(content);
    let mut chunks = Vec::new();

    for section in sections {
        let context = format_header_path(&section.header_path);
        let is_boundary = !section.header_path.is_empty();

        chunks.push(
            ChunkResult::new(section.content, section.start_offset, section.end_offset)
                .with_context(context)
                .with_boundary(is_boundary),
        );
    }

    chunks
}

/// Chunk plain text by paragraphs, carrying overlap between chunks.
fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;

    let paragraphs: Vec<&str> = content.split("\n\n").collect();

    for (i, para) in paragraphs.iter().enumerate() {
        let sep = if i > 0 { "\n\n" } else { "" };
        let para_with_sep = format!("{}{}", sep, para);

        if !current_chunk.is_empty()
            && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
        {
            chunks.push(ChunkResult::new(
                current_chunk.clone(),
                chunk_start,
                current_pos,
            ));

            let overlap_text = get_overlap_text(&current_chunk, config.chunk_overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&para_with_sep);
        current_pos += para_with_sep.len();
    }

    if !current_chunk.is_empty() {
        chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

/// Line prefixes that usually start a new top-level item in common languages.
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "const ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript / TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    "const ",
    "let ",
    "var ",
    "interface ",
    // C / C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];

/// Derive a short context string from a declaration line (function signature or type name).
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Function-like declaration: keep everything up to the opening parenthesis.
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        if signature.rfind(' ').is_some() {
            return format!("{}...", &trimmed[..paren_pos.min(60)]);
        }
    }

    // Type-like declaration: keep the keyword plus the identifier.
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            return format!("{}{}", keyword, &rest[..name_end.min(50)]);
        }
    }

    trimmed.chars().take(60).collect()
}

fn is_code_boundary(line: &str) -> bool {
    let trimmed = line.trim_start();
    CODE_BOUNDARY_PATTERNS
        .iter()
        .any(|p| trimmed.starts_with(p))
}

/// Chunk source code at declaration boundaries, keeping the enclosing declaration as context.
fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut current_context: Option<String> = None;
    let mut is_at_boundary = false;

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let boundary = is_code_boundary(line);

        if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(current_context.clone())
                    .with_boundary(is_at_boundary),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
            is_at_boundary = true;
        }

        if boundary {
            current_context = Some(extract_code_context(line));
            is_at_boundary = true;
        }

        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    if !current_chunk.is_empty() {
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(current_context)
                .with_boundary(is_at_boundary),
        );
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

/// Chunk JSON by keys when it parses; fall back to plain-text chunking otherwise.
fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
    if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
        let chunks = chunk_json_value(&value, config, vec![]);
        if !chunks.is_empty() {
            return chunks;
        }
    }

    chunk_text(content, config)
}

/// Chunk a parsed JSON value, emitting dotted key paths (e.g. "users.profile") as context.
///
/// Note: offsets here are relative to the re-serialized value, not the original document.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < entries.len() - 1 {
                    format!("  \"{}\": {},\n", key, val_str)
                } else {
                    format!("  \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // An oversized entry: flush the accumulated chunk, then recurse into the entry.
                if entry.len() > config.max_chunk_size {
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let len = current_chunk.len();
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, len)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!("  {},\n", val_str)
                } else {
                    format!("  {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let len = content.len();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, len)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}

/// Chunk YAML at top-level keys, tracking the key path (e.g. "server.host") as context.
fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut key_stack: Vec<(usize, String)> = Vec::new();
    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let indent = line.len() - line.trim_start().len();
        let trimmed = line.trim();

        // A "key line" is `key: ...` that is not a list item, comment, quoted scalar,
        // flow collection, or URL.
        let is_key_line = !trimmed.starts_with('-')
            && !trimmed.starts_with('#')
            && !trimmed.starts_with('"')
            && !trimmed.starts_with('\'')
            && !trimmed.starts_with('{')
            && !trimmed.starts_with('[')
            && trimmed.contains(':')
            && !trimmed.contains("://");

        if is_key_line {
            if let Some(key) = trimmed.split(':').next() {
                let key = key.trim().to_string();

                while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
                    key_stack.pop();
                }
                key_stack.push((indent, key));
            }
        }

        let is_top_level_key = indent == 0 && is_key_line;

        if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
            let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context)
                    .with_boundary(true),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
        }

        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    if !current_chunk.is_empty() {
        let context = format_yaml_path(&key_stack);
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(context)
                .with_boundary(!key_stack.is_empty()),
        );
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    if stack.is_empty() {
        return None;
    }
    Some(
        stack
            .iter()
            .map(|(_, k)| k.as_str())
            .collect::<Vec<_>>()
            .join("."),
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_small_content_no_chunking() {
        let content = "Small content";
        let config = ChunkingConfig::default();
        let chunks = chunk_content(content, ContentType::Text, &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, content);
    }

    #[test]
    fn test_text_chunking() {
        let content = "a".repeat(2000);
        let config = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let chunks = chunk_content(&content, ContentType::Text, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // Allow some slack for the carried-over overlap text.
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    #[test]
    fn test_markdown_splits_by_headers_first() {
        let content = "# H1\nShort content here.\n\n# H2\nAlso short content.";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].content.contains("# H1"));
        assert!(chunks[1].content.contains("# H2"));
    }

    #[test]
    fn test_large_section_gets_subsplit() {
        let long_paragraph = "This is a long sentence. ".repeat(100);
        let content = format!("# Header\n\n{}", long_paragraph);
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    #[test]
    fn test_small_chunks_merged() {
        let content = "# A\nx\n\n# B\ny\n\n# C\nz";
        let config = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        assert!(chunks.len() <= 3);
    }

    #[test]
    fn test_header_path_context() {
        let content = "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        let detail_chunk = chunks.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail_chunk.is_some());
        let ctx = detail_chunk.unwrap().context.as_ref().unwrap();
        assert!(ctx.contains("# Main"));
        assert!(ctx.contains("## Sub"));
        assert!(ctx.contains("### Detail"));
    }

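    #[test]
    fn test_get_header_level() {
        // Added test (sketch): only runs of 1..=6 `#` followed by a space count as headers.
        assert_eq!(get_header_level("# Title"), Some(1));
        assert_eq!(get_header_level("### Sub"), Some(3));
        assert_eq!(get_header_level("#NoSpace"), None);
        assert_eq!(get_header_level("####### Too deep"), None);
        assert_eq!(get_header_level("plain text"), None);
    }
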
    #[test]
    fn test_markdown_chunking_preserves_context() {
        let content = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.context.is_some()));
    }

    #[test]
    fn test_code_chunking() {
        let content = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            " // code\n".repeat(50),
            " // more code\n".repeat(50)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Code, &config);

        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_code_boundary_patterns() {
        let patterns = [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ];

        for pattern in patterns {
            assert!(
                is_code_boundary(pattern),
                "Pattern '{}' should be recognized as boundary",
                pattern
            );
        }
    }

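    #[test]
    fn test_extract_code_context() {
        // Added test (sketch): function-like lines are truncated at the opening paren,
        // type-like declarations keep the keyword plus the identifier.
        assert_eq!(
            extract_code_context("pub fn foo(bar: usize) -> usize {"),
            "pub fn foo..."
        );
        assert_eq!(extract_code_context("struct Config {"), "struct Config");
    }
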
    #[test]
    fn test_json_chunking() {
        let content = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_json_nested_path_context() {
        let content = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_sentence_splitting() {
        let text = "First sentence. Second sentence? Third sentence! Fourth.";
        let sentences = split_at_sentences(text);

        assert!(sentences.len() >= 3);
        assert!(sentences[0].contains("First"));
    }

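    #[test]
    fn test_sentence_splitting_cjk() {
        // Added test (sketch): full-width CJK terminators also end sentences when
        // followed by whitespace or end of input.
        let text = "你好。 世界!";
        let sentences = split_at_sentences(text);
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[0], "你好。");
        assert_eq!(sentences[1], "世界!");
    }
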
    #[test]
    fn test_yaml_chunking_with_path() {
        let content = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Yaml, &config);

        assert!(!chunks.is_empty());
    }

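    #[test]
    fn test_format_yaml_path() {
        // Added test (sketch): the YAML key stack joins into a dotted path.
        let stack = vec![(0usize, "server".to_string()), (2usize, "host".to_string())];
        assert_eq!(format_yaml_path(&stack), Some("server.host".to_string()));
        assert_eq!(format_yaml_path(&[]), None);
    }
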
    #[test]
    fn test_chunk_content_threshold_uses_chars_not_bytes() {
        // 999 four-byte characters: under the 1000-character threshold even though
        // the byte length (3996) is well above it.
        let content: String = "😀".repeat(999);
        assert_eq!(content.chars().count(), 999);
        assert_eq!(content.len(), 3996);

        let config = ChunkingConfig::default();
        let chunks = chunk_content(&content, ContentType::Text, &config);
        assert_eq!(
            chunks.len(),
            1,
            "Should not chunk content below char threshold"
        );
    }

    #[test]
    fn test_split_by_chars_multibyte_no_panic() {
        // 2000 three-byte CJK characters; splitting must never cut inside a character.
        let content: String = "日".repeat(2000);
        let chunks = split_by_chars(&content, 500, 0, None, 50);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_split_by_chars_mixed_multibyte() {
        let content = "Hello 世界! ".repeat(200);
        let chunks = split_by_chars(&content, 100, 0, None, 20);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_recursive_split_preserves_context() {
        let long_text = "This is a sentence. ".repeat(100);
        let chunks = recursive_split(&long_text, 200, 0, Some("test context".to_string()), 20);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(
                chunk
                    .context
                    .as_ref()
                    .map(|c| c == "test context")
                    .unwrap_or(false)
            );
        }
    }
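
    #[test]
    fn test_find_word_boundary_bytes() {
        // Added test (sketch): the search backs up (at most 50 bytes) to just after
        // the nearest space or newline.
        let bytes: &[u8] = b"hello world foo";
        assert_eq!(find_word_boundary_bytes(bytes, 0, 13), 12);
    }

    #[test]
    fn test_get_overlap_text_is_suffix() {
        // Added test (sketch): the overlap carried into the next chunk is always a
        // suffix of the input text.
        let text = "hello world this is some overlap text";
        let overlap = get_overlap_text(text, 10);
        assert!(text.ends_with(overlap.as_str()));
        assert!(overlap.len() <= text.len());
    }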
}