use crate::document::ContentType;

/// Tuning parameters for the chunking pipeline.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Content shorter than this many characters is returned as a single chunk.
    pub min_chunk_threshold: usize,
    /// Upper bound (in bytes) on the size of a single chunk.
    pub max_chunk_size: usize,
    /// Chunks smaller than this are merged into the previous chunk when possible.
    pub min_chunk_size: usize,
    /// Approximate number of bytes carried over between consecutive chunks.
    pub chunk_overlap: usize,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        Self {
            min_chunk_threshold: 1000,
            max_chunk_size: 800,
            min_chunk_size: 200,
            chunk_overlap: 100,
        }
    }
}

/// A single chunk of content together with its position and structural context.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// The chunk text.
    pub content: String,
    /// Start offset (bytes) of the chunk within the source content.
    pub start_offset: usize,
    /// End offset (bytes) of the chunk within the source content.
    pub end_offset: usize,
    /// Structural context such as a markdown header path or a JSON/YAML key path.
    pub context: Option<String>,
    /// Whether the chunk starts at a structural boundary (header, code item, top-level key).
    pub is_boundary: bool,
}

impl ChunkResult {
    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
        Self {
            content,
            start_offset,
            end_offset,
            context: None,
            is_boundary: false,
        }
    }

    fn with_context(mut self, context: Option<String>) -> Self {
        self.context = context;
        self
    }

    fn with_boundary(mut self, is_boundary: bool) -> Self {
        self.is_boundary = is_boundary;
        self
    }
}

/// Splits `content` into chunks according to its [`ContentType`].
///
/// Content below `min_chunk_threshold` characters is returned as a single chunk.
/// Otherwise the type-specific chunker runs first, oversized chunks are split
/// recursively, and undersized chunks are merged with their neighbours.
pub fn chunk_content(
    content: &str,
    content_type: ContentType,
    config: &ChunkingConfig,
) -> Vec<ChunkResult> {
    let char_count = content.chars().count();
    if char_count < config.min_chunk_threshold {
        return vec![ChunkResult::new(content.to_string(), 0, content.len())];
    }

    let chunks = match content_type {
        ContentType::Markdown => chunk_markdown(content, config),
        ContentType::Json => chunk_json(content, config),
        ContentType::Yaml => chunk_yaml(content, config),
        ContentType::Code => chunk_code(content, config),
        ContentType::Text => chunk_text(content, config),
    };

    let chunks = enforce_max_size(chunks, config);

    merge_small_chunks(chunks, config.min_chunk_size, config.max_chunk_size)
}

/// Splits text into sentences at terminal punctuation (including CJK full-width
/// terminators) that is followed by whitespace or the end of the input.
fn split_at_sentences(text: &str) -> Vec<&str> {
    let mut sentences = Vec::new();
    let mut start = 0;
    let mut char_indices = text.char_indices().peekable();

    while let Some((i, ch)) = char_indices.next() {
        if matches!(ch, '.' | '?' | '!' | '。' | '?' | '!') {
            let end = i + ch.len_utf8();
            let at_end_or_ws = match char_indices.peek() {
                None => true,
                Some(&(_, next_ch)) => next_ch == ' ' || next_ch == '\n' || next_ch == '\t',
            };
            if at_end_or_ws {
                if start < end {
                    sentences.push(&text[start..end]);
                }
                // Skip the whitespace run after the terminator so the next
                // sentence starts on its first non-whitespace character.
                while let Some(&(_, next_ch)) = char_indices.peek() {
                    if next_ch == ' ' || next_ch == '\n' || next_ch == '\t' {
                        char_indices.next();
                    } else {
                        break;
                    }
                }
                start = match char_indices.peek() {
                    Some(&(idx, _)) => idx,
                    None => text.len(),
                };
            }
        }
    }

    if start < text.len() {
        sentences.push(&text[start..]);
    }

    if sentences.is_empty() && !text.is_empty() {
        sentences.push(text);
    }

    sentences
}

/// Recursively splits oversized text: first on blank-line paragraph breaks, then
/// by sentences, and finally by characters, carrying `overlap` bytes of context
/// between consecutive chunks.
fn recursive_split(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    if text.len() <= max_size {
        return vec![
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        ];
    }

    let mut chunks = Vec::new();

    let paragraphs: Vec<&str> = text.split("\n\n").collect();
    if paragraphs.len() > 1 {
        let mut current_chunk = String::new();
        let mut chunk_start = offset;
        let mut current_pos = offset;

        for (i, para) in paragraphs.iter().enumerate() {
            let sep = if i > 0 { "\n\n" } else { "" };
            let para_with_sep = format!("{}{}", sep, para);

            if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
                chunks.push(
                    ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                        .with_context(context.clone()),
                );

                let overlap_text = get_overlap_text(&current_chunk, overlap);
                chunk_start = current_pos - overlap_text.len();
                current_chunk = overlap_text;
            }

            current_chunk.push_str(&para_with_sep);
            current_pos += para_with_sep.len();
        }

        if !current_chunk.is_empty() {
            if current_chunk.len() > max_size {
                chunks.extend(split_by_sentences(
                    &current_chunk,
                    max_size,
                    chunk_start,
                    context.clone(),
                    overlap,
                ));
            } else {
                chunks.push(
                    ChunkResult::new(current_chunk, chunk_start, current_pos)
                        .with_context(context.clone()),
                );
            }
        }

        return chunks;
    }

    split_by_sentences(text, max_size, offset, context, overlap)
}

/// Splits text into chunks of at most `max_size` bytes along sentence boundaries,
/// falling back to character splitting when no sentence structure is found.
fn split_by_sentences(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let sentences = split_at_sentences(text);

    if sentences.len() <= 1 {
        return split_by_chars(text, max_size, offset, context, overlap);
    }

    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = offset;
    let mut current_pos = offset;

    for sentence in sentences {
        let sep = if !current_chunk.is_empty() { " " } else { "" };
        let sentence_with_sep = format!("{}{}", sep, sentence);

        if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context.clone()),
            );

            let overlap_text = get_overlap_text(&current_chunk, overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&sentence_with_sep);
        current_pos += sentence_with_sep.len();
    }

    if !current_chunk.is_empty() {
        if current_chunk.len() > max_size {
            chunks.extend(split_by_chars(
                &current_chunk,
                max_size,
                chunk_start,
                context.clone(),
                overlap,
            ));
        } else {
            chunks.push(
                ChunkResult::new(current_chunk, chunk_start, current_pos)
                    .with_context(context.clone()),
            );
        }
    }

    chunks
}

/// Last-resort splitter: cuts the text into `max_size`-byte windows, preferring
/// word boundaries and always snapping forward to a UTF-8 character boundary.
fn split_by_chars(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let bytes = text.as_bytes();
    let mut start = 0;

    let effective_overlap = overlap.min(max_size / 2);

    while start < text.len() {
        let end = (start + max_size).min(text.len());

        // Prefer to cut at a word boundary unless we are already at the end.
        let actual_end = if end < text.len() {
            find_word_boundary_bytes(bytes, start, end)
        } else {
            end
        };

        // Never move backwards; fall back to a hard cut at max_size.
        let actual_end = if actual_end <= start {
            (start + max_size).min(text.len())
        } else {
            actual_end
        };

        // Snap forward to a char boundary so slicing never panics on multi-byte text.
        let actual_end = {
            let mut e = actual_end;
            while e < text.len() && !text.is_char_boundary(e) {
                e += 1;
            }
            e
        };

        chunks.push(
            ChunkResult::new(
                text[start..actual_end].to_string(),
                offset + start,
                offset + actual_end,
            )
            .with_context(context.clone()),
        );

        // Step the window forward, keeping `effective_overlap` bytes of context,
        // again snapped forward to a char boundary.
        let next_start = actual_end.saturating_sub(effective_overlap);
        let next_start = {
            let mut s = next_start;
            while s < text.len() && !text.is_char_boundary(s) {
                s += 1;
            }
            s
        };
        start = if next_start <= start {
            actual_end
        } else {
            next_start
        };
    }

    if chunks.is_empty() && !text.is_empty() {
        chunks.push(
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        );
    }

    chunks
}

/// Searches backwards from `target` (at most 50 bytes, never before `start`) for a
/// space or newline and returns the position just after it, or `target` if none is found.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let search_start = target.saturating_sub(50).max(start);
    for i in (search_start..target).rev() {
        if bytes[i] == b' ' || bytes[i] == b'\n' {
            return i + 1;
        }
    }
    target
}

/// Returns the tail of `text` to carry into the next chunk as overlap.
fn get_overlap_text(text: &str, overlap: usize) -> String {
    if text.len() <= overlap {
        return text.to_string();
    }

    let actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
    text[actual_start..].to_string()
}

/// Finds where the overlap tail should start, preferring a newline, period, or space
/// near the requested overlap length.
fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    if bytes.len() <= target_overlap {
        return 0;
    }

    let start_search = bytes.len().saturating_sub(target_overlap + 50);
    let end_search = bytes
        .len()
        .saturating_sub(target_overlap.saturating_sub(50));

    for i in (start_search..end_search).rev() {
        if bytes[i] == b'\n' || bytes[i] == b'.' || bytes[i] == b' ' {
            return i + 1;
        }
    }

    // No natural boundary found: fall back to a hard offset, snapped forward to a
    // UTF-8 character boundary.
    let mut pos = bytes.len().saturating_sub(target_overlap);
    while pos < bytes.len() && bytes[pos] & 0xC0 == 0x80 {
        pos += 1;
    }
    pos
}

/// Recursively splits any chunk that exceeds `max_chunk_size`, preserving its context.
fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut result = Vec::new();

    for chunk in chunks {
        if chunk.content.len() > config.max_chunk_size {
            result.extend(recursive_split(
                &chunk.content,
                config.max_chunk_size,
                chunk.start_offset,
                chunk.context,
                config.chunk_overlap,
            ));
        } else {
            result.push(chunk);
        }
    }

    result
}

/// Merges chunks smaller than `min_size` into the preceding chunk, as long as
/// neither side is a structural boundary and the result stays within `max_size`.
fn merge_small_chunks(
    chunks: Vec<ChunkResult>,
    min_size: usize,
    max_size: usize,
) -> Vec<ChunkResult> {
    if chunks.is_empty() {
        return chunks;
    }

    let mut result: Vec<ChunkResult> = Vec::new();

    for chunk in chunks {
        if chunk.content.len() >= min_size || chunk.is_boundary {
            result.push(chunk);
        } else if let Some(last) = result.last_mut() {
            // The +2 accounts for the "\n\n" separator inserted between merged chunks.
            let merged_len = last.content.len() + 2 + chunk.content.len();
            if !last.is_boundary && merged_len <= max_size {
                last.content.push_str("\n\n");
                last.content.push_str(&chunk.content);
                last.end_offset = chunk.end_offset;
                if chunk.context.is_some() {
                    last.context = chunk.context;
                }
            } else {
                result.push(chunk);
            }
        } else {
            result.push(chunk);
        }
    }

    result
}

/// A markdown section delimited by headers, along with the header path leading to it.
struct MarkdownSection {
    /// Headers on the path from the document root down to this section.
    header_path: Vec<String>,
    /// The section text, including its own header line.
    content: String,
    /// Start offset (bytes) of the section in the source.
    start_offset: usize,
    /// End offset (bytes) of the section in the source.
    end_offset: usize,
}

/// Splits markdown into sections at header lines, tracking the header hierarchy
/// so each section knows its full header path.
fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    let mut current_section = String::new();
    let mut section_start = 0;
    let mut current_pos = 0;
    let mut header_stack: Vec<(usize, String)> = Vec::new();

    let lines: Vec<&str> = content.lines().collect();

    for line in lines.iter() {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        if let Some(level) = get_header_level(line) {
            // A new header closes the current section.
            if !current_section.is_empty() {
                sections.push(MarkdownSection {
                    header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
                    content: current_section.clone(),
                    start_offset: section_start,
                    end_offset: current_pos,
                });
            }

            // Pop headers at the same or a deeper level before pushing this one,
            // so the stack always reflects the path from the root.
            while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
                header_stack.pop();
            }
            header_stack.push((level, line.to_string()));

            current_section = line_with_newline.trim_start_matches('\n').to_string();
            section_start = current_pos;
        } else {
            current_section.push_str(&line_with_newline);
        }

        current_pos += line_with_newline.len();
    }

    if !current_section.is_empty() {
        sections.push(MarkdownSection {
            header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
            content: current_section,
            start_offset: section_start,
            end_offset: content.len(),
        });
    }

    if sections.is_empty() {
        sections.push(MarkdownSection {
            header_path: vec![],
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
        });
    }

    sections
}

/// Returns the ATX header level (1-6) if the line is a markdown header followed by a space.
fn get_header_level(line: &str) -> Option<usize> {
    let trimmed = line.trim_start();
    if !trimmed.starts_with('#') {
        return None;
    }

    let level = trimmed.chars().take_while(|&c| c == '#').count();
    if level > 0 && level <= 6 {
        if trimmed.len() > level && trimmed.chars().nth(level) == Some(' ') {
            return Some(level);
        }
    }
    None
}

/// Joins a header path into a single context string, e.g. "# Main > ## Sub".
fn format_header_path(path: &[String]) -> Option<String> {
    if path.is_empty() {
        return None;
    }
    Some(path.join(" > "))
}

/// Chunks markdown by header sections; oversized sections are split later by
/// [`enforce_max_size`].
fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let sections = parse_markdown_sections(content);
    let mut chunks = Vec::new();

    for section in sections {
        let context = format_header_path(&section.header_path);
        let is_boundary = !section.header_path.is_empty();

        chunks.push(
            ChunkResult::new(section.content, section.start_offset, section.end_offset)
                .with_context(context)
                .with_boundary(is_boundary),
        );
    }

    chunks
}

/// Chunks plain text on paragraph breaks, carrying `chunk_overlap` bytes between chunks.
fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;

    let paragraphs: Vec<&str> = content.split("\n\n").collect();

    for (i, para) in paragraphs.iter().enumerate() {
        let sep = if i > 0 { "\n\n" } else { "" };
        let para_with_sep = format!("{}{}", sep, para);

        if !current_chunk.is_empty()
            && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
        {
            chunks.push(ChunkResult::new(
                current_chunk.clone(),
                chunk_start,
                current_pos,
            ));

            let overlap_text = get_overlap_text(&current_chunk, config.chunk_overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&para_with_sep);
        current_pos += para_with_sep.len();
    }

    if !current_chunk.is_empty() {
        chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

/// Line prefixes that mark the start of a top-level item in common languages.
/// A line beginning with any of these (after leading whitespace) is treated as a
/// chunking boundary by [`chunk_code`].
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "const ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript / TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    "const ",
    "let ",
    "var ",
    "interface ",
    // C / C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];

/// Produces a short context string for a code boundary line: a call signature up to
/// the opening parenthesis, a type name after a declaration keyword, or the first
/// 60 characters of the line.
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Function-like line: keep everything up to the opening parenthesis.
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        if signature.contains(' ') {
            return format!("{}...", &trimmed[..paren_pos.min(60)]);
        }
    }

    // Type-like line: keep the keyword and the name that follows it.
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            return format!("{}{}", keyword, &rest[..name_end.min(50)]);
        }
    }

    trimmed.chars().take(60).collect()
}

/// Returns true if the line starts a new top-level code item.
fn is_code_boundary(line: &str) -> bool {
    let trimmed = line.trim_start();
    CODE_BOUNDARY_PATTERNS
        .iter()
        .any(|p| trimmed.starts_with(p))
}

/// Chunks source code at item boundaries (functions, types, etc.), recording the
/// most recent boundary line as the chunk context.
fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut current_context: Option<String> = None;
    let mut is_at_boundary = false;

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let boundary = is_code_boundary(line);

        // Close the current chunk at a boundary once it has accumulated some content.
        if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(current_context.clone())
                    .with_boundary(is_at_boundary),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
            is_at_boundary = true;
        }

        if boundary {
            current_context = Some(extract_code_context(line));
            is_at_boundary = true;
        }

        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    if !current_chunk.is_empty() {
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(current_context)
                .with_boundary(is_at_boundary),
        );
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

/// Chunks JSON by structure when it parses; otherwise falls back to plain-text chunking.
fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
    if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
        let chunks = chunk_json_value(&value, config, vec![]);
        if !chunks.is_empty() {
            return chunks;
        }
    }

    chunk_text(content, config)
}

/// Recursively chunks a parsed JSON value, grouping object entries and array elements
/// into chunks of at most `max_chunk_size` bytes and using the key path as context.
///
/// Offsets produced here are relative to the re-serialized chunk, not to the
/// original document.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < entries.len() - 1 {
                    format!("  \"{}\": {},\n", key, val_str)
                } else {
                    format!("  \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // An entry that is too large on its own gets recursed into.
                if entry.len() > config.max_chunk_size {
                    // Flush whatever has accumulated so far.
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let len = current_chunk.len();
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, len)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!("  {},\n", val_str)
                } else {
                    format!("  {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            // Scalars become a single chunk with the key path as context.
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let len = content.len();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, len)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}

/// Chunks YAML at top-level keys, tracking the current key path for context.
fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut key_stack: Vec<(usize, String)> = Vec::new();

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let indent = line.len() - line.trim_start().len();
        let trimmed = line.trim();

        // Heuristic: a mapping key line is "key: ..." that is not a list item,
        // comment, quoted scalar, flow collection, or URL.
        let is_key_line = !trimmed.starts_with('-')
            && !trimmed.starts_with('#')
            && !trimmed.starts_with('"')
            && !trimmed.starts_with('\'')
            && !trimmed.starts_with('{')
            && !trimmed.starts_with('[')
            && trimmed.contains(':')
            && !trimmed.contains("://");

        if is_key_line {
            if let Some(key) = trimmed.split(':').next() {
                let key = key.trim().to_string();

                // Pop keys at the same or a deeper indent before pushing this one.
                while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
                    key_stack.pop();
                }
                key_stack.push((indent, key));
            }
        }

        let is_top_level_key = indent == 0 && is_key_line;

        if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
            let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context)
                    .with_boundary(true),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
        }

        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    if !current_chunk.is_empty() {
        let context = format_yaml_path(&key_stack);
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(context)
                .with_boundary(!key_stack.is_empty()),
        );
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}

/// Joins a YAML key stack into a dotted path, e.g. "server.host".
fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    if stack.is_empty() {
        return None;
    }
    Some(
        stack
            .iter()
            .map(|(_, k)| k.as_str())
            .collect::<Vec<_>>()
            .join("."),
    )
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_small_content_no_chunking() {
        let content = "Small content";
        let config = ChunkingConfig::default();
        let chunks = chunk_content(content, ContentType::Text, &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, content);
    }

    #[test]
    fn test_text_chunking() {
        let content = "a".repeat(2000);
        let config = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let chunks = chunk_content(&content, ContentType::Text, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // Allow some slack for the overlap carried into each chunk.
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    #[test]
    fn test_markdown_splits_by_headers_first() {
        let content = "# H1\nShort content here.\n\n# H2\nAlso short content.";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].content.contains("# H1"));
        assert!(chunks[1].content.contains("# H2"));
    }

    #[test]
    fn test_large_section_gets_subsplit() {
        let long_paragraph = "This is a long sentence. ".repeat(100);
        let content = format!("# Header\n\n{}", long_paragraph);
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    #[test]
    fn test_small_chunks_merged() {
        let content = "# A\nx\n\n# B\ny\n\n# C\nz";
        let config = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        // Header sections are boundaries and are kept even when small, so the
        // count never exceeds the number of sections.
        assert!(chunks.len() <= 3);
    }

    #[test]
    fn test_header_path_context() {
        let content = "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        let detail_chunk = chunks.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail_chunk.is_some());
        let ctx = detail_chunk.unwrap().context.as_ref().unwrap();
        assert!(ctx.contains("# Main"));
        assert!(ctx.contains("## Sub"));
        assert!(ctx.contains("### Detail"));
    }

    #[test]
    fn test_markdown_chunking_preserves_context() {
        let content = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.context.is_some()));
    }

    #[test]
    fn test_code_chunking() {
        let content = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            " // code\n".repeat(50),
            " // more code\n".repeat(50)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Code, &config);

        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_code_boundary_patterns() {
        let patterns = [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ];

        for pattern in patterns {
            assert!(
                is_code_boundary(pattern),
                "Pattern '{}' should be recognized as boundary",
                pattern
            );
        }
    }
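
    // Added illustrative sketch (not part of the original test suite): exercises the
    // word-boundary search that `split_by_chars` relies on. The input bytes and the
    // expected index are made up for this example.
    #[test]
    fn test_find_word_boundary_sketch() {
        let bytes = b"hello world foo";
        // Target byte 13 lands inside "foo"; the search walks back to the position
        // just after the space at index 11.
        assert_eq!(find_word_boundary_bytes(bytes, 0, 13), 12);
    }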

    #[test]
    fn test_json_chunking() {
        let content = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_json_nested_path_context() {
        let content = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_sentence_splitting() {
        let text = "First sentence. Second sentence? Third sentence! Fourth.";
        let sentences = split_at_sentences(text);

        assert!(sentences.len() >= 3);
        assert!(sentences[0].contains("First"));
    }

    #[test]
    fn test_yaml_chunking_with_path() {
        let content = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Yaml, &config);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_chunk_content_threshold_uses_chars_not_bytes() {
        // 999 four-byte characters: below the 1000-character threshold even though
        // the byte length is nearly four times larger.
        let content: String = "🚀".repeat(999);
        assert_eq!(content.chars().count(), 999);
        assert_eq!(content.len(), 3996);

        let config = ChunkingConfig::default();
        let chunks = chunk_content(&content, ContentType::Text, &config);
        assert_eq!(
            chunks.len(),
            1,
            "Should not chunk content below char threshold"
        );
    }
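
    // Added illustrative sketch (not part of the original test suite): a config built
    // with struct-update syntax still takes the single-chunk fast path for short
    // input. The literal input string is made up for this example.
    #[test]
    fn test_custom_config_fast_path_sketch() {
        let config = ChunkingConfig {
            max_chunk_size: 300,
            ..ChunkingConfig::default()
        };
        let chunks = chunk_content("short input", ContentType::Markdown, &config);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, "short input".len());
    }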

    #[test]
    fn test_split_by_chars_multibyte_no_panic() {
        // 2000 three-byte characters: naive byte offsets fall inside code points,
        // exercising the char-boundary snapping in split_by_chars.
        let content: String = "日".repeat(2000);
        let chunks = split_by_chars(&content, 500, 0, None, 50);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    #[test]
    fn test_split_by_chars_mixed_multibyte() {
        let content = "Hello 世界! ".repeat(200);
        let chunks = split_by_chars(&content, 100, 0, None, 20);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }
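
    // Added illustrative sketch (not part of the original test suite): with zero
    // overlap, the offsets recorded by the plain-text chunker index straight back
    // into the source string. The two-paragraph input is made up for this example.
    #[test]
    fn test_text_chunk_offsets_sketch() {
        let content = format!("{}\n\n{}", "x".repeat(300), "y".repeat(300));
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 350,
            min_chunk_size: 50,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(&content, ContentType::Text, &config);
        assert!(chunks.len() >= 2);
        for chunk in &chunks {
            assert_eq!(&content[chunk.start_offset..chunk.end_offset], chunk.content);
        }
    }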

    #[test]
    fn test_recursive_split_preserves_context() {
        let long_text = "This is a sentence. ".repeat(100);
        let chunks = recursive_split(&long_text, 200, 0, Some("test context".to_string()), 20);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(
                chunk
                    .context
                    .as_ref()
                    .map(|c| c == "test context")
                    .unwrap_or(false)
            );
        }
    }
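
    // Added illustrative sketch (not part of the original test suite): the overlap
    // helper always returns a suffix of its input, cut at a space, period, or newline
    // when one is available. The sentence used here is made up for this example.
    #[test]
    fn test_get_overlap_text_sketch() {
        let text = "alpha beta gamma delta epsilon";
        let tail = get_overlap_text(text, 10);
        assert!(text.ends_with(tail.as_str()));
        assert!(tail.len() <= text.len());
    }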
}