1use crate::document::ContentType;
10
/// Tuning knobs for the content-chunking pipeline.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Content shorter than this (in bytes) is returned as one chunk.
    pub min_chunk_threshold: usize,
    /// Hard upper bound on a chunk's byte length after splitting.
    pub max_chunk_size: usize,
    /// Chunks smaller than this are merged into their predecessor.
    pub min_chunk_size: usize,
    /// Number of trailing bytes repeated at the start of the next chunk.
    pub chunk_overlap: usize,
}

impl Default for ChunkingConfig {
    fn default() -> Self {
        ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 800,
            min_chunk_size: 200,
            chunk_overlap: 100,
        }
    }
}

impl ChunkingConfig {
    /// Builder-style override for `max_chunk_size`.
    pub fn with_chunk_size(self, size: usize) -> Self {
        Self {
            max_chunk_size: size,
            ..self
        }
    }
}
42
/// One chunk produced by the splitter, with its byte span in the original
/// document and optional structural context.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    pub content: String,
    pub start_offset: usize,
    pub end_offset: usize,
    /// Human-readable breadcrumb (header path, key path, or code signature).
    pub context: Option<String>,
    /// True when the chunk begins at a structural boundary (header, key, ...).
    pub is_boundary: bool,
}

impl ChunkResult {
    /// Creates a plain chunk with no context and no boundary flag.
    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
        ChunkResult {
            content,
            start_offset,
            end_offset,
            context: None,
            is_boundary: false,
        }
    }

    /// Builder-style setter for `context`.
    fn with_context(self, context: Option<String>) -> Self {
        Self { context, ..self }
    }

    /// Builder-style setter for `is_boundary`.
    fn with_boundary(self, is_boundary: bool) -> Self {
        Self { is_boundary, ..self }
    }
}
79
80pub fn chunk_content(
82 content: &str,
83 content_type: ContentType,
84 config: &ChunkingConfig,
85) -> Vec<ChunkResult> {
86 if content.len() < config.min_chunk_threshold {
88 return vec![ChunkResult::new(content.to_string(), 0, content.len())];
89 }
90
91 let chunks = match content_type {
93 ContentType::Markdown => chunk_markdown(content, config),
94 ContentType::Json => chunk_json(content, config),
95 ContentType::Yaml => chunk_yaml(content, config),
96 ContentType::Code => chunk_code(content, config),
97 ContentType::Text => chunk_text(content, config),
98 };
99
100 let chunks = enforce_max_size(chunks, config);
102
103 merge_small_chunks(chunks, config.min_chunk_size)
105}
106
/// Splits `text` at sentence terminators (`.`, `?`, `!`) that are followed
/// by whitespace or end-of-input. Trailing text without a terminator is
/// kept as a final pseudo-sentence; the whole input is returned when no
/// terminator is found at all.
fn split_at_sentences(text: &str) -> Vec<&str> {
    let bytes = text.as_bytes();
    let mut out = Vec::new();
    let mut sentence_start = 0;
    let mut cursor = 0;

    while cursor < bytes.len() {
        let is_terminator = matches!(bytes[cursor], b'.' | b'?' | b'!');
        let followed_by_break =
            cursor + 1 >= bytes.len() || matches!(bytes[cursor + 1], b' ' | b'\n' | b'\t');

        if is_terminator && followed_by_break {
            let sentence_end = cursor + 1;
            if sentence_start < sentence_end && sentence_end <= bytes.len() {
                out.push(&text[sentence_start..sentence_end]);
            }
            // Skip the run of whitespace separating sentences.
            cursor += 1;
            while cursor < bytes.len() && matches!(bytes[cursor], b' ' | b'\n' | b'\t') {
                cursor += 1;
            }
            sentence_start = cursor;
        } else {
            cursor += 1;
        }
    }

    // Whatever remains after the last terminator is its own sentence.
    if sentence_start < bytes.len() {
        out.push(&text[sentence_start..]);
    }

    if out.is_empty() && !text.is_empty() {
        out.push(text);
    }

    out
}
157
158fn recursive_split(
160 text: &str,
161 max_size: usize,
162 offset: usize,
163 context: Option<String>,
164 overlap: usize,
165) -> Vec<ChunkResult> {
166 if text.len() <= max_size {
167 return vec![
168 ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
169 ];
170 }
171
172 let mut chunks = Vec::new();
173
174 let paragraphs: Vec<&str> = text.split("\n\n").collect();
176 if paragraphs.len() > 1 {
177 let mut current_chunk = String::new();
178 let mut chunk_start = offset;
179 let mut current_pos = offset;
180
181 for (i, para) in paragraphs.iter().enumerate() {
182 let sep = if i > 0 { "\n\n" } else { "" };
183 let para_with_sep = format!("{}{}", sep, para);
184
185 if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
186 chunks.push(
188 ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
189 .with_context(context.clone()),
190 );
191
192 let overlap_text = get_overlap_text(¤t_chunk, overlap);
194 chunk_start = current_pos - overlap_text.len();
195 current_chunk = overlap_text;
196 }
197
198 current_chunk.push_str(¶_with_sep);
199 current_pos += para_with_sep.len();
200 }
201
202 if !current_chunk.is_empty() {
203 if current_chunk.len() > max_size {
205 chunks.extend(split_by_sentences(
206 ¤t_chunk,
207 max_size,
208 chunk_start,
209 context.clone(),
210 overlap,
211 ));
212 } else {
213 chunks.push(
214 ChunkResult::new(current_chunk, chunk_start, current_pos)
215 .with_context(context.clone()),
216 );
217 }
218 }
219
220 return chunks;
221 }
222
223 split_by_sentences(text, max_size, offset, context, overlap)
225}
226
227fn split_by_sentences(
229 text: &str,
230 max_size: usize,
231 offset: usize,
232 context: Option<String>,
233 overlap: usize,
234) -> Vec<ChunkResult> {
235 let sentences = split_at_sentences(text);
236
237 if sentences.len() <= 1 {
238 return split_by_chars(text, max_size, offset, context, overlap);
240 }
241
242 let mut chunks = Vec::new();
243 let mut current_chunk = String::new();
244 let mut chunk_start = offset;
245 let mut current_pos = offset;
246
247 for sentence in sentences {
248 let sep = if !current_chunk.is_empty() { " " } else { "" };
249 let sentence_with_sep = format!("{}{}", sep, sentence);
250
251 if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
252 chunks.push(
253 ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
254 .with_context(context.clone()),
255 );
256
257 let overlap_text = get_overlap_text(¤t_chunk, overlap);
258 chunk_start = current_pos - overlap_text.len();
259 current_chunk = overlap_text;
260 }
261
262 current_chunk.push_str(&sentence_with_sep);
263 current_pos += sentence_with_sep.len();
264 }
265
266 if !current_chunk.is_empty() {
267 if current_chunk.len() > max_size {
268 chunks.extend(split_by_chars(
269 ¤t_chunk,
270 max_size,
271 chunk_start,
272 context.clone(),
273 overlap,
274 ));
275 } else {
276 chunks.push(
277 ChunkResult::new(current_chunk, chunk_start, current_pos)
278 .with_context(context.clone()),
279 );
280 }
281 }
282
283 chunks
284}
285
286fn split_by_chars(
288 text: &str,
289 max_size: usize,
290 offset: usize,
291 context: Option<String>,
292 overlap: usize,
293) -> Vec<ChunkResult> {
294 let mut chunks = Vec::new();
295 let bytes = text.as_bytes();
296 let mut start = 0;
297
298 let effective_overlap = overlap.min(max_size / 2);
300
301 while start < text.len() {
302 let end = (start + max_size).min(text.len());
303
304 let actual_end = if end < text.len() {
306 find_word_boundary_bytes(bytes, start, end)
307 } else {
308 end
309 };
310
311 let actual_end = if actual_end <= start {
313 (start + max_size).min(text.len())
314 } else {
315 actual_end
316 };
317
318 chunks.push(
319 ChunkResult::new(
320 text[start..actual_end].to_string(),
321 offset + start,
322 offset + actual_end,
323 )
324 .with_context(context.clone()),
325 );
326
327 let next_start = actual_end.saturating_sub(effective_overlap);
330 start = if next_start <= start {
331 actual_end } else {
333 next_start
334 };
335 }
336
337 if chunks.is_empty() && !text.is_empty() {
338 chunks.push(
339 ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
340 );
341 }
342
343 chunks
344}
345
/// Scans backwards from `target` (at most 50 bytes, never before `start`)
/// for a space or newline; returns the index just past it, or `target`
/// when no break is found in the window.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let window_start = target.saturating_sub(50).max(start);
    (window_start..target)
        .rev()
        .find(|&i| matches!(bytes[i], b' ' | b'\n'))
        .map(|i| i + 1)
        .unwrap_or(target)
}
357
358fn get_overlap_text(text: &str, overlap: usize) -> String {
360 if text.len() <= overlap {
361 return text.to_string();
362 }
363
364 let actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
365 text[actual_start..].to_string()
366}
367
/// Picks the byte index where the overlap region should begin: the
/// position just after the last newline/period/space found in a ±50-byte
/// window around `len - target_overlap`, falling back to exactly
/// `target_overlap` bytes from the end.
fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    if bytes.len() <= target_overlap {
        return 0;
    }

    let window_lo = bytes.len().saturating_sub(target_overlap + 50);
    let window_hi = bytes
        .len()
        .saturating_sub(target_overlap.saturating_sub(50));

    (window_lo..window_hi)
        .rev()
        .find(|&i| matches!(bytes[i], b'\n' | b'.' | b' '))
        .map(|i| i + 1)
        .unwrap_or_else(|| bytes.len().saturating_sub(target_overlap))
}
389
390fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
392 let mut result = Vec::new();
393
394 for chunk in chunks {
395 if chunk.content.len() > config.max_chunk_size {
396 result.extend(recursive_split(
397 &chunk.content,
398 config.max_chunk_size,
399 chunk.start_offset,
400 chunk.context,
401 config.chunk_overlap,
402 ));
403 } else {
404 result.push(chunk);
405 }
406 }
407
408 result
409}
410
411fn merge_small_chunks(chunks: Vec<ChunkResult>, min_size: usize) -> Vec<ChunkResult> {
413 if chunks.is_empty() {
414 return chunks;
415 }
416
417 let mut result: Vec<ChunkResult> = Vec::new();
418
419 for chunk in chunks {
420 if chunk.content.len() >= min_size || chunk.is_boundary {
421 result.push(chunk);
422 } else if let Some(last) = result.last_mut() {
423 if !last.is_boundary {
425 last.content.push_str("\n\n");
427 last.content.push_str(&chunk.content);
428 last.end_offset = chunk.end_offset;
429 if chunk.context.is_some() {
431 last.context = chunk.context;
432 }
433 } else {
434 result.push(chunk);
435 }
436 } else {
437 result.push(chunk);
438 }
439 }
440
441 result
443}
444
/// A markdown slice delimited by headers, carrying the header breadcrumb
/// that leads to it.
struct MarkdownSection {
    /// Enclosing header lines, outermost first (e.g. `["# A", "## B"]`).
    header_path: Vec<String>,
    /// Raw section text, including its own header line.
    content: String,
    /// Byte offset of the section start within the original document.
    start_offset: usize,
    /// Byte offset one past the section end.
    end_offset: usize,
}
460
461fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
463 let mut sections = Vec::new();
464 let mut current_section = String::new();
465 let mut section_start = 0;
466 let mut current_pos = 0;
467 let mut header_stack: Vec<(usize, String)> = Vec::new(); let lines: Vec<&str> = content.lines().collect();
470
471 for line in lines.iter() {
472 let line_with_newline = if current_pos > 0 {
473 format!("\n{}", line)
474 } else {
475 line.to_string()
476 };
477
478 if let Some(level) = get_header_level(line) {
480 if !current_section.is_empty() {
482 sections.push(MarkdownSection {
483 header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
484 content: current_section.clone(),
485 start_offset: section_start,
486 end_offset: current_pos,
487 });
488 }
489
490 while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
493 header_stack.pop();
494 }
495 header_stack.push((level, line.to_string()));
496
497 current_section = line_with_newline.trim_start_matches('\n').to_string();
499 section_start = current_pos;
500 } else {
501 current_section.push_str(&line_with_newline);
502 }
503
504 current_pos += line_with_newline.len();
505 }
506
507 if !current_section.is_empty() {
509 sections.push(MarkdownSection {
510 header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
511 content: current_section,
512 start_offset: section_start,
513 end_offset: content.len(),
514 });
515 }
516
517 if sections.is_empty() {
519 sections.push(MarkdownSection {
520 header_path: vec![],
521 content: content.to_string(),
522 start_offset: 0,
523 end_offset: content.len(),
524 });
525 }
526
527 sections
528}
529
/// Returns the ATX header level (1-6) of a markdown header line, i.e. a
/// run of `#` followed by a space (leading whitespace allowed); `None`
/// for anything else, including 7+ hashes or a missing space.
fn get_header_level(line: &str) -> Option<usize> {
    let trimmed = line.trim_start();
    let level = trimmed.chars().take_while(|&c| c == '#').count();

    if (1..=6).contains(&level) && trimmed.chars().nth(level) == Some(' ') {
        Some(level)
    } else {
        None
    }
}
546
/// Joins a header breadcrumb into a single " > "-separated string;
/// `None` for an empty path.
fn format_header_path(path: &[String]) -> Option<String> {
    (!path.is_empty()).then(|| path.join(" > "))
}
554
555fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
557 let sections = parse_markdown_sections(content);
558 let mut chunks = Vec::new();
559
560 for section in sections {
561 let context = format_header_path(§ion.header_path);
562 let is_boundary = !section.header_path.is_empty();
563
564 chunks.push(
565 ChunkResult::new(section.content, section.start_offset, section.end_offset)
566 .with_context(context)
567 .with_boundary(is_boundary),
568 );
569 }
570
571 chunks
573}
574
575fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
581 let mut chunks = Vec::new();
582 let mut current_chunk = String::new();
583 let mut chunk_start = 0;
584 let mut current_pos = 0;
585
586 let paragraphs: Vec<&str> = content.split("\n\n").collect();
588
589 for (i, para) in paragraphs.iter().enumerate() {
590 let sep = if i > 0 { "\n\n" } else { "" };
591 let para_with_sep = format!("{}{}", sep, para);
592
593 if !current_chunk.is_empty()
595 && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
596 {
597 chunks.push(ChunkResult::new(
598 current_chunk.clone(),
599 chunk_start,
600 current_pos,
601 ));
602
603 let overlap_text = get_overlap_text(¤t_chunk, config.chunk_overlap);
605 chunk_start = current_pos - overlap_text.len();
606 current_chunk = overlap_text;
607 }
608
609 current_chunk.push_str(¶_with_sep);
610 current_pos += para_with_sep.len();
611 }
612
613 if !current_chunk.is_empty() {
615 chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
616 }
617
618 if chunks.is_empty() {
620 chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
621 }
622
623 chunks
624}
625
/// Line prefixes (checked after left-trim) that mark the start of a new
/// top-level construct in common languages; used to pick chunk boundaries
/// in source code.
///
/// Fix: removed the duplicated `"const "` entry (it appeared in both the
/// Rust and JavaScript groups); `any(starts_with)` behavior is unchanged.
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "const ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript / TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    "let ",
    "var ",
    "interface ",
    // C / C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];
672
/// Derives a short, human-readable context label from a code boundary
/// line: a truncated function signature (everything before the opening
/// paren, when a space precedes it), a `keyword Name` pair for type-like
/// declarations, or the first 60 characters of the line as a fallback.
///
/// Fix: the original truncated with raw byte slicing
/// (`&trimmed[..paren_pos.min(60)]`, `&rest[..name_end.min(50)]`), which
/// panics when the cut lands inside a multi-byte UTF-8 character; the cut
/// is now snapped to a char boundary. ASCII behavior is unchanged.
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Function-like: "ret name(args..." -> "ret name...". The space check
    // avoids matching bare calls like "foo(".
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        if signature.contains(' ') {
            return format!("{}...", truncate_on_char_boundary(signature, 60));
        }
    }

    // Type-like declarations: keep "keyword Name".
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            let name = truncate_on_char_boundary(&rest[..name_end], 50);
            return format!("{}{}", keyword, name);
        }
    }

    trimmed.chars().take(60).collect()
}

/// Truncates `s` to at most `max` bytes without splitting a UTF-8
/// character.
fn truncate_on_char_boundary(s: &str, max: usize) -> &str {
    if s.len() <= max {
        return s;
    }
    let mut end = max;
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    &s[..end]
}
707
708fn is_code_boundary(line: &str) -> bool {
710 let trimmed = line.trim_start();
711 CODE_BOUNDARY_PATTERNS
712 .iter()
713 .any(|p| trimmed.starts_with(p))
714}
715
716fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
718 let mut chunks = Vec::new();
719 let mut current_chunk = String::new();
720 let mut chunk_start = 0;
721 let mut current_pos = 0;
722 let mut current_context: Option<String> = None;
723 let mut is_at_boundary = false;
724
725 let lines: Vec<&str> = content.lines().collect();
726
727 for line in lines {
728 let line_with_newline = if current_pos > 0 {
729 format!("\n{}", line)
730 } else {
731 line.to_string()
732 };
733
734 let boundary = is_code_boundary(line);
735
736 if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
738 chunks.push(
739 ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
740 .with_context(current_context.clone())
741 .with_boundary(is_at_boundary),
742 );
743
744 current_chunk = String::new();
745 chunk_start = current_pos;
746 is_at_boundary = true;
747 }
748
749 if boundary {
750 current_context = Some(extract_code_context(line));
751 is_at_boundary = true;
752 }
753
754 current_chunk.push_str(&line_with_newline);
756 current_pos += line_with_newline.len();
757 }
758
759 if !current_chunk.is_empty() {
761 chunks.push(
762 ChunkResult::new(current_chunk, chunk_start, content.len())
763 .with_context(current_context)
764 .with_boundary(is_at_boundary),
765 );
766 }
767
768 if chunks.is_empty() {
769 chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
770 }
771
772 chunks
773}
774
775fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
781 if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
783 let chunks = chunk_json_value(&value, config, vec![]);
784 if !chunks.is_empty() {
785 return chunks;
786 }
787 }
788
789 chunk_text(content, config)
791}
792
/// Chunks a parsed JSON value by structure: objects split at entry
/// boundaries, arrays at element boundaries, and an entry too large for
/// any chunk recurses into its own value. The dotted `path`
/// (e.g. "users.profile" or "items.[3]") becomes each chunk's context.
///
/// NOTE(review): every chunk is created with start/end offsets of 0 —
/// offsets are not meaningful here because content is re-printed via
/// `to_string_pretty` rather than sliced from the original text.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            // Accumulate pretty-printed entries into a brace-wrapped pseudo-object.
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                // Trailing comma on all entries except the last.
                let entry = if i < entries.len() - 1 {
                    format!(" \"{}\": {},\n", key, val_str)
                } else {
                    format!(" \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // Entry too large for any single chunk: flush what we have,
                // then recurse into the entry's value instead of inlining it.
                if entry.len() > config.max_chunk_size {
                    // len > 3 means the buffer holds more than the opening "{\n".
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, 0)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                // Flush when the next entry would overflow the chunk.
                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, 0)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            // Flush the final partial object, if it holds any entries.
            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, 0)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            // Same strategy as objects, but bracket-wrapped and with
            // "[index]" path components.
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!(" {},\n", val_str)
                } else {
                    format!(" {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                // Flush when the next element would overflow the chunk.
                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, 0)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, 0)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            // Scalar leaf: emit as a single non-boundary chunk.
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, 0)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}
933
934fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
940 let mut chunks = Vec::new();
941 let mut current_chunk = String::new();
942 let mut chunk_start = 0;
943 let mut current_pos = 0;
944 let mut key_stack: Vec<(usize, String)> = Vec::new(); let lines: Vec<&str> = content.lines().collect();
947
948 for line in lines {
949 let line_with_newline = if current_pos > 0 {
950 format!("\n{}", line)
951 } else {
952 line.to_string()
953 };
954
955 let indent = line.len() - line.trim_start().len();
957 let trimmed = line.trim();
958
959 let is_key_line = !trimmed.starts_with('-')
961 && !trimmed.starts_with('#')
962 && trimmed.contains(':')
963 && !trimmed.starts_with('"')
964 && !trimmed.starts_with('\'');
965
966 if is_key_line {
967 if let Some(key) = trimmed.split(':').next() {
969 let key = key.trim().to_string();
970
971 while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
973 key_stack.pop();
974 }
975 key_stack.push((indent, key));
976 }
977 }
978
979 let is_top_level_key = indent == 0 && is_key_line;
981
982 if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
984 let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
985 chunks.push(
986 ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
987 .with_context(context)
988 .with_boundary(true),
989 );
990
991 current_chunk = String::new();
992 chunk_start = current_pos;
993 }
994
995 current_chunk.push_str(&line_with_newline);
996 current_pos += line_with_newline.len();
997 }
998
999 if !current_chunk.is_empty() {
1001 let context = format_yaml_path(&key_stack);
1002 chunks.push(
1003 ChunkResult::new(current_chunk, chunk_start, content.len())
1004 .with_context(context)
1005 .with_boundary(!key_stack.is_empty()),
1006 );
1007 }
1008
1009 if chunks.is_empty() {
1010 chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
1011 }
1012
1013 chunks
1014}
1015
/// Renders the YAML key stack as a dotted path ("a.b.c"); `None` when the
/// stack is empty.
fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    if stack.is_empty() {
        return None;
    }
    let keys: Vec<&str> = stack.iter().map(|(_, key)| key.as_str()).collect();
    Some(keys.join("."))
}
1029
// Unit tests covering each chunking strategy plus the shared helpers
// (sentence splitting, size enforcement, merging, context propagation).
#[cfg(test)]
mod tests {
    use super::*;

    // Content below `min_chunk_threshold` must pass through untouched.
    #[test]
    fn test_small_content_no_chunking() {
        let content = "Small content";
        let config = ChunkingConfig::default();
        let chunks = chunk_content(content, ContentType::Text, &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, content);
    }

    // Text over the threshold splits; chunks may exceed max_chunk_size by
    // the overlap plus a small slack, hence the "+ 100" allowance.
    #[test]
    fn test_text_chunking() {
        let content = "a".repeat(2000);
        let config = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let chunks = chunk_content(&content, ContentType::Text, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    // Markdown is sectioned at headers even when both sections are small.
    #[test]
    fn test_markdown_splits_by_headers_first() {
        let content = "# H1\nShort content here.\n\n# H2\nAlso short content.";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].content.contains("# H1"));
        assert!(chunks[1].content.contains("# H2"));
    }

    // A single markdown section bigger than max_chunk_size is re-split.
    #[test]
    fn test_large_section_gets_subsplit() {
        let long_paragraph = "This is a long sentence. ".repeat(100);
        let content = format!("# Header\n\n{}", long_paragraph);
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    // Tiny sections should be merged, so we never end up with more chunks
    // than headers.
    #[test]
    fn test_small_chunks_merged() {
        let content = "# A\nx\n\n# B\ny\n\n# C\nz";
        let config = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        assert!(chunks.len() <= 3);
    }

    // Nested headers accumulate into a full breadcrumb in `context`.
    #[test]
    fn test_header_path_context() {
        let content = "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        let detail_chunk = chunks.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail_chunk.is_some());
        let ctx = detail_chunk.unwrap().context.as_ref().unwrap();
        assert!(ctx.contains("# Main"));
        assert!(ctx.contains("## Sub"));
        assert!(ctx.contains("### Detail"));
    }

    // Sections forced through the size-enforcement re-split keep their
    // header context.
    #[test]
    fn test_markdown_chunking_preserves_context() {
        let content = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.context.is_some()));
    }

    // Two sizable functions should produce at least two code chunks.
    #[test]
    fn test_code_chunking() {
        let content = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            "    // code\n".repeat(50),
            "    // more code\n".repeat(50)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Code, &config);

        assert!(chunks.len() >= 2);
    }

    // Spot-check boundary detection across several languages' prefixes.
    #[test]
    fn test_code_boundary_patterns() {
        let patterns = [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ];

        for pattern in patterns {
            assert!(
                is_code_boundary(pattern),
                "Pattern '{}' should be recognized as boundary",
                pattern
            );
        }
    }

    // Valid JSON splits structurally without panicking.
    #[test]
    fn test_json_chunking() {
        let content = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    // Nested JSON objects produce chunks (path context is best-effort).
    #[test]
    fn test_json_nested_path_context() {
        let content = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    // Sentence terminators followed by whitespace split the text.
    #[test]
    fn test_sentence_splitting() {
        let text = "First sentence. Second sentence? Third sentence! Fourth.";
        let sentences = split_at_sentences(text);

        assert!(sentences.len() >= 3);
        assert!(sentences[0].contains("First"));
    }

    // YAML with two top-level keys chunks without panicking.
    #[test]
    fn test_yaml_chunking_with_path() {
        let content = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Yaml, &config);

        assert!(!chunks.is_empty());
    }

    // Context passed into recursive_split must appear on every sub-chunk.
    #[test]
    fn test_recursive_split_preserves_context() {
        let long_text = "This is a sentence. ".repeat(100);
        let chunks = recursive_split(&long_text, 200, 0, Some("test context".to_string()), 20);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(
                chunk
                    .context
                    .as_ref()
                    .map(|c| c == "test context")
                    .unwrap_or(false)
            );
        }
    }
}