agentroot_core/index/
// chunker.rs

/// Chunk size limit expressed in tokens.
pub const CHUNK_SIZE_TOKENS: usize = 800;
/// Overlap between neighbouring chunks, expressed in tokens.
pub const CHUNK_OVERLAP_TOKENS: usize = 120;
/// Chunk size limit expressed in characters: four characters per token
/// (800 * 4 = 3200).
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * 4;
/// Overlap between neighbouring chunks expressed in characters: four
/// characters per token (120 * 4 = 480).
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * 4;
8
/// One piece of a larger text, as produced by `chunk_by_chars`.
///
/// Derives `PartialEq`/`Eq` so chunks can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// The chunk's text, copied out of the source content.
    pub text: String,
    /// Byte offset in the source content at which this chunk starts.
    pub position: usize,
    /// Token count for `text` when one has been computed; `chunk_by_chars`
    /// always leaves this as `None`.
    pub token_count: Option<usize>,
}
16
/// Rounds `index` down to the closest UTF-8 character boundary of `s`.
///
/// An `index` at or past the end of `s` is clamped to `s.len()`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        len
    } else {
        // Offset 0 is always a boundary, so the search cannot come up empty.
        (0..=index)
            .rev()
            .find(|&pos| s.is_char_boundary(pos))
            .unwrap_or(0)
    }
}
28
/// Rounds `index` up to the closest UTF-8 character boundary of `s`.
///
/// An `index` at or past the end of `s` is clamped to `s.len()`.
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    let len = s.len();
    if index >= len {
        return len;
    }
    // If no boundary is found before the end, the end itself is the answer.
    (index..len)
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(len)
}
40
41pub fn chunk_by_chars(content: &str, chunk_size: usize, overlap: usize) -> Vec<Chunk> {
43 if content.len() <= chunk_size {
44 return vec![Chunk {
45 text: content.to_string(),
46 position: 0,
47 token_count: None,
48 }];
49 }
50
51 let mut chunks = Vec::new();
52 let mut start = 0;
53
54 while start < content.len() {
55 let raw_end = (start + chunk_size).min(content.len());
56 let end = floor_char_boundary(content, raw_end);
57 let mut chunk_end = end;
58
59 if end < content.len() {
61 let search_start_raw = start + (chunk_size * 70 / 100);
62 let search_start = ceil_char_boundary(content, search_start_raw);
63
64 if search_start < end {
65 let search_region = &content[search_start..end];
66
67 if let Some(pos) = search_region.rfind("\n\n") {
68 chunk_end = search_start + pos + 2;
69 } else if let Some(pos) = search_region.rfind(". ") {
70 chunk_end = search_start + pos + 2;
71 } else if let Some(pos) = search_region.rfind('\n') {
72 chunk_end = search_start + pos + 1;
73 } else if let Some(pos) = search_region.rfind(' ') {
74 chunk_end = search_start + pos + 1;
75 }
76 }
77 }
78
79 chunk_end = floor_char_boundary(content, chunk_end);
81
82 chunks.push(Chunk {
83 text: content[start..chunk_end].to_string(),
84 position: start,
85 token_count: None,
86 });
87
88 if chunk_end >= content.len() {
89 break;
90 }
91
92 let new_start_raw = chunk_end.saturating_sub(overlap);
93 start = ceil_char_boundary(content, new_start_raw);
94 }
95
96 chunks
97}
98
#[cfg(test)]
mod tests {
    use super::*;

    /// Content shorter than the chunk size comes back as one unmodified chunk.
    #[test]
    fn test_chunk_small_content() {
        let content = "Small content.";
        let chunks = chunk_by_chars(content, 100, 20);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, content);
    }

    /// Content longer than the chunk size is split into several chunks.
    #[test]
    fn test_chunk_preserves_paragraphs() {
        let content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk_by_chars(content, 30, 5);
        assert!(chunks.len() >= 2);
    }

    /// Multi-byte characters must never be split and must not produce empty
    /// chunks.
    #[test]
    fn test_chunk_handles_unicode() {
        // Non-ASCII characters are written as escapes so the literal cannot be
        // damaged by a re-encoding of this file. U+4E16 U+754C are two 3-byte
        // CJK characters; U+1F389 (4 bytes) and U+2713 (3 bytes) stand in for
        // the emoji and the special character.
        let content = "Hello \u{4e16}\u{754c}! This is a test with emoji \u{1f389} \
                       and special chars \u{2713} here.";
        let chunks = chunk_by_chars(content, 20, 5);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
        }
    }

    /// Indices inside a multi-byte character round down to its first byte.
    #[test]
    fn test_floor_char_boundary() {
        // "Hello " is 6 bytes; U+4E16 occupies bytes 6..9 and U+754C bytes 9..12.
        let s = "Hello \u{4e16}\u{754c}";
        assert_eq!(floor_char_boundary(s, 6), 6);
        assert_eq!(floor_char_boundary(s, 7), 6);
        assert_eq!(floor_char_boundary(s, 8), 6);
        assert_eq!(floor_char_boundary(s, 9), 9);
    }
}