semantic_memory/
chunker.rs1use crate::config::ChunkingConfig;
4use crate::tokenizer::TokenCounter;
5use crate::types::TextChunk;
6
7const MAX_RECURSION_DEPTH: usize = 10;
9
10pub fn chunk_text(
15 text: &str,
16 config: &ChunkingConfig,
17 token_counter: &dyn TokenCounter,
18) -> Vec<TextChunk> {
19 if text.trim().is_empty() {
20 return Vec::new();
21 }
22
23 if text.len() <= config.max_size && text.len() >= config.min_size {
24 return vec![TextChunk {
25 index: 0,
26 content: text.to_string(),
27 token_count_estimate: token_counter.count_tokens(text),
28 }];
29 }
30
31 let raw_chunks = recursive_split(text, config, 0);
33
34 let merged = merge_small_chunks(raw_chunks, config.target_size, config.min_size);
36
37 let overlapped = apply_overlap(merged, config.overlap);
39
40 overlapped
42 .into_iter()
43 .enumerate()
44 .map(|(i, content)| {
45 let token_count_estimate = token_counter.count_tokens(&content);
46 TextChunk {
47 index: i,
48 content,
49 token_count_estimate,
50 }
51 })
52 .collect()
53}
54
55fn safe_split_at(text: &str, pos: usize) -> usize {
57 let mut split = pos.min(text.len());
58 while split > 0 && !text.is_char_boundary(split) {
59 split -= 1;
60 }
61 split
62}
63
64fn force_split(text: &str, max_size: usize) -> Vec<String> {
66 let mut chunks = Vec::new();
67 let mut start = 0;
68 while start < text.len() {
69 let end = safe_split_at(text, start + max_size);
70 if end <= start {
71 let mut next = start + 1;
73 while next < text.len() && !text.is_char_boundary(next) {
74 next += 1;
75 }
76 if next <= text.len() {
77 chunks.push(text[start..next].to_string());
78 }
79 start = next;
80 continue;
81 }
82 chunks.push(text[start..end].to_string());
83 start = end;
84 }
85 chunks
86}
87
88fn recursive_split(text: &str, config: &ChunkingConfig, depth: usize) -> Vec<String> {
90 if text.len() <= config.max_size {
91 return vec![text.to_string()];
92 }
93
94 if depth >= MAX_RECURSION_DEPTH {
95 tracing::warn!("Chunker hit max recursion depth, force-splitting at max_size");
96 return force_split(text, config.max_size);
97 }
98
99 let separators: &[&str] = &["\n\n", ". ", "? ", "! ", " "];
101
102 for sep in separators {
103 let parts: Vec<&str> = text.split(sep).collect();
104 if parts.len() <= 1 {
105 continue;
106 }
107
108 let mut chunks = Vec::new();
109 let mut current = String::new();
110
111 for (i, part) in parts.iter().enumerate() {
112 let piece = if i + 1 < parts.len() {
113 format!("{}{}", part, sep)
114 } else {
115 part.to_string()
116 };
117
118 if current.len() + piece.len() > config.target_size && !current.is_empty() {
119 chunks.push(current);
120 current = piece;
121 } else {
122 current.push_str(&piece);
123 }
124 }
125
126 if !current.is_empty() {
127 chunks.push(current);
128 }
129
130 let mut result = Vec::new();
132 for chunk in chunks {
133 if chunk.len() > config.max_size {
134 result.extend(recursive_split(&chunk, config, depth + 1));
135 } else {
136 result.push(chunk);
137 }
138 }
139
140 if result.len() > 1 {
141 return result;
142 }
143 }
144
145 force_split(text, config.max_size)
147}
148
149fn merge_small_chunks(chunks: Vec<String>, target_size: usize, min_size: usize) -> Vec<String> {
151 if chunks.is_empty() {
152 return chunks;
153 }
154
155 let mut merged = Vec::new();
156 let mut current = chunks[0].clone();
157
158 for chunk in chunks.iter().skip(1) {
159 if (current.len() < min_size || chunk.len() < min_size)
160 && current.len() + chunk.len() <= target_size
161 {
162 current.push_str(chunk);
163 } else {
164 merged.push(current);
165 current = chunk.clone();
166 }
167 }
168
169 merged.push(current);
170 if merged.len() > 1 {
171 if let Some(last) = merged.last() {
172 if last.len() < min_size {
173 let last = merged.pop().unwrap_or_default();
174 if let Some(previous) = merged.last_mut() {
175 previous.push_str(&last);
176 } else {
177 merged.push(last);
178 }
179 }
180 }
181 }
182 merged
183}
184
185fn apply_overlap(chunks: Vec<String>, overlap: usize) -> Vec<String> {
187 if chunks.len() <= 1 || overlap == 0 {
188 return chunks;
189 }
190
191 let mut result = Vec::with_capacity(chunks.len());
192 result.push(chunks[0].clone());
193
194 for i in 1..chunks.len() {
195 let prev = result.last().map(String::as_str).unwrap_or(&chunks[i - 1]);
196 let overlap_start = if prev.len() > overlap {
197 let raw_start = prev.len() - overlap;
199 let safe_start = safe_split_at(prev, raw_start);
200 prev[safe_start..]
202 .find(' ')
203 .map(|pos| safe_start + pos + 1)
204 .unwrap_or(safe_start)
205 } else {
206 0
207 };
208
209 let overlap_text = &prev[overlap_start..];
210 let mut chunk_with_overlap = overlap_text.to_string();
211 chunk_with_overlap.push_str(&chunks[i]);
212 result.push(chunk_with_overlap);
213 }
214
215 result
216}