1use super::{Chunk, DocumentSource};
4
/// Tuning parameters for [`split_into_chunks`].
///
/// Both values are byte counts of UTF-8 text (they are compared against
/// `str::len`), not character counts.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Target upper bound, in bytes, for one chunk. Paragraphs are packed
    /// together until adding the next one would exceed this value. It is a
    /// soft limit: overlap carried over from the previous chunk is added on
    /// top of the paragraph that follows it.
    pub chunk_size: usize,
    /// Number of trailing bytes of an emitted chunk that are repeated at the
    /// start of the next one. `0` disables overlap.
    pub chunk_overlap: usize,
}
13
14impl Default for ChunkConfig {
15 fn default() -> Self {
16 Self {
17 chunk_size: 1000,
18 chunk_overlap: 200,
19 }
20 }
21}
22
23fn chunk_id(uri: &str, index: usize) -> String {
28 let hash = crate::util::fnv1a_hash(uri.as_bytes());
29 format!("{hash:016x}-{index}")
30}
31
32pub fn split_into_chunks(text: &str, source: &DocumentSource, config: &ChunkConfig) -> Vec<Chunk> {
38 let text = text.trim();
39 if text.is_empty() {
40 return vec![];
41 }
42
43 let mut chunks = Vec::new();
44 let mut current = String::new();
45 let mut chunk_index = 0;
46
47 let paragraphs: Vec<&str> = text.split("\n\n").collect();
48
49 for para in ¶graphs {
50 let para = para.trim();
51 if para.is_empty() {
52 continue;
53 }
54
55 if !current.is_empty() && current.len() + para.len() + 2 > config.chunk_size {
57 let id = chunk_id(&source.uri, chunk_index);
58 chunks.push(Chunk {
59 id,
60 content: current.clone(),
61 source: source.clone(),
62 chunk_index,
63 tenant_id: None,
64 });
65 chunk_index += 1;
66
67 if config.chunk_overlap > 0 && current.len() > config.chunk_overlap {
69 let start = current.len() - config.chunk_overlap;
70 let start = current.ceil_char_boundary(start);
72 current = current[start..].to_string();
73 } else if config.chunk_overlap == 0 {
74 current.clear();
75 }
76 }
78
79 if para.len() > config.chunk_size {
81 if !current.is_empty() {
83 let id = chunk_id(&source.uri, chunk_index);
84 chunks.push(Chunk {
85 id,
86 content: current.clone(),
87 source: source.clone(),
88 chunk_index,
89 tenant_id: None,
90 });
91 chunk_index += 1;
92 current.clear();
93 }
94
95 let mut pos = 0;
97 while pos < para.len() {
98 let end = (pos + config.chunk_size).min(para.len());
99 let end = para.ceil_char_boundary(end);
100 let end = end.min(para.len());
101
102 let id = chunk_id(&source.uri, chunk_index);
103 chunks.push(Chunk {
104 id,
105 content: para[pos..end].to_string(),
106 source: source.clone(),
107 chunk_index,
108 tenant_id: None,
109 });
110 chunk_index += 1;
111
112 if end >= para.len() {
113 break;
114 }
115
116 let advance = if config.chunk_overlap < config.chunk_size {
118 config.chunk_size - config.chunk_overlap
119 } else {
120 1 };
122 pos += advance;
123 pos = para.ceil_char_boundary(pos);
124 }
125 } else {
126 if !current.is_empty() {
128 current.push_str("\n\n");
129 }
130 current.push_str(para);
131 }
132 }
133
134 if !current.is_empty() {
136 let id = chunk_id(&source.uri, chunk_index);
137 chunks.push(Chunk {
138 id,
139 content: current,
140 source: source.clone(),
141 chunk_index,
142 tenant_id: None,
143 });
144 }
145
146 chunks
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed source shared by tests that do not care about provenance.
    fn test_source() -> DocumentSource {
        DocumentSource {
            uri: "test.md".into(),
            title: "Test".into(),
        }
    }

    #[test]
    fn empty_text_produces_no_chunks() {
        let chunks = split_into_chunks("", &test_source(), &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    #[test]
    fn whitespace_only_produces_no_chunks() {
        let chunks = split_into_chunks("  \n\n  ", &test_source(), &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    #[test]
    fn single_small_paragraph_is_one_chunk() {
        let text = "Hello, world!";
        let chunks = split_into_chunks(text, &test_source(), &ChunkConfig::default());
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Hello, world!");
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[0].source.uri, "test.md");
    }

    #[test]
    fn multiple_paragraphs_within_limit_are_single_chunk() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let config = ChunkConfig {
            chunk_size: 1000,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(text, &test_source(), &config);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("First paragraph."));
        assert!(chunks[0].content.contains("Second paragraph."));
    }

    #[test]
    fn paragraphs_exceeding_limit_split_into_multiple_chunks() {
        let para1 = "a".repeat(60);
        let para2 = "b".repeat(60);
        let text = format!("{para1}\n\n{para2}");
        let config = ChunkConfig {
            chunk_size: 80,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 2,
            "expected >= 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].content.contains('a'));
        assert!(chunks.last().unwrap().content.contains('b'));
    }

    #[test]
    fn overlap_preserves_context() {
        let para1 = "a".repeat(60);
        let para2 = "b".repeat(60);
        let text = format!("{para1}\n\n{para2}");
        let config = ChunkConfig {
            chunk_size: 80,
            chunk_overlap: 20,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 2,
            "expected >= 2 chunks, got {}",
            chunks.len()
        );

        // The last 20 bytes of the first chunk must reappear at the very
        // start of the second one. The input is ASCII, so byte slicing is safe.
        let first = &chunks[0].content;
        let second = &chunks[1].content;
        let c1_tail = &first[first.len().saturating_sub(20)..];
        let c2_head = &second[..c1_tail.len().min(second.len())];
        assert_eq!(c1_tail, c2_head, "overlap should match");
    }

    #[test]
    fn chunk_indices_are_sequential() {
        let text = (0..10)
            .map(|i| format!("Paragraph {i}"))
            .collect::<Vec<_>>()
            .join("\n\n");
        let config = ChunkConfig {
            chunk_size: 30,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i, "chunk {i} has wrong index");
        }
    }

    #[test]
    fn deterministic_ids() {
        let text = "Hello world.\n\nSecond paragraph.";
        let config = ChunkConfig {
            chunk_size: 20,
            chunk_overlap: 0,
        };
        let chunks1 = split_into_chunks(text, &test_source(), &config);
        let chunks2 = split_into_chunks(text, &test_source(), &config);
        assert_eq!(chunks1.len(), chunks2.len());
        for (a, b) in chunks1.iter().zip(chunks2.iter()) {
            assert_eq!(a.id, b.id, "chunk IDs should be deterministic");
        }
    }

    #[test]
    fn different_sources_produce_different_ids() {
        let text = "Hello world.";
        let config = ChunkConfig::default();
        let src1 = DocumentSource {
            uri: "file1.md".into(),
            title: "F1".into(),
        };
        let src2 = DocumentSource {
            uri: "file2.md".into(),
            title: "F2".into(),
        };
        let c1 = split_into_chunks(text, &src1, &config);
        let c2 = split_into_chunks(text, &src2, &config);
        assert_ne!(c1[0].id, c2[0].id);
    }

    #[test]
    fn utf8_safe_chunking() {
        // 'é' is two bytes long. With an odd chunk size the raw split offsets
        // fall in the middle of a character, so this only passes if the
        // splitter rounds them to character boundaries; slicing a `str` at a
        // non-boundary offset panics.
        let text = "é".repeat(600);
        let config = ChunkConfig {
            chunk_size: 101,
            chunk_overlap: 20,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() > 1,
            "expected the paragraph to be split, got {} chunk(s)",
            chunks.len()
        );
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
            assert!(chunk.content.chars().all(|c| c == 'é'));
        }
    }

    #[test]
    fn chunk_config_defaults() {
        let config = ChunkConfig::default();
        assert_eq!(config.chunk_size, 1000);
        assert_eq!(config.chunk_overlap, 200);
    }

    #[test]
    fn large_single_paragraph_split() {
        let text = "x".repeat(3000);
        let config = ChunkConfig {
            chunk_size: 1000,
            chunk_overlap: 200,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 3,
            "expected >= 3 chunks, got {}",
            chunks.len()
        );
        // Overlapping regions are counted more than once, so the sum can only
        // be at least the input length if no part of the input was dropped.
        let total_bytes: usize = chunks.iter().map(|c| c.content.len()).sum();
        assert!(total_bytes >= 3000);
    }
}