1use crate::document::{Chunk, Document};
10
/// Returns the largest UTF-8 char boundary in `s` that is less than or equal to `index`.
///
/// An `index` at or beyond the end of `s` yields `s.len()`, so the result can always be used
/// as a slice bound on `s` without panicking.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    if index >= s.len() {
        return s.len();
    }
    // Walk backwards from `index`; offset 0 is always a boundary, so the search always
    // finds something and the fallback is never taken.
    (0..=index)
        .rev()
        .find(|&offset| s.is_char_boundary(offset))
        .unwrap_or(0)
}
23
24pub trait Chunker: Send + Sync {
29 fn chunk(&self, document: &Document) -> Vec<Chunk>;
34}
35
/// Chunker that cuts a document into fixed-size, optionally overlapping byte windows.
///
/// Sizes are measured in bytes of the UTF-8 text, not in characters.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Target maximum length of a chunk, in bytes.
    chunk_size: usize,
    /// Number of bytes by which consecutive chunks overlap.
    chunk_overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a chunker with the given window size and overlap (both in bytes).
    ///
    /// The values are stored as given; no validation is performed here.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        FixedSizeChunker {
            chunk_size,
            chunk_overlap,
        }
    }
}
66
67impl Chunker for FixedSizeChunker {
68 fn chunk(&self, document: &Document) -> Vec<Chunk> {
69 if document.text.is_empty() {
70 return Vec::new();
71 }
72
73 let text = &document.text;
74 let mut chunks = Vec::new();
75 let mut start = 0;
76 let mut chunk_index = 0;
77
78 while start < text.len() {
79 let end = floor_char_boundary(text, (start + self.chunk_size).min(text.len()));
80 let chunk_text = &text[start..end];
81
82 let mut metadata = document.metadata.clone();
83 metadata.insert("chunk_index".to_string(), chunk_index.to_string());
84
85 chunks.push(Chunk {
86 id: format!("{}_{chunk_index}", document.id),
87 text: chunk_text.to_string(),
88 embedding: Vec::new(),
89 metadata,
90 document_id: document.id.clone(),
91 });
92
93 chunk_index += 1;
94 let step = self.chunk_size.saturating_sub(self.chunk_overlap);
95 if step == 0 {
96 break;
97 }
98 start = floor_char_boundary(text, start + step);
99 }
100
101 chunks
102 }
103}
104
/// Chunker that prefers to cut at natural separators (paragraphs, sentences, words) and only
/// falls back to fixed-size byte windows for text it cannot split any other way.
///
/// Sizes are measured in bytes of the UTF-8 text, not in characters.
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
    /// Target maximum length of a chunk, in bytes.
    chunk_size: usize,
    /// Overlap, in bytes, handed to the fixed-size fallback splitter.
    chunk_overlap: usize,
}

impl RecursiveChunker {
    /// Creates a chunker with the given size limit and fallback overlap (both in bytes).
    ///
    /// The values are stored as given; no validation is performed here.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        RecursiveChunker {
            chunk_size,
            chunk_overlap,
        }
    }
}
137
138fn split_and_merge(
142 text: &str,
143 chunk_size: usize,
144 chunk_overlap: usize,
145 separators: &[&str],
146) -> Vec<String> {
147 if text.len() <= chunk_size || separators.is_empty() {
148 return split_by_size(text, chunk_size, chunk_overlap);
149 }
150
151 let separator = separators[0];
152 let remaining_separators = &separators[1..];
153
154 let segments: Vec<&str> = if separator == " " {
155 text.split(' ').collect()
156 } else {
157 split_keeping_separator(text, separator)
158 };
159
160 let mut chunks = Vec::new();
161 let mut current = String::new();
162
163 for segment in segments {
164 if current.is_empty() {
165 current = segment.to_string();
166 } else if current.len() + segment.len() <= chunk_size {
167 current.push_str(segment);
168 } else {
169 if current.len() > chunk_size {
171 chunks.extend(split_and_merge(
172 ¤t,
173 chunk_size,
174 chunk_overlap,
175 remaining_separators,
176 ));
177 } else {
178 chunks.push(current);
179 }
180 current = segment.to_string();
182 }
183 }
184
185 if !current.is_empty() {
186 if current.len() > chunk_size {
187 chunks.extend(split_and_merge(
188 ¤t,
189 chunk_size,
190 chunk_overlap,
191 remaining_separators,
192 ));
193 } else {
194 chunks.push(current);
195 }
196 }
197
198 chunks
199}
200
/// Splits `text` after every occurrence of `separator`, keeping the separator attached to
/// the end of the segment it terminates.
///
/// Concatenating the returned segments reproduces `text` exactly. A trailing remainder
/// without a separator is returned as the last segment; empty `text` yields no segments.
/// An empty `separator` cannot split anything, so the whole text is returned as one segment.
fn split_keeping_separator<'a>(text: &'a str, separator: &str) -> Vec<&'a str> {
    if separator.is_empty() {
        // An empty pattern matches everywhere without consuming input; searching for it
        // repeatedly would never advance.
        return if text.is_empty() {
            Vec::new()
        } else {
            vec![text]
        };
    }
    text.split_inclusive(separator).collect()
}
218
219fn split_by_size(text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
221 if text.is_empty() {
222 return Vec::new();
223 }
224
225 let mut chunks = Vec::new();
226 let mut start = 0;
227
228 while start < text.len() {
229 let end = floor_char_boundary(text, (start + chunk_size).min(text.len()));
230 chunks.push(text[start..end].to_string());
231 let step = chunk_size.saturating_sub(chunk_overlap);
232 if step == 0 {
233 break;
234 }
235 start = floor_char_boundary(text, start + step);
236 }
237
238 chunks
239}
240
241impl Chunker for RecursiveChunker {
242 fn chunk(&self, document: &Document) -> Vec<Chunk> {
243 if document.text.is_empty() {
244 return Vec::new();
245 }
246
247 let separators = ["\n\n", ". ", "! ", "? ", " "];
248 let raw_chunks =
249 split_and_merge(&document.text, self.chunk_size, self.chunk_overlap, &separators);
250
251 raw_chunks
252 .into_iter()
253 .enumerate()
254 .map(|(i, text)| {
255 let mut metadata = document.metadata.clone();
256 metadata.insert("chunk_index".to_string(), i.to_string());
257 Chunk {
258 id: format!("{}_{i}", document.id),
259 text,
260 embedding: Vec::new(),
261 metadata,
262 document_id: document.id.clone(),
263 }
264 })
265 .collect()
266 }
267}
268
/// Chunker that splits Markdown at its headings and further splits any section that is
/// larger than the size limit.
///
/// Sizes are measured in bytes of the UTF-8 text, not in characters.
#[derive(Debug, Clone)]
pub struct MarkdownChunker {
    /// Largest section, in bytes, that is kept as a single chunk.
    chunk_size: usize,
    /// Overlap, in bytes, passed along when an oversized section is split further.
    chunk_overlap: usize,
}

impl MarkdownChunker {
    /// Creates a chunker with the given section size limit and overlap (both in bytes).
    ///
    /// The values are stored as given; no validation is performed here.
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        MarkdownChunker {
            chunk_size,
            chunk_overlap,
        }
    }
}
300
/// One heading-delimited section of a Markdown document.
struct MarkdownSection {
    /// Titles of the enclosing headings joined with `" > "`; empty for text that precedes
    /// the first heading.
    header_path: String,
    /// Body of the section with surrounding whitespace trimmed; heading lines are excluded.
    text: String,
}

/// Recognizes a fenced-code-block delimiter line (three or more backticks or tildes).
///
/// Returns the fence character, the length of the run, and whether nothing but whitespace
/// follows the run (only such a "bare" fence can close a block).
fn code_fence(line: &str) -> Option<(char, usize, bool)> {
    let trimmed = line.trim_start();
    let marker = trimmed.chars().next().filter(|c| matches!(c, '`' | '~'))?;
    let run = trimmed.chars().take_while(|c| *c == marker).count();
    if run < 3 {
        return None;
    }
    // Both fence characters are ASCII, so the run length is also its byte length.
    let rest = &trimmed[run..];
    // A backtick run followed by more backticks on the same line is inline code, not a fence.
    if marker == '`' && rest.contains('`') {
        return None;
    }
    Some((marker, run, rest.trim().is_empty()))
}

/// Parses an ATX heading (`# Title` .. `###### Title`) into its level and trimmed title.
///
/// The `#` run must be one to six characters long and be followed by whitespace or the end
/// of the line, so `#hashtag` and `#!/bin/sh` are not headings. Leading whitespace is
/// tolerated.
fn atx_heading(line: &str) -> Option<(usize, &str)> {
    let trimmed = line.trim_start();
    let level = trimmed.chars().take_while(|c| *c == '#').count();
    if level == 0 || level > 6 {
        return None;
    }
    // '#' is ASCII, so `level` is also the byte length of the run.
    let rest = &trimmed[level..];
    if !rest.is_empty() && !rest.starts_with(char::is_whitespace) {
        return None;
    }
    Some((level, rest.trim()))
}

/// Splits Markdown `text` into sections, one per heading, in document order.
///
/// Text before the first heading becomes a section with an empty `header_path`. A heading
/// closes every open heading of the same or a deeper level before it is added to the path,
/// so siblings never nest under each other even when the document skips levels (for example
/// when it uses only `##` headings). Lines inside fenced code blocks are always body text,
/// so a `# comment` in a code sample does not start a new section.
///
/// A heading with no body still produces a section (with empty `text`).
fn parse_markdown_sections(text: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    // Currently open headings as `(level, title)`, outermost first.
    let mut heading_stack: Vec<(usize, &str)> = Vec::new();
    let mut header_path = String::new();
    let mut body = String::new();
    // Fence character and run length of the code block we are inside, if any.
    let mut open_fence: Option<(char, usize)> = None;

    for line in text.lines() {
        let heading = if let Some((marker, run, bare)) = code_fence(line) {
            open_fence = match open_fence {
                None => Some((marker, run)),
                // Only a bare fence of the same kind, at least as long, closes the block.
                Some((open_marker, open_run))
                    if bare && marker == open_marker && run >= open_run =>
                {
                    None
                }
                still_open => still_open,
            };
            None
        } else if open_fence.is_some() {
            None
        } else {
            atx_heading(line)
        };

        match heading {
            Some((level, title)) => {
                // Close the section that was being collected.
                if !body.is_empty() || !header_path.is_empty() {
                    sections.push(MarkdownSection {
                        header_path: header_path.clone(),
                        text: body.trim().to_string(),
                    });
                    body.clear();
                }

                // Drop siblings and deeper headings; what remains are true ancestors.
                while let Some(&(open_level, _)) = heading_stack.last() {
                    if open_level < level {
                        break;
                    }
                    heading_stack.pop();
                }
                heading_stack.push((level, title));
                header_path = heading_stack
                    .iter()
                    .map(|&(_, open_title)| open_title)
                    .collect::<Vec<_>>()
                    .join(" > ");
            }
            None => {
                if !body.is_empty() {
                    body.push('\n');
                }
                body.push_str(line);
            }
        }
    }

    if !body.is_empty() || !header_path.is_empty() {
        sections.push(MarkdownSection {
            header_path,
            text: body.trim().to_string(),
        });
    }

    sections
}
352
353impl Chunker for MarkdownChunker {
354 fn chunk(&self, document: &Document) -> Vec<Chunk> {
355 if document.text.is_empty() {
356 return Vec::new();
357 }
358
359 let sections = parse_markdown_sections(&document.text);
360 let mut chunks = Vec::new();
361 let mut chunk_index = 0;
362
363 for section in sections {
364 let section_text = if section.header_path.is_empty() {
366 section.text.clone()
367 } else if section.text.is_empty() {
368 section.header_path.clone()
369 } else {
370 format!("{}\n{}", section.header_path, section.text)
371 };
372
373 if section_text.is_empty() {
374 continue;
375 }
376
377 let sub_chunks = if section_text.len() > self.chunk_size {
378 let separators = ["\n\n", ". ", "! ", "? ", " "];
380 split_and_merge(§ion_text, self.chunk_size, self.chunk_overlap, &separators)
381 } else {
382 vec![section_text]
383 };
384
385 for text in sub_chunks {
386 let mut metadata = document.metadata.clone();
387 metadata.insert("chunk_index".to_string(), chunk_index.to_string());
388 metadata.insert("header_path".to_string(), section.header_path.clone());
389
390 chunks.push(Chunk {
391 id: format!("{}_{chunk_index}", document.id),
392 text,
393 embedding: Vec::new(),
394 metadata,
395 document_id: document.id.clone(),
396 });
397 chunk_index += 1;
398 }
399 }
400
401 chunks
402 }
403}
404
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Document;
    use std::collections::HashMap;

    /// Builds a document with id `"test"`, no metadata and the given text.
    fn doc(text: &str) -> Document {
        Document {
            id: "test".to_string(),
            text: text.to_string(),
            metadata: HashMap::new(),
            source_uri: None,
        }
    }

    /// Checks the invariants every piece must satisfy for the inputs used below: it is
    /// non-empty, no longer than `max_len` bytes, and a contiguous slice of `source`.
    ///
    /// Merely producing the pieces already proves that no slice landed inside a multi-byte
    /// character, since that would have panicked.
    fn assert_pieces_valid(source: &str, max_len: usize, pieces: &[&str]) {
        for piece in pieces {
            assert!(!piece.is_empty(), "empty chunk produced from {source:?}");
            assert!(
                piece.len() <= max_len,
                "chunk {piece:?} is longer than {max_len} bytes"
            );
            assert!(
                source.contains(piece),
                "chunk {piece:?} is not a slice of {source:?}"
            );
        }
    }

    #[test]
    fn fixed_chunker_utf8_multibyte() {
        let text = "你好世界测试文本";
        let chunker = FixedSizeChunker::new(5, 0);
        let chunks = chunker.chunk(&doc(text));
        let pieces: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect();

        assert!(pieces.len() > 1);
        assert_pieces_valid(text, 5, &pieces);
        // Without overlap the chunks must tile the input exactly.
        assert_eq!(pieces.concat(), text);
    }

    #[test]
    fn fixed_chunker_utf8_emoji() {
        let text = "🦀🚀🎉✨🌟💫";
        let chunker = FixedSizeChunker::new(6, 0);
        let chunks = chunker.chunk(&doc(text));
        let pieces: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect();

        assert!(pieces.len() > 1);
        assert_pieces_valid(text, 6, &pieces);
        // Without overlap the chunks must tile the input exactly.
        assert_eq!(pieces.concat(), text);
    }

    #[test]
    fn fixed_chunker_utf8_mixed() {
        let text = "Hello café 你好 🦀";
        let chunker = FixedSizeChunker::new(7, 2);
        let chunks = chunker.chunk(&doc(text));
        let pieces: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect();

        assert!(!pieces.is_empty());
        assert_pieces_valid(text, 7, &pieces);
        assert!(text.starts_with(pieces[0]));
    }

    #[test]
    fn split_by_size_utf8() {
        let text = "日本語のテスト文字列です";
        let chunks = split_by_size(text, 10, 3);
        let pieces: Vec<&str> = chunks.iter().map(String::as_str).collect();

        assert!(pieces.len() > 1);
        assert_pieces_valid(text, 10, &pieces);
        assert!(text.starts_with(pieces[0]));
    }

    #[test]
    fn recursive_chunker_utf8() {
        let text = "第一段落。这是中文文本。\n\n第二段落。更多中文内容在这里。";
        let chunker = RecursiveChunker::new(15, 3);
        let chunks = chunker.chunk(&doc(text));
        let pieces: Vec<&str> = chunks.iter().map(|c| c.text.as_str()).collect();

        assert!(!pieces.is_empty());
        assert_pieces_valid(text, 15, &pieces);
        assert!(text.starts_with(pieces[0]));
    }
}