1use crate::document::{Chunk, Document};
10
11pub trait Chunker: Send + Sync {
16 fn chunk(&self, document: &Document) -> Vec<Chunk>;
21}
22
/// Chunker that cuts a document's text into fixed-size, optionally
/// overlapping windows.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Maximum size of one chunk, compared against `str::len` (bytes).
    chunk_size: usize,
    /// Amount of text shared between consecutive chunks, in bytes.
    chunk_overlap: usize,
}
41
42impl FixedSizeChunker {
43 pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
50 Self { chunk_size, chunk_overlap }
51 }
52}
53
54impl Chunker for FixedSizeChunker {
55 fn chunk(&self, document: &Document) -> Vec<Chunk> {
56 if document.text.is_empty() {
57 return Vec::new();
58 }
59
60 let text = &document.text;
61 let mut chunks = Vec::new();
62 let mut start = 0;
63 let mut chunk_index = 0;
64
65 while start < text.len() {
66 let end = (start + self.chunk_size).min(text.len());
67 let chunk_text = &text[start..end];
68
69 let mut metadata = document.metadata.clone();
70 metadata.insert("chunk_index".to_string(), chunk_index.to_string());
71
72 chunks.push(Chunk {
73 id: format!("{}_{chunk_index}", document.id),
74 text: chunk_text.to_string(),
75 embedding: Vec::new(),
76 metadata,
77 document_id: document.id.clone(),
78 });
79
80 chunk_index += 1;
81 let step = self.chunk_size.saturating_sub(self.chunk_overlap);
82 if step == 0 {
83 break;
84 }
85 start += step;
86 }
87
88 chunks
89 }
90}
91
/// Chunker that splits text along a prioritized list of separators, falling
/// back to finer separators only for pieces that are still too large.
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
    /// Maximum size of one chunk, compared against `str::len` (bytes).
    chunk_size: usize,
    /// Overlap in bytes, forwarded to the size-based fallback splitter.
    chunk_overlap: usize,
}
112
113impl RecursiveChunker {
114 pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
121 Self { chunk_size, chunk_overlap }
122 }
123}
124
125fn split_and_merge(
129 text: &str,
130 chunk_size: usize,
131 chunk_overlap: usize,
132 separators: &[&str],
133) -> Vec<String> {
134 if text.len() <= chunk_size || separators.is_empty() {
135 return split_by_size(text, chunk_size, chunk_overlap);
136 }
137
138 let separator = separators[0];
139 let remaining_separators = &separators[1..];
140
141 let segments: Vec<&str> = if separator == " " {
142 text.split(' ').collect()
143 } else {
144 split_keeping_separator(text, separator)
145 };
146
147 let mut chunks = Vec::new();
148 let mut current = String::new();
149
150 for segment in segments {
151 if current.is_empty() {
152 current = segment.to_string();
153 } else if current.len() + segment.len() <= chunk_size {
154 current.push_str(segment);
155 } else {
156 if current.len() > chunk_size {
158 chunks.extend(split_and_merge(
159 ¤t,
160 chunk_size,
161 chunk_overlap,
162 remaining_separators,
163 ));
164 } else {
165 chunks.push(current);
166 }
167 current = segment.to_string();
169 }
170 }
171
172 if !current.is_empty() {
173 if current.len() > chunk_size {
174 chunks.extend(split_and_merge(
175 ¤t,
176 chunk_size,
177 chunk_overlap,
178 remaining_separators,
179 ));
180 } else {
181 chunks.push(current);
182 }
183 }
184
185 chunks
186}
187
/// Splits `text` after every occurrence of `separator`, keeping the separator
/// at the end of the segment it terminates.
///
/// Matches are found left to right without overlapping. A trailing remainder
/// without a separator becomes the last segment; no empty segments are
/// produced. An empty `text` yields no segments.
///
/// An empty `separator` never matches: the whole text comes back as one
/// segment (an unguarded search would match at offset 0 forever).
fn split_keeping_separator<'a>(text: &'a str, separator: &str) -> Vec<&'a str> {
    if text.is_empty() {
        return Vec::new();
    }
    if separator.is_empty() {
        return vec![text];
    }

    text.split_inclusive(separator).collect()
}
205
/// Cuts `text` into windows of at most `chunk_size` bytes, each window
/// starting `chunk_size - chunk_overlap` bytes after the previous one.
///
/// Window edges are snapped to UTF-8 character boundaries, so multi-byte
/// text never causes a slicing panic. A character wider than `chunk_size` is
/// emitted whole rather than split. Splitting stops as soon as a window
/// reaches the end of the text, so no piece is a mere suffix of the previous
/// one.
///
/// Degenerate settings: an empty text yields no pieces. If
/// `chunk_overlap >= chunk_size` the window cannot advance, so only the first
/// piece is produced (with `chunk_size == 0` that piece is empty).
fn split_by_size(text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
    /// Largest char boundary that is `<= index` (clamped to the text length).
    fn floor_boundary(text: &str, index: usize) -> usize {
        let mut index = index.min(text.len());
        while !text.is_char_boundary(index) {
            index -= 1;
        }
        index
    }

    /// Smallest char boundary that is `>= index` (clamped to the text length).
    fn ceil_boundary(text: &str, index: usize) -> usize {
        let mut index = index.min(text.len());
        while !text.is_char_boundary(index) {
            index += 1;
        }
        index
    }

    let step = chunk_size.saturating_sub(chunk_overlap);
    let mut chunks = Vec::new();
    let mut start = 0;

    while start < text.len() {
        let mut end = floor_boundary(text, start.saturating_add(chunk_size));
        if end <= start && chunk_size > 0 {
            // The next character is wider than the window: take it whole
            // instead of emitting an empty piece.
            end = ceil_boundary(text, start + 1);
        }
        chunks.push(text[start..end].to_string());

        // Once the window touches the end, any further window would only
        // repeat a suffix of this piece. A zero step cannot advance at all.
        if end >= text.len() || step == 0 {
            break;
        }

        let next = floor_boundary(text, start.saturating_add(step));
        // Snapping down must never stall the loop; fall forward by one
        // character when the step is narrower than the current character.
        start = if next > start {
            next
        } else {
            ceil_boundary(text, start + 1)
        };
    }

    chunks
}
227
228impl Chunker for RecursiveChunker {
229 fn chunk(&self, document: &Document) -> Vec<Chunk> {
230 if document.text.is_empty() {
231 return Vec::new();
232 }
233
234 let separators = ["\n\n", ". ", "! ", "? ", " "];
235 let raw_chunks =
236 split_and_merge(&document.text, self.chunk_size, self.chunk_overlap, &separators);
237
238 raw_chunks
239 .into_iter()
240 .enumerate()
241 .map(|(i, text)| {
242 let mut metadata = document.metadata.clone();
243 metadata.insert("chunk_index".to_string(), i.to_string());
244 Chunk {
245 id: format!("{}_{i}", document.id),
246 text,
247 embedding: Vec::new(),
248 metadata,
249 document_id: document.id.clone(),
250 }
251 })
252 .collect()
253 }
254}
255
/// Chunker that splits Markdown text into header-delimited sections and
/// further divides sections that exceed the chunk size.
#[derive(Debug, Clone)]
pub struct MarkdownChunker {
    /// Maximum size of one chunk, compared against `str::len` (bytes).
    chunk_size: usize,
    /// Overlap in bytes, forwarded to the splitter used for oversized sections.
    chunk_overlap: usize,
}
275
276impl MarkdownChunker {
277 pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
284 Self { chunk_size, chunk_overlap }
285 }
286}
287
/// One header-delimited section of a Markdown document.
struct MarkdownSection {
    /// Titles of the headers enclosing this section, joined with `" > "`.
    /// Empty for text that appears before the first header.
    header_path: String,
    /// Body text of the section with surrounding whitespace trimmed; may be
    /// empty when a header is directly followed by another header.
    text: String,
}
293
294fn parse_markdown_sections(text: &str) -> Vec<MarkdownSection> {
296 let mut sections = Vec::new();
297 let mut headers: Vec<String> = Vec::new();
298 let mut current_body = String::new();
299 let mut current_header_path = String::new();
300
301 for line in text.lines() {
302 let trimmed = line.trim_start();
303 if trimmed.starts_with('#') {
304 if !current_body.is_empty() || !current_header_path.is_empty() {
306 sections.push(MarkdownSection {
307 header_path: current_header_path.clone(),
308 text: current_body.trim().to_string(),
309 });
310 current_body = String::new();
311 }
312
313 let level = trimmed.chars().take_while(|c| *c == '#').count();
315 let header_text = trimmed[level..].trim().to_string();
316
317 headers.truncate(level.saturating_sub(1));
319 headers.push(header_text);
320 current_header_path = headers.join(" > ");
321 } else {
322 if !current_body.is_empty() {
323 current_body.push('\n');
324 }
325 current_body.push_str(line);
326 }
327 }
328
329 if !current_body.is_empty() || !current_header_path.is_empty() {
331 sections.push(MarkdownSection {
332 header_path: current_header_path,
333 text: current_body.trim().to_string(),
334 });
335 }
336
337 sections
338}
339
340impl Chunker for MarkdownChunker {
341 fn chunk(&self, document: &Document) -> Vec<Chunk> {
342 if document.text.is_empty() {
343 return Vec::new();
344 }
345
346 let sections = parse_markdown_sections(&document.text);
347 let mut chunks = Vec::new();
348 let mut chunk_index = 0;
349
350 for section in sections {
351 let section_text = if section.header_path.is_empty() {
353 section.text.clone()
354 } else if section.text.is_empty() {
355 section.header_path.clone()
356 } else {
357 format!("{}\n{}", section.header_path, section.text)
358 };
359
360 if section_text.is_empty() {
361 continue;
362 }
363
364 let sub_chunks = if section_text.len() > self.chunk_size {
365 let separators = ["\n\n", ". ", "! ", "? ", " "];
367 split_and_merge(§ion_text, self.chunk_size, self.chunk_overlap, &separators)
368 } else {
369 vec![section_text]
370 };
371
372 for text in sub_chunks {
373 let mut metadata = document.metadata.clone();
374 metadata.insert("chunk_index".to_string(), chunk_index.to_string());
375 metadata.insert("header_path".to_string(), section.header_path.clone());
376
377 chunks.push(Chunk {
378 id: format!("{}_{chunk_index}", document.id),
379 text,
380 embedding: Vec::new(),
381 metadata,
382 document_id: document.id.clone(),
383 });
384 chunk_index += 1;
385 }
386 }
387
388 chunks
389 }
390}