adk_rag/
chunking.rs

1//! Document chunking strategies.
2//!
3//! This module provides the [`Chunker`] trait and three implementations:
4//!
5//! - [`FixedSizeChunker`] — splits by character count with configurable overlap
6//! - [`RecursiveChunker`] — splits hierarchically by paragraphs, sentences, then words
7//! - [`MarkdownChunker`] — splits by markdown headers, preserving header context
8
9use crate::document::{Chunk, Document};
10
11/// MSRV-compatible replacement for `str::floor_char_boundary` (stable since 1.91.0).
12/// Returns the largest byte index `<= index` that is a valid char boundary.
13fn floor_char_boundary(s: &str, index: usize) -> usize {
14    if index >= s.len() {
15        return s.len();
16    }
17    let mut i = index;
18    while i > 0 && !s.is_char_boundary(i) {
19        i -= 1;
20    }
21    i
22}
23
24/// A strategy for splitting documents into chunks.
25///
26/// Implementations produce [`Chunk`]s with text and metadata but no embeddings.
27/// Embeddings are attached later by the pipeline.
28pub trait Chunker: Send + Sync {
29    /// Split a document into chunks.
30    ///
31    /// Returns an empty `Vec` if the document has empty text.
32    /// Each returned chunk has an empty embedding vector.
33    fn chunk(&self, document: &Document) -> Vec<Chunk>;
34}
35
36/// Splits text into fixed-size chunks by character count with configurable overlap.
37///
38/// Chunk IDs are generated as `{document_id}_{chunk_index}`. Each chunk inherits
39/// the parent document's metadata plus a `chunk_index` field.
40///
41/// # Example
42///
43/// ```rust,ignore
44/// use adk_rag::FixedSizeChunker;
45///
46/// let chunker = FixedSizeChunker::new(256, 50);
47/// let chunks = chunker.chunk(&document);
48/// ```
49#[derive(Debug, Clone)]
50pub struct FixedSizeChunker {
51    chunk_size: usize,
52    chunk_overlap: usize,
53}
54
55impl FixedSizeChunker {
56    /// Create a new `FixedSizeChunker`.
57    ///
58    /// # Arguments
59    ///
60    /// * `chunk_size` — maximum number of characters per chunk
61    /// * `chunk_overlap` — number of overlapping characters between consecutive chunks
62    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
63        Self { chunk_size, chunk_overlap }
64    }
65}
66
67impl Chunker for FixedSizeChunker {
68    fn chunk(&self, document: &Document) -> Vec<Chunk> {
69        if document.text.is_empty() {
70            return Vec::new();
71        }
72
73        let text = &document.text;
74        let mut chunks = Vec::new();
75        let mut start = 0;
76        let mut chunk_index = 0;
77
78        while start < text.len() {
79            let end = floor_char_boundary(text, (start + self.chunk_size).min(text.len()));
80            let chunk_text = &text[start..end];
81
82            let mut metadata = document.metadata.clone();
83            metadata.insert("chunk_index".to_string(), chunk_index.to_string());
84
85            chunks.push(Chunk {
86                id: format!("{}_{chunk_index}", document.id),
87                text: chunk_text.to_string(),
88                embedding: Vec::new(),
89                metadata,
90                document_id: document.id.clone(),
91            });
92
93            chunk_index += 1;
94            let step = self.chunk_size.saturating_sub(self.chunk_overlap);
95            if step == 0 {
96                break;
97            }
98            start = floor_char_boundary(text, start + step);
99        }
100
101        chunks
102    }
103}
104
105/// Splits text hierarchically: paragraphs → sentences → words.
106///
107/// First splits by paragraph separators (`\n\n`). If a paragraph exceeds
108/// `chunk_size`, splits by sentence boundaries (`. `, `! `, `? `). If a
109/// sentence still exceeds `chunk_size`, splits by word boundaries. Overlap
110/// is applied between chunks at each level.
111///
112/// # Example
113///
114/// ```rust,ignore
115/// use adk_rag::RecursiveChunker;
116///
117/// let chunker = RecursiveChunker::new(512, 100);
118/// let chunks = chunker.chunk(&document);
119/// ```
120#[derive(Debug, Clone)]
121pub struct RecursiveChunker {
122    chunk_size: usize,
123    chunk_overlap: usize,
124}
125
126impl RecursiveChunker {
127    /// Create a new `RecursiveChunker`.
128    ///
129    /// # Arguments
130    ///
131    /// * `chunk_size` — maximum number of characters per chunk
132    /// * `chunk_overlap` — number of overlapping characters between consecutive chunks
133    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
134        Self { chunk_size, chunk_overlap }
135    }
136}
137
138/// Split text by a separator, then merge segments into chunks that respect
139/// `chunk_size`. If a segment exceeds `chunk_size`, it is split further
140/// using the next-level separator.
141fn split_and_merge(
142    text: &str,
143    chunk_size: usize,
144    chunk_overlap: usize,
145    separators: &[&str],
146) -> Vec<String> {
147    if text.len() <= chunk_size || separators.is_empty() {
148        return split_by_size(text, chunk_size, chunk_overlap);
149    }
150
151    let separator = separators[0];
152    let remaining_separators = &separators[1..];
153
154    let segments: Vec<&str> = if separator == " " {
155        text.split(' ').collect()
156    } else {
157        split_keeping_separator(text, separator)
158    };
159
160    let mut chunks = Vec::new();
161    let mut current = String::new();
162
163    for segment in segments {
164        if current.is_empty() {
165            current = segment.to_string();
166        } else if current.len() + segment.len() <= chunk_size {
167            current.push_str(segment);
168        } else {
169            // Current chunk is full — process it
170            if current.len() > chunk_size {
171                chunks.extend(split_and_merge(
172                    &current,
173                    chunk_size,
174                    chunk_overlap,
175                    remaining_separators,
176                ));
177            } else {
178                chunks.push(current);
179            }
180            // Start new chunk with overlap
181            current = segment.to_string();
182        }
183    }
184
185    if !current.is_empty() {
186        if current.len() > chunk_size {
187            chunks.extend(split_and_merge(
188                &current,
189                chunk_size,
190                chunk_overlap,
191                remaining_separators,
192            ));
193        } else {
194            chunks.push(current);
195        }
196    }
197
198    chunks
199}
200
201/// Split text at a separator while keeping the separator attached to the preceding segment.
202fn split_keeping_separator<'a>(text: &'a str, separator: &str) -> Vec<&'a str> {
203    let mut result = Vec::new();
204    let mut start = 0;
205
206    while let Some(pos) = text[start..].find(separator) {
207        let end = start + pos + separator.len();
208        result.push(&text[start..end]);
209        start = end;
210    }
211
212    if start < text.len() {
213        result.push(&text[start..]);
214    }
215
216    result
217}
218
219/// Simple character-based splitting with overlap.
220fn split_by_size(text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
221    if text.is_empty() {
222        return Vec::new();
223    }
224
225    let mut chunks = Vec::new();
226    let mut start = 0;
227
228    while start < text.len() {
229        let end = floor_char_boundary(text, (start + chunk_size).min(text.len()));
230        chunks.push(text[start..end].to_string());
231        let step = chunk_size.saturating_sub(chunk_overlap);
232        if step == 0 {
233            break;
234        }
235        start = floor_char_boundary(text, start + step);
236    }
237
238    chunks
239}
240
241impl Chunker for RecursiveChunker {
242    fn chunk(&self, document: &Document) -> Vec<Chunk> {
243        if document.text.is_empty() {
244            return Vec::new();
245        }
246
247        let separators = ["\n\n", ". ", "! ", "? ", " "];
248        let raw_chunks =
249            split_and_merge(&document.text, self.chunk_size, self.chunk_overlap, &separators);
250
251        raw_chunks
252            .into_iter()
253            .enumerate()
254            .map(|(i, text)| {
255                let mut metadata = document.metadata.clone();
256                metadata.insert("chunk_index".to_string(), i.to_string());
257                Chunk {
258                    id: format!("{}_{i}", document.id),
259                    text,
260                    embedding: Vec::new(),
261                    metadata,
262                    document_id: document.id.clone(),
263                }
264            })
265            .collect()
266    }
267}
268
269/// Splits text by markdown headers, keeping each section as a chunk.
270///
271/// Each section is prefixed with its header hierarchy. Sections exceeding
272/// `chunk_size` are further split using [`RecursiveChunker`] logic.
273/// The `header_path` metadata field records the header hierarchy for each chunk.
274///
275/// # Example
276///
277/// ```rust,ignore
278/// use adk_rag::MarkdownChunker;
279///
280/// let chunker = MarkdownChunker::new(512, 100);
281/// let chunks = chunker.chunk(&document);
282/// ```
283#[derive(Debug, Clone)]
284pub struct MarkdownChunker {
285    chunk_size: usize,
286    chunk_overlap: usize,
287}
288
289impl MarkdownChunker {
290    /// Create a new `MarkdownChunker`.
291    ///
292    /// # Arguments
293    ///
294    /// * `chunk_size` — maximum number of characters per chunk
295    /// * `chunk_overlap` — number of overlapping characters between consecutive chunks
296    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
297        Self { chunk_size, chunk_overlap }
298    }
299}
300
301/// A markdown section with its header hierarchy and body text.
302struct MarkdownSection {
303    header_path: String,
304    text: String,
305}
306
307/// Parse markdown text into sections split by headers.
308fn parse_markdown_sections(text: &str) -> Vec<MarkdownSection> {
309    let mut sections = Vec::new();
310    let mut headers: Vec<String> = Vec::new();
311    let mut current_body = String::new();
312    let mut current_header_path = String::new();
313
314    for line in text.lines() {
315        let trimmed = line.trim_start();
316        if trimmed.starts_with('#') {
317            // Save previous section
318            if !current_body.is_empty() || !current_header_path.is_empty() {
319                sections.push(MarkdownSection {
320                    header_path: current_header_path.clone(),
321                    text: current_body.trim().to_string(),
322                });
323                current_body = String::new();
324            }
325
326            // Determine header level
327            let level = trimmed.chars().take_while(|c| *c == '#').count();
328            let header_text = trimmed[level..].trim().to_string();
329
330            // Update header stack
331            headers.truncate(level.saturating_sub(1));
332            headers.push(header_text);
333            current_header_path = headers.join(" > ");
334        } else {
335            if !current_body.is_empty() {
336                current_body.push('\n');
337            }
338            current_body.push_str(line);
339        }
340    }
341
342    // Save final section
343    if !current_body.is_empty() || !current_header_path.is_empty() {
344        sections.push(MarkdownSection {
345            header_path: current_header_path,
346            text: current_body.trim().to_string(),
347        });
348    }
349
350    sections
351}
352
353impl Chunker for MarkdownChunker {
354    fn chunk(&self, document: &Document) -> Vec<Chunk> {
355        if document.text.is_empty() {
356            return Vec::new();
357        }
358
359        let sections = parse_markdown_sections(&document.text);
360        let mut chunks = Vec::new();
361        let mut chunk_index = 0;
362
363        for section in sections {
364            // Build section text with header prefix
365            let section_text = if section.header_path.is_empty() {
366                section.text.clone()
367            } else if section.text.is_empty() {
368                section.header_path.clone()
369            } else {
370                format!("{}\n{}", section.header_path, section.text)
371            };
372
373            if section_text.is_empty() {
374                continue;
375            }
376
377            let sub_chunks = if section_text.len() > self.chunk_size {
378                // Further split using recursive logic
379                let separators = ["\n\n", ". ", "! ", "? ", " "];
380                split_and_merge(&section_text, self.chunk_size, self.chunk_overlap, &separators)
381            } else {
382                vec![section_text]
383            };
384
385            for text in sub_chunks {
386                let mut metadata = document.metadata.clone();
387                metadata.insert("chunk_index".to_string(), chunk_index.to_string());
388                metadata.insert("header_path".to_string(), section.header_path.clone());
389
390                chunks.push(Chunk {
391                    id: format!("{}_{chunk_index}", document.id),
392                    text,
393                    embedding: Vec::new(),
394                    metadata,
395                    document_id: document.id.clone(),
396                });
397                chunk_index += 1;
398            }
399        }
400
401        chunks
402    }
403}
404
405#[cfg(test)]
406mod tests {
407    use super::*;
408    use crate::Document;
409    use std::collections::HashMap;
410
411    fn doc(text: &str) -> Document {
412        Document {
413            id: "test".to_string(),
414            text: text.to_string(),
415            metadata: HashMap::new(),
416            source_uri: None,
417        }
418    }
419
420    #[test]
421    fn fixed_chunker_utf8_multibyte() {
422        // Chinese characters are 3 bytes each in UTF-8.
423        // "你好世界" = 4 chars, 12 bytes.
424        // With chunk_size=5 (bytes), naive slicing would panic mid-character.
425        let chunker = FixedSizeChunker::new(5, 0);
426        let chunks = chunker.chunk(&doc("你好世界测试文本"));
427        // Should not panic and all chunks should be valid UTF-8
428        for chunk in &chunks {
429            assert!(chunk.text.is_char_boundary(0));
430            // Verify it's valid UTF-8 by iterating chars
431            let _ = chunk.text.chars().count();
432        }
433        // Should produce multiple chunks
434        assert!(chunks.len() > 1);
435    }
436
437    #[test]
438    fn fixed_chunker_utf8_emoji() {
439        // Emoji are 4 bytes each. "🦀🚀🎉" = 3 chars, 12 bytes.
440        let chunker = FixedSizeChunker::new(6, 0);
441        let chunks = chunker.chunk(&doc("🦀🚀🎉✨🌟💫"));
442        for chunk in &chunks {
443            let _ = chunk.text.chars().count();
444        }
445        assert!(chunks.len() > 1);
446    }
447
448    #[test]
449    fn fixed_chunker_utf8_mixed() {
450        // Mix of ASCII (1 byte), accented (2 bytes), CJK (3 bytes), emoji (4 bytes)
451        let text = "Hello café 你好 🦀";
452        let chunker = FixedSizeChunker::new(7, 2);
453        let chunks = chunker.chunk(&doc(text));
454        for chunk in &chunks {
455            let _ = chunk.text.chars().count();
456        }
457        assert!(!chunks.is_empty());
458    }
459
460    #[test]
461    fn split_by_size_utf8() {
462        let text = "日本語のテスト文字列です";
463        let chunks = split_by_size(text, 10, 3);
464        for chunk in &chunks {
465            let _ = chunk.chars().count();
466        }
467        assert!(chunks.len() > 1);
468    }
469
470    #[test]
471    fn recursive_chunker_utf8() {
472        let text = "第一段落。这是中文文本。\n\n第二段落。更多中文内容在这里。";
473        let chunker = RecursiveChunker::new(15, 3);
474        let chunks = chunker.chunk(&doc(text));
475        for chunk in &chunks {
476            let _ = chunk.text.chars().count();
477        }
478        assert!(!chunks.is_empty());
479    }
480}
adk_rag/chunking.rs

adk_rag/
chunking.rs