Skip to main content

adk_rag/
chunking.rs

1//! Document chunking strategies.
2//!
3//! This module provides the [`Chunker`] trait and three implementations:
4//!
5//! - [`FixedSizeChunker`] — splits by character count with configurable overlap
6//! - [`RecursiveChunker`] — splits hierarchically by paragraphs, sentences, then words
7//! - [`MarkdownChunker`] — splits by markdown headers, preserving header context
8
9use crate::document::{Chunk, Document};
10
11/// A strategy for splitting documents into chunks.
12///
13/// Implementations produce [`Chunk`]s with text and metadata but no embeddings.
14/// Embeddings are attached later by the pipeline.
15pub trait Chunker: Send + Sync {
16    /// Split a document into chunks.
17    ///
18    /// Returns an empty `Vec` if the document has empty text.
19    /// Each returned chunk has an empty embedding vector.
20    fn chunk(&self, document: &Document) -> Vec<Chunk>;
21}
22
/// Chunker that cuts a document into fixed-length windows, optionally
/// overlapping.
///
/// Every chunk is identified as `{document_id}_{chunk_index}` and carries a copy
/// of the parent document's metadata extended with a `chunk_index` entry.
///
/// # Example
///
/// ```rust,ignore
/// use adk_rag::FixedSizeChunker;
///
/// let chunker = FixedSizeChunker::new(256, 50);
/// let chunks = chunker.chunk(&document);
/// ```
#[derive(Clone, Debug)]
pub struct FixedSizeChunker {
    chunk_size: usize,
    chunk_overlap: usize,
}

impl FixedSizeChunker {
    /// Build a chunker from a window length and an overlap.
    ///
    /// # Arguments
    ///
    /// * `chunk_size` — upper bound on the length of each chunk
    /// * `chunk_overlap` — how much of a chunk's tail is repeated at the start
    ///   of the following chunk
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        FixedSizeChunker { chunk_size, chunk_overlap }
    }
}
53
54impl Chunker for FixedSizeChunker {
55    fn chunk(&self, document: &Document) -> Vec<Chunk> {
56        if document.text.is_empty() {
57            return Vec::new();
58        }
59
60        let text = &document.text;
61        let mut chunks = Vec::new();
62        let mut start = 0;
63        let mut chunk_index = 0;
64
65        while start < text.len() {
66            let end = (start + self.chunk_size).min(text.len());
67            let chunk_text = &text[start..end];
68
69            let mut metadata = document.metadata.clone();
70            metadata.insert("chunk_index".to_string(), chunk_index.to_string());
71
72            chunks.push(Chunk {
73                id: format!("{}_{chunk_index}", document.id),
74                text: chunk_text.to_string(),
75                embedding: Vec::new(),
76                metadata,
77                document_id: document.id.clone(),
78            });
79
80            chunk_index += 1;
81            let step = self.chunk_size.saturating_sub(self.chunk_overlap);
82            if step == 0 {
83                break;
84            }
85            start += step;
86        }
87
88        chunks
89    }
90}
91
/// Chunker that splits along natural text boundaries, from coarse to fine.
///
/// The separators are tried in order: paragraph breaks (`\n\n`), then the
/// sentence endings `. `, `! ` and `? `, then single spaces. A piece that is
/// still longer than `chunk_size` once the separators are exhausted is cut
/// purely by size, which is where `chunk_overlap` takes effect.
///
/// # Example
///
/// ```rust,ignore
/// use adk_rag::RecursiveChunker;
///
/// let chunker = RecursiveChunker::new(512, 100);
/// let chunks = chunker.chunk(&document);
/// ```
#[derive(Clone, Debug)]
pub struct RecursiveChunker {
    chunk_size: usize,
    chunk_overlap: usize,
}

impl RecursiveChunker {
    /// Build a chunker from a size limit and an overlap.
    ///
    /// # Arguments
    ///
    /// * `chunk_size` — upper bound on the length of each chunk
    /// * `chunk_overlap` — overlap between consecutive chunks produced by the
    ///   size-based fallback
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        RecursiveChunker { chunk_size, chunk_overlap }
    }
}
124
125/// Split text by a separator, then merge segments into chunks that respect
126/// `chunk_size`. If a segment exceeds `chunk_size`, it is split further
127/// using the next-level separator.
128fn split_and_merge(
129    text: &str,
130    chunk_size: usize,
131    chunk_overlap: usize,
132    separators: &[&str],
133) -> Vec<String> {
134    if text.len() <= chunk_size || separators.is_empty() {
135        return split_by_size(text, chunk_size, chunk_overlap);
136    }
137
138    let separator = separators[0];
139    let remaining_separators = &separators[1..];
140
141    let segments: Vec<&str> = if separator == " " {
142        text.split(' ').collect()
143    } else {
144        split_keeping_separator(text, separator)
145    };
146
147    let mut chunks = Vec::new();
148    let mut current = String::new();
149
150    for segment in segments {
151        if current.is_empty() {
152            current = segment.to_string();
153        } else if current.len() + segment.len() <= chunk_size {
154            current.push_str(segment);
155        } else {
156            // Current chunk is full — process it
157            if current.len() > chunk_size {
158                chunks.extend(split_and_merge(
159                    &current,
160                    chunk_size,
161                    chunk_overlap,
162                    remaining_separators,
163                ));
164            } else {
165                chunks.push(current);
166            }
167            // Start new chunk with overlap
168            current = segment.to_string();
169        }
170    }
171
172    if !current.is_empty() {
173        if current.len() > chunk_size {
174            chunks.extend(split_and_merge(
175                &current,
176                chunk_size,
177                chunk_overlap,
178                remaining_separators,
179            ));
180        } else {
181            chunks.push(current);
182        }
183    }
184
185    chunks
186}
187
/// Split `text` after every occurrence of `separator`, leaving each separator
/// attached to the end of the segment it terminates.
///
/// Concatenating the returned segments reproduces `text` exactly, and no
/// segment is empty. Empty `text` yields an empty `Vec`. An empty `separator`
/// cannot delimit anything, so the whole text is returned as one segment.
fn split_keeping_separator<'a>(text: &'a str, separator: &str) -> Vec<&'a str> {
    if text.is_empty() {
        return Vec::new();
    }
    if separator.is_empty() {
        // An empty pattern matches at every position without consuming any
        // input, so treat it as "no split" rather than searching for it.
        return vec![text];
    }
    text.split_inclusive(separator).collect()
}
205
/// Cut `text` into windows of `chunk_size` characters, with consecutive
/// windows starting `chunk_size - chunk_overlap` characters apart.
///
/// Sizes are counted in Unicode scalar values (`char`s), so a window never
/// ends in the middle of a multi-byte UTF-8 sequence. Empty text yields an
/// empty `Vec`. When `chunk_overlap` is greater than or equal to `chunk_size`
/// the window cannot advance, so only the first chunk is returned.
fn split_by_size(text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
    // Byte offset reached after walking `chars` characters forward from the
    // char boundary `from`, clamped to the end of the text.
    let advance = |from: usize, chars: usize| -> usize {
        text[from..]
            .char_indices()
            .nth(chars)
            .map_or(text.len(), |(offset, _)| from + offset)
    };

    let step = chunk_size.saturating_sub(chunk_overlap);
    let mut chunks = Vec::new();
    let mut start = 0;

    while start < text.len() {
        let end = advance(start, chunk_size);
        chunks.push(text[start..end].to_string());

        // A zero step would revisit the same window forever.
        if step == 0 {
            break;
        }
        start = advance(start, step);
    }

    chunks
}
227
228impl Chunker for RecursiveChunker {
229    fn chunk(&self, document: &Document) -> Vec<Chunk> {
230        if document.text.is_empty() {
231            return Vec::new();
232        }
233
234        let separators = ["\n\n", ". ", "! ", "? ", " "];
235        let raw_chunks =
236            split_and_merge(&document.text, self.chunk_size, self.chunk_overlap, &separators);
237
238        raw_chunks
239            .into_iter()
240            .enumerate()
241            .map(|(i, text)| {
242                let mut metadata = document.metadata.clone();
243                metadata.insert("chunk_index".to_string(), i.to_string());
244                Chunk {
245                    id: format!("{}_{i}", document.id),
246                    text,
247                    embedding: Vec::new(),
248                    metadata,
249                    document_id: document.id.clone(),
250                }
251            })
252            .collect()
253    }
254}
255
/// Chunker that treats each markdown header section as a chunk.
///
/// The text of a section is prefixed with its header hierarchy, and that same
/// hierarchy is recorded in the `header_path` metadata field of every chunk
/// cut from the section. A section longer than `chunk_size` is split further
/// with the same separator-based logic that [`RecursiveChunker`] uses.
///
/// # Example
///
/// ```rust,ignore
/// use adk_rag::MarkdownChunker;
///
/// let chunker = MarkdownChunker::new(512, 100);
/// let chunks = chunker.chunk(&document);
/// ```
#[derive(Clone, Debug)]
pub struct MarkdownChunker {
    chunk_size: usize,
    chunk_overlap: usize,
}

impl MarkdownChunker {
    /// Build a chunker from a size limit and an overlap.
    ///
    /// # Arguments
    ///
    /// * `chunk_size` — upper bound on the length of each chunk
    /// * `chunk_overlap` — overlap passed on when an oversized section has to
    ///   be split further
    pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
        MarkdownChunker { chunk_size, chunk_overlap }
    }
}
287
/// A markdown section with its header hierarchy and body text.
struct MarkdownSection {
    /// Titles of the enclosing headers joined with `" > "`; empty for text
    /// that appears before the first header.
    header_path: String,
    /// Section body with surrounding whitespace trimmed. The header line
    /// itself is not part of the body.
    text: String,
}

/// Return the fence character when `trimmed` starts with a code-fence marker,
/// i.e. at least three backticks or three tildes.
fn fence_marker(trimmed: &str) -> Option<char> {
    ['`', '~']
        .iter()
        .copied()
        .find(|&fence| trimmed.chars().take_while(|&c| c == fence).count() >= 3)
}

/// Parse `trimmed` as an ATX header: one to six `#` characters followed by
/// whitespace or the end of the line. Returns the level and the trimmed title.
fn parse_atx_header(trimmed: &str) -> Option<(usize, &str)> {
    let level = trimmed.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }
    // `#` is a single byte, so `level` is always a valid char boundary.
    let rest = &trimmed[level..];
    if rest.is_empty() || rest.starts_with(char::is_whitespace) {
        Some((level, rest.trim()))
    } else {
        None
    }
}

/// Parse markdown text into sections split by headers.
///
/// Each header line closes the section collected so far and opens a new one.
/// A section's `header_path` lists the titles of its ancestors by header
/// *level*: a new header replaces every open header of the same or a deeper
/// level, so siblings are never nested under each other even when levels are
/// skipped. Text before the first header forms a section with an empty path.
///
/// Lines inside fenced code blocks are always body text, so a `# comment` in a
/// code sample does not start a section. Fence tracking is deliberately
/// simple: a fence is closed by the next line that starts with three or more
/// of the same fence character.
fn parse_markdown_sections(text: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    // Open headers as (level, title), with strictly increasing levels.
    let mut header_stack: Vec<(usize, String)> = Vec::new();
    let mut current_body = String::new();
    let mut current_header_path = String::new();
    let mut open_fence: Option<char> = None;

    for line in text.lines() {
        let trimmed = line.trim_start();
        let marker = fence_marker(trimmed);

        // Headers are only recognised outside of fenced code blocks.
        let header = if open_fence.is_some() || marker.is_some() {
            None
        } else {
            parse_atx_header(trimmed)
        };

        match open_fence {
            Some(open) if marker == Some(open) => open_fence = None,
            None => open_fence = marker,
            Some(_) => {}
        }

        match header {
            Some((level, title)) => {
                // Save previous section
                if !current_body.is_empty() || !current_header_path.is_empty() {
                    sections.push(MarkdownSection {
                        header_path: current_header_path.clone(),
                        text: current_body.trim().to_string(),
                    });
                    current_body.clear();
                }

                // Drop every open header that is not a strict ancestor.
                let keep = header_stack
                    .iter()
                    .position(|(open_level, _)| *open_level >= level)
                    .unwrap_or(header_stack.len());
                header_stack.truncate(keep);
                header_stack.push((level, title.to_string()));

                current_header_path = header_stack
                    .iter()
                    .map(|(_, open_title)| open_title.as_str())
                    .collect::<Vec<_>>()
                    .join(" > ");
            }
            None => {
                if !current_body.is_empty() {
                    current_body.push('\n');
                }
                current_body.push_str(line);
            }
        }
    }

    // Save final section
    if !current_body.is_empty() || !current_header_path.is_empty() {
        sections.push(MarkdownSection {
            header_path: current_header_path,
            text: current_body.trim().to_string(),
        });
    }

    sections
}
339
340impl Chunker for MarkdownChunker {
341    fn chunk(&self, document: &Document) -> Vec<Chunk> {
342        if document.text.is_empty() {
343            return Vec::new();
344        }
345
346        let sections = parse_markdown_sections(&document.text);
347        let mut chunks = Vec::new();
348        let mut chunk_index = 0;
349
350        for section in sections {
351            // Build section text with header prefix
352            let section_text = if section.header_path.is_empty() {
353                section.text.clone()
354            } else if section.text.is_empty() {
355                section.header_path.clone()
356            } else {
357                format!("{}\n{}", section.header_path, section.text)
358            };
359
360            if section_text.is_empty() {
361                continue;
362            }
363
364            let sub_chunks = if section_text.len() > self.chunk_size {
365                // Further split using recursive logic
366                let separators = ["\n\n", ". ", "! ", "? ", " "];
367                split_and_merge(&section_text, self.chunk_size, self.chunk_overlap, &separators)
368            } else {
369                vec![section_text]
370            };
371
372            for text in sub_chunks {
373                let mut metadata = document.metadata.clone();
374                metadata.insert("chunk_index".to_string(), chunk_index.to_string());
375                metadata.insert("header_path".to_string(), section.header_path.clone());
376
377                chunks.push(Chunk {
378                    id: format!("{}_{chunk_index}", document.id),
379                    text,
380                    embedding: Vec::new(),
381                    metadata,
382                    document_id: document.id.clone(),
383                });
384                chunk_index += 1;
385            }
386        }
387
388        chunks
389    }
390}