Skip to main content

semantic_memory/
chunker.rs

1//! Text chunking via recursive splitting with overlap.
2
3use crate::config::ChunkingConfig;
4use crate::tokenizer::TokenCounter;
5use crate::types::TextChunk;
6
7/// Maximum recursion depth to prevent infinite loops.
8const MAX_RECURSION_DEPTH: usize = 10;
9
10/// Chunk text using the given configuration and token counter.
11///
12/// Returns an empty vec for empty input. Short text (≤ max_size) is
13/// returned as a single chunk.
14pub fn chunk_text(
15    text: &str,
16    config: &ChunkingConfig,
17    token_counter: &dyn TokenCounter,
18) -> Vec<TextChunk> {
19    if text.trim().is_empty() {
20        return Vec::new();
21    }
22
23    if text.len() <= config.max_size && text.len() >= config.min_size {
24        return vec![TextChunk {
25            index: 0,
26            content: text.to_string(),
27            token_count_estimate: token_counter.count_tokens(text),
28        }];
29    }
30
31    // Recursive split
32    let raw_chunks = recursive_split(text, config, 0);
33
34    // Merge small adjacent chunks
35    let merged = merge_small_chunks(raw_chunks, config.target_size, config.min_size);
36
37    // Apply overlap
38    let overlapped = apply_overlap(merged, config.overlap);
39
40    // Convert to TextChunk
41    overlapped
42        .into_iter()
43        .enumerate()
44        .map(|(i, content)| {
45            let token_count_estimate = token_counter.count_tokens(&content);
46            TextChunk {
47                index: i,
48                content,
49                token_count_estimate,
50            }
51        })
52        .collect()
53}
54
55/// Snap a byte offset to the nearest valid UTF-8 char boundary (searching backward).
56fn safe_split_at(text: &str, pos: usize) -> usize {
57    let mut split = pos.min(text.len());
58    while split > 0 && !text.is_char_boundary(split) {
59        split -= 1;
60    }
61    split
62}
63
64/// Force-split text at max_size boundaries, respecting UTF-8.
65fn force_split(text: &str, max_size: usize) -> Vec<String> {
66    let mut chunks = Vec::new();
67    let mut start = 0;
68    while start < text.len() {
69        let end = safe_split_at(text, start + max_size);
70        if end <= start {
71            // Can't make progress — advance past the current character
72            let mut next = start + 1;
73            while next < text.len() && !text.is_char_boundary(next) {
74                next += 1;
75            }
76            if next <= text.len() {
77                chunks.push(text[start..next].to_string());
78            }
79            start = next;
80            continue;
81        }
82        chunks.push(text[start..end].to_string());
83        start = end;
84    }
85    chunks
86}
87
88/// Recursively split text using a hierarchy of separators.
89fn recursive_split(text: &str, config: &ChunkingConfig, depth: usize) -> Vec<String> {
90    if text.len() <= config.max_size {
91        return vec![text.to_string()];
92    }
93
94    if depth >= MAX_RECURSION_DEPTH {
95        tracing::warn!("Chunker hit max recursion depth, force-splitting at max_size");
96        return force_split(text, config.max_size);
97    }
98
99    // Separator hierarchy: paragraph > sentence > word > force
100    let separators: &[&str] = &["\n\n", ". ", "? ", "! ", " "];
101
102    for sep in separators {
103        let parts: Vec<&str> = text.split(sep).collect();
104        if parts.len() <= 1 {
105            continue;
106        }
107
108        let mut chunks = Vec::new();
109        let mut current = String::new();
110
111        for (i, part) in parts.iter().enumerate() {
112            let piece = if i + 1 < parts.len() {
113                format!("{}{}", part, sep)
114            } else {
115                part.to_string()
116            };
117
118            if current.len() + piece.len() > config.target_size && !current.is_empty() {
119                chunks.push(current);
120                current = piece;
121            } else {
122                current.push_str(&piece);
123            }
124        }
125
126        if !current.is_empty() {
127            chunks.push(current);
128        }
129
130        // Recursively split any chunks that are still too large
131        let mut result = Vec::new();
132        for chunk in chunks {
133            if chunk.len() > config.max_size {
134                result.extend(recursive_split(&chunk, config, depth + 1));
135            } else {
136                result.push(chunk);
137            }
138        }
139
140        if result.len() > 1 {
141            return result;
142        }
143    }
144
145    // All separators exhausted — force split
146    force_split(text, config.max_size)
147}
148
149/// Merge adjacent chunks that are smaller than min_size.
150fn merge_small_chunks(chunks: Vec<String>, target_size: usize, min_size: usize) -> Vec<String> {
151    if chunks.is_empty() {
152        return chunks;
153    }
154
155    let mut merged = Vec::new();
156    let mut current = chunks[0].clone();
157
158    for chunk in chunks.iter().skip(1) {
159        if (current.len() < min_size || chunk.len() < min_size)
160            && current.len() + chunk.len() <= target_size
161        {
162            current.push_str(chunk);
163        } else {
164            merged.push(current);
165            current = chunk.clone();
166        }
167    }
168
169    merged.push(current);
170    if merged.len() > 1 {
171        if let Some(last) = merged.last() {
172            if last.len() < min_size {
173                let last = merged.pop().unwrap_or_default();
174                if let Some(previous) = merged.last_mut() {
175                    previous.push_str(&last);
176                } else {
177                    merged.push(last);
178                }
179            }
180        }
181    }
182    merged
183}
184
185/// Apply overlap between adjacent chunks.
186fn apply_overlap(chunks: Vec<String>, overlap: usize) -> Vec<String> {
187    if chunks.len() <= 1 || overlap == 0 {
188        return chunks;
189    }
190
191    let mut result = Vec::with_capacity(chunks.len());
192    result.push(chunks[0].clone());
193
194    for i in 1..chunks.len() {
195        let prev = result.last().map(String::as_str).unwrap_or(&chunks[i - 1]);
196        let overlap_start = if prev.len() > overlap {
197            // Find a word boundary for the overlap start
198            let raw_start = prev.len() - overlap;
199            let safe_start = safe_split_at(prev, raw_start);
200            // Try to find a space after safe_start to avoid cutting mid-word
201            prev[safe_start..]
202                .find(' ')
203                .map(|pos| safe_start + pos + 1)
204                .unwrap_or(safe_start)
205        } else {
206            0
207        };
208
209        let overlap_text = &prev[overlap_start..];
210        let mut chunk_with_overlap = overlap_text.to_string();
211        chunk_with_overlap.push_str(&chunks[i]);
212        result.push(chunk_with_overlap);
213    }
214
215    result
216}