entelix_rag/splitter/
recursive.rs1use crate::document::{Document, Lineage};
34use crate::splitter::TextSplitter;
35use crate::splitter::common::{merge_with_overlap_metric, recurse_with_metric};
36
37pub const DEFAULT_CHUNK_SIZE_CHARS: usize = 1000;
41
42pub const DEFAULT_CHUNK_OVERLAP_CHARS: usize = 100;
46
47pub const DEFAULT_RECURSIVE_SEPARATORS: &[&str] = &["\n\n", "\n", " ", ""];
52
53const SPLITTER_NAME: &str = "recursive-character";
56
57#[derive(Clone, Debug)]
65pub struct RecursiveCharacterSplitter {
66 chunk_size: usize,
67 chunk_overlap: usize,
68 separators: std::sync::Arc<[String]>,
69}
70
71impl RecursiveCharacterSplitter {
72 #[must_use]
76 pub fn new() -> Self {
77 Self {
78 chunk_size: DEFAULT_CHUNK_SIZE_CHARS,
79 chunk_overlap: DEFAULT_CHUNK_OVERLAP_CHARS,
80 separators: DEFAULT_RECURSIVE_SEPARATORS
81 .iter()
82 .map(|s| (*s).to_owned())
83 .collect(),
84 }
85 }
86
87 #[must_use]
91 pub const fn with_chunk_size(mut self, chunk_size: usize) -> Self {
92 self.chunk_size = chunk_size;
93 self
94 }
95
96 #[must_use]
102 pub const fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
103 self.chunk_overlap = chunk_overlap;
104 self
105 }
106
107 #[must_use]
112 pub fn with_separators<I, S>(mut self, separators: I) -> Self
113 where
114 I: IntoIterator<Item = S>,
115 S: Into<String>,
116 {
117 self.separators = separators.into_iter().map(Into::into).collect();
118 self
119 }
120
121 #[must_use]
123 pub const fn chunk_size(&self) -> usize {
124 self.chunk_size
125 }
126
127 #[must_use]
129 pub const fn chunk_overlap(&self) -> usize {
130 self.chunk_overlap
131 }
132}
133
134impl Default for RecursiveCharacterSplitter {
135 fn default() -> Self {
136 Self::new()
137 }
138}
139
140impl TextSplitter for RecursiveCharacterSplitter {
141 fn name(&self) -> &'static str {
142 SPLITTER_NAME
143 }
144
145 fn split(&self, document: &Document) -> Vec<Document> {
146 let chunk_size = self.chunk_size.max(1);
147 let chunk_overlap = self.chunk_overlap.min(chunk_size.saturating_sub(1));
148
149 let measure = |text: &str| char_count(text);
150 let take_tail = |text: &str, n: usize| take_tail_chars(text, n);
151 let fallback = |text: &str, n: usize| char_chunks(text, n);
152
153 let segments = recurse_with_metric(
154 &document.content,
155 &self.separators,
156 chunk_size,
157 &measure,
158 &fallback,
159 );
160 let texts =
161 merge_with_overlap_metric(segments, chunk_size, chunk_overlap, &measure, &take_tail);
162 let total = texts.len();
163 if total == 0 {
164 return Vec::new();
165 }
166 #[allow(clippy::cast_possible_truncation)]
167 let total_u32 = total.min(u32::MAX as usize) as u32;
168 texts
169 .into_iter()
170 .enumerate()
171 .map(|(idx, text)| {
172 #[allow(clippy::cast_possible_truncation)]
173 let idx_u32 = idx.min(u32::MAX as usize) as u32;
174 let lineage =
175 Lineage::from_split(document.id.clone(), idx_u32, total_u32, SPLITTER_NAME);
176 document.child(text, lineage)
177 })
178 .collect()
179 }
180}
181
182fn char_chunks(text: &str, chunk_size: usize) -> Vec<String> {
185 if chunk_size == 0 || text.is_empty() {
186 return Vec::new();
187 }
188 let mut out = Vec::new();
189 let mut current = String::new();
190 let mut count = 0usize;
191 for ch in text.chars() {
192 current.push(ch);
193 count += 1;
194 if count == chunk_size {
195 out.push(std::mem::take(&mut current));
196 count = 0;
197 }
198 }
199 if !current.is_empty() {
200 out.push(current);
201 }
202 out
203}
204
205fn take_tail_chars(text: &str, n: usize) -> String {
209 let total = char_count(text);
210 if n >= total {
211 return text.to_owned();
212 }
213 let skip = total - n;
214 text.chars().skip(skip).collect()
215}
216
217#[inline]
218fn char_count(text: &str) -> usize {
219 text.chars().count()
220}
221
222#[cfg(test)]
223#[allow(clippy::unwrap_used, clippy::indexing_slicing)]
224mod tests {
225 use super::*;
226 use crate::document::Source;
227 use entelix_memory::Namespace;
228
229 fn ns() -> Namespace {
230 Namespace::new(entelix_core::TenantId::new("acme"))
231 }
232
233 fn doc(content: &str) -> Document {
234 Document::root("doc", content, Source::now("test://", "test"), ns())
235 }
236
237 #[test]
238 fn empty_input_produces_no_chunks() {
239 let chunks = RecursiveCharacterSplitter::new().split(&doc(""));
240 assert!(chunks.is_empty());
241 }
242
243 #[test]
244 fn small_input_produces_single_chunk_with_lineage() {
245 let chunks = RecursiveCharacterSplitter::new().split(&doc("short text"));
246 assert_eq!(chunks.len(), 1);
247 let lineage = chunks[0].lineage.as_ref().unwrap();
248 assert_eq!(lineage.chunk_index, 0);
249 assert_eq!(lineage.total_chunks, 1);
250 assert_eq!(lineage.splitter, "recursive-character");
251 assert_eq!(lineage.parent_id.as_str(), "doc");
252 }
253
254 #[test]
255 fn paragraph_split_prefers_double_newline_boundary() {
256 let text = "alpha paragraph.\n\nbeta paragraph.\n\ngamma paragraph.";
257 let splitter = RecursiveCharacterSplitter::new()
258 .with_chunk_size(20)
259 .with_chunk_overlap(0);
260 let chunks = splitter.split(&doc(text));
261 assert_eq!(chunks.len(), 3);
262 assert!(chunks[0].content.contains("alpha"));
263 assert!(chunks[1].content.contains("beta"));
264 assert!(chunks[2].content.contains("gamma"));
265 }
266
267 #[test]
268 fn oversized_paragraph_recurses_to_word_boundary() {
269 let text = "alpha bravo charlie delta echo foxtrot golf hotel";
270 let splitter = RecursiveCharacterSplitter::new()
271 .with_chunk_size(20)
272 .with_chunk_overlap(0);
273 let chunks = splitter.split(&doc(text));
274 assert!(chunks.len() > 1, "must descend past paragraph break");
275 for chunk in &chunks {
276 assert!(
277 char_count(&chunk.content) <= 20,
278 "chunk size cap honoured: {} chars",
279 char_count(&chunk.content)
280 );
281 }
282 }
283
284 #[test]
285 fn overlap_seeds_tail_into_next_chunk() {
286 let text = "0123456789 abcdefghij KLMNOPQRST uvwxyz0123";
287 let splitter = RecursiveCharacterSplitter::new()
288 .with_chunk_size(15)
289 .with_chunk_overlap(5);
290 let chunks = splitter.split(&doc(text));
291 assert!(chunks.len() >= 2);
292 for window in chunks.windows(2) {
293 let prev_tail = take_tail_chars(&window[0].content, 5);
294 assert!(
295 window[1].content.starts_with(&prev_tail),
296 "next chunk must begin with previous tail: prev_tail={prev_tail:?}, next={:?}",
297 window[1].content
298 );
299 }
300 }
301
302 #[test]
303 fn char_aware_split_respects_unicode_boundary() {
304 let text = "안녕하세요반갑습니다";
305 let splitter = RecursiveCharacterSplitter::new()
306 .with_chunk_size(4)
307 .with_chunk_overlap(0)
308 .with_separators(["", ""]);
309 let chunks = splitter.split(&doc(text));
310 for chunk in &chunks {
311 let chars: String = chunk.content.chars().collect();
312 assert_eq!(chars, chunk.content);
313 assert!(char_count(&chunk.content) <= 4);
314 }
315 let joined: String = chunks.iter().map(|c| c.content.as_str()).collect();
316 assert_eq!(joined, text);
317 }
318
319 #[test]
320 fn lineage_total_chunks_matches_emitted_count() {
321 let text = "para one.\n\npara two.\n\npara three.";
322 let chunks = RecursiveCharacterSplitter::new()
323 .with_chunk_size(15)
324 .with_chunk_overlap(0)
325 .split(&doc(text));
326 let total = chunks.len();
327 for (idx, chunk) in chunks.iter().enumerate() {
328 let lineage = chunk.lineage.as_ref().unwrap();
329 #[allow(clippy::cast_possible_truncation)]
330 let idx_u32 = idx as u32;
331 #[allow(clippy::cast_possible_truncation)]
332 let total_u32 = total as u32;
333 assert_eq!(lineage.chunk_index, idx_u32);
334 assert_eq!(lineage.total_chunks, total_u32);
335 }
336 }
337
338 #[test]
339 fn child_id_carries_chunk_index_suffix() {
340 let chunks = RecursiveCharacterSplitter::new()
341 .with_chunk_size(5)
342 .with_chunk_overlap(0)
343 .split(&doc("alpha beta gamma delta"));
344 for (idx, chunk) in chunks.iter().enumerate() {
345 assert_eq!(chunk.id.as_str(), format!("doc:{idx}"));
346 }
347 }
348
349 #[test]
350 fn overlap_clamped_below_chunk_size_terminates() {
351 let splitter = RecursiveCharacterSplitter::new()
352 .with_chunk_size(5)
353 .with_chunk_overlap(100);
354 let chunks = splitter.split(&doc("0123456789 abcdefghij"));
355 assert!(
356 !chunks.is_empty() && chunks.len() < 1000,
357 "split terminated with bounded chunk count, got {}",
358 chunks.len()
359 );
360 }
361}