// entelix_rag/splitter/recursive.rs
//! `RecursiveCharacterSplitter` — character-budget splitter that
//! prefers semantic boundaries (paragraph → line → word → char) and
//! recurses into smaller boundaries only when the larger ones would
//! produce oversized chunks.
//!
//! The canonical text-splitter shape — LangChain's
//! `RecursiveCharacterTextSplitter` covers the same algorithm, and
//! every RAG pipeline that doesn't need vendor-token-accurate
//! splitting reaches for this default. For token-accurate budgeting,
//! reach for [`TokenCountSplitter`](super::TokenCountSplitter)
//! instead — both share the same recursive-merge core under
//! [`splitter::common`](super::common) and differ only in their size
//! metric.
//!
//! ## Algorithm
//!
//! 1. Walk separators in priority order (default: `["\n\n", "\n",
//!    " ", ""]`). The empty-string separator is the always-fits
//!    fallback that splits on every character.
//! 2. At each separator: split the input. Long segments (over
//!    `chunk_size`) recurse into the next separator; short segments
//!    are accumulated greedily into chunks up to `chunk_size`.
//! 3. Once a chunk is full, the next chunk is seeded with the
//!    trailing `chunk_overlap` characters of the previous chunk so
//!    semantically-adjacent content shares context across the
//!    boundary.
//!
//! Boundary handling is *char-aware*, not byte-aware — chunk size
//! and overlap count Unicode scalar values, not UTF-8 bytes. A
//! Korean / Japanese / Devanagari corpus does not silently lose
//! the last grapheme to a mid-byte split.

use crate::document::{Document, Lineage};
use crate::splitter::TextSplitter;
use crate::splitter::common::{merge_with_overlap_metric, recurse_with_metric};

/// Default chunk size in characters. ~1000 chars maps to roughly
/// 200-300 tokens for English under `cl100k_base`, comfortably under
/// every shipping vendor's per-message ceiling.
pub const DEFAULT_CHUNK_SIZE_CHARS: usize = 1000;

/// Default overlap between consecutive chunks. ~10% of
/// [`DEFAULT_CHUNK_SIZE_CHARS`] preserves enough trailing context
/// for retrieval grounding without bloating the index.
pub const DEFAULT_CHUNK_OVERLAP_CHARS: usize = 100;

/// Default separator priority list. Paragraph break → line break →
/// word boundary → character. The empty-string fallback guarantees
/// termination even on pathological input (one giant unbroken
/// token).
pub const DEFAULT_RECURSIVE_SEPARATORS: &[&str] = &["\n\n", "\n", " ", ""];

/// Stable identifier surfaced on every produced chunk's
/// [`Lineage::splitter`](crate::Lineage::splitter) field.
const SPLITTER_NAME: &str = "recursive-character";

/// Recursive character-budget splitter.
///
/// Construct via [`Self::new`] for the default 1000-char / 100-char
/// shape, or [`Self::with_chunk_size`] / [`Self::with_chunk_overlap`]
/// / [`Self::with_separators`] for tuning. Cloning is cheap — the
/// separator list is held by `Arc` so multiple pipelines can share
/// one configured splitter.
#[derive(Clone, Debug)]
pub struct RecursiveCharacterSplitter {
    // Target chunk size, counted in Unicode scalar values.
    chunk_size: usize,
    // Tail of each chunk re-seeded into the next; clamped below
    // `chunk_size` at split time.
    chunk_overlap: usize,
    // Priority-ordered separators; `Arc<[String]>` keeps clones O(1).
    separators: std::sync::Arc<[String]>,
}

71impl RecursiveCharacterSplitter {
72    /// Build with the default chunk size + overlap +
73    /// separator priority. The 99% case for English / Latin-script
74    /// corpora.
75    #[must_use]
76    pub fn new() -> Self {
77        Self {
78            chunk_size: DEFAULT_CHUNK_SIZE_CHARS,
79            chunk_overlap: DEFAULT_CHUNK_OVERLAP_CHARS,
80            separators: DEFAULT_RECURSIVE_SEPARATORS
81                .iter()
82                .map(|s| (*s).to_owned())
83                .collect(),
84        }
85    }
86
87    /// Override the target chunk size in characters. Chunks larger
88    /// than this are recursed; chunks smaller are accumulated
89    /// greedily.
90    #[must_use]
91    pub const fn with_chunk_size(mut self, chunk_size: usize) -> Self {
92        self.chunk_size = chunk_size;
93        self
94    }
95
96    /// Override the overlap (in characters) between consecutive
97    /// chunks. Must be strictly less than [`Self::chunk_size`] —
98    /// equal-or-greater overlap would loop indefinitely. Values at
99    /// or above the chunk size silently clamp to `chunk_size - 1`
100    /// at split time.
101    #[must_use]
102    pub const fn with_chunk_overlap(mut self, chunk_overlap: usize) -> Self {
103        self.chunk_overlap = chunk_overlap;
104        self
105    }
106
107    /// Override the separator priority list. Pipelines splitting
108    /// LaTeX, Python source, or other domain-specific corpora ship
109    /// alternative priority lists that bias toward
110    /// language-meaningful boundaries.
111    #[must_use]
112    pub fn with_separators<I, S>(mut self, separators: I) -> Self
113    where
114        I: IntoIterator<Item = S>,
115        S: Into<String>,
116    {
117        self.separators = separators.into_iter().map(Into::into).collect();
118        self
119    }
120
121    /// Effective chunk size in characters.
122    #[must_use]
123    pub const fn chunk_size(&self) -> usize {
124        self.chunk_size
125    }
126
127    /// Effective chunk overlap in characters.
128    #[must_use]
129    pub const fn chunk_overlap(&self) -> usize {
130        self.chunk_overlap
131    }
132}
133
134impl Default for RecursiveCharacterSplitter {
135    fn default() -> Self {
136        Self::new()
137    }
138}
139
140impl TextSplitter for RecursiveCharacterSplitter {
141    fn name(&self) -> &'static str {
142        SPLITTER_NAME
143    }
144
145    fn split(&self, document: &Document) -> Vec<Document> {
146        let chunk_size = self.chunk_size.max(1);
147        let chunk_overlap = self.chunk_overlap.min(chunk_size.saturating_sub(1));
148
149        let measure = |text: &str| char_count(text);
150        let take_tail = |text: &str, n: usize| take_tail_chars(text, n);
151        let fallback = |text: &str, n: usize| char_chunks(text, n);
152
153        let segments = recurse_with_metric(
154            &document.content,
155            &self.separators,
156            chunk_size,
157            &measure,
158            &fallback,
159        );
160        let texts =
161            merge_with_overlap_metric(segments, chunk_size, chunk_overlap, &measure, &take_tail);
162        let total = texts.len();
163        if total == 0 {
164            return Vec::new();
165        }
166        #[allow(clippy::cast_possible_truncation)]
167        let total_u32 = total.min(u32::MAX as usize) as u32;
168        texts
169            .into_iter()
170            .enumerate()
171            .map(|(idx, text)| {
172                #[allow(clippy::cast_possible_truncation)]
173                let idx_u32 = idx.min(u32::MAX as usize) as u32;
174                let lineage =
175                    Lineage::from_split(document.id.clone(), idx_u32, total_u32, SPLITTER_NAME);
176                document.child(text, lineage)
177            })
178            .collect()
179    }
180}
181
/// Char-aware fixed-size chunker — used as the always-fits
/// fallback when no separator matches.
///
/// Splits `text` into consecutive runs of exactly `chunk_size`
/// characters (Unicode scalar values); the final chunk may be
/// shorter. Returns an empty vector for empty input or a zero
/// `chunk_size` (the caller clamps size to >= 1 before use).
fn char_chunks(text: &str, chunk_size: usize) -> Vec<String> {
    if chunk_size == 0 || text.is_empty() {
        return Vec::new();
    }
    let mut out = Vec::new();
    let mut current = String::new();
    let mut count = 0usize;
    for ch in text.chars() {
        current.push(ch);
        count += 1;
        if count == chunk_size {
            // Hand off the full chunk and reuse the local buffer.
            out.push(std::mem::take(&mut current));
            count = 0;
        }
    }
    if !current.is_empty() {
        out.push(current);
    }
    out
}

/// Take the last `n` characters (Unicode scalar values, not bytes)
/// of `text`. Returns the whole text when `n` exceeds its char
/// count.
fn take_tail_chars(text: &str, n: usize) -> String {
    let total = text.chars().count();
    if n >= total {
        return text.to_owned();
    }
    // total > n here, so the subtraction cannot underflow.
    text.chars().skip(total - n).collect()
}

/// Count of Unicode scalar values in `text` (not bytes, not
/// grapheme clusters) — the size metric for the whole splitter.
#[inline]
fn char_count(text: &str) -> usize {
    text.chars().count()
}

#[cfg(test)]
#[allow(clippy::unwrap_used, clippy::indexing_slicing)]
mod tests {
    use super::*;
    use crate::document::Source;
    use entelix_memory::Namespace;

    /// Fixture namespace shared by every test document.
    fn ns() -> Namespace {
        Namespace::new(entelix_core::TenantId::new("acme"))
    }

    /// Root document with fixed id/source so lineage assertions are
    /// deterministic.
    fn doc(content: &str) -> Document {
        Document::root("doc", content, Source::now("test://", "test"), ns())
    }

    #[test]
    fn empty_input_produces_no_chunks() {
        let chunks = RecursiveCharacterSplitter::new().split(&doc(""));
        assert!(chunks.is_empty());
    }

    #[test]
    fn small_input_produces_single_chunk_with_lineage() {
        let chunks = RecursiveCharacterSplitter::new().split(&doc("short text"));
        assert_eq!(chunks.len(), 1);
        let lineage = chunks[0].lineage.as_ref().unwrap();
        assert_eq!(lineage.chunk_index, 0);
        assert_eq!(lineage.total_chunks, 1);
        assert_eq!(lineage.splitter, "recursive-character");
        assert_eq!(lineage.parent_id.as_str(), "doc");
    }

    #[test]
    fn paragraph_split_prefers_double_newline_boundary() {
        let text = "alpha paragraph.\n\nbeta paragraph.\n\ngamma paragraph.";
        let splitter = RecursiveCharacterSplitter::new()
            .with_chunk_size(20)
            .with_chunk_overlap(0);
        let chunks = splitter.split(&doc(text));
        assert_eq!(chunks.len(), 3);
        assert!(chunks[0].content.contains("alpha"));
        assert!(chunks[1].content.contains("beta"));
        assert!(chunks[2].content.contains("gamma"));
    }

    #[test]
    fn oversized_paragraph_recurses_to_word_boundary() {
        let text = "alpha bravo charlie delta echo foxtrot golf hotel";
        let splitter = RecursiveCharacterSplitter::new()
            .with_chunk_size(20)
            .with_chunk_overlap(0);
        let chunks = splitter.split(&doc(text));
        assert!(chunks.len() > 1, "must descend past paragraph break");
        for chunk in &chunks {
            assert!(
                char_count(&chunk.content) <= 20,
                "chunk size cap honoured: {} chars",
                char_count(&chunk.content)
            );
        }
    }

    #[test]
    fn overlap_seeds_tail_into_next_chunk() {
        let text = "0123456789 abcdefghij KLMNOPQRST uvwxyz0123";
        let splitter = RecursiveCharacterSplitter::new()
            .with_chunk_size(15)
            .with_chunk_overlap(5);
        let chunks = splitter.split(&doc(text));
        assert!(chunks.len() >= 2);
        for window in chunks.windows(2) {
            let prev_tail = take_tail_chars(&window[0].content, 5);
            assert!(
                window[1].content.starts_with(&prev_tail),
                "next chunk must begin with previous tail: prev_tail={prev_tail:?}, next={:?}",
                window[1].content
            );
        }
    }

    #[test]
    fn char_aware_split_respects_unicode_boundary() {
        let text = "안녕하세요반갑습니다";
        let splitter = RecursiveCharacterSplitter::new()
            .with_chunk_size(4)
            .with_chunk_overlap(0)
            .with_separators(["", ""]);
        let chunks = splitter.split(&doc(text));
        for chunk in &chunks {
            // Round-tripping through chars proves every chunk is
            // valid UTF-8 with no mid-scalar split.
            let chars: String = chunk.content.chars().collect();
            assert_eq!(chars, chunk.content);
            assert!(char_count(&chunk.content) <= 4);
        }
        let joined: String = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(joined, text);
    }

    #[test]
    fn lineage_total_chunks_matches_emitted_count() {
        let text = "para one.\n\npara two.\n\npara three.";
        let chunks = RecursiveCharacterSplitter::new()
            .with_chunk_size(15)
            .with_chunk_overlap(0)
            .split(&doc(text));
        let total = chunks.len();
        for (idx, chunk) in chunks.iter().enumerate() {
            let lineage = chunk.lineage.as_ref().unwrap();
            #[allow(clippy::cast_possible_truncation)]
            let idx_u32 = idx as u32;
            #[allow(clippy::cast_possible_truncation)]
            let total_u32 = total as u32;
            assert_eq!(lineage.chunk_index, idx_u32);
            assert_eq!(lineage.total_chunks, total_u32);
        }
    }

    #[test]
    fn child_id_carries_chunk_index_suffix() {
        let chunks = RecursiveCharacterSplitter::new()
            .with_chunk_size(5)
            .with_chunk_overlap(0)
            .split(&doc("alpha beta gamma delta"));
        for (idx, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.id.as_str(), format!("doc:{idx}"));
        }
    }

    #[test]
    fn overlap_clamped_below_chunk_size_terminates() {
        let splitter = RecursiveCharacterSplitter::new()
            .with_chunk_size(5)
            .with_chunk_overlap(100);
        let chunks = splitter.split(&doc("0123456789 abcdefghij"));
        assert!(
            !chunks.is_empty() && chunks.len() < 1000,
            "split terminated with bounded chunk count, got {}",
            chunks.len()
        );
    }
}