Skip to main content

synaptic_splitters/
recursive.rs

1use crate::character::merge_splits;
2use crate::language::Language;
3use crate::TextSplitter;
4
/// Recursively splits text using a hierarchy of separators.
///
/// The text is split on the current separator (starting with the first one).
/// Pieces that already fit within `chunk_size` are handed to `merge_splits`,
/// while any piece that is still too large is split again with the next
/// separator in the list. An empty separator, or running out of separators,
/// falls back to fixed-width splitting by characters.
///
/// Default separators: `["\n\n", "\n", " ", ""]`
#[derive(Debug, Clone)]
pub struct RecursiveCharacterTextSplitter {
    /// Separators tried in order; an empty string means "split by characters".
    separators: Vec<String>,
    /// Size limit a piece must satisfy (`str::len() <= chunk_size`) to be
    /// kept without further splitting.
    chunk_size: usize,
    /// Overlap value forwarded to `merge_splits` when fitting pieces are merged.
    chunk_overlap: usize,
}
17
18impl RecursiveCharacterTextSplitter {
19    pub fn new(chunk_size: usize) -> Self {
20        Self {
21            separators: vec![
22                "\n\n".to_string(),
23                "\n".to_string(),
24                " ".to_string(),
25                String::new(),
26            ],
27            chunk_size,
28            chunk_overlap: 0,
29        }
30    }
31
32    pub fn with_separators(mut self, separators: Vec<String>) -> Self {
33        self.separators = separators;
34        self
35    }
36
37    /// Create a splitter with language-aware separators.
38    pub fn from_language(language: Language, chunk_size: usize, chunk_overlap: usize) -> Self {
39        Self {
40            separators: language.separators(),
41            chunk_size,
42            chunk_overlap,
43        }
44    }
45
46    pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
47        self.chunk_overlap = overlap;
48        self
49    }
50
51    fn split_recursive(&self, text: &str, separator_idx: usize) -> Vec<String> {
52        if text.len() <= self.chunk_size {
53            return vec![text.to_string()];
54        }
55
56        if separator_idx >= self.separators.len() {
57            // No more separators, force-split by chunk_size
58            return text
59                .chars()
60                .collect::<Vec<char>>()
61                .chunks(self.chunk_size)
62                .map(|c| c.iter().collect::<String>())
63                .collect();
64        }
65
66        let separator = &self.separators[separator_idx];
67
68        if separator.is_empty() {
69            // Character-level split
70            return text
71                .chars()
72                .collect::<Vec<char>>()
73                .chunks(self.chunk_size)
74                .map(|c| c.iter().collect::<String>())
75                .collect();
76        }
77
78        let splits: Vec<&str> = text.split(separator.as_str()).collect();
79        let mut final_chunks = Vec::new();
80        let mut good_splits: Vec<&str> = Vec::new();
81
82        for split in &splits {
83            if split.len() <= self.chunk_size {
84                good_splits.push(split);
85            } else {
86                // Merge any accumulated good splits first
87                if !good_splits.is_empty() {
88                    let merged =
89                        merge_splits(&good_splits, self.chunk_size, self.chunk_overlap, separator);
90                    final_chunks.extend(merged);
91                    good_splits.clear();
92                }
93                // Recurse with next separator
94                let sub_chunks = self.split_recursive(split, separator_idx + 1);
95                final_chunks.extend(sub_chunks);
96            }
97        }
98
99        if !good_splits.is_empty() {
100            let merged = merge_splits(&good_splits, self.chunk_size, self.chunk_overlap, separator);
101            final_chunks.extend(merged);
102        }
103
104        final_chunks
105    }
106}
107
108impl TextSplitter for RecursiveCharacterTextSplitter {
109    fn split_text(&self, text: &str) -> Vec<String> {
110        self.split_recursive(text, 0)
111    }
112}