synaptic_splitters/
recursive.rs1use crate::character::merge_splits;
2use crate::language::Language;
3use crate::TextSplitter;
4
/// Splits text by trying a list of separators in order, descending to the
/// next separator only for pieces that are still larger than `chunk_size`.
///
/// Construct it with [`RecursiveCharacterTextSplitter::new`] or
/// [`RecursiveCharacterTextSplitter::from_language`] and adjust it with the
/// `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct RecursiveCharacterTextSplitter {
    /// Separators in priority order; index 0 is tried first. An empty string
    /// selects the character-level fallback.
    separators: Vec<String>,
    /// Size limit for a chunk. Pieces are measured with `str::len`, i.e. in
    /// UTF-8 bytes, when deciding whether they need further splitting.
    chunk_size: usize,
    /// Overlap setting forwarded unchanged to `merge_splits` when small
    /// pieces are merged back together.
    chunk_overlap: usize,
}
17
18impl RecursiveCharacterTextSplitter {
19 pub fn new(chunk_size: usize) -> Self {
20 Self {
21 separators: vec![
22 "\n\n".to_string(),
23 "\n".to_string(),
24 " ".to_string(),
25 String::new(),
26 ],
27 chunk_size,
28 chunk_overlap: 0,
29 }
30 }
31
32 pub fn with_separators(mut self, separators: Vec<String>) -> Self {
33 self.separators = separators;
34 self
35 }
36
37 pub fn from_language(language: Language, chunk_size: usize, chunk_overlap: usize) -> Self {
39 Self {
40 separators: language.separators(),
41 chunk_size,
42 chunk_overlap,
43 }
44 }
45
46 pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
47 self.chunk_overlap = overlap;
48 self
49 }
50
51 fn split_recursive(&self, text: &str, separator_idx: usize) -> Vec<String> {
52 if text.len() <= self.chunk_size {
53 return vec![text.to_string()];
54 }
55
56 if separator_idx >= self.separators.len() {
57 return text
59 .chars()
60 .collect::<Vec<char>>()
61 .chunks(self.chunk_size)
62 .map(|c| c.iter().collect::<String>())
63 .collect();
64 }
65
66 let separator = &self.separators[separator_idx];
67
68 if separator.is_empty() {
69 return text
71 .chars()
72 .collect::<Vec<char>>()
73 .chunks(self.chunk_size)
74 .map(|c| c.iter().collect::<String>())
75 .collect();
76 }
77
78 let splits: Vec<&str> = text.split(separator.as_str()).collect();
79 let mut final_chunks = Vec::new();
80 let mut good_splits: Vec<&str> = Vec::new();
81
82 for split in &splits {
83 if split.len() <= self.chunk_size {
84 good_splits.push(split);
85 } else {
86 if !good_splits.is_empty() {
88 let merged =
89 merge_splits(&good_splits, self.chunk_size, self.chunk_overlap, separator);
90 final_chunks.extend(merged);
91 good_splits.clear();
92 }
93 let sub_chunks = self.split_recursive(split, separator_idx + 1);
95 final_chunks.extend(sub_chunks);
96 }
97 }
98
99 if !good_splits.is_empty() {
100 let merged = merge_splits(&good_splits, self.chunk_size, self.chunk_overlap, separator);
101 final_chunks.extend(merged);
102 }
103
104 final_chunks
105 }
106}
107
108impl TextSplitter for RecursiveCharacterTextSplitter {
109 fn split_text(&self, text: &str) -> Vec<String> {
110 self.split_recursive(text, 0)
111 }
112}