synaptic_splitters/
character.rs1use crate::TextSplitter;
2
/// Splits text on a fixed separator string and regroups the pieces into
/// size-bounded chunks.
///
/// Built with [`CharacterTextSplitter::new`] and customised through the
/// consuming `with_*` builder methods.
#[derive(Debug, Clone)]
pub struct CharacterTextSplitter {
    /// String the input is split on; also re-inserted between pieces that end
    /// up in the same chunk.
    separator: String,
    /// Target maximum chunk length, in bytes (lengths are taken with `str::len`).
    chunk_size: usize,
    /// Amount of trailing content, in bytes, carried from one chunk into the
    /// next. `0` disables overlap.
    chunk_overlap: usize,
}

impl CharacterTextSplitter {
    /// Creates a splitter with the given target chunk size.
    ///
    /// The separator defaults to a blank line (`"\n\n"`) and the overlap to `0`.
    #[must_use]
    pub fn new(chunk_size: usize) -> Self {
        Self {
            separator: "\n\n".to_string(),
            chunk_size,
            chunk_overlap: 0,
        }
    }

    /// Replaces the separator the text is split on and returns the updated splitter.
    #[must_use]
    pub fn with_separator(mut self, separator: impl Into<String>) -> Self {
        self.separator = separator.into();
        self
    }

    /// Sets how much trailing content of a chunk is carried into the next one
    /// and returns the updated splitter.
    #[must_use]
    pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
        self.chunk_overlap = overlap;
        self
    }
}
32
33impl TextSplitter for CharacterTextSplitter {
34 fn split_text(&self, text: &str) -> Vec<String> {
35 let splits: Vec<&str> = text.split(&self.separator).collect();
36 merge_splits(
37 &splits,
38 self.chunk_size,
39 self.chunk_overlap,
40 &self.separator,
41 )
42 }
43}
44
/// Greedily packs consecutive `splits` into chunks, re-joining them with `separator`.
///
/// Pieces are appended to the pending chunk while the joined length (pieces
/// plus separators, measured in bytes via `str::len`) stays within
/// `chunk_size`. When the next piece would not fit, the pending chunk is
/// emitted and a new one is started.
///
/// * `overlap == 0`: the new chunk starts empty.
/// * `overlap > 0`: trailing pieces of the emitted chunk are kept as the start
///   of the next chunk. Leading pieces are dropped until the kept tail is at
///   most `overlap` bytes long *and* still leaves room for the incoming piece,
///   so overlap never pushes a chunk past `chunk_size`.
///
/// A single piece longer than `chunk_size` is never cut; it is emitted as a
/// chunk of its own. An empty `splits` slice yields no chunks.
pub(crate) fn merge_splits(
    splits: &[&str],
    chunk_size: usize,
    overlap: usize,
    separator: &str,
) -> Vec<String> {
    let sep_len = separator.len();
    let mut chunks = Vec::new();

    // The pending chunk is always a contiguous run of the input, so it is
    // tracked as the window `splits[start..end]` rather than a separate Vec.
    // Dropping a leading piece is then just `start += 1`.
    let mut start: usize = 0;
    // Byte length of `splits[start..end].join(separator)`.
    let mut current_len: usize = 0;

    for (end, split) in splits.iter().enumerate() {
        let split_len = split.len();
        let has_pending = start < end;
        let joined_len = current_len + if has_pending { sep_len } else { 0 } + split_len;

        if has_pending && joined_len > chunk_size {
            chunks.push(splits[start..end].join(separator));

            if overlap == 0 {
                start = end;
                current_len = 0;
            } else {
                // Shrink the retained tail from the front until it respects the
                // overlap budget and the incoming piece fits after it. The
                // window may become empty.
                while start < end
                    && (current_len > overlap
                        || current_len + sep_len + split_len > chunk_size)
                {
                    let removed_len = splits[start].len();
                    start += 1;
                    // A separator only goes away if another piece remains.
                    current_len -= if start < end {
                        removed_len + sep_len
                    } else {
                        removed_len
                    };
                }
            }
        }

        // The window implicitly grows to include `split` on the next iteration.
        current_len += if start < end {
            sep_len + split_len
        } else {
            split_len
        };
    }

    if start < splits.len() {
        chunks.push(splits[start..].join(separator));
    }

    chunks
}