Skip to main content

synaptic_splitters/
character.rs

1use crate::TextSplitter;
2
/// Splits text on a single, literal separator string.
///
/// After splitting, neighbouring pieces are merged back together (re-joined
/// with the separator) into chunks that aim to stay within `chunk_size`,
/// carrying up to `chunk_overlap` of trailing context from one chunk into
/// the next.
///
/// All sizes are byte lengths (`str::len`), not Unicode scalar counts. A
/// single piece that is longer than `chunk_size` is never cut; it is emitted
/// as-is.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharacterTextSplitter {
    /// Literal string the input is split on; also used to re-join pieces.
    separator: String,
    /// Target upper bound, in bytes, for each merged chunk.
    chunk_size: usize,
    /// Amount of trailing context, in bytes, repeated at the start of the
    /// following chunk.
    chunk_overlap: usize,
}
12
13impl CharacterTextSplitter {
14    pub fn new(chunk_size: usize) -> Self {
15        Self {
16            separator: "\n\n".to_string(),
17            chunk_size,
18            chunk_overlap: 0,
19        }
20    }
21
22    pub fn with_separator(mut self, separator: impl Into<String>) -> Self {
23        self.separator = separator.into();
24        self
25    }
26
27    pub fn with_chunk_overlap(mut self, overlap: usize) -> Self {
28        self.chunk_overlap = overlap;
29        self
30    }
31}
32
33impl TextSplitter for CharacterTextSplitter {
34    fn split_text(&self, text: &str) -> Vec<String> {
35        let splits: Vec<&str> = text.split(&self.separator).collect();
36        merge_splits(
37            &splits,
38            self.chunk_size,
39            self.chunk_overlap,
40            &self.separator,
41        )
42    }
43}
44
/// Merges consecutive `splits` into chunks of at most `chunk_size` bytes,
/// re-joining them with `separator`.
///
/// When `overlap` is non-zero, each new chunk starts with trailing pieces of
/// the previous chunk totalling at most `overlap` bytes, and only as many as
/// still leave room for the next piece within `chunk_size`.
///
/// All lengths are byte lengths (`str::len`). A single piece that is longer
/// than `chunk_size` is never cut: it is emitted as a chunk of its own and
/// is the only way a chunk can exceed `chunk_size`.
///
/// Returns the chunks in input order; an empty `splits` yields an empty
/// vector.
pub(crate) fn merge_splits(
    splits: &[&str],
    chunk_size: usize,
    overlap: usize,
    separator: &str,
) -> Vec<String> {
    let sep_len = separator.len();
    let mut chunks = Vec::new();

    // The pending chunk is always the contiguous window `splits[start..end]`,
    // so dropping its first piece is just `start += 1` (no element shifting).
    let mut start = 0;
    // Byte length of the window once joined with `separator`.
    let mut window_len = 0;

    for (end, split) in splits.iter().enumerate() {
        let split_len = split.len();

        // Flush when appending this piece would overflow a non-empty window.
        if start < end && window_len + sep_len + split_len > chunk_size {
            chunks.push(splits[start..end].join(separator));

            if overlap == 0 {
                start = end;
                window_len = 0;
            } else {
                // Drop leading pieces until the carried-over tail both fits
                // in `overlap` and leaves room for the incoming piece. The
                // tail may become empty; otherwise an over-long tail would
                // be duplicated and push the next chunk past `chunk_size`.
                while start < end
                    && (window_len > overlap || window_len + sep_len + split_len > chunk_size)
                {
                    window_len -= splits[start].len();
                    start += 1;
                    if start < end {
                        // A separator only existed if pieces remain after it.
                        window_len -= sep_len;
                    }
                }
            }
        }

        if start < end {
            window_len += sep_len;
        }
        window_len += split_len;
    }

    if start < splits.len() {
        chunks.push(splits[start..].join(separator));
    }

    chunks
}