Skip to main content

cognis_rag/splitters/
sentence.rs

1//! Sentence splitter — packs sentences into chunks up to a target size.
2
3use crate::document::Document;
4
5use super::{child_doc, TextSplitter};
6
/// Splits text into chunks at sentence boundaries (`.`, `!`, `?` followed
/// by whitespace or end-of-input). Adjacent sentences are joined with a
/// single space and packed into chunks of at most `chunk_size` characters.
///
/// A single sentence longer than `chunk_size` is never cut: it is emitted
/// as its own oversized chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SentenceSplitter {
    /// Maximum chunk length, counted in `char`s of the joined chunk.
    chunk_size: usize,
    /// Number of trailing sentences of a finished chunk that are repeated
    /// at the head of the next chunk (a sentence count, not a char count).
    chunk_overlap: usize,
}

impl Default for SentenceSplitter {
    /// Defaults to 1000-character chunks with no sentence overlap.
    fn default() -> Self {
        Self {
            chunk_size: 1000,
            chunk_overlap: 0,
        }
    }
}

impl SentenceSplitter {
    /// Construct a splitter with the default settings (see [`Default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Cap chunk size in characters.
    pub fn with_chunk_size(mut self, n: usize) -> Self {
        self.chunk_size = n;
        self
    }

    /// Sentence-overlap (number of trailing sentences to repeat at the
    /// head of the next chunk).
    pub fn with_overlap_sentences(mut self, n: usize) -> Self {
        self.chunk_overlap = n;
        self
    }

    /// Breaks `text` into trimmed sentences.
    ///
    /// A sentence ends at `.`, `!` or `?` when the next character is
    /// whitespace or the input ends; any trailing text without a
    /// terminator becomes a final sentence. Whitespace-only pieces are
    /// discarded.
    fn split_sentences(text: &str) -> Vec<String> {
        let mut out = Vec::new();
        let mut buf = String::new();
        let mut chars = text.chars().peekable();
        while let Some(c) = chars.next() {
            buf.push(c);
            if !matches!(c, '.' | '!' | '?') {
                continue;
            }
            // A terminator only closes the sentence when it is followed by
            // whitespace or nothing, so "1.5" and "..." stay in one piece.
            let at_boundary = chars.peek().map_or(true, |next| next.is_whitespace());
            if at_boundary {
                let sentence = buf.trim();
                if !sentence.is_empty() {
                    out.push(sentence.to_string());
                }
                buf.clear();
            }
        }
        let tail = buf.trim();
        if !tail.is_empty() {
            out.push(tail.to_string());
        }
        out
    }

    /// Character length of `sentences` once joined with single spaces.
    fn joined_len(sentences: &[String]) -> usize {
        let chars: usize = sentences.iter().map(|s| s.chars().count()).sum();
        chars + sentences.len().saturating_sub(1)
    }

    /// Greedily packs `sentences` into space-joined chunks whose length
    /// does not exceed `chunk_size` characters.
    ///
    /// When a chunk is flushed, its last `chunk_overlap` sentences seed the
    /// next chunk; seeded sentences are dropped oldest-first if they would
    /// leave no room for the incoming sentence.
    fn pack(&self, sentences: Vec<String>) -> Vec<String> {
        let mut out: Vec<String> = Vec::new();
        let mut buf: Vec<String> = Vec::new();
        // Invariant: `joined` is the char length of `buf.join(" ")`.
        let mut joined = 0usize;
        for s in sentences {
            let sl = s.chars().count();
            // `joined + 1 + sl` is the exact length the chunk would have
            // with `s` appended, so an exact fit is accepted.
            if !buf.is_empty() && joined + 1 + sl > self.chunk_size {
                out.push(buf.join(" "));
                // Keep only the trailing overlap sentences (none when the
                // overlap is zero)...
                let mut drop_count = buf.len().saturating_sub(self.chunk_overlap);
                let mut kept = Self::joined_len(&buf[drop_count..]);
                // ...then shed the oldest of those until `s` fits. If `s`
                // alone exceeds `chunk_size` everything is shed and `s`
                // becomes its own oversized chunk, which guarantees progress.
                while drop_count < buf.len() && kept + 1 + sl > self.chunk_size {
                    kept = kept.saturating_sub(buf[drop_count].chars().count() + 1);
                    drop_count += 1;
                }
                buf.drain(..drop_count);
                joined = kept;
            }
            joined += if buf.is_empty() { sl } else { sl + 1 };
            buf.push(s);
        }
        if !buf.is_empty() {
            out.push(buf.join(" "));
        }
        out
    }
}
100
101impl TextSplitter for SentenceSplitter {
102    fn split(&self, doc: &Document) -> Vec<Document> {
103        let sentences = Self::split_sentences(&doc.content);
104        self.pack(sentences)
105            .into_iter()
106            .enumerate()
107            .map(|(i, c)| child_doc(doc, c, i))
108            .collect()
109    }
110}
111
#[cfg(test)]
mod tests {
    use super::*;

    /// Three short sentences fit in one large chunk and keep their
    /// terminal punctuation.
    #[test]
    fn splits_on_terminal_punctuation() {
        let splitter = SentenceSplitter::new().with_chunk_size(1000);
        let chunks = splitter.split(&Document::new("Hi there. How are you? I'm fine!"));
        assert_eq!(chunks.len(), 1);
        let packed = &chunks[0].content;
        assert!(packed.contains("Hi there."));
        assert!(packed.contains("I'm fine!"));
    }

    /// With a small limit the text is spread over several chunks, none of
    /// which exceeds the limit.
    #[test]
    fn packs_into_size_bound() {
        let limit = 15;
        let splitter = SentenceSplitter::new().with_chunk_size(limit);
        let chunks = splitter.split(&Document::new("One. Two. Three. Four. Five."));
        for chunk in &chunks {
            assert!(chunk.content.chars().count() <= limit);
        }
        assert!(chunks.len() >= 2);
    }
}
133}