cognis_rag/splitters/
sentence.rs1use crate::document::Document;
4
5use super::{child_doc, TextSplitter};
6
/// Splits a document into sentences and packs consecutive sentences into
/// chunks.
///
/// Configure it with [`SentenceSplitter::with_chunk_size`] and
/// [`SentenceSplitter::with_overlap_sentences`].
#[derive(Debug, Clone)]
pub struct SentenceSplitter {
    /// Target upper bound for a chunk's length, counted in `char`s of the
    /// space-joined sentences. A single sentence longer than this is still
    /// emitted whole, as a chunk of its own.
    chunk_size: usize,
    /// Number of trailing sentences of a finished chunk that are carried over
    /// to seed the next chunk (counted in sentences, not characters).
    chunk_overlap: usize,
}
14
15impl Default for SentenceSplitter {
16 fn default() -> Self {
17 Self {
18 chunk_size: 1000,
19 chunk_overlap: 0,
20 }
21 }
22}
23
24impl SentenceSplitter {
25 pub fn new() -> Self {
27 Self::default()
28 }
29 pub fn with_chunk_size(mut self, n: usize) -> Self {
31 self.chunk_size = n;
32 self
33 }
34 pub fn with_overlap_sentences(mut self, n: usize) -> Self {
37 self.chunk_overlap = n;
38 self
39 }
40
41 fn split_sentences(text: &str) -> Vec<String> {
42 let mut out = Vec::new();
43 let mut buf = String::new();
44 let chars: Vec<char> = text.chars().collect();
45 for i in 0..chars.len() {
46 buf.push(chars[i]);
47 if matches!(chars[i], '.' | '!' | '?') {
48 let next = chars.get(i + 1);
49 let is_boundary = matches!(next, Some(c) if c.is_whitespace()) || next.is_none();
50 if is_boundary {
51 let s = buf.trim().to_string();
52 if !s.is_empty() {
53 out.push(s);
54 }
55 buf.clear();
56 }
57 }
58 }
59 let tail = buf.trim().to_string();
60 if !tail.is_empty() {
61 out.push(tail);
62 }
63 out
64 }
65
66 fn pack(&self, sentences: Vec<String>) -> Vec<String> {
67 let mut out: Vec<String> = Vec::new();
68 let mut buf: Vec<String> = Vec::new();
69 let mut len = 0usize;
70 for s in sentences {
71 let sl = s.chars().count();
72 if !buf.is_empty() && len + sl + 1 > self.chunk_size {
73 out.push(buf.join(" "));
74 if self.chunk_overlap > 0 {
75 let keep = buf.len().saturating_sub(self.chunk_overlap);
76 buf = buf.split_off(keep);
77 len = buf.iter().map(|s| s.chars().count() + 1).sum();
78 } else {
79 buf.clear();
80 len = 0;
81 }
82 }
83 while !buf.is_empty() && len + sl + 1 > self.chunk_size {
88 let dropped = buf.remove(0);
89 len = len.saturating_sub(dropped.chars().count() + 1);
90 }
91 len += sl + 1;
92 buf.push(s);
93 }
94 if !buf.is_empty() {
95 out.push(buf.join(" "));
96 }
97 out
98 }
99}
100
101impl TextSplitter for SentenceSplitter {
102 fn split(&self, doc: &Document) -> Vec<Document> {
103 let sentences = Self::split_sentences(&doc.content);
104 self.pack(sentences)
105 .into_iter()
106 .enumerate()
107 .map(|(i, c)| child_doc(doc, c, i))
108 .collect()
109 }
110}
111
#[cfg(test)]
mod tests {
    use super::*;

    /// Short text under a generous size limit stays in a single chunk that
    /// still contains the first and last sentences verbatim.
    #[test]
    fn splits_on_terminal_punctuation() {
        let doc = Document::new("Hi there. How are you? I'm fine!");
        let splitter = SentenceSplitter::new().with_chunk_size(1000);
        let chunks = splitter.split(&doc);
        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert!(only.content.contains("Hi there."));
        assert!(only.content.contains("I'm fine!"));
    }

    /// With a tight limit the text is spread over several chunks, none of
    /// which exceeds the limit.
    #[test]
    fn packs_into_size_bound() {
        let text = "One. Two. Three. Four. Five.";
        let splitter = SentenceSplitter::new().with_chunk_size(15);
        let chunks = splitter.split(&Document::new(text));
        for chunk in chunks.iter() {
            assert!(chunk.content.chars().count() <= 15);
        }
        assert!(chunks.len() >= 2);
    }
}