semchunk_rs/splitter.rs
1// MIT License
2//
3// Copyright (c) 2024 Dominic Tarro
4//
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11//
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14//
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23use regex::Regex;
24
/// Non-whitespace separators that are considered semantically meaningful split points.
///
/// The order of the entries is significant: `Splitter::split_text` scans this list
/// front to back and uses the first entry that occurs in the text, so earlier
/// entries take priority over later ones.
const NON_WHITESPACE_SEMANTIC_SEPARATORS: [&str; 25] = [
    // Sentence terminators.
    ".", "?", "!", "*",
    // Clause separators.
    ";", ",", "(", ")", "[", "]", "“", "”", "‘", "’", "'", "\"", "`",
    // Sentence interrupters.
    ":", "—", "…",
    // Word joiners.
    "/", "\\", "–", "&", "-",
];
31
/// A struct for splitting texts into segments based on the most desirable separator found.
///
/// The struct owns the compiled regular expressions used to locate whitespace
/// separators, so one instance can be reused for any number of `split_text` calls.
///
/// # Examples
///
/// ```
/// use semchunk_rs::Splitter;
/// let splitter = Splitter::default();
/// let text = "Hello World\nGoodbye World";
/// let (separator, is_whitespace, segments) = splitter.split_text(text);
/// assert_eq!(separator, "\n");
/// assert!(is_whitespace);
/// assert_eq!(segments, vec!["Hello World", "Goodbye World"]);
/// ```
#[derive(Debug)]
pub struct Splitter {
    /// Pattern used to locate line-break separators (newlines and carriage returns).
    line_carriage: Regex,
    /// Pattern used to locate tab separators.
    tab: Regex,
    /// Pattern used both to detect and to locate generic whitespace separators.
    space: Regex,
}
51
52impl Default for Splitter {
53 fn default() -> Self {
54 Splitter {
55 line_carriage: Regex::new(r"[\n\r]+").unwrap(),
56 tab: Regex::new(r"\t").unwrap(),
57 space: Regex::new(r"\s").unwrap(),
58 }
59 }
60}
61
62impl Splitter {
63 /// Splits the given text into segments based on the most desirable separator found.
64 ///
65 /// The method prioritizes separators in the following order:
66 /// 1. The largest sequence of newlines and/or carriage returns.
67 /// 2. The largest sequence of tabs.
68 /// 3. The largest sequence of whitespace characters.
69 /// 4. A semantically meaningful non-whitespace separator.
70 ///
71 /// If no semantically meaningful separator is found, the text is split into individual characters.
72 ///
73 /// # Arguments
74 ///
75 /// * `text` - A string slice that holds the text to be split.
76 ///
77 /// # Returns
78 ///
79 /// A tuple containing:
80 /// * The separator used for splitting the text.
81 /// * A boolean indicating whether the separator is whitespace.
82 /// * A vector of string slices representing the segments of the split text.
83 ///
84 /// # Examples
85 ///
86 /// ```
87 /// use semchunk_rs::Splitter;
88 /// let splitter = Splitter::default();
89 /// let text = "Hello World\nGoodbye World";
90 /// let (separator, is_whitespace, segments) = splitter.split_text(text);
91 /// assert_eq!(separator, "\n");
92 /// assert!(is_whitespace);
93 /// assert_eq!(segments, vec!["Hello World", "Goodbye World"]);
94 /// ```
95 pub fn split_text<'a>(&self, text: &'a str) -> (&'a str, bool, Vec<&'a str>) {
96 let mut separator_is_whitespace = true;
97 let mut separator_search_pattern: Option<&Regex> = Option::None;
98 let separator: &str;
99
100 // Try splitting at, in order of most desirable to least desirable:
101 // - The largest sequence of newlines and/or carriage returns;
102 // - The largest sequence of tabs;
103 // - The largest sequence of whitespace characters; and
104 // - A semantically meaningful non-whitespace separator.
105 if text.contains("\n") || text.contains("\r") {
106 separator_search_pattern = Option::Some(&self.line_carriage);
107 // Find longest line break
108 } else if text.contains("\t") {
109 separator_search_pattern = Option::Some(&self.tab);
110 } else if self.space.is_match(text) {
111 separator_search_pattern = Option::Some(&self.space);
112 }
113
114 match separator_search_pattern {
115 Some(pattern) => {
116 separator = pattern
117 .find_iter(text)
118 .map(|m| text.get(m.start()..m.end()).unwrap())
119 .max_by_key(|&s| s.len())
120 .unwrap();
121 }
122 None => {
123 // Identify the most desirable semantically meaningful non-whitespace separator present in the text.
124 match NON_WHITESPACE_SEMANTIC_SEPARATORS
125 .iter()
126 .find(|&&c| text.contains(c))
127 .copied()
128 {
129 Some(c) => {
130 separator = c;
131 separator_is_whitespace = false;
132 }
133 None => {
134 // If no semantically meaningful separator is present in the text, return an empty string as the separator and the text as a list of characters.
135 // text.split("") does this obnoxious thing where it includes an empty string at the start and end of the list, so removing that.
136 return (
137 "",
138 true,
139 text.split("")
140 .collect::<Vec<&str>>()
141 .get(1..text.len() + 1)
142 .unwrap()
143 .to_vec(),
144 );
145 }
146 }
147 }
148 }
149 // Return the separator and the split text
150 (
151 separator,
152 separator_is_whitespace,
153 text.split(separator).collect::<Vec<&str>>().clone(),
154 )
155 }
156}
157
#[cfg(test)]
mod splitter_tests {
    use super::*;

    /// Splits `input` with `splitter` and checks all three parts of the result.
    fn assert_split(
        splitter: &Splitter,
        input: &str,
        expected_separator: &str,
        expected_is_whitespace: bool,
        expected_segments: &[&str],
    ) {
        let (separator, is_whitespace, segments) = splitter.split_text(input);
        assert_eq!(separator, expected_separator);
        assert_eq!(is_whitespace, expected_is_whitespace);
        assert_eq!(segments, expected_segments);
    }

    #[test]
    fn test_whitespace_split() {
        let splitter = Splitter::default();

        // A plain space is the only whitespace available.
        assert_split(&splitter, "Hello, World!", " ", true, &["Hello,", "World!"]);

        // A tab outranks spaces.
        assert_split(
            &splitter,
            "Hello, World!\tGoodbye, World!",
            "\t",
            true,
            &["Hello, World!", "Goodbye, World!"],
        );

        // A newline outranks spaces.
        assert_split(
            &splitter,
            "Hello, World!\nGoodbye, World!",
            "\n",
            true,
            &["Hello, World!", "Goodbye, World!"],
        );

        // The longer "\n\n" run outranks the single "\n" later in the text.
        assert_split(
            &splitter,
            "Hello, World!\n\nGoodbye, World!\n<EOF>",
            "\n\n",
            true,
            &["Hello, World!", "Goodbye, World!\n<EOF>"],
        );
    }

    #[test]
    fn test_simple_semantic_chars_split() {
        let splitter = Splitter::default();

        // "!" outranks "," in the separator list.
        assert_split(&splitter, "Hello,World!", "!", false, &["Hello,World", ""]);

        // Several occurrences of the chosen separator all act as split points.
        assert_split(
            &splitter,
            "Hello,_World!_Goodbye,_World!",
            "!",
            false,
            &["Hello,_World", "_Goodbye,_World", ""],
        );
    }

    #[test]
    fn test_no_match_split() {
        let splitter = Splitter::default();

        // No separator at all: the text comes back one character per segment.
        assert_split(
            &splitter,
            "Hello_World",
            "",
            true,
            &["H", "e", "l", "l", "o", "_", "W", "o", "r", "l", "d"],
        );
    }
}