semchunk_rs/
splitter.rs

1// MIT License
2//
3// Copyright (c) 2024 Dominic Tarro
4//
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11//
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14//
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23use regex::Regex;
24
/// Non-whitespace separators, ordered from most to least desirable split point.
///
/// The order is significant: `Splitter::split_text` walks this list front to back and
/// uses the first entry that occurs in the text.
const NON_WHITESPACE_SEMANTIC_SEPARATORS: [&str; 25] = [
    // Sentence terminators.
    ".", "?", "!", "*",
    // Clause separators.
    ";", ",", "(", ")", "[", "]", "“", "”", "‘", "’", "'", "\"", "`",
    // Sentence interrupters.
    ":", "—", "…",
    // Word joiners.
    "/", "\\", "–", "&", "-",
];
31
/// Splits text into segments at the most desirable separator present in it.
///
/// A `Splitter` owns the compiled whitespace patterns, so a single instance can be
/// reused for any number of [`Splitter::split_text`] calls.
///
/// # Examples
///
/// ```
/// use semchunk_rs::Splitter;
/// let splitter = Splitter::default();
/// let text = "Hello World\nGoodbye World";
/// let (separator, is_whitespace, segments) = splitter.split_text(text);
/// assert_eq!(separator, "\n");
/// assert!(is_whitespace);
/// assert_eq!(segments, vec!["Hello World", "Goodbye World"]);
/// ```
#[derive(Debug)]
pub struct Splitter {
    /// Pattern used to find line-break separators (`\n` and/or `\r`).
    line_carriage: Regex,
    /// Pattern used to find tab separators.
    tab: Regex,
    /// Pattern used to find separators made of any other whitespace.
    space: Regex,
}
51
52impl Default for Splitter {
53    fn default() -> Self {
54        Splitter {
55            line_carriage: Regex::new(r"[\n\r]+").unwrap(),
56            tab: Regex::new(r"\t").unwrap(),
57            space: Regex::new(r"\s").unwrap(),
58        }
59    }
60}
61
62impl Splitter {
63    /// Splits the given text into segments based on the most desirable separator found.
64    ///
65    /// The method prioritizes separators in the following order:
66    /// 1. The largest sequence of newlines and/or carriage returns.
67    /// 2. The largest sequence of tabs.
68    /// 3. The largest sequence of whitespace characters.
69    /// 4. A semantically meaningful non-whitespace separator.
70    ///
71    /// If no semantically meaningful separator is found, the text is split into individual characters.
72    ///
73    /// # Arguments
74    ///
75    /// * `text` - A string slice that holds the text to be split.
76    ///
77    /// # Returns
78    ///
79    /// A tuple containing:
80    /// * The separator used for splitting the text.
81    /// * A boolean indicating whether the separator is whitespace.
82    /// * A vector of string slices representing the segments of the split text.
83    ///
84    /// # Examples
85    ///
86    /// ```
87    /// use semchunk_rs::Splitter;
88    /// let splitter = Splitter::default();
89    /// let text = "Hello World\nGoodbye World";
90    /// let (separator, is_whitespace, segments) = splitter.split_text(text);
91    /// assert_eq!(separator, "\n");
92    /// assert!(is_whitespace);
93    /// assert_eq!(segments, vec!["Hello World", "Goodbye World"]);
94    /// ```
95    pub fn split_text<'a>(&self, text: &'a str) -> (&'a str, bool, Vec<&'a str>) {
96        let mut separator_is_whitespace = true;
97        let mut separator_search_pattern: Option<&Regex> = Option::None;
98        let separator: &str;
99
100        // Try splitting at, in order of most desirable to least desirable:
101        // - The largest sequence of newlines and/or carriage returns;
102        // - The largest sequence of tabs;
103        // - The largest sequence of whitespace characters; and
104        // - A semantically meaningful non-whitespace separator.
105        if text.contains("\n") || text.contains("\r") {
106            separator_search_pattern = Option::Some(&self.line_carriage);
107            // Find longest line break
108        } else if text.contains("\t") {
109            separator_search_pattern = Option::Some(&self.tab);
110        } else if self.space.is_match(text) {
111            separator_search_pattern = Option::Some(&self.space);
112        }
113
114        match separator_search_pattern {
115            Some(pattern) => {
116                separator = pattern
117                    .find_iter(text)
118                    .map(|m| text.get(m.start()..m.end()).unwrap())
119                    .max_by_key(|&s| s.len())
120                    .unwrap();
121            }
122            None => {
123                // Identify the most desirable semantically meaningful non-whitespace separator present in the text.
124                match NON_WHITESPACE_SEMANTIC_SEPARATORS
125                    .iter()
126                    .find(|&&c| text.contains(c))
127                    .copied()
128                {
129                    Some(c) => {
130                        separator = c;
131                        separator_is_whitespace = false;
132                    }
133                    None => {
134                        // If no semantically meaningful separator is present in the text, return an empty string as the separator and the text as a list of characters.
135                        // text.split("") does this obnoxious thing where it includes an empty string at the start and end of the list, so removing that.
136                        return (
137                            "",
138                            true,
139                            text.split("")
140                                .collect::<Vec<&str>>()
141                                .get(1..text.len() + 1)
142                                .unwrap()
143                                .to_vec(),
144                        );
145                    }
146                }
147            }
148        }
149        // Return the separator and the split text
150        (
151            separator,
152            separator_is_whitespace,
153            text.split(separator).collect::<Vec<&str>>().clone(),
154        )
155    }
156}
157
#[cfg(test)]
mod splitter_tests {
    use super::*;

    /// Splits `text` with `splitter` and checks the separator, the whitespace flag and
    /// the resulting segments against the expected values.
    fn assert_split(
        splitter: &Splitter,
        text: &str,
        expected_separator: &str,
        expected_is_whitespace: bool,
        expected_segments: &[&str],
    ) {
        let (separator, separator_is_whitespace, split_text) = splitter.split_text(text);
        assert_eq!(separator, expected_separator);
        assert_eq!(separator_is_whitespace, expected_is_whitespace);
        assert_eq!(split_text, expected_segments);
    }

    #[test]
    fn test_whitespace_split() {
        let splitter = Splitter::default();

        // A single space is the only whitespace available.
        assert_split(&splitter, "Hello, World!", " ", true, &["Hello,", "World!"]);

        // A tab is preferred over a space.
        assert_split(
            &splitter,
            "Hello, World!\tGoodbye, World!",
            "\t",
            true,
            &["Hello, World!", "Goodbye, World!"],
        );

        // A newline is preferred over a space.
        assert_split(
            &splitter,
            "Hello, World!\nGoodbye, World!",
            "\n",
            true,
            &["Hello, World!", "Goodbye, World!"],
        );

        // Prioritize \n\n over \n
        assert_split(
            &splitter,
            "Hello, World!\n\nGoodbye, World!\n<EOF>",
            "\n\n",
            true,
            &["Hello, World!", "Goodbye, World!\n<EOF>"],
        );
    }

    #[test]
    fn test_simple_semantic_chars_split() {
        let splitter = Splitter::default();

        // Prioritize ! over ,
        assert_split(&splitter, "Hello,World!", "!", false, &["Hello,World", ""]);

        // Test with multiple separators
        assert_split(
            &splitter,
            "Hello,_World!_Goodbye,_World!",
            "!",
            false,
            &["Hello,_World", "_Goodbye,_World", ""],
        );
    }

    #[test]
    fn test_no_match_split() {
        let splitter = Splitter::default();

        // Without any known separator the text falls apart into single characters.
        assert_split(
            &splitter,
            "Hello_World",
            "",
            true,
            &["H", "e", "l", "l", "o", "_", "W", "o", "r", "l", "d"],
        );
    }
}
semchunk_rs/splitter.rs

semchunk_rs/
splitter.rs