// alith_core/splitting/rule_based.rs
use regex::Regex;
use std::{ops::Range, sync::LazyLock};
use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
4
5#[inline]
6pub fn split_text_into_sentences(text: &str, keep_separator: bool) -> Vec<String> {
7 cut(text.to_owned(), keep_separator)
8}
9
10pub fn split_text_into_indices(text: &str, keep_separator: bool) -> Vec<Range<usize>> {
11 let sentences = cut(text.to_owned(), keep_separator);
12 let mut indices: Vec<Range<usize>> = Vec::new();
13 let mut start = 0;
14 for sentence in sentences.iter() {
15 let end = start + sentence.len();
16 indices.push(Range { start, end });
17 start = end;
18 }
19 if !indices.is_empty() {
20 let last = indices.last().unwrap();
21 if last.end != text.len() {
22 eprintln!("split_text_into_indices: indices do not align with input text.");
23 return vec![];
24 }
25 }
26
27 indices
28}
29
30fn cut(mut text: String, keep_separator: bool) -> Vec<String> {
32 remove_composite_abbreviations(&mut text);
33 remove_suspension_points(&mut text);
34 remove_floating_point_numbers(&mut text);
35 handle_floats_without_leading_zero(&mut text);
36 remove_abbreviations(&mut text);
37 remove_initials(&mut text);
38 unstick_sentences(&mut text);
39 remove_sentence_enders_before_parens(&mut text);
40 remove_sentence_enders_next_to_quotes(&mut text);
41 let sentences = split_sentences(&text);
42 let sentences = repair_sentences(sentences);
43 if keep_separator {
44 sentences
45 } else {
46 sentences.into_iter().map(|s| s.trim().to_owned()).collect()
47 }
48}
49
50fn remove_composite_abbreviations(text: &mut String) {
51 *text = REMOVE_COMPOSITE_ABBREVIATIONS
52 .replace_all(text, "$comp&;&")
53 .to_string();
54}
55
56fn remove_suspension_points(text: &mut String) {
57 *text = REMOVE_SUSPENSION_POINTS
58 .replace_all(text, "&&&.")
59 .to_string();
60}
61
62fn remove_floating_point_numbers(text: &mut String) {
63 *text = REMOVE_FLOATING_POINT_NUMBERS
64 .replace_all(text, "$number&@&$decimal")
65 .to_string();
66}
67
68fn handle_floats_without_leading_zero(text: &mut String) {
69 *text = HANDLE_FLOATS_WITHOUT_LEADING_ZERO
70 .replace_all(text, " &#&$nums")
71 .to_string();
72}
73
74fn remove_abbreviations(text: &mut String) {
75 *text = REMOVE_ABBREVIATIONS
76 .replace_all(text, |caps: ®ex::Captures| {
77 caps.iter()
78 .filter_map(|c| c.map(|c| c.as_str().to_string().replace('.', "&-&")))
79 .collect::<String>()
80 })
81 .to_string();
82}
83
84fn remove_initials(text: &mut String) {
85 *text = REMOVE_INITIALS.replace_all(text, "$init&_&").to_string();
86}
87
88fn unstick_sentences(text: &mut String) {
89 *text = UNSTICK_SENTENCES
90 .replace_all(text, "$left $right")
91 .to_string();
92}
93
94fn remove_sentence_enders_before_parens(text: &mut String) {
95 *text = REMOVE_SENTENCE_ENDERS_BEFORE_PARENS
96 .replace_all(text, "&==&$bef")
97 .to_string();
98}
99
100fn remove_sentence_enders_next_to_quotes(text: &mut String) {
101 *text = QUOTE_TRANSFORMATIONS
102 .iter()
103 .fold(text.to_string(), |acc, (regex, repl)| {
104 regex.replace_all(&acc, *repl).to_string()
105 });
106}
107
108fn is_word_char(c: char) -> bool {
109 let group = c.general_category_group();
110 group == GeneralCategoryGroup::Letter || group == GeneralCategoryGroup::Number
111}
112
113fn is_line_separator_char(c: char) -> bool {
114 let group = c.general_category();
115 group == GeneralCategory::LineSeparator || group == GeneralCategory::ParagraphSeparator
116}
117
118fn split_sentences(text: &str) -> Vec<String> {
119 let mut sentences = Vec::new();
120 let mut current_sentence = String::new();
121 let mut previous_sentence = String::new();
122
123 for c in text.chars() {
124 if is_word_char(c) {
125 if !previous_sentence.is_empty() {
126 sentences.push(previous_sentence);
127 previous_sentence = String::new();
128 }
129 current_sentence.push(c);
130 } else if is_line_separator_char(c) {
131 if !previous_sentence.is_empty() {
132 previous_sentence.push(c);
133 } else {
134 current_sentence.push(c);
135 }
136 } else if c == '.' || c == '?' || c == '!' {
137 current_sentence.push(c);
138 previous_sentence = current_sentence;
139 current_sentence = String::new();
140 } else if previous_sentence.is_empty() {
141 current_sentence.push(c);
142 } else {
143 previous_sentence.push(c);
144 }
145 }
146
147 if !previous_sentence.is_empty() {
148 sentences.push(previous_sentence);
149 }
150 if !current_sentence.is_empty() {
151 sentences.push(current_sentence);
152 }
153
154 sentences
155}
156
157fn repair_sentences(sentences: Vec<String>) -> Vec<String> {
158 let repaired_sentences: Vec<String> = sentences
159 .into_iter()
160 .map(|s| {
161 let replaced_sentence = s
162 .replace("&;&", ".")
163 .replace("&&&", "..")
164 .replace("&@&", ".")
165 .replace("&#&", ".")
166 .replace("&-&", ".")
167 .replace("&_&", ".")
168 .replace("&*&", ".");
169 let paren_repaired = PAREN_REPAIR
170 .replace_all(&replaced_sentence, r"$1)")
171 .to_string();
172 QUOTE_REPAIR_REGEXES
173 .iter()
174 .fold(paren_repaired, |acc, regex| {
175 regex
176 .replace_all(
177 &acc,
178 match regex as *const Regex {
179 x if x == "E_REPAIR_REGEXES[0] as *const Regex => r#"'$p""#,
180 x if x == "E_REPAIR_REGEXES[1] as *const Regex => r#"'$p""#,
181 x if x == "E_REPAIR_REGEXES[2] as *const Regex => r#"$p""#,
182 x if x == "E_REPAIR_REGEXES[3] as *const Regex => r#"$p""#,
183 x if x == "E_REPAIR_REGEXES[4] as *const Regex => r#"$p'"#,
184 _ => r#"$p""#,
185 },
186 )
187 .to_string()
188 })
189 })
190 .collect();
191 repaired_sentences
192}
193
194pub static REMOVE_COMPOSITE_ABBREVIATIONS: LazyLock<Regex> =
195 LazyLock::new(|| Regex::new(r"(?P<comp>et al\.)(?:\.)").unwrap());
196pub static REMOVE_SUSPENSION_POINTS: LazyLock<Regex> =
197 LazyLock::new(|| Regex::new(r"\.{3}").unwrap());
198pub static REMOVE_FLOATING_POINT_NUMBERS: LazyLock<Regex> =
199 LazyLock::new(|| Regex::new(r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)").unwrap());
200pub static HANDLE_FLOATS_WITHOUT_LEADING_ZERO: LazyLock<Regex> =
201 LazyLock::new(|| Regex::new(r"\s\.(?P<nums>[0-9]+)").unwrap());
202pub static REMOVE_ABBREVIATIONS: LazyLock<Regex> =
203 LazyLock::new(|| Regex::new(r"(?:[A-Za-z]\.){2,}").unwrap());
204pub static REMOVE_INITIALS: LazyLock<Regex> =
205 LazyLock::new(|| Regex::new(r"(?P<init>[A-Z])(?P<point>\.)").unwrap());
206pub static UNSTICK_SENTENCES: LazyLock<Regex> =
207 LazyLock::new(|| Regex::new(r##"(?P<left>[^.?!]\.|!|\?)(?P<right>[^\s"'])"##).unwrap());
208pub static REMOVE_SENTENCE_ENDERS_BEFORE_PARENS: LazyLock<Regex> =
209 LazyLock::new(|| Regex::new(r##"(?P<bef>[.?!])\s?\)"##).unwrap());
210pub static QUOTE_TRANSFORMATIONS: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::new(|| {
211 vec![
212 (
213 Regex::new(r##"'(?P<quote>[.?!])\s?""##).unwrap(),
214 "&^&$quote",
215 ),
216 (
217 Regex::new(r##"'(?P<quote>[.?!])\s?""##).unwrap(),
218 "&**&$quote",
219 ),
220 (
221 Regex::new(r##"(?P<quote>[.?!])\s?""##).unwrap(),
222 "&=&$quote",
223 ),
224 (
225 Regex::new(r##"(?P<quote>[.?!])\s?'""##).unwrap(),
226 "&,&$quote",
227 ),
228 (
229 Regex::new(r##"(?P<quote>[.?!])\s?'"##).unwrap(),
230 "&##&$quote",
231 ),
232 (Regex::new(r##"(?P<quote>[.?!])\s?""##).unwrap(), "&$quote"),
233 ]
234});
235pub static PAREN_REPAIR: LazyLock<Regex> =
236 LazyLock::new(|| Regex::new(r"&==&(?P<p>[.!?])").unwrap());
237pub static QUOTE_REPAIR_REGEXES: LazyLock<[Regex; 6]> = LazyLock::new(|| {
238 [
239 Regex::new(r"&\^&(?P<p>[.!?])").unwrap(),
240 Regex::new(r"&\*\*&(?P<p>[.!?])").unwrap(),
241 Regex::new(r"&=&(?P<p>[.!?])").unwrap(),
242 Regex::new(r#"&,&(?P<p>[.!?])"#).unwrap(),
243 Regex::new(r"&##&(?P<p>[.!?])").unwrap(),
244 Regex::new(r"&\$&(?P<p>[.!?])").unwrap(),
245 ]
246});