Skip to main content

alith_core/splitting/
rule_based.rs

1use regex::Regex;
2use std::{ops::Range, sync::LazyLock};
3use unicode_properties::{GeneralCategory, GeneralCategoryGroup, UnicodeGeneralCategory};
4
5#[inline]
6pub fn split_text_into_sentences(text: &str, keep_separator: bool) -> Vec<String> {
7    cut(text.to_owned(), keep_separator)
8}
9
10pub fn split_text_into_indices(text: &str, keep_separator: bool) -> Vec<Range<usize>> {
11    let sentences = cut(text.to_owned(), keep_separator);
12    let mut indices: Vec<Range<usize>> = Vec::new();
13    let mut start = 0;
14    for sentence in sentences.iter() {
15        let end = start + sentence.len();
16        indices.push(Range { start, end });
17        start = end;
18    }
19    if !indices.is_empty() {
20        let last = indices.last().unwrap();
21        if last.end != text.len() {
22            eprintln!("split_text_into_indices: indices do not align with input text.");
23            return vec![];
24        }
25    }
26
27    indices
28}
29
30// Modified from https://github.com/indicium-ag/readability-text-cleanup-rs/blob/master/src/katana.rs
31fn cut(mut text: String, keep_separator: bool) -> Vec<String> {
32    remove_composite_abbreviations(&mut text);
33    remove_suspension_points(&mut text);
34    remove_floating_point_numbers(&mut text);
35    handle_floats_without_leading_zero(&mut text);
36    remove_abbreviations(&mut text);
37    remove_initials(&mut text);
38    unstick_sentences(&mut text);
39    remove_sentence_enders_before_parens(&mut text);
40    remove_sentence_enders_next_to_quotes(&mut text);
41    let sentences = split_sentences(&text);
42    let sentences = repair_sentences(sentences);
43    if keep_separator {
44        sentences
45    } else {
46        sentences.into_iter().map(|s| s.trim().to_owned()).collect()
47    }
48}
49
50fn remove_composite_abbreviations(text: &mut String) {
51    *text = REMOVE_COMPOSITE_ABBREVIATIONS
52        .replace_all(text, "$comp&;&")
53        .to_string();
54}
55
56fn remove_suspension_points(text: &mut String) {
57    *text = REMOVE_SUSPENSION_POINTS
58        .replace_all(text, "&&&.")
59        .to_string();
60}
61
62fn remove_floating_point_numbers(text: &mut String) {
63    *text = REMOVE_FLOATING_POINT_NUMBERS
64        .replace_all(text, "$number&@&$decimal")
65        .to_string();
66}
67
68fn handle_floats_without_leading_zero(text: &mut String) {
69    *text = HANDLE_FLOATS_WITHOUT_LEADING_ZERO
70        .replace_all(text, " &#&$nums")
71        .to_string();
72}
73
74fn remove_abbreviations(text: &mut String) {
75    *text = REMOVE_ABBREVIATIONS
76        .replace_all(text, |caps: &regex::Captures| {
77            caps.iter()
78                .filter_map(|c| c.map(|c| c.as_str().to_string().replace('.', "&-&")))
79                .collect::<String>()
80        })
81        .to_string();
82}
83
84fn remove_initials(text: &mut String) {
85    *text = REMOVE_INITIALS.replace_all(text, "$init&_&").to_string();
86}
87
88fn unstick_sentences(text: &mut String) {
89    *text = UNSTICK_SENTENCES
90        .replace_all(text, "$left $right")
91        .to_string();
92}
93
94fn remove_sentence_enders_before_parens(text: &mut String) {
95    *text = REMOVE_SENTENCE_ENDERS_BEFORE_PARENS
96        .replace_all(text, "&==&$bef")
97        .to_string();
98}
99
100fn remove_sentence_enders_next_to_quotes(text: &mut String) {
101    *text = QUOTE_TRANSFORMATIONS
102        .iter()
103        .fold(text.to_string(), |acc, (regex, repl)| {
104            regex.replace_all(&acc, *repl).to_string()
105        });
106}
107
108fn is_word_char(c: char) -> bool {
109    let group = c.general_category_group();
110    group == GeneralCategoryGroup::Letter || group == GeneralCategoryGroup::Number
111}
112
113fn is_line_separator_char(c: char) -> bool {
114    let group = c.general_category();
115    group == GeneralCategory::LineSeparator || group == GeneralCategory::ParagraphSeparator
116}
117
118fn split_sentences(text: &str) -> Vec<String> {
119    let mut sentences = Vec::new();
120    let mut current_sentence = String::new();
121    let mut previous_sentence = String::new();
122
123    for c in text.chars() {
124        if is_word_char(c) {
125            if !previous_sentence.is_empty() {
126                sentences.push(previous_sentence);
127                previous_sentence = String::new();
128            }
129            current_sentence.push(c);
130        } else if is_line_separator_char(c) {
131            if !previous_sentence.is_empty() {
132                previous_sentence.push(c);
133            } else {
134                current_sentence.push(c);
135            }
136        } else if c == '.' || c == '?' || c == '!' {
137            current_sentence.push(c);
138            previous_sentence = current_sentence;
139            current_sentence = String::new();
140        } else if previous_sentence.is_empty() {
141            current_sentence.push(c);
142        } else {
143            previous_sentence.push(c);
144        }
145    }
146
147    if !previous_sentence.is_empty() {
148        sentences.push(previous_sentence);
149    }
150    if !current_sentence.is_empty() {
151        sentences.push(current_sentence);
152    }
153
154    sentences
155}
156
157fn repair_sentences(sentences: Vec<String>) -> Vec<String> {
158    let repaired_sentences: Vec<String> = sentences
159        .into_iter()
160        .map(|s| {
161            let replaced_sentence = s
162                .replace("&;&", ".")
163                .replace("&&&", "..")
164                .replace("&@&", ".")
165                .replace("&#&", ".")
166                .replace("&-&", ".")
167                .replace("&_&", ".")
168                .replace("&*&", ".");
169            let paren_repaired = PAREN_REPAIR
170                .replace_all(&replaced_sentence, r"$1)")
171                .to_string();
172            QUOTE_REPAIR_REGEXES
173                .iter()
174                .fold(paren_repaired, |acc, regex| {
175                    regex
176                        .replace_all(
177                            &acc,
178                            match regex as *const Regex {
179                                x if x == &QUOTE_REPAIR_REGEXES[0] as *const Regex => r#"'$p""#,
180                                x if x == &QUOTE_REPAIR_REGEXES[1] as *const Regex => r#"'$p""#,
181                                x if x == &QUOTE_REPAIR_REGEXES[2] as *const Regex => r#"$p""#,
182                                x if x == &QUOTE_REPAIR_REGEXES[3] as *const Regex => r#"$p""#,
183                                x if x == &QUOTE_REPAIR_REGEXES[4] as *const Regex => r#"$p'"#,
184                                _ => r#"$p""#,
185                            },
186                        )
187                        .to_string()
188                })
189        })
190        .collect();
191    repaired_sentences
192}
193
194pub static REMOVE_COMPOSITE_ABBREVIATIONS: LazyLock<Regex> =
195    LazyLock::new(|| Regex::new(r"(?P<comp>et al\.)(?:\.)").unwrap());
196pub static REMOVE_SUSPENSION_POINTS: LazyLock<Regex> =
197    LazyLock::new(|| Regex::new(r"\.{3}").unwrap());
198pub static REMOVE_FLOATING_POINT_NUMBERS: LazyLock<Regex> =
199    LazyLock::new(|| Regex::new(r"(?P<number>[0-9]+)\.(?P<decimal>[0-9]+)").unwrap());
200pub static HANDLE_FLOATS_WITHOUT_LEADING_ZERO: LazyLock<Regex> =
201    LazyLock::new(|| Regex::new(r"\s\.(?P<nums>[0-9]+)").unwrap());
202pub static REMOVE_ABBREVIATIONS: LazyLock<Regex> =
203    LazyLock::new(|| Regex::new(r"(?:[A-Za-z]\.){2,}").unwrap());
204pub static REMOVE_INITIALS: LazyLock<Regex> =
205    LazyLock::new(|| Regex::new(r"(?P<init>[A-Z])(?P<point>\.)").unwrap());
206pub static UNSTICK_SENTENCES: LazyLock<Regex> =
207    LazyLock::new(|| Regex::new(r##"(?P<left>[^.?!]\.|!|\?)(?P<right>[^\s"'])"##).unwrap());
208pub static REMOVE_SENTENCE_ENDERS_BEFORE_PARENS: LazyLock<Regex> =
209    LazyLock::new(|| Regex::new(r##"(?P<bef>[.?!])\s?\)"##).unwrap());
210pub static QUOTE_TRANSFORMATIONS: LazyLock<Vec<(Regex, &'static str)>> = LazyLock::new(|| {
211    vec![
212        (
213            Regex::new(r##"'(?P<quote>[.?!])\s?""##).unwrap(),
214            "&^&$quote",
215        ),
216        (
217            Regex::new(r##"'(?P<quote>[.?!])\s?""##).unwrap(),
218            "&**&$quote",
219        ),
220        (
221            Regex::new(r##"(?P<quote>[.?!])\s?""##).unwrap(),
222            "&=&$quote",
223        ),
224        (
225            Regex::new(r##"(?P<quote>[.?!])\s?'""##).unwrap(),
226            "&,&$quote",
227        ),
228        (
229            Regex::new(r##"(?P<quote>[.?!])\s?'"##).unwrap(),
230            "&##&$quote",
231        ),
232        (Regex::new(r##"(?P<quote>[.?!])\s?""##).unwrap(), "&$quote"),
233    ]
234});
235pub static PAREN_REPAIR: LazyLock<Regex> =
236    LazyLock::new(|| Regex::new(r"&==&(?P<p>[.!?])").unwrap());
237pub static QUOTE_REPAIR_REGEXES: LazyLock<[Regex; 6]> = LazyLock::new(|| {
238    [
239        Regex::new(r"&\^&(?P<p>[.!?])").unwrap(),
240        Regex::new(r"&\*\*&(?P<p>[.!?])").unwrap(),
241        Regex::new(r"&=&(?P<p>[.!?])").unwrap(),
242        Regex::new(r#"&,&(?P<p>[.!?])"#).unwrap(),
243        Regex::new(r"&##&(?P<p>[.!?])").unwrap(),
244        Regex::new(r"&\$&(?P<p>[.!?])").unwrap(),
245    ]
246});