Skip to main content

alith_core/
cleaner.rs

1use regex::Regex;
2use std::sync::LazyLock;
3
/// Strategy used by [`TextCleaner`] to reduce runs of newline characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Newlines {
    /// Replace every run of one or more newlines with a single space.
    Space,
    /// Replace every run of one or more newlines with a single newline.
    Single,
    /// Replace every run of two or more newlines with exactly two newlines.
    ///
    /// This is the default strategy.
    #[default]
    TwoPlus,
    /// Do not reduce newline runs at all.
    None,
}

/// Configuration for a text clean-up pass.
///
/// The default value reduces newline runs with [`Newlines::TwoPlus`] and has
/// both removal flags switched off.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct TextCleaner {
    /// How runs of newlines are reduced.
    pub newlines: Newlines,
    /// When `true`, characters outside the allowed basic set are stripped.
    pub remove_non_basic_ascii: bool,
    /// When `true`, bracketed numeric citations are stripped.
    ///
    /// The name is misspelled ("remve"); it is left unchanged because the
    /// field is public and renaming it would break existing users.
    pub remve_citations: bool,
}
18impl TextCleaner {
19    pub fn new() -> Self {
20        Self::default()
21    }
22
23    pub fn do_not_reduce_newlines(mut self) -> Self {
24        self.newlines = Newlines::None;
25        self
26    }
27
28    pub fn reduce_newlines_to_single_space(mut self) -> Self {
29        self.newlines = Newlines::Space;
30        self
31    }
32
33    pub fn reduce_newlines_to_single_newline(mut self) -> Self {
34        self.newlines = Newlines::Single;
35        self
36    }
37
38    pub fn reduce_newlines_to_double_newline(mut self) -> Self {
39        self.newlines = Newlines::TwoPlus;
40        self
41    }
42
43    pub fn remove_non_basic_ascii(mut self) -> Self {
44        self.remove_non_basic_ascii = true;
45        self
46    }
47
48    pub fn remove_citations(mut self) -> Self {
49        self.remve_citations = true;
50        self
51    }
52
53    pub fn run(&self, text: &str) -> String {
54        let text = END_OF_LINE_REGEX.replace_all(text, "\n");
55        let text = END_OF_PARAGRAPH_REGEX.replace_all(&text, "\n\n");
56        let text = WHITE_SPACE_REGEX.replace_all(&text, " ");
57
58        let text = match self.newlines {
59            Newlines::Space => SINGLE_NEWLINE_REGEX.replace_all(&text, " "),
60            Newlines::Single => SINGLE_NEWLINE_REGEX.replace_all(&text, "\n"),
61            Newlines::TwoPlus => TWO_PLUS_NEWLINE_REGEX.replace_all(&text, "\n\n"),
62            Newlines::None => text,
63        };
64
65        let text = if self.remove_non_basic_ascii {
66            UNWANTED_CHARS_REGEX.replace_all(&text, "")
67        } else {
68            text
69        };
70
71        let text = if self.remve_citations {
72            CITATIONS_REGEX.replace_all(&text, "")
73        } else {
74            text
75        };
76
77        SINGLE_SPACE_REGEX
78            .replace_all(&text, " ")
79            .trim()
80            .to_string()
81    }
82}
83
84pub fn normalize_whitespace(text: &str) -> String {
85    let text = END_OF_LINE_REGEX.replace_all(text, "\n");
86    let text = END_OF_PARAGRAPH_REGEX.replace_all(&text, "\n\n");
87    WHITE_SPACE_REGEX.replace_all(&text, " ").to_string()
88}
89
90pub fn strip_unwanted_chars(text: &str) -> String {
91    UNWANTED_CHARS_REGEX
92        .replace_all(text, "")
93        .trim()
94        .to_string()
95}
96
97pub fn reduce_to_single_whitespace(text: &str) -> String {
98    let text = SINGLE_SPACE_REGEX.replace_all(text, " ");
99    SINGLE_NEWLINE_REGEX
100        .replace_all(&text, "\n")
101        .trim()
102        .to_string()
103}
104
105//
106// Newlines
107//
/// Regex alternatives that are treated as an end-of-line marker.
///
/// Several entries match both the control character itself and its escaped,
/// literal spelling (a backslash followed by a letter). The entries are meant
/// to be joined with `|`, so order matters: the CRLF entry comes first so the
/// pair is consumed as one unit instead of as a lone carriage return.
pub static END_OF_LINE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // ASCII
        r"(\\r\\n|\r\n)", // CRLF (Windows); must stay ahead of the lone-CR entry
        r"(\\r|\r)",      // lone carriage return
        r"(\\v|\v)",      // vertical tab
        r"(\\f|\f)",      // form feed
        r"\\n",           // literal backslash followed by `n`
        // Unicode
        r"\u{2028}", // line separator
    ])
});
120pub static END_OF_LINE_REGEX: LazyLock<Regex> =
121    LazyLock::new(|| Regex::new(&END_OF_LINE_SEQUENCES.join("|")).unwrap());
122pub static SINGLE_NEWLINE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{1,}").unwrap());
123
124//
125// Paragraphs
126//
/// Regex alternatives that are treated as an end-of-paragraph marker.
///
/// Currently the list holds a single entry.
pub static END_OF_PARAGRAPH_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // Unicode
        r"\u{2029}", // paragraph separator
    ])
});
133pub static END_OF_PARAGRAPH_REGEX: LazyLock<Regex> =
134    LazyLock::new(|| Regex::new(&END_OF_PARAGRAPH_SEQUENCES.join("|")).unwrap());
135pub static TWO_PLUS_NEWLINE_REGEX: LazyLock<Regex> =
136    LazyLock::new(|| Regex::new(r"\n{2,}").unwrap());
137
138//
139// White space
140//
/// Regex alternatives that are treated as horizontal white space.
///
/// The first two entries cover escaped, literal spellings (a backslash
/// followed by `s` or `t`) plus the real tab character; the remaining entries
/// are individual Unicode code points. The entries are meant to be joined
/// with `|`.
pub static WHITE_SPACE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        // ASCII
        r"\\s",      // literal backslash followed by `s`
        r"(\\t|\t)", // literal backslash followed by `t`, or a real tab
        // Unicode code points
        r"\u{0020}",
        r"\u{00A0}",
        r"\u{1680}",
        r"\u{2000}",
        r"\u{2001}",
        r"\u{2002}",
        r"\u{2003}",
        r"\u{2004}",
        r"\u{2005}",
        r"\u{2006}",
        r"\u{2007}",
        r"\u{2008}",
        r"\u{2009}",
        r"\u{200A}",
        r"\u{2028}", // also listed in END_OF_LINE_SEQUENCES
        r"\u{202F}",
        r"\u{205F}",
        r"\u{3000}",
        r"\u{0009}", // tab again, by code point
    ])
});
168
169pub static WHITE_SPACE_REGEX: LazyLock<Regex> =
170    LazyLock::new(|| Regex::new(&WHITE_SPACE_SEQUENCES.join("|")).unwrap());
171pub static SINGLE_SPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" {1,}").unwrap());
172
173//
174// Unwanted characters
175//
176pub static UNWANTED_CHARS_REGEX: LazyLock<Regex> =
177    LazyLock::new(|| Regex::new(r#"[^a-zA-Z0-9.,?!:;'\"\-\(\)\[\]\{\}$&@#%^*()\s]+"#).unwrap());
178pub static CITATIONS_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\d{1,3}\]").unwrap());