1use regex::Regex;
2use std::sync::LazyLock;
3
/// Strategy used by [`TextCleaner::run`] to collapse runs of newline characters.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Newlines {
    /// Replace every run of one or more newlines with a single space.
    Space,
    /// Replace every run of one or more newlines with a single newline.
    Single,
    /// Replace every run of two or more newlines with exactly two newlines (the default).
    #[default]
    TwoPlus,
    /// Leave newline runs untouched.
    None,
}

/// Configuration for a text-cleaning pass; build one with the chained setters
/// and apply it with [`TextCleaner::run`].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct TextCleaner {
    /// How runs of newlines are collapsed.
    pub newlines: Newlines,
    /// When `true`, characters outside the allow-list of basic characters are removed.
    pub remove_non_basic_ascii: bool,
    /// When `true`, bracketed numeric citations such as `[12]` are removed.
    ///
    /// NOTE: the name is misspelled ("remve"). It is kept unchanged because
    /// renaming a public field would break existing callers.
    pub remve_citations: bool,
}
18impl TextCleaner {
19 pub fn new() -> Self {
20 Self::default()
21 }
22
23 pub fn do_not_reduce_newlines(mut self) -> Self {
24 self.newlines = Newlines::None;
25 self
26 }
27
28 pub fn reduce_newlines_to_single_space(mut self) -> Self {
29 self.newlines = Newlines::Space;
30 self
31 }
32
33 pub fn reduce_newlines_to_single_newline(mut self) -> Self {
34 self.newlines = Newlines::Single;
35 self
36 }
37
38 pub fn reduce_newlines_to_double_newline(mut self) -> Self {
39 self.newlines = Newlines::TwoPlus;
40 self
41 }
42
43 pub fn remove_non_basic_ascii(mut self) -> Self {
44 self.remove_non_basic_ascii = true;
45 self
46 }
47
48 pub fn remove_citations(mut self) -> Self {
49 self.remve_citations = true;
50 self
51 }
52
53 pub fn run(&self, text: &str) -> String {
54 let text = END_OF_LINE_REGEX.replace_all(text, "\n");
55 let text = END_OF_PARAGRAPH_REGEX.replace_all(&text, "\n\n");
56 let text = WHITE_SPACE_REGEX.replace_all(&text, " ");
57
58 let text = match self.newlines {
59 Newlines::Space => SINGLE_NEWLINE_REGEX.replace_all(&text, " "),
60 Newlines::Single => SINGLE_NEWLINE_REGEX.replace_all(&text, "\n"),
61 Newlines::TwoPlus => TWO_PLUS_NEWLINE_REGEX.replace_all(&text, "\n\n"),
62 Newlines::None => text,
63 };
64
65 let text = if self.remove_non_basic_ascii {
66 UNWANTED_CHARS_REGEX.replace_all(&text, "")
67 } else {
68 text
69 };
70
71 let text = if self.remve_citations {
72 CITATIONS_REGEX.replace_all(&text, "")
73 } else {
74 text
75 };
76
77 SINGLE_SPACE_REGEX
78 .replace_all(&text, " ")
79 .trim()
80 .to_string()
81 }
82}
83
84pub fn normalize_whitespace(text: &str) -> String {
85 let text = END_OF_LINE_REGEX.replace_all(text, "\n");
86 let text = END_OF_PARAGRAPH_REGEX.replace_all(&text, "\n\n");
87 WHITE_SPACE_REGEX.replace_all(&text, " ").to_string()
88}
89
90pub fn strip_unwanted_chars(text: &str) -> String {
91 UNWANTED_CHARS_REGEX
92 .replace_all(text, "")
93 .trim()
94 .to_string()
95}
96
97pub fn reduce_to_single_whitespace(text: &str) -> String {
98 let text = SINGLE_SPACE_REGEX.replace_all(text, " ");
99 SINGLE_NEWLINE_REGEX
100 .replace_all(&text, "\n")
101 .trim()
102 .to_string()
103}
104
/// Regex alternatives treated as end-of-line markers.
///
/// Most entries match both the control character itself and its spelled-out
/// form in the text (a backslash followed by a letter).
pub static END_OF_LINE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        r"(\\r\\n|\r\n)", // CR LF pair
        r"(\\r|\r)",      // carriage return
        r"(\\v|\v)",      // vertical tab
        r"(\\f|\f)",      // form feed
        r"\\n",           // the two-character text: backslash followed by 'n'
        r"\u{2028}",      // U+2028 LINE SEPARATOR
    ])
});
120pub static END_OF_LINE_REGEX: LazyLock<Regex> =
121 LazyLock::new(|| Regex::new(&END_OF_LINE_SEQUENCES.join("|")).unwrap());
122pub static SINGLE_NEWLINE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{1,}").unwrap());
123
/// Regex alternatives treated as end-of-paragraph markers.
pub static END_OF_PARAGRAPH_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        r"\u{2029}", // U+2029 PARAGRAPH SEPARATOR
    ])
});
133pub static END_OF_PARAGRAPH_REGEX: LazyLock<Regex> =
134 LazyLock::new(|| Regex::new(&END_OF_PARAGRAPH_SEQUENCES.join("|")).unwrap());
135pub static TWO_PLUS_NEWLINE_REGEX: LazyLock<Regex> =
136 LazyLock::new(|| Regex::new(r"\n{2,}").unwrap());
137
/// Regex alternatives treated as horizontal whitespace.
///
/// Some code points appear more than once (tab, U+2028); the duplicates are
/// retained so the joined pattern is unchanged.
pub static WHITE_SPACE_SEQUENCES: LazyLock<Vec<&'static str>> = LazyLock::new(|| {
    Vec::from([
        r"\\s",       // the two-character text: backslash followed by 's'
        r"(\\t|\t)",  // tab, real or spelled out as backslash + 't'
        r"\u{0020}",  // SPACE
        r"\u{00A0}",  // NO-BREAK SPACE
        r"\u{1680}",  // OGHAM SPACE MARK
        r"\u{2000}",  // EN QUAD
        r"\u{2001}",  // EM QUAD
        r"\u{2002}",  // EN SPACE
        r"\u{2003}",  // EM SPACE
        r"\u{2004}",  // THREE-PER-EM SPACE
        r"\u{2005}",  // FOUR-PER-EM SPACE
        r"\u{2006}",  // SIX-PER-EM SPACE
        r"\u{2007}",  // FIGURE SPACE
        r"\u{2008}",  // PUNCTUATION SPACE
        r"\u{2009}",  // THIN SPACE
        r"\u{200A}",  // HAIR SPACE
        r"\u{2028}",  // LINE SEPARATOR
        r"\u{202F}",  // NARROW NO-BREAK SPACE
        r"\u{205F}",  // MEDIUM MATHEMATICAL SPACE
        r"\u{3000}",  // IDEOGRAPHIC SPACE
        r"\u{0009}",  // CHARACTER TABULATION
    ])
});
168
169pub static WHITE_SPACE_REGEX: LazyLock<Regex> =
170 LazyLock::new(|| Regex::new(&WHITE_SPACE_SEQUENCES.join("|")).unwrap());
171pub static SINGLE_SPACE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" {1,}").unwrap());
172
173pub static UNWANTED_CHARS_REGEX: LazyLock<Regex> =
177 LazyLock::new(|| Regex::new(r#"[^a-zA-Z0-9.,?!:;'\"\-\(\)\[\]\{\}$&@#%^*()\s]+"#).unwrap());
178pub static CITATIONS_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\d{1,3}\]").unwrap());