/// Strategy a [`TextCleaner`] uses when it writes out a run of consecutive line breaks.
///
/// The common value traits are derived so that callers can log, copy and compare a
/// configured strategy.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Newlines {
    /// Write a run of line breaks as a single space.
    Space,
    /// Write a run of line breaks as a single `\n`.
    Single,
    /// Write a run of line breaks as at most two `\n` (a lone break stays a lone break).
    /// This is the default strategy.
    #[default]
    TwoPlus,
    /// Write one `\n` for every line break in the run.
    None,
}
9
/// Classification of the next piece of input, produced by `TextCleaner::classify_char`
/// and consumed by `TextCleaner::run`.
enum CleanStep {
    /// An ordinary character to append to the output.
    Emit(char),
    /// A horizontal whitespace character (space, tab, or a Unicode space separator).
    Whitespace,
    /// A line-break character; the payload is how many line breaks it counts as.
    Newline(usize),
    /// A literal backslash escape for whitespace (`\s` or `\t` written out as two characters).
    EscapedWhitespace,
    /// A literal backslash escape for a line break (`\n` or `\r` written out as two characters).
    EscapedNewline,
    /// A bracketed citation was consumed. The flag is `true` when the character that
    /// follows it is punctuation, in which case the space before the citation is dropped.
    CitationRemoved(bool),
    /// Bracketed text that turned out not to be a citation; holds the characters that
    /// were consumed while probing and must be written back.
    ReplayNonCitation(Vec<char>),
}
29
/// Mutable bookkeeping for one cleaning pass over an input string.
struct CleanState {
    /// Output text accumulated so far.
    result: String,
    /// Number of line breaks seen but not yet written to `result`.
    consecutive_newlines: usize,
    /// Set while the most recently recorded item was a collapsed space.
    last_was_space: bool,
}

impl CleanState {
    /// Builds an empty state whose output buffer has room for `capacity` bytes.
    fn with_capacity(capacity: usize) -> Self {
        let result = String::with_capacity(capacity);
        Self {
            last_was_space: false,
            consecutive_newlines: 0,
            result,
        }
    }
}
46
/// Configurable text normalizer.
///
/// Start from [`TextCleaner::new`], chain the builder-style methods to adjust the
/// settings, then call [`TextCleaner::run`] on the text to clean.
#[derive(Default)]
pub struct TextCleaner {
    /// How a run of consecutive line breaks is written to the output.
    pub newlines: Newlines,
    /// When `true`, control characters are dropped from the output. Despite the name,
    /// printable non-ASCII text is kept.
    pub remove_non_basic_ascii: bool,
    /// When `true`, bracketed numeric citations such as `[1]` or `[2, 3-5]` are removed.
    pub remove_citations: bool,
}
53
54impl TextCleaner {
55 pub fn new() -> Self {
56 Self::default()
57 }
58
59 pub fn do_not_reduce_newlines(mut self) -> Self {
60 self.newlines = Newlines::None;
61 self
62 }
63
64 pub fn reduce_newlines_to_single_space(mut self) -> Self {
65 self.newlines = Newlines::Space;
66 self
67 }
68
69 pub fn reduce_newlines_to_single_newline(mut self) -> Self {
70 self.newlines = Newlines::Single;
71 self
72 }
73
74 pub fn reduce_newlines_to_double_newline(mut self) -> Self {
75 self.newlines = Newlines::TwoPlus;
76 self
77 }
78
79 pub fn remove_non_basic_ascii(mut self) -> Self {
80 self.remove_non_basic_ascii = true;
81 self
82 }
83
84 pub fn remove_citations(mut self) -> Self {
85 self.remove_citations = true;
86 self
87 }
88
89 pub fn run(&self, text: &str) -> String {
94 let mut state = CleanState::with_capacity(text.len());
95 let mut chars = text.chars().peekable();
96
97 while let Some(c) = chars.next() {
98 let step = self.classify_char(c, &mut chars);
99
100 match step {
101 CleanStep::Newline(count) => {
102 self.handle_newline(&mut state, count);
103 }
104 CleanStep::Whitespace => {
105 self.handle_whitespace(&mut state);
106 }
107 CleanStep::EscapedWhitespace => {
108 self.handle_escaped_whitespace(&mut state);
109 }
110 CleanStep::EscapedNewline => {
111 state.consecutive_newlines += 1;
112 state.last_was_space = false;
113 }
114 CleanStep::CitationRemoved(remove_trailing_space) => {
115 if remove_trailing_space && state.last_was_space && state.result.ends_with(' ')
118 {
119 state.result.pop();
120 state.last_was_space = false;
121 }
122 }
123 CleanStep::ReplayNonCitation(buf) => {
124 self.emit_newlines(&mut state);
125 for ch in buf {
126 state.result.push(ch);
127 }
128 state.last_was_space = false;
129 }
130 CleanStep::Emit(ch) => {
131 self.emit_newlines(&mut state);
132 if !self.remove_non_basic_ascii || is_valid_text_char(ch) {
133 state.result.push(ch);
134 }
135 state.last_was_space = false;
136 }
137 }
138 }
139
140 if state.consecutive_newlines > 0 {
142 self.emit_newlines(&mut state);
143 }
144
145 trim_trailing_spaces(&state.result)
146 }
147
148 fn classify_char(
150 &self,
151 c: char,
152 chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
153 ) -> CleanStep {
154 match c {
155 '\r' => {
157 if chars.peek() == Some(&'\n') {
158 chars.next();
159 }
160 CleanStep::Newline(1)
161 }
162 '\n' | '\x0B' | '\x0C' | '\u{2028}' => CleanStep::Newline(1),
163 '\u{2029}' => CleanStep::Newline(2),
164
165 ' ' |
167 '\t' |
168 '\u{00A0}' |
169 '\u{1680}' |
170 '\u{2000}'..='\u{200A}' |
171 '\u{202F}' |
172 '\u{205F}' |
173 '\u{3000}' => CleanStep::Whitespace,
174
175 '\\' => self.classify_escape(chars),
177
178 '[' if self.remove_citations => self.classify_citation(chars),
180
181 _ => CleanStep::Emit(c),
183 }
184 }
185
186 fn classify_escape(&self, chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> CleanStep {
188 if let Some(&next) = chars.peek() {
189 match next {
190 's' | 't' => {
191 chars.next();
192 CleanStep::EscapedWhitespace
193 }
194 'n' | 'r' => {
195 chars.next();
196 CleanStep::EscapedNewline
197 }
198 _ => CleanStep::Emit('\\'),
199 }
200 } else {
201 CleanStep::Emit('\\')
202 }
203 }
204
205 fn classify_citation(&self, chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> CleanStep {
207 let mut buf = vec!['['];
208 let mut is_citation = false;
209
210 while let Some(&next) = chars.peek() {
211 if next.is_ascii_digit() || next == ',' || next == '-' || next == ' ' {
212 buf.push(next);
213 chars.next();
214 } else if next == ']' && buf.len() > 1 && buf[1..].iter().any(|b| b.is_ascii_digit()) {
215 is_citation = true;
216 chars.next();
217 break;
218 } else {
219 break;
220 }
221 }
222
223 if is_citation {
224 let next_is_punctuation =
227 chars.peek().is_some_and(|&c| matches!(c, '.' | ',' | '?' | '!' | ':' | ';'));
228 CleanStep::CitationRemoved(next_is_punctuation)
229 } else {
230 CleanStep::ReplayNonCitation(buf)
231 }
232 }
233
234 fn handle_newline(&self, state: &mut CleanState, count: usize) {
236 state.consecutive_newlines += count;
237 state.last_was_space = false;
238 }
239
240 fn handle_whitespace(&self, state: &mut CleanState) {
242 if state.consecutive_newlines > 0 {
243 match self.newlines {
244 Newlines::Space => {
245 state.result.push(' ');
246 state.consecutive_newlines = 0;
247 state.last_was_space = true;
248 return;
249 }
250 Newlines::Single => {
251 state.result.push('\n');
252 state.consecutive_newlines = 0;
253 }
254 Newlines::TwoPlus => {
255 let count = state.consecutive_newlines.min(2);
256 for _ in 0..count {
257 state.result.push('\n');
258 }
259 state.consecutive_newlines = 0;
260 }
261 Newlines::None => {
262 for _ in 0..state.consecutive_newlines {
263 state.result.push('\n');
264 }
265 state.consecutive_newlines = 0;
266 }
267 }
268 }
269 if !state.last_was_space {
270 state.result.push(' ');
271 state.last_was_space = true;
272 }
273 }
274
275 fn handle_escaped_whitespace(&self, state: &mut CleanState) {
277 if !state.last_was_space && state.consecutive_newlines == 0 {
278 state.result.push(' ');
279 state.last_was_space = true;
280 }
281 }
282
283 fn emit_newlines(&self, state: &mut CleanState) {
285 if state.consecutive_newlines == 0 {
286 return;
287 }
288 match self.newlines {
289 Newlines::Space => {
290 state.result.push(' ');
291 }
292 Newlines::Single => {
293 state.result.push('\n');
294 }
295 Newlines::TwoPlus => {
296 let count = state.consecutive_newlines.min(2);
297 for _ in 0..count {
298 state.result.push('\n');
299 }
300 }
301 Newlines::None => {
302 for _ in 0..state.consecutive_newlines {
303 state.result.push('\n');
304 }
305 }
306 }
307 state.consecutive_newlines = 0;
308 }
309}
310
/// Returns `true` for characters that are kept when control-character stripping is on:
/// every non-control character, plus tab, line feed and carriage return.
fn is_valid_text_char(c: char) -> bool {
    matches!(c, '\t' | '\n' | '\r') || !c.is_control()
}
317
/// Final tidy-up of cleaned text.
///
/// Leading whitespace is removed, trailing spaces and tabs are removed, and whatever run
/// of `\n` / `\r` then ends the text is replaced by at most two `\n`.
fn trim_trailing_spaces(text: &str) -> String {
    let without_lead = text.trim_start();
    if without_lead.is_empty() {
        return String::new();
    }

    let core = without_lead.trim_end_matches(|c: char| c == ' ' || c == '\t');
    let body = core.trim_end_matches(|c: char| c == '\n' || c == '\r');

    // `\n` and `\r` are one byte each, so the byte difference is the number of breaks.
    let kept_breaks = (core.len() - body.len()).min(2);

    let mut cleaned = String::with_capacity(body.len() + kept_breaks);
    cleaned.push_str(body);
    cleaned.extend(std::iter::repeat('\n').take(kept_breaks));
    cleaned
}
340
341pub fn normalize_whitespace(text: &str) -> String {
343 TextCleaner::new().do_not_reduce_newlines().run(text)
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    /// Space mode: tabs, no-break spaces and line breaks all end up as single spaces,
    /// and the trailing line breaks disappear.
    #[test]
    fn test_clean_to_single_spaces() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n And\nof course, newlines.\n\n";
        let ascii_result = "Ascii spaces here. Unicode spaces here. And of course, newlines.";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_single_space().run(ascii_text),
            ascii_result
        );
    }

    /// Single mode: a double line break is reduced to one `\n`.
    #[test]
    fn test_clean_to_single_newlines() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\nAnd of course, newlines.\n\nCool.";
        let ascii_result =
            "Ascii spaces here. Unicode spaces here.\nAnd of course, newlines.\nCool.";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_single_newline().run(ascii_text),
            ascii_result
        );
    }

    /// TwoPlus mode: ASCII, CRLF, Unicode separator and literal-escape paragraph breaks
    /// all come out as exactly two `\n`.
    #[test]
    fn test_clean_to_double_newlines() {
        let ascii_text = "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n\nAscii\n\nparagraphs.\r\n\r\nUnicode\u{2029}paragraphs.\u{2029}\u{2028} Literal\\n\\nparagraphs.\\r\\n\\r\\n";
        let ascii_result = "Ascii spaces here. Unicode spaces here.\n\nAscii\n\nparagraphs.\n\nUnicode\n\nparagraphs.\n\n Literal\n\nparagraphs.\n\n";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_double_newline().run(ascii_text),
            ascii_result
        );
    }

    /// Control characters are dropped, while the no-break space and the paragraph
    /// separator are still normalized.
    #[test]
    fn test_strip_control_chars() {
        let text_with_controls = "Hello\x00World\x01Test\u{00A0}Normal\u{2029}End";
        let expected = "HelloWorldTest Normal\n\nEnd";
        assert_eq!(
            TextCleaner::new()
                .do_not_reduce_newlines()
                .remove_non_basic_ascii()
                .run(text_with_controls),
            expected
        );
    }

    /// Punctuation used in URLs and code survives control-character stripping.
    #[test]
    fn test_preserves_urls_and_code() {
        let text = "Visit https://example.com/path_to/file and run x = y + 1";
        let expected = "Visit https://example.com/path_to/file and run x = y + 1";
        assert_eq!(
            TextCleaner::new().do_not_reduce_newlines().remove_non_basic_ascii().run(text),
            expected
        );
    }

    /// Non-ASCII letters are kept even with `remove_non_basic_ascii` enabled.
    #[test]
    fn test_preserves_multilingual_text() {
        let text = "Hello 世界 Bonne année Привет";
        assert_eq!(
            TextCleaner::new().do_not_reduce_newlines().remove_non_basic_ascii().run(text),
            text
        );
    }

    /// `normalize_whitespace` collapses spaces and normalizes line-break characters
    /// without collapsing the breaks themselves.
    #[test]
    fn test_normalize_whitespace() {
        // Horizontal whitespace, including the literal `\s` and `\t` escapes.
        let ascii_text = "Ascii\tspaces here. Unicode\u{00A0}spaces here. Literal\\sspaces\\t.";
        let ascii_result = "Ascii spaces here. Unicode spaces here. Literal spaces .";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);

        // Single line breaks in ASCII, Unicode and literal-escape form.
        let ascii_text =
            "Ascii\nnewlines\n. Unicode\u{2028}newlines.\u{2028}. Literal\\nnewlines.\\n";
        let ascii_result = "Ascii\nnewlines\n. Unicode\nnewlines.\n. Literal\nnewlines.\n";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);

        // Paragraph breaks in ASCII, CRLF, Unicode and literal-escape form.
        let ascii_text = "Ascii\n\nparagraphs\r\n\r\n.Unicode\u{2029}paragraphs.\u{2029} Literal\\n\\nparagraphs.\\r\\n\\r\\n";
        let result = normalize_whitespace(ascii_text);
        let ascii_result =
            "Ascii\n\nparagraphs\n\n.Unicode\n\nparagraphs.\n\n Literal\n\nparagraphs.\n\n";
        assert_eq!(result, ascii_result);
    }

    /// Citations with lists and ranges are removed, and the space before a citation
    /// that precedes punctuation is dropped.
    #[test]
    fn test_remove_compound_citations() {
        let text = "Studies show this [1, 2] and also [3-5] plus [6, 7, 8].";
        let expected = "Studies show this and also plus.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }

    /// Purely numeric brackets count as citations; brackets containing words are kept.
    #[test]
    fn test_preserves_non_citation_brackets() {
        let text = "Array [1, 2, 3] and link [click here] are not citations.";
        let expected = "Array and link [click here] are not citations.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }

    /// Markdown link syntax is left untouched by citation removal.
    #[test]
    fn test_preserves_markdown_links() {
        let text = "See [this link](https://example.com) for details.";
        let expected = "See [this link](https://example.com) for details.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }
}