Skip to main content

llm_text/
text.rs

/// How a run of consecutive newlines is written to the cleaned output.
///
/// Independently of the variant chosen, the final output has its leading
/// whitespace removed and its trailing newlines limited to two.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Newlines {
    /// Replace every run of newlines with a single space.
    Space,
    /// Replace every run of newlines with exactly one `'\n'`.
    Single,
    /// Keep a single newline as is and shrink longer runs to two `'\n'`
    /// (a paragraph break). This is the default.
    #[default]
    TwoPlus,
    /// Do not reduce: a run of `n` newlines is written as `n` `'\n'` characters.
    None,
}
9
/// Outcome of classifying one input character, including any lookahead
/// characters that were consumed while classifying it.
///
/// Values are produced by `TextCleaner::classify_char` and applied to the
/// cleaning state inside `TextCleaner::run`.
enum CleanStep {
    /// An ordinary character that should be written to the output (subject to
    /// the control-character filter when that option is enabled).
    Emit(char),
    /// A horizontal whitespace character (space, tab or a Unicode space
    /// separator) was encountered.
    Whitespace,
    /// A real newline sequence was encountered; the payload is how many
    /// newlines it adds to the pending run (`U+2029` counts as two).
    Newline(usize),
    /// A literal backslash escape for whitespace (`\s` or `\t`) was consumed.
    EscapedWhitespace,
    /// A literal backslash escape for a newline (`\n` or `\r`) was consumed;
    /// it adds one newline to the pending run.
    EscapedNewline,
    /// A citation was removed. If true, the next character is punctuation
    /// and we should remove the trailing space before it.
    CitationRemoved(bool),
    /// The bracket did not turn out to be a citation: the consumed characters
    /// (starting with `'['`) must be written back to the output unchanged.
    ReplayNonCitation(Vec<char>),
}
29
/// Mutable bookkeeping carried through one cleaning pass.
struct CleanState {
    /// Cleaned text produced so far.
    result: String,
    /// Newlines seen since the last write that have not been written yet.
    consecutive_newlines: usize,
    /// Whether the most recent thing written was a collapsed space.
    last_was_space: bool,
}

impl CleanState {
    /// Builds an empty state whose output buffer can hold `capacity` bytes
    /// before it has to grow.
    fn with_capacity(capacity: usize) -> Self {
        let result = String::with_capacity(capacity);
        Self { result, consecutive_newlines: 0, last_was_space: false }
    }
}
46
/// Configurable text cleaner; build one with `new()` plus the chained option
/// methods, then call `run`.
///
/// The derived default is `Newlines::TwoPlus` with both flags off.
#[derive(Default)]
pub struct TextCleaner {
    /// How runs of consecutive newlines are written to the output.
    pub newlines: Newlines,
    /// When true, `run` drops every character rejected by `is_valid_text_char`.
    /// NOTE: despite the name, that check only rejects control characters;
    /// non-ASCII text is kept.
    pub remove_non_basic_ascii: bool,
    /// When true, `run` removes bracketed numeric citations such as `[1]`,
    /// `[1, 2]` or `[3-5]`.
    pub remove_citations: bool,
}
53
54impl TextCleaner {
55    pub fn new() -> Self {
56        Self::default()
57    }
58
59    pub fn do_not_reduce_newlines(mut self) -> Self {
60        self.newlines = Newlines::None;
61        self
62    }
63
64    pub fn reduce_newlines_to_single_space(mut self) -> Self {
65        self.newlines = Newlines::Space;
66        self
67    }
68
69    pub fn reduce_newlines_to_single_newline(mut self) -> Self {
70        self.newlines = Newlines::Single;
71        self
72    }
73
74    pub fn reduce_newlines_to_double_newline(mut self) -> Self {
75        self.newlines = Newlines::TwoPlus;
76        self
77    }
78
79    pub fn remove_non_basic_ascii(mut self) -> Self {
80        self.remove_non_basic_ascii = true;
81        self
82    }
83
84    pub fn remove_citations(mut self) -> Self {
85        self.remove_citations = true;
86        self
87    }
88
89    /// Single-pass text cleaning with integrated citation removal, whitespace
90    /// collapsing, and newline normalization. Uses a blacklist approach for
91    /// character filtering to preserve all visible text including URLs, code,
92    /// and multilingual content.
93    pub fn run(&self, text: &str) -> String {
94        let mut state = CleanState::with_capacity(text.len());
95        let mut chars = text.chars().peekable();
96
97        while let Some(c) = chars.next() {
98            let step = self.classify_char(c, &mut chars);
99
100            match step {
101                CleanStep::Newline(count) => {
102                    self.handle_newline(&mut state, count);
103                }
104                CleanStep::Whitespace => {
105                    self.handle_whitespace(&mut state);
106                }
107                CleanStep::EscapedWhitespace => {
108                    self.handle_escaped_whitespace(&mut state);
109                }
110                CleanStep::EscapedNewline => {
111                    state.consecutive_newlines += 1;
112                    state.last_was_space = false;
113                }
114                CleanStep::CitationRemoved(remove_trailing_space) => {
115                    // Citation was already consumed in classify_char
116                    // If the next character is punctuation, remove the trailing space
117                    if remove_trailing_space && state.last_was_space && state.result.ends_with(' ')
118                    {
119                        state.result.pop();
120                        state.last_was_space = false;
121                    }
122                }
123                CleanStep::ReplayNonCitation(buf) => {
124                    self.emit_newlines(&mut state);
125                    for ch in buf {
126                        state.result.push(ch);
127                    }
128                    state.last_was_space = false;
129                }
130                CleanStep::Emit(ch) => {
131                    self.emit_newlines(&mut state);
132                    if !self.remove_non_basic_ascii || is_valid_text_char(ch) {
133                        state.result.push(ch);
134                    }
135                    state.last_was_space = false;
136                }
137            }
138        }
139
140        // Handle any trailing newlines
141        if state.consecutive_newlines > 0 {
142            self.emit_newlines(&mut state);
143        }
144
145        trim_trailing_spaces(&state.result)
146    }
147
148    /// Classify a character and return the appropriate cleaning step.
149    fn classify_char(
150        &self,
151        c: char,
152        chars: &mut std::iter::Peekable<std::str::Chars<'_>>,
153    ) -> CleanStep {
154        match c {
155            // Handle various newline sequences
156            '\r' => {
157                if chars.peek() == Some(&'\n') {
158                    chars.next();
159                }
160                CleanStep::Newline(1)
161            }
162            '\n' | '\x0B' | '\x0C' | '\u{2028}' => CleanStep::Newline(1),
163            '\u{2029}' => CleanStep::Newline(2),
164
165            // Handle various whitespace characters
166            ' ' |
167            '\t' |
168            '\u{00A0}' |
169            '\u{1680}' |
170            '\u{2000}'..='\u{200A}' |
171            '\u{202F}' |
172            '\u{205F}' |
173            '\u{3000}' => CleanStep::Whitespace,
174
175            // Handle escaped whitespace/newline sequences
176            '\\' => self.classify_escape(chars),
177
178            // Handle citations
179            '[' if self.remove_citations => self.classify_citation(chars),
180
181            // Regular character
182            _ => CleanStep::Emit(c),
183        }
184    }
185
186    /// Classify an escape sequence after backslash.
187    fn classify_escape(&self, chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> CleanStep {
188        if let Some(&next) = chars.peek() {
189            match next {
190                's' | 't' => {
191                    chars.next();
192                    CleanStep::EscapedWhitespace
193                }
194                'n' | 'r' => {
195                    chars.next();
196                    CleanStep::EscapedNewline
197                }
198                _ => CleanStep::Emit('\\'),
199            }
200        } else {
201            CleanStep::Emit('\\')
202        }
203    }
204
205    /// Classify a potential citation starting with '['.
206    fn classify_citation(&self, chars: &mut std::iter::Peekable<std::str::Chars<'_>>) -> CleanStep {
207        let mut buf = vec!['['];
208        let mut is_citation = false;
209
210        while let Some(&next) = chars.peek() {
211            if next.is_ascii_digit() || next == ',' || next == '-' || next == ' ' {
212                buf.push(next);
213                chars.next();
214            } else if next == ']' && buf.len() > 1 && buf[1..].iter().any(|b| b.is_ascii_digit()) {
215                is_citation = true;
216                chars.next();
217                break;
218            } else {
219                break;
220            }
221        }
222
223        if is_citation {
224            // Check if the next character is punctuation - if so, we should
225            // remove the trailing space before it
226            let next_is_punctuation =
227                chars.peek().is_some_and(|&c| matches!(c, '.' | ',' | '?' | '!' | ':' | ';'));
228            CleanStep::CitationRemoved(next_is_punctuation)
229        } else {
230            CleanStep::ReplayNonCitation(buf)
231        }
232    }
233
234    /// Handle a newline character by updating the consecutive newline count.
235    fn handle_newline(&self, state: &mut CleanState, count: usize) {
236        state.consecutive_newlines += count;
237        state.last_was_space = false;
238    }
239
240    /// Handle a whitespace character, emitting pending newlines if needed.
241    fn handle_whitespace(&self, state: &mut CleanState) {
242        if state.consecutive_newlines > 0 {
243            match self.newlines {
244                Newlines::Space => {
245                    state.result.push(' ');
246                    state.consecutive_newlines = 0;
247                    state.last_was_space = true;
248                    return;
249                }
250                Newlines::Single => {
251                    state.result.push('\n');
252                    state.consecutive_newlines = 0;
253                }
254                Newlines::TwoPlus => {
255                    let count = state.consecutive_newlines.min(2);
256                    for _ in 0..count {
257                        state.result.push('\n');
258                    }
259                    state.consecutive_newlines = 0;
260                }
261                Newlines::None => {
262                    for _ in 0..state.consecutive_newlines {
263                        state.result.push('\n');
264                    }
265                    state.consecutive_newlines = 0;
266                }
267            }
268        }
269        if !state.last_was_space {
270            state.result.push(' ');
271            state.last_was_space = true;
272        }
273    }
274
275    /// Handle an escaped whitespace sequence (e.g., \s, \t).
276    fn handle_escaped_whitespace(&self, state: &mut CleanState) {
277        if !state.last_was_space && state.consecutive_newlines == 0 {
278            state.result.push(' ');
279            state.last_was_space = true;
280        }
281    }
282
283    /// Emit accumulated newlines to the result buffer.
284    fn emit_newlines(&self, state: &mut CleanState) {
285        if state.consecutive_newlines == 0 {
286            return;
287        }
288        match self.newlines {
289            Newlines::Space => {
290                state.result.push(' ');
291            }
292            Newlines::Single => {
293                state.result.push('\n');
294            }
295            Newlines::TwoPlus => {
296                let count = state.consecutive_newlines.min(2);
297                for _ in 0..count {
298                    state.result.push('\n');
299                }
300            }
301            Newlines::None => {
302                for _ in 0..state.consecutive_newlines {
303                    state.result.push('\n');
304                }
305            }
306        }
307        state.consecutive_newlines = 0;
308    }
309}
310
/// Decides whether `c` survives control-character filtering.
///
/// Tab, line feed and carriage return are always accepted. Any other character
/// for which [`char::is_control`] holds is rejected; note that this check is
/// not limited to the ASCII range. All remaining characters are accepted.
fn is_valid_text_char(c: char) -> bool {
    match c {
        '\t' | '\n' | '\r' => true,
        other => !other.is_control(),
    }
}
317
/// Final tidy-up of cleaned text.
///
/// Removes all leading whitespace, removes trailing spaces and tabs, and then
/// rewrites the run of `'\n'`/`'\r'` characters at the very end as at most two
/// `'\n'`. Returns an empty string when nothing but whitespace is left.
fn trim_trailing_spaces(text: &str) -> String {
    let core = text.trim_start().trim_end_matches([' ', '\t']);

    // '\n' and '\r' are one byte each in UTF-8, so the byte-length difference
    // equals the number of trailing line-break characters.
    let body = core.trim_end_matches(['\n', '\r']);
    let newline_count = core.len() - body.len();
    if newline_count == 0 {
        return core.to_string();
    }

    let kept = newline_count.min(2);
    let mut out = String::with_capacity(body.len() + kept);
    out.push_str(body);
    out.push_str(&"\n\n"[..kept]);
    out
}
340
341/// Normalize whitespace in text using single-pass processing
342pub fn normalize_whitespace(text: &str) -> String {
343    TextCleaner::new().do_not_reduce_newlines().run(text)
344}
345
#[cfg(test)]
mod tests {
    use super::*;

    // Space mode: tabs, NBSP and newline runs all end up as single spaces, and
    // the trailing newlines disappear with the final trim.
    #[test]
    fn test_clean_to_single_spaces() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n And\nof course, newlines.\n\n";
        let ascii_result = "Ascii spaces here. Unicode spaces here. And of course, newlines.";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_single_space().run(ascii_text),
            ascii_result
        );
    }

    // Single mode: a double newline in the middle of the text becomes one.
    #[test]
    fn test_clean_to_single_newlines() {
        let ascii_text =
            "Ascii\tspaces here. Unicode\u{00A0}spaces here.\nAnd of course, newlines.\n\nCool.";
        let ascii_result =
            "Ascii spaces here. Unicode spaces here.\nAnd of course, newlines.\nCool.";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_single_newline().run(ascii_text),
            ascii_result
        );
    }

    // TwoPlus mode: LF, CRLF, U+2029/U+2028 and literal `\n` / `\r\n` escapes
    // all normalize to paragraph breaks of exactly two newlines.
    #[test]
    fn test_clean_to_double_newlines() {
        let ascii_text = "Ascii\tspaces here. Unicode\u{00A0}spaces here.\n\nAscii\n\nparagraphs.\r\n\r\nUnicode\u{2029}paragraphs.\u{2029}\u{2028} Literal\\n\\nparagraphs.\\r\\n\\r\\n";
        let ascii_result = "Ascii spaces here. Unicode spaces here.\n\nAscii\n\nparagraphs.\n\nUnicode\n\nparagraphs.\n\n Literal\n\nparagraphs.\n\n";
        assert_eq!(
            TextCleaner::new().reduce_newlines_to_double_newline().run(ascii_text),
            ascii_result
        );
    }

    #[test]
    fn test_strip_control_chars() {
        // Blacklist approach: only control chars are removed, all visible text
        // including multilingual content is preserved
        let text_with_controls = "Hello\x00World\x01Test\u{00A0}Normal\u{2029}End";
        let expected = "HelloWorldTest Normal\n\nEnd";
        assert_eq!(
            TextCleaner::new()
                .do_not_reduce_newlines()
                .remove_non_basic_ascii()
                .run(text_with_controls),
            expected
        );
    }

    // The control-character filter must not touch URL or code punctuation.
    #[test]
    fn test_preserves_urls_and_code() {
        let text = "Visit https://example.com/path_to/file and run x = y + 1";
        let expected = "Visit https://example.com/path_to/file and run x = y + 1";
        assert_eq!(
            TextCleaner::new().do_not_reduce_newlines().remove_non_basic_ascii().run(text),
            expected
        );
    }

    // Despite the option's name, non-ASCII scripts pass through unchanged.
    #[test]
    fn test_preserves_multilingual_text() {
        let text = "Hello 世界 Bonne année Привет";
        assert_eq!(
            TextCleaner::new().do_not_reduce_newlines().remove_non_basic_ascii().run(text),
            text
        );
    }

    // `normalize_whitespace` keeps newline counts (apart from the trailing
    // clamp to two) while collapsing spaces and decoding literal escapes.
    #[test]
    fn test_normalize_whitespace() {
        let ascii_text = "Ascii\tspaces here. Unicode\u{00A0}spaces here. Literal\\sspaces\\t.";
        let ascii_result = "Ascii spaces here. Unicode spaces here. Literal spaces .";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);

        let ascii_text =
            "Ascii\nnewlines\n. Unicode\u{2028}newlines.\u{2028}. Literal\\nnewlines.\\n";
        let ascii_result = "Ascii\nnewlines\n. Unicode\nnewlines.\n. Literal\nnewlines.\n";
        assert_eq!(normalize_whitespace(ascii_text), ascii_result);

        let ascii_text = "Ascii\n\nparagraphs\r\n\r\n.Unicode\u{2029}paragraphs.\u{2029} Literal\\n\\nparagraphs.\\r\\n\\r\\n";
        let result = normalize_whitespace(ascii_text);
        let ascii_result =
            "Ascii\n\nparagraphs\n\n.Unicode\n\nparagraphs.\n\n Literal\n\nparagraphs.\n\n";
        assert_eq!(result, ascii_result);
    }

    // Lists and ranges of numbers are citations; the space before a citation
    // that is followed by punctuation is removed as well.
    #[test]
    fn test_remove_compound_citations() {
        let text = "Studies show this [1, 2] and also [3-5] plus [6, 7, 8].";
        let expected = "Studies show this and also plus.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }

    // `[1, 2, 3]` contains only digits and separators, so it is removed like a
    // citation; `[click here]` contains other text and is kept.
    #[test]
    fn test_preserves_non_citation_brackets() {
        let text = "Array [1, 2, 3] and link [click here] are not citations.";
        let expected = "Array and link [click here] are not citations.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }

    // Markdown link text is not numeric, so the whole link is left intact.
    #[test]
    fn test_preserves_markdown_links() {
        let text = "See [this link](https://example.com) for details.";
        let expected = "See [this link](https://example.com) for details.";
        assert_eq!(TextCleaner::new().remove_citations().run(text), expected);
    }
}