Skip to main content

textsanity_core/
lib.rs

1//! Pure-Rust core for `textsanity`. Configurable text cleanup before
2//! the input gets near a tokenizer or an LLM.
3//!
4//! Each operation is independent and toggleable via [`Options`]. The
5//! defaults reflect what most LLM-app builders actually want: NFKC,
6//! zero-width strip, control-char strip, whitespace collapse, trim.
7
8#![deny(unsafe_code)]
9#![warn(missing_docs)]
10#![warn(rust_2018_idioms)]
11
12use rayon::prelude::*;
13use serde::{Deserialize, Serialize};
14use thiserror::Error;
15use unicode_normalization::UnicodeNormalization;
16
17/// Crate-wide result alias.
18pub type Result<T> = std::result::Result<T, SanityError>;
19
20/// All errors surfaced by `textsanity-core`.
21#[derive(Error, Debug)]
22pub enum SanityError {
23    /// Reserved for future use; constructor is currently infallible.
24    #[error("invalid config: {0}")]
25    InvalidConfig(String),
26}
27
28/// Cleanup pipeline configuration.
29#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
30pub struct Options {
31    /// Apply NFKC unicode normalization.
32    pub nfkc: bool,
33    /// Strip zero-width code points (ZWSP, ZWJ, ZWNJ, BOM, RTL/LTR marks).
34    pub strip_zero_width: bool,
35    /// Strip control characters (C0 + C1) except `\n` and `\t`.
36    pub strip_control: bool,
37    /// Collapse runs of whitespace to a single space.
38    pub collapse_whitespace: bool,
39    /// Trim leading and trailing whitespace.
40    pub trim: bool,
41    /// Replace common smart punctuation (curly quotes, em/en dash, ellipsis)
42    /// with plain ASCII equivalents.
43    pub ascii_punctuation: bool,
44    /// Strip emoji and pictograph code points.
45    pub strip_emoji: bool,
46    /// Drop any non-ASCII character. Applied after `ascii_punctuation` so
47    /// smart punctuation gets converted, not deleted.
48    pub ascii_only: bool,
49}
50
51impl Default for Options {
52    fn default() -> Self {
53        Self {
54            nfkc: true,
55            strip_zero_width: true,
56            strip_control: true,
57            collapse_whitespace: true,
58            trim: true,
59            ascii_punctuation: false,
60            strip_emoji: false,
61            ascii_only: false,
62        }
63    }
64}
65
66impl Options {
67    /// Strict preset: every cleanup operation enabled. Use when feeding
68    /// untrusted text into a downstream that's sensitive to unicode tricks.
69    pub fn strict() -> Self {
70        Self {
71            nfkc: true,
72            strip_zero_width: true,
73            strip_control: true,
74            collapse_whitespace: true,
75            trim: true,
76            ascii_punctuation: true,
77            strip_emoji: true,
78            ascii_only: true,
79        }
80    }
81}
82
/// Rewrite every line ending as `\n`.
///
/// A `\r\n` pair becomes one `\n`, and a `\r` that is not followed by `\n`
/// also becomes `\n`. All other characters are copied through unchanged,
/// so running the function on its own output is a no-op.
pub fn normalize_newlines(text: &str) -> String {
    let raw = text.as_bytes();
    let mut result = String::with_capacity(text.len());
    let mut pos = 0;
    while let Some(&byte) = raw.get(pos) {
        match byte {
            b'\r' => {
                result.push('\n');
                // Consume the LF of a CRLF pair together with the CR.
                pos += if raw.get(pos + 1) == Some(&b'\n') { 2 } else { 1 };
            }
            0x00..=0x7F => {
                // Plain ASCII: the byte is the character.
                result.push(char::from(byte));
                pos += 1;
            }
            _ => {
                // Lead byte of a multi-byte sequence: copy the whole
                // character. Both indices are char boundaries because
                // `text` is valid UTF-8.
                let stop = next_char_boundary(raw, pos);
                result.push_str(&text[pos..stop]);
                pos = stop;
            }
        }
    }
    result
}

/// Index just past the character that begins at `start`: one byte for the
/// lead, plus every UTF-8 continuation byte (`0b10xx_xxxx`) that follows.
fn next_char_boundary(bytes: &[u8], start: usize) -> usize {
    let continuation_bytes = bytes
        .iter()
        .skip(start + 1)
        .take_while(|&&b| b & 0xC0 == 0x80)
        .count();
    start + 1 + continuation_bytes
}
123
124/// Run the cleanup pipeline against `text` with the given options.
125pub fn sanitize(text: &str, opts: &Options) -> String {
126    // 1. NFKC normalize. This is the only step that needs a separate pass.
127    let s: String = if opts.nfkc {
128        text.nfkc().collect()
129    } else {
130        text.to_string()
131    };
132
133    // 2. Char-level filter pass (zero-width, control, emoji, ascii-only,
134    //    smart-punctuation rewrite). Single allocation; bounded by len(s).
135    let mut out = String::with_capacity(s.len());
136    for c in s.chars() {
137        if opts.strip_zero_width && is_zero_width(c) {
138            continue;
139        }
140        if opts.strip_control && is_strippable_control(c) {
141            continue;
142        }
143        if opts.strip_emoji && is_emoji_codepoint(c) {
144            continue;
145        }
146        let mapped = if opts.ascii_punctuation {
147            map_smart_punctuation(c)
148        } else {
149            CharRewrite::Single(c)
150        };
151        match mapped {
152            CharRewrite::Single(ch) => {
153                if opts.ascii_only && !ch.is_ascii() {
154                    continue;
155                }
156                out.push(ch);
157            }
158            CharRewrite::Multi(s2) => {
159                if opts.ascii_only && !s2.is_ascii() {
160                    continue;
161                }
162                out.push_str(s2);
163            }
164            CharRewrite::Drop => {}
165        }
166    }
167
168    // 3. Whitespace collapse.
169    let s = if opts.collapse_whitespace {
170        let mut collapsed = String::with_capacity(out.len());
171        let mut prev_space = false;
172        for c in out.chars() {
173            if c.is_whitespace() {
174                if !prev_space {
175                    collapsed.push(' ');
176                }
177                prev_space = true;
178            } else {
179                collapsed.push(c);
180                prev_space = false;
181            }
182        }
183        collapsed
184    } else {
185        out
186    };
187
188    // 4. Trim.
189    if opts.trim {
190        s.trim().to_string()
191    } else {
192        s
193    }
194}
195
196/// Bulk variant. With `parallel = true`, distributes across rayon's pool.
197pub fn sanitize_many(texts: &[&str], opts: &Options, parallel: bool) -> Vec<String> {
198    if parallel {
199        texts.par_iter().map(|t| sanitize(t, opts)).collect()
200    } else {
201        texts.iter().map(|t| sanitize(t, opts)).collect()
202    }
203}
204
205// --- small helpers ---
206
/// True for invisible format characters that the zero-width strip removes:
/// zero-width spaces/joiners, directional marks, every explicit bidi
/// control (embeddings, overrides, and isolates), the word joiner, the
/// invisible math operators, and the BOM.
fn is_zero_width(c: char) -> bool {
    matches!(
        c,
        '\u{061C}' // Arabic letter mark
        | '\u{200B}'..='\u{200F}' // ZWSP, ZWNJ, ZWJ, LTR mark, RTL mark
        | '\u{202A}'..='\u{202E}' // LRE, RLE, PDF, LRO, RLO
        | '\u{2060}'..='\u{2064}' // word joiner, function application,
                                  // invisible times / separator / plus
        // Bidi isolates reorder text invisibly just like the
        // embeddings and overrides above, so they are stripped too.
        | '\u{2066}'..='\u{2069}' // LRI, RLI, FSI, PDI
        | '\u{FEFF}' // BOM
    )
}
228
/// True for control characters the control-strip step removes: the C0 range
/// (U+0000–U+001F), DEL (U+007F), and the C1 range (U+0080–U+009F), with
/// `\n` and `\t` exempted.
fn is_strippable_control(c: char) -> bool {
    // `char::is_control` is the Cc general category, which is exactly
    // U+0000–U+001F plus U+007F–U+009F.
    c.is_control() && c != '\n' && c != '\t'
}
236
/// True for code points the emoji strip removes: the pictograph/emoticon
/// blocks, regional indicators (the building blocks of flag emoji),
/// miscellaneous symbols, dingbats, the emoji variation selectors, and the
/// combining keycap.
///
/// This is a block-range approximation, not the Unicode `Emoji` property:
/// every code point in a listed range is treated as an emoji.
fn is_emoji_codepoint(c: char) -> bool {
    let cp = c as u32;
    matches!(
        cp,
        // Flags are pairs of regional indicators; without this range a
        // flag emoji passes through the strip untouched.
        0x1F1E6..=0x1F1FF    // regional indicator symbols
        | 0x1F300..=0x1F5FF  // miscellaneous symbols and pictographs
        | 0x1F600..=0x1F64F  // emoticons
        | 0x1F680..=0x1F6FF  // transport and map symbols
        | 0x1F700..=0x1F77F
        | 0x1F780..=0x1F7FF
        | 0x1F800..=0x1F8FF
        | 0x1F900..=0x1F9FF
        | 0x1FA00..=0x1FA6F
        | 0x1FA70..=0x1FAFF
        | 0x2600..=0x26FF    // miscellaneous symbols
        | 0x2700..=0x27BF    // dingbats
        | 0xFE0E..=0xFE0F    // variation selectors
        // Keycap emoji are digit + U+FE0F + U+20E3; dropping only the
        // selector would leave a stray combining mark on the digit.
        | 0x20E3             // combining enclosing keycap
    )
}
255
/// Result of rewriting a single input character.
enum CharRewrite {
    /// Emit exactly one character: the input itself or its replacement.
    Single(char),
    /// Emit a fixed multi-character replacement.
    Multi(&'static str),
    /// Emit nothing.
    Drop,
}

/// Map typographic punctuation to its plain-ASCII equivalent. Characters
/// with no mapping come back unchanged as `CharRewrite::Single(c)`.
fn map_smart_punctuation(c: char) -> CharRewrite {
    match c {
        // Single curly quotes, plus single guillemets to mirror the
        // double-guillemet rule below.
        '\u{2018}'..='\u{201B}' | '\u{2039}' | '\u{203A}' => CharRewrite::Single('\''),
        // Double curly quotes and double guillemets.
        '\u{201C}'..='\u{201F}' | '\u{00AB}' | '\u{00BB}' => CharRewrite::Single('"'),
        // Hyphen, non-breaking hyphen, figure dash, en dash, em dash,
        // horizontal bar, and the minus sign. The hyphens matter because
        // NFKC turns U+2011 into U+2010, which the ASCII-only filter would
        // otherwise delete and glue the surrounding words together.
        '\u{2010}'..='\u{2015}' | '\u{2212}' => CharRewrite::Single('-'),
        // Horizontal ellipsis.
        '\u{2026}' => CharRewrite::Multi("..."),
        // No-break space, figure space, narrow no-break space.
        '\u{00A0}' | '\u{2007}' | '\u{202F}' => CharRewrite::Single(' '),
        _ => CharRewrite::Single(c),
    }
}

// `Drop` is reserved for future use (e.g. dropping mid-text marks). Nothing
// produces it yet, so this helper constructs it once to keep the compiler
// from flagging the variant as never constructed.
#[allow(dead_code)]
fn _force_drop_referenced() -> CharRewrite {
    CharRewrite::Drop
}
280
#[cfg(test)]
mod tests {
    use super::*;

    // Non-ASCII fixtures are written as `\u{…}` escapes so that an editor,
    // formatter, or copy/paste step cannot silently normalize them and
    // turn a test into a no-op.

    /// The `fi` ligature (U+FB01) followed by "le".
    const LIGATURE_FILE: &str = "\u{FB01}le";
    /// "ABC123" written with fullwidth forms.
    const FULLWIDTH_ABC123: &str = "\u{FF21}\u{FF22}\u{FF23}\u{FF11}\u{FF12}\u{FF13}";
    /// Earth-globe emoji.
    const GLOBE: &str = "\u{1F30D}";
    /// Rocket emoji.
    const ROCKET: &str = "\u{1F680}";

    fn defaults() -> Options {
        Options::default()
    }

    #[test]
    fn strict_preset_enables_everything() {
        let s = Options::strict();
        assert!(s.nfkc);
        assert!(s.strip_zero_width);
        assert!(s.strip_control);
        assert!(s.collapse_whitespace);
        assert!(s.trim);
        assert!(s.ascii_punctuation);
        assert!(s.strip_emoji);
        assert!(s.ascii_only);
    }

    #[test]
    fn normalize_newlines_crlf_to_lf() {
        assert_eq!(normalize_newlines("a\r\nb\r\nc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_lone_cr_to_lf() {
        assert_eq!(normalize_newlines("a\rb\rc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_idempotent() {
        let once = normalize_newlines("a\r\nb\r\nc");
        let twice = normalize_newlines(&once);
        assert_eq!(once, twice);
    }

    #[test]
    fn normalize_newlines_preserves_unicode() {
        let input = format!("hi \u{4E16}\u{754C}\r\nbye {}", GLOBE);
        let expected = format!("hi \u{4E16}\u{754C}\nbye {}", GLOBE);
        assert_eq!(normalize_newlines(&input), expected);
    }

    #[test]
    fn strict_preset_strips_emoji_and_smart_quotes() {
        let input = format!("\u{201C}hi\u{201D} {} there", GLOBE);
        let r = sanitize(&input, &Options::strict());
        // Smart quotes -> ascii ", emoji removed, leftover gap collapsed.
        assert_eq!(r, "\"hi\" there");
    }

    #[test]
    fn defaults_collapse_and_trim() {
        let r = sanitize("  hello   world  ", &defaults());
        assert_eq!(r, "hello world");
    }

    #[test]
    fn nfkc_normalizes_ligature_and_fullwidth() {
        let input = format!("{} {}", LIGATURE_FILE, FULLWIDTH_ABC123);
        // Guard against the fixture itself having been normalized.
        assert!(!input.is_ascii());
        let r = sanitize(&input, &defaults());
        assert_eq!(r, "file ABC123");
    }

    #[test]
    fn zero_width_stripped() {
        let r = sanitize("hi\u{200B}there\u{FEFF}", &defaults());
        assert_eq!(r, "hithere");
    }

    #[test]
    fn control_stripped_and_newline_collapsed_by_default() {
        let r = sanitize("a\x01b\nc", &defaults());
        // \x01 is deleted outright. \n survives the control strip, but the
        // default whitespace collapse then turns it into a single space.
        assert_eq!(r, "ab c");
    }

    #[test]
    fn newline_preserved_when_collapse_off() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("a\nb", &opts);
        assert_eq!(r, "a\nb");
    }

    #[test]
    fn ascii_punctuation_replaces_smart_quotes() {
        let opts = Options {
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("\u{201C}hello\u{201D} \u{2014} world\u{2026}", &opts);
        assert_eq!(r, "\"hello\" - world...");
    }

    #[test]
    fn ascii_only_drops_non_ascii() {
        let opts = Options {
            ascii_only: true,
            ..Options::default()
        };
        let r = sanitize("hello \u{4E16}\u{754C} world", &opts);
        assert_eq!(r, "hello world");
    }

    #[test]
    fn ascii_only_with_punctuation_keeps_converted() {
        let opts = Options {
            ascii_only: true,
            ascii_punctuation: true,
            ..Options::default()
        };
        // Smart quote becomes ascii ", which survives ascii_only.
        let r = sanitize("\u{201C}hi\u{201D}", &opts);
        assert_eq!(r, "\"hi\"");
    }

    #[test]
    fn strip_emoji() {
        let opts = Options {
            strip_emoji: true,
            ..Options::default()
        };
        let input = format!("hi {} world {}", GLOBE, ROCKET);
        let r = sanitize(&input, &opts);
        assert_eq!(r, "hi world");
    }

    #[test]
    fn nfkc_off_preserves_ligature() {
        let opts = Options {
            nfkc: false,
            ..Options::default()
        };
        let r = sanitize(LIGATURE_FILE, &opts);
        assert_eq!(r, LIGATURE_FILE);
        assert!(!r.is_ascii());
    }

    #[test]
    fn empty_input_returns_empty() {
        assert_eq!(sanitize("", &defaults()), "");
    }

    #[test]
    fn collapse_off_keeps_runs() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("  hello   world  ", &opts);
        // Trim still runs: removes leading/trailing.
        assert_eq!(r, "hello   world");
    }

    #[test]
    fn trim_off_keeps_edges() {
        let opts = Options {
            trim: false,
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("  hi  ", &opts);
        assert_eq!(r, "  hi  ");
    }

    #[test]
    fn nbsp_replaced_with_space_when_ascii_punct() {
        let opts = Options {
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("a\u{00A0}b", &opts);
        // Non-breaking space becomes regular space, then collapse leaves one.
        assert_eq!(r, "a b");
    }

    #[test]
    fn sanitize_many_serial_and_parallel_match() {
        let texts: Vec<&str> = vec!["  hi  ", "world\u{FEFF}", LIGATURE_FILE];
        let opts = defaults();
        let s = sanitize_many(&texts, &opts, false);
        let p = sanitize_many(&texts, &opts, true);
        assert_eq!(s, p);
        assert_eq!(s, vec!["hi", "world", "file"]);
    }

    #[test]
    fn idempotent_on_clean_input() {
        let opts = defaults();
        let once = sanitize("hello world", &opts);
        let twice = sanitize(&once, &opts);
        assert_eq!(once, twice);
    }
}