1#![deny(unsafe_code)]
9#![warn(missing_docs)]
10#![warn(rust_2018_idioms)]
11
12use rayon::prelude::*;
13use serde::{Deserialize, Serialize};
14use thiserror::Error;
15use unicode_normalization::UnicodeNormalization;
16
/// Convenience alias for [`std::result::Result`] with [`SanityError`] as the error type.
pub type Result<T> = std::result::Result<T, SanityError>;
19
/// Error type carried by this crate's [`Result`] alias.
#[derive(Error, Debug)]
pub enum SanityError {
    /// Invalid configuration; the wrapped string is interpolated into the
    /// `invalid config: {0}` display message.
    #[error("invalid config: {0}")]
    InvalidConfig(String),
}
27
/// Switches selecting which cleanup steps [`sanitize`] applies.
///
/// The type is `Copy` and (de)serializable through serde, so it can be passed
/// around by value or loaded from configuration.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Options {
    /// Apply Unicode NFKC normalization before any other step.
    pub nfkc: bool,
    /// Remove the invisible formatting characters recognised by `is_zero_width`
    /// (zero-width spaces/joiners, directional marks and controls, the BOM, ...).
    pub strip_zero_width: bool,
    /// Remove control characters, except `'\n'` and `'\t'`, which are kept.
    pub strip_control: bool,
    /// Replace every run of whitespace (tabs and newlines included) with a
    /// single ASCII space.
    pub collapse_whitespace: bool,
    /// Strip leading and trailing whitespace from the final result.
    pub trim: bool,
    /// Rewrite typographic quotes, dashes, the ellipsis and non-breaking
    /// spaces to their ASCII counterparts.
    pub ascii_punctuation: bool,
    /// Remove code points that fall in the emoji ranges of `is_emoji_codepoint`.
    pub strip_emoji: bool,
    /// Drop every character that is still non-ASCII after the optional
    /// punctuation rewrite.
    pub ascii_only: bool,
}
50
51impl Default for Options {
52 fn default() -> Self {
53 Self {
54 nfkc: true,
55 strip_zero_width: true,
56 strip_control: true,
57 collapse_whitespace: true,
58 trim: true,
59 ascii_punctuation: false,
60 strip_emoji: false,
61 ascii_only: false,
62 }
63 }
64}
65
66impl Options {
67 pub fn strict() -> Self {
70 Self {
71 nfkc: true,
72 strip_zero_width: true,
73 strip_control: true,
74 collapse_whitespace: true,
75 trim: true,
76 ascii_punctuation: true,
77 strip_emoji: true,
78 ascii_only: true,
79 }
80 }
81}
82
/// Converts every line ending in `text` to a single `'\n'`.
///
/// Both `"\r\n"` pairs and lone `'\r'` characters become one `'\n'`; all other
/// characters, including multi-byte UTF-8 sequences, are copied unchanged.
pub fn normalize_newlines(text: &str) -> String {
    let raw = text.as_bytes();
    let mut normalized = String::with_capacity(raw.len());
    let mut pos = 0;

    while pos < raw.len() {
        match raw[pos] {
            b'\r' => {
                normalized.push('\n');
                // A CR immediately followed by LF is one line ending: skip both bytes.
                pos += if raw.get(pos + 1) == Some(&b'\n') { 2 } else { 1 };
            }
            byte if byte.is_ascii() => {
                normalized.push(char::from(byte));
                pos += 1;
            }
            _ => {
                // Lead byte of a multi-byte character: copy the whole sequence at once.
                let stop = next_char_boundary(raw, pos);
                normalized.push_str(&text[pos..stop]);
                pos = stop;
            }
        }
    }

    normalized
}

/// Returns the index just past the UTF-8 sequence whose first byte is at `start`.
///
/// Scans forward from `start + 1` over continuation bytes (`0b10xx_xxxx`) and
/// stops at the first byte that is not one, or at the end of `bytes`.
fn next_char_boundary(bytes: &[u8], start: usize) -> usize {
    let first_candidate = start + 1;
    let continuation_len = bytes
        .iter()
        .skip(first_candidate)
        .take_while(|&&b| b & 0xC0 == 0x80)
        .count();
    first_candidate + continuation_len
}
123
124pub fn sanitize(text: &str, opts: &Options) -> String {
126 let s: String = if opts.nfkc {
128 text.nfkc().collect()
129 } else {
130 text.to_string()
131 };
132
133 let mut out = String::with_capacity(s.len());
136 for c in s.chars() {
137 if opts.strip_zero_width && is_zero_width(c) {
138 continue;
139 }
140 if opts.strip_control && is_strippable_control(c) {
141 continue;
142 }
143 if opts.strip_emoji && is_emoji_codepoint(c) {
144 continue;
145 }
146 let mapped = if opts.ascii_punctuation {
147 map_smart_punctuation(c)
148 } else {
149 CharRewrite::Single(c)
150 };
151 match mapped {
152 CharRewrite::Single(ch) => {
153 if opts.ascii_only && !ch.is_ascii() {
154 continue;
155 }
156 out.push(ch);
157 }
158 CharRewrite::Multi(s2) => {
159 if opts.ascii_only && !s2.is_ascii() {
160 continue;
161 }
162 out.push_str(s2);
163 }
164 CharRewrite::Drop => {}
165 }
166 }
167
168 let s = if opts.collapse_whitespace {
170 let mut collapsed = String::with_capacity(out.len());
171 let mut prev_space = false;
172 for c in out.chars() {
173 if c.is_whitespace() {
174 if !prev_space {
175 collapsed.push(' ');
176 }
177 prev_space = true;
178 } else {
179 collapsed.push(c);
180 prev_space = false;
181 }
182 }
183 collapsed
184 } else {
185 out
186 };
187
188 if opts.trim {
190 s.trim().to_string()
191 } else {
192 s
193 }
194}
195
196pub fn sanitize_many(texts: &[&str], opts: &Options, parallel: bool) -> Vec<String> {
198 if parallel {
199 texts.par_iter().map(|t| sanitize(t, opts)).collect()
200 } else {
201 texts.iter().map(|t| sanitize(t, opts)).collect()
202 }
203}
204
/// Returns `true` for invisible formatting characters that carry no visible
/// glyph: zero-width spaces and joiners, bidirectional marks and controls,
/// the word joiner and invisible math operators, and the byte-order mark.
///
/// The bidi isolate controls (U+2066..=U+2069) are included alongside the
/// embedding/override controls (U+202A..=U+202E): both families are invisible
/// and can reorder the displayed text, so stripping only one of them would
/// leave the other available for the same kind of spoofing.
fn is_zero_width(c: char) -> bool {
    matches!(
        c,
        // ZWSP, ZWNJ, ZWJ, LRM, RLM
        '\u{200B}'..='\u{200F}'
            // LRE, RLE, PDF, LRO, RLO
            | '\u{202A}'..='\u{202E}'
            // word joiner, function application, invisible times/separator/plus
            | '\u{2060}'..='\u{2064}'
            // LRI, RLI, FSI, PDI
            | '\u{2066}'..='\u{2069}'
            // zero-width no-break space / byte-order mark
            | '\u{FEFF}'
    )
}
228
/// Returns `true` for control characters that should be removed.
///
/// That is every character in U+0000..=U+001F and U+007F..=U+009F (the
/// Unicode `Cc` category, which is what `char::is_control` tests), except
/// `'\n'` and `'\t'`, which are kept as meaningful whitespace. Note that
/// `'\r'` is *not* exempt and is therefore reported as strippable.
fn is_strippable_control(c: char) -> bool {
    c.is_control() && !matches!(c, '\n' | '\t')
}
236
/// Returns `true` when `c` lies in one of the code point ranges treated as emoji.
///
/// This is a block-based heuristic, not a full implementation of the Unicode
/// emoji property. Besides the pictographic blocks it also matches the pieces
/// that only occur as parts of emoji sequences, so that removing an emoji
/// does not leave fragments behind:
///
/// * regional indicator symbols (U+1F1E6..=U+1F1FF), which pair up into flags;
/// * the combining enclosing keycap (U+20E3) used by keycap sequences;
/// * variation selectors 15 and 16 (U+FE0E, U+FE0F).
fn is_emoji_codepoint(c: char) -> bool {
    matches!(
        c,
        // Regional indicator symbols (flag sequences)
        '\u{1F1E6}'..='\u{1F1FF}'
            // Miscellaneous Symbols and Pictographs
            | '\u{1F300}'..='\u{1F5FF}'
            // Emoticons
            | '\u{1F600}'..='\u{1F64F}'
            // Transport and Map Symbols
            | '\u{1F680}'..='\u{1F6FF}'
            // Alchemical Symbols
            | '\u{1F700}'..='\u{1F77F}'
            // Geometric Shapes Extended
            | '\u{1F780}'..='\u{1F7FF}'
            // Supplemental Arrows-C
            | '\u{1F800}'..='\u{1F8FF}'
            // Supplemental Symbols and Pictographs
            | '\u{1F900}'..='\u{1F9FF}'
            // Chess Symbols
            | '\u{1FA00}'..='\u{1FA6F}'
            // Symbols and Pictographs Extended-A
            | '\u{1FA70}'..='\u{1FAFF}'
            // Miscellaneous Symbols
            | '\u{2600}'..='\u{26FF}'
            // Dingbats
            | '\u{2700}'..='\u{27BF}'
            // Combining enclosing keycap
            | '\u{20E3}'
            // Variation selectors 15 (text) and 16 (emoji)
            | '\u{FE0E}'..='\u{FE0F}'
    )
}
255
/// What should be written to the output in place of one input character.
enum CharRewrite {
    /// Emit exactly one character (possibly the input itself).
    Single(char),
    /// Emit a fixed multi-character replacement string.
    Multi(&'static str),
    /// Emit nothing.
    Drop,
}
261
262fn map_smart_punctuation(c: char) -> CharRewrite {
263 match c {
264 '\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => CharRewrite::Single('\''),
265 '\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => CharRewrite::Single('"'),
266 '\u{2013}' | '\u{2014}' | '\u{2212}' => CharRewrite::Single('-'),
267 '\u{2026}' => CharRewrite::Multi("..."),
268 '\u{00A0}' | '\u{2007}' | '\u{202F}' => CharRewrite::Single(' '),
269 '\u{00AB}' | '\u{00BB}' => CharRewrite::Single('"'),
270 _ => CharRewrite::Single(c),
271 }
272}
273
// NOTE(review): nothing else visible in this file constructs `CharRewrite::Drop`
// (it is only matched on in `sanitize`), so this uncalled function presumably
// exists just to keep the variant "constructed" and silence the dead-code
// warning for it. Confirm before removing either the variant or this function.
#[allow(dead_code)]
fn _force_drop_referenced() -> CharRewrite {
    CharRewrite::Drop
}
280
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs whose meaning depends on a specific non-ASCII code point or on the
    // exact amount/kind of whitespace are written with escapes (`\u{..}`, `\t`,
    // `\n`). Literal ligatures, fullwidth letters or runs of spaces are easily
    // normalized away by editors and tooling, which would turn these tests into
    // no-ops that pass even when the feature under test is broken.

    // Shorthand for the default option set.
    fn defaults() -> Options {
        Options::default()
    }

    #[test]
    fn strict_preset_enables_everything() {
        let s = Options::strict();
        assert!(s.nfkc);
        assert!(s.strip_zero_width);
        assert!(s.strip_control);
        assert!(s.collapse_whitespace);
        assert!(s.trim);
        assert!(s.ascii_punctuation);
        assert!(s.strip_emoji);
        assert!(s.ascii_only);
    }

    #[test]
    fn normalize_newlines_crlf_to_lf() {
        assert_eq!(normalize_newlines("a\r\nb\r\nc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_lone_cr_to_lf() {
        assert_eq!(normalize_newlines("a\rb\rc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_idempotent() {
        let once = normalize_newlines("a\r\nb\r\nc");
        let twice = normalize_newlines(&once);
        assert_eq!(once, twice);
    }

    #[test]
    fn normalize_newlines_preserves_unicode() {
        // Two CJK characters (3 bytes each) and one emoji (4 bytes).
        assert_eq!(
            normalize_newlines("hi \u{4E16}\u{754C}\r\nbye \u{1F30D}"),
            "hi \u{4E16}\u{754C}\nbye \u{1F30D}"
        );
    }

    #[test]
    fn strict_preset_strips_emoji_and_smart_quotes() {
        let r = sanitize("\u{201C}hi\u{201D} \u{1F30D} there", &Options::strict());
        assert_eq!(r, "\"hi\" there");
    }

    #[test]
    fn defaults_collapse_and_trim() {
        // Mixed whitespace runs at both edges and in the middle.
        let r = sanitize("\t hello \t\n world \n", &defaults());
        assert_eq!(r, "hello world");
    }

    #[test]
    fn nfkc_normalizes_ligature_and_fullwidth() {
        // U+FB01 is the "fi" ligature; U+FF21.. / U+FF11.. are fullwidth "ABC" / "123".
        let r = sanitize(
            "\u{FB01}le \u{FF21}\u{FF22}\u{FF23}\u{FF11}\u{FF12}\u{FF13}",
            &defaults(),
        );
        assert_eq!(r, "file ABC123");
    }

    #[test]
    fn zero_width_stripped() {
        let r = sanitize("hi\u{200B}there\u{FEFF}", &defaults());
        assert_eq!(r, "hithere");
    }

    #[test]
    fn control_stripped_but_newline_preserved() {
        // The newline survives control stripping, then is collapsed to a space.
        let r = sanitize("a\x01b\nc", &defaults());
        assert_eq!(r, "ab c");
    }

    #[test]
    fn newline_preserved_when_collapse_off() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("a\nb", &opts);
        assert_eq!(r, "a\nb");
    }

    #[test]
    fn ascii_punctuation_replaces_smart_quotes() {
        let opts = Options {
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("\u{201C}hello\u{201D} \u{2014} world\u{2026}", &opts);
        assert_eq!(r, "\"hello\" - world...");
    }

    #[test]
    fn ascii_only_drops_non_ascii() {
        let opts = Options {
            ascii_only: true,
            ..Options::default()
        };
        let r = sanitize("hello \u{4E16}\u{754C} world", &opts);
        assert_eq!(r, "hello world");
    }

    #[test]
    fn ascii_only_with_punctuation_keeps_converted() {
        let opts = Options {
            ascii_only: true,
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("\u{201C}hi\u{201D}", &opts);
        assert_eq!(r, "\"hi\"");
    }

    #[test]
    fn strip_emoji() {
        let opts = Options {
            strip_emoji: true,
            ..Options::default()
        };
        let r = sanitize("hi \u{1F30D} world \u{1F680}", &opts);
        assert_eq!(r, "hi world");
    }

    #[test]
    fn nfkc_off_preserves_ligature() {
        let opts = Options {
            nfkc: false,
            ..Options::default()
        };
        let r = sanitize("\u{FB01}le", &opts);
        assert_eq!(r, "\u{FB01}le");
    }

    #[test]
    fn empty_input_returns_empty() {
        assert_eq!(sanitize("", &defaults()), "");
    }

    #[test]
    fn collapse_off_keeps_runs() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        // Edges are still trimmed, but the interior run must survive untouched.
        let r = sanitize("\t hello \t\t world \n", &opts);
        assert_eq!(r, "hello \t\t world");
    }

    #[test]
    fn trim_off_keeps_edges() {
        let opts = Options {
            trim: false,
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("\t hi \n", &opts);
        assert_eq!(r, "\t hi \n");
    }

    #[test]
    fn nbsp_replaced_with_space_when_ascii_punct() {
        let opts = Options {
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("a\u{00A0}b", &opts);
        assert_eq!(r, "a b");
    }

    #[test]
    fn sanitize_many_serial_and_parallel_match() {
        let texts: Vec<&str> = vec![" hi ", "world\u{FEFF}", "\u{FB01}le"];
        let opts = defaults();
        let s = sanitize_many(&texts, &opts, false);
        let p = sanitize_many(&texts, &opts, true);
        assert_eq!(s, p);
        assert_eq!(s, vec!["hi", "world", "file"]);
    }

    #[test]
    fn idempotent_on_clean_input() {
        let opts = defaults();
        let once = sanitize("hello world", &opts);
        let twice = sanitize(&once, &opts);
        assert_eq!(once, twice);
    }
}