//! Text sanitization helpers.
//!
//! Provides newline normalization and a configurable [`sanitize`] pipeline
//! (NFKC normalization, stripping of invisible, control and emoji characters,
//! ASCII punctuation folding, whitespace collapsing and trimming), plus a
//! batch variant that can optionally run on rayon's parallel iterators.
#![deny(unsafe_code)]
#![warn(missing_docs)]
#![warn(rust_2018_idioms)]
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use unicode_normalization::UnicodeNormalization;
/// Convenience alias for results whose error type is [`SanityError`].
pub type Result<T> = std::result::Result<T, SanityError>;
/// Error type carried by this module's `Result` alias.
#[derive(Error, Debug)]
pub enum SanityError {
/// A configuration value was rejected. The payload is the free-form reason
/// that is appended to the `invalid config: ` display message.
// NOTE(review): nothing in the visible part of this file constructs this
// variant — confirm where configuration is actually validated.
#[error("invalid config: {0}")]
InvalidConfig(String),
}
/// Switches selecting which steps [`sanitize`] performs.
///
/// Every field is an independent on/off flag. See [`Options::default`] and
/// [`Options::strict`] for the two ready-made presets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct Options {
/// Apply Unicode NFKC normalization to the input before any other step.
pub nfkc: bool,
/// Remove zero-width characters and bidirectional formatting controls
/// (for example U+200B, U+200E, U+202E and U+FEFF).
pub strip_zero_width: bool,
/// Remove C0/C1 control characters and DEL, keeping `\n` and `\t`.
pub strip_control: bool,
/// Replace every run of whitespace (newlines and tabs included) with a
/// single ASCII space.
pub collapse_whitespace: bool,
/// Remove leading and trailing whitespace from the final result.
pub trim: bool,
/// Rewrite typographic punctuation (curly quotes, guillemets, dashes, the
/// ellipsis, no-break spaces) to ASCII equivalents.
pub ascii_punctuation: bool,
/// Remove characters in the emoji/pictograph code-point ranges, together
/// with the variation selectors U+FE0E and U+FE0F.
pub strip_emoji: bool,
/// Drop every character that is still non-ASCII after the other rewrites.
pub ascii_only: bool,
}
impl Default for Options {
    /// Returns the conservative preset.
    ///
    /// Normalization, invisible/control character stripping, whitespace
    /// collapsing and trimming are enabled; the steps that rewrite or discard
    /// visible characters (`ascii_punctuation`, `strip_emoji`, `ascii_only`)
    /// are disabled and must be opted into.
    fn default() -> Self {
        // Two named values make the split between the two groups explicit.
        let (cleanup, lossy) = (true, false);
        Self {
            nfkc: cleanup,
            strip_zero_width: cleanup,
            strip_control: cleanup,
            collapse_whitespace: cleanup,
            trim: cleanup,
            ascii_punctuation: lossy,
            strip_emoji: lossy,
            ascii_only: lossy,
        }
    }
}
impl Options {
    /// Returns the most aggressive preset, with every sanitization step enabled.
    ///
    /// This is the [`Default`] preset plus the three opt-in steps:
    /// `ascii_punctuation`, `strip_emoji` and `ascii_only`.
    pub fn strict() -> Self {
        Self {
            ascii_punctuation: true,
            strip_emoji: true,
            ascii_only: true,
            // Every remaining flag is already enabled by the default preset.
            ..Self::default()
        }
    }
}
/// Converts every `\r\n` pair and every lone `\r` in `text` to a single `\n`.
///
/// All other characters, including multi-byte UTF-8 sequences, are copied to
/// the output unchanged.
pub fn normalize_newlines(text: &str) -> String {
    let raw = text.as_bytes();
    let mut normalized = String::with_capacity(raw.len());
    let mut pos = 0;
    while let Some(&byte) = raw.get(pos) {
        if byte == b'\r' {
            normalized.push('\n');
            // A line feed right after the carriage return belongs to the same
            // line ending, so it is consumed together with it.
            pos += if raw.get(pos + 1) == Some(&b'\n') { 2 } else { 1 };
        } else {
            // Copy one whole character. For ASCII this is a single byte,
            // because valid UTF-8 never places a continuation byte after it.
            let stop = next_char_boundary(raw, pos);
            normalized.push_str(&text[pos..stop]);
            pos = stop;
        }
    }
    normalized
}

/// Returns the index just past the UTF-8 sequence that begins at `start`.
///
/// The byte at `start` is always counted; every directly following
/// continuation byte (bit pattern `10xxxxxx`) is counted as well.
fn next_char_boundary(bytes: &[u8], start: usize) -> usize {
    let first_tail = start + 1;
    let continuation_len = bytes
        .iter()
        .skip(first_tail)
        .take_while(|&&b| b & 0xC0 == 0x80)
        .count();
    first_tail + continuation_len
}
/// Cleans `text` according to `opts` and returns the result as a new `String`.
///
/// The enabled steps always run in this order:
///
/// 1. NFKC normalization (`nfkc`).
/// 2. A per-character pass that drops zero-width/bidi formatting characters
///    (`strip_zero_width`), control characters other than `\n` and `\t`
///    (`strip_control`) and emoji (`strip_emoji`), then folds typographic
///    punctuation to ASCII (`ascii_punctuation`) and finally discards whatever
///    is still non-ASCII (`ascii_only`).
/// 3. Whitespace collapsing (`collapse_whitespace`): each run of whitespace,
///    newlines and tabs included, becomes one ASCII space.
/// 4. Trimming of leading and trailing whitespace (`trim`).
pub fn sanitize(text: &str, opts: &Options) -> String {
    // Step 1: optional compatibility normalization.
    let normalized: String = if opts.nfkc {
        text.nfkc().collect()
    } else {
        text.to_owned()
    };

    // Step 2: strip unwanted characters, then rewrite the survivors.
    let is_unwanted = |c: char| {
        (opts.strip_zero_width && is_zero_width(c))
            || (opts.strip_control && is_strippable_control(c))
            || (opts.strip_emoji && is_emoji_codepoint(c))
    };
    let mut cleaned = String::with_capacity(normalized.len());
    for c in normalized.chars().filter(|&c| !is_unwanted(c)) {
        let rewrite = if opts.ascii_punctuation {
            map_smart_punctuation(c)
        } else {
            CharRewrite::Single(c)
        };
        match rewrite {
            CharRewrite::Single(ch) if !opts.ascii_only || ch.is_ascii() => cleaned.push(ch),
            CharRewrite::Multi(rep) if !opts.ascii_only || rep.is_ascii() => {
                cleaned.push_str(rep);
            }
            // Either rejected by `ascii_only` or an explicit drop: emit nothing.
            CharRewrite::Single(_) | CharRewrite::Multi(_) | CharRewrite::Drop => {}
        }
    }

    // Step 3: squeeze each whitespace run down to one ASCII space.
    let collapsed = if opts.collapse_whitespace {
        let mut squeezed = String::with_capacity(cleaned.len());
        let mut in_run = false;
        for c in cleaned.chars() {
            let is_space = c.is_whitespace();
            if !is_space {
                squeezed.push(c);
            } else if !in_run {
                squeezed.push(' ');
            }
            in_run = is_space;
        }
        squeezed
    } else {
        cleaned
    };

    // Step 4: optional trimming of both ends.
    if !opts.trim {
        return collapsed;
    }
    collapsed.trim().to_owned()
}
/// Runs [`sanitize`] on every entry of `texts` with the same `opts`.
///
/// The returned vector has one cleaned string per input, in input order.
/// When `parallel` is `true` the work is distributed through rayon's
/// `par_iter`; otherwise the inputs are processed sequentially on the calling
/// thread.
pub fn sanitize_many(texts: &[&str], opts: &Options, parallel: bool) -> Vec<String> {
    if !parallel {
        return texts.iter().map(|text| sanitize(text, opts)).collect();
    }
    texts.par_iter().map(|text| sanitize(text, opts)).collect()
}
/// Returns `true` for invisible formatting characters that `sanitize` removes
/// when `strip_zero_width` is enabled.
///
/// Despite the name, the set covers more than zero-width spaces and joiners:
///
/// * U+200B–U+200D: zero-width space, non-joiner and joiner
/// * U+2060–U+2064: word joiner and invisible math operators
/// * U+FEFF: zero-width no-break space / byte-order mark
/// * every Unicode `Bidi_Control` character: U+061C (Arabic letter mark),
///   U+200E/U+200F (LRM/RLM), U+202A–U+202E (embeddings and overrides) and
///   U+2066–U+2069 (isolates)
///
/// The isolates and U+061C are as invisible as the embeddings and overrides
/// and can reorder displayed text in the same way, so the whole
/// `Bidi_Control` set is treated uniformly.
fn is_zero_width(c: char) -> bool {
    matches!(
        c,
        '\u{061C}'
            | '\u{200B}'..='\u{200F}'
            | '\u{202A}'..='\u{202E}'
            | '\u{2060}'..='\u{2064}'
            | '\u{2066}'..='\u{2069}'
            | '\u{FEFF}'
    )
}
/// Returns `true` for control characters that should be removed.
///
/// Covers the C0 range (U+0000–U+001F), DEL (U+007F) and the C1 range
/// (U+0080–U+009F). Line feed and horizontal tab are deliberately exempt, so
/// basic text layout survives. Carriage return is not exempt.
fn is_strippable_control(c: char) -> bool {
    match c {
        // Layout characters that are kept. This arm must precede the C0 range.
        '\n' | '\t' => false,
        '\u{0000}'..='\u{001F}' | '\u{007F}'..='\u{009F}' => true,
        _ => false,
    }
}
/// Returns `true` when `c` lies in one of the code-point ranges that
/// `sanitize` treats as emoji when `strip_emoji` is enabled.
///
/// This is a block-level heuristic, not an implementation of the Unicode
/// `Emoji` property: whole symbol blocks are matched, so some non-emoji
/// symbols inside them (e.g. alchemical or chess symbols) are matched too.
///
/// Regional-indicator symbols and the combining keycap are included so that
/// flag sequences are removed and keycap sequences do not leave a dangling
/// combining mark behind once their variation selector has been stripped.
fn is_emoji_codepoint(c: char) -> bool {
    let cp = c as u32;
    matches!(
        cp,
        0x1F1E6..=0x1F1FF       // regional indicator symbols (flag pairs)
            | 0x1F300..=0x1F5FF // Miscellaneous Symbols and Pictographs
            | 0x1F600..=0x1F64F // Emoticons
            | 0x1F680..=0x1F6FF // Transport and Map Symbols
            | 0x1F700..=0x1F77F // Alchemical Symbols
            | 0x1F780..=0x1F7FF // Geometric Shapes Extended
            | 0x1F800..=0x1F8FF // Supplemental Arrows-C
            | 0x1F900..=0x1F9FF // Supplemental Symbols and Pictographs
            | 0x1FA00..=0x1FA6F // Chess Symbols
            | 0x1FA70..=0x1FAFF // Symbols and Pictographs Extended-A
            | 0x2600..=0x26FF   // Miscellaneous Symbols
            | 0x2700..=0x27BF   // Dingbats
            | 0x20E3            // combining enclosing keycap
            | 0xFE0E..=0xFE0F   // text / emoji variation selectors
    )
}
/// Result of rewriting a single input character.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharRewrite {
    /// Emit exactly one character (possibly the input itself).
    Single(char),
    /// Emit a fixed multi-character replacement.
    Multi(&'static str),
    /// Emit nothing. No mapping in this file currently produces this variant.
    Drop,
}

/// Maps typographic punctuation to its closest ASCII form.
///
/// Characters without a mapping are returned unchanged as
/// `CharRewrite::Single(c)`. The mappings are:
///
/// * single curly/low/reversed quotes (U+2018–U+201B) → `'`
/// * double curly/low/reversed quotes (U+201C–U+201F) and guillemets
///   (U+00AB, U+00BB) → `"`
/// * hyphens and dashes U+2010–U+2015 and the minus sign U+2212 → `-`
/// * horizontal ellipsis (U+2026) → `...`
/// * no-break, figure and narrow no-break spaces → ASCII space
///
/// U+2010 HYPHEN, U+2011 NON-BREAKING HYPHEN, U+2012 FIGURE DASH and U+2015
/// HORIZONTAL BAR are folded alongside the en/em dashes. Without that a
/// typographic hyphen would be deleted outright by `ascii_only`, silently
/// joining the two halves of a hyphenated word.
fn map_smart_punctuation(c: char) -> CharRewrite {
    match c {
        '\u{2018}'..='\u{201B}' => CharRewrite::Single('\''),
        '\u{201C}'..='\u{201F}' | '\u{00AB}' | '\u{00BB}' => CharRewrite::Single('"'),
        '\u{2010}'..='\u{2015}' | '\u{2212}' => CharRewrite::Single('-'),
        '\u{2026}' => CharRewrite::Multi("..."),
        '\u{00A0}' | '\u{2007}' | '\u{202F}' => CharRewrite::Single(' '),
        _ => CharRewrite::Single(c),
    }
}
// Constructs `CharRewrite::Drop`, which `map_smart_punctuation` never returns.
// Presumably this exists only so the otherwise unconstructed variant does not
// trigger the dead-code lint; the function itself is never called in the
// visible code, hence the `allow`.
// NOTE(review): if `Drop` stays unused, consider removing the variant and this
// function together instead of keeping the workaround.
#[allow(dead_code)]
fn _force_drop_referenced() -> CharRewrite {
CharRewrite::Drop
}
// Unit tests for the presets, `normalize_newlines`, `sanitize` and
// `sanitize_many`.
//
// Fixtures spell every non-ASCII character and every significant whitespace
// run with explicit escapes or visibly repeated spaces, so that editors and
// text-normalizing tools cannot silently turn them into plain ASCII and make
// the tests pass without exercising the behavior they are named after.
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the default option set used by most tests.
    fn defaults() -> Options {
        Options::default()
    }

    #[test]
    fn strict_preset_enables_everything() {
        let s = Options::strict();
        assert!(s.nfkc);
        assert!(s.strip_zero_width);
        assert!(s.strip_control);
        assert!(s.collapse_whitespace);
        assert!(s.trim);
        assert!(s.ascii_punctuation);
        assert!(s.strip_emoji);
        assert!(s.ascii_only);
    }

    #[test]
    fn normalize_newlines_crlf_to_lf() {
        assert_eq!(normalize_newlines("a\r\nb\r\nc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_lone_cr_to_lf() {
        assert_eq!(normalize_newlines("a\rb\rc"), "a\nb\nc");
    }

    #[test]
    fn normalize_newlines_idempotent() {
        let once = normalize_newlines("a\r\nb\r\nc");
        let twice = normalize_newlines(&once);
        assert_eq!(once, twice);
    }

    #[test]
    fn normalize_newlines_preserves_unicode() {
        assert_eq!(
            normalize_newlines("hi \u{4E16}\u{754C}\r\nbye \u{1F30D}"),
            "hi \u{4E16}\u{754C}\nbye \u{1F30D}"
        );
    }

    #[test]
    fn strict_preset_strips_emoji_and_smart_quotes() {
        // Removing the emoji leaves two adjacent spaces, which then collapse.
        let r = sanitize("\u{201C}hi\u{201D} \u{1F30D} there", &Options::strict());
        assert_eq!(r, "\"hi\" there");
    }

    #[test]
    fn defaults_collapse_and_trim() {
        let r = sanitize("  hello    world  ", &defaults());
        assert_eq!(r, "hello world");
    }

    #[test]
    fn nfkc_normalizes_ligature_and_fullwidth() {
        // U+FB01 is the "fi" ligature; U+FF21.. and U+FF11.. are fullwidth
        // letters and digits.
        let r = sanitize(
            "\u{FB01}le \u{FF21}\u{FF22}\u{FF23}\u{FF11}\u{FF12}\u{FF13}",
            &defaults(),
        );
        assert_eq!(r, "file ABC123");
    }

    #[test]
    fn zero_width_stripped() {
        let r = sanitize("hi\u{200B}there\u{FEFF}", &defaults());
        assert_eq!(r, "hithere");
    }

    #[test]
    fn control_stripped_but_newline_preserved() {
        // The newline survives control stripping; it shows up as a space here
        // only because the defaults also collapse whitespace.
        let r = sanitize("a\x01b\nc", &defaults());
        assert_eq!(r, "ab c");
    }

    #[test]
    fn newline_preserved_when_collapse_off() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("a\nb", &opts);
        assert_eq!(r, "a\nb");
    }

    #[test]
    fn ascii_punctuation_replaces_smart_quotes() {
        let opts = Options {
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("\u{201C}hello\u{201D} \u{2014} world\u{2026}", &opts);
        assert_eq!(r, "\"hello\" - world...");
    }

    #[test]
    fn ascii_only_drops_non_ascii() {
        let opts = Options {
            ascii_only: true,
            ..Options::default()
        };
        let r = sanitize("hello \u{4E16}\u{754C} world", &opts);
        assert_eq!(r, "hello world");
    }

    #[test]
    fn ascii_only_with_punctuation_keeps_converted() {
        let opts = Options {
            ascii_only: true,
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("\u{201C}hi\u{201D}", &opts);
        assert_eq!(r, "\"hi\"");
    }

    #[test]
    fn strip_emoji() {
        let opts = Options {
            strip_emoji: true,
            ..Options::default()
        };
        let r = sanitize("hi \u{1F30D} world \u{1F680}", &opts);
        assert_eq!(r, "hi world");
    }

    #[test]
    fn nfkc_off_preserves_ligature() {
        let opts = Options {
            nfkc: false,
            ..Options::default()
        };
        let r = sanitize("\u{FB01}le", &opts);
        assert_eq!(r, "\u{FB01}le");
    }

    #[test]
    fn empty_input_returns_empty() {
        assert_eq!(sanitize("", &defaults()), "");
    }

    #[test]
    fn collapse_off_keeps_runs() {
        let opts = Options {
            collapse_whitespace: false,
            ..Options::default()
        };
        // Trimming is still on, so only the inner run of three spaces remains.
        let r = sanitize("  hello   world  ", &opts);
        assert_eq!(r, "hello   world");
    }

    #[test]
    fn trim_off_keeps_edges() {
        let opts = Options {
            trim: false,
            collapse_whitespace: false,
            ..Options::default()
        };
        let r = sanitize("  hi  ", &opts);
        assert_eq!(r, "  hi  ");
    }

    #[test]
    fn nbsp_replaced_with_space_when_ascii_punct() {
        // NFKC and whitespace collapsing would each turn U+00A0 into a plain
        // space on their own, so both are disabled to isolate the punctuation
        // mapping.
        let opts = Options {
            nfkc: false,
            collapse_whitespace: false,
            ascii_punctuation: true,
            ..Options::default()
        };
        let r = sanitize("a\u{00A0}b", &opts);
        assert_eq!(r, "a b");
    }

    #[test]
    fn sanitize_many_serial_and_parallel_match() {
        let texts: Vec<&str> = vec!["  hi  ", "world\u{FEFF}", "\u{FB01}le"];
        let opts = defaults();
        let s = sanitize_many(&texts, &opts, false);
        let p = sanitize_many(&texts, &opts, true);
        assert_eq!(s, p);
        assert_eq!(s, vec!["hi", "world", "file"]);
    }

    #[test]
    fn idempotent_on_clean_input() {
        let opts = defaults();
        let once = sanitize("hello world", &opts);
        let twice = sanitize(&once, &opts);
        assert_eq!(once, twice);
    }
}