use regex::Regex;
use std::sync::LazyLock;
use unicode_normalization::UnicodeNormalization;
/// Lazily compiled pattern matching a run of one or more whitespace characters (`\s+`).
///
/// Compiled once on first use and shared afterwards.
static WHITESPACE: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"\s+");
    compiled.expect("Invalid whitespace regex")
});
/// Lazily compiled pattern matching a single control character.
///
/// The class covers U+0000–U+0008, U+000B, U+000C, U+000E–U+001F and U+007F. Tab (U+0009),
/// line feed (U+000A) and carriage return (U+000D) are deliberately absent from the class,
/// so this pattern never matches them.
static CONTROL_CHARS: LazyLock<Regex> = LazyLock::new(|| {
    let compiled = Regex::new(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]");
    compiled.expect("Invalid control chars regex")
});
/// Configurable text clean-up pipeline.
///
/// Each flag switches one step of `normalize` on or off. The steps run in the order the
/// fields are declared: NFC composition, control-character removal, whitespace collapsing,
/// then trimming.
///
/// NOTE(review): the derived `Default` leaves every flag `false` (a no-op normalizer),
/// whereas `Normalizer::new()` enables every step — confirm this asymmetry is intended.
#[derive(Debug, Default, Clone)]
pub struct Normalizer {
    /// When `true`, the input is recomposed to Unicode Normalization Form C first.
    nfc: bool,
    /// When `true`, characters matched by `CONTROL_CHARS` are deleted.
    remove_control_chars: bool,
    /// When `true`, every run matched by `WHITESPACE` is replaced with a single space.
    collapse_whitespace: bool,
    /// When `true`, leading and trailing whitespace is trimmed from the result.
    strip: bool,
}
impl Normalizer {
    /// Creates a normalizer with every step enabled: NFC composition, control-character
    /// removal, whitespace collapsing and trimming.
    ///
    /// NOTE(review): this differs from the derived `Normalizer::default()`, which leaves
    /// every step disabled — confirm that asymmetry is intended.
    #[must_use]
    pub fn new() -> Self {
        Self {
            nfc: true,
            remove_control_chars: true,
            collapse_whitespace: true,
            strip: true,
        }
    }

    /// Enables or disables the NFC composition step and returns the updated normalizer.
    #[must_use]
    pub fn with_nfc(mut self, nfc: bool) -> Self {
        self.nfc = nfc;
        self
    }

    /// Enables or disables control-character removal and returns the updated normalizer.
    #[must_use]
    pub fn with_remove_control_chars(mut self, remove: bool) -> Self {
        self.remove_control_chars = remove;
        self
    }

    /// Enables or disables whitespace collapsing and returns the updated normalizer.
    #[must_use]
    pub fn with_collapse_whitespace(mut self, collapse: bool) -> Self {
        self.collapse_whitespace = collapse;
        self
    }

    /// Enables or disables trimming of leading/trailing whitespace and returns the updated
    /// normalizer.
    #[must_use]
    pub fn with_strip(mut self, strip: bool) -> Self {
        self.strip = strip;
        self
    }

    /// Returns a normalized copy of `text`.
    ///
    /// The enabled steps run in a fixed order:
    /// 1. recompose to Unicode Normalization Form C,
    /// 2. delete characters matched by `CONTROL_CHARS`,
    /// 3. replace each run matched by `WHITESPACE` with a single space,
    /// 4. trim leading and trailing whitespace.
    #[must_use]
    pub fn normalize(&self, text: &str) -> String {
        let composed: String = if self.nfc {
            text.nfc().collect()
        } else {
            text.to_owned()
        };
        self.apply_cleanup(composed)
    }

    /// Normalizes `text` and stores the result back into it.
    ///
    /// Produces exactly what `normalize` would return for the same input. When NFC is
    /// disabled the existing buffer is moved through the pipeline instead of being copied.
    pub fn normalize_in_place(&self, text: &mut String) {
        let input: String = if self.nfc {
            text.as_str().nfc().collect()
        } else {
            // Move the buffer out (leaving an empty string behind) so the clean-up steps
            // can reuse its allocation; it is written back below.
            std::mem::take(text)
        };
        *text = self.apply_cleanup(input);
    }

    /// Runs the post-NFC steps (control characters, whitespace, trimming) on an owned buffer.
    fn apply_cleanup(&self, mut text: String) -> String {
        if self.remove_control_chars {
            if let Some(changed) = Self::into_changed(CONTROL_CHARS.replace_all(&text, "")) {
                text = changed;
            }
        }
        if self.collapse_whitespace {
            if let Some(changed) = Self::into_changed(WHITESPACE.replace_all(&text, " ")) {
                text = changed;
            }
        }
        if self.strip {
            // Trim in place: cut the tail first, then remove the leading whitespace prefix.
            // Both offsets come from `trim_*` slices, so they lie on char boundaries.
            let end = text.trim_end().len();
            text.truncate(end);
            let leading = text.len() - text.trim_start().len();
            text.replace_range(..leading, "");
        }
        text
    }

    /// Extracts the replacement result only if the regex actually changed something.
    ///
    /// `replace_all` yields `Cow::Borrowed` when nothing matched; returning `None` in that
    /// case lets the caller keep its current buffer rather than copying it.
    fn into_changed(outcome: std::borrow::Cow<'_, str>) -> Option<String> {
        match outcome {
            std::borrow::Cow::Owned(changed) => Some(changed),
            std::borrow::Cow::Borrowed(_) => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through a normalizer built with `Normalizer::new()`.
    fn run_default(input: &str) -> String {
        Normalizer::new().normalize(input)
    }

    /// Newlines and tabs between words collapse to a single space.
    #[test]
    fn test_whitespace_collapse() {
        assert_eq!(run_default("hello world\n\n\tfoo"), "hello world foo");
    }

    /// NUL and U+001F are deleted without leaving a gap.
    #[test]
    fn test_control_char_removal() {
        assert_eq!(run_default("hello\x00world\x1F"), "helloworld");
    }

    /// `e` followed by a combining acute accent is composed into a single character.
    #[test]
    fn test_nfc_normalization() {
        assert_eq!(run_default("caf\u{0065}\u{0301}"), "café");
    }

    /// Leading and trailing spaces are trimmed.
    #[test]
    fn test_strip() {
        assert_eq!(run_default(" hello world "), "hello world");
    }
}