use std::collections::HashMap;
use unicode_segmentation::UnicodeSegmentation;
/// A text-to-text transformation step.
///
/// Implementors take a borrowed string and return a freshly owned result.
/// The `Send + Sync` supertraits are required of every implementor, so boxed
/// trait objects (`Box<dyn Transform>`) are themselves `Send + Sync`.
pub trait Transform: Send + Sync {
    /// Applies this transformation to `input` and returns the resulting text.
    fn transform(&self, input: &str) -> String;
}
fn is_unicode_punctuation(c: char) -> bool {
!c.is_alphanumeric() && !c.is_whitespace() && c.to_string().unicode_words().count() == 0
}
/// A pipeline of [`Transform`]s applied one after another.
///
/// The output of each stage becomes the input of the next, in the order the
/// stages were passed to [`Compose::new`].
pub struct Compose {
    /// Stages in application order.
    transforms: Vec<Box<dyn Transform>>,
}

impl Compose {
    /// Builds a pipeline from `transforms`; the first element runs first.
    ///
    /// An empty vector yields a pipeline that returns its input unchanged.
    #[must_use]
    pub fn new(transforms: Vec<Box<dyn Transform>>) -> Self {
        Compose { transforms }
    }
}

impl Transform for Compose {
    /// Runs `input` through every stage in order and returns the final text.
    fn transform(&self, input: &str) -> String {
        let mut text = input.to_owned();
        for stage in &self.transforms {
            text = stage.transform(&text);
        }
        text
    }
}
/// Lowercases the whole input.
pub struct ToLower;

impl Transform for ToLower {
    /// Returns `input` converted with [`str::to_lowercase`]; characters
    /// without a lowercase mapping are left as they are.
    fn transform(&self, input: &str) -> String {
        str::to_lowercase(input)
    }
}
/// Uppercases the whole input.
pub struct ToUpper;

impl Transform for ToUpper {
    /// Returns `input` converted with [`str::to_uppercase`]; characters
    /// without an uppercase mapping are left as they are.
    fn transform(&self, input: &str) -> String {
        str::to_uppercase(input)
    }
}
/// Removes leading and trailing whitespace.
pub struct Strip;

impl Transform for Strip {
    /// Returns `input` with whitespace trimmed from both ends (as defined by
    /// [`str::trim`]); interior whitespace is kept as-is.
    fn transform(&self, input: &str) -> String {
        let trimmed = input.trim();
        String::from(trimmed)
    }
}
/// Deletes every character that `is_unicode_punctuation` classifies as
/// punctuation.
pub struct RemovePunctuation;

impl Transform for RemovePunctuation {
    /// Returns `input` with punctuation characters dropped.
    ///
    /// Nothing is inserted in their place, so whitespace surrounding a
    /// removed character is left untouched.
    fn transform(&self, input: &str) -> String {
        let mut kept = input.to_owned();
        kept.retain(|ch| !is_unicode_punctuation(ch));
        kept
    }
}
/// Collapses each run of consecutive space characters into a single space.
pub struct RemoveMultipleSpaces;

impl Transform for RemoveMultipleSpaces {
    /// Returns `input` with every run of `' '` reduced to one `' '`.
    ///
    /// Only the ASCII space (U+0020) is considered; tabs, newlines and other
    /// whitespace are copied through unchanged. Leading and trailing runs
    /// are reduced to one space rather than removed.
    fn transform(&self, input: &str) -> String {
        let mut collapsed = String::with_capacity(input.len());
        for ch in input.chars() {
            // A space is redundant when the output already ends in one.
            let repeats_space = ch == ' ' && collapsed.ends_with(' ');
            if !repeats_space {
                collapsed.push(ch);
            }
        }
        collapsed
    }
}
/// Deletes all whitespace from the input.
pub struct RemoveWhitespace;

impl Transform for RemoveWhitespace {
    /// Returns `input` with every whitespace character removed, joining the
    /// remaining pieces directly together.
    fn transform(&self, input: &str) -> String {
        // Splitting on whitespace and concatenating the pieces drops exactly
        // the characters for which `char::is_whitespace` is true.
        input.split_whitespace().collect::<String>()
    }
}
/// Replaces whole whitespace-delimited words according to a lookup table.
pub struct SubstituteWords {
    /// Lowercased source word mapped to its replacement text.
    substitutions: HashMap<String, String>,
}

impl SubstituteWords {
    /// Builds the table from `(word, replacement)` pairs.
    ///
    /// Source words are lowercased so lookups are case-insensitive;
    /// replacements are stored verbatim. If the same lowercased word appears
    /// more than once, the last pair wins.
    #[must_use]
    pub fn new(pairs: Vec<(&str, &str)>) -> Self {
        let mut substitutions = HashMap::with_capacity(pairs.len());
        for (from, to) in pairs {
            substitutions.insert(from.to_lowercase(), to.to_owned());
        }
        Self { substitutions }
    }
}

impl Transform for SubstituteWords {
    /// Returns `input` with each matching word swapped for its replacement.
    ///
    /// The input is split on whitespace and re-joined with single spaces, so
    /// leading/trailing whitespace is dropped and interior runs collapse to
    /// one space even when no word is substituted. Words without a table
    /// entry keep their original case.
    fn transform(&self, input: &str) -> String {
        let mut rebuilt = String::new();
        for (position, token) in input.split_whitespace().enumerate() {
            if position > 0 {
                rebuilt.push(' ');
            }
            match self.substitutions.get(&token.to_lowercase()) {
                Some(replacement) => rebuilt.push_str(replacement),
                None => rebuilt.push_str(token),
            }
        }
        rebuilt
    }
}
/// Drops whole whitespace-delimited words that appear in a block list.
pub struct RemoveSpecificWords {
    /// Lowercased words to remove.
    words: Vec<String>,
}

impl RemoveSpecificWords {
    /// Builds the block list from `words`, lowercasing each entry so that
    /// matching is case-insensitive.
    #[must_use]
    pub fn new(words: &[&str]) -> Self {
        let mut lowered = Vec::with_capacity(words.len());
        for word in words {
            lowered.push(word.to_lowercase());
        }
        Self { words: lowered }
    }
}

impl Transform for RemoveSpecificWords {
    /// Returns `input` without the listed words.
    ///
    /// The input is split on whitespace and the surviving words are joined
    /// with single spaces, so surrounding whitespace is dropped and interior
    /// runs collapse. Surviving words keep their original case.
    fn transform(&self, input: &str) -> String {
        let mut survivors: Vec<&str> = Vec::new();
        for token in input.split_whitespace() {
            let lowered = token.to_lowercase();
            let is_listed = self.words.iter().any(|listed| *listed == lowered);
            if !is_listed {
                survivors.push(token);
            }
        }
        survivors.join(" ")
    }
}
/// Expands a fixed list of lowercase English contractions
/// (e.g. `don't` becomes `do not`).
pub struct ExpandCommonEnglishContractions;

impl Transform for ExpandCommonEnglishContractions {
    /// Returns `input` with every listed contraction replaced by its
    /// expansion.
    ///
    /// Matching is a plain, case-sensitive substring replacement against the
    /// lowercase spellings in the table: `I'm` and `DON'T` are returned
    /// unchanged, and a contraction embedded in a longer token is expanded
    /// too. Only the ASCII apostrophe (`'`) is recognised. Expansions of the
    /// `i'…` forms capitalise the pronoun (`i'm` becomes `I am`).
    fn transform(&self, input: &str) -> String {
        // Entries are applied top to bottom. Where one key is a substring of
        // another the order matters: "there's" precedes "here's" so the
        // longer form is consumed first. The "she…" forms are already
        // rewritten by the earlier "he…" entries, giving the same text.
        const CONTRACTIONS: &[(&str, &str)] = &[
            ("wouldn't", "would not"),
            ("couldn't", "could not"),
            ("shouldn't", "should not"),
            ("mustn't", "must not"),
            ("needn't", "need not"),
            ("hasn't", "has not"),
            ("haven't", "have not"),
            ("hadn't", "had not"),
            ("wasn't", "was not"),
            ("weren't", "were not"),
            ("isn't", "is not"),
            ("aren't", "are not"),
            ("don't", "do not"),
            ("doesn't", "does not"),
            ("didn't", "did not"),
            ("won't", "will not"),
            ("can't", "cannot"),
            ("shan't", "shall not"),
            ("they'll", "they will"),
            ("they'd", "they would"),
            ("they've", "they have"),
            ("they're", "they are"),
            ("that's", "that is"),
            ("there's", "there is"),
            ("here's", "here is"),
            ("what's", "what is"),
            ("let's", "let us"),
            ("you'll", "you will"),
            ("you'd", "you would"),
            ("you've", "you have"),
            ("you're", "you are"),
            ("we'll", "we will"),
            ("we'd", "we would"),
            ("we've", "we have"),
            ("we're", "we are"),
            ("i'll", "I will"),
            ("i'd", "I would"),
            ("i've", "I have"),
            ("i'm", "I am"),
            ("he'll", "he will"),
            ("he'd", "he would"),
            ("he's", "he is"),
            ("she'll", "she will"),
            ("she'd", "she would"),
            ("she's", "she is"),
            ("it's", "it is"),
        ];

        // Every key contains an apostrophe, so text without one cannot match
        // anything; skip the table walk entirely.
        if !input.contains('\'') {
            return input.to_owned();
        }

        let mut result = input.to_owned();
        for &(contraction, expansion) in CONTRACTIONS {
            // `str::replace` always builds a new `String`; only pay for that
            // when the contraction is actually present.
            if result.contains(contraction) {
                result = result.replace(contraction, expansion);
            }
        }
        result
    }
}
/// Converts Chinese text to Simplified characters using `zhconv`'s `ZhCN`
/// target.
///
/// Only compiled when the `chinese-variant` feature is enabled.
#[cfg(feature = "chinese-variant")]
pub struct ToSimplified;

#[cfg(feature = "chinese-variant")]
impl Transform for ToSimplified {
    /// Returns `input` converted by `zhconv::zhconv` with
    /// `zhconv::Variant::ZhCN` as the target variant.
    fn transform(&self, input: &str) -> String {
        let target = zhconv::Variant::ZhCN;
        zhconv::zhconv(input, target)
    }
}
/// Converts Chinese text to Traditional characters using `zhconv`'s `ZhHant`
/// target.
///
/// Only compiled when the `chinese-variant` feature is enabled.
#[cfg(feature = "chinese-variant")]
pub struct ToTraditional;

#[cfg(feature = "chinese-variant")]
impl Transform for ToTraditional {
    /// Returns `input` converted by `zhconv::zhconv` with
    /// `zhconv::Variant::ZhHant` as the target variant.
    fn transform(&self, input: &str) -> String {
        let target = zhconv::Variant::ZhHant;
        zhconv::zhconv(input, target)
    }
}
#[cfg(test)]
mod tests {
// Unit tests for the transforms defined in the parent module. Each test
// builds one transform (or a `Compose` pipeline), feeds it a single input and
// asserts the exact output, so these assertions pin the observable behaviour
// of every transform.
use super::*;
// ---- ToLower ----
#[test]
fn to_lower_transform() {
let t = ToLower;
assert_eq!(t.transform("Hello WORLD"), "hello world");
}
#[test]
fn to_lower_empty() {
let t = ToLower;
assert_eq!(t.transform(""), "");
}
// Characters without a case mapping (here CJK) pass through unchanged.
#[test]
fn to_lower_unicode() {
let t = ToLower;
assert_eq!(t.transform("HELLO 您好"), "hello 您好");
}
// ---- ToUpper ----
#[test]
fn to_upper_transform() {
let t = ToUpper;
assert_eq!(t.transform("hello"), "HELLO");
}
#[test]
fn to_upper_empty() {
let t = ToUpper;
assert_eq!(t.transform(""), "");
}
// ---- Strip ----
#[test]
fn strip_transform() {
let t = Strip;
assert_eq!(t.transform(" hello "), "hello");
}
#[test]
fn strip_no_whitespace() {
let t = Strip;
assert_eq!(t.transform("hello"), "hello");
}
#[test]
fn strip_empty() {
let t = Strip;
assert_eq!(t.transform(""), "");
}
// Whitespace-only input trims down to the empty string.
#[test]
fn strip_only_whitespace() {
let t = Strip;
assert_eq!(t.transform(" "), "");
}
// ---- RemovePunctuation ----
// Punctuation is deleted without inserting anything; the space after the
// comma is what keeps the two words apart.
#[test]
fn remove_punctuation_transform() {
let t = RemovePunctuation;
assert_eq!(t.transform("hello, world!"), "hello world");
}
#[test]
fn remove_punctuation_no_punctuation() {
let t = RemovePunctuation;
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn remove_punctuation_empty() {
let t = RemovePunctuation;
assert_eq!(t.transform(""), "");
}
// Full-width (non-ASCII) punctuation is removed as well.
#[test]
fn remove_punctuation_unicode() {
let t = RemovePunctuation;
assert_eq!(t.transform("你好,世界!"), "你好世界");
}
// ASCII symbols such as `@`, `#`, `$` and `%` count as punctuation here.
#[test]
fn remove_punctuation_only_punctuation() {
let t = RemovePunctuation;
assert_eq!(t.transform("!@#$%"), "");
}
// ---- RemoveMultipleSpaces ----
#[test]
fn remove_multiple_spaces_transform() {
let t = RemoveMultipleSpaces;
assert_eq!(t.transform("hello world foo"), "hello world foo");
}
#[test]
fn remove_multiple_spaces_empty() {
let t = RemoveMultipleSpaces;
assert_eq!(t.transform(""), "");
}
#[test]
fn remove_multiple_spaces_no_extra_spaces() {
let t = RemoveMultipleSpaces;
assert_eq!(t.transform("hello world"), "hello world");
}
// The transform does not trim: a space remains at each end of the output.
#[test]
fn remove_multiple_spaces_leading_trailing() {
let t = RemoveMultipleSpaces;
assert_eq!(t.transform(" hello "), " hello ");
}
// Space-only input still yields one space, not the empty string.
#[test]
fn remove_multiple_spaces_only_spaces() {
let t = RemoveMultipleSpaces;
assert_eq!(t.transform(" "), " ");
}
// ---- RemoveWhitespace ----
#[test]
fn remove_whitespace_transform() {
let t = RemoveWhitespace;
assert_eq!(t.transform("hello world"), "helloworld");
}
#[test]
fn remove_whitespace_empty() {
let t = RemoveWhitespace;
assert_eq!(t.transform(""), "");
}
// Tabs and newlines are whitespace too, not just the space character.
#[test]
fn remove_whitespace_tabs_and_newlines() {
let t = RemoveWhitespace;
assert_eq!(t.transform("hello\tworld\n"), "helloworld");
}
// ---- SubstituteWords ----
#[test]
fn substitute_words_transform() {
let t = SubstituteWords::new(vec![("hello", "hi")]);
assert_eq!(t.transform("hello world hello"), "hi world hi");
}
// Lookup ignores case, but words that are not substituted keep their case.
#[test]
fn substitute_words_case_insensitive() {
let t = SubstituteWords::new(vec![("hello", "hi")]);
assert_eq!(t.transform("Hello WORLD"), "hi WORLD");
}
// Only whole whitespace-delimited words match; "he" inside "hello" does not.
#[test]
fn substitute_words_no_partial_match() {
let t = SubstituteWords::new(vec![("he", "she")]);
assert_eq!(t.transform("hello"), "hello");
}
// "empty" refers to the substitution table, not the input.
#[test]
fn substitute_words_empty() {
let t = SubstituteWords::new(vec![]);
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn substitute_words_not_found() {
let t = SubstituteWords::new(vec![("foo", "bar")]);
assert_eq!(t.transform("hello world"), "hello world");
}
// ---- RemoveSpecificWords ----
#[test]
fn remove_specific_words_transform() {
let t = RemoveSpecificWords::new(&["the", "a", "an"]);
assert_eq!(t.transform("the cat sat on a mat"), "cat sat on mat");
}
#[test]
fn remove_specific_words_case_insensitive() {
let t = RemoveSpecificWords::new(&["the"]);
assert_eq!(t.transform("The cat"), "cat");
}
// "empty" refers to the word list, not the input.
#[test]
fn remove_specific_words_empty() {
let t = RemoveSpecificWords::new(&[]);
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn remove_specific_words_all_removed() {
let t = RemoveSpecificWords::new(&["hello"]);
assert_eq!(t.transform("hello"), "");
}
// ---- ExpandCommonEnglishContractions ----
#[test]
fn expand_contractions_dont() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("don't"), "do not");
}
#[test]
fn expand_contractions_cant() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("can't"), "cannot");
}
#[test]
fn expand_contractions_its() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("it's"), "it is");
}
#[test]
fn expand_contractions_wont() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("won't"), "will not");
}
// The expansion capitalises the pronoun even though the input is lowercase.
#[test]
fn expand_contractions_im() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'm"), "I am");
}
// Matching is case-sensitive: the table key is lowercase `i'm`, so `I'm` is
// returned unchanged.
#[test]
fn expand_contractions_im_uppercase() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("I'm"), "I'm");
}
// Only `can't` is expanded; a bare `i` is not a table entry and stays
// lowercase.
#[test]
fn expand_contractions_multiple() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i can't do it"), "i cannot do it");
}
#[test]
fn expand_contractions_with_im() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'm here"), "I am here");
}
#[test]
fn expand_contractions_no_contraction() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn expand_contractions_empty() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform(""), "");
}
// Pins that an all-caps contraction is not matched at all and therefore
// comes back untouched.
#[test]
fn expand_contractions_preserves_case() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("DON'T"), "DON'T");
}
// ---- Compose ----
// Stages run in the listed order: trim, lowercase, drop punctuation, then
// collapse spaces.
#[test]
fn compose_chaining() {
let pipeline = Compose::new(vec![
Box::new(Strip),
Box::new(ToLower),
Box::new(RemovePunctuation),
Box::new(RemoveMultipleSpaces),
]);
assert_eq!(pipeline.transform(" Hello, World! "), "hello world");
}
// A pipeline with no stages returns its input unchanged.
#[test]
fn compose_empty() {
let pipeline = Compose::new(vec![]);
assert_eq!(pipeline.transform("hello"), "hello");
}
#[test]
fn compose_single() {
let pipeline = Compose::new(vec![Box::new(ToLower)]);
assert_eq!(pipeline.transform("HELLO"), "hello");
}
#[test]
fn compose_empty_input() {
let pipeline = Compose::new(vec![Box::new(ToLower), Box::new(Strip)]);
assert_eq!(pipeline.transform(""), "");
}
// Tests for the Chinese variant converters. Compiled only when the
// `chinese-variant` feature is enabled, like the transforms they exercise.
#[cfg(feature = "chinese-variant")]
mod chinese_variant_tests {
use super::*;
// ---- ToSimplified ----
#[test]
fn to_simplified_traditional_to_simplified() {
let t = ToSimplified;
assert_eq!(t.transform("繁體中文"), "繁体中文");
}
#[test]
fn to_simplified_mixed_text() {
let t = ToSimplified;
assert_eq!(t.transform("這是個測試"), "这是个测试");
}
// Text that is already Simplified is returned as-is.
#[test]
fn to_simplified_already_simplified() {
let t = ToSimplified;
assert_eq!(t.transform("简体中文"), "简体中文");
}
#[test]
fn to_simplified_empty() {
let t = ToSimplified;
assert_eq!(t.transform(""), "");
}
// Full-width punctuation is left in place by the conversion.
#[test]
fn to_simplified_with_punctuation() {
let t = ToSimplified;
assert_eq!(t.transform("你好,世界!"), "你好,世界!");
}
// ---- ToTraditional ----
#[test]
fn to_traditional_simplified_to_traditional() {
let t = ToTraditional;
assert_eq!(t.transform("简体中文"), "簡體中文");
}
// Text that is already Traditional is returned as-is.
#[test]
fn to_traditional_already_traditional() {
let t = ToTraditional;
assert_eq!(t.transform("繁體中文"), "繁體中文");
}
#[test]
fn to_traditional_empty() {
let t = ToTraditional;
assert_eq!(t.transform(""), "");
}
#[test]
fn to_traditional_with_punctuation() {
let t = ToTraditional;
assert_eq!(t.transform("你好,世界!"), "你好,世界!");
}
// Simplified -> Traditional -> Simplified reproduces this sample exactly.
#[test]
fn roundtrip_simplified_traditional_simplified() {
let original = "简体中文测试";
let t = ToTraditional;
let traditional = t.transform(original);
let s = ToSimplified;
let back = s.transform(&traditional);
assert_eq!(back, original);
}
// The converter composes with other transforms; lowercasing afterwards
// leaves the converted text unchanged.
#[test]
fn compose_with_to_simplified() {
let pipeline = Compose::new(vec![Box::new(ToSimplified), Box::new(ToLower)]);
assert_eq!(pipeline.transform("繁體中文"), "繁体中文");
}
}
}