use std::collections::HashMap;
use unicode_segmentation::UnicodeSegmentation;
/// A single text-to-text transformation step.
///
/// The `Send + Sync` supertraits require every implementor to be safe to move
/// to and share between threads.
pub trait Transform: Send + Sync {
/// Applies the transformation to `input` and returns the result as a new `String`.
fn transform(&self, input: &str) -> String;
}
fn is_unicode_punctuation(c: char) -> bool {
!c.is_alphanumeric() && !c.is_whitespace() && c.to_string().unicode_words().count() == 0
}
/// A pipeline of [`Transform`]s applied one after another.
pub struct Compose {
    transforms: Vec<Box<dyn Transform>>,
}

impl Compose {
    /// Builds a pipeline that runs `transforms` in the order given.
    #[must_use]
    pub fn new(transforms: Vec<Box<dyn Transform>>) -> Self {
        Self { transforms }
    }
}

impl Transform for Compose {
    /// Feeds `input` through every stage in order, handing each stage's output to the
    /// next one. An empty pipeline returns a copy of `input`.
    fn transform(&self, input: &str) -> String {
        let mut current = input.to_owned();
        for stage in &self.transforms {
            current = stage.transform(&current);
        }
        current
    }
}
/// Lowercases the whole input.
pub struct ToLower;

impl Transform for ToLower {
    /// Returns `input` converted to lowercase with [`str::to_lowercase`].
    fn transform(&self, input: &str) -> String {
        str::to_lowercase(input)
    }
}
/// Uppercases the whole input.
pub struct ToUpper;

impl Transform for ToUpper {
    /// Returns `input` converted to uppercase with [`str::to_uppercase`].
    fn transform(&self, input: &str) -> String {
        str::to_uppercase(input)
    }
}
/// Trims leading and trailing whitespace.
pub struct Strip;

impl Transform for Strip {
    /// Returns `input` without its leading and trailing whitespace, as defined by
    /// [`str::trim`]. Interior whitespace is left alone.
    fn transform(&self, input: &str) -> String {
        let trimmed = input.trim();
        String::from(trimmed)
    }
}
/// Drops every character that `is_unicode_punctuation` classifies as punctuation.
pub struct RemovePunctuation;

impl Transform for RemovePunctuation {
    /// Returns `input` with punctuation characters removed; all remaining characters
    /// keep their original order.
    fn transform(&self, input: &str) -> String {
        let mut kept = String::new();
        for ch in input.chars() {
            if !is_unicode_punctuation(ch) {
                kept.push(ch);
            }
        }
        kept
    }
}
/// Collapses runs of spaces and removes spaces that separate two CJK ideographs.
///
/// Only `' '` (U+0020) is treated as a space; tabs, newlines and other whitespace are
/// copied through unchanged. Leading and trailing runs are collapsed to a single
/// space, not trimmed.
pub struct NormalizeSpaces;

/// Returns `true` when `c` lies in one of the CJK ideograph blocks listed below.
fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}'           // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}'     // Extension A
            | '\u{20000}'..='\u{2A6DF}'   // Extension B
            | '\u{2A700}'..='\u{2B73F}'   // Extension C
            | '\u{2B740}'..='\u{2B81F}'   // Extension D
            | '\u{2B820}'..='\u{2CEAF}'   // Extension E
            | '\u{F900}'..='\u{FAFF}'     // CJK Compatibility Ideographs
            | '\u{2F800}'..='\u{2FA1F}'   // CJK Compatibility Ideographs Supplement
    )
}

impl Transform for NormalizeSpaces {
    /// Rewrites every run of spaces in `input`.
    ///
    /// A run is dropped entirely when the nearest non-space characters on both sides
    /// are CJK ideographs; otherwise it becomes exactly one space. A run at the very
    /// start or end of the input has no neighbour on one side and is therefore kept
    /// as a single space.
    fn transform(&self, input: &str) -> String {
        let mut result = String::with_capacity(input.len());
        // Most recent non-space character emitted; `None` until one has been seen.
        let mut last_non_space: Option<char> = None;
        let mut chars = input.chars().peekable();

        while let Some(c) = chars.next() {
            if c != ' ' {
                result.push(c);
                last_non_space = Some(c);
                continue;
            }

            // Consume the rest of the run so it is classified once, in linear time,
            // instead of rescanning the remaining input for every space in it.
            while chars.next_if_eq(&' ').is_some() {}

            let next_non_space = chars.peek().copied();
            let between_cjk =
                last_non_space.is_some_and(is_cjk) && next_non_space.is_some_and(is_cjk);
            if !between_cjk {
                result.push(' ');
            }
        }

        result
    }
}
/// Deletes all whitespace from the input.
pub struct RemoveWhitespace;

impl Transform for RemoveWhitespace {
    /// Returns `input` without any character for which [`char::is_whitespace`] is true.
    fn transform(&self, input: &str) -> String {
        let mut compact = String::new();
        for ch in input.chars() {
            if ch.is_whitespace() {
                continue;
            }
            compact.push(ch);
        }
        compact
    }
}
/// Replaces whole whitespace-delimited words using a case-insensitive lookup table.
pub struct SubstituteWords {
    substitutions: HashMap<String, String>,
}

impl SubstituteWords {
    /// Builds the table from `(word, replacement)` pairs.
    ///
    /// Each word is lowercased before it is stored; replacements are stored verbatim.
    /// When two pairs lowercase to the same word, the later pair wins.
    #[must_use]
    pub fn new(pairs: Vec<(&str, &str)>) -> Self {
        let mut substitutions = HashMap::new();
        for (from, to) in pairs {
            substitutions.insert(from.to_lowercase(), to.to_owned());
        }
        Self { substitutions }
    }
}

impl Transform for SubstituteWords {
    /// Splits `input` on whitespace, swaps every word whose lowercase form is in the
    /// table, and joins the words back together with single spaces.
    ///
    /// Because of the split and join, the original spacing is not preserved.
    fn transform(&self, input: &str) -> String {
        let mut words: Vec<String> = Vec::new();
        for word in input.split_whitespace() {
            let replaced = match self.substitutions.get(&word.to_lowercase()) {
                Some(replacement) => replacement.clone(),
                None => word.to_owned(),
            };
            words.push(replaced);
        }
        words.join(" ")
    }
}
/// Removes whole whitespace-delimited words, matched case-insensitively.
pub struct RemoveSpecificWords {
    // Lowercased words to drop. A hash set keeps the per-token membership test
    // independent of how many words were configured.
    words: std::collections::HashSet<String>,
}

impl RemoveSpecificWords {
    /// Builds the filter from `words`; each entry is lowercased before it is stored.
    #[must_use]
    pub fn new(words: &[&str]) -> Self {
        Self {
            words: words.iter().map(|w| w.to_lowercase()).collect(),
        }
    }
}

impl Transform for RemoveSpecificWords {
    /// Splits `input` on whitespace, drops every word whose lowercase form was
    /// configured, and joins the survivors with single spaces.
    ///
    /// Because of the split and join, the original spacing is not preserved.
    fn transform(&self, input: &str) -> String {
        input
            .split_whitespace()
            .filter(|word| !self.words.contains(&word.to_lowercase()))
            .collect::<Vec<_>>()
            .join(" ")
    }
}
/// Expands a fixed list of lowercase English contractions (`don't` -> `do not`).
pub struct ExpandCommonEnglishContractions;

impl Transform for ExpandCommonEnglishContractions {
    /// Applies every table entry to `input`, in table order, as a plain substring
    /// replacement.
    ///
    /// Matching is case-sensitive and the table is all lowercase, so `DON'T` and `I'm`
    /// pass through unchanged. The first-person entries expand to a capital `I`.
    fn transform(&self, input: &str) -> String {
        // Order matters: an entry that contains a shorter one (`they'd` / `he'd`)
        // is listed before it, so the longer form is rewritten first.
        const CONTRACTIONS: &[(&str, &str)] = &[
            ("wouldn't", "would not"),
            ("couldn't", "could not"),
            ("shouldn't", "should not"),
            ("mustn't", "must not"),
            ("needn't", "need not"),
            ("hasn't", "has not"),
            ("haven't", "have not"),
            ("hadn't", "had not"),
            ("wasn't", "was not"),
            ("weren't", "were not"),
            ("isn't", "is not"),
            ("aren't", "are not"),
            ("don't", "do not"),
            ("doesn't", "does not"),
            ("didn't", "did not"),
            ("won't", "will not"),
            ("can't", "cannot"),
            ("shan't", "shall not"),
            ("they'll", "they will"),
            ("they'd", "they would"),
            ("they've", "they have"),
            ("they're", "they are"),
            ("that's", "that is"),
            ("there's", "there is"),
            ("here's", "here is"),
            ("what's", "what is"),
            ("let's", "let us"),
            ("you'll", "you will"),
            ("you'd", "you would"),
            ("you've", "you have"),
            ("you're", "you are"),
            ("we'll", "we will"),
            ("we'd", "we would"),
            ("we've", "we have"),
            ("we're", "we are"),
            ("i'll", "I will"),
            ("i'd", "I would"),
            ("i've", "I have"),
            ("i'm", "I am"),
            ("he'll", "he will"),
            ("he'd", "he would"),
            ("he's", "he is"),
            ("she'll", "she will"),
            ("she'd", "she would"),
            ("she's", "she is"),
            ("it's", "it is"),
        ];

        CONTRACTIONS
            .iter()
            .fold(input.to_owned(), |text, &(short, long)| text.replace(short, long))
    }
}
/// Converts Chinese text to its Simplified form.
///
/// Only compiled when the `chinese-variant` feature is enabled.
#[cfg(feature = "chinese-variant")]
pub struct ToSimplified;
#[cfg(feature = "chinese-variant")]
impl Transform for ToSimplified {
/// Delegates to `zhconv::zhconv` with `zhconv::Variant::ZhCN` as the target variant.
fn transform(&self, input: &str) -> String {
zhconv::zhconv(input, zhconv::Variant::ZhCN)
}
}
/// Converts Chinese text to its Traditional form.
///
/// Only compiled when the `chinese-variant` feature is enabled.
#[cfg(feature = "chinese-variant")]
pub struct ToTraditional;
#[cfg(feature = "chinese-variant")]
impl Transform for ToTraditional {
/// Delegates to `zhconv::zhconv` with `zhconv::Variant::ZhHant` as the target variant.
fn transform(&self, input: &str) -> String {
zhconv::zhconv(input, zhconv::Variant::ZhHant)
}
}
/// Inserts spaces between the tokens produced by a `jieba_rs::Jieba` segmenter.
///
/// Only compiled when the `chinese-word` feature is enabled.
#[cfg(feature = "chinese-word")]
pub struct ChineseWordSegment {
    jieba: jieba_rs::Jieba,
}

#[cfg(feature = "chinese-word")]
impl ChineseWordSegment {
    /// Creates a segmenter backed by `jieba_rs::Jieba::new()`.
    #[must_use]
    pub fn new() -> Self {
        let jieba = jieba_rs::Jieba::new();
        Self { jieba }
    }
}

#[cfg(feature = "chinese-word")]
impl Default for ChineseWordSegment {
    /// Equivalent to [`ChineseWordSegment::new`].
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(feature = "chinese-word")]
impl Transform for ChineseWordSegment {
    /// Cuts `input` into tokens and joins them with single spaces.
    /// Empty input yields an empty string without calling the segmenter.
    fn transform(&self, input: &str) -> String {
        if input.is_empty() {
            String::new()
        } else {
            // NOTE(review): the `false` argument presumably disables jieba's HMM-based
            // new-word discovery — confirm against the jieba_rs documentation.
            let tokens = self.jieba.cut(input, false);
            tokens.join(" ")
        }
    }
}
// Unit tests for every transform in this file, grouped by the transform they exercise.
#[cfg(test)]
mod tests {
use super::*;
// ---- ToLower ----
#[test]
fn to_lower_transform() {
let t = ToLower;
assert_eq!(t.transform("Hello WORLD"), "hello world");
}
#[test]
fn to_lower_empty() {
let t = ToLower;
assert_eq!(t.transform(""), "");
}
#[test]
fn to_lower_unicode() {
let t = ToLower;
assert_eq!(t.transform("HELLO 您好"), "hello 您好");
}
// ---- ToUpper ----
#[test]
fn to_upper_transform() {
let t = ToUpper;
assert_eq!(t.transform("hello"), "HELLO");
}
#[test]
fn to_upper_empty() {
let t = ToUpper;
assert_eq!(t.transform(""), "");
}
// ---- Strip ----
#[test]
fn strip_transform() {
let t = Strip;
assert_eq!(t.transform(" hello "), "hello");
}
#[test]
fn strip_no_whitespace() {
let t = Strip;
assert_eq!(t.transform("hello"), "hello");
}
#[test]
fn strip_empty() {
let t = Strip;
assert_eq!(t.transform(""), "");
}
#[test]
fn strip_only_whitespace() {
let t = Strip;
assert_eq!(t.transform(" "), "");
}
// ---- RemovePunctuation ----
#[test]
fn remove_punctuation_transform() {
let t = RemovePunctuation;
assert_eq!(t.transform("hello, world!"), "hello world");
}
#[test]
fn remove_punctuation_no_punctuation() {
let t = RemovePunctuation;
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn remove_punctuation_empty() {
let t = RemovePunctuation;
assert_eq!(t.transform(""), "");
}
#[test]
fn remove_punctuation_unicode() {
let t = RemovePunctuation;
assert_eq!(t.transform("你好,世界!"), "你好世界");
}
// Symbols such as `@`, `#`, `$` and `%` are removed as well, not only punctuation marks.
#[test]
fn remove_punctuation_only_punctuation() {
let t = RemovePunctuation;
assert_eq!(t.transform("!@#$%"), "");
}
// ---- NormalizeSpaces ----
#[test]
fn normalize_spaces_collapse_multiple() {
let t = NormalizeSpaces;
assert_eq!(t.transform("hello world foo"), "hello world foo");
}
#[test]
fn normalize_spaces_empty() {
let t = NormalizeSpaces;
assert_eq!(t.transform(""), "");
}
#[test]
fn normalize_spaces_no_extra_spaces() {
let t = NormalizeSpaces;
assert_eq!(t.transform("hello world"), "hello world");
}
// Leading and trailing spaces are not trimmed by this transform.
#[test]
fn normalize_spaces_leading_trailing() {
let t = NormalizeSpaces;
assert_eq!(t.transform(" hello "), " hello ");
}
#[test]
fn normalize_spaces_only_spaces() {
let t = NormalizeSpaces;
assert_eq!(t.transform(" "), " ");
}
// Spaces whose neighbours are both CJK ideographs are removed entirely.
#[test]
fn normalize_spaces_cjk_between() {
let t = NormalizeSpaces;
assert_eq!(t.transform("即 经 济 上"), "即经济上");
}
#[test]
fn normalize_spaces_cjk_consecutive() {
let t = NormalizeSpaces;
assert_eq!(t.transform("中 文"), "中文");
}
// A space between a CJK ideograph and a Latin letter is kept.
#[test]
fn normalize_spaces_mixed_cjk_latin() {
let t = NormalizeSpaces;
assert_eq!(t.transform("hello 世界"), "hello 世界");
assert_eq!(t.transform("世界 hello"), "世界 hello");
}
#[test]
fn normalize_spaces_mixed_cjk_latin_multi() {
let t = NormalizeSpaces;
assert_eq!(t.transform("hello 世界"), "hello 世界");
}
#[test]
fn normalize_spaces_cjk_extension_a() {
let t = NormalizeSpaces;
assert_eq!(t.transform("\u{3400} \u{3401}"), "\u{3400}\u{3401}");
}
#[test]
fn normalize_spaces_cjk_compatibility_ideographs() {
let t = NormalizeSpaces;
assert_eq!(t.transform("\u{F900} \u{F901}"), "\u{F900}\u{F901}");
}
#[test]
fn normalize_spaces_cjk_compatibility_supplement() {
let t = NormalizeSpaces;
assert_eq!(t.transform("\u{2F800} \u{2F801}"), "\u{2F800}\u{2F801}");
}
// NOTE(review): as shown, these assertions repeat `normalize_spaces_mixed_cjk_latin`.
#[test]
fn normalize_spaces_cjk_non_cjk_kept() {
let t = NormalizeSpaces;
assert_eq!(t.transform("hello 世界"), "hello 世界");
assert_eq!(t.transform("世界 hello"), "世界 hello");
}
// ---- RemoveWhitespace ----
#[test]
fn remove_whitespace_transform() {
let t = RemoveWhitespace;
assert_eq!(t.transform("hello world"), "helloworld");
}
#[test]
fn remove_whitespace_empty() {
let t = RemoveWhitespace;
assert_eq!(t.transform(""), "");
}
#[test]
fn remove_whitespace_tabs_and_newlines() {
let t = RemoveWhitespace;
assert_eq!(t.transform("hello\tworld\n"), "helloworld");
}
// ---- SubstituteWords ----
#[test]
fn substitute_words_transform() {
let t = SubstituteWords::new(vec![("hello", "hi")]);
assert_eq!(t.transform("hello world hello"), "hi world hi");
}
// Lookup ignores case; words that are not replaced keep their original case.
#[test]
fn substitute_words_case_insensitive() {
let t = SubstituteWords::new(vec![("hello", "hi")]);
assert_eq!(t.transform("Hello WORLD"), "hi WORLD");
}
// Only whole words are replaced, never a prefix of a longer word.
#[test]
fn substitute_words_no_partial_match() {
let t = SubstituteWords::new(vec![("he", "she")]);
assert_eq!(t.transform("hello"), "hello");
}
#[test]
fn substitute_words_empty() {
let t = SubstituteWords::new(vec![]);
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn substitute_words_not_found() {
let t = SubstituteWords::new(vec![("foo", "bar")]);
assert_eq!(t.transform("hello world"), "hello world");
}
// ---- RemoveSpecificWords ----
#[test]
fn remove_specific_words_transform() {
let t = RemoveSpecificWords::new(&["the", "a", "an"]);
assert_eq!(t.transform("the cat sat on a mat"), "cat sat on mat");
}
#[test]
fn remove_specific_words_case_insensitive() {
let t = RemoveSpecificWords::new(&["the"]);
assert_eq!(t.transform("The cat"), "cat");
}
#[test]
fn remove_specific_words_empty() {
let t = RemoveSpecificWords::new(&[]);
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn remove_specific_words_all_removed() {
let t = RemoveSpecificWords::new(&["hello"]);
assert_eq!(t.transform("hello"), "");
}
// ---- ExpandCommonEnglishContractions ----
#[test]
fn expand_contractions_dont() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("don't"), "do not");
}
#[test]
fn expand_contractions_cant() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("can't"), "cannot");
}
#[test]
fn expand_contractions_its() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("it's"), "it is");
}
#[test]
fn expand_contractions_wont() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("won't"), "will not");
}
// The lowercase first-person contraction expands to a capital `I`.
#[test]
fn expand_contractions_im() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'm"), "I am");
}
// Matching is case-sensitive: a capitalised contraction is left untouched.
#[test]
fn expand_contractions_im_uppercase() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("I'm"), "I'm");
}
#[test]
fn expand_contractions_multiple() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i can't do it"), "i cannot do it");
}
#[test]
fn expand_contractions_with_im() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'm here"), "I am here");
}
#[test]
fn expand_contractions_no_contraction() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("hello world"), "hello world");
}
#[test]
fn expand_contractions_empty() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform(""), "");
}
// Despite the test name, this pins that an all-caps contraction is NOT expanded.
#[test]
fn expand_contractions_preserves_case() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("DON'T"), "DON'T");
}
#[test]
fn expand_contractions_wouldnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("wouldn't"), "would not");
}
#[test]
fn expand_contractions_couldnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("couldn't"), "could not");
}
#[test]
fn expand_contractions_shouldnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("shouldn't"), "should not");
}
#[test]
fn expand_contractions_havent() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("haven't"), "have not");
}
#[test]
fn expand_contractions_theyll() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("they'll go"), "they will go");
}
#[test]
fn expand_contractions_theyre() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("they're here"), "they are here");
}
#[test]
fn expand_contractions_theyve() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("they've done it"), "they have done it");
}
#[test]
fn expand_contractions_thats() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("that's great"), "that is great");
}
#[test]
fn expand_contractions_shant() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("shan't"), "shall not");
}
#[test]
fn expand_contractions_mustnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("mustn't"), "must not");
}
#[test]
fn expand_contractions_neednt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("needn't"), "need not");
}
#[test]
fn expand_contractions_hes() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("he's here"), "he is here");
}
#[test]
fn expand_contractions_shes() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("she's here"), "she is here");
}
#[test]
fn expand_contractions_wed() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("we'd go"), "we would go");
}
#[test]
fn expand_contractions_youre() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("you're right"), "you are right");
}
#[test]
fn expand_contractions_ive() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i've seen it"), "I have seen it");
}
#[test]
fn expand_contractions_id() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'd go"), "I would go");
}
#[test]
fn expand_contractions_ill() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("i'll do it"), "I will do it");
}
#[test]
fn expand_contractions_doesnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("doesn't"), "does not");
}
#[test]
fn expand_contractions_didnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("didn't"), "did not");
}
#[test]
fn expand_contractions_isnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("isn't"), "is not");
}
#[test]
fn expand_contractions_arent() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("aren't"), "are not");
}
#[test]
fn expand_contractions_wasnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("wasn't"), "was not");
}
#[test]
fn expand_contractions_werent() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("weren't"), "were not");
}
#[test]
fn expand_contractions_hadnt() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("hadn't"), "had not");
}
// NOTE(review): named `herell`, but the input exercised is `here's`.
#[test]
fn expand_contractions_herell() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("here's"), "here is");
}
#[test]
fn expand_contractions_theres() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("there's"), "there is");
}
#[test]
fn expand_contractions_whats() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("what's"), "what is");
}
#[test]
fn expand_contractions_lets() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("let's go"), "let us go");
}
#[test]
fn expand_contractions_youll() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("you'll see"), "you will see");
}
#[test]
fn expand_contractions_youd() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("you'd like"), "you would like");
}
#[test]
fn expand_contractions_youve() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("you've got"), "you have got");
}
#[test]
fn expand_contractions_shell() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("she'll go"), "she will go");
}
#[test]
fn expand_contractions_shed() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("she'd go"), "she would go");
}
#[test]
fn expand_contractions_well() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("we'll go"), "we will go");
}
#[test]
fn expand_contractions_weve() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("we've been"), "we have been");
}
#[test]
fn expand_contractions_were() {
let t = ExpandCommonEnglishContractions;
assert_eq!(t.transform("we're here"), "we are here");
}
// ---- Compose ----
#[test]
fn compose_chaining() {
let pipeline = Compose::new(vec![
Box::new(Strip),
Box::new(ToLower),
Box::new(RemovePunctuation),
Box::new(NormalizeSpaces),
]);
assert_eq!(pipeline.transform(" Hello, World! "), "hello world");
}
#[test]
fn compose_empty() {
let pipeline = Compose::new(vec![]);
assert_eq!(pipeline.transform("hello"), "hello");
}
#[test]
fn compose_single() {
let pipeline = Compose::new(vec![Box::new(ToLower)]);
assert_eq!(pipeline.transform("HELLO"), "hello");
}
#[test]
fn compose_empty_input() {
let pipeline = Compose::new(vec![Box::new(ToLower), Box::new(Strip)]);
assert_eq!(pipeline.transform(""), "");
}
// ---- ToSimplified / ToTraditional (only built with the `chinese-variant` feature) ----
#[cfg(feature = "chinese-variant")]
mod chinese_variant_tests {
use super::*;
#[test]
fn to_simplified_traditional_to_simplified() {
let t = ToSimplified;
assert_eq!(t.transform("繁體中文"), "繁体中文");
}
#[test]
fn to_simplified_mixed_text() {
let t = ToSimplified;
assert_eq!(t.transform("這是個測試"), "这是个测试");
}
#[test]
fn to_simplified_already_simplified() {
let t = ToSimplified;
assert_eq!(t.transform("简体中文"), "简体中文");
}
#[test]
fn to_simplified_empty() {
let t = ToSimplified;
assert_eq!(t.transform(""), "");
}
#[test]
fn to_simplified_with_punctuation() {
let t = ToSimplified;
assert_eq!(t.transform("你好,世界!"), "你好,世界!");
}
#[test]
fn to_traditional_simplified_to_traditional() {
let t = ToTraditional;
assert_eq!(t.transform("简体中文"), "簡體中文");
}
#[test]
fn to_traditional_already_traditional() {
let t = ToTraditional;
assert_eq!(t.transform("繁體中文"), "繁體中文");
}
#[test]
fn to_traditional_empty() {
let t = ToTraditional;
assert_eq!(t.transform(""), "");
}
#[test]
fn to_traditional_with_punctuation() {
let t = ToTraditional;
assert_eq!(t.transform("你好,世界!"), "你好,世界!");
}
// Simplified -> Traditional -> Simplified must give back the original text.
#[test]
fn roundtrip_simplified_traditional_simplified() {
let original = "简体中文测试";
let t = ToTraditional;
let traditional = t.transform(original);
let s = ToSimplified;
let back = s.transform(&traditional);
assert_eq!(back, original);
}
#[test]
fn compose_with_to_simplified() {
let pipeline = Compose::new(vec![Box::new(ToSimplified), Box::new(ToLower)]);
assert_eq!(pipeline.transform("繁體中文"), "繁体中文");
}
}
// ---- ChineseWordSegment (only built with the `chinese-word` feature) ----
#[cfg(feature = "chinese-word")]
mod chinese_word_tests {
use super::*;
// These tests check only loose properties of the output (non-empty, contains a
// space, contains a given word), not an exact segmentation.
#[test]
fn chinese_word_segment_basic() {
let t = ChineseWordSegment::new();
let result = t.transform("今天天气真好");
assert!(!result.is_empty());
assert!(result.contains(' '));
}
#[test]
fn chinese_word_segment_empty() {
let t = ChineseWordSegment::new();
assert_eq!(t.transform(""), "");
}
#[test]
fn chinese_word_segment_preserves_words() {
let t = ChineseWordSegment::new();
let result = t.transform("我们中出了一个叛徒");
assert!(result.contains("我们"));
}
#[test]
fn chinese_word_segment_default() {
let t = ChineseWordSegment::default();
let result = t.transform("你好世界");
assert!(!result.is_empty());
}
#[test]
fn chinese_word_segment_mixed_content() {
let t = ChineseWordSegment::new();
let result = t.transform("我喜欢rust编程");
assert!(!result.is_empty());
}
}
}