use unicode_normalization::UnicodeNormalization;
/// Returns `true` if `c` is a katakana character.
///
/// Two Unicode ranges are accepted:
/// * U+30A1..=U+30FF — the Katakana block, starting at small `ァ`
///   (U+30A0, the double hyphen, is excluded);
/// * U+31F0..=U+31FF — Katakana Phonetic Extensions, the small kana used
///   when writing Ainu.
///
/// The code points between those ranges (U+3100..=U+31EF) belong to Bopomofo,
/// Hangul Compatibility Jamo, Kanbun, Bopomofo Extended and CJK Strokes, so
/// they are deliberately rejected.
pub fn is_katakana(c: char) -> bool {
    matches!(c, '\u{30A1}'..='\u{30FF}' | '\u{31F0}'..='\u{31FF}')
}
/// Returns `true` if `c` lies in the basic Cyrillic block (U+0400..=U+04FF).
///
/// Only that single block is checked; Cyrillic letters encoded outside it
/// (for example in the Cyrillic Supplement) yield `false`.
pub fn is_cyrillic(c: char) -> bool {
    matches!(c, '\u{0400}'..='\u{04FF}')
}
/// Strips every acute accent from `text` and returns the result in NFC form.
///
/// The input is first decomposed (NFD) so that precomposed letters such as
/// `á` expose their combining acute accent (U+0301) as a separate code point.
/// Those code points are dropped, and the remainder is recomposed (NFC), which
/// keeps any other diacritics attached to their base letters.
pub fn remove_acute_accent(text: &str) -> String {
    const COMBINING_ACUTE: char = '\u{0301}';

    let decomposed = text.nfd();
    let without_acute = decomposed.filter(|&ch| ch != COMBINING_ACUTE);
    without_acute.nfc().collect()
}
// Exercises `remove_acute_accent` on plain accented letters and on letters
// that carry both a macron and an acute accent (only the acute must go).
// NOTE(review): the first two inputs render identically; presumably one is
// precomposed and the other uses a combining accent — confirm the exact bytes
// before editing these literals.
#[test]
fn test_remove_accent() {
assert_eq!(remove_acute_accent("á"), "a");
assert_eq!(remove_acute_accent("á"), "a");
assert_eq!(remove_acute_accent("cápe"), "cape");
// The macron survives; only the acute accent is removed.
assert_eq!(remove_acute_accent("pā́"), "pā");
assert_eq!(remove_acute_accent("cá̄"), "cā");
}
/// Extension trait for breaking a piece of text into owned string segments.
///
/// The implementations in this file split the text into consecutive runs of
/// characters that are all letters or all non-letters (as decided by
/// `is_ainu_letter`), so concatenating the returned segments reproduces the
/// original text.
pub trait SplitIntoWords {
/// Returns the segments of `self`, in order, as owned `String`s.
fn split_into_words(&self) -> Vec<String>;
}
impl SplitIntoWords for String {
    /// Splits the string into maximal runs of letters and non-letters.
    ///
    /// Each character is classified with `is_ainu_letter`; a new segment
    /// starts whenever that classification flips. Nothing is discarded, so
    /// whitespace and punctuation come back as their own segments and joining
    /// the result yields the original string. An empty string produces an
    /// empty vector.
    fn split_into_words(&self) -> Vec<String> {
        let mut chars = self.chars();
        // Pulling the first character doubles as the emptiness check, so no
        // separate `is_empty` test or `unwrap` is needed.
        let first = match chars.next() {
            Some(c) => c,
            None => return Vec::new(),
        };

        let mut words = Vec::new();
        let mut current = first.to_string();
        let mut current_is_letter = first.is_ainu_letter();

        for c in chars {
            let is_letter = c.is_ainu_letter();
            if is_letter != current_is_letter {
                // Move the finished run into the output instead of cloning
                // it; `take` leaves an empty `String` behind for the next run.
                words.push(std::mem::take(&mut current));
                current_is_letter = is_letter;
            }
            current.push(c);
        }

        // The last run is never terminated by a classification flip.
        words.push(current);
        words
    }
}
impl SplitIntoWords for &str {
    /// Splits the slice exactly like the `String` implementation does.
    ///
    /// The slice is copied into an owned `String` and the work is delegated
    /// to that implementation.
    fn split_into_words(&self) -> Vec<String> {
        let owned: String = String::from(*self);
        owned.split_into_words()
    }
}
// Mixed Latin / CJK text: letter runs and non-letter runs (punctuation and
// spaces, grouped together when adjacent) must alternate in the output.
#[test]
fn test_split_into_words() {
    let words = String::from("Hello、世界! This is Rust.").split_into_words();
    let expected = vec!["Hello", "、", "世界", "! ", "This", " ", "is", " ", "Rust", "."];
    assert_eq!(words, expected);
}
/// Classifies a value as a letter or a non-letter for word splitting.
pub trait IsLetter {
    /// Returns `true` when the value should be treated as part of a word.
    fn is_ainu_letter(&self) -> bool;
}

impl IsLetter for char {
    /// A character is a letter when it is Unicode-alphabetic or is one of a
    /// few extra characters that may appear inside a word: the kana
    /// voiced / semi-voiced sound marks (combining U+3099 and U+309A, spacing
    /// U+309B and U+309C, halfwidth U+FF9E and U+FF9F), the two apostrophe
    /// characters in the list below, and `=`.
    fn is_ainu_letter(&self) -> bool {
        // Non-alphabetic characters that still belong to a word.
        const EXTRA_LETTERS: &str = "\u{3099}\u{309A}\u{309B}\u{309C}\u{FF9E}\u{FF9F}’'=";

        let ch = *self;
        if ch.is_alphabetic() {
            return true;
        }
        EXTRA_LETTERS.contains(ch)
    }
}
// Spot-checks `is_ainu_letter` on one character per script plus an emoji.
#[test]
fn test_is_ainu_letter() {
    // Latin, katakana and hiragana characters all count as letters.
    assert!('a'.is_ainu_letter());
    assert!('ア'.is_ainu_letter());
    assert!('あ'.is_ainu_letter());
    // An emoji does not.
    assert!(!'🐱'.is_ainu_letter());
}