use pinyin::ToPinyin;
use std::collections::HashMap;
/// Lookup tables used to spot disguised spellings of registered words,
/// either by pronunciation (pinyin) or by visually similar characters.
pub struct VariantDetector {
    /// Concatenated plain pinyin of a word, mapped to the words that produce it.
    pinyin_map: HashMap<String, Vec<String>>,
    /// A character mapped to the look-alike characters accepted in its place.
    shape_map: HashMap<char, Vec<char>>,
    /// Per-character cache of plain (tone-less) pinyin.
    char_to_pinyin: HashMap<char, String>,
}
impl Default for VariantDetector {
fn default() -> Self {
Self::new()
}
}
impl VariantDetector {
    /// Creates a detector with empty pinyin tables and the built-in look-alike table.
    pub fn new() -> Self {
        Self {
            pinyin_map: HashMap::new(),
            shape_map: Self::build_shape_map(),
            char_to_pinyin: HashMap::new(),
        }
    }

    /// Registers `word`.
    ///
    /// The plain pinyin of every character that has one is cached, and the word is
    /// filed under the concatenation of those pinyin strings. Characters without a
    /// pinyin reading are skipped; a word with no such characters is not recorded.
    /// Registering the same word again does not create a duplicate entry.
    pub fn add_word(&mut self, word: &str) {
        let mut pinyin_key = String::new();
        for c in word.chars() {
            if let Some(py) = c.to_pinyin() {
                let plain = py.plain();
                pinyin_key.push_str(plain);
                self.char_to_pinyin
                    .entry(c)
                    .or_insert_with(|| plain.to_string());
            }
        }
        if pinyin_key.is_empty() {
            return;
        }
        let words = self.pinyin_map.entry(pinyin_key).or_default();
        if !words.iter().any(|known| known == word) {
            words.push(word.to_string());
        }
    }

    /// Returns the entries of `original_words` that occur in `text`, either by
    /// pronunciation or through look-alike characters.
    ///
    /// A word is reported when its pinyin rendering is a substring of the pinyin
    /// rendering of `text`, or when some run of characters in `text` equals the word
    /// character by character, allowing the look-alike substitutions from the shape
    /// table. A word that appears verbatim therefore is reported as well. Empty words
    /// are never reported.
    ///
    /// The result is sorted and free of duplicates, and borrows only from
    /// `original_words`, not from the detector.
    pub fn detect<'a>(&self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
        let mut variants = self.detect_pinyin_variants(text, original_words);
        variants.extend(self.detect_shape_variants(text, original_words));
        variants.sort_unstable();
        variants.dedup();
        variants
    }

    /// Selects the words whose pinyin rendering is contained in that of `text`.
    ///
    /// Words and text go through the same rendering, so a word matches whether or
    /// not it was registered with `add_word` beforehand.
    ///
    /// NOTE: pinyin is concatenated without separators, so a match may straddle
    /// syllable boundaries (e.g. "an" is found inside "dan").
    fn detect_pinyin_variants<'a>(&self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
        let text_pinyin = self.text_to_pinyin(text);
        original_words
            .iter()
            .copied()
            .filter(|word| {
                // An empty pattern is contained in every string; never report it.
                !word.is_empty() && text_pinyin.contains(self.text_to_pinyin(word).as_str())
            })
            .collect()
    }

    /// Renders `text` as concatenated plain pinyin.
    ///
    /// Characters without a pinyin reading are copied through unchanged, which lets
    /// text that is already romanized line up with the rendering of a word.
    fn text_to_pinyin(&self, text: &str) -> String {
        let mut rendered = String::with_capacity(text.len());
        for c in text.chars() {
            self.append_pinyin(c, &mut rendered);
        }
        rendered
    }

    /// Appends the plain pinyin of `c` to `out`: the cached value if present,
    /// otherwise a fresh lookup, otherwise the character itself.
    fn append_pinyin(&self, c: char, out: &mut String) {
        if let Some(cached) = self.char_to_pinyin.get(&c) {
            out.push_str(cached);
        } else if let Some(py) = c.to_pinyin() {
            out.push_str(py.plain());
        } else {
            out.push(c);
        }
    }

    /// Selects the words that occur in `text` up to look-alike substitutions.
    fn detect_shape_variants<'a>(&self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
        original_words
            .iter()
            .copied()
            .filter(|word| self.is_shape_variant(text, word))
            .collect()
    }

    /// Reports whether some run of characters in `text` spells `word`, where each
    /// position holds either the word's own character or one of its look-alikes.
    ///
    /// An empty `word` never matches.
    fn is_shape_variant(&self, text: &str, word: &str) -> bool {
        let word_chars: Vec<char> = word.chars().collect();
        if word_chars.is_empty() {
            // `windows(0)` panics, and an empty word carries no information anyway.
            return false;
        }
        let text_chars: Vec<char> = text.chars().collect();
        // `windows` yields nothing when the text is shorter than the word.
        text_chars.windows(word_chars.len()).any(|window| {
            window.iter().zip(&word_chars).all(|(&tc, &wc)| {
                tc == wc
                    || self
                        .shape_map
                        .get(&wc)
                        .is_some_and(|variants| variants.contains(&tc))
            })
        })
    }

    /// Builds the built-in table mapping a character to its look-alike characters.
    fn build_shape_map() -> HashMap<char, Vec<char>> {
        HashMap::from([
            ('赌', vec!['渧', '睹', '堵']),
            ('博', vec!['搏', '傅', '膊']),
            ('有', vec!['友', '右']),
            ('色', vec!['涩']),
            ('情', vec!['请', '清']),
        ])
    }
}