// sensitive_rs/variant/mod.rs
use pinyin::ToPinyin;
use std::collections::HashMap;
3
/// Detects disguised spellings of registered words, either through matching
/// pinyin romanisation or through per-character substitutions listed in a
/// built-in shape table.
#[derive(Debug, Clone)]
pub struct VariantDetector {
    /// Concatenated plain pinyin of a registered word -> the registered words
    /// that produce that key.
    pinyin_map: HashMap<String, Vec<String>>,
    /// Canonical character -> characters accepted in its place during shape
    /// matching.
    shape_map: HashMap<char, Vec<char>>,
    /// Cache of the plain pinyin for every character seen while registering
    /// words.
    char_to_pinyin: HashMap<char, String>,
}
10
11impl Default for VariantDetector {
12 fn default() -> Self {
13 Self::new()
14 }
15}
16
17impl VariantDetector {
18 pub fn new() -> Self {
20 VariantDetector {
21 pinyin_map: HashMap::new(),
22 shape_map: Self::build_shape_map(),
23 char_to_pinyin: HashMap::new(),
24 }
25 }
26
27 pub fn add_word(&mut self, word: &str) {
29 let pinyins: Vec<String> = word
30 .chars()
31 .filter_map(|c| {
32 if let Some(py) = c.to_pinyin() {
33 let pinyin = py.plain().to_string();
34 self.char_to_pinyin.insert(c, pinyin.clone());
36 Some(pinyin)
37 } else {
38 None
40 }
41 })
42 .collect();
43
44 if !pinyins.is_empty() {
45 let pinyin_key = pinyins.join("");
46 self.pinyin_map.entry(pinyin_key).or_default().push(word.to_string());
47 }
48 }
49
50 pub fn detect<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
52 let mut variants = Vec::new();
53
54 variants.extend(self.detect_pinyin_variants(text, original_words));
56
57 variants.extend(self.detect_shape_variants(text, original_words));
59
60 variants.sort_unstable();
61 variants.dedup();
62 variants
63 }
64
65 fn detect_pinyin_variants<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
67 let text_pinyin = self.text_to_pinyin(text);
68
69 original_words
70 .iter()
71 .filter(|&&word| {
72 let word_pinyin: String = word
74 .chars()
75 .map(|c| {
76 self.char_to_pinyin.get(&c).cloned().unwrap_or_else(|| c.to_string())
77 })
79 .collect();
80
81 text_pinyin.contains(&word_pinyin)
82 })
83 .copied()
84 .collect()
85 }
86
87 fn text_to_pinyin(&self, text: &str) -> String {
89 text.chars()
90 .map(|c| {
91 self.char_to_pinyin.get(&c).cloned().unwrap_or_else(|| {
92 if let Some(py) = c.to_pinyin() {
94 py.plain().to_string()
95 } else {
96 c.to_string() }
98 })
99 })
100 .collect()
101 }
102
103 fn detect_shape_variants<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
105 original_words.iter().filter(|&&word| self.is_shape_variant(text, word)).copied().collect()
106 }
107
108 fn is_shape_variant(&self, text: &str, word: &str) -> bool {
110 let text_chars: Vec<char> = text.chars().collect();
111 let word_chars: Vec<char> = word.chars().collect();
112
113 if text_chars.len() != word_chars.len() {
114 return false;
115 }
116
117 text_chars
118 .iter()
119 .zip(word_chars.iter())
120 .all(|(&tc, &wc)| tc == wc || self.shape_map.get(&wc).is_some_and(|variants| variants.contains(&tc)))
121 }
122
123 fn build_shape_map() -> HashMap<char, Vec<char>> {
125 let mut map = HashMap::new();
126 map.insert('赌', vec!['渧', '睹', '堵']);
128 map.insert('博', vec!['搏', '傅', '膊']);
129 map.insert('有', vec!['友', '右']);
130 map.insert('色', vec!['涩']);
131 map.insert('情', vec!['请', '清']);
132 map
133 }
134}