sensitive_rs/variant/
mod.rs

1use pinyin::ToPinyin;
2use std::collections::HashMap;
3
/// Detector for disguised spellings ("variants") of sensitive words.
///
/// Two kinds of variants are recognised: text whose pinyin matches the pinyin
/// of a sensitive word, and text in which characters were swapped for
/// similar-looking ones.
#[derive(Debug, Clone)]
pub struct VariantDetector {
    /// Maps a pinyin key to the original words indexed under it.
    pinyin_map: HashMap<String, Vec<String>>,
    /// Maps a character to the characters that look similar to it.
    shape_map: HashMap<char, Vec<char>>,
    /// Maps a character to its pinyin.
    char_to_pinyin: HashMap<char, String>,
}
10
/// `Default` is provided by delegating to [`VariantDetector::new`].
impl Default for VariantDetector {
    /// Returns the same detector that [`VariantDetector::new`] builds.
    fn default() -> Self {
        Self::new()
    }
}
16
17impl VariantDetector {
18    /// Create a new detector
19    pub fn new() -> Self {
20        VariantDetector {
21            pinyin_map: HashMap::new(),
22            shape_map: Self::build_shape_map(),
23            char_to_pinyin: HashMap::new(),
24        }
25    }
26
27    /// Construct pinyin index when adding sensitive words
28    pub fn add_word(&mut self, word: &str) {
29        let pinyins: Vec<String> = word
30            .chars()
31            .filter_map(|c| {
32                if let Some(py) = c.to_pinyin() {
33                    let pinyin = py.plain().to_string();
34                    // Create a character to pinyin mapping
35                    self.char_to_pinyin.insert(c, pinyin.clone());
36                    Some(pinyin)
37                } else {
38                    // For characters that cannot be converted, return None
39                    None
40                }
41            })
42            .collect();
43
44        if !pinyins.is_empty() {
45            let pinyin_key = pinyins.join("");
46            self.pinyin_map.entry(pinyin_key).or_default().push(word.to_string());
47        }
48    }
49
50    /// Detect variants in text
51    pub fn detect<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
52        let mut variants = Vec::new();
53
54        // 1. Detect pinyin variants
55        variants.extend(self.detect_pinyin_variants(text, original_words));
56
57        // 2. Detect shape-near-word variant
58        variants.extend(self.detect_shape_variants(text, original_words));
59
60        variants.sort_unstable();
61        variants.dedup();
62        variants
63    }
64
65    /// Detect pinyin variants
66    fn detect_pinyin_variants<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
67        let text_pinyin = self.text_to_pinyin(text);
68
69        original_words
70            .iter()
71            .filter(|&&word| {
72                // Construct the pinyin of the word
73                let word_pinyin: String = word
74                    .chars()
75                    .map(|c| {
76                        self.char_to_pinyin.get(&c).cloned().unwrap_or_else(|| c.to_string())
77                        // Safe processing: Return original characters
78                    })
79                    .collect();
80
81                text_pinyin.contains(&word_pinyin)
82            })
83            .copied()
84            .collect()
85    }
86
87    /// Convert text to pinyin
88    fn text_to_pinyin(&self, text: &str) -> String {
89        text.chars()
90            .map(|c| {
91                self.char_to_pinyin.get(&c).cloned().unwrap_or_else(|| {
92                    // Convert uncached characters in real time
93                    if let Some(py) = c.to_pinyin() {
94                        py.plain().to_string()
95                    } else {
96                        c.to_string() // Keep original characters
97                    }
98                })
99            })
100            .collect()
101    }
102
103    /// Detect shape-near-word variant
104    fn detect_shape_variants<'a>(&'a self, text: &str, original_words: &[&'a str]) -> Vec<&'a str> {
105        original_words.iter().filter(|&&word| self.is_shape_variant(text, word)).copied().collect()
106    }
107
108    /// Determine whether it is a variant of the shape and character
109    fn is_shape_variant(&self, text: &str, word: &str) -> bool {
110        let text_chars: Vec<char> = text.chars().collect();
111        let word_chars: Vec<char> = word.chars().collect();
112
113        if text_chars.len() != word_chars.len() {
114            return false;
115        }
116
117        text_chars
118            .iter()
119            .zip(word_chars.iter())
120            .all(|(&tc, &wc)| tc == wc || self.shape_map.get(&wc).is_some_and(|variants| variants.contains(&tc)))
121    }
122
123    /// Constructing a shape-size-word mapping table
124    fn build_shape_map() -> HashMap<char, Vec<char>> {
125        let mut map = HashMap::new();
126        // Example: Add some common characters
127        map.insert('赌', vec!['渧', '睹', '堵']);
128        map.insert('博', vec!['搏', '傅', '膊']);
129        map.insert('有', vec!['友', '右']);
130        map.insert('色', vec!['涩']);
131        map.insert('情', vec!['请', '清']);
132        map
133    }
134}