Skip to main content

badwords_core/
processor.rs

1//! Text processing: normalization, transliteration, homoglyphs.
2
3use std::collections::HashMap;
4use std::path::Path;
5use unicode_normalization::UnicodeNormalization;
6
7use serde::Deserialize;
8
/// Configurable text clean-up pipeline: normalization, transliteration, homoglyphs.
///
/// The four public flags select which steps `process_text` runs. The private lookup
/// tables start empty in `new` and are filled by `load_from_dir` / `load_from_str`.
#[derive(Debug, Clone)]
pub struct TextProcessor {
    /// Enables the normalization pass. When set without `aggressive_normalize`,
    /// underscores survive the character filter.
    pub normalize_text: bool,
    /// Also enables the normalization pass, and additionally drops underscores.
    pub aggressive_normalize: bool,
    /// Enables the transliteration passes in `process_text`; `load_from_str` only
    /// parses the transliteration table when this is set.
    pub transliterate: bool,
    /// Enables the final homoglyph replacement pass in `process_text`.
    pub replace_homoglyphs: bool,
    /// Per-character substitutions applied during normalization.
    unicode_mappings: HashMap<char, char>,
    /// Variant (look-alike) character -> standard character.
    homoglyph_map: HashMap<char, char>,
    /// Cyrillic character -> Latin replacement string.
    cyrillic_to_latin: HashMap<char, String>,
    /// Latin string -> Cyrillic character; built as the inverse of `cyrillic_to_latin`.
    latin_to_cyrillic: HashMap<String, char>,
}
20
/// Deserialized shape of the unicode mappings JSON document.
///
/// Every top-level key is collected (via `flatten`) as a category name whose value is
/// a string -> string table. The loader keeps only the first character of each table
/// key and value.
#[derive(Debug, Deserialize)]
struct UnicodeMappingsFile {
    #[serde(flatten)]
    categories: HashMap<String, HashMap<String, String>>,
}
26
/// Deserialized shape of the homoglyphs JSON document: standard character (as a
/// string) -> list of variant strings that should be replaced by it.
#[derive(Debug, Deserialize)]
struct HomoglyphsFile(HashMap<String, Vec<String>>);
29
/// Deserialized shape of the transliteration JSON document.
#[derive(Debug, Deserialize)]
struct TransliterationFile {
    /// Cyrillic character (as a string) -> Latin replacement string. The loader also
    /// derives the reverse Latin -> Cyrillic table from these entries.
    cyrillic_to_latin: HashMap<String, String>,
}
34
/// Deserialized shape of a string -> list-of-strings JSON document.
///
/// NOTE(review): presumably a character-frequency data file, judging by the name only;
/// confirm against the data directory.
#[derive(Debug, Deserialize)]
// Not referenced anywhere in the visible part of this file, hence the lint allowance.
// NOTE(review): confirm whether this type is still needed.
#[allow(dead_code)]
struct CharacterFrequencyFile(HashMap<String, Vec<String>>);
38
39impl TextProcessor {
40    pub fn new(
41        normalize_text: bool,
42        aggressive_normalize: bool,
43        transliterate: bool,
44        replace_homoglyphs: bool,
45    ) -> Self {
46        Self {
47            normalize_text,
48            aggressive_normalize,
49            transliterate,
50            replace_homoglyphs,
51            unicode_mappings: HashMap::new(),
52            homoglyph_map: HashMap::new(),
53            cyrillic_to_latin: HashMap::new(),
54            latin_to_cyrillic: HashMap::new(),
55        }
56    }
57
58    pub fn load_from_dir(&mut self, data_dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
59        let unicode_content = std::fs::read_to_string(data_dir.join("unicode_mappings.json"))?;
60        let homoglyph_content = std::fs::read_to_string(data_dir.join("homoglyphs.json"))?;
61        let translit_content = std::fs::read_to_string(data_dir.join("transliteration.json"))?;
62        self.load_from_str(&unicode_content, &homoglyph_content, &translit_content)
63    }
64
65    /// Load processor data from string content (no filesystem). Use for WASM/embedded.
66    pub fn load_from_str(
67        &mut self,
68        unicode_mappings_json: &str,
69        homoglyphs_json: &str,
70        transliteration_json: &str,
71    ) -> Result<(), Box<dyn std::error::Error>> {
72        self.unicode_mappings = Self::parse_unicode_mappings(unicode_mappings_json)?;
73        self.homoglyph_map = Self::parse_homoglyph_map(homoglyphs_json)?;
74
75        if self.transliterate {
76            let (cyrillic_to_latin, latin_to_cyrillic) =
77                Self::parse_transliteration(transliteration_json)?;
78            self.cyrillic_to_latin = cyrillic_to_latin;
79            self.latin_to_cyrillic = latin_to_cyrillic;
80        }
81
82        Ok(())
83    }
84
85    fn parse_unicode_mappings(content: &str) -> Result<HashMap<char, char>, Box<dyn std::error::Error>> {
86        let data: UnicodeMappingsFile = serde_json::from_str(content)?;
87
88        let mut mappings = HashMap::new();
89        for category in data.categories.values() {
90            for (k, v) in category {
91                if let (Some(from), Some(to)) = (k.chars().next(), v.chars().next()) {
92                    mappings.insert(from, to);
93                }
94            }
95        }
96        Ok(mappings)
97    }
98
99    fn parse_homoglyph_map(content: &str) -> Result<HashMap<char, char>, Box<dyn std::error::Error>> {
100        let data: HomoglyphsFile = serde_json::from_str(content)?;
101
102        let mut map = HashMap::new();
103        for (standard, variants) in data.0 {
104            let standard_char = standard.chars().next();
105            if let Some(std_c) = standard_char {
106                for variant in variants {
107                    if let Some(var_c) = variant.chars().next() {
108                        map.insert(var_c, std_c);
109                    }
110                }
111            }
112        }
113        Ok(map)
114    }
115
116    fn parse_transliteration(
117        content: &str,
118    ) -> Result<(HashMap<char, String>, HashMap<String, char>), Box<dyn std::error::Error>> {
119        let data: TransliterationFile = serde_json::from_str(content)?;
120
121        let mut cyrillic_to_latin = HashMap::new();
122        let mut latin_to_cyrillic = HashMap::new();
123
124        for (k, v) in data.cyrillic_to_latin {
125            if let Some(cyr) = k.chars().next() {
126                cyrillic_to_latin.insert(cyr, v.clone());
127                latin_to_cyrillic.insert(v, cyr);
128            }
129        }
130        Ok((cyrillic_to_latin, latin_to_cyrillic))
131    }
132
133    /// Single-pass normalization: unicode mappings + filter + collapse whitespace.
134    /// Merges normalize_text and aggressive_normalize to avoid duplicate passes.
135    fn normalize_unicode_and_filter(&self, text: &str, allow_underscore: bool) -> String {
136        let text = text.nfkc().collect::<String>();
137        let filtered: String = text
138            .to_lowercase()
139            .chars()
140            .map(|c| *self.unicode_mappings.get(&c).unwrap_or(&c))
141            .filter(|c| {
142                c.is_alphanumeric()
143                    || c.is_whitespace()
144                    || (allow_underscore && *c == '_')
145            })
146            .collect();
147        filtered.split_whitespace().collect::<Vec<_>>().join(" ")
148    }
149
150    fn transliterate_char(&self, c: char, to_latin: bool) -> String {
151        if to_latin {
152            self.cyrillic_to_latin
153                .get(&c)
154                .cloned()
155                .unwrap_or_else(|| c.to_string())
156        } else {
157            let mut s = String::new();
158            s.push(c);
159            self.latin_to_cyrillic
160                .get(&s)
161                .map(|&cyr| cyr.to_string())
162                .unwrap_or_else(|| c.to_string())
163        }
164    }
165
166    fn transliterate(&self, text: &str, to_latin: bool) -> String {
167        if to_latin {
168            text.chars()
169                .map(|c| self.transliterate_char(c, true))
170                .collect::<Vec<_>>()
171                .join("")
172        } else {
173            let mut result = String::new();
174            let mut i = 0;
175            let chars: Vec<char> = text.chars().collect();
176            while i < chars.len() {
177                let mut matched = false;
178                for len in (1..=4).rev() {
179                    if i + len <= chars.len() {
180                        let chunk: String = chars[i..i + len].iter().collect();
181                        if let Some(&cyr) = self.latin_to_cyrillic.get(&chunk) {
182                            result.push(cyr);
183                            i += len;
184                            matched = true;
185                            break;
186                        }
187                    }
188                }
189                if !matched {
190                    result.push(chars[i]);
191                    i += 1;
192                }
193            }
194            result
195        }
196    }
197
198    fn replace_homoglyphs(&self, text: &str) -> String {
199        text.chars()
200            .map(|c| *self.homoglyph_map.get(&c).unwrap_or(&c))
201            .collect()
202    }
203
204    #[inline]
205    fn contains_cyrillic(text: &str) -> bool {
206        text.chars().any(|c| ('\u{0400}'..='\u{04FF}').contains(&c))
207    }
208
209    pub fn process_text(&self, text: &str) -> String {
210        let mut txt = if self.normalize_text || self.aggressive_normalize {
211            let allow_underscore = self.normalize_text && !self.aggressive_normalize;
212            self.normalize_unicode_and_filter(text, allow_underscore)
213        } else {
214            text.to_string()
215        };
216
217        if self.transliterate {
218            if Self::contains_cyrillic(&txt) {
219                txt = self.transliterate(&txt, true);
220            }
221            txt = self.transliterate(&txt, false);
222        }
223        if self.replace_homoglyphs {
224            txt = self.replace_homoglyphs(&txt);
225        }
226        txt
227    }
228}