badwords_core/
processor.rs1use std::collections::HashMap;
4use std::path::Path;
5use unicode_normalization::UnicodeNormalization;
6
7use serde::Deserialize;
8
/// Configurable text clean-up pipeline plus the lookup tables it consults.
///
/// The four public flags choose which stages `process_text` runs. The private
/// tables start out empty (see `new`) and are filled by `load_from_dir` or
/// `load_from_str`.
#[derive(Debug, Clone)]
pub struct TextProcessor {
    /// Run the Unicode normalization/filter stage; underscores survive the
    /// filter only while `aggressive_normalize` is off.
    pub normalize_text: bool,
    /// Run the Unicode normalization/filter stage and also strip underscores.
    pub aggressive_normalize: bool,
    /// Run the transliteration stage (and parse its table when loading).
    pub transliterate: bool,
    /// Run the homoglyph replacement stage.
    pub replace_homoglyphs: bool,
    // Per-character substitutions applied during normalization.
    unicode_mappings: HashMap<char, char>,
    // Look-alike character -> standard character.
    homoglyph_map: HashMap<char, char>,
    // Cyrillic character -> Latin spelling.
    cyrillic_to_latin: HashMap<char, String>,
    // Latin spelling -> Cyrillic character (reverse of the table above).
    latin_to_cyrillic: HashMap<String, char>,
}
20
21#[derive(Debug, Deserialize)]
22struct UnicodeMappingsFile {
23 #[serde(flatten)]
24 categories: HashMap<String, HashMap<String, String>>,
25}
26
27#[derive(Debug, Deserialize)]
28struct HomoglyphsFile(HashMap<String, Vec<String>>);
29
30#[derive(Debug, Deserialize)]
31struct TransliterationFile {
32 cyrillic_to_latin: HashMap<String, String>,
33}
34
35#[derive(Debug, Deserialize)]
36#[allow(dead_code)]
37struct CharacterFrequencyFile(HashMap<String, Vec<String>>);
38
39impl TextProcessor {
40 pub fn new(
41 normalize_text: bool,
42 aggressive_normalize: bool,
43 transliterate: bool,
44 replace_homoglyphs: bool,
45 ) -> Self {
46 Self {
47 normalize_text,
48 aggressive_normalize,
49 transliterate,
50 replace_homoglyphs,
51 unicode_mappings: HashMap::new(),
52 homoglyph_map: HashMap::new(),
53 cyrillic_to_latin: HashMap::new(),
54 latin_to_cyrillic: HashMap::new(),
55 }
56 }
57
58 pub fn load_from_dir(&mut self, data_dir: &Path) -> Result<(), Box<dyn std::error::Error>> {
59 let unicode_content = std::fs::read_to_string(data_dir.join("unicode_mappings.json"))?;
60 let homoglyph_content = std::fs::read_to_string(data_dir.join("homoglyphs.json"))?;
61 let translit_content = std::fs::read_to_string(data_dir.join("transliteration.json"))?;
62 self.load_from_str(&unicode_content, &homoglyph_content, &translit_content)
63 }
64
65 pub fn load_from_str(
67 &mut self,
68 unicode_mappings_json: &str,
69 homoglyphs_json: &str,
70 transliteration_json: &str,
71 ) -> Result<(), Box<dyn std::error::Error>> {
72 self.unicode_mappings = Self::parse_unicode_mappings(unicode_mappings_json)?;
73 self.homoglyph_map = Self::parse_homoglyph_map(homoglyphs_json)?;
74
75 if self.transliterate {
76 let (cyrillic_to_latin, latin_to_cyrillic) =
77 Self::parse_transliteration(transliteration_json)?;
78 self.cyrillic_to_latin = cyrillic_to_latin;
79 self.latin_to_cyrillic = latin_to_cyrillic;
80 }
81
82 Ok(())
83 }
84
85 fn parse_unicode_mappings(content: &str) -> Result<HashMap<char, char>, Box<dyn std::error::Error>> {
86 let data: UnicodeMappingsFile = serde_json::from_str(content)?;
87
88 let mut mappings = HashMap::new();
89 for category in data.categories.values() {
90 for (k, v) in category {
91 if let (Some(from), Some(to)) = (k.chars().next(), v.chars().next()) {
92 mappings.insert(from, to);
93 }
94 }
95 }
96 Ok(mappings)
97 }
98
99 fn parse_homoglyph_map(content: &str) -> Result<HashMap<char, char>, Box<dyn std::error::Error>> {
100 let data: HomoglyphsFile = serde_json::from_str(content)?;
101
102 let mut map = HashMap::new();
103 for (standard, variants) in data.0 {
104 let standard_char = standard.chars().next();
105 if let Some(std_c) = standard_char {
106 for variant in variants {
107 if let Some(var_c) = variant.chars().next() {
108 map.insert(var_c, std_c);
109 }
110 }
111 }
112 }
113 Ok(map)
114 }
115
116 fn parse_transliteration(
117 content: &str,
118 ) -> Result<(HashMap<char, String>, HashMap<String, char>), Box<dyn std::error::Error>> {
119 let data: TransliterationFile = serde_json::from_str(content)?;
120
121 let mut cyrillic_to_latin = HashMap::new();
122 let mut latin_to_cyrillic = HashMap::new();
123
124 for (k, v) in data.cyrillic_to_latin {
125 if let Some(cyr) = k.chars().next() {
126 cyrillic_to_latin.insert(cyr, v.clone());
127 latin_to_cyrillic.insert(v, cyr);
128 }
129 }
130 Ok((cyrillic_to_latin, latin_to_cyrillic))
131 }
132
133 fn normalize_unicode_and_filter(&self, text: &str, allow_underscore: bool) -> String {
136 let text = text.nfkc().collect::<String>();
137 let filtered: String = text
138 .to_lowercase()
139 .chars()
140 .map(|c| *self.unicode_mappings.get(&c).unwrap_or(&c))
141 .filter(|c| {
142 c.is_alphanumeric()
143 || c.is_whitespace()
144 || (allow_underscore && *c == '_')
145 })
146 .collect();
147 filtered.split_whitespace().collect::<Vec<_>>().join(" ")
148 }
149
150 fn transliterate_char(&self, c: char, to_latin: bool) -> String {
151 if to_latin {
152 self.cyrillic_to_latin
153 .get(&c)
154 .cloned()
155 .unwrap_or_else(|| c.to_string())
156 } else {
157 let mut s = String::new();
158 s.push(c);
159 self.latin_to_cyrillic
160 .get(&s)
161 .map(|&cyr| cyr.to_string())
162 .unwrap_or_else(|| c.to_string())
163 }
164 }
165
166 fn transliterate(&self, text: &str, to_latin: bool) -> String {
167 if to_latin {
168 text.chars()
169 .map(|c| self.transliterate_char(c, true))
170 .collect::<Vec<_>>()
171 .join("")
172 } else {
173 let mut result = String::new();
174 let mut i = 0;
175 let chars: Vec<char> = text.chars().collect();
176 while i < chars.len() {
177 let mut matched = false;
178 for len in (1..=4).rev() {
179 if i + len <= chars.len() {
180 let chunk: String = chars[i..i + len].iter().collect();
181 if let Some(&cyr) = self.latin_to_cyrillic.get(&chunk) {
182 result.push(cyr);
183 i += len;
184 matched = true;
185 break;
186 }
187 }
188 }
189 if !matched {
190 result.push(chars[i]);
191 i += 1;
192 }
193 }
194 result
195 }
196 }
197
198 fn replace_homoglyphs(&self, text: &str) -> String {
199 text.chars()
200 .map(|c| *self.homoglyph_map.get(&c).unwrap_or(&c))
201 .collect()
202 }
203
/// Returns `true` if `text` contains at least one character from a Unicode
/// Cyrillic block.
///
/// Covers the basic block plus the Supplement and Extended-A/B/C/D blocks, so
/// text made only of non-basic Cyrillic letters is still recognized.
#[inline]
fn contains_cyrillic(text: &str) -> bool {
    text.chars().any(|c| {
        matches!(
            c,
            '\u{0400}'..='\u{052F}'       // Cyrillic + Cyrillic Supplement
                | '\u{1C80}'..='\u{1C8F}' // Cyrillic Extended-C
                | '\u{2DE0}'..='\u{2DFF}' // Cyrillic Extended-A
                | '\u{A640}'..='\u{A69F}' // Cyrillic Extended-B
                | '\u{1E030}'..='\u{1E08F}' // Cyrillic Extended-D
        )
    })
}
208
209 pub fn process_text(&self, text: &str) -> String {
210 let mut txt = if self.normalize_text || self.aggressive_normalize {
211 let allow_underscore = self.normalize_text && !self.aggressive_normalize;
212 self.normalize_unicode_and_filter(text, allow_underscore)
213 } else {
214 text.to_string()
215 };
216
217 if self.transliterate {
218 if Self::contains_cyrillic(&txt) {
219 txt = self.transliterate(&txt, true);
220 }
221 txt = self.transliterate(&txt, false);
222 }
223 if self.replace_homoglyphs {
224 txt = self.replace_homoglyphs(&txt);
225 }
226 txt
227 }
228}