1use std::collections::HashMap;
2use unicode_normalization::UnicodeNormalization;
3
/// Reports whether `ch` lies in one of the East Asian code-point ranges that
/// switch the tokenizer into per-character mode.
///
/// Besides ideographs, kana and Hangul, the accepted ranges also cover CJK
/// punctuation (including the ideographic space U+3000) and the
/// halfwidth/fullwidth forms block, so fullwidth Latin letters count as well.
fn is_cjk_char(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x1100..=0x11FF       // Hangul Jamo
            | 0x3000..=0x303F // CJK Symbols and Punctuation
            | 0x3040..=0x309F // Hiragana
            | 0x30A0..=0x30FF // Katakana
            | 0x3400..=0x4DBF // CJK Unified Ideographs Extension A
            | 0x4E00..=0x9FFF // CJK Unified Ideographs
            | 0xAC00..=0xD7AF // Hangul Syllables
            | 0xF900..=0xFAFF // CJK Compatibility Ideographs
            | 0xFF00..=0xFFEF // Halfwidth and Fullwidth Forms
    )
}
17
/// Builds a vocabulary from tokenized sentences.
///
/// Ids are handed out in order of first appearance, starting at 0.
///
/// Returns `(word -> id, id -> word)`; the vector index of a word equals the
/// id stored for it in the map.
pub fn build_vocab(sentences: &[Vec<String>]) -> (HashMap<String, usize>, Vec<String>) {
    let mut word_to_id: HashMap<String, usize> = HashMap::new();
    let mut id_to_word: Vec<String> = Vec::new();

    for token in sentences.iter().flatten() {
        if word_to_id.contains_key(token) {
            continue;
        }
        // The next free id is always the current length of the reverse table.
        word_to_id.insert(token.clone(), id_to_word.len());
        id_to_word.push(token.clone());
    }

    (word_to_id, id_to_word)
}
36
/// Builds a vocabulary together with per-word occurrence counts.
///
/// Ids are assigned in order of first appearance, starting at 0.
///
/// Returns `(word -> id, id -> word, id -> count)`; the two vectors are
/// indexed by the ids stored in the map.
pub fn build_vocab_with_freq(
    sentences: &[Vec<String>],
) -> (HashMap<String, usize>, Vec<String>, Vec<usize>) {
    let mut index_of: HashMap<String, usize> = HashMap::new();
    let mut words: Vec<String> = Vec::new();
    let mut counts: Vec<usize> = Vec::new();

    for token in sentences.iter().flatten() {
        match index_of.get(token) {
            // Seen before: bump the counter stored at the word's id.
            Some(&slot) => counts[slot] += 1,
            // First sighting: the new id is the current table length.
            None => {
                index_of.insert(token.clone(), words.len());
                words.push(token.clone());
                counts.push(1);
            }
        }
    }

    (index_of, words, counts)
}
59
/// Configuration for turning raw text into sentences of word tokens.
///
/// Each flag enables one cleaning step in `process_text` / `process_word`.
#[derive(Clone, Debug)]
pub struct TextProcessor {
    /// Lowercase every token.
    pub lowercase: bool,
    /// Keep only alphabetic, numeric and whitespace characters in a token.
    pub remove_punctuation: bool,
    /// Drop ASCII digits (`0`-`9`) from every token.
    pub remove_numbers: bool,
    /// NOTE(review): no code visible in this file reads this flag.
    pub remove_stop_words: bool,
    /// Strip `<...>` markup from the text before tokenizing.
    pub remove_html: bool,
    /// Drop whitespace-separated tokens starting with `http://`, `https://`
    /// or `www.` before tokenizing.
    pub remove_urls: bool,
    /// Expand English contractions (e.g. "don't" -> "do not") per token.
    pub expand_contractions: bool,
    /// NOTE(review): `process_text` applies NFC normalization unconditionally
    /// and does not read this flag.
    pub normalize_unicode: bool,
    /// Language tag for the text. NOTE(review): not read by any code visible
    /// in this file.
    pub language: String,
}
76
77impl Default for TextProcessor {
78 fn default() -> Self {
79 Self {
80 lowercase: true,
81 remove_punctuation: true,
82 remove_numbers: false,
83 remove_stop_words: false,
84 remove_html: false,
85 remove_urls: false,
86 expand_contractions: false,
87 normalize_unicode: false,
88 language: "en".to_string(),
89 }
90 }
91}
92
93impl TextProcessor {
94 pub fn process_text(&self, text: &str) -> Vec<Vec<String>> {
96 let mut text = text.nfc().collect::<String>();
97
98 if self.remove_html {
100 text = Self::strip_html(&text);
101 }
102
103 if self.remove_urls {
105 text = Self::strip_urls(&text);
106 }
107
108 let mut sentences = Vec::new();
109 let has_cjk = text.chars().any(is_cjk_char);
110
111 let delimiters: &[char] = if has_cjk {
112 &['.', '!', '?', '\n', '\u{3002}', '\u{FF01}', '\u{FF1F}', ';']
113 } else {
114 &['.', '!', '?', '\n']
115 };
116
117 for sentence in text.split(delimiters) {
118 let trimmed = sentence.trim();
119 if trimmed.is_empty() {
120 continue;
121 }
122
123 let mut processed_words = Vec::new();
124
125 if has_cjk {
126 for ch in trimmed.chars() {
128 if ch.is_whitespace() {
129 continue;
130 }
131 let s = ch.to_string();
132 let processed = self.process_word(&s);
133 if !processed.is_empty() {
134 processed_words.push(processed);
135 }
136 }
137 } else {
138 for word in trimmed.split_whitespace() {
140 let processed_word = self.process_word(word);
141 if !processed_word.is_empty() {
142 for subword in processed_word.split_whitespace() {
143 processed_words.push(subword.to_string());
144 }
145 }
146 }
147 }
148
149 if !processed_words.is_empty() {
150 sentences.push(processed_words);
151 }
152 }
153
154 sentences
155 }
156
/// Removes HTML-style tags (`<...>`) from `text` and returns what is left.
///
/// A `<` opens a tag only when the next character is an ASCII letter, `/`,
/// `!` or `?` (element, end tag, comment/doctype, processing instruction).
/// Any other `<`, and any `>` outside a tag, is ordinary text and is kept,
/// so input such as `a < b` or `1 > 0` passes through unchanged.
///
/// This is a lightweight scanner, not an HTML parser: a tag ends at the
/// first `>` (even one inside a quoted attribute value), entities are not
/// decoded, and a tag left unterminated at end of input is dropped.
fn strip_html(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut in_tag = false;
    let mut chars = text.chars().peekable();

    while let Some(c) = chars.next() {
        if in_tag {
            // Skip everything up to and including the closing '>'.
            if c == '>' {
                in_tag = false;
            }
            continue;
        }

        let opens_tag = c == '<'
            && matches!(
                chars.peek(),
                Some(&next) if next.is_ascii_alphabetic() || matches!(next, '/' | '!' | '?')
            );

        if opens_tag {
            in_tag = true;
        } else {
            result.push(c);
        }
    }

    result
}
171
/// Removes URL-like tokens from `text`.
///
/// A whitespace-separated token is dropped when it starts with `http://`,
/// `https://` or `www.` (case-sensitive). Within a line the surviving tokens
/// are re-joined with single spaces.
///
/// Line breaks are preserved: `process_text` treats `\n` as a sentence
/// delimiter, so collapsing them would merge newline-separated sentences.
fn strip_urls(text: &str) -> String {
    fn is_url(token: &str) -> bool {
        token.starts_with("http://") || token.starts_with("https://") || token.starts_with("www.")
    }

    text.split('\n')
        .map(|line| {
            line.split_whitespace()
                .filter(|token| !is_url(token))
                .collect::<Vec<&str>>()
                .join(" ")
        })
        .collect::<Vec<String>>()
        .join("\n")
}
178
179 fn process_word(&self, word: &str) -> String {
180 let mut result = word.to_string();
181
182 if self.expand_contractions {
184 result = Self::expand_contraction(&result);
185 }
186
187 if self.lowercase {
189 result = result.to_lowercase();
190 }
191
192 if self.remove_punctuation {
194 result = result
195 .chars()
196 .filter(|c| {
197 c.is_alphabetic() || c.is_numeric() || c.is_whitespace()
198 })
199 .collect::<String>()
200 .trim()
201 .to_string();
202 }
203
204 if self.remove_numbers {
206 result = result.chars()
207 .filter(|c| !c.is_ascii_digit())
208 .collect::<String>();
209 }
210
211 if result.is_empty() {
213 return String::new();
214 }
215
216 result
217 }
218
/// Expands an English contraction into its full form.
///
/// Matching is case-insensitive and treats the typographic apostrophe
/// (U+2019) like `'`. Lookup order:
/// 1. A table of whole words, including irregular forms ("can't" ->
///    "cannot", "won't" -> "will not").
/// 2. Generic suffixes: `n't` -> " not", `'re` -> " are", `'ve` -> " have",
///    `'ll` -> " will", `'d` -> " would", `'m` -> " am", appended to the
///    lowercased stem ("they're" -> "they are", "mustn't" -> "must not").
///
/// Expansions are always lowercase and may contain a space. A word that
/// matches nothing is returned unchanged, with its original case and
/// apostrophe. `'s` is expanded only for the listed words, because it is
/// otherwise ambiguous with the possessive.
fn expand_contraction(word: &str) -> String {
    // (suffix, replacement) pairs tried when no whole-word entry matches.
    const SUFFIX_RULES: &[(&str, &str)] = &[
        ("n't", " not"),
        ("'re", " are"),
        ("'ve", " have"),
        ("'ll", " will"),
        ("'d", " would"),
        ("'m", " am"),
    ];

    let lowered = word.to_lowercase().replace('\u{2019}', "'");

    let whole_word = match lowered.as_str() {
        "can't" => Some("cannot"),
        "won't" => Some("will not"),
        "i'm" => Some("i am"),
        "don't" => Some("do not"),
        "doesn't" => Some("does not"),
        "didn't" => Some("did not"),
        "isn't" => Some("is not"),
        "aren't" => Some("are not"),
        "wasn't" => Some("was not"),
        "weren't" => Some("were not"),
        "haven't" => Some("have not"),
        "hasn't" => Some("has not"),
        "hadn't" => Some("had not"),
        "wouldn't" => Some("would not"),
        "couldn't" => Some("could not"),
        "shouldn't" => Some("should not"),
        "let's" => Some("let us"),
        "that's" => Some("that is"),
        "who's" => Some("who is"),
        "what's" => Some("what is"),
        "here's" => Some("here is"),
        "there's" => Some("there is"),
        "where's" => Some("where is"),
        "it's" => Some("it is"),
        _ => None,
    };
    if let Some(expansion) = whole_word {
        return expansion.to_string();
    }

    // A bare suffix such as "n't" has an empty stem and yields " not".
    for &(suffix, replacement) in SUFFIX_RULES {
        if let Some(stem) = lowered.strip_suffix(suffix) {
            return format!("{}{}", stem, replacement);
        }
    }

    word.to_string()
}
254
255 pub fn detect_language(&self, text: &str) -> String {
257 let english_stop_words = ["the", "and", "a", "an", "in", "on", "at", "to", "for", "of", "with", "by", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "can", "could", "may", "might", "must", "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"];
261
262 let words_vec: Vec<&str> = text.split_whitespace().collect();
263 let words = &words_vec;
264 let mut english_count = 0;
265
266 for word in words {
267 let lower_word = word.to_lowercase();
268 if english_stop_words.contains(&lower_word.as_str()) {
269 english_count += 1;
270 }
271 }
272
273 if english_count > words.len() / 5 {
275 "en".to_string()
276 } else {
277 "unknown".to_string()
278 }
279 }
280}
281
282pub fn load_text_data(text: &str) -> Vec<Vec<String>> {
284 let processor = TextProcessor::default();
285 processor.process_text(text)
286}
287
288pub fn load_text_data_advanced(text: &str, processor: &TextProcessor) -> Vec<Vec<String>> {
290 processor.process_text(text)
291}