Skip to main content

embedding/
text.rs

1use std::collections::HashMap;
2use unicode_normalization::UnicodeNormalization;
3
/// Checks whether a character falls in one of the Unicode ranges this module
/// treats as CJK.
///
/// Besides Han ideographs, kana and Hangul, the set includes the
/// "CJK Symbols and Punctuation" and "Halfwidth and Fullwidth Forms" blocks,
/// so ideographic punctuation and fullwidth Latin letters also return `true`.
/// Supplementary-plane ideographs (Extensions B and later) are covered as well.
fn is_cjk_char(ch: char) -> bool {
    matches!(
        ch,
        '\u{4E00}'..='\u{9FFF}'       // CJK Unified Ideographs
            | '\u{3400}'..='\u{4DBF}' // CJK Unified Ideographs Extension A
            | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
            | '\u{3040}'..='\u{309F}' // Hiragana
            | '\u{30A0}'..='\u{30FF}' // Katakana
            | '\u{AC00}'..='\u{D7AF}' // Hangul Syllables
            | '\u{1100}'..='\u{11FF}' // Hangul Jamo
            | '\u{3000}'..='\u{303F}' // CJK Symbols and Punctuation
            | '\u{FF00}'..='\u{FFEF}' // Halfwidth and Fullwidth Forms
            // Ideographs beyond the BMP: Extensions B through H plus the
            // Compatibility Ideographs Supplement.
            | '\u{20000}'..='\u{323AF}'
    )
}
17
/// Builds a word→ID map and the matching ID→word table from tokenized sentences.
///
/// IDs are handed out in first-seen order starting at 0, so for every entry
/// `reverse_vocab[vocab[word]] == word`.
pub fn build_vocab(sentences: &[Vec<String>]) -> (HashMap<String, usize>, Vec<String>) {
    let mut ids: HashMap<String, usize> = HashMap::new();
    let mut words: Vec<String> = Vec::new();

    for token in sentences.iter().flatten() {
        if ids.contains_key(token) {
            continue;
        }
        // The next free ID always equals the number of words recorded so far.
        ids.insert(token.clone(), words.len());
        words.push(token.clone());
    }

    (ids, words)
}
36
/// Builds a word→ID map, the ID→word table, and per-ID occurrence counts
/// from tokenized sentences.
///
/// IDs are handed out in first-seen order starting at 0; the count for a word
/// lives at the index equal to its ID.
pub fn build_vocab_with_freq(sentences: &[Vec<String>]) -> (HashMap<String, usize>, Vec<String>, Vec<usize>) {
    let mut ids: HashMap<String, usize> = HashMap::new();
    let mut words: Vec<String> = Vec::new();
    let mut counts: Vec<usize> = Vec::new();

    for token in sentences.iter().flatten() {
        match ids.get(token).copied() {
            // Known word: bump its counter.
            Some(id) => counts[id] += 1,
            // New word: its ID is the current table length.
            None => {
                ids.insert(token.clone(), words.len());
                words.push(token.clone());
                counts.push(1);
            }
        }
    }

    (ids, words, counts)
}
59
/// Configurable text preprocessing pipeline.
///
/// Each flag switches one preprocessing step on or off. Two configurations
/// can be compared with `==`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextProcessor {
    /// Lowercase every token.
    pub lowercase: bool,
    /// Drop every character that is not alphabetic, numeric, or whitespace.
    pub remove_punctuation: bool,
    /// Remove ASCII digits from tokens.
    pub remove_numbers: bool,
    /// Stop-word filtering switch.
    // NOTE(review): not read by any code visible in this file — confirm
    // whether filtering is implemented elsewhere.
    pub remove_stop_words: bool,
    /// Strip `<...>` tag spans from the text before tokenization.
    pub remove_html: bool,
    /// Drop whitespace-delimited words that look like URLs before tokenization.
    pub remove_urls: bool,
    /// Rewrite known English contractions (e.g. "don't" → "do not").
    pub expand_contractions: bool,
    /// Unicode normalization switch.
    // NOTE(review): not read by any code visible in this file; `process_text`
    // applies NFC normalization regardless of this flag — confirm intent.
    pub normalize_unicode: bool,
    /// Language tag for the text; the default is `"en"`.
    // NOTE(review): not read by any code visible in this file.
    pub language: String,
}

impl Default for TextProcessor {
    /// Lowercasing and punctuation removal on, every other step off,
    /// language `"en"`.
    fn default() -> Self {
        Self {
            lowercase: true,
            remove_punctuation: true,
            remove_numbers: false,
            remove_stop_words: false,
            remove_html: false,
            remove_urls: false,
            expand_contractions: false,
            normalize_unicode: false,
            language: String::from("en"),
        }
    }
}
92
93impl TextProcessor {
94    /// Processes raw text into tokenized sentences according to the configured filters.
95    pub fn process_text(&self, text: &str) -> Vec<Vec<String>> {
96        let mut text = text.nfc().collect::<String>();
97
98        // Remove HTML tags
99        if self.remove_html {
100            text = Self::strip_html(&text);
101        }
102
103        // Remove URLs
104        if self.remove_urls {
105            text = Self::strip_urls(&text);
106        }
107
108        let mut sentences = Vec::new();
109        let has_cjk = text.chars().any(is_cjk_char);
110
111        let delimiters: &[char] = if has_cjk {
112            &['.', '!', '?', '\n', '\u{3002}', '\u{FF01}', '\u{FF1F}', ';']
113        } else {
114            &['.', '!', '?', '\n']
115        };
116
117        for sentence in text.split(delimiters) {
118            let trimmed = sentence.trim();
119            if trimmed.is_empty() {
120                continue;
121            }
122
123            let mut processed_words = Vec::new();
124
125            if has_cjk {
126                // Character-level tokenization for CJK
127                for ch in trimmed.chars() {
128                    if ch.is_whitespace() {
129                        continue;
130                    }
131                    let s = ch.to_string();
132                    let processed = self.process_word(&s);
133                    if !processed.is_empty() {
134                        processed_words.push(processed);
135                    }
136                }
137            } else {
138                // Whitespace tokenization for Western languages
139                for word in trimmed.split_whitespace() {
140                    let processed_word = self.process_word(word);
141                    if !processed_word.is_empty() {
142                        for subword in processed_word.split_whitespace() {
143                            processed_words.push(subword.to_string());
144                        }
145                    }
146                }
147            }
148
149            if !processed_words.is_empty() {
150                sentences.push(processed_words);
151            }
152        }
153
154        sentences
155    }
156
157    fn strip_html(text: &str) -> String {
158        let mut result = String::new();
159        let mut in_tag = false;
160        for c in text.chars() {
161            if c == '<' {
162                in_tag = true;
163            } else if c == '>' {
164                in_tag = false;
165            } else if !in_tag {
166                result.push(c);
167            }
168        }
169        result
170    }
171
172    fn strip_urls(text: &str) -> String {
173        text.split_whitespace()
174            .filter(|word| !(word.starts_with("http://") || word.starts_with("https://") || word.starts_with("www.")))
175            .collect::<Vec<&str>>()
176            .join(" ")
177    }
178
179    fn process_word(&self, word: &str) -> String {
180        let mut result = word.to_string();
181
182        // Expand contractions
183        if self.expand_contractions {
184            result = Self::expand_contraction(&result);
185        }
186
187        // Convert to lowercase
188        if self.lowercase {
189            result = result.to_lowercase();
190        }
191
192        // Remove punctuation: keep all Unicode letters and marks
193        if self.remove_punctuation {
194            result = result
195                .chars()
196                .filter(|c| {
197                    c.is_alphabetic() || c.is_numeric() || c.is_whitespace()
198                })
199                .collect::<String>()
200                .trim()
201                .to_string();
202        }
203
204        // Remove numbers
205        if self.remove_numbers {
206            result = result.chars()
207                .filter(|c| !c.is_ascii_digit())
208                .collect::<String>();
209        }
210
211        // Remove empty strings
212        if result.is_empty() {
213            return String::new();
214        }
215
216        result
217    }
218
219    fn expand_contraction(word: &str) -> String {
220        match word.to_lowercase().as_str() {
221            "can't" => "cannot".to_string(),
222            "won't" => "will not".to_string(),
223            "n't" => " not".to_string(),
224            "'re" => " are".to_string(),
225            "'ve" => " have".to_string(),
226            "'ll" => " will".to_string(),
227            "'d" => " would".to_string(),
228            "'m" => " am".to_string(),
229            "i'm" => "i am".to_string(),
230            "don't" => "do not".to_string(),
231            "doesn't" => "does not".to_string(),
232            "didn't" => "did not".to_string(),
233            "isn't" => "is not".to_string(),
234            "aren't" => "are not".to_string(),
235            "wasn't" => "was not".to_string(),
236            "weren't" => "were not".to_string(),
237            "haven't" => "have not".to_string(),
238            "hasn't" => "has not".to_string(),
239            "hadn't" => "had not".to_string(),
240            "wouldn't" => "would not".to_string(),
241            "couldn't" => "could not".to_string(),
242            "shouldn't" => "should not".to_string(),
243            "let's" => "let us".to_string(),
244            "that's" => "that is".to_string(),
245            "who's" => "who is".to_string(),
246            "what's" => "what is".to_string(),
247            "here's" => "here is".to_string(),
248            "there's" => "there is".to_string(),
249            "where's" => "where is".to_string(),
250            "it's" => "it is".to_string(),
251            _ => word.to_string(),
252        }
253    }
254    
255    /// Simple heuristic-based language detection.
256    pub fn detect_language(&self, text: &str) -> String {
257        // Simple heuristic for language detection
258        // This is a very basic implementation - in practice, you'd use more sophisticated methods
259        
260        let english_stop_words = ["the", "and", "a", "an", "in", "on", "at", "to", "for", "of", "with", "by", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "do", "does", "did", "will", "would", "shall", "should", "can", "could", "may", "might", "must", "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them"];
261        
262        let words_vec: Vec<&str> = text.split_whitespace().collect();
263        let words = &words_vec;
264        let mut english_count = 0;
265        
266        for word in words {
267            let lower_word = word.to_lowercase();
268            if english_stop_words.contains(&lower_word.as_str()) {
269                english_count += 1;
270            }
271        }
272        
273        // If more than 20% of words are common English stop words, assume English
274        if english_count > words.len() / 5 {
275            "en".to_string()
276        } else {
277            "unknown".to_string()
278        }
279    }
280}
281
282/// Tokenizes text using the default [`TextProcessor`] settings.
283pub fn load_text_data(text: &str) -> Vec<Vec<String>> {
284    let processor = TextProcessor::default();
285    processor.process_text(text)
286}
287
288/// Tokenizes text using a custom [`TextProcessor`].
289pub fn load_text_data_advanced(text: &str, processor: &TextProcessor) -> Vec<Vec<String>> {
290    processor.process_text(text)
291}