1use serde::{Deserialize, Serialize};
23use std::collections::HashMap;
24
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
27pub enum Language {
28 English,
30 Spanish,
32 French,
34 German,
36 Chinese,
38 Japanese,
40 Korean,
42 Arabic,
44 Russian,
46 Portuguese,
48 Unknown,
50}
51
52impl Language {
53 pub fn code(&self) -> &str {
55 match self {
56 Language::English => "en",
57 Language::Spanish => "es",
58 Language::French => "fr",
59 Language::German => "de",
60 Language::Chinese => "zh",
61 Language::Japanese => "ja",
62 Language::Korean => "ko",
63 Language::Arabic => "ar",
64 Language::Russian => "ru",
65 Language::Portuguese => "pt",
66 Language::Unknown => "unknown",
67 }
68 }
69
70 pub fn from_code(code: &str) -> Self {
72 match code.to_lowercase().as_str() {
73 "en" => Language::English,
74 "es" => Language::Spanish,
75 "fr" => Language::French,
76 "de" => Language::German,
77 "zh" => Language::Chinese,
78 "ja" => Language::Japanese,
79 "ko" => Language::Korean,
80 "ar" => Language::Arabic,
81 "ru" => Language::Russian,
82 "pt" => Language::Portuguese,
83 _ => Language::Unknown,
84 }
85 }
86
87 pub fn name(&self) -> &str {
89 match self {
90 Language::English => "English",
91 Language::Spanish => "Spanish",
92 Language::French => "French",
93 Language::German => "German",
94 Language::Chinese => "Chinese",
95 Language::Japanese => "Japanese",
96 Language::Korean => "Korean",
97 Language::Arabic => "Arabic",
98 Language::Russian => "Russian",
99 Language::Portuguese => "Portuguese",
100 Language::Unknown => "Unknown",
101 }
102 }
103
104 pub fn is_cjk(&self) -> bool {
106 matches!(self, Language::Chinese | Language::Japanese | Language::Korean)
107 }
108
109 pub fn is_rtl(&self) -> bool {
111 matches!(self, Language::Arabic)
112 }
113}
114
115#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct DetectionResult {
118 pub language: Language,
120 pub confidence: f32,
122 pub alternatives: Vec<(Language, f32)>,
124}
125
126pub struct LanguageDetector {
128 models: HashMap<Language, LanguageModel>,
130}
131
/// Character n-gram frequency table for a single language.
struct LanguageModel {
    /// Number of times each (lower-cased) n-gram was seen during training.
    ngrams: HashMap<String, f32>,
    /// Total number of n-gram occurrences recorded, i.e. the sum of all counts.
    total: f32,
}

impl LanguageModel {
    /// Upper bound on the probability given to an n-gram that never occurred
    /// in training.
    const UNSEEN_PROBABILITY: f32 = 1e-4;

    /// Creates an empty, untrained model.
    fn new() -> Self {
        Self {
            ngrams: HashMap::new(),
            total: 0.0,
        }
    }

    /// Lower-cases `text` and splits it into characters, so that training and
    /// scoring treat "The" and "the" as the same n-gram.
    fn normalized_chars(text: &str) -> Vec<char> {
        text.to_lowercase().chars().collect()
    }

    /// Adds every character n-gram of width `n` found in `text` to the model.
    ///
    /// Text shorter than `n` characters contributes nothing, and `n == 0` is
    /// ignored (a zero-width window has no meaning and `windows(0)` panics).
    fn train(&mut self, text: &str, n: usize) {
        if n == 0 {
            return;
        }

        let chars = Self::normalized_chars(text);
        for window in chars.windows(n) {
            let ngram: String = window.iter().collect();
            *self.ngrams.entry(ngram).or_insert(0.0) += 1.0;
            self.total += 1.0;
        }
    }

    /// Returns the average natural-log probability of the n-grams of `text`
    /// under this model; higher (closer to zero) means a better fit.
    ///
    /// Returns `0.0` when `text` has no n-gram of width `n` (too short, or
    /// `n == 0`).
    fn score(&self, text: &str, n: usize) -> f32 {
        if n == 0 {
            return 0.0;
        }

        // Penalty for n-grams the model has never seen. It is capped at half
        // the probability of an n-gram seen once, so a match always beats a
        // miss and a model is not rewarded merely for having been trained on
        // less text. For an untrained model `0.5 / 0.0` is +inf and the
        // constant applies.
        let unseen_ln = Self::UNSEEN_PROBABILITY.min(0.5 / self.total).ln();

        let chars = Self::normalized_chars(text);
        let mut log_sum = 0.0_f32;
        let mut count = 0_usize;

        for window in chars.windows(n) {
            let ngram: String = window.iter().collect();
            log_sum += match self.ngrams.get(&ngram) {
                Some(&freq) => (freq / self.total).ln(),
                None => unseen_ln,
            };
            count += 1;
        }

        if count > 0 {
            log_sum / count as f32
        } else {
            0.0
        }
    }
}
182
183impl LanguageDetector {
184 pub fn new() -> Self {
186 let mut detector = Self {
187 models: HashMap::new(),
188 };
189
190 detector.initialize_models();
192 detector
193 }
194
195 fn initialize_models(&mut self) {
197 let mut english_model = LanguageModel::new();
199 english_model.train("the quick brown fox jumps over the lazy dog", 3);
200 english_model.train("this is a test of the english language", 3);
201 self.models.insert(Language::English, english_model);
202
203 let mut spanish_model = LanguageModel::new();
205 spanish_model.train("el rápido zorro marrón salta sobre el perro perezoso", 3);
206 spanish_model.train("esta es una prueba del idioma español", 3);
207 self.models.insert(Language::Spanish, spanish_model);
208
209 let mut french_model = LanguageModel::new();
211 french_model.train("le renard brun rapide saute par-dessus le chien paresseux", 3);
212 french_model.train("ceci est un test de la langue française", 3);
213 self.models.insert(Language::French, french_model);
214
215 let mut german_model = LanguageModel::new();
217 german_model.train("der schnelle braune fuchs springt über den faulen hund", 3);
218 german_model.train("dies ist ein test der deutschen sprache", 3);
219 self.models.insert(Language::German, german_model);
220
221 let mut portuguese_model = LanguageModel::new();
223 portuguese_model.train("a rápida raposa marrom pula sobre o cão preguiçoso", 3);
224 portuguese_model.train("este é um teste da língua portuguesa", 3);
225 self.models.insert(Language::Portuguese, portuguese_model);
226
227 }
230
231 pub fn detect(&self, text: &str) -> DetectionResult {
233 if text.trim().is_empty() {
234 return DetectionResult {
235 language: Language::Unknown,
236 confidence: 0.0,
237 alternatives: Vec::new(),
238 };
239 }
240
241 if self.is_likely_chinese(text) {
243 return DetectionResult {
244 language: Language::Chinese,
245 confidence: 0.9,
246 alternatives: vec![
247 (Language::Japanese, 0.1),
248 ],
249 };
250 }
251
252 if self.is_likely_japanese(text) {
253 return DetectionResult {
254 language: Language::Japanese,
255 confidence: 0.9,
256 alternatives: vec![
257 (Language::Chinese, 0.1),
258 ],
259 };
260 }
261
262 if self.is_likely_korean(text) {
263 return DetectionResult {
264 language: Language::Korean,
265 confidence: 0.95,
266 alternatives: Vec::new(),
267 };
268 }
269
270 if self.is_likely_arabic(text) {
271 return DetectionResult {
272 language: Language::Arabic,
273 confidence: 0.95,
274 alternatives: Vec::new(),
275 };
276 }
277
278 if self.is_likely_russian(text) {
279 return DetectionResult {
280 language: Language::Russian,
281 confidence: 0.9,
282 alternatives: Vec::new(),
283 };
284 }
285
286 let mut scores: Vec<(Language, f32)> = self
288 .models
289 .iter()
290 .map(|(lang, model)| (*lang, model.score(text, 3)))
291 .collect();
292
293 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
294
295 if scores.is_empty() {
296 return DetectionResult {
297 language: Language::Unknown,
298 confidence: 0.0,
299 alternatives: Vec::new(),
300 };
301 }
302
303 let max_score = scores[0].1;
305 let min_score = scores.last().unwrap().1;
306 let range = max_score - min_score;
307
308 let confidence = if range > 0.0 {
309 ((max_score - min_score) / range).clamp(0.0, 1.0)
310 } else {
311 0.5
312 };
313
314 DetectionResult {
315 language: scores[0].0,
316 confidence,
317 alternatives: scores.into_iter().skip(1).take(3).collect(),
318 }
319 }
320
321 fn is_likely_chinese(&self, text: &str) -> bool {
323 let chinese_chars = text.chars().filter(|c| {
324 let code = *c as u32;
325 (0x4E00..=0x9FFF).contains(&code) }).count();
327
328 chinese_chars as f32 / text.chars().count() as f32 > 0.3
329 }
330
331 fn is_likely_japanese(&self, text: &str) -> bool {
333 let japanese_chars = text.chars().filter(|c| {
334 let code = *c as u32;
335 (0x3040..=0x309F).contains(&code) || (0x30A0..=0x30FF).contains(&code) }).count();
338
339 japanese_chars > 0
340 }
341
342 fn is_likely_korean(&self, text: &str) -> bool {
344 let korean_chars = text.chars().filter(|c| {
345 let code = *c as u32;
346 (0xAC00..=0xD7AF).contains(&code) }).count();
348
349 korean_chars as f32 / text.chars().count() as f32 > 0.3
350 }
351
352 fn is_likely_arabic(&self, text: &str) -> bool {
354 let arabic_chars = text.chars().filter(|c| {
355 let code = *c as u32;
356 (0x0600..=0x06FF).contains(&code) }).count();
358
359 arabic_chars as f32 / text.chars().count() as f32 > 0.3
360 }
361
362 fn is_likely_russian(&self, text: &str) -> bool {
364 let cyrillic_chars = text.chars().filter(|c| {
365 let code = *c as u32;
366 (0x0400..=0x04FF).contains(&code) }).count();
368
369 cyrillic_chars as f32 / text.chars().count() as f32 > 0.3
370 }
371}
372
373impl Default for LanguageDetector {
374 fn default() -> Self {
375 Self::new()
376 }
377}
378
379pub struct MultilingualProcessor {
381 detector: LanguageDetector,
382}
383
384impl MultilingualProcessor {
385 pub fn new() -> Self {
387 Self {
388 detector: LanguageDetector::new(),
389 }
390 }
391
392 pub fn process(&self, text: &str) -> ProcessedText {
394 let detection = self.detector.detect(text);
395 let normalized = self.normalize_text(text, detection.language);
396 let tokens = self.tokenize(&normalized, detection.language);
397
398 ProcessedText {
399 original: text.to_string(),
400 normalized,
401 tokens,
402 language: detection.language,
403 confidence: detection.confidence,
404 }
405 }
406
407 fn normalize_text(&self, text: &str, language: Language) -> String {
409 let mut normalized = text.to_string();
410
411 normalized = normalized.split_whitespace().collect::<Vec<_>>().join(" ");
413
414 match language {
416 Language::Arabic => {
417 normalized = normalized.chars()
419 .filter(|c| {
420 let code = *c as u32;
421 !(0x064B..=0x0652).contains(&code) })
423 .collect();
424 }
425 Language::Chinese | Language::Japanese => {
426 normalized = normalized.chars()
428 .map(|c| {
429 let code = c as u32;
430 if (0xFF01..=0xFF5E).contains(&code) {
431 char::from_u32(code - 0xFEE0).unwrap_or(c)
432 } else {
433 c
434 }
435 })
436 .collect();
437 }
438 _ => {}
439 }
440
441 normalized
442 }
443
444 fn tokenize(&self, text: &str, language: Language) -> Vec<String> {
446 match language {
447 Language::Chinese | Language::Japanese => {
448 text.chars()
451 .filter(|c| !c.is_whitespace())
452 .map(|c| c.to_string())
453 .collect()
454 }
455 _ => {
456 text.split_whitespace()
458 .map(|s| s.to_string())
459 .collect()
460 }
461 }
462 }
463}
464
465impl Default for MultilingualProcessor {
466 fn default() -> Self {
467 Self::new()
468 }
469}
470
471#[derive(Debug, Clone)]
473pub struct ProcessedText {
474 pub original: String,
476 pub normalized: String,
478 pub tokens: Vec<String>,
480 pub language: Language,
482 pub confidence: f32,
484}
485
#[cfg(test)]
mod tests {
    use super::*;

    // Codes map to languages and back; unrecognised codes fall back to Unknown.
    #[test]
    fn test_language_codes() {
        assert_eq!(Language::English.code(), "en");
        assert_eq!(Language::Spanish.code(), "es");
        assert_eq!(Language::from_code("fr"), Language::French);
        assert_eq!(Language::from_code("FR"), Language::French);
        assert_eq!(Language::from_code("unknown"), Language::Unknown);
    }

    #[test]
    fn test_cjk_detection() {
        assert!(Language::Chinese.is_cjk());
        assert!(Language::Japanese.is_cjk());
        assert!(Language::Korean.is_cjk());
        assert!(!Language::English.is_cjk());
    }

    #[test]
    fn test_rtl_detection() {
        assert!(Language::Arabic.is_rtl());
        assert!(!Language::English.is_rtl());
    }

    // Latin-script languages are told apart by the n-gram models.
    #[test]
    fn test_language_detection() {
        let detector = LanguageDetector::new();

        let result = detector.detect("This is English text");
        assert_eq!(result.language, Language::English);
        assert!(result.confidence > 0.0);

        let result = detector.detect("Esto es texto en español");
        assert_eq!(result.language, Language::Spanish);

        let result = detector.detect("Ceci est du texte français");
        assert_eq!(result.language, Language::French);
    }

    #[test]
    fn test_chinese_detection() {
        let detector = LanguageDetector::new();
        let result = detector.detect("这是中文文本");
        assert_eq!(result.language, Language::Chinese);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_japanese_detection() {
        let detector = LanguageDetector::new();
        let result = detector.detect("これは日本語のテキストです");
        assert_eq!(result.language, Language::Japanese);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_korean_detection() {
        let detector = LanguageDetector::new();
        let result = detector.detect("이것은 한국어 텍스트입니다");
        assert_eq!(result.language, Language::Korean);
        assert!(result.confidence > 0.8);
    }

    #[test]
    fn test_multilingual_processing() {
        let processor = MultilingualProcessor::new();

        let result = processor.process("This is a test");
        assert_eq!(result.language, Language::English);
        assert!(!result.tokens.is_empty());

        let result = processor.process("Esto es una prueba");
        assert_eq!(result.language, Language::Spanish);
    }

    // The input must actually contain runs of whitespace, otherwise the
    // assertion below holds even if normalization does nothing.
    #[test]
    fn test_text_normalization() {
        let processor = MultilingualProcessor::new();
        let result = processor.process("  This   has  extra   spaces ");
        assert_eq!(result.normalized, "This has extra spaces");
        assert_eq!(result.tokens, vec!["This", "has", "extra", "spaces"]);
    }
}