1use crate::error::{Result, TextError};
16use std::collections::HashMap;
17
/// Languages that the detector can report.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DetectedLanguage {
    /// English (`en`).
    En,
    /// Spanish (`es`).
    Es,
    /// French (`fr`).
    Fr,
    /// German (`de`).
    De,
    /// Italian (`it`).
    It,
    /// Portuguese (`pt`).
    Pt,
    /// Dutch (`nl`).
    Nl,
    /// Russian (`ru`).
    Ru,
    /// Chinese (`zh`).
    Zh,
    /// Japanese (`ja`).
    Ja,
    /// Korean (`ko`).
    Ko,
    /// Arabic (`ar`).
    Ar,
    /// Hindi (`hi`).
    Hi,
    /// Turkish (`tr`).
    Tr,
    /// Swedish (`sv`).
    Sv,
    /// Polish (`pl`).
    Pl,
    /// No language could be determined (`und`).
    Unknown,
}

impl DetectedLanguage {
    /// Returns the lowercase language code, e.g. `"en"`; `Unknown` maps to `"und"`.
    pub fn iso_code(&self) -> &'static str {
        self.code_and_name().0
    }

    /// Returns the English display name, e.g. `"English"`; `Unknown` maps to `"Unknown"`.
    pub fn name(&self) -> &'static str {
        self.code_and_name().1
    }

    /// Single lookup table backing both `iso_code` and `name`, so the two can never drift apart.
    fn code_and_name(&self) -> (&'static str, &'static str) {
        match *self {
            DetectedLanguage::En => ("en", "English"),
            DetectedLanguage::Es => ("es", "Spanish"),
            DetectedLanguage::Fr => ("fr", "French"),
            DetectedLanguage::De => ("de", "German"),
            DetectedLanguage::It => ("it", "Italian"),
            DetectedLanguage::Pt => ("pt", "Portuguese"),
            DetectedLanguage::Nl => ("nl", "Dutch"),
            DetectedLanguage::Ru => ("ru", "Russian"),
            DetectedLanguage::Zh => ("zh", "Chinese"),
            DetectedLanguage::Ja => ("ja", "Japanese"),
            DetectedLanguage::Ko => ("ko", "Korean"),
            DetectedLanguage::Ar => ("ar", "Arabic"),
            DetectedLanguage::Hi => ("hi", "Hindi"),
            DetectedLanguage::Tr => ("tr", "Turkish"),
            DetectedLanguage::Sv => ("sv", "Swedish"),
            DetectedLanguage::Pl => ("pl", "Polish"),
            DetectedLanguage::Unknown => ("und", "Unknown"),
        }
    }
}
108
/// Outcome of a language detection run.
#[derive(Debug, Clone)]
pub struct LanguageDetectionOutput {
    /// Best-scoring language, or `DetectedLanguage::Unknown` when nothing matched.
    pub language: DetectedLanguage,
    /// Score of `language`; every detector in this file clamps it to `0.0..=1.0`.
    pub confidence: f64,
    /// Remaining candidates with their scores, ordered from highest to lowest score.
    pub alternatives: Vec<(DetectedLanguage, f64)>,
}
119
/// Selects which detection method `detect_language_with_strategy` runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DetectionStrategy {
    /// Compare character trigram frequencies against built-in reference profiles.
    Ngram,
    /// Measure how many words of the text appear in built-in common-word lists.
    WordFrequency,
    /// Classify characters by Unicode script; mostly-Latin text falls back to the n-gram method.
    UnicodeScript,
    /// Try the script method first and, if it is not confident, blend n-gram and
    /// word-frequency scores.
    Combined,
}
132
133pub fn detect_language(text: &str) -> Result<LanguageDetectionOutput> {
148 detect_language_with_strategy(text, DetectionStrategy::Combined)
149}
150
151pub fn detect_language_with_strategy(
153 text: &str,
154 strategy: DetectionStrategy,
155) -> Result<LanguageDetectionOutput> {
156 let trimmed = text.trim();
157 if trimmed.is_empty() {
158 return Err(TextError::InvalidInput(
159 "Cannot detect language of empty text".to_string(),
160 ));
161 }
162
163 match strategy {
164 DetectionStrategy::Ngram => detect_by_ngram(trimmed),
165 DetectionStrategy::WordFrequency => detect_by_word_frequency(trimmed),
166 DetectionStrategy::UnicodeScript => detect_by_unicode_script(trimmed),
167 DetectionStrategy::Combined => detect_combined(trimmed),
168 }
169}
170
171fn detect_by_ngram(text: &str) -> Result<LanguageDetectionOutput> {
176 let text_profile = build_ngram_profile(text, 3);
177 if text_profile.is_empty() {
178 return Ok(unknown_result());
179 }
180
181 let reference_profiles = reference_ngram_profiles();
182 let mut scores: Vec<(DetectedLanguage, f64)> = Vec::new();
183
184 for (lang, ref_profile) in &reference_profiles {
185 let similarity = profile_similarity(&text_profile, ref_profile);
186 scores.push((*lang, similarity));
187 }
188
189 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
190
191 if scores.is_empty() {
192 return Ok(unknown_result());
193 }
194
195 let best = scores[0];
196 let confidence = best.1.clamp(0.0, 1.0);
197
198 Ok(LanguageDetectionOutput {
199 language: best.0,
200 confidence,
201 alternatives: scores.into_iter().skip(1).collect(),
202 })
203}
204
/// Builds a relative-frequency profile of the character `n`-grams in `text`.
///
/// The text is lowercased first. N-grams are taken over every character, including
/// spaces and punctuation. Each value is the n-gram's count divided by the number of
/// n-gram windows, so the values sum to 1.
///
/// Returns an empty map when `n` is zero or the text has fewer than `n` characters.
fn build_ngram_profile(text: &str, n: usize) -> HashMap<String, f64> {
    let chars: Vec<char> = text.to_lowercase().chars().collect();
    let mut profile: HashMap<String, f64> = HashMap::new();

    // `slice::windows` panics on a zero window size, so reject it together with
    // texts that are too short to contain a single n-gram.
    if n == 0 || chars.len() < n {
        return profile;
    }

    for window in chars.windows(n) {
        let gram = window.iter().collect::<String>();
        *profile.entry(gram).or_insert(0.0) += 1.0;
    }

    // One n-gram per window, so the window count is the normalisation total.
    let total = (chars.len() - n + 1) as f64;
    for share in profile.values_mut() {
        *share /= total;
    }

    profile
}
230
/// Computes the cosine similarity between two n-gram profiles.
///
/// Grams missing from either profile contribute nothing to the dot product.
/// Returns `0.0` when either profile has zero magnitude (for example, is empty).
fn profile_similarity(a: &HashMap<String, f64>, b: &HashMap<String, f64>) -> f64 {
    let squared_norm = |profile: &HashMap<String, f64>| -> f64 {
        profile.values().fold(0.0_f64, |acc, v| acc + v * v)
    };

    // Only grams present in both profiles add to the dot product.
    let dot = a.iter().fold(0.0_f64, |acc, (gram, va)| match b.get(gram) {
        Some(vb) => acc + va * vb,
        None => acc,
    });

    let denom = squared_norm(a).sqrt() * squared_norm(b).sqrt();
    if denom != 0.0 {
        dot / denom
    } else {
        0.0
    }
}
254
255fn reference_ngram_profiles() -> HashMap<DetectedLanguage, HashMap<String, f64>> {
257 let mut profiles = HashMap::new();
258
259 profiles.insert(
261 DetectedLanguage::En,
262 build_ref_profile(&[
263 ("the", 50.0),
264 ("and", 30.0),
265 ("ing", 25.0),
266 ("tion", 20.0),
267 ("her", 18.0),
268 ("ent", 17.0),
269 ("ion", 16.0),
270 ("tio", 16.0),
271 ("for", 15.0),
272 ("ate", 14.0),
273 ("hat", 13.0),
274 ("tha", 13.0),
275 ("ere", 12.0),
276 ("his", 12.0),
277 ("hin", 11.0),
278 ("ter", 11.0),
279 ("was", 10.0),
280 ("all", 10.0),
281 ("ith", 9.0),
282 ("ver", 9.0),
283 ]),
284 );
285
286 profiles.insert(
288 DetectedLanguage::Es,
289 build_ref_profile(&[
290 ("que", 45.0),
291 ("ent", 30.0),
292 ("los", 28.0),
293 ("ion", 25.0),
294 ("aci", 22.0),
295 ("cion", 20.0),
296 ("del", 19.0),
297 ("las", 18.0),
298 ("con", 17.0),
299 ("est", 16.0),
300 ("por", 15.0),
301 ("nte", 14.0),
302 ("ado", 13.0),
303 ("una", 13.0),
304 ("tra", 12.0),
305 ("par", 11.0),
306 ("com", 10.0),
307 ("ero", 10.0),
308 ("ien", 9.0),
309 ("sta", 9.0),
310 ]),
311 );
312
313 profiles.insert(
315 DetectedLanguage::Fr,
316 build_ref_profile(&[
317 ("les", 45.0),
318 ("ent", 35.0),
319 ("que", 30.0),
320 ("des", 28.0),
321 ("ion", 25.0),
322 ("ait", 22.0),
323 ("ous", 20.0),
324 ("est", 18.0),
325 ("une", 17.0),
326 ("ant", 16.0),
327 ("par", 15.0),
328 ("eur", 14.0),
329 ("sur", 13.0),
330 ("tre", 12.0),
331 ("eme", 11.0),
332 ("dan", 10.0),
333 ("pas", 10.0),
334 ("tio", 9.0),
335 ("pou", 9.0),
336 ("ais", 8.0),
337 ]),
338 );
339
340 profiles.insert(
342 DetectedLanguage::De,
343 build_ref_profile(&[
344 ("ein", 45.0),
345 ("ich", 40.0),
346 ("der", 35.0),
347 ("die", 33.0),
348 ("und", 30.0),
349 ("den", 25.0),
350 ("sch", 23.0),
351 ("cht", 20.0),
352 ("ung", 18.0),
353 ("gen", 17.0),
354 ("ber", 16.0),
355 ("ver", 15.0),
356 ("auf", 14.0),
357 ("eit", 13.0),
358 ("ach", 12.0),
359 ("mit", 11.0),
360 ("aus", 10.0),
361 ("ine", 10.0),
362 ("das", 9.0),
363 ("ent", 8.0),
364 ]),
365 );
366
367 profiles.insert(
369 DetectedLanguage::It,
370 build_ref_profile(&[
371 ("che", 45.0),
372 ("ell", 30.0),
373 ("per", 28.0),
374 ("del", 25.0),
375 ("ato", 22.0),
376 ("ion", 20.0),
377 ("ent", 18.0),
378 ("con", 17.0),
379 ("lla", 16.0),
380 ("azi", 15.0),
381 ("tta", 14.0),
382 ("gli", 13.0),
383 ("sta", 12.0),
384 ("nte", 11.0),
385 ("one", 10.0),
386 ("ere", 10.0),
387 ("tto", 9.0),
388 ("ato", 9.0),
389 ("ment", 8.0),
390 ("pre", 8.0),
391 ]),
392 );
393
394 profiles.insert(
396 DetectedLanguage::Pt,
397 build_ref_profile(&[
398 ("que", 45.0),
399 ("ent", 30.0),
400 ("nte", 25.0),
401 ("ado", 22.0),
402 ("ica", 20.0),
403 ("est", 18.0),
404 ("dos", 17.0),
405 ("con", 16.0),
406 ("par", 15.0),
407 ("men", 14.0),
408 ("com", 13.0),
409 ("aco", 12.0),
410 ("tra", 11.0),
411 ("ida", 10.0),
412 ("pro", 10.0),
413 ("uma", 9.0),
414 ("mos", 9.0),
415 ("oes", 8.0),
416 ("ter", 8.0),
417 ("ais", 7.0),
418 ]),
419 );
420
421 profiles.insert(
423 DetectedLanguage::Nl,
424 build_ref_profile(&[
425 ("een", 45.0),
426 ("van", 40.0),
427 ("het", 35.0),
428 ("aar", 28.0),
429 ("ing", 25.0),
430 ("oor", 22.0),
431 ("ver", 20.0),
432 ("den", 18.0),
433 ("ijk", 16.0),
434 ("ond", 15.0),
435 ("ent", 14.0),
436 ("erd", 13.0),
437 ("sch", 12.0),
438 ("ter", 11.0),
439 ("and", 10.0),
440 ("ede", 10.0),
441 ("aat", 9.0),
442 ("met", 9.0),
443 ("nde", 8.0),
444 ("dat", 8.0),
445 ]),
446 );
447
448 profiles.insert(
450 DetectedLanguage::Tr,
451 build_ref_profile(&[
452 ("lar", 45.0),
453 ("bir", 40.0),
454 ("ler", 35.0),
455 ("eri", 30.0),
456 ("ara", 25.0),
457 ("ini", 22.0),
458 ("rin", 20.0),
459 ("yor", 18.0),
460 ("ile", 16.0),
461 ("dir", 15.0),
462 ("dan", 14.0),
463 ("rak", 13.0),
464 ("len", 12.0),
465 ("ası", 11.0),
466 ("lik", 10.0),
467 ("olu", 10.0),
468 ("ind", 9.0),
469 ("yan", 9.0),
470 ("ama", 8.0),
471 ("aki", 8.0),
472 ]),
473 );
474
475 profiles
476}
477
/// Turns `(gram, weight)` pairs into a profile whose weights sum to 1.
///
/// A gram that appears more than once has its weights added together, so the result
/// stays normalised. Returns an empty profile when the weights do not add up to a
/// positive number, since dividing by such a total would produce NaN or negative shares.
fn build_ref_profile(data: &[(&str, f64)]) -> HashMap<String, f64> {
    let total: f64 = data.iter().map(|(_, freq)| freq).sum();
    let mut profile = HashMap::with_capacity(data.len());

    if total <= 0.0 || total.is_nan() {
        return profile;
    }

    for (gram, freq) in data {
        // Accumulate rather than overwrite: every listed weight is already part of `total`.
        *profile.entry(gram.to_string()).or_insert(0.0) += freq / total;
    }
    profile
}
486
487fn detect_by_word_frequency(text: &str) -> Result<LanguageDetectionOutput> {
492 let lower = text.to_lowercase();
493 let words: Vec<&str> = lower.split_whitespace().collect();
494 if words.is_empty() {
495 return Ok(unknown_result());
496 }
497
498 let word_lists = common_word_lists();
499 let mut scores: Vec<(DetectedLanguage, f64)> = Vec::new();
500
501 for (lang, common_words) in &word_lists {
502 let matches = words.iter().filter(|w| common_words.contains(*w)).count();
503 let ratio = matches as f64 / words.len() as f64;
504 scores.push((*lang, ratio));
505 }
506
507 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
508
509 if scores.is_empty() || scores[0].1 < 0.01 {
510 return Ok(unknown_result());
511 }
512
513 let best = scores[0];
514 let confidence = (best.1 * 2.5).clamp(0.0, 1.0);
515
516 Ok(LanguageDetectionOutput {
517 language: best.0,
518 confidence,
519 alternatives: scores.into_iter().skip(1).collect(),
520 })
521}
522
523fn common_word_lists() -> HashMap<DetectedLanguage, Vec<&'static str>> {
524 let mut lists = HashMap::new();
525
526 lists.insert(
527 DetectedLanguage::En,
528 vec![
529 "the", "be", "to", "of", "and", "a", "in", "that", "have", "i", "it", "for", "not",
530 "on", "with", "he", "as", "you", "do", "at", "this", "but", "his", "by", "from",
531 "they", "we", "say", "her", "she", "or", "an", "will", "my", "one", "all", "would",
532 "there", "their", "what", "so", "up", "out", "if", "about", "who", "get", "which",
533 "go", "me", "when",
534 ],
535 );
536
537 lists.insert(
538 DetectedLanguage::Es,
539 vec![
540 "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", "por", "un",
541 "para", "con", "no", "una", "su", "al", "es", "lo", "como", "pero", "sus", "le", "ya",
542 "o", "este", "ha", "si", "porque", "esta", "entre", "cuando", "muy", "sin", "sobre",
543 "ser", "tambien", "me", "hasta", "hay", "donde", "quien",
544 ],
545 );
546
547 lists.insert(
548 DetectedLanguage::Fr,
549 vec![
550 "de", "la", "le", "et", "les", "des", "en", "un", "du", "une", "que", "est", "dans",
551 "qui", "par", "pour", "au", "il", "sur", "pas", "plus", "ce", "ne", "se", "avec",
552 "mais", "on", "son", "tout", "je", "nous", "vous", "elle", "ou", "bien", "ces", "sont",
553 "sans", "comme", "peut", "fait", "aux", "entre", "deux",
554 ],
555 );
556
557 lists.insert(
558 DetectedLanguage::De,
559 vec![
560 "der", "die", "und", "in", "den", "von", "zu", "das", "mit", "sich", "des", "auf",
561 "nicht", "ein", "ist", "dem", "eine", "auch", "es", "an", "als", "nach", "wie", "aber",
562 "vor", "hat", "nur", "oder", "ich", "bei", "noch", "unter", "bis", "kann", "wird",
563 "so", "wenn", "sie", "sehr", "wir", "uber", "schon", "dann",
564 ],
565 );
566
567 lists.insert(
568 DetectedLanguage::It,
569 vec![
570 "di", "che", "il", "la", "in", "un", "per", "del", "non", "una", "con", "sono", "gli",
571 "le", "si", "da", "al", "lo", "ha", "come", "ma", "anche", "io", "suo", "dei", "nel",
572 "alla", "piu", "questo", "era", "essere", "tutto", "fra", "stato", "ancora", "dove",
573 "hanno", "ogni", "alle", "nella",
574 ],
575 );
576
577 lists.insert(
578 DetectedLanguage::Pt,
579 vec![
580 "de", "que", "o", "a", "do", "da", "em", "para", "com", "um", "uma", "os", "no", "se",
581 "na", "por", "mais", "as", "dos", "como", "mas", "ao", "ele", "das", "seu", "sua",
582 "ou", "quando", "muito", "nos", "ja", "eu", "tambem", "so", "pelo", "pela", "ate",
583 "isso", "ela", "entre", "depois", "sem", "mesmo",
584 ],
585 );
586
587 lists.insert(
588 DetectedLanguage::Nl,
589 vec![
590 "de", "het", "een", "van", "en", "in", "is", "dat", "op", "te", "zijn", "voor", "met",
591 "die", "niet", "aan", "er", "maar", "om", "ook", "als", "dan", "bij", "nog", "uit",
592 "kan", "al", "wel", "zo", "was", "worden", "tot", "naar", "heeft", "over", "meer",
593 "hun", "dit", "door", "onder", "heel", "deze", "dus",
594 ],
595 );
596
597 lists.insert(
598 DetectedLanguage::Tr,
599 vec![
600 "bir", "bu", "da", "ve", "ile", "olan", "icin", "var", "ama", "den", "daha", "gibi",
601 "sonra", "kadar", "olarak", "hem", "her", "ya", "mi", "ne", "ben", "sen", "biz", "siz",
602 "o", "onlar", "ise", "ancak", "yok", "cok",
603 ],
604 );
605
606 lists
607}
608
609fn detect_by_unicode_script(text: &str) -> Result<LanguageDetectionOutput> {
614 let chars: Vec<char> = text.chars().filter(|c| !c.is_whitespace()).collect();
615 if chars.is_empty() {
616 return Ok(unknown_result());
617 }
618
619 let total = chars.len() as f64;
620 let mut script_counts: HashMap<&str, usize> = HashMap::new();
621
622 for &ch in &chars {
623 let script = classify_char(ch);
624 *script_counts.entry(script).or_insert(0) += 1;
625 }
626
627 let mut lang_scores: HashMap<DetectedLanguage, f64> = HashMap::new();
629
630 if let Some(&count) = script_counts.get("cjk") {
631 let hiragana = *script_counts.get("hiragana").unwrap_or(&0) as f64;
633 let katakana = *script_counts.get("katakana").unwrap_or(&0) as f64;
634 let hangul = *script_counts.get("hangul").unwrap_or(&0) as f64;
635
636 if hiragana + katakana > hangul {
637 *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) +=
638 (count as f64 + hiragana + katakana) / total;
639 } else if hangul > 0.0 {
640 *lang_scores.entry(DetectedLanguage::Ko).or_insert(0.0) +=
641 (count as f64 + hangul) / total;
642 } else {
643 *lang_scores.entry(DetectedLanguage::Zh).or_insert(0.0) += count as f64 / total;
644 }
645 }
646
647 if let Some(&count) = script_counts.get("hiragana") {
648 *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) += count as f64 / total;
649 }
650 if let Some(&count) = script_counts.get("katakana") {
651 *lang_scores.entry(DetectedLanguage::Ja).or_insert(0.0) += count as f64 / total;
652 }
653 if let Some(&count) = script_counts.get("hangul") {
654 *lang_scores.entry(DetectedLanguage::Ko).or_insert(0.0) += count as f64 / total;
655 }
656 if let Some(&count) = script_counts.get("cyrillic") {
657 *lang_scores.entry(DetectedLanguage::Ru).or_insert(0.0) += count as f64 / total;
658 }
659 if let Some(&count) = script_counts.get("arabic") {
660 *lang_scores.entry(DetectedLanguage::Ar).or_insert(0.0) += count as f64 / total;
661 }
662 if let Some(&count) = script_counts.get("devanagari") {
663 *lang_scores.entry(DetectedLanguage::Hi).or_insert(0.0) += count as f64 / total;
664 }
665
666 if let Some(&count) = script_counts.get("latin") {
668 let latin_ratio = count as f64 / total;
669 if latin_ratio > 0.5 {
670 return detect_by_ngram(text);
672 }
673 }
674
675 let mut scores: Vec<(DetectedLanguage, f64)> = lang_scores.into_iter().collect();
676 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
677
678 if scores.is_empty() {
679 return Ok(unknown_result());
680 }
681
682 let best = scores[0];
683 let confidence = best.1.clamp(0.0, 1.0);
684
685 Ok(LanguageDetectionOutput {
686 language: best.0,
687 confidence,
688 alternatives: scores.into_iter().skip(1).collect(),
689 })
690}
691
/// Maps a character to a coarse script label based on its code point.
///
/// Returns one of `"latin"`, `"cyrillic"`, `"arabic"`, `"devanagari"`, `"cjk"`,
/// `"hiragana"`, `"katakana"`, `"hangul"`, `"thai"`, `"greek"`, `"hebrew"` or `"other"`.
/// Digits, punctuation, symbols and control characters are `"other"`.
fn classify_char(ch: char) -> &'static str {
    let code = ch as u32;
    match code {
        // ASCII letters only: 0x5B..=0x60 and 0x7B..=0x7F are punctuation and DEL.
        0x0041..=0x005A | 0x0061..=0x007A => "latin",
        // Latin-1 letters: the ordinal indicators plus the accented letters, skipping
        // the multiplication sign (0xD7) and division sign (0xF7).
        0x00AA | 0x00BA | 0x00C0..=0x00D6 | 0x00D8..=0x00F6 | 0x00F8..=0x00FF => "latin",
        // Latin Extended-A and Latin Extended-B.
        0x0100..=0x024F => "latin",
        // Latin Extended Additional.
        0x1E00..=0x1EFF => "latin",
        0x0400..=0x052F => "cyrillic",
        // The last range stops before U+FEFF, which is the byte-order mark rather than
        // an Arabic letter.
        0x0600..=0x06FF | 0x0750..=0x077F | 0xFB50..=0xFDFF | 0xFE70..=0xFEFE => "arabic",
        0x0900..=0x097F => "devanagari",
        0x4E00..=0x9FFF | 0x3400..=0x4DBF | 0x20000..=0x2A6DF => "cjk",
        0x3040..=0x309F => "hiragana",
        0x30A0..=0x30FF | 0x31F0..=0x31FF => "katakana",
        0xAC00..=0xD7AF | 0x1100..=0x11FF | 0x3130..=0x318F => "hangul",
        0x0E00..=0x0E7F => "thai",
        0x0370..=0x03FF => "greek",
        0x0590..=0x05FF => "hebrew",
        _ => "other",
    }
}
723
724fn detect_combined(text: &str) -> Result<LanguageDetectionOutput> {
729 let script_result = detect_by_unicode_script(text)?;
731 if script_result.language != DetectedLanguage::Unknown && script_result.confidence > 0.6 {
732 return Ok(script_result);
733 }
734
735 let ngram_result = detect_by_ngram(text)?;
737 let word_result = detect_by_word_frequency(text)?;
738
739 let mut combined: HashMap<DetectedLanguage, f64> = HashMap::new();
741 let ngram_weight = 0.55;
742 let word_weight = 0.45;
743
744 *combined.entry(ngram_result.language).or_insert(0.0) += ngram_weight * ngram_result.confidence;
746 for (lang, score) in &ngram_result.alternatives {
747 *combined.entry(*lang).or_insert(0.0) += ngram_weight * score;
748 }
749
750 *combined.entry(word_result.language).or_insert(0.0) += word_weight * word_result.confidence;
752 for (lang, score) in &word_result.alternatives {
753 *combined.entry(*lang).or_insert(0.0) += word_weight * score;
754 }
755
756 let mut scores: Vec<(DetectedLanguage, f64)> = combined.into_iter().collect();
757 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
758
759 if scores.is_empty() {
760 return Ok(unknown_result());
761 }
762
763 let best = scores[0];
764 let confidence = best.1.clamp(0.0, 1.0);
765
766 Ok(LanguageDetectionOutput {
767 language: best.0,
768 confidence,
769 alternatives: scores.into_iter().skip(1).collect(),
770 })
771}
772
773fn unknown_result() -> LanguageDetectionOutput {
774 LanguageDetectionOutput {
775 language: DetectedLanguage::Unknown,
776 confidence: 0.0,
777 alternatives: Vec::new(),
778 }
779}
780
#[cfg(test)]
mod tests {
    use super::*;

    // ---- English ----

    #[test]
    fn test_detect_english() {
        let result = detect_language(
            "The quick brown fox jumps over the lazy dog. This is a test of the language detection system.",
        )
        .expect("Should succeed");
        assert_eq!(result.language, DetectedLanguage::En);
        assert!(result.confidence > 0.0);
    }

    #[test]
    fn test_detect_english_short() {
        let result = detect_language("Hello world, how are you today?").expect("Should succeed");
        assert_eq!(result.language, DetectedLanguage::En);
    }

    #[test]
    fn test_detect_english_ngram_strategy() {
        let result = detect_language_with_strategy(
            "The weather is wonderful and everything looks beautiful in the morning light.",
            DetectionStrategy::Ngram,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::En);
    }

    #[test]
    fn test_detect_english_word_frequency() {
        let result = detect_language_with_strategy(
            "This is a test of the word frequency detection method.",
            DetectionStrategy::WordFrequency,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::En);
    }

    #[test]
    fn test_english_has_alternatives() {
        let result = detect_language(
            "The system provides comprehensive analysis and detailed reporting for all users.",
        )
        .expect("ok");
        assert!(!result.alternatives.is_empty());
    }

    // ---- Spanish ----

    #[test]
    fn test_detect_spanish() {
        let result = detect_language(
            "El gato se sienta en la alfombra. Esta es una prueba del sistema de deteccion de idioma.",
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Es);
    }

    #[test]
    fn test_detect_spanish_ngram() {
        let result = detect_language_with_strategy(
            "Los estudiantes que asistieron a la conferencia disfrutaron de las presentaciones.",
            DetectionStrategy::Ngram,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Es);
    }

    #[test]
    fn test_detect_spanish_word_frequency() {
        let result = detect_language_with_strategy(
            "Para los que no saben, el libro es una de las mejores novelas del siglo.",
            DetectionStrategy::WordFrequency,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Es);
    }

    #[test]
    fn test_detect_spanish_combined() {
        let result = detect_language_with_strategy(
            "La empresa ha contratado a nuevos empleados para el departamento de marketing.",
            DetectionStrategy::Combined,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Es);
    }

    #[test]
    fn test_spanish_confidence_range() {
        let result =
            detect_language("Buenos dias, como estas? Espero que todo vaya bien con la familia.")
                .expect("ok");
        assert!((0.0..=1.0).contains(&result.confidence));
    }

    // ---- French ----

    #[test]
    fn test_detect_french() {
        let result = detect_language(
            "Le chat est assis sur le tapis. Les enfants jouent dans le jardin avec leurs amis.",
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Fr);
    }

    #[test]
    fn test_detect_french_ngram() {
        let result = detect_language_with_strategy(
            "Les resultats des elections ont ete publies dans les journaux ce matin.",
            DetectionStrategy::Ngram,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Fr);
    }

    #[test]
    fn test_detect_french_word() {
        let result = detect_language_with_strategy(
            "Je ne suis pas sur que nous puissions terminer ce projet dans les delais prevus.",
            DetectionStrategy::WordFrequency,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Fr);
    }

    #[test]
    fn test_french_confidence() {
        let result = detect_language("Bonjour, comment allez-vous? Je suis content de vous voir.")
            .expect("ok");
        assert!(result.confidence > 0.0);
    }

    #[test]
    fn test_detect_french_combined() {
        let result = detect_language(
            "Les entreprises francaises investissent dans les nouvelles technologies pour une meilleure productivite.",
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Fr);
    }

    // ---- German ----

    #[test]
    fn test_detect_german() {
        let result = detect_language(
            "Die Katze sitzt auf der Matte. Die Kinder spielen im Garten mit ihren Freunden.",
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::De);
    }

    #[test]
    fn test_detect_german_word() {
        let result = detect_language_with_strategy(
            "Ich bin nicht sicher, ob wir dieses Projekt noch rechtzeitig fertigstellen werden.",
            DetectionStrategy::WordFrequency,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::De);
    }

    #[test]
    fn test_detect_german_ngram() {
        let result = detect_language_with_strategy(
            "Die Ergebnisse der Untersuchung wurden gestern veroffentlicht und haben grosse Aufmerksamkeit erregt.",
            DetectionStrategy::Ngram,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::De);
    }

    #[test]
    fn test_german_confidence() {
        let result = detect_language("Guten Tag, wie geht es Ihnen? Ich hoffe, es geht Ihnen gut.")
            .expect("ok");
        assert!(result.confidence > 0.0);
    }

    #[test]
    fn test_detect_german_combined() {
        let result = detect_language(
            "Die Wissenschaftler haben einen wichtigen Durchbruch in der Forschung erzielt.",
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::De);
    }

    // ---- Non-Latin scripts ----

    #[test]
    fn test_detect_chinese() {
        let result =
            detect_language("今天天气很好,我们去公园散步吧。这是一个美丽的城市。").expect("ok");
        assert_eq!(result.language, DetectedLanguage::Zh);
    }

    #[test]
    fn test_detect_japanese() {
        let result =
            detect_language("今日はとてもいい天気です。公園で散歩しましょう。").expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ja);
    }

    #[test]
    fn test_detect_korean() {
        let result =
            detect_language("오늘 날씨가 정말 좋습니다. 공원에서 산책합시다.").expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ko);
    }

    #[test]
    fn test_detect_russian() {
        let result = detect_language("Сегодня прекрасная погода. Давайте пойдем гулять в парк.")
            .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ru);
    }

    #[test]
    fn test_detect_arabic() {
        let result = detect_language("الطقس جميل اليوم. دعونا نذهب للمشي في الحديقة.").expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ar);
    }

    // ---- Edge cases ----

    #[test]
    fn test_empty_text_error() {
        let result = detect_language("");
        assert!(result.is_err());
    }

    #[test]
    fn test_whitespace_only_error() {
        let result = detect_language("  \t\n  ");
        assert!(result.is_err());
    }

    #[test]
    fn test_very_short_text() {
        // "Hi" is shorter than one trigram and is not in any common-word list, so no
        // detector has evidence and the result must be the explicit Unknown answer.
        let result = detect_language("Hi").expect("ok");
        assert_eq!(result.language, DetectedLanguage::Unknown);
        assert!((0.0..=1.0).contains(&result.confidence));
    }

    #[test]
    fn test_iso_code_round_trip() {
        let lang = DetectedLanguage::En;
        assert_eq!(lang.iso_code(), "en");
        assert_eq!(lang.name(), "English");
    }

    #[test]
    fn test_unknown_iso_code() {
        let lang = DetectedLanguage::Unknown;
        assert_eq!(lang.iso_code(), "und");
    }

    // ---- Unicode script strategy ----

    #[test]
    fn test_unicode_script_cjk() {
        let result =
            detect_language_with_strategy("这是一个测试。", DetectionStrategy::UnicodeScript)
                .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Zh);
    }

    #[test]
    fn test_unicode_script_cyrillic() {
        let result = detect_language_with_strategy(
            "Привет мир, как дела?",
            DetectionStrategy::UnicodeScript,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ru);
    }

    #[test]
    fn test_unicode_script_arabic() {
        let result =
            detect_language_with_strategy("مرحبا بالعالم", DetectionStrategy::UnicodeScript)
                .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Ar);
    }

    #[test]
    fn test_unicode_script_devanagari() {
        let result =
            detect_language_with_strategy("नमस्ते दुनिया, कैसे हो?", DetectionStrategy::UnicodeScript)
                .expect("ok");
        assert_eq!(result.language, DetectedLanguage::Hi);
    }

    #[test]
    fn test_unicode_script_latin_falls_back() {
        // Mostly-Latin text is delegated to the trigram detector.
        let result = detect_language_with_strategy(
            "The quick brown fox jumps over the lazy dog.",
            DetectionStrategy::UnicodeScript,
        )
        .expect("ok");
        assert_eq!(result.language, DetectedLanguage::En);
    }
}