1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
use std::collections::HashMap; mod lang; mod script; mod query; mod result; mod trigrams; pub use lang::*; pub use script::*; pub use trigrams::*; pub use query::Query; pub use result::Result; const MAX_DIST : u32 = 300; pub fn detect_lang(query : Query) -> Option<Result> { let text = query.text; detect_script(text).map( |script| { let lang = detect_lang_based_on_script(text, script); Result { lang: lang, script: script } }) } fn detect_lang_based_on_script(text: &String, script : Script) -> Lang { match script { Script::Latin => detect(text, LATIN_LANGS), Script::Cyrillic => detect(text, CYRILLIC_LANGS), Script::Devanagari => detect(text, DEVANAGARI_LANGS), Script::Hebrew => detect(text, HEBREW_LANGS), Script::Ethiopic => detect(text, ETHIOPIC_LANGS), Script::Arabic => Lang::Arb, Script::Mandarin => Lang::Cmn, Script::Bengali => Lang::Ben, Script::Hangul => Lang::Kor, Script::Georgian => Lang::Kat, Script::Katakana | Script::Hiragana => Lang::Jpn } } fn detect(text : &String, lang_profile_list : LangProfileList) -> Lang { let mut lang_distances : Vec<(Lang, u32)> = vec![]; let trigrams = get_trigrams_with_positions(&text); for &(ref lang, lang_trigrams) in lang_profile_list { let dist = calculate_distance(lang_trigrams, &trigrams); lang_distances.push(((*lang).clone(), dist)); } lang_distances.sort_by_key(|key| key.1 ); (lang_distances.iter().nth(0).unwrap().0).clone() } fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 { let mut total_dist = 0u32; for (i, &trigram) in lang_trigrams.iter().enumerate() { let dist = match text_trigrams.get(trigram) { Some(&n) => (n as i32 - i as i32).abs() as u32, None => MAX_DIST }; total_dist += dist; } total_dist } #[cfg(test)] mod tests { use lang::Lang; use script::Script; use super::detect_lang; use super::Query; #[test] fn test_detect_lang() { let text = &"Además de todo lo anteriormente dicho, también encontramos...".to_string(); let query = Query { text: text }; let res = detect_lang(query).unwrap(); assert_eq!(res.lang, Lang::Spa); assert_eq!(res.script, Script::Latin); let text = &"English does not suit well for the role of international language".to_string(); let query = Query { text: text }; let res = detect_lang(query).unwrap(); assert_eq!(res.lang, Lang::Eng); assert_eq!(res.script, Script::Latin); let text = &"Та нічого, все нормально. А в тебе як?".to_string(); let query = Query { text: text }; let res = detect_lang(query).unwrap(); assert_eq!(res.lang, Lang::Ukr); assert_eq!(res.script, Script::Cyrillic); let text = &"ইউনিকোডে বাংলা লিপি".to_string(); let query = Query { text: text }; let res = detect_lang(query).unwrap(); assert_eq!(res.lang, Lang::Ben); assert_eq!(res.script, Script::Bengali); } }