1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
use std::collections::HashMap;

mod lang;
mod script;
mod query;
mod result;
mod trigrams;

pub use lang::*;
pub use script::*;
pub use trigrams::*;
pub use query::Query;
pub use result::Result;

/// Distance charged by `calculate_distance` for a language-profile trigram
/// that does not occur in the text at all.
const MAX_DIST : u32 = 300;

/// Detects the script and the language of the text carried by `query`.
///
/// Returns `None` when `detect_script` finds no script for the text, or when
/// no language can be chosen for the detected script (for example because the
/// query's whitelist/blacklist excludes every candidate).
pub fn detect_lang(query: Query) -> Option<Result> {
    // Without a script there is nothing to base the language choice on.
    let script = match detect_script(query.text) {
        Some(found) => found,
        None => return None,
    };

    match detect_lang_based_on_script(query, script) {
        Some(lang) => Some(Result { lang: lang, script: script }),
        None => None,
    }
}

fn detect_lang_based_on_script(query : Query, script : Script) -> Option<Lang> {
    match script {
        Script::Latin      => detect(query, LATIN_LANGS),
        Script::Cyrillic   => detect(query, CYRILLIC_LANGS),
        Script::Devanagari => detect(query, DEVANAGARI_LANGS),
        Script::Hebrew     => detect(query, HEBREW_LANGS),
        Script::Ethiopic   => detect(query, ETHIOPIC_LANGS),
        Script::Arabic     => detect(query, ARABIC_LANGS),
        Script::Mandarin => Some(Lang::Cmn),
        Script::Bengali  => Some(Lang::Ben),
        Script::Hangul   => Some(Lang::Kor),
        Script::Georgian => Some(Lang::Kat),
        Script::Greek    => Some(Lang::Ell),
        Script::Kannada  => Some(Lang::Kan),
        Script::Tamil    => Some(Lang::Tam),
        Script::Thai     => Some(Lang::Tha),
        Script::Gujarati => Some(Lang::Guj),
        Script::Gurmukhi => Some(Lang::Pan),
        Script::Katakana | Script::Hiragana  => Some(Lang::Jpn)
    }
}

/// Picks the language from `lang_profile_list` whose trigram profile is
/// closest to the query text.
///
/// Candidate filtering: when the query has a whitelist, only whitelisted
/// languages are considered and any blacklist is ignored; otherwise the
/// languages on the blacklist (if one is set) are skipped.
///
/// Returns `None` when no candidate is left after filtering. When several
/// languages share the smallest distance, the one listed first in
/// `lang_profile_list` wins.
fn detect(query: Query, lang_profile_list: LangProfileList) -> Option<Lang> {
    let trigrams = get_trigrams_with_positions(&query.text);

    // The whitelist takes precedence over the blacklist.
    let is_candidate = |lang: &Lang| {
        if let Some(ref whitelist) = query.whitelist {
            whitelist.contains(lang)
        } else if let Some(ref blacklist) = query.blacklist {
            !blacklist.contains(lang)
        } else {
            true
        }
    };

    lang_profile_list
        .iter()
        .filter(|entry| is_candidate(&entry.0))
        .map(|&(lang, lang_trigrams)| (lang, calculate_distance(lang_trigrams, &trigrams)))
        // A single pass is enough to find the closest profile; `min_by_key`
        // returns the first of several equally small distances.
        .min_by_key(|&(_, dist)| dist)
        .map(|(lang, _)| lang)
}

/// Sums the per-trigram distances between a language profile and a text.
///
/// For every trigram of `lang_trigrams`, taken together with its index in the
/// profile, the value stored for that trigram in `text_trigrams` is looked up.
/// If present, the absolute difference between that value and the index is
/// added; if absent, `MAX_DIST` is added instead.
fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
    lang_trigrams
        .iter()
        .enumerate()
        .map(|(rank, &trigram)| {
            text_trigrams
                .get(trigram)
                .map_or(MAX_DIST, |&position| (position as i32 - rank as i32).abs() as u32)
        })
        .fold(0u32, |total, dist| total + dist)
}

#[cfg(test)]
mod tests {
    use lang::Lang;
    use script::Script;
    use super::{detect_lang, Query};

    /// Every sample must be detected with the expected language and script.
    #[test]
    fn test_detect_lang() {
        let samples = vec![
            (
                "Además de todo lo anteriormente dicho, también encontramos...",
                Lang::Spa,
                Script::Latin,
            ),
            (
                "English does not suit well for the role of international language",
                Lang::Eng,
                Script::Latin,
            ),
            (
                "Та нічого, все нормально. А в тебе як?",
                Lang::Ukr,
                Script::Cyrillic,
            ),
            (
                "ইউনিকোডে বাংলা লিপি",
                Lang::Ben,
                Script::Bengali,
            ),
        ];

        for (sample, expected_lang, expected_script) in samples {
            let text = sample.to_string();
            let detected = detect_lang(Query::new(&text)).unwrap();
            assert_eq!(detected.lang, expected_lang);
            assert_eq!(detected.script, expected_script);
        }
    }

    /// Blacklisting the closer-ranked languages changes the detected language.
    #[test]
    fn test_detect_lang_with_blacklist() {
        let text = String::from("I am begging pardon");

        // Without a blacklist this sample is detected as Javanese.
        let unrestricted = detect_lang(Query::new(&text)).unwrap();
        assert_eq!(unrestricted.lang, Lang::Jav);

        // With those languages blacklisted, English is detected.
        let excluded = vec![Lang::Jav, Lang::Nld, Lang::Uzb, Lang::Swe, Lang::Nob];
        let restricted = detect_lang(Query::new(&text).blacklist(excluded)).unwrap();
        assert_eq!(restricted.lang, Lang::Eng);
    }

    /// Blacklisting every language of the detected script yields `None`.
    #[test]
    fn test_detect_lang_with_blacklist_none() {
        let text = String::from("האקדמיה ללשון העברית");

        // Both languages with Hebrew script are blacklisted, so nothing is left.
        let excluded = vec![Lang::Heb, Lang::Ydd];
        let outcome = detect_lang(Query::new(&text).blacklist(excluded));
        assert_eq!(outcome, None);
    }

    /// Only whitelisted languages may be returned.
    #[test]
    fn test_detect_lang_with_whitelist() {
        let whitelist = vec![Lang::Epo, Lang::Ukr];

        let latin_text = String::from("Mi ne scias! Ne demandu min plu!");
        let latin_query = Query::new(&latin_text).whitelist(whitelist.clone());
        assert_eq!(detect_lang(latin_query).unwrap().lang, Lang::Epo);

        let cyrillic_text = String::from("Тут все.");
        let cyrillic_query = Query::new(&cyrillic_text).whitelist(whitelist.clone());
        assert_eq!(detect_lang(cyrillic_query).unwrap().lang, Lang::Ukr);
    }
}