whatlang 0.1.2

Natural language detection library. Identifies the language of a given text.
Documentation
use std::collections::HashMap;

mod lang;
mod script;
mod query;
mod result;
mod trigrams;

pub use lang::*;
pub use script::*;
pub use trigrams::*;
pub use query::Query;
pub use result::Result;

/// Penalty added to a language's total distance for every trigram of its
/// profile that is not found among the trigrams of the analysed text
/// (see `calculate_distance`).
const MAX_DIST : u32 = 300;

/// Detects the script and the language of the text carried by `query`.
///
/// The script is determined first via `detect_script`; the language is then
/// chosen by `detect_lang_based_on_script` for that script.
///
/// Returns `None` when `detect_script` yields no script for the text,
/// otherwise `Some` with both the detected language and script.
///
/// Note: `Result` here is this crate's own `result::Result` struct (it has
/// `lang` and `script` fields), not `std::result::Result`.
pub fn detect_lang(query : Query) -> Option<Result> {
    let text = query.text;
    match detect_script(text) {
        Some(script) => {
            let lang = detect_lang_based_on_script(text, script);
            Some(Result { lang: lang, script: script })
        }
        None => None
    }
}

/// Picks the language of `text`, given the `script` it is written in.
///
/// Some scripts are mapped directly to a single language and the text is not
/// inspected at all. For the remaining scripts the decision is delegated to
/// `detect`, which compares the text against that script's language profiles.
fn detect_lang_based_on_script(text: &String, script : Script) -> Lang {
    match script {
        // Scripts mapped to exactly one language: no text analysis needed.
        Script::Arabic   => Lang::Arb,
        Script::Mandarin => Lang::Cmn,
        Script::Bengali  => Lang::Ben,
        Script::Hangul   => Lang::Kor,
        Script::Georgian => Lang::Kat,
        Script::Katakana => Lang::Jpn,
        Script::Hiragana => Lang::Jpn,

        // Scripts with several candidate languages: compare against the
        // profile list of the script.
        Script::Latin      => detect(text, LATIN_LANGS),
        Script::Cyrillic   => detect(text, CYRILLIC_LANGS),
        Script::Devanagari => detect(text, DEVANAGARI_LANGS),
        Script::Hebrew     => detect(text, HEBREW_LANGS),
        Script::Ethiopic   => detect(text, ETHIOPIC_LANGS)
    }
}

/// Returns the language from `lang_profile_list` whose trigram profile is
/// closest to the trigrams of `text`.
///
/// The trigrams of `text` are computed once with
/// `get_trigrams_with_positions`; each candidate profile is then scored with
/// `calculate_distance` and the candidate with the smallest distance wins.
/// When several candidates share the smallest distance, the one that appears
/// first in `lang_profile_list` is returned.
///
/// # Panics
///
/// Panics if `lang_profile_list` is empty, since there is no language to
/// return in that case.
fn detect(text : &String, lang_profile_list : LangProfileList) -> Lang {
    let trigrams = get_trigrams_with_positions(text);

    // A single pass with `min_by_key` replaces collecting, cloning and sorting
    // every candidate; like a stable sort, it keeps the first of equal minima.
    lang_profile_list
        .iter()
        .map(|&(ref lang, lang_trigrams)| (lang, calculate_distance(lang_trigrams, &trigrams)))
        .min_by_key(|&(_, dist)| dist)
        .map(|(lang, _)| lang.clone())
        .expect("language profile list must not be empty")
}

/// Computes the distance between a language profile and the trigrams of a text.
///
/// For every trigram of `lang_trigrams`, taken together with its index in the
/// profile:
///
/// * if `text_trigrams` contains the trigram with value `n`, the absolute
///   difference between `n` and the index is added;
/// * otherwise `MAX_DIST` is added.
///
/// The sum of these contributions is returned; a smaller value means a closer
/// match.
///
/// NOTE(review): the values of `text_trigrams` are presumably the rank or
/// position of each trigram within the text, as produced by
/// `get_trigrams_with_positions`; confirm against that function.
fn calculate_distance(lang_trigrams: LangProfile,  text_trigrams: &HashMap<String, u32>) -> u32 {
    lang_trigrams
        .iter()
        .enumerate()
        .map(|(rank, &trigram)| {
            text_trigrams
                .get(trigram)
                .map_or(MAX_DIST, |&position| (position as i32 - rank as i32).abs() as u32)
        })
        .fold(0u32, |total, dist| total + dist)
}

#[cfg(test)]
mod tests {
    use lang::Lang;
    use script::Script;
    use super::detect_lang;
    use super::Query;

    /// Runs `detect_lang` on `sample` and asserts that both the detected
    /// language and the detected script match the expected values.
    fn assert_detection(sample: &str, expected_lang: Lang, expected_script: Script) {
        let text = sample.to_string();
        let query = Query { text: &text };
        let res = detect_lang(query).unwrap();
        assert_eq!(res.lang, expected_lang);
        assert_eq!(res.script, expected_script);
    }

    #[test]
    fn test_detect_lang() {
        // Latin script, Spanish.
        assert_detection(
            "Además de todo lo anteriormente dicho, también encontramos...",
            Lang::Spa,
            Script::Latin
        );

        // Latin script, English.
        assert_detection(
            "English does not suit well for the role of international language",
            Lang::Eng,
            Script::Latin
        );

        // Cyrillic script, Ukrainian.
        assert_detection(
            "Та нічого, все нормально. А в тебе як?",
            Lang::Ukr,
            Script::Cyrillic
        );

        // Bengali script maps directly to the Bengali language.
        assert_detection(
            "ইউনিকোডে বাংলা লিপি",
            Lang::Ben,
            Script::Bengali
        );
    }
}