whatlang 0.18.0

Fast and lightweight language identification library for Rust.
Documentation
//! It's a hard-core optimized implementation of a relatively simple algorithm.
//! The explanation of the algorithm can be found in the parent module [crate::alphabets].

use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};
use std::cmp::Reverse;
use std::collections::HashMap;
use std::sync::LazyLock;

/// Inverted map binding a character to a set of languages.
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) {
    let mut map = HashMap::new();

    for (lang, alphabet) in alphabets {
        for c in alphabet.chars() {
            let entry = map.entry(c).or_insert_with(Vec::new);
            entry.push(*lang);
        }
    }

    let mut char_lang: Vec<_> = map.into_iter().collect();

    char_lang.sort_unstable_by_key(|(c, _)| *c);

    let mut chars = Vec::with_capacity(char_lang.len());
    let mut langs = Vec::with_capacity(char_lang.len());
    for (ch, languages) in char_lang {
        chars.push(ch);
        langs.push(languages);
    }

    (chars, langs)
}

pub fn generic_alphabet_calculate_scores(
    script: Script,
    lang_map: &LazyLock<(Vec<char>, Vec<Vec<Lang>>)>,
    text: &LowercaseText,
    filter_list: &FilterList,
) -> RawOutcome {
    let (chars, langs) = &**lang_map;
    let script_langs = script.langs();

    // score of each character.
    let mut char_scores = vec![0; chars.len()];
    let mut max_raw_score = 0;
    // iterate over the text and scores characters.
    for ch in text.chars() {
        if is_stop_char(ch) {
            continue;
        }

        max_raw_score += 1;

        if let Ok(position) = chars.binary_search(&ch) {
            // add 2 and remove max_raw_score at the end,
            // to keep the score interval of -max_raw_score..max_raw_score
            char_scores[position] += 2;
        }
    }

    // score of each lang.
    let mut lang_scores = vec![0; Lang::all().len()];
    let mut common_score: usize = 0;
    // iterate over scored characters to compute language's scores.
    for (position, char_score) in char_scores.into_iter().enumerate() {
        if char_score > 0 {
            let languages = &langs[position];
            // if current character is common to all Languages, increment a common score
            // instead of iterating over all Languages scores.
            if languages.len() == script_langs.len() {
                common_score += char_score;
            } else {
                for &lang in languages {
                    lang_scores[lang as usize] += char_score;
                }
            }
        }
    }

    // remap languages with theirs scores.
    let mut raw_scores: Vec<(Lang, usize)> = script_langs
        .iter()
        .filter(|&&l| filter_list.is_allowed(l))
        .map(|&l| {
            let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
            (l, score)
        })
        .collect();

    raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));

    let mut normalized_scores = vec![];

    for &(lang, raw_score) in raw_scores.iter() {
        let normalized_score = raw_score as f64 / max_raw_score as f64;
        normalized_scores.push((lang, normalized_score));
    }

    RawOutcome {
        count: max_raw_score,
        raw_scores,
        scores: normalized_scores,
    }
}