use super::RawOutcome;
use crate::core::{FilterList, LowercaseText};
use crate::utils::is_stop_char;
use crate::{Lang, Script};
use std::cmp::Reverse;
use std::collections::HashMap;
use std::sync::LazyLock;
pub fn build_inverted_map(alphabets: &[(Lang, &str)]) -> (Vec<char>, Vec<Vec<Lang>>) {
let mut map = HashMap::new();
for (lang, alphabet) in alphabets {
for c in alphabet.chars() {
let entry = map.entry(c).or_insert_with(Vec::new);
entry.push(*lang);
}
}
let mut char_lang: Vec<_> = map.into_iter().collect();
char_lang.sort_unstable_by_key(|(c, _)| *c);
let mut chars = Vec::with_capacity(char_lang.len());
let mut langs = Vec::with_capacity(char_lang.len());
for (ch, languages) in char_lang {
chars.push(ch);
langs.push(languages);
}
(chars, langs)
}
pub fn generic_alphabet_calculate_scores(
script: Script,
lang_map: &LazyLock<(Vec<char>, Vec<Vec<Lang>>)>,
text: &LowercaseText,
filter_list: &FilterList,
) -> RawOutcome {
let (chars, langs) = &**lang_map;
let script_langs = script.langs();
let mut char_scores = vec![0; chars.len()];
let mut max_raw_score = 0;
for ch in text.chars() {
if is_stop_char(ch) {
continue;
}
max_raw_score += 1;
if let Ok(position) = chars.binary_search(&ch) {
char_scores[position] += 2;
}
}
let mut lang_scores = vec![0; Lang::all().len()];
let mut common_score: usize = 0;
for (position, char_score) in char_scores.into_iter().enumerate() {
if char_score > 0 {
let languages = &langs[position];
if languages.len() == script_langs.len() {
common_score += char_score;
} else {
for &lang in languages {
lang_scores[lang as usize] += char_score;
}
}
}
}
let mut raw_scores: Vec<(Lang, usize)> = script_langs
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| {
let score = (lang_scores[l as usize] + common_score).saturating_sub(max_raw_score);
(l, score)
})
.collect();
raw_scores.sort_unstable_by_key(|(_, score)| Reverse(*score));
let mut normalized_scores = vec![];
for &(lang, raw_score) in raw_scores.iter() {
let normalized_score = raw_score as f64 / max_raw_score as f64;
normalized_scores.push((lang, normalized_score));
}
RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
}