use std::sync::LazyLock;
use super::RawOutcome;
use super::common::{build_inverted_map, generic_alphabet_calculate_scores};
use crate::core::{FilterList, LowercaseText};
use crate::{Lang, Script};
const AFR: &str = "abcdefghijklmnopqrstuvwxyzáèéêëíîïóôúû";
const AKA: &str = "abdefghiklmnoprstuwyɔɛ";
const AZE: &str = "abcdefghijklmnopqrstuvxyzçöüğışə̇";
const CAT: &str = "abcdefghijklmnopqrstuvwxyz·àçèéíïòóúü";
const CES: &str = "abcdefghijklmnopqrstuvwxyzáéíóúýčďěňřšťůž";
const CYM: &str = "abcdefghijklmnopqrstuvwxyzàáâäèéêëìíîïòóôöùúûüýÿŵŷẁẃẅỳ";
const DAN: &str = "abcdefghijklmnopqrstuvwxyzåæø";
const DEU: &str = "abcdefghijklmnopqrstuvwxyzßäöü";
const ENG: &str = "abcdefghijklmnopqrstuvwxyz";
const EPO: &str = "abcdefghijklmnoprstuvzĉĝĥĵŝŭ";
const EST: &str = "abcdefghijklmnopqrstuvwxyzäõöü";
const FIN: &str = "abcdefghijklmnopqrstuvwxyzäöšž";
const FRA: &str = "abcdefghijklmnopqrstuvwxyzàâçèéêëîïôùûüÿœ";
const HRV: &str = "abcdefghijklmnopqrstuvwxyzćčđšž";
const HUN: &str = "abcdefghijklmnopqrstuvwxyzáéíóöúüőű";
const IND: &str = "abcdefghijklmnopqrstuvwxyz";
const ITA: &str = "abcdefghijklmnopqrstuvwxyzàèéìòù";
const JAV: &str = "abcdefghijklmnopqrstuvwxyzèé";
const LAT: &str = "abcdefghijklmnopqrstuvwxyz";
const LAV: &str = "abcdefghijklmnopqrstuvwxyzāčēģīķļņōŗšūž";
const LIT: &str = "abcdefghijklmnopqrstuvwxyząčėęįšūųž";
const NLD: &str = "abcdefghijklmnopqrstuvwxyzàèéëïij";
const NOB: &str = "abcdefghijklmnopqrstuvwxyzåæø";
const POL: &str = "abcdefghijklmnopqrstuvwxyzóąćęłńśźż";
const POR: &str = "abcdefghijklmnopqrstuvwxyzàáâãçéêíóôõú";
const RON: &str = "abcdefghijklmnopqrstuvwxyzâîăşţ";
const SLK: &str = "abcdefghijklmnopqrstuvwxyzáäéíóôúýčďĺľňŕšťž";
const SLV: &str = "abcdefghijklmnopqrstuvwxyzčšž";
const SNA: &str = "abcdefghijklmnopqrstuvwxyz";
const SPA: &str = "abcdefghijklmnopqrstuvwxyz¡¿áéíñóúü";
const SWE: &str = "abcdefghijklmnopqrstuvwxyzäåö";
const TGL: &str = "abcdefghijklmnopqrstuvwxyzáéíñóú";
const TUK: &str = "abdefghijklmnoprstuwyzäçöüýňşž";
const TUR: &str = "abcdefghijklmnopqrstuvwxyzçöüğış̇";
const UZB: &str = "abcdefghijklmnopqrstuvxyzʻ";
const VIE: &str =
"abcdefghijklmnopqrstuvwxyzàáâãèéêìíòóôõùúýăđĩũơưạảấầẩẫậắằẳẵặẹẻẽếềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ";
const ZUL: &str = "abcdefghijklmnopqrstuvwxyz";
const LATIN_ALPHABETS: &[(Lang, &str)] = &[
(Lang::Afr, AFR),
(Lang::Aka, AKA),
(Lang::Aze, AZE),
(Lang::Cat, CAT),
(Lang::Ces, CES),
(Lang::Cym, CYM),
(Lang::Dan, DAN),
(Lang::Deu, DEU),
(Lang::Eng, ENG),
(Lang::Epo, EPO),
(Lang::Est, EST),
(Lang::Fin, FIN),
(Lang::Fra, FRA),
(Lang::Hrv, HRV),
(Lang::Hun, HUN),
(Lang::Ind, IND),
(Lang::Ita, ITA),
(Lang::Jav, JAV),
(Lang::Lat, LAT),
(Lang::Lav, LAV),
(Lang::Lit, LIT),
(Lang::Nld, NLD),
(Lang::Nob, NOB),
(Lang::Pol, POL),
(Lang::Por, POR),
(Lang::Ron, RON),
(Lang::Slk, SLK),
(Lang::Slv, SLV),
(Lang::Sna, SNA),
(Lang::Spa, SPA),
(Lang::Swe, SWE),
(Lang::Tgl, TGL),
(Lang::Tuk, TUK),
(Lang::Tur, TUR),
(Lang::Uzb, UZB),
(Lang::Vie, VIE),
(Lang::Zul, ZUL),
];
pub static ALPHABET_LANG_MAP: LazyLock<(Vec<char>, Vec<Vec<Lang>>)> =
LazyLock::new(|| build_inverted_map(LATIN_ALPHABETS));
pub fn alphabet_calculate_scores(text: &LowercaseText, filter_list: &FilterList) -> RawOutcome {
generic_alphabet_calculate_scores(Script::Latin, &ALPHABET_LANG_MAP, text, filter_list)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::Script;
use crate::utils::is_stop_char;
fn naive_alphabet_calculate_scores(
text: &LowercaseText,
filter_list: &FilterList,
) -> RawOutcome {
let mut raw_scores: Vec<(Lang, i32)> = Script::Latin
.langs()
.iter()
.filter(|&&l| filter_list.is_allowed(l))
.map(|&l| (l, 0i32))
.collect();
let max_raw_score = text.chars().filter(|&ch| !is_stop_char(ch)).count();
for (lang, score) in &mut raw_scores {
let alphabet: Vec<char> = LATIN_ALPHABETS
.iter()
.find(|(l, _)| l == lang)
.unwrap()
.1
.chars()
.collect();
for ch in text.chars() {
if is_stop_char(ch) {
continue;
};
if alphabet.contains(&ch) {
*score += 1;
} else {
*score -= 1;
}
}
}
raw_scores.sort_by(|a, b| b.1.cmp(&a.1));
let raw_scores: Vec<(Lang, usize)> = raw_scores
.into_iter()
.map(|(l, s)| {
let score = if s < 0 { 0usize } else { s as usize };
(l, score)
})
.collect();
let mut normalized_scores = vec![];
for &(lang, raw_score) in &raw_scores {
let normalized_score = raw_score as f64 / max_raw_score as f64;
normalized_scores.push((lang, normalized_score));
}
RawOutcome {
count: max_raw_score,
raw_scores,
scores: normalized_scores,
}
}
#[test]
fn test_alphabet_calculate_scores_against_harmaja_hauras() {
let text =
LowercaseText::new("Ja kulkee kylmä hetki pariimme, Olet hauras kuin jää, Ja kulke");
let filter = FilterList::All;
let outcome = alphabet_calculate_scores(&text, &filter);
assert_eq!(outcome.count, 50);
assert_eq!(outcome.raw_scores.len(), 37);
assert_eq!(outcome.scores.len(), 37);
let raw_scores_for = |lang: Lang| {
outcome
.raw_scores
.iter()
.find(|(l, _)| *l == lang)
.unwrap()
.1
};
let scores_for = |lang: Lang| outcome.scores.iter().find(|(l, _)| *l == lang).unwrap().1;
assert_eq!(raw_scores_for(Lang::Fin), 50);
assert_eq!(raw_scores_for(Lang::Deu), 50);
assert_eq!(raw_scores_for(Lang::Epo), 42);
assert_eq!(scores_for(Lang::Fin), 1.0);
assert_eq!(scores_for(Lang::Deu), 1.0);
assert_eq!(scores_for(Lang::Epo), 0.84);
}
#[test]
fn test_works_as_the_old_naive_implementation() {
let filter = FilterList::All;
let texts = [
"Tyrkiske språk eller turkotatariske språk er en språkgruppe bestående av minst 35 språk",
"René Descartes est un mathématicien, physicien et philosophe français, né le 31 mars 1596 à La Haye-en-Touraine",
"Die Sonne scheint in das Büro der Grabdenkmalsfirma Heinrich Kroll & Söhne. Es ist April 923, und das Geschäf geht gut.",
];
for text in texts {
let lowercase_text = LowercaseText::new(text);
let outcome = alphabet_calculate_scores(&lowercase_text, &filter);
let naive_outcome = naive_alphabet_calculate_scores(&lowercase_text, &filter);
assert_eq!(
outcome.count, naive_outcome.count,
"count failed. Text: {}",
text
);
for (lang, raw_naive_score) in naive_outcome.raw_scores.into_iter() {
let lookup_raw_score = |lang| {
outcome
.raw_scores
.iter()
.find(|(l, _)| *l == lang)
.unwrap()
.1
};
let raw_score = lookup_raw_score(lang);
assert_eq!(
raw_score, raw_naive_score,
"raw_score VS raw_naive_score failed. Lang={}, Text: {}",
lang, text
);
}
for (lang, naive_score) in naive_outcome.scores.into_iter() {
let lookup_score =
|lang| outcome.scores.iter().find(|(l, _)| *l == lang).unwrap().1;
let score = lookup_score(lang);
assert_eq!(
score, naive_score,
"score VS naive_score failed. Lang={}, Text: {}",
lang, text
);
}
}
}
}