use crate::alphabets;
use crate::core::{calculate_confidence, Info, InternalQuery};
use crate::trigrams;
use crate::Lang;
/// Result of the combined (alphabet + trigram) detection, including the raw
/// outcomes of both underlying detectors.
#[derive(Debug)]
pub struct RawOutcome {
/// Combined per-language scores; `raw_detect` in this module stores them
/// sorted from the highest score to the lowest.
pub scores: Vec<(Lang, f64)>,
/// Raw outcome returned by `alphabets::raw_detect` for the same query.
pub alphabet_raw_outcome: alphabets::RawOutcome,
/// Raw outcome returned by `trigrams::raw_detect` for the same query.
pub trigram_raw_outcome: trigrams::RawOutcome,
}
/// Detects the most likely language for `iquery` using the combined
/// alphabet + trigram scoring.
///
/// Returns `None` when the combined detection yields no candidate language.
/// Otherwise the best-scoring language is reported; its confidence is
/// computed from the two best scores and the trigram count, or is `1.0`
/// when there is no second candidate to compare against.
pub fn detect(iquery: &InternalQuery) -> Option<Info> {
    let outcome = raw_detect(iquery);
    let trigrams_count = outcome.trigram_raw_outcome.trigrams_count;

    // `raw_detect` returns the scores already ordered best-first.
    let mut ranked = outcome.scores.into_iter();
    let (best_lang, best_score) = ranked.next()?;
    let runner_up = ranked.next();

    let script = iquery.multi_lang_script.to_script();
    let confidence = match runner_up {
        Some((_, second_score)) => calculate_confidence(best_score, second_score, trigrams_count),
        // A single candidate has nothing to compete with.
        None => 1.0,
    };

    Some(Info::new(script, best_lang, confidence))
}
/// Runs both the alphabet-based and the trigram-based detectors on `iquery`
/// and merges their per-language scores into one ranking.
///
/// For every language reported by either detector the combined score is
/// `a * alphabet_weight + t * trigram_weight`, where `a` and `t` are the
/// language's scores from the alphabet and trigram detectors (`0.0` when a
/// detector did not report the language). `alphabet_weight` is derived from
/// the `count` reported by the alphabet detector and `trigram_weight` is its
/// complement to `1.0`.
///
/// The returned [`RawOutcome`] holds the combined scores sorted from highest
/// to lowest together with both detectors' raw outcomes.
pub fn raw_detect(iquery: &InternalQuery) -> RawOutcome {
    let alphabet_raw_outcome = alphabets::raw_detect(iquery);
    let trigram_raw_outcome = trigrams::raw_detect(iquery);

    let alphabet_scores = &alphabet_raw_outcome.scores;
    let trigram_scores = &trigram_raw_outcome.scores;

    // Candidate set: every language from the alphabet detector, followed by
    // the trigram detector's languages that are not already present.
    let mut all_langs: Vec<Lang> = alphabet_scores.iter().map(|x| x.0).collect();
    for (lang, _) in trigram_scores.iter() {
        if !all_langs.contains(lang) {
            all_langs.push(*lang);
        }
    }

    let count = alphabet_raw_outcome.count;
    let alphabet_weight = calc_alphabet_weight(count);
    let trigram_weight = 1.0 - alphabet_weight;

    let mut scores = Vec::with_capacity(all_langs.len());
    for lang in all_langs {
        // A language missing from one detector contributes 0.0 for that part.
        let a: f64 = alphabet_scores
            .iter()
            .find(|(l, _)| l == &lang)
            .map(|x| x.1)
            .unwrap_or(0.0);
        let t: f64 = trigram_scores
            .iter()
            .find(|(l, _)| l == &lang)
            .map(|x| x.1)
            .unwrap_or(0.0);

        debug_assert!(a >= 0.0);
        debug_assert!(a <= 1.0);
        debug_assert!(t >= 0.0);
        debug_assert!(t <= 1.0);

        let score = a * alphabet_weight + t * trigram_weight;
        debug_assert!(score >= 0.0);
        debug_assert!(score <= 1.0);

        scores.push((lang, score));
    }

    // Descending by score (note the swapped operands). `total_cmp` is a total
    // order even if a score turns out to be NaN, which `sort_unstable_by`
    // requires; the previous `partial_cmp(..).unwrap_or(Less)` comparator was
    // inconsistent in that case. Ordinary scores are ordered exactly as before.
    scores.sort_unstable_by(|a, b| b.1.total_cmp(&a.1));

    RawOutcome {
        scores,
        alphabet_raw_outcome,
        trigram_raw_outcome,
    }
}
/// Computes the weight given to the alphabet-based score for a given `count`.
///
/// The weight starts at `2/3` for a count of zero, decreases linearly by
/// `1/300` per unit of `count`, and never goes below `1/3` (reached at a
/// count of 100).
fn calc_alphabet_weight(count: usize) -> f64 {
    const LOWER_BOUND: f64 = 1.0 / 3.0;
    const UPPER_BOUND: f64 = 2.0 / 3.0;

    let linear = UPPER_BOUND - count as f64 / 300.0;
    linear.max(LOWER_BOUND).min(UPPER_BOUND)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks the alphabet weight at the upper bound, the midpoint, the point
    /// where the lower bound is reached, and beyond it.
    #[test]
    fn test_calc_alphabet_weight() {
        let cases: [(usize, f64); 4] = [
            (0, 2.0 / 3.0),
            (50, 0.5),
            (100, 1.0 / 3.0),
            (200, 1.0 / 3.0),
        ];
        for &(count, expected) in cases.iter() {
            assert_eq!(calc_alphabet_weight(count), expected);
        }
    }
}