use std::collections::HashMap;
use lingua::{Language, LanguageDetector};
use unicode_segmentation::UnicodeSegmentation;
use whatlang::Lang;
/// Detects the language of `text` and pairs it with a confidence value.
///
/// The lingua `detector` is asked first. If it reports no language, `None`
/// is returned. If it reports Japanese or Chinese, the answer from
/// [`detect_chinese_or_japanese`] is preferred whenever that function yields
/// one; otherwise the lingua language is returned together with the value of
/// `detector.compute_language_confidence` for that language.
pub fn detect_language_with_confidence(
    detector: &LanguageDetector,
    text: &str,
) -> Option<(Language, f64)> {
    let detected = detector.detect_language_of(text)?;

    // Chinese/Japanese results get a second opinion; fall through to the
    // lingua result only when the second opinion is unavailable.
    if matches!(detected, Language::Japanese | Language::Chinese) {
        if let Some(second_opinion) = detect_chinese_or_japanese(text) {
            return Some(second_opinion);
        }
    }

    let confidence = detector.compute_language_confidence(text, detected);
    Some((detected, confidence))
}
/// Decides between Chinese and Japanese for `text` using a whatlang detector
/// whose allowlist contains only `Lang::Cmn` and `Lang::Jpn`.
///
/// Returns the matching lingua [`Language`] together with the confidence
/// reported by whatlang, or `None` when whatlang produces no result (or a
/// language other than the two allowed ones).
pub fn detect_chinese_or_japanese(text: &str) -> Option<(Language, f64)> {
    let allowlist = vec![Lang::Cmn, Lang::Jpn];
    let cjk_detector = whatlang::Detector::with_allowlist(allowlist);

    cjk_detector.detect(text).and_then(|info| {
        // Translate the whatlang language code into the lingua enum.
        let language = match info.lang() {
            Lang::Cmn => Language::Chinese,
            Lang::Jpn => Language::Japanese,
            _ => return None,
        };
        Some((language, info.confidence()))
    })
}
/// Returns up to `max_number_of_languages` languages found in `text`, ordered
/// from highest to lowest accumulated score.
///
/// `text` is split with `split_sentence_bounds`, and each segment is passed to
/// [`detect_language_with_confidence`]. Every segment that yields a language
/// counts towards the detected-segment total; its confidence is added to that
/// language's score only when it exceeds `MIN_SENTENCE_CONFIDENCE`.
///
/// The top-scoring language is always returned (if there is one and
/// `max_number_of_languages > 0`). Further languages are returned only while
/// their score exceeds the "fluke border", which grows with the number of
/// detected segments:
/// `FLUKE_BORDER_BASE + detected_segments * FLUKE_BORDER_PER_SENTENCE`.
///
/// Languages with equal scores are ordered by the position at which they were
/// first counted, so the result does not depend on `HashMap` iteration order.
pub fn detect_top_n_languages(
    detector: &LanguageDetector,
    max_number_of_languages: usize,
    text: &str,
) -> Vec<Language> {
    /// Per-segment confidence a detection must exceed to contribute to a score.
    const MIN_SENTENCE_CONFIDENCE: f64 = 0.3;
    /// Constant part of the score a non-top language must exceed.
    const FLUKE_BORDER_BASE: f64 = 0.25;
    /// Amount added to the fluke border for every detected segment.
    const FLUKE_BORDER_PER_SENTENCE: f64 = 0.05;

    // Nothing can be returned, so skip the detection work entirely.
    if max_number_of_languages == 0 {
        return Vec::new();
    }

    // language -> (accumulated confidence, detection index when first counted)
    let mut language_highscores: HashMap<Language, (f64, usize)> = HashMap::new();
    let mut detected_sentence_count: usize = 0;
    for sentence in text.split_sentence_bounds() {
        if let Some((language, confidence)) = detect_language_with_confidence(detector, sentence) {
            let first_seen = detected_sentence_count;
            detected_sentence_count += 1;
            if confidence > MIN_SENTENCE_CONFIDENCE {
                language_highscores
                    .entry(language)
                    .or_insert((0.0, first_seen))
                    .0 += confidence;
            }
        }
    }

    let mut scores: Vec<(Language, f64, usize)> = language_highscores
        .into_iter()
        .map(|(language, (score, first_seen))| (language, score, first_seen))
        .collect();
    // Highest score first; ties go to the language counted earlier in the
    // text. The keys are unique per language, so an unstable sort is enough.
    scores.sort_unstable_by(|(_, score_a, seen_a), (_, score_b, seen_b)| {
        score_b.total_cmp(score_a).then(seen_a.cmp(seen_b))
    });

    let fluke_border =
        FLUKE_BORDER_BASE + (detected_sentence_count as f64) * FLUKE_BORDER_PER_SENTENCE;

    scores
        .into_iter()
        .take(max_number_of_languages)
        .enumerate()
        // The best candidate is always kept; the rest must clear the border.
        .take_while(|(rank, (_, score, _))| *rank == 0 || *score > fluke_border)
        .map(|(_, (language, _, _))| language)
        .collect()
}