use crate::settings::SETTINGS;
use crate::{config::Mode, error::LibCfgError};
pub(crate) use lingua::IsoCode639_1;
use lingua::{LanguageDetector, LanguageDetectorBuilder};
use parse_hyperlinks::iterator::MarkupLink;
use parse_hyperlinks::parser::Link;
use std::collections::HashMap;
/// Detects the natural language(s) in which `input` is written.
///
/// The detected languages are returned as ISO 639-1 code strings. The
/// behavior is controlled by `SETTINGS.get_lang_filter`:
///
/// * `Mode::Disabled`: always returns an empty vector.
/// * `Mode::Error(e)`: always returns `Err(e)`.
/// * `Mode::Monolingual`: returns one code for every text snippet in which a
///   language could be detected.
/// * `Mode::Multilingual`: returns the languages ordered by descending word
///   count (ties are ordered by language code). Detected spans with fewer
///   than `min(consecutive_words_min, words_total / 3)` words are ignored, and
///   so are languages whose share of the remaining words is below
///   `words_total_percentage_min` percent.
///
/// Before detection, the hyperlinks and images found by `MarkupLink` are
/// stripped from the input: only their link text, title and image alt text
/// are kept; all other link fields are discarded.
///
/// Whitespace-only input yields an empty vector.
///
/// # Errors
///
/// Returns the error stored in `Mode::Error` when the filter is configured
/// with that mode.
#[cfg(feature = "lang-detection")]
pub(crate) fn get_lang(input: &str) -> Result<Vec<String>, LibCfgError> {
    use crate::FlattenWithIndexExt;
    use itertools::Itertools;
    use std::borrow::Cow;

    /// Appends `text` to `sniplets` unless it is empty.
    fn push_nonempty<'a>(sniplets: &mut Vec<Cow<'a, str>>, text: Cow<'a, str>) {
        if !text.is_empty() {
            sniplets.push(text);
        }
    }

    let input = input.trim();
    // Nothing to analyze.
    if input.is_empty() {
        return Ok(vec![]);
    }

    let settings = SETTINGS.read_recursive();
    let lang_filter = &settings.get_lang_filter;

    // Return early when the filter is switched off or misconfigured.
    match &lang_filter.mode {
        Mode::Disabled => return Ok(vec![]),
        Mode::Error(e) => return Err(e.clone()),
        Mode::Multilingual | Mode::Monolingual => {}
    }

    // Build the detector, restricted to the configured candidates if any.
    let detector: LanguageDetector = if lang_filter.language_candidates.is_empty() {
        log::trace!(
            "Execute template filter `get_lang` \
             with all available languages",
        );
        LanguageDetectorBuilder::from_all_languages()
            .with_minimum_relative_distance(lang_filter.relative_distance_min)
            .build()
    } else {
        log::trace!(
            "Execute template filter `get_lang` \
             with languages candidates: {:?}",
            &lang_filter.language_candidates,
        );
        LanguageDetectorBuilder::from_iso_codes_639_1(&lang_filter.language_candidates)
            .with_minimum_relative_distance(lang_filter.relative_distance_min)
            .build()
    };

    // Split the input into text snippets, dropping everything in a link
    // that is not link text, title or image alt text.
    let mut sniplets: Vec<Cow<str>> = Vec::new();
    // Text following the most recently visited link.
    let mut remnant = "";
    for ((skipped, _, r), link) in MarkupLink::new(input, false) {
        sniplets.push(Cow::from(skipped));
        remnant = r;
        match link {
            Link::Text2Dest(text, _, title) => {
                push_nonempty(&mut sniplets, text);
                push_nonempty(&mut sniplets, title);
            }
            Link::Text2Label(text, _) | Link::TextLabel2Dest(text, _, _) => {
                push_nonempty(&mut sniplets, text);
            }
            Link::Image(alt_text, _) => {
                push_nonempty(&mut sniplets, alt_text);
            }
            Link::Image2Dest(text1, img_alt, _, text2, _, title) => {
                push_nonempty(&mut sniplets, text1);
                push_nonempty(&mut sniplets, img_alt);
                push_nonempty(&mut sniplets, text2);
                push_nonempty(&mut sniplets, title);
            }
            _ => {}
        }
    }
    if !remnant.is_empty() {
        sniplets.push(Cow::from(remnant));
    }
    // No link was found: analyze the whole input as one snippet.
    if sniplets.is_empty() {
        sniplets.push(Cow::from(input));
    }
    let texts = sniplets.as_slice();

    let detected_languages: Vec<String> = match &lang_filter.mode {
        Mode::Multilingual => {
            let words_total_percentage_min = lang_filter.words_total_percentage_min;
            let words_total: usize = texts
                .iter()
                .map(|text| text.split_whitespace().count())
                .sum();
            // For short inputs the threshold is lowered to a third of all
            // words.
            let words_min = lang_filter.consecutive_words_min.min(words_total / 3);
            log::trace!(
                "Language snippets with less than {} words will be ignored.",
                words_min
            );

            // Sum up the word counts of all accepted spans per language.
            let words_distribution: HashMap<String, usize> = detector
                .detect_multiple_languages_in_parallel_of(texts)
                .into_iter()
                .flatten_with_index()
                .filter(|(i, l)| {
                    let allow_through = l.word_count() >= words_min;
                    log::trace!(
                        "Language(s) detected in [{}]: {}, {}, {}: {:?}",
                        i,
                        l.language().iso_code_639_1(),
                        l.word_count(),
                        allow_through,
                        // Checked slicing: a log statement must never panic.
                        texts
                            .get(*i)
                            .and_then(|text| text.get(l.start_index()..l.end_index()))
                            .unwrap_or("")
                            .chars()
                            .take(60)
                            .collect::<String>()
                    );
                    allow_through
                })
                .map(|(_, l)| (l.language().iso_code_639_1().to_string(), l.word_count()))
                .into_grouping_map_by(|n| n.0.clone())
                .aggregate(|acc, _key, val| Some(acc.unwrap_or(0) + val.1));

            // Most frequent language first. The secondary key makes the
            // order of equally frequent languages independent of the
            // `HashMap` iteration order.
            let words_distribution: Vec<(String, usize)> = words_distribution
                .into_iter()
                .sorted_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)))
                .collect();
            log::debug!(
                "Languages distribution per word count:\n {:?}",
                words_distribution
            );

            // Drop languages whose share of the counted words is too small.
            let words_distribution_total: usize = words_distribution.iter().map(|l| l.1).sum();
            let words_total_min: usize =
                words_distribution_total * words_total_percentage_min / 100;
            words_distribution
                .into_iter()
                .filter(|(l, wc)| {
                    if *wc >= words_total_min {
                        true
                    } else {
                        // Only reached when `words_distribution_total > 0`,
                        // because otherwise `words_total_min` is `0`.
                        let words_percentage = *wc * 100 / words_distribution_total;
                        log::info!(
                            "Language `{}` rejected: not enough words in total ({}%<{}%)",
                            l,
                            words_percentage,
                            words_total_percentage_min
                        );
                        false
                    }
                })
                .map(|(l, _)| l)
                .collect::<Vec<String>>()
        }

        Mode::Monolingual => detector
            .detect_languages_in_parallel_of(texts)
            .into_iter()
            .flatten()
            .map(|l| l.iso_code_639_1().to_string())
            .inspect(|l| log::debug!("Language: '{}' in input detected.", l))
            .collect(),

        // Both modes already returned early above.
        Mode::Disabled | Mode::Error(_) => unreachable!(),
    };

    Ok(detected_languages)
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::{GetLang, Mode};
    use crate::settings::Settings;
    use parking_lot::RwLockWriteGuard;

    /// Builds the filter configuration used by the tests below: German,
    /// English and French as candidates, operating in the given `mode`.
    fn filter_config(mode: Mode) -> GetLang {
        GetLang {
            mode,
            language_candidates: vec![IsoCode639_1::DE, IsoCode639_1::EN, IsoCode639_1::FR],
            relative_distance_min: 0.2,
            consecutive_words_min: 5,
            words_total_percentage_min: 10,
        }
    }

    /// Exercises `get_lang()` in `Mode::Multilingual`.
    #[test]
    fn test_get_lang() {
        // Install the test configuration, then keep holding a read lock
        // until the end of the test.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Multilingual);
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        // Single language inputs.
        for (text, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let detected = get_lang(text).unwrap();
            assert_eq!(expected, detected[0]);
        }

        // Inputs without any natural language.
        for text in ["1917039480 50198%-328470", " \t\n "] {
            let detected = get_lang(text).unwrap();
            assert!(detected.is_empty());
        }

        // Three languages, ordered by their word counts.
        let text = "Parlez-vous français? \
                    Ich spreche Französisch nur ein bisschen. \
                    A little bit is better than nothing. \
                    Noch mehr Deutsch. \
                    Bien-sûr, je parle un peu. Qu'est-ce que tu veux?";
        let detected = get_lang(text).unwrap();
        assert_eq!(detected, ["fr", "de", "en"]);

        // Here only German and English are reported.
        let text = "Parlez-vous français? \
                    Ich spreche Französisch nur ein bisschen. \
                    A little bit is better than nothing.";
        let detected = get_lang(text).unwrap();
        assert_eq!(detected, ["de", "en"]);

        drop(read_guard);
    }

    /// Exercises `get_lang()` in `Mode::Monolingual`.
    #[test]
    fn test_get_lang2() {
        // Install the test configuration, then keep holding a read lock
        // until the end of the test.
        let mut write_guard = SETTINGS.write();
        *write_guard = Settings::default();
        write_guard.get_lang_filter = filter_config(Mode::Monolingual);
        let read_guard = RwLockWriteGuard::downgrade(write_guard);

        // Single language inputs.
        for (text, expected) in [
            ("Das große Haus", "de"),
            ("Il est venu trop tard", "fr"),
            ("How to set up a roof rack", "en"),
        ] {
            let detected = get_lang(text).unwrap();
            assert_eq!(expected, detected[0]);
        }

        // Inputs without any natural language.
        for text in ["1917039480 50198%-328470", " \t\n "] {
            let detected = get_lang(text).unwrap();
            assert!(detected.is_empty());
        }

        // Mixed input: exactly one language is reported.
        let text = "Parlez-vous français? \
                    Ich spreche Französisch nur ein bisschen. \
                    A little bit is better than nothing.";
        let detected = get_lang(text).unwrap();
        assert_eq!(detected.len(), 1);
        assert_eq!("de", detected[0]);

        drop(read_guard);
    }
}