use std::collections::HashSet;
use crate::detector::LanguageDetector;
use crate::isocode::{IsoCode639_1, IsoCode639_3};
use crate::language::Language;
pub(crate) const MISSING_LANGUAGE_MESSAGE: &str =
"LanguageDetector needs at least 1 language to choose from";
pub(crate) const MINIMUM_RELATIVE_DISTANCE_MESSAGE: &str =
"Minimum relative distance must lie in between 0.0 and 0.99";
#[derive(Clone)]
#[cfg_attr(feature = "python", pyo3::prelude::pyclass(module = "lingua"))]
pub struct LanguageDetectorBuilder {
languages: HashSet<Language>,
minimum_relative_distance: f64,
is_every_language_model_preloaded: bool,
is_low_accuracy_mode_enabled: bool,
}
impl LanguageDetectorBuilder {
pub fn from_all_languages() -> Self {
Self::from(Language::all())
}
pub fn from_all_spoken_languages() -> Self {
Self::from(Language::all_spoken_ones())
}
pub fn from_all_languages_with_arabic_script() -> Self {
Self::from(Language::all_with_arabic_script())
}
pub fn from_all_languages_with_cyrillic_script() -> Self {
Self::from(Language::all_with_cyrillic_script())
}
pub fn from_all_languages_with_devanagari_script() -> Self {
Self::from(Language::all_with_devanagari_script())
}
pub fn from_all_languages_with_latin_script() -> Self {
Self::from(Language::all_with_latin_script())
}
pub fn from_all_languages_without(languages: &[Language]) -> Self {
let mut languages_to_load = Language::all();
languages_to_load.retain(|it| !languages.contains(it));
if languages_to_load.is_empty() {
panic!("{}", MISSING_LANGUAGE_MESSAGE);
}
Self::from(languages_to_load)
}
pub fn from_languages(languages: &[Language]) -> Self {
if languages.is_empty() {
panic!("{}", MISSING_LANGUAGE_MESSAGE);
}
Self::from(languages.iter().cloned().collect())
}
pub fn from_iso_codes_639_1(iso_codes: &[IsoCode639_1]) -> Self {
if iso_codes.is_empty() {
panic!("{}", MISSING_LANGUAGE_MESSAGE);
}
let languages = iso_codes
.iter()
.map(Language::from_iso_code_639_1)
.collect::<HashSet<_>>();
Self::from(languages)
}
pub fn from_iso_codes_639_3(iso_codes: &[IsoCode639_3]) -> Self {
if iso_codes.is_empty() {
panic!("{}", MISSING_LANGUAGE_MESSAGE);
}
let languages = iso_codes
.iter()
.map(Language::from_iso_code_639_3)
.collect::<HashSet<_>>();
Self::from(languages)
}
pub fn with_minimum_relative_distance(&mut self, distance: f64) -> &mut Self {
if !(0.0..=0.99).contains(&distance) {
panic!("{}", MINIMUM_RELATIVE_DISTANCE_MESSAGE);
}
self.minimum_relative_distance = distance;
self
}
pub fn with_preloaded_language_models(&mut self) -> &mut Self {
self.is_every_language_model_preloaded = true;
self
}
pub fn with_low_accuracy_mode(&mut self) -> &mut Self {
self.is_low_accuracy_mode_enabled = true;
self
}
pub fn build(&mut self) -> LanguageDetector {
LanguageDetector::from(
self.languages.clone(),
self.minimum_relative_distance,
self.is_every_language_model_preloaded,
self.is_low_accuracy_mode_enabled,
)
}
fn from(languages: HashSet<Language>) -> Self {
Self {
languages,
minimum_relative_distance: 0.0,
is_every_language_model_preloaded: false,
is_low_accuracy_mode_enabled: false,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn assert_detector_can_be_built_from_all_languages() {
let mut builder = LanguageDetectorBuilder::from_all_languages();
assert_eq!(builder.languages, Language::all());
assert_eq!(builder.minimum_relative_distance, 0.0);
builder.with_minimum_relative_distance(0.2);
assert_eq!(builder.minimum_relative_distance, 0.2);
}
#[test]
fn assert_detector_can_be_built_from_spoken_languages() {
let mut builder = LanguageDetectorBuilder::from_all_spoken_languages();
assert_eq!(builder.languages, Language::all_spoken_ones());
assert_eq!(builder.minimum_relative_distance, 0.0);
builder.with_minimum_relative_distance(0.2);
assert_eq!(builder.minimum_relative_distance, 0.2);
}
#[test]
fn assert_detector_can_be_built_from_languages_with_arabic_script() {
let builder = LanguageDetectorBuilder::from_all_languages_with_arabic_script();
assert_eq!(builder.languages, Language::all_with_arabic_script());
}
#[test]
fn assert_detector_can_be_built_from_languages_with_cyrillic_script() {
let builder = LanguageDetectorBuilder::from_all_languages_with_cyrillic_script();
assert_eq!(builder.languages, Language::all_with_cyrillic_script());
}
#[test]
fn assert_detector_can_be_built_from_languages_with_devanagari_script() {
let builder = LanguageDetectorBuilder::from_all_languages_with_devanagari_script();
assert_eq!(builder.languages, Language::all_with_devanagari_script());
}
#[test]
fn assert_detector_can_be_built_from_languages_with_latin_script() {
let builder = LanguageDetectorBuilder::from_all_languages_with_latin_script();
assert_eq!(builder.languages, Language::all_with_latin_script());
}
#[test]
fn assert_detector_can_be_built_from_blacklist() {
let builder = LanguageDetectorBuilder::from_all_languages_without(&[
Language::Turkish,
Language::Romanian,
]);
let expected_languages = Language::all()
.difference(&hashset!(Language::Turkish, Language::Romanian))
.cloned()
.collect::<HashSet<Language>>();
assert_eq!(builder.languages, expected_languages);
}
#[test]
#[should_panic(expected = "LanguageDetector needs at least 1 language to choose from")]
fn assert_detector_cannot_be_built_from_too_long_blacklist() {
let languages = Language::all().into_iter().collect::<Vec<_>>();
LanguageDetectorBuilder::from_all_languages_without(&languages);
}
#[test]
fn assert_detector_can_be_built_from_whitelist() {
let language_sets = vec![
vec![Language::German],
vec![Language::German, Language::English],
];
for languages in language_sets {
let builder = LanguageDetectorBuilder::from_languages(&languages);
assert_eq!(
builder.languages,
languages.into_iter().collect::<HashSet<_>>()
);
}
}
#[test]
#[should_panic(expected = "LanguageDetector needs at least 1 language to choose from")]
fn assert_detector_cannot_be_built_from_too_short_whitelist() {
LanguageDetectorBuilder::from_languages(&[]);
}
#[test]
fn assert_detector_can_be_built_from_iso_639_1_codes() {
let builder = LanguageDetectorBuilder::from_iso_codes_639_1(&[IsoCode639_1::DE]);
assert_eq!(builder.languages, hashset!(Language::German));
let builder =
LanguageDetectorBuilder::from_iso_codes_639_1(&[IsoCode639_1::DE, IsoCode639_1::ZU]);
assert_eq!(
builder.languages,
hashset!(Language::German, Language::Zulu)
);
}
#[test]
#[should_panic(expected = "LanguageDetector needs at least 1 language to choose from")]
fn assert_detector_cannot_be_built_from_too_few_iso_639_1_codes() {
LanguageDetectorBuilder::from_iso_codes_639_1(&[]);
}
#[test]
fn assert_detector_can_be_built_from_iso_639_3_codes() {
let builder = LanguageDetectorBuilder::from_iso_codes_639_3(&[IsoCode639_3::DEU]);
assert_eq!(builder.languages, hashset!(Language::German));
let builder =
LanguageDetectorBuilder::from_iso_codes_639_3(&[IsoCode639_3::DEU, IsoCode639_3::ZUL]);
assert_eq!(
builder.languages,
hashset!(Language::German, Language::Zulu)
);
}
#[test]
#[should_panic(expected = "LanguageDetector needs at least 1 language to choose from")]
fn assert_detector_cannot_be_built_from_too_few_iso_639_3_codes() {
LanguageDetectorBuilder::from_iso_codes_639_3(&[]);
}
#[test]
#[should_panic(expected = "Minimum relative distance must lie in between 0.0 and 0.99")]
fn assert_detector_cannot_be_built_from_too_small_minimum_relative_distance() {
LanguageDetectorBuilder::from_all_languages().with_minimum_relative_distance(-2.3);
}
#[test]
#[should_panic(expected = "Minimum relative distance must lie in between 0.0 and 0.99")]
fn assert_detector_cannot_be_built_from_too_large_minimum_relative_distance() {
LanguageDetectorBuilder::from_all_languages().with_minimum_relative_distance(1.7);
}
}