use thiserror::Error;
use whatlang::{detect, Lang};
use super::LanguageTag;
/// Errors reported by the language detection helpers in this module.
#[derive(Error, Debug)]
pub enum LanguageDetectionError {
/// The detector produced no result for the input, or there was no text to sample.
#[error("Insufficient text for language detection")]
InsufficientText,
/// A language was detected, but it has no entry in this module's language-code table.
#[error("Unsupported language detected: {0:?}")]
UnsupportedLanguage(Lang),
/// The detection confidence was below the caller-supplied minimum.
///
/// `detect_language` fills both fields with the raw values multiplied by 100, so the
/// message renders them as percentages.
#[error("Low confidence detection: {confidence:.2}% (minimum: {minimum:.2}%)")]
LowConfidence {
/// Confidence of the rejected detection, as a percentage.
confidence: f64,
/// Minimum confidence that was required, as a percentage.
minimum: f64,
},
}
pub fn detect_language(
text: &str,
min_confidence: f64,
) -> Result<LanguageTag, LanguageDetectionError> {
let info = detect(text).ok_or(LanguageDetectionError::InsufficientText)?;
let confidence = info.confidence();
if confidence < min_confidence {
return Err(LanguageDetectionError::LowConfidence {
confidence: confidence * 100.0,
minimum: min_confidence * 100.0,
});
}
let language_code = match info.lang() {
Lang::Eng => "en",
Lang::Spa => "es",
Lang::Deu => "de",
Lang::Fra => "fr",
Lang::Por => "pt",
Lang::Ita => "it",
Lang::Nld => "nl",
Lang::Rus => "ru",
Lang::Pol => "pl",
Lang::Ukr => "uk",
Lang::Ces => "cs",
Lang::Slk => "sk",
Lang::Slv => "sl",
Lang::Hrv => "hr",
Lang::Srp => "sr",
Lang::Mkd => "mk",
Lang::Bul => "bg",
Lang::Ron => "ro",
Lang::Hun => "hu",
Lang::Ell => "el",
Lang::Tur => "tr",
Lang::Fin => "fi",
Lang::Swe => "sv",
Lang::Dan => "da",
Lang::Nob => "no", Lang::Lit => "lt",
Lang::Lav => "lv",
Lang::Est => "et",
Lang::Cat => "ca",
Lang::Afr => "af",
Lang::Lat => "la",
Lang::Epo => "eo",
Lang::Cmn => "zh", Lang::Jpn => "ja",
Lang::Kor => "ko",
Lang::Tha => "th",
Lang::Vie => "vi",
Lang::Ind => "id",
Lang::Tgl => "tl",
Lang::Jav => "jv",
Lang::Mya => "my",
Lang::Khm => "km",
Lang::Hin => "hi",
Lang::Ben => "bn",
Lang::Tam => "ta",
Lang::Tel => "te",
Lang::Mar => "mr",
Lang::Urd => "ur",
Lang::Guj => "gu",
Lang::Kan => "kn",
Lang::Mal => "ml",
Lang::Pan => "pa",
Lang::Sin => "si",
Lang::Ara => "ar",
Lang::Heb => "he",
Lang::Aze => "az",
Lang::Uzb => "uz",
Lang::Amh => "am",
other => return Err(LanguageDetectionError::UnsupportedLanguage(other)),
};
Ok(LanguageTag::new(language_code))
}
/// Detects the language of a text supplied as a sequence of sentences.
///
/// At most `max_samples` sentences are taken from `sentences` and concatenated into one sample,
/// with a single space inserted before a sentence whenever the sample is already non-empty.
/// The sample is then passed to [`detect_language`] together with `min_confidence`.
///
/// # Errors
///
/// Returns [`LanguageDetectionError::InsufficientText`] when the concatenated sample is empty
/// (no sentences, `max_samples == 0`, or only empty sentences). Otherwise returns whatever
/// [`detect_language`] reports for the sample.
pub fn detect_from_sentences<'a, I>(
    sentences: I,
    max_samples: usize,
    min_confidence: f64,
) -> Result<LanguageTag, LanguageDetectionError>
where
    I: Iterator<Item = &'a str>,
{
    let sample = sentences
        .take(max_samples)
        .fold(String::new(), |mut joined, sentence| {
            // A separator is only needed once something has already been written.
            if !joined.is_empty() {
                joined.push(' ');
            }
            joined.push_str(sentence);
            joined
        });

    if sample.is_empty() {
        Err(LanguageDetectionError::InsufficientText)
    } else {
        detect_language(&sample, min_confidence)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two English sentences; joined with a single space they form the text used by
    /// `test_detect_english`.
    const ENGLISH_SENTENCES: [&str; 2] = [
        "The quick brown fox jumps over the lazy dog.",
        "This is a sample sentence in English.",
    ];

    /// Runs detection at the 0.8 threshold shared by the per-language tests and checks the
    /// resulting language code, reporting the detection error if there is one.
    fn assert_detects(text: &str, expected: &str) {
        let tag = detect_language(text, 0.8)
            .unwrap_or_else(|err| panic!("expected `{}`, got error: {}", expected, err));
        assert_eq!(tag.language(), expected);
    }

    #[test]
    fn test_detect_english() {
        assert_detects(
            "The quick brown fox jumps over the lazy dog. This is a sample sentence in English.",
            "en",
        );
    }

    #[test]
    fn test_detect_german() {
        assert_detects(
            "Der schnelle braune Fuchs springt über den faulen Hund. Dies ist ein Beispielsatz auf Deutsch.",
            "de",
        );
    }

    #[test]
    fn test_detect_spanish() {
        assert_detects(
            "El rápido zorro marrón salta sobre el perro perezoso. Esta es una oración de ejemplo en español.",
            "es",
        );
    }

    #[test]
    fn test_insufficient_text() {
        // Empty input gives the detector nothing to work with, so the outcome is deterministic.
        assert!(matches!(
            detect_language("", 0.8),
            Err(LanguageDetectionError::InsufficientText)
        ));

        // The outcome for a very short input depends on the detector; only require that the
        // call completes without panicking.
        let _ = detect_language("hi", 0.8);
    }

    #[test]
    fn test_low_confidence_is_reported_as_percentages() {
        // The detector documents confidence as a value between 0 and 1, so a minimum of 2.0
        // can never be satisfied.
        let text = ENGLISH_SENTENCES.join(" ");
        match detect_language(&text, 2.0) {
            Err(LanguageDetectionError::LowConfidence {
                confidence,
                minimum,
            }) => {
                assert_eq!(minimum, 200.0);
                assert!(confidence < minimum);
            }
            Err(other) => panic!("expected LowConfidence, got error: {:?}", other),
            Ok(_) => panic!("expected LowConfidence, got a successful detection"),
        }
    }

    #[test]
    fn test_detect_from_sentences_without_text() {
        // No sentences at all.
        assert!(matches!(
            detect_from_sentences(std::iter::empty(), 5, 0.8),
            Err(LanguageDetectionError::InsufficientText)
        ));

        // Sentences are available, but the sample limit excludes all of them.
        assert!(matches!(
            detect_from_sentences(ENGLISH_SENTENCES.iter().copied(), 0, 0.8),
            Err(LanguageDetectionError::InsufficientText)
        ));

        // Only empty sentences: the joined sample stays empty.
        assert!(matches!(
            detect_from_sentences(["", ""].iter().copied(), 5, 0.8),
            Err(LanguageDetectionError::InsufficientText)
        ));
    }

    #[test]
    fn test_detect_from_sentences_english() {
        let tag = detect_from_sentences(ENGLISH_SENTENCES.iter().copied(), 5, 0.8)
            .unwrap_or_else(|err| panic!("expected `en`, got error: {}", err));
        assert_eq!(tag.language(), "en");
    }
}