kreuzberg 4.6.3

High-performance document intelligence library for Rust. Extract text, metadata, and structured data from PDFs, Office documents, images, and 88+ formats with async/sync APIs.
Documentation
use super::error::OcrError;
use super::utils::MINIMAL_SUPPORTED_TESSERACT_VERSION;
use std::collections::HashSet;
use std::sync::LazyLock;

pub static TESSERACT_SUPPORTED_LANGUAGE_CODES: LazyLock<HashSet<&'static str>> = LazyLock::new(|| {
    let mut set = HashSet::new();
    set.insert("afr");
    set.insert("amh");
    set.insert("ara");
    set.insert("asm");
    set.insert("aze");
    set.insert("aze_cyrl");
    set.insert("bel");
    set.insert("ben");
    set.insert("bod");
    set.insert("bos");
    set.insert("bre");
    set.insert("bul");
    set.insert("cat");
    set.insert("ceb");
    set.insert("ces");
    set.insert("chi_sim");
    set.insert("chi_tra");
    set.insert("chr");
    set.insert("cos");
    set.insert("cym");
    set.insert("dan");
    set.insert("deu");
    set.insert("div");
    set.insert("dzo");
    set.insert("ell");
    set.insert("eng");
    set.insert("enm");
    set.insert("epo");
    set.insert("equ");
    set.insert("est");
    set.insert("eus");
    set.insert("fao");
    set.insert("fas");
    set.insert("fil");
    set.insert("fin");
    set.insert("fra");
    set.insert("frk");
    set.insert("frm");
    set.insert("fry");
    set.insert("gla");
    set.insert("gle");
    set.insert("glg");
    set.insert("grc");
    set.insert("guj");
    set.insert("hat");
    set.insert("heb");
    set.insert("hin");
    set.insert("hrv");
    set.insert("hun");
    set.insert("hye");
    set.insert("iku");
    set.insert("ind");
    set.insert("isl");
    set.insert("ita");
    set.insert("ita_old");
    set.insert("jav");
    set.insert("jpn");
    set.insert("kan");
    set.insert("kat");
    set.insert("kat_old");
    set.insert("kaz");
    set.insert("khm");
    set.insert("kir");
    set.insert("kmr");
    set.insert("kor");
    set.insert("lao");
    set.insert("lat");
    set.insert("lav");
    set.insert("lit");
    set.insert("ltz");
    set.insert("mal");
    set.insert("mar");
    set.insert("mkd");
    set.insert("mlt");
    set.insert("mon");
    set.insert("mri");
    set.insert("msa");
    set.insert("mya");
    set.insert("nep");
    set.insert("nld");
    set.insert("nor");
    set.insert("oci");
    set.insert("ori");
    set.insert("osd");
    set.insert("pan");
    set.insert("pol");
    set.insert("por");
    set.insert("pus");
    set.insert("que");
    set.insert("ron");
    set.insert("rus");
    set.insert("san");
    set.insert("sin");
    set.insert("slk");
    set.insert("slv");
    set.insert("snd");
    set.insert("spa");
    set.insert("spa_old");
    set.insert("sqi");
    set.insert("srp");
    set.insert("srp_latn");
    set.insert("sun");
    set.insert("swa");
    set.insert("swe");
    set.insert("syr");
    set.insert("tam");
    set.insert("tat");
    set.insert("tel");
    set.insert("tgk");
    set.insert("tha");
    set.insert("tir");
    set.insert("ton");
    set.insert("tur");
    set.insert("uig");
    set.insert("ukr");
    set.insert("urd");
    set.insert("uzb");
    set.insert("uzb_cyrl");
    set.insert("vie");
    set.insert("yid");
    set.insert("yor");
    set
});

pub fn validate_language_code(lang_code: &str) -> Result<(), OcrError> {
    // Accept "all" and "*" as special values to auto-detect installed languages
    let lower = lang_code.to_ascii_lowercase();
    if lower == "all" || lower == "*" {
        return Ok(());
    }

    for code in lang_code.split('+') {
        if !TESSERACT_SUPPORTED_LANGUAGE_CODES.contains(code) {
            return Err(OcrError::InvalidLanguageCode(format!(
                "Language code '{}' is not supported by Tesseract",
                code
            )));
        }
    }
    Ok(())
}

pub fn validate_tesseract_version(version: u32) -> Result<(), OcrError> {
    if version < MINIMAL_SUPPORTED_TESSERACT_VERSION {
        return Err(OcrError::UnsupportedVersion(format!(
            "Tesseract version {} is not supported. Minimum required version is {}",
            version, MINIMAL_SUPPORTED_TESSERACT_VERSION
        )));
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_validate_language_code_all_keyword() {
        assert!(validate_language_code("all").is_ok());
        assert!(validate_language_code("*").is_ok());
        assert!(validate_language_code("ALL").is_ok());
        assert!(validate_language_code("All").is_ok());
    }

    #[test]
    fn test_validate_language_code_valid() {
        assert!(validate_language_code("eng").is_ok());
        assert!(validate_language_code("fra").is_ok());
        assert!(validate_language_code("deu").is_ok());
        assert!(validate_language_code("chi_sim").is_ok());
    }

    #[test]
    fn test_validate_language_code_multiple() {
        assert!(validate_language_code("eng+fra").is_ok());
        assert!(validate_language_code("eng+fra+deu").is_ok());
    }

    #[test]
    fn test_validate_language_code_invalid() {
        let result = validate_language_code("invalid_lang");
        assert!(result.is_err());
        assert!(matches!(result.unwrap_err(), OcrError::InvalidLanguageCode(_)));
    }

    #[test]
    fn test_validate_language_code_mixed_valid_invalid() {
        let result = validate_language_code("eng+invalid");
        assert!(result.is_err());
    }

    #[test]
    fn test_validate_tesseract_version_valid() {
        assert!(validate_tesseract_version(5).is_ok());
        assert!(validate_tesseract_version(6).is_ok());
    }

    #[test]
    fn test_validate_tesseract_version_invalid() {
        let result = validate_tesseract_version(4);
        assert!(result.is_err());
        assert!(matches!(result.unwrap_err(), OcrError::UnsupportedVersion(_)));
    }

    #[test]
    fn test_language_codes_exist() {
        assert!(TESSERACT_SUPPORTED_LANGUAGE_CODES.contains("eng"));
        assert!(TESSERACT_SUPPORTED_LANGUAGE_CODES.contains("fra"));
        assert!(TESSERACT_SUPPORTED_LANGUAGE_CODES.contains("chi_sim"));
        assert!(!TESSERACT_SUPPORTED_LANGUAGE_CODES.contains("fake"));
    }
}