use crate::ocr::error::OcrError;
use crate::ocr::types::TesseractConfig;
use kreuzberg_tesseract::TesseractAPI;
/// Builds a fingerprint string for the settings in `config`.
///
/// The fields listed in the body are fed, in a fixed order, into a single
/// `AHasher`; the resulting 64-bit value is returned as 16 zero-padded
/// lowercase hexadecimal digits. Fields that are not fed in (for example
/// `use_cache`) do not influence the result.
///
/// NOTE(review): the hasher comes from `AHasher::default()`. Whether its keys
/// are identical across processes depends on how the `ahash` crate is built;
/// confirm before relying on these fingerprints outside the current process.
pub(super) fn hash_config(config: &TesseractConfig) -> String {
    use ahash::AHasher;
    use std::hash::{Hash, Hasher};

    let mut state = AHasher::default();

    // Feeds every listed expression into `state`, strictly left to right.
    // The order is part of the fingerprint, so do not reorder the call sites.
    macro_rules! feed {
        ($($field:expr),+ $(,)?) => {
            $( Hash::hash(&$field, &mut state); )+
        };
    }

    // General recognition settings.
    feed!(config.language, config.psm, config.output_format);

    // Table detection settings. The two values hashed via `to_bits()` go in
    // as their raw bit patterns.
    feed!(
        config.enable_table_detection,
        config.table_min_confidence.to_bits(),
        config.table_column_threshold,
        config.table_row_threshold_ratio.to_bits(),
    );

    // Settings that are forwarded to the engine as named variables.
    feed!(
        config.classify_use_pre_adapted_templates,
        config.language_model_ngram_on,
        config.tessedit_dont_blkrej_good_wds,
        config.tessedit_dont_rowrej_good_wds,
        config.tessedit_enable_dict_correction,
        config.tessedit_char_whitelist,
        config.tessedit_use_primary_params_model,
        config.textord_space_size_is_variable,
        config.thresholding_method,
    );

    let digest = state.finish();
    format!("{:016x}", digest)
}
/// Pushes the engine variables held in `config` onto `api`.
///
/// Variables are set one at a time in a fixed order, each value being the
/// `to_string()` form of the matching `config` field. The character whitelist
/// is only sent when it is non-empty.
///
/// NOTE(review): because an empty whitelist is never sent, a whitelist set on
/// `api` by an earlier call would presumably stay in effect — confirm whether
/// callers ever reuse an API instance across configurations.
///
/// # Errors
///
/// Returns [`OcrError::InvalidConfiguration`] naming the first variable that
/// `api` refused; variables after it are not attempted.
pub(super) fn apply_tesseract_variables(api: &TesseractAPI, config: &TesseractConfig) -> Result<(), OcrError> {
    // Single place where a failed assignment is turned into an `OcrError`.
    let assign = |name: &str, value: &str| match api.set_variable(name, value) {
        Ok(_) => Ok(()),
        Err(err) => Err(OcrError::InvalidConfiguration(format!(
            "Failed to set {}: {}",
            name, err
        ))),
    };

    assign(
        "classify_use_pre_adapted_templates",
        &config.classify_use_pre_adapted_templates.to_string(),
    )?;
    assign("language_model_ngram_on", &config.language_model_ngram_on.to_string())?;
    assign(
        "tessedit_dont_blkrej_good_wds",
        &config.tessedit_dont_blkrej_good_wds.to_string(),
    )?;
    assign(
        "tessedit_dont_rowrej_good_wds",
        &config.tessedit_dont_rowrej_good_wds.to_string(),
    )?;
    assign(
        "tessedit_enable_dict_correction",
        &config.tessedit_enable_dict_correction.to_string(),
    )?;

    // An empty whitelist is skipped rather than sent as an empty string.
    let whitelist_is_set = !config.tessedit_char_whitelist.is_empty();
    if whitelist_is_set {
        assign("tessedit_char_whitelist", &config.tessedit_char_whitelist)?;
    }

    assign(
        "tessedit_use_primary_params_model",
        &config.tessedit_use_primary_params_model.to_string(),
    )?;
    assign(
        "textord_space_size_is_variable",
        &config.textord_space_size_is_variable.to_string(),
    )?;
    assign("thresholding_method", &config.thresholding_method.to_string())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline configuration shared by every test: plain-text output, table
    /// detection off, caching off, and every other field at its default.
    fn base_config() -> TesseractConfig {
        TesseractConfig {
            output_format: "text".to_string(),
            enable_table_detection: false,
            use_cache: false,
            ..TesseractConfig::default()
        }
    }

    /// Returns the baseline configuration after `tweak` has modified it.
    fn config_with(tweak: impl FnOnce(&mut TesseractConfig)) -> TesseractConfig {
        let mut config = base_config();
        tweak(&mut config);
        config
    }

    /// Asserts that `left` and `right` produce different fingerprints.
    #[track_caller]
    fn assert_distinct_hashes(left: &TesseractConfig, right: &TesseractConfig) {
        assert_ne!(hash_config(left), hash_config(right));
    }

    /// Hashing the same configuration twice yields the same 16-character string.
    #[test]
    fn test_hash_config_deterministic() {
        let config = base_config();
        let digests = [hash_config(&config), hash_config(&config)];
        assert_eq!(digests[0], digests[1]);
        assert_eq!(digests[0].len(), 16);
    }

    /// Changing only the language changes the fingerprint.
    #[test]
    fn test_hash_config_different_languages() {
        let english = config_with(|c| c.language = "eng".to_string());
        let french = config_with(|c| c.language = "fra".to_string());
        assert_distinct_hashes(&english, &french);
    }

    /// Changing only the page segmentation mode changes the fingerprint.
    #[test]
    fn test_hash_config_different_psm() {
        let psm_three = config_with(|c| c.psm = 3);
        let psm_six = config_with(|c| c.psm = 6);
        assert_distinct_hashes(&psm_three, &psm_six);
    }

    /// Changing only the output format changes the fingerprint.
    #[test]
    fn test_hash_config_different_output_format() {
        let as_text = config_with(|c| c.output_format = "text".to_string());
        let as_markdown = config_with(|c| c.output_format = "markdown".to_string());
        assert_distinct_hashes(&as_text, &as_markdown);
    }

    /// Toggling table detection changes the fingerprint.
    #[test]
    fn test_hash_config_table_detection_flag() {
        let tables_off = config_with(|c| c.enable_table_detection = false);
        let tables_on = config_with(|c| c.enable_table_detection = true);
        assert_distinct_hashes(&tables_off, &tables_on);
    }

    /// An empty whitelist and a digits-only whitelist hash differently.
    #[test]
    fn test_hash_config_whitelist() {
        let unrestricted = config_with(|c| c.tessedit_char_whitelist = "".to_string());
        let digits_only = config_with(|c| c.tessedit_char_whitelist = "0123456789".to_string());
        assert_distinct_hashes(&unrestricted, &digits_only);
    }
}