use std::io::Write;
use std::path::Path;
use std::process::{Command, Stdio};
use super::OcrResult;
use crate::DocumentError;
/// Upper bound, in bytes, on how much of the child's stderr `truncate_stderr`
/// keeps before appending its truncation marker.
const STDERR_TRUNCATE_BYTES: usize = 4096;
/// OCR adapter that runs the Tesseract command-line program as a child
/// process and parses its TSV output.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct TesseractOcr {
/// Language value passed to the executable after the `-l` flag
/// (`"eng"` when built with `TesseractOcr::new`).
pub lang: String,
/// Explicit path of the executable to run. When `None`, the command name
/// `tesseract` is used instead.
pub binary: Option<std::path::PathBuf>,
}
impl TesseractOcr {
    /// Creates an adapter that uses the `eng` language and the default
    /// `tesseract` command name.
    pub fn new() -> Self {
        Self::with_lang("eng")
    }

    /// Creates an adapter for the given language value (passed verbatim to
    /// the executable after `-l`) and the default `tesseract` command name.
    pub fn with_lang(lang: impl Into<String>) -> Self {
        Self {
            lang: lang.into(),
            binary: None,
        }
    }

    /// Runs OCR on the image stored at `path`.
    ///
    /// The executable is invoked as `<binary> <path> stdout -l <lang> tsv`
    /// with stdin closed and stdout/stderr captured; the captured stdout is
    /// decoded lossily as UTF-8 and handed to `parse_tsv`.
    ///
    /// # Errors
    ///
    /// * `DocumentError::TesseractNotFound` (carrying an install hint) when
    ///   spawning fails with `ErrorKind::NotFound`.
    /// * `DocumentError::Io` for any other spawn/collection failure.
    /// * `DocumentError::TesseractFailed` when the process exits
    ///   unsuccessfully; `status` is the exit code (`-1` when the platform
    ///   reports none) and `stderr` is the truncated error output.
    pub fn extract_from_file(&self, path: &Path) -> Result<OcrResult, DocumentError> {
        let binary: &std::ffi::OsStr = self
            .binary
            .as_deref()
            .map_or_else(|| std::ffi::OsStr::new("tesseract"), Path::as_os_str);

        let output = Command::new(binary)
            .arg(path)
            .arg("stdout")
            .arg("-l")
            .arg(&self.lang)
            .arg("tsv")
            .stdin(Stdio::null())
            .stdout(Stdio::piped())
            .stderr(Stdio::piped())
            .output()
            .map_err(|err| match err.kind() {
                std::io::ErrorKind::NotFound => DocumentError::TesseractNotFound(install_hint()),
                _ => DocumentError::Io(err),
            })?;

        if !output.status.success() {
            return Err(DocumentError::TesseractFailed {
                status: output.status.code().unwrap_or(-1),
                stderr: truncate_stderr(&output.stderr),
            });
        }

        let tsv = String::from_utf8_lossy(&output.stdout);
        Ok(parse_tsv(&tsv, &self.lang))
    }

    /// Runs OCR on an in-memory image by writing it to a temporary file
    /// (prefix `gaze-document-ocr-`, suffix `.<extension>`) and delegating to
    /// [`TesseractOcr::extract_from_file`].
    ///
    /// `extension` may be given with or without a leading dot (`"png"` and
    /// `".png"` both produce a `.png` suffix); an empty extension produces no
    /// suffix at all.
    ///
    /// # Errors
    ///
    /// Propagates failures from creating or writing the temporary file, plus
    /// every error `extract_from_file` can return.
    pub fn extract_from_bytes(
        &self,
        bytes: &[u8],
        extension: &str,
    ) -> Result<OcrResult, DocumentError> {
        // Normalise so callers passing ".png" do not end up with "..png",
        // and an empty extension does not leave a dangling trailing dot.
        let bare_extension = extension.trim_start_matches('.');
        let suffix = if bare_extension.is_empty() {
            String::new()
        } else {
            format!(".{bare_extension}")
        };

        let mut file = tempfile::Builder::new()
            .prefix("gaze-document-ocr-")
            .suffix(suffix.as_str())
            .tempfile()?;
        file.write_all(bytes)?;
        file.flush()?;

        // `file` stays alive until this function returns, so its path can be
        // borrowed directly instead of being cloned into a `PathBuf`.
        self.extract_from_file(file.path())
    }
}
impl Default for TesseractOcr {
fn default() -> Self {
Self::new()
}
}
impl super::OcrAdapter for TesseractOcr {
    /// Recognises text in `bytes` by routing them through
    /// `extract_from_bytes` with a `png` file extension and returning only
    /// the `text` field of the result.
    fn extract_text(&self, bytes: &[u8]) -> Result<String, DocumentError> {
        let recognised = self.extract_from_bytes(bytes, "png")?;
        Ok(recognised.text)
    }
}
/// Converts Tesseract TSV output into an [`OcrResult`].
///
/// The first line is treated as the column header and skipped. Only rows with
/// at least 12 tab-separated columns, `level == 5` and non-empty text are
/// used. Words that share the same
/// `(page_num, block_num, par_num, line_num)` are joined with single spaces;
/// a change in that key starts a new `\n`-separated line, so identical
/// block/paragraph/line numbers on different pages are never merged.
///
/// * `text` – the reconstructed text.
/// * `word_count` – number of words placed into `text`.
/// * `mean_confidence` – mean of the `conf` column over words whose
///   confidence parsed to a non-negative number, or `None` if there were none.
/// * `lang` – copy of the `lang` argument.
fn parse_tsv(tsv: &str, lang: &str) -> OcrResult {
    let mut text = String::new();
    let mut current_line: Option<(u64, u64, u64, u64)> = None;
    let mut word_count: usize = 0;
    let mut conf_sum: f64 = 0.0;
    let mut conf_count: usize = 0;

    // `skip(1)` drops the header row.
    for row in tsv.lines().skip(1) {
        if row.is_empty() {
            continue;
        }
        let cols: Vec<&str> = row.split('\t').collect();
        if cols.len() < 12 {
            continue;
        }
        // Only level 5 rows are consumed; every other level is skipped.
        if cols[0].parse::<u32>().ok() != Some(5) {
            continue;
        }
        let word = cols[11];
        if word.is_empty() {
            continue;
        }

        // Unparseable position fields fall back to 0.
        let number = |idx: usize| cols[idx].parse::<u64>().unwrap_or(0);
        let line_key = (number(1), number(2), number(3), number(4));

        if current_line == Some(line_key) {
            // Same visual line: separate from the previous word.
            text.push(' ');
        } else {
            // New line: separate from the previous line, if there was one.
            if !text.is_empty() {
                text.push('\n');
            }
            current_line = Some(line_key);
        }
        text.push_str(word);
        word_count += 1;

        // A missing or negative confidence does not contribute to the mean.
        let conf: f32 = cols[10].parse().unwrap_or(-1.0);
        if conf >= 0.0 {
            conf_sum += f64::from(conf);
            conf_count += 1;
        }
    }

    let mean_confidence = if conf_count == 0 {
        None
    } else {
        Some((conf_sum / conf_count as f64) as f32)
    };

    OcrResult {
        text,
        mean_confidence,
        word_count,
        lang: lang.to_string(),
    }
}
/// Converts captured stderr bytes into a `String` (lossily, so invalid UTF-8
/// becomes U+FFFD), keeping at most `STDERR_TRUNCATE_BYTES` bytes.
///
/// Input within the limit is returned in full. Longer input is cut and
/// suffixed with `"\n…(truncated)"`. The cut is moved back by up to three
/// bytes when it would otherwise fall inside a multi-byte UTF-8 character, so
/// truncation itself never introduces a replacement character.
fn truncate_stderr(bytes: &[u8]) -> String {
    if bytes.len() <= STDERR_TRUNCATE_BYTES {
        return String::from_utf8_lossy(bytes).into_owned();
    }

    // A byte of the form 0b10xx_xxxx continues a character that started
    // earlier. Walk back (a UTF-8 character is at most 4 bytes) until the
    // first excluded byte starts a character. `bytes[idx]` is in bounds
    // because `idx <= STDERR_TRUNCATE_BYTES < bytes.len()`. If no boundary is
    // found the data is not valid UTF-8 there anyway, so use the plain limit.
    let earliest = STDERR_TRUNCATE_BYTES.saturating_sub(3);
    let cut = (earliest..=STDERR_TRUNCATE_BYTES)
        .rev()
        .find(|&idx| bytes[idx] & 0xC0 != 0x80)
        .unwrap_or(STDERR_TRUNCATE_BYTES);

    let mut out = String::from_utf8_lossy(&bytes[..cut]).into_owned();
    out.push_str("\n…(truncated)");
    out
}
/// Returns a one-sentence, platform-specific suggestion for installing
/// Tesseract, chosen at compile time from the target OS (macOS, Linux,
/// Windows, or a generic fallback pointing at the upstream repository).
fn install_hint() -> String {
    let hint: &str = if cfg!(target_os = "macos") {
        "Install via `brew install tesseract` (or `port install tesseract`)."
    } else if cfg!(target_os = "linux") {
        "Install via `apt-get install tesseract-ocr` (Debian/Ubuntu), \
         `dnf install tesseract` (Fedora), or `pacman -S tesseract` (Arch)."
    } else if cfg!(target_os = "windows") {
        "Install via `winget install --id UB-Mannheim.TesseractOCR` or download the \
         UB-Mannheim build and add it to PATH."
    } else {
        "Install Tesseract from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."
    };
    hint.to_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Header row shared by every TSV fixture in this module.
    const TSV_HEADER: &str =
        "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext";

    /// Assembles a TSV fixture: the header followed by `rows`, with every
    /// line (header included) terminated by `\n`.
    fn tsv_fixture(rows: &[&str]) -> String {
        let mut doc = String::from(TSV_HEADER);
        doc.push('\n');
        for row in rows {
            doc.push_str(row);
            doc.push('\n');
        }
        doc
    }

    /// Two words on each of two lines come back as two space-joined lines,
    /// with all four words counted and a mean confidence close to 90.
    #[test]
    fn parse_tsv_groups_words_into_lines() {
        let fixture = tsv_fixture(&[
            "1\t1\t0\t0\t0\t0\t0\t0\t100\t100\t-1\t",
            "5\t1\t1\t1\t1\t1\t0\t0\t40\t10\t91\tBill",
            "5\t1\t1\t1\t1\t2\t40\t0\t30\t10\t93\tto:",
            "5\t1\t1\t1\t2\t1\t0\t20\t60\t10\t87\tJane",
            "5\t1\t1\t1\t2\t2\t60\t20\t60\t10\t89\tDoe",
        ]);

        let parsed = parse_tsv(&fixture, "eng");

        assert_eq!(parsed.text, "Bill to:\nJane Doe");
        assert_eq!(parsed.word_count, 4);
        let mean = parsed.mean_confidence.expect("expected mean confidence");
        assert!((mean - 90.0).abs() < 1.0);
    }

    /// A header-only document produces no text, no words and no confidence.
    #[test]
    fn parse_tsv_empty_yields_empty_text() {
        let fixture = tsv_fixture(&[]);

        let parsed = parse_tsv(&fixture, "eng");

        assert!(parsed.text.is_empty());
        assert_eq!(parsed.word_count, 0);
        assert!(parsed.mean_confidence.is_none());
    }

    /// Whatever the platform, some install hint text is produced.
    #[test]
    fn install_hint_is_non_empty() {
        let hint = install_hint();
        assert!(!hint.is_empty());
    }
}