Skip to main content

gaze_document/ocr/
tesseract.rs

1//! Tesseract subprocess OCR adapter.
2//!
3//! Invokes the `tesseract` CLI directly via [`std::process::Command`].
4//! Subprocess (rather than FFI) is intentional — adopters never need a
5//! native build toolchain or `libtesseract` headers.
6//!
7//! ## Fail-closed contract
8//!
9//! * Missing binary → [`DocumentError::TesseractNotFound`] with per-OS install
10//!   instructions in the message payload.
11//! * Non-zero exit → [`DocumentError::TesseractFailed`] carrying captured
12//!   stderr (truncated).
13//!
14//! ## Confidence
15//!
16//! The `tsv` output mode emits per-word confidence in column 11. We compute
17//! a mean across words whose confidence is `>= 0`. Tesseract uses `-1` for
18//! structural (block / paragraph / line) rows that carry no confidence.
19
20use std::io::Write;
21use std::path::Path;
22use std::process::{Command, Stdio};
23
24use super::{BBox, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult, OcrSpan};
25use crate::DocumentError;
26
/// Maximum number of captured stderr bytes kept in an error payload (4 KiB).
const STDERR_TRUNCATE_BYTES: usize = 4 * 1024;
28
/// OCR backend that shells out to the `tesseract` command-line tool.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct TesseractBackend {
    /// Language code handed to Tesseract via `-l` (the constructors default
    /// to `eng`).
    pub lang: String,
    /// Explicit location of the executable. When `None`, the bare name
    /// `tesseract` is resolved through `PATH`.
    pub binary: Option<std::path::PathBuf>,
}
38
39impl TesseractBackend {
40    /// Builds the adapter with `eng` language and `tesseract` on `PATH`.
41    pub fn new() -> Self {
42        Self {
43            lang: "eng".to_string(),
44            binary: None,
45        }
46    }
47
48    /// Builds the adapter with an explicit language code.
49    pub fn with_lang(lang: impl Into<String>) -> Self {
50        Self {
51            lang: lang.into(),
52            binary: None,
53        }
54    }
55
56    /// Runs OCR on a file already on disk.
57    ///
58    /// Calls `tesseract <path> stdout -l <lang> tsv`, parses the TSV stream
59    /// for text + per-word confidence in a single subprocess invocation.
60    pub fn extract_from_file(&self, path: &Path) -> Result<OcrResult, DocumentError> {
61        self.extract_from_file_with_lang(path, &self.lang)
62    }
63
64    fn extract_from_file_with_lang(
65        &self,
66        path: &Path,
67        lang: &str,
68    ) -> Result<OcrResult, DocumentError> {
69        let tsv = self.run_tesseract_tsv(path, lang)?;
70        Ok(parse_tsv_result(&tsv, lang))
71    }
72
73    fn run_tesseract_tsv(&self, path: &Path, lang: &str) -> Result<String, DocumentError> {
74        let binary: &std::ffi::OsStr = self
75            .binary
76            .as_deref()
77            .map(AsRef::as_ref)
78            .unwrap_or_else(|| "tesseract".as_ref());
79
80        let output = Command::new(binary)
81            .arg(path)
82            .arg("stdout")
83            .arg("-l")
84            .arg(lang)
85            .arg("tsv")
86            .stdin(Stdio::null())
87            .stdout(Stdio::piped())
88            .stderr(Stdio::piped())
89            .output()
90            .map_err(|err| match err.kind() {
91                std::io::ErrorKind::NotFound => DocumentError::TesseractNotFound(install_hint()),
92                _ => DocumentError::Io(err),
93            })?;
94
95        if !output.status.success() {
96            let stderr = truncate_stderr(&output.stderr);
97            return Err(DocumentError::TesseractFailed {
98                status: output.status.code().unwrap_or(-1),
99                stderr,
100            });
101        }
102
103        Ok(String::from_utf8_lossy(&output.stdout).into_owned())
104    }
105
106    /// Runs OCR on an in-memory image payload.
107    ///
108    /// Writes `bytes` to a tempfile (preserving any input extension via the
109    /// caller's choice of `extension`) so Tesseract auto-detects the image
110    /// format from the suffix. Cleaned up on drop.
111    pub fn extract_from_bytes(
112        &self,
113        bytes: &[u8],
114        extension: &str,
115    ) -> Result<OcrResult, DocumentError> {
116        let suffix = format!(".{extension}");
117        let mut file = tempfile::Builder::new()
118            .prefix("gaze-document-ocr-")
119            .suffix(suffix.as_str())
120            .tempfile()?;
121        file.write_all(bytes)?;
122        file.flush()?;
123        let path = file.path().to_path_buf();
124        self.extract_from_file(&path)
125    }
126}
127
128impl Default for TesseractBackend {
129    fn default() -> Self {
130        Self::new()
131    }
132}
133
134impl OcrBackend for TesseractBackend {
135    fn name(&self) -> &str {
136        "tesseract"
137    }
138
139    fn recognize(&self, image: ImageInput, hints: OcrHints) -> Result<Vec<OcrSpan>, OcrError> {
140        let suffix = format!(".{}", image.format.extension());
141        let mut file = tempfile::Builder::new()
142            .prefix("gaze-document-ocr-")
143            .suffix(suffix.as_str())
144            .tempfile()
145            .map_err(|err| OcrError::Internal(err.to_string()))?;
146        file.write_all(&image.bytes)
147            .map_err(|err| OcrError::Internal(err.to_string()))?;
148        file.flush()
149            .map_err(|err| OcrError::Internal(err.to_string()))?;
150        let tsv = self
151            .run_tesseract_tsv(file.path(), hints.primary_language())
152            .map_err(document_error_to_ocr_error)?;
153        Ok(parse_tsv_spans(&tsv))
154    }
155}
156
157fn parse_tsv_result(tsv: &str, lang: &str) -> OcrResult {
158    parse_tsv(tsv, lang)
159}
160
161fn parse_tsv_spans(tsv: &str) -> Vec<OcrSpan> {
162    let mut spans = Vec::new();
163
164    for (idx, line) in tsv.lines().enumerate() {
165        if idx == 0 || line.is_empty() {
166            continue;
167        }
168        let cols: Vec<&str> = line.split('\t').collect();
169        if cols.len() < 12 {
170            continue;
171        }
172        let level: u32 = cols[0].parse().unwrap_or(0);
173        if level != 5 {
174            continue;
175        }
176        let word = cols[11];
177        if word.is_empty() {
178            continue;
179        }
180        let confidence = cols[10]
181            .parse::<f32>()
182            .ok()
183            .filter(|conf| *conf >= 0.0)
184            .map(|conf| (conf / 100.0).clamp(0.0, 1.0));
185        spans.push(OcrSpan {
186            text: word.to_string(),
187            bbox: BBox {
188                x: cols[6].parse().unwrap_or(0),
189                y: cols[7].parse().unwrap_or(0),
190                w: cols[8].parse().unwrap_or(0),
191                h: cols[9].parse().unwrap_or(0),
192            },
193            confidence,
194        });
195    }
196
197    spans
198}
199
200fn document_error_to_ocr_error(err: DocumentError) -> OcrError {
201    match err {
202        DocumentError::TesseractNotFound(hint) => OcrError::InitFailed(hint),
203        DocumentError::TesseractFailed { status, stderr } => {
204            OcrError::RecognizeFailed(format!("status {status}: {stderr}"))
205        }
206        DocumentError::Io(err) => OcrError::Internal(err.to_string()),
207        other => OcrError::Internal(other.to_string()),
208    }
209}
210
211fn parse_tsv(tsv: &str, lang: &str) -> OcrResult {
212    let mut text = String::new();
213    let mut current_line: Option<(u64, u64, u64)> = None;
214    let mut current_text = String::new();
215    let mut conf_sum: f64 = 0.0;
216    let mut conf_count: usize = 0;
217
218    for (idx, line) in tsv.lines().enumerate() {
219        if idx == 0 || line.is_empty() {
220            // Skip header row + blank trailers.
221            continue;
222        }
223        let cols: Vec<&str> = line.split('\t').collect();
224        if cols.len() < 12 {
225            continue;
226        }
227        // Tesseract TSV columns: level, page_num, block_num, par_num,
228        // line_num, word_num, left, top, width, height, conf, text
229        let level: u32 = cols[0].parse().unwrap_or(0);
230        if level != 5 {
231            continue; // word rows only
232        }
233        let block_num: u64 = cols[2].parse().unwrap_or(0);
234        let par_num: u64 = cols[3].parse().unwrap_or(0);
235        let line_num: u64 = cols[4].parse().unwrap_or(0);
236        let conf: f32 = cols[10].parse().unwrap_or(-1.0);
237        let word = cols[11];
238        if word.is_empty() {
239            continue;
240        }
241
242        let line_key = (block_num, par_num, line_num);
243        if current_line != Some(line_key) {
244            if !current_text.is_empty() {
245                if !text.is_empty() {
246                    text.push('\n');
247                }
248                text.push_str(&current_text);
249                current_text.clear();
250            }
251            current_line = Some(line_key);
252        }
253        if !current_text.is_empty() {
254            current_text.push(' ');
255        }
256        current_text.push_str(word);
257
258        if conf >= 0.0 {
259            conf_sum += conf as f64;
260            conf_count += 1;
261        }
262    }
263    if !current_text.is_empty() {
264        if !text.is_empty() {
265            text.push('\n');
266        }
267        text.push_str(&current_text);
268    }
269
270    let mean_confidence = if conf_count == 0 {
271        None
272    } else {
273        Some((conf_sum / conf_count as f64) as f32)
274    };
275    OcrResult {
276        text,
277        mean_confidence,
278        word_count: conf_count,
279        lang: lang.to_string(),
280    }
281}
282
283fn truncate_stderr(bytes: &[u8]) -> String {
284    if bytes.len() <= STDERR_TRUNCATE_BYTES {
285        return String::from_utf8_lossy(bytes).into_owned();
286    }
287    let mut out = String::from_utf8_lossy(&bytes[..STDERR_TRUNCATE_BYTES]).into_owned();
288    out.push_str("\n…(truncated)");
289    out
290}
291
/// Returns installation instructions for Tesseract tailored to the operating
/// system this crate was compiled for, with a generic fallback for platforms
/// other than macOS, Linux and Windows.
fn install_hint() -> String {
    let hint: &'static str = if cfg!(target_os = "macos") {
        "Install via `brew install tesseract` (or `port install tesseract`)."
    } else if cfg!(target_os = "linux") {
        "Install via `apt-get install tesseract-ocr` (Debian/Ubuntu), \
         `dnf install tesseract` (Fedora), or `pacman -S tesseract` (Arch)."
    } else if cfg!(target_os = "windows") {
        "Install via `winget install --id UB-Mannheim.TesseractOCR` or download the \
         UB-Mannheim build and add it to PATH."
    } else {
        "Install Tesseract from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."
    };
    hint.to_owned()
}
307
#[cfg(test)]
mod tests {
    use super::*;

    /// Column header row that opens the TSV fixtures below.
    const HEADER: &str = "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext";

    #[test]
    fn parse_tsv_groups_words_into_lines() {
        // One structural row followed by two lines of two words each. The
        // trailing empty entry yields the final newline.
        let rows = [
            HEADER,
            "1\t1\t0\t0\t0\t0\t0\t0\t100\t100\t-1\t",
            "5\t1\t1\t1\t1\t1\t0\t0\t40\t10\t91\tBill",
            "5\t1\t1\t1\t1\t2\t40\t0\t30\t10\t93\tto:",
            "5\t1\t1\t1\t2\t1\t0\t20\t60\t10\t87\tJane",
            "5\t1\t1\t1\t2\t2\t60\t20\t60\t10\t89\tDoe",
            "",
        ];
        let tsv = rows.join("\n");

        let parsed = parse_tsv(&tsv, "eng");

        assert_eq!(parsed.text, "Bill to:\nJane Doe");
        assert_eq!(parsed.word_count, 4);
        let mean = parsed.mean_confidence.expect("expected mean confidence");
        assert!((mean - 90.0).abs() < 1.0);
    }

    #[test]
    fn parse_tsv_empty_yields_empty_text() {
        // Header only: no word rows at all.
        let tsv = format!("{HEADER}\n");

        let parsed = parse_tsv(&tsv, "eng");

        assert!(parsed.text.is_empty());
        assert_eq!(parsed.word_count, 0);
        assert!(parsed.mean_confidence.is_none());
    }

    #[test]
    fn install_hint_is_non_empty() {
        let hint = install_hint();
        assert!(!hint.is_empty());
    }
}