gaze_document/ocr/
tesseract.rs1use std::io::Write;
21use std::path::Path;
22use std::process::{Command, Stdio};
23
24use super::{BBox, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult, OcrSpan};
25use crate::DocumentError;
26
27const STDERR_TRUNCATE_BYTES: usize = 4096;
28
/// OCR backend that shells out to the Tesseract command-line program.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct TesseractBackend {
    /// Language code handed to Tesseract through its `-l` option when
    /// extracting from files or bytes.
    pub lang: String,
    /// Executable to launch. When `None`, the program name `tesseract` is used.
    pub binary: Option<std::path::PathBuf>,
}
38
39impl TesseractBackend {
40 pub fn new() -> Self {
42 Self {
43 lang: "eng".to_string(),
44 binary: None,
45 }
46 }
47
48 pub fn with_lang(lang: impl Into<String>) -> Self {
50 Self {
51 lang: lang.into(),
52 binary: None,
53 }
54 }
55
56 pub fn extract_from_file(&self, path: &Path) -> Result<OcrResult, DocumentError> {
61 self.extract_from_file_with_lang(path, &self.lang)
62 }
63
64 fn extract_from_file_with_lang(
65 &self,
66 path: &Path,
67 lang: &str,
68 ) -> Result<OcrResult, DocumentError> {
69 let tsv = self.run_tesseract_tsv(path, lang)?;
70 Ok(parse_tsv_result(&tsv, lang))
71 }
72
73 fn run_tesseract_tsv(&self, path: &Path, lang: &str) -> Result<String, DocumentError> {
74 let binary: &std::ffi::OsStr = self
75 .binary
76 .as_deref()
77 .map(AsRef::as_ref)
78 .unwrap_or_else(|| "tesseract".as_ref());
79
80 let output = Command::new(binary)
81 .arg(path)
82 .arg("stdout")
83 .arg("-l")
84 .arg(lang)
85 .arg("tsv")
86 .stdin(Stdio::null())
87 .stdout(Stdio::piped())
88 .stderr(Stdio::piped())
89 .output()
90 .map_err(|err| match err.kind() {
91 std::io::ErrorKind::NotFound => DocumentError::TesseractNotFound(install_hint()),
92 _ => DocumentError::Io(err),
93 })?;
94
95 if !output.status.success() {
96 let stderr = truncate_stderr(&output.stderr);
97 return Err(DocumentError::TesseractFailed {
98 status: output.status.code().unwrap_or(-1),
99 stderr,
100 });
101 }
102
103 Ok(String::from_utf8_lossy(&output.stdout).into_owned())
104 }
105
106 pub fn extract_from_bytes(
112 &self,
113 bytes: &[u8],
114 extension: &str,
115 ) -> Result<OcrResult, DocumentError> {
116 let suffix = format!(".{extension}");
117 let mut file = tempfile::Builder::new()
118 .prefix("gaze-document-ocr-")
119 .suffix(suffix.as_str())
120 .tempfile()?;
121 file.write_all(bytes)?;
122 file.flush()?;
123 let path = file.path().to_path_buf();
124 self.extract_from_file(&path)
125 }
126}
127
128impl Default for TesseractBackend {
129 fn default() -> Self {
130 Self::new()
131 }
132}
133
134impl OcrBackend for TesseractBackend {
135 fn name(&self) -> &str {
136 "tesseract"
137 }
138
139 fn recognize(&self, image: ImageInput, hints: OcrHints) -> Result<Vec<OcrSpan>, OcrError> {
140 let suffix = format!(".{}", image.format.extension());
141 let mut file = tempfile::Builder::new()
142 .prefix("gaze-document-ocr-")
143 .suffix(suffix.as_str())
144 .tempfile()
145 .map_err(|err| OcrError::Internal(err.to_string()))?;
146 file.write_all(&image.bytes)
147 .map_err(|err| OcrError::Internal(err.to_string()))?;
148 file.flush()
149 .map_err(|err| OcrError::Internal(err.to_string()))?;
150 let tsv = self
151 .run_tesseract_tsv(file.path(), hints.primary_language())
152 .map_err(document_error_to_ocr_error)?;
153 Ok(parse_tsv_spans(&tsv))
154 }
155}
156
157fn parse_tsv_result(tsv: &str, lang: &str) -> OcrResult {
158 parse_tsv(tsv, lang)
159}
160
161fn parse_tsv_spans(tsv: &str) -> Vec<OcrSpan> {
162 let mut spans = Vec::new();
163
164 for (idx, line) in tsv.lines().enumerate() {
165 if idx == 0 || line.is_empty() {
166 continue;
167 }
168 let cols: Vec<&str> = line.split('\t').collect();
169 if cols.len() < 12 {
170 continue;
171 }
172 let level: u32 = cols[0].parse().unwrap_or(0);
173 if level != 5 {
174 continue;
175 }
176 let word = cols[11];
177 if word.is_empty() {
178 continue;
179 }
180 let confidence = cols[10]
181 .parse::<f32>()
182 .ok()
183 .filter(|conf| *conf >= 0.0)
184 .map(|conf| (conf / 100.0).clamp(0.0, 1.0));
185 spans.push(OcrSpan {
186 text: word.to_string(),
187 bbox: BBox {
188 x: cols[6].parse().unwrap_or(0),
189 y: cols[7].parse().unwrap_or(0),
190 w: cols[8].parse().unwrap_or(0),
191 h: cols[9].parse().unwrap_or(0),
192 },
193 confidence,
194 });
195 }
196
197 spans
198}
199
200fn document_error_to_ocr_error(err: DocumentError) -> OcrError {
201 match err {
202 DocumentError::TesseractNotFound(hint) => OcrError::InitFailed(hint),
203 DocumentError::TesseractFailed { status, stderr } => {
204 OcrError::RecognizeFailed(format!("status {status}: {stderr}"))
205 }
206 DocumentError::Io(err) => OcrError::Internal(err.to_string()),
207 other => OcrError::Internal(other.to_string()),
208 }
209}
210
211fn parse_tsv(tsv: &str, lang: &str) -> OcrResult {
212 let mut text = String::new();
213 let mut current_line: Option<(u64, u64, u64)> = None;
214 let mut current_text = String::new();
215 let mut conf_sum: f64 = 0.0;
216 let mut conf_count: usize = 0;
217
218 for (idx, line) in tsv.lines().enumerate() {
219 if idx == 0 || line.is_empty() {
220 continue;
222 }
223 let cols: Vec<&str> = line.split('\t').collect();
224 if cols.len() < 12 {
225 continue;
226 }
227 let level: u32 = cols[0].parse().unwrap_or(0);
230 if level != 5 {
231 continue; }
233 let block_num: u64 = cols[2].parse().unwrap_or(0);
234 let par_num: u64 = cols[3].parse().unwrap_or(0);
235 let line_num: u64 = cols[4].parse().unwrap_or(0);
236 let conf: f32 = cols[10].parse().unwrap_or(-1.0);
237 let word = cols[11];
238 if word.is_empty() {
239 continue;
240 }
241
242 let line_key = (block_num, par_num, line_num);
243 if current_line != Some(line_key) {
244 if !current_text.is_empty() {
245 if !text.is_empty() {
246 text.push('\n');
247 }
248 text.push_str(¤t_text);
249 current_text.clear();
250 }
251 current_line = Some(line_key);
252 }
253 if !current_text.is_empty() {
254 current_text.push(' ');
255 }
256 current_text.push_str(word);
257
258 if conf >= 0.0 {
259 conf_sum += conf as f64;
260 conf_count += 1;
261 }
262 }
263 if !current_text.is_empty() {
264 if !text.is_empty() {
265 text.push('\n');
266 }
267 text.push_str(¤t_text);
268 }
269
270 let mean_confidence = if conf_count == 0 {
271 None
272 } else {
273 Some((conf_sum / conf_count as f64) as f32)
274 };
275 OcrResult {
276 text,
277 mean_confidence,
278 word_count: conf_count,
279 lang: lang.to_string(),
280 }
281}
282
283fn truncate_stderr(bytes: &[u8]) -> String {
284 if bytes.len() <= STDERR_TRUNCATE_BYTES {
285 return String::from_utf8_lossy(bytes).into_owned();
286 }
287 let mut out = String::from_utf8_lossy(&bytes[..STDERR_TRUNCATE_BYTES]).into_owned();
288 out.push_str("\n…(truncated)");
289 out
290}
291
/// Returns installation instructions for Tesseract suited to the operating
/// system this crate was compiled for (macOS, Linux, Windows, or a generic
/// pointer to the upstream repository for anything else).
fn install_hint() -> String {
    let hint: &'static str = if cfg!(target_os = "macos") {
        "Install via `brew install tesseract` (or `port install tesseract`)."
    } else if cfg!(target_os = "linux") {
        "Install via `apt-get install tesseract-ocr` (Debian/Ubuntu), \
         `dnf install tesseract` (Fedora), or `pacman -S tesseract` (Arch)."
    } else if cfg!(target_os = "windows") {
        "Install via `winget install --id UB-Mannheim.TesseractOCR` or download the \
         UB-Mannheim build and add it to PATH."
    } else {
        "Install Tesseract from https://github.com/tesseract-ocr/tesseract and ensure it is on PATH."
    };
    hint.to_owned()
}
307
#[cfg(test)]
mod tests {
    use super::{install_hint, parse_tsv};

    /// Column header line of the TSV fixtures; `parse_tsv` skips the first line.
    const TSV_HEADER: &str =
        "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext";

    /// Builds a TSV document: the header followed by `rows`, each terminated
    /// by a newline.
    fn tsv_with_rows(rows: &[&str]) -> String {
        let mut doc = String::from(TSV_HEADER);
        doc.push('\n');
        for row in rows {
            doc.push_str(row);
            doc.push('\n');
        }
        doc
    }

    /// Words on the same line are space-joined and lines are newline-joined.
    #[test]
    fn parse_tsv_groups_words_into_lines() {
        let tsv = tsv_with_rows(&[
            "1\t1\t0\t0\t0\t0\t0\t0\t100\t100\t-1\t",
            "5\t1\t1\t1\t1\t1\t0\t0\t40\t10\t91\tBill",
            "5\t1\t1\t1\t1\t2\t40\t0\t30\t10\t93\tto:",
            "5\t1\t1\t1\t2\t1\t0\t20\t60\t10\t87\tJane",
            "5\t1\t1\t1\t2\t2\t60\t20\t60\t10\t89\tDoe",
        ]);

        let parsed = parse_tsv(&tsv, "eng");

        assert_eq!(parsed.text, "Bill to:\nJane Doe");
        assert_eq!(parsed.word_count, 4);
        let mean = parsed
            .mean_confidence
            .expect("expected mean confidence");
        assert!((mean - 90.0).abs() < 1.0);
    }

    /// A header-only document produces no text, no words, and no confidence.
    #[test]
    fn parse_tsv_empty_yields_empty_text() {
        let tsv = tsv_with_rows(&[]);

        let parsed = parse_tsv(&tsv, "eng");

        assert!(parsed.text.is_empty());
        assert_eq!(parsed.word_count, 0);
        assert!(parsed.mean_confidence.is_none());
    }

    /// Every supported target yields some installation guidance.
    #[test]
    fn install_hint_is_non_empty() {
        let hint = install_hint();
        assert!(!hint.is_empty());
    }
}