Skip to main content

gaze_document/ocr/
mod.rs

1//! OCR backend contract surface and concrete backends.
2//!
3//! The [`OcrBackend`] trait is intentionally narrow: finalized image bytes in,
4//! flat OCR spans out. Preprocessing, multi-page orchestration, and layout
5//! reconstruction stay above or below this module so backend plurality can
6//! arrive later without widening the trust boundary.
7
8mod normalize;
9
10use crate::DocumentError;
11
12pub(crate) use normalize::normalize_ocr_artifacts;
13
14#[cfg(feature = "ocr-tesseract")]
15#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
16pub mod tesseract;
17
18#[cfg(feature = "ocr-tesseract")]
19#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
20pub use tesseract::TesseractBackend;
21
/// Encoded raster formats that can be handed to an OCR backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImageFormat {
    /// The payload is PNG-encoded.
    Png,
    /// The payload is JPEG-encoded.
    Jpeg,
    /// The payload is TIFF-encoded.
    Tiff,
}

impl ImageFormat {
    /// Returns the file extension (without a leading dot) used when the image
    /// is handed to a subprocess through a temporary file.
    pub fn extension(self) -> &'static str {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self {
            ImageFormat::Tiff => "tiff",
            ImageFormat::Jpeg => "jpg",
            ImageFormat::Png => "png",
        }
    }
}
43
44/// Detect the encoded image format from magic bytes.
45///
46/// # Errors
47///
48/// Returns [`DocumentError::UnsupportedInput`] when the byte payload is not a
49/// supported PNG, JPEG, or TIFF image.
50pub fn detect_image_format(bytes: &[u8]) -> Result<ImageFormat, DocumentError> {
51    if bytes.starts_with(b"\x89PNG") {
52        return Ok(ImageFormat::Png);
53    }
54    if bytes.starts_with(b"\xFF\xD8\xFF") {
55        return Ok(ImageFormat::Jpeg);
56    }
57    if bytes.starts_with(b"II\x2A\x00") || bytes.starts_with(b"MM\x00\x2A") {
58        return Ok(ImageFormat::Tiff);
59    }
60    Err(DocumentError::UnsupportedInput {
61        path: std::path::PathBuf::new(),
62        reason: "image bytes are not PNG, JPEG, or TIFF",
63    })
64}
65
66/// Finalized, still-encoded image payload handed to a backend for one OCR pass.
67#[derive(Debug, Clone, PartialEq, Eq)]
68pub struct ImageInput {
69    /// Encoded image bytes (the file contents, not decoded pixels).
70    pub bytes: Vec<u8>,
71    /// Encoding of `bytes`. This struct does not itself check that the two agree.
72    pub format: ImageFormat,
73    /// Source DPI when the orchestration layer knows it; `None` otherwise.
74    pub dpi: Option<u32>,
75}
76
/// Language identifier understood by an OCR backend.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LanguageTag(String);

impl LanguageTag {
    /// Wrap a backend-specific language code. The value is stored as given.
    pub fn new(tag: impl Into<String>) -> Self {
        let code: String = tag.into();
        LanguageTag(code)
    }

    /// View the wrapped language code as a string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl Default for LanguageTag {
    /// The default tag is `eng`.
    fn default() -> Self {
        LanguageTag(String::from("eng"))
    }
}
98
99/// OCR backend hints. Backends may downgrade hints they cannot support.
100#[derive(Debug, Clone, PartialEq, Eq)]
101pub struct OcrHints {
102    /// Preferred OCR languages.
103    pub languages: Vec<LanguageTag>,
104}
105
106impl OcrHints {
107    /// Build hints with the default English Tesseract-compatible tag.
108    pub fn english() -> Self {
109        Self {
110            languages: vec![LanguageTag::default()],
111        }
112    }
113
114    /// Return the first requested language, falling back to English.
115    pub fn primary_language(&self) -> &str {
116        self.languages
117            .first()
118            .map(LanguageTag::as_str)
119            .unwrap_or("eng")
120    }
121}
122
123impl Default for OcrHints {
124    fn default() -> Self {
125        Self::english()
126    }
127}
128
129/// Rectangular bounding box expressed in image pixel coordinates.
130#[derive(Debug, Clone, Copy, PartialEq, Eq)]
131pub struct BBox {
132    /// Left edge of the box (x coordinate), in pixels.
133    pub x: u32,
134    /// Top edge of the box (y coordinate), in pixels.
135    pub y: u32,
136    /// Width of the box, in pixels.
137    pub w: u32,
138    /// Height of the box, in pixels.
139    pub h: u32,
140}
141
142/// One flat OCR text span emitted by a backend: its text, where it sits, and how sure the backend is.
// `Eq` is not derived because `confidence` is an `f32`.
143#[derive(Debug, Clone, PartialEq)]
144pub struct OcrSpan {
145    /// Text the backend recognized for this span.
146    pub text: String,
147    /// Where the span sits, as a box in image pixel coordinates.
148    pub bbox: BBox,
149    /// Backend confidence normalized to `0.0..=1.0`; `None` presumably means the backend reported none.
150    pub confidence: Option<f32>,
151}
152
153/// Closed OCR backend error surface: every backend failure is reported as one of these variants.
///
/// The `String` payloads are free-form messages interpolated into the
/// `Display` output after the variant's fixed prefix.
154#[derive(Debug, thiserror::Error)]
155pub enum OcrError {
156    /// Backend initialization failed; the payload is the failure message.
157    #[error("backend init failed: {0}")]
158    InitFailed(String),
159    /// Recognition failed after backend initialization; the payload is the failure message.
160    #[error("recognize failed: {0}")]
161    RecognizeFailed(String),
162    /// Image format is unsupported by this backend; the payload is the rejected format.
163    #[error("unsupported image format: {0:?}")]
164    UnsupportedFormat(ImageFormat),
165    /// Backend hit an internal invariant or I/O failure; the payload is the failure message.
166    #[error("backend internal error: {0}")]
167    Internal(String),
168}
169
170/// Narrow OCR backend contract: one finalized image in, flat spans out.
///
/// Implementors must be `Send + Sync`, as required by the supertrait bounds.
171pub trait OcrBackend: Send + Sync {
172    /// Stable backend name used in diagnostics.
173    fn name(&self) -> &str;
174
175    /// Recognize flat spans from one finalized image.
    ///
    /// Takes ownership of both `image` and `hints`.
    ///
    /// # Errors
    ///
    /// Returns an [`OcrError`] when the backend cannot produce spans for the image.
176    fn recognize(&self, image: ImageInput, hints: OcrHints) -> Result<Vec<OcrSpan>, OcrError>;
177}
178
179/// Result of an OCR pass: full text + a structured confidence summary.
180///
181/// Backend-agnostic — concrete adapters (e.g., [`tesseract::TesseractBackend`])
182/// produce values of this shape so the rest of the pipeline does not
183/// hard-code one OCR backend's surface.
184#[non_exhaustive]
185#[derive(Debug, Clone)]
186pub struct OcrResult {
187    /// Extracted plain text, with original line breaks preserved.
188    pub text: String,
189    /// Mean per-word OCR confidence in `[0.0, 100.0]`. `None` when the page
190    /// contained no recognizable words.
191    pub mean_confidence: Option<f32>,
192    /// Number of words emitted with confidence `>= 0`.
193    pub word_count: usize,
194    /// OCR language tag used (e.g., `eng`).
195    pub lang: String,
196}
197
198impl OcrResult {
199    /// Build an OCR result from raw fields.
200    pub(crate) fn new(
201        text: String,
202        mean_confidence: Option<f32>,
203        word_count: usize,
204        lang: String,
205    ) -> Self {
206        Self {
207            text,
208            mean_confidence,
209            word_count,
210            lang,
211        }
212    }
213
214    /// Build an OCR result from flat spans using pixel y-position to recover
215    /// a conservative reading order.
216    pub fn from_spans(spans: &[OcrSpan], lang: String) -> Self {
217        Self::from_spans_with_column_detection(spans, lang, false).0
218    }
219
220    /// Build an OCR result from flat spans using the crate layout
221    /// post-processor. Returns the result plus the detected column count.
222    pub(crate) fn from_spans_with_column_detection(
223        spans: &[OcrSpan],
224        lang: String,
225        column_detection: bool,
226    ) -> (Self, u32) {
227        let ordered = crate::postprocess::order_spans(spans, column_detection);
228        let mut conf_sum = 0.0f64;
229        let mut conf_count = 0usize;
230        for span in spans {
231            if let Some(confidence) = span.confidence {
232                conf_sum += (confidence * 100.0) as f64;
233                conf_count += 1;
234            }
235        }
236        let mean_confidence = if conf_count == 0 {
237            None
238        } else {
239            Some((conf_sum / conf_count as f64) as f32)
240        };
241        (
242            Self {
243                text: ordered.text,
244                mean_confidence,
245                word_count: conf_count,
246                lang,
247            },
248            ordered.column_count,
249        )
250    }
251
252    /// Mean confidence normalized to `0.0..=1.0`.
253    pub(crate) fn mean_confidence_unit(&self) -> Option<f32> {
254        self.mean_confidence.map(|confidence| {
255            if confidence > 1.0 {
256                (confidence / 100.0).clamp(0.0, 1.0)
257            } else {
258                confidence.clamp(0.0, 1.0)
259            }
260        })
261    }
262}
263
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a 30x10 span with confidence 0.8 whose box starts at `(x, y)`.
    fn span_at(text: &str, x: u32, y: u32) -> OcrSpan {
        OcrSpan {
            text: text.to_string(),
            bbox: BBox { x, y, w: 30, h: 10 },
            confidence: Some(0.8),
        }
    }

    #[test]
    fn mean_confidence_unit_normalizes_legacy_percent_value() {
        let legacy = OcrResult::new("body".to_string(), Some(91.0), 1, "eng".to_string());
        assert_eq!(legacy.mean_confidence_unit(), Some(0.91));
    }

    #[test]
    fn from_spans_reports_detected_columns() {
        // Two rows, each with one span on the left and one far to the right.
        let spans = vec![
            span_at("A1", 10, 10),
            span_at("B1", 280, 10),
            span_at("A2", 10, 30),
            span_at("B2", 280, 30),
        ];

        let (result, columns) =
            OcrResult::from_spans_with_column_detection(&spans, "eng".to_string(), true);

        assert_eq!(columns, 2);
        assert_eq!(result.text, "A1\nA2\n\nB1\nB2");
        assert_eq!(result.mean_confidence_unit(), Some(0.8));
    }

    #[test]
    fn detect_image_format_accepts_supported_magic_bytes() {
        let cases: [(&[u8], &str, ImageFormat); 4] = [
            (&b"\x89PNG\r\n\x1A\nrest"[..], "png magic", ImageFormat::Png),
            (&b"\xFF\xD8\xFF\xE0rest"[..], "jpeg magic", ImageFormat::Jpeg),
            (
                &b"II\x2A\x00rest"[..],
                "little-endian tiff magic",
                ImageFormat::Tiff,
            ),
            (
                &b"MM\x00\x2Arest"[..],
                "big-endian tiff magic",
                ImageFormat::Tiff,
            ),
        ];

        for &(bytes, label, expected) in &cases {
            assert_eq!(detect_image_format(bytes).expect(label), expected);
        }
    }

    #[test]
    fn detect_image_format_rejects_unknown_bytes() {
        let err = detect_image_format(b"not an image").expect_err("unknown format fails");
        assert!(matches!(err, DocumentError::UnsupportedInput { .. }));
    }
}
352}