Skip to main content

gaze_document/ocr/
mod.rs

1//! OCR adapter contract surface and concrete backends.
2//!
3//! The [`OcrAdapter`] trait stays open for future backends (cloud OCR, Apple
4//! Vision, etc.). The shipping backend in v0.0.x is the Tesseract subprocess
5//! adapter under [`tesseract`].
6//!
7//! The trait itself remains fail-loud-default via [`PendingOcrAdapter`] so an
8//! adopter who builds a trait object without picking a backend gets a typed
9//! `NotImplemented` error instead of silent zero-detection output (Axis 1
10//! fail-closed).
11
12use crate::DocumentError;
13
14mod normalize;
15
16pub(crate) use normalize::normalize_ocr_artifacts;
17
18#[cfg(feature = "ocr-tesseract")]
19#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
20pub mod tesseract;
21
22#[cfg(feature = "ocr-tesseract")]
23#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
24pub use tesseract::TesseractOcr;
25
/// Outcome of one OCR pass: the recognized text plus a confidence summary.
///
/// The shape is backend-agnostic: concrete adapters (e.g.,
/// [`tesseract::TesseractOcr`]) produce values of this type so downstream
/// pipeline stages are not tied to a single OCR backend's surface.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct OcrResult {
    /// Plain text extracted from the input; original line breaks are kept.
    pub text: String,
    /// Average per-word OCR confidence, in `[0.0, 100.0]`. `None` when the
    /// page contained no recognizable words.
    pub mean_confidence: Option<f32>,
    /// Count of words emitted with confidence `>= 0`.
    pub word_count: usize,
    /// Language tag the OCR pass ran with (e.g., `eng`).
    pub lang: String,
}

impl OcrResult {
    /// Assemble an [`OcrResult`] from its four raw fields.
    ///
    /// Each argument is moved into the field of the same name unchanged; no
    /// validation or normalization happens here.
    pub fn new(
        text: String,
        mean_confidence: Option<f32>,
        word_count: usize,
        lang: String,
    ) -> Self {
        OcrResult {
            text,
            mean_confidence,
            word_count,
            lang,
        }
    }
}
61
/// Adapter contract for OCR backends.
///
/// Implementations live behind feature flags (e.g., `ocr-tesseract`) and
/// must round-trip page coordinates so the eventual `layout::ReadingOrder`
/// pass can stitch the result without losing source spans.
///
/// NOTE(review): the only method here returns a plain `String`, so it is not
/// evident from this trait how page coordinates are carried back to the
/// caller — confirm where that part of the contract is enforced.
pub trait OcrAdapter {
    /// Extract textual content from `bytes` (raw image / page payload).
    ///
    /// On success the recognized text is returned as an owned `String`.
    ///
    /// # Errors
    /// Implementations return [`DocumentError`] on backend failure.
    fn extract_text(&self, bytes: &[u8]) -> Result<String, DocumentError>;
}
74
/// Reserved fail-loud adapter.
///
/// Used as an explicit "no backend wired" sentinel. Every call returns
/// [`DocumentError::NotImplemented`] so accidental wiring is caught at the
/// call site (Axis 1 fail-closed).
///
/// `Debug` is derived so that a `Result<PendingOcrAdapter, _>` can be used
/// with `Result::unwrap_err` / `Result::expect_err`, both of which require
/// the `Ok` type to implement `Debug`.
#[derive(Debug)]
#[non_exhaustive]
pub struct PendingOcrAdapter {
    // Private unit field: blocks struct-literal construction from code that
    // cannot see this module's private items.
    _private: (),
}
84
85impl PendingOcrAdapter {
86    /// Build the fail-loud adapter.
87    ///
88    /// # Errors
89    /// Always returns [`DocumentError::NotImplemented`]. The placeholder
90    /// exists so adopters wiring a trait object without a concrete backend
91    /// fail at construction time rather than receiving silent zero-output
92    /// (Axis 1 fail-closed).
93    pub fn new() -> Result<Self, DocumentError> {
94        Err(DocumentError::NotImplemented(
95            "PendingOcrAdapter::new (wire a concrete OCR backend)",
96        ))
97    }
98}
99
100impl OcrAdapter for PendingOcrAdapter {
101    fn extract_text(&self, _bytes: &[u8]) -> Result<String, DocumentError> {
102        Err(DocumentError::NotImplemented(
103            "PendingOcrAdapter::extract_text (wire a concrete OCR backend)",
104        ))
105    }
106}