Skip to main content

gaze_document/
lib.rs

1//! Document ingestion + safe-bundle generation for the Gaze runtime.
2//!
3//! `gaze-document` turns a single image (PNG / JPG) or PDF into a
4//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
5//! structured OCR + PII [`BundleReport`]. PII detection flows through the
6//! standard [`gaze::Pipeline`] so the manifest stays canonical and reversible
7//! (Axis 2 reversibility).
8//!
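//! # Quickstart
//!
//! [`clean`] is only exported when the `ocr-tesseract` feature is enabled
//! (it is on by default).
//!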
11//! ```no_run
12//! use std::path::Path;
13//!
14//! let bundle = gaze_document::clean(
15//!     Path::new("invoice.pdf"),
16//!     Path::new("./safe-out"),
17//! )?;
18//! assert!(!bundle.clean_markdown.is_empty());
19//! # Ok::<(), gaze_document::DocumentError>(())
20//! ```
21//!
22//! # Runtime requirements
23//!
24//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
25//! * For PDF input: a pdfium dynamic library available to the process. See
26//!   the crate README for per-OS install instructions.
27//!
28//! # Feature flags
29//!
30//! | Flag             | Default | What it enables                                            |
31//! |------------------|---------|------------------------------------------------------------|
32//! | `ocr-tesseract`  | yes     | Tesseract subprocess OCR backend.                          |
33//! | `pdf-input`      | yes     | PDF text extraction + raster OCR fallback via `pdfium-render`. |
34//! | `serde`          | yes     | `Serialize` / `Deserialize` for [`BundleReport`].          |
35//! | `extract-docling`| no      | Reserved — future Docling layout adapter (no impl yet).    |
36//! | `render-image`   | no      | Reserved — future redacted-preview renderer (no impl yet). |
37
38#![forbid(unsafe_code)]
39#![deny(missing_docs)]
40#![cfg_attr(docsrs, feature(doc_cfg))]
41
42pub mod bundle;
43pub mod extract;
44pub mod layout;
45#[cfg(feature = "mcp")]
46#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
47pub mod mcp;
48pub mod ocr;
49mod postprocess;
50mod preprocess;
51pub mod render;
52
53#[cfg(feature = "ocr-tesseract")]
54#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
55pub use bundle::{clean, clean_with_ocr_backend};
56pub use bundle::{
57    BundleReport, ClassCount, LayoutSummary, OcrSource, PageReport, Pipeline, SafeBundle,
58    BUNDLE_VERSION,
59};
60pub use layout::ReadingOrder;
61#[cfg(feature = "ocr-tesseract")]
62#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
63pub use ocr::TesseractBackend;
64pub use ocr::{
65    detect_image_format, BBox, ImageFormat, ImageInput, LanguageTag, OcrBackend, OcrError,
66    OcrHints, OcrSpan,
67};
68pub use render::Renderer;
69
/// Crate-level error type for `gaze-document`.
///
/// Fail-closed by construction (Axis 1 reliability): every variant describes
/// a specific, recoverable surface so adopters can branch on cause without
/// matching opaque strings.
///
/// The enum is `#[non_exhaustive]`, so `match` expressions written outside
/// this crate need a wildcard arm.
///
/// `From` conversions are provided for [`std::io::Error`] (into
/// [`DocumentError::Io`]) and `serde_json::Error` (into
/// [`DocumentError::Serde`]), so both propagate with `?`. The
/// [`std::error::Error::source`] implementation exposes the wrapped error for
/// the `Io`, `OutputDir` and `Serde` variants and returns `None` for the rest.
/// Every `Display` rendering starts with the `gaze-document: ` prefix.
#[non_exhaustive]
#[derive(Debug)]
pub enum DocumentError {
    /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint
    /// that is appended verbatim to the rendered message.
    TesseractNotFound(String),
    /// Tesseract returned a non-zero exit status. The payload carries the
    /// captured stderr (truncated) so adopters can surface it.
    TesseractFailed {
        /// Exit status reported by the OS.
        status: i32,
        /// Captured stderr (truncated to keep error payloads bounded).
        stderr: String,
    },
    /// pdfium dynamic library could not be loaded. The payload is a per-OS
    /// install hint that is appended verbatim to the rendered message.
    PdfiumNotFound(String),
    /// pdfium reported an error while parsing or rasterizing a PDF. The
    /// payload is a free-form detail string.
    PdfRasterFailed(String),
    /// Input file format is not supported by the current build (e.g. PDF
    /// input without the `pdf-input` feature).
    UnsupportedInput {
        /// Path that was rejected.
        path: std::path::PathBuf,
        /// Reason the input was rejected. A `'static` string, so it is a
        /// fixed message rather than one built at runtime.
        reason: &'static str,
    },
    /// An I/O error while reading the input or writing the bundle. Also the
    /// target of the `From<std::io::Error>` conversion.
    Io(std::io::Error),
    /// The bundle output directory could not be prepared. Carries the
    /// directory path and the underlying I/O error, in that order.
    OutputDir(std::path::PathBuf, std::io::Error),
    /// `gaze::Pipeline` construction or invocation failed. The cause is
    /// carried as a string, so no `source()` is reported for this variant.
    Pipeline(String),
    /// `serde_json` serialization of the bundle report or manifest failed.
    /// Also the target of the `From<serde_json::Error>` conversion.
    Serde(serde_json::Error),
    /// The requested operation is part of the public contract but has no
    /// implementation yet. Returned by reserved stubs (Renderer trait,
    /// ReadingOrder) until follow-up PRs land. The payload names the
    /// missing operation.
    NotImplemented(&'static str),
}
114
115impl core::fmt::Display for DocumentError {
116    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
117        match self {
118            Self::TesseractNotFound(hint) => write!(
119                f,
120                "gaze-document: tesseract binary not found on PATH. {hint}"
121            ),
122            Self::TesseractFailed { status, stderr } => write!(
123                f,
124                "gaze-document: tesseract exited with status {status}: {stderr}"
125            ),
126            Self::PdfiumNotFound(hint) => {
127                write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
128            }
129            Self::PdfRasterFailed(detail) => {
130                write!(f, "gaze-document: pdf rasterization failed: {detail}")
131            }
132            Self::UnsupportedInput { path, reason } => write!(
133                f,
134                "gaze-document: unsupported input `{}`: {reason}",
135                path.display()
136            ),
137            Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
138            Self::OutputDir(path, err) => write!(
139                f,
140                "gaze-document: cannot prepare output dir `{}`: {err}",
141                path.display()
142            ),
143            Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
144            Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
145            Self::NotImplemented(what) => {
146                write!(f, "gaze-document: {what} is not yet implemented")
147            }
148        }
149    }
150}
151
152impl std::error::Error for DocumentError {
153    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
154        match self {
155            Self::Io(err) | Self::OutputDir(_, err) => Some(err),
156            Self::Serde(err) => Some(err),
157            _ => None,
158        }
159    }
160}
161
162impl From<std::io::Error> for DocumentError {
163    fn from(err: std::io::Error) -> Self {
164        Self::Io(err)
165    }
166}
167
168impl From<serde_json::Error> for DocumentError {
169    fn from(err: serde_json::Error) -> Self {
170        Self::Serde(err)
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::DocumentError;

    /// Smoke test: the crate builds and a simple variant renders its message.
    #[test]
    fn crate_compiles_and_error_renders() {
        let rendered = DocumentError::NotImplemented("smoke").to_string();
        assert!(rendered.contains("not yet implemented"));
    }

    /// The install hint passed to `TesseractNotFound` must appear in the
    /// rendered message alongside the fixed "tesseract" wording.
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let hint = String::from("Install via `brew install tesseract`.");
        let rendered = DocumentError::TesseractNotFound(hint).to_string();
        for needle in ["tesseract", "brew install"] {
            assert!(rendered.contains(needle));
        }
    }
}