// gaze_document/lib.rs

//! Document ingestion + safe-bundle generation for the Gaze runtime.
//!
//! `gaze-document` turns a single image (PNG / JPG) or PDF into a
//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
//! structured OCR + PII [`BundleReport`]. Agent-safe files and owner-only
//! restore material are written to separate output directories. PII detection
//! flows through the standard [`gaze::Pipeline`] so the manifest stays
//! canonical and reversible (Axis 2 reversibility).
//!
//! # Quickstart
//!
//! ```no_run
//! use std::path::Path;
//!
//! let bundle = gaze_document::clean(
//!     Path::new("invoice.pdf"),
//!     gaze_document::AgentBundleDir::new("./agent-out")?,
//!     gaze_document::OwnerBundleDir::new("./owner-out")?,
//! )?;
//! assert!(!bundle.clean_markdown.is_empty());
//! # Ok::<(), gaze_document::DocumentError>(())
//! ```
//!
//! # Runtime requirements
//!
//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
//! * For PDF input: a pdfium dynamic library available to the process. See
//!   the crate README for per-OS install instructions.
//!
//! # Feature flags
//!
//! | Flag              | Default | What it enables                                                |
//! |-------------------|---------|----------------------------------------------------------------|
//! | `ocr-tesseract`   | yes     | Tesseract subprocess OCR backend.                              |
//! | `pdf-input`       | yes     | PDF text extraction + raster OCR fallback via `pdfium-render`. |
//! | `serde`           | yes     | `Serialize` / `Deserialize` for [`BundleReport`].              |
//! | `extract-docling` | no      | Reserved — future Docling layout adapter (no impl yet).        |
//! | `render-image`    | no      | Reserved — future redacted-preview renderer (no impl yet).     |
//!
//! The `mcp` module is compiled only when the `mcp` feature is enabled.

40#![forbid(unsafe_code)]
41#![deny(missing_docs)]
42#![cfg_attr(docsrs, feature(doc_cfg))]
43
44pub mod bundle;
45pub mod extract;
46pub mod layout;
47#[cfg(feature = "mcp")]
48#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
49pub mod mcp;
50pub mod ocr;
51mod postprocess;
52mod preprocess;
53pub mod render;
54
55#[cfg(feature = "ocr-tesseract")]
56#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
57pub use bundle::{clean, clean_with_ocr_backend};
58pub use bundle::{
59    AgentBundleDir, BundleReport, ClassCount, LayoutSummary, OcrSource, OwnerBundleDir, PageReport,
60    Pipeline, SafeBundle, BUNDLE_VERSION,
61};
62pub use layout::ReadingOrder;
63#[cfg(feature = "ocr-tesseract")]
64#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
65pub use ocr::TesseractBackend;
66pub use ocr::{
67    detect_image_format, BBox, ImageFormat, ImageInput, LanguageTag, OcrBackend, OcrError,
68    OcrHints, OcrSpan,
69};
70pub use render::Renderer;
71
72/// Crate-level error type for `gaze-document`.
73///
74/// Fail-closed by construction (Axis 1 reliability): every variant describes
75/// a specific, recoverable surface so adopters can branch on cause without
76/// matching opaque strings.
77#[non_exhaustive]
78#[derive(Debug)]
79pub enum DocumentError {
80    /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint.
81    TesseractNotFound(String),
82    /// Tesseract returned a non-zero exit status. The payload carries the
83    /// captured stderr (truncated) so adopters can surface it.
84    TesseractFailed {
85        /// Exit status reported by the OS.
86        status: i32,
87        /// Captured stderr (truncated to keep error payloads bounded).
88        stderr: String,
89    },
90    /// pdfium dynamic library could not be loaded. The payload is a per-OS
91    /// install hint.
92    PdfiumNotFound(String),
93    /// pdfium reported an error while parsing or rasterizing a PDF.
94    PdfRasterFailed(String),
95    /// Input file format is not supported by the current build (e.g. PDF
96    /// input without the `pdf-input` feature).
97    UnsupportedInput {
98        /// Path that was rejected.
99        path: std::path::PathBuf,
100        /// Reason the input was rejected.
101        reason: &'static str,
102    },
103    /// An I/O error while reading the input or writing the bundle.
104    Io(std::io::Error),
105    /// The bundle output directory could not be prepared.
106    OutputDir(std::path::PathBuf, std::io::Error),
107    /// The requested agent/owner bundle directory pair violates the runtime
108    /// partition contract.
109    BundleLayoutInvalid {
110        /// Machine-readable reason for the layout rejection.
111        reason: BundleLayoutInvalidReason,
112    },
113    /// `gaze::Pipeline` construction or invocation failed.
114    Pipeline(String),
115    /// `serde_json` serialization of the bundle report or manifest failed.
116    Serde(serde_json::Error),
117    /// The requested operation is part of the public contract but has no
118    /// implementation yet. Returned by reserved stubs (Renderer trait,
119    /// ReadingOrder) until follow-up PRs land.
120    NotImplemented(&'static str),
121}
122
/// Machine-readable reasons a SafeBundle agent/owner output layout is rejected.
///
/// Carried by [`DocumentError::BundleLayoutInvalid`]. The enum is
/// `#[non_exhaustive]`, so `match` expressions outside this crate need a
/// wildcard arm.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BundleLayoutInvalidReason {
    /// Agent and owner outputs resolve to the same directory.
    AgentEqualsOwner,
    /// The agent output directory is nested inside the owner output directory.
    AgentNestedInOwner,
    /// The owner output directory is nested inside the agent output directory.
    OwnerNestedInAgent,
    /// One output path was empty and cannot name a directory.
    EmptyPath,
}

137impl core::fmt::Display for DocumentError {
138    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
139        match self {
140            Self::TesseractNotFound(hint) => write!(
141                f,
142                "gaze-document: tesseract binary not found on PATH. {hint}"
143            ),
144            Self::TesseractFailed { status, stderr } => write!(
145                f,
146                "gaze-document: tesseract exited with status {status}: {stderr}"
147            ),
148            Self::PdfiumNotFound(hint) => {
149                write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
150            }
151            Self::PdfRasterFailed(detail) => {
152                write!(f, "gaze-document: pdf rasterization failed: {detail}")
153            }
154            Self::UnsupportedInput { path, reason } => write!(
155                f,
156                "gaze-document: unsupported input `{}`: {reason}",
157                path.display()
158            ),
159            Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
160            Self::OutputDir(path, err) => write!(
161                f,
162                "gaze-document: cannot prepare output dir `{}`: {err}",
163                path.display()
164            ),
165            Self::BundleLayoutInvalid { reason } => {
166                write!(f, "gaze-document: invalid bundle layout: {reason}")
167            }
168            Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
169            Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
170            Self::NotImplemented(what) => {
171                write!(f, "gaze-document: {what} is not yet implemented")
172            }
173        }
174    }
175}
176
177impl core::fmt::Display for BundleLayoutInvalidReason {
178    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
179        match self {
180            Self::AgentEqualsOwner => write!(f, "agent and owner output directories are equal"),
181            Self::AgentNestedInOwner => {
182                write!(
183                    f,
184                    "agent output directory is nested inside owner output directory"
185                )
186            }
187            Self::OwnerNestedInAgent => {
188                write!(
189                    f,
190                    "owner output directory is nested inside agent output directory"
191                )
192            }
193            Self::EmptyPath => write!(f, "bundle output directory path is empty"),
194        }
195    }
196}
197
198impl std::error::Error for DocumentError {
199    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
200        match self {
201            Self::Io(err) | Self::OutputDir(_, err) => Some(err),
202            Self::Serde(err) => Some(err),
203            _ => None,
204        }
205    }
206}
207
208impl From<std::io::Error> for DocumentError {
209    fn from(err: std::io::Error) -> Self {
210        Self::Io(err)
211    }
212}
213
214impl From<serde_json::Error> for DocumentError {
215    fn from(err: serde_json::Error) -> Self {
216        Self::Serde(err)
217    }
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: the crate builds and `NotImplemented` renders its message.
    #[test]
    fn crate_compiles_and_error_renders() {
        let err = DocumentError::NotImplemented("smoke");
        assert!(err.to_string().contains("not yet implemented"));
    }

    /// The install hint payload must reach the rendered message.
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let err = DocumentError::TesseractNotFound("Install via `brew install tesseract`.".into());
        let msg = err.to_string();
        assert!(msg.contains("tesseract"));
        assert!(msg.contains("brew install"));
    }

    /// The layout reason's own `Display` text is embedded in the error message.
    #[test]
    fn bundle_layout_invalid_renders_reason() {
        let err = DocumentError::BundleLayoutInvalid {
            reason: BundleLayoutInvalidReason::EmptyPath,
        };
        let msg = err.to_string();
        assert!(msg.contains("invalid bundle layout"));
        assert!(msg.contains("bundle output directory path is empty"));
    }

    /// Wrapped I/O errors are reachable via `source()`; payload-only variants
    /// report no source.
    #[test]
    fn source_is_exposed_only_for_wrapping_variants() {
        use std::error::Error as _;

        let io = std::io::Error::new(std::io::ErrorKind::Other, "boom");
        let err = DocumentError::from(io);
        assert!(matches!(err, DocumentError::Io(_)));
        assert!(err.source().is_some());
        assert!(DocumentError::NotImplemented("x").source().is_none());
    }
}