Skip to main content

gaze_document/
lib.rs

1//! Document ingestion + safe-bundle generation for the Gaze runtime.
2//!
3//! `gaze-document` turns a single image (PNG / JPG) or single-page PDF into a
4//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
5//! structured OCR + PII [`BundleReport`]. PII detection flows through the
6//! standard [`gaze::Pipeline`] so the manifest stays canonical and reversible
7//! (Axis 2 reversibility).
8//!
9//! # Quickstart
10//!
11//! ```no_run
12//! use std::path::Path;
13//!
14//! let bundle = gaze_document::clean(
15//!     Path::new("invoice.pdf"),
16//!     Path::new("./safe-out"),
17//! )?;
18//! assert!(!bundle.clean_markdown.is_empty());
19//! # Ok::<(), gaze_document::DocumentError>(())
20//! ```
21//!
22//! # Runtime requirements
23//!
24//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
25//! * For PDF input: a pdfium dynamic library available to the process. See
26//!   the crate README for per-OS install instructions.
27//!
28//! # Feature flags
29//!
30//! | Flag             | Default | What it enables                                            |
31//! |------------------|---------|------------------------------------------------------------|
32//! | `ocr-tesseract`  | yes     | Tesseract subprocess OCR backend.                          |
33//! | `pdf-input`      | yes     | Single-page PDF rasterization via `pdfium-render`.         |
34//! | `serde`          | yes     | `Serialize` / `Deserialize` for [`BundleReport`].          |
35//! | `extract-docling`| no      | Reserved — future Docling layout adapter (no impl yet).    |
36//! | `render-image`   | no      | Reserved — future redacted-preview renderer (no impl yet). |
37
38#![forbid(unsafe_code)]
39#![deny(missing_docs)]
40#![cfg_attr(docsrs, feature(doc_cfg))]
41
42pub mod bundle;
43pub mod extract;
44pub mod layout;
45#[cfg(feature = "mcp")]
46#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
47pub mod mcp;
48pub mod ocr;
49pub mod render;
50
51#[cfg(feature = "ocr-tesseract")]
52#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
53pub use bundle::clean;
54pub use bundle::{BundleReport, ClassCount, LayoutSummary, SafeBundle, BUNDLE_VERSION};
55pub use layout::ReadingOrder;
56pub use ocr::OcrAdapter;
57pub use render::Renderer;
58
/// Crate-level error type for `gaze-document`.
///
/// Fail-closed by construction (Axis 1 reliability): every variant describes
/// a specific, recoverable surface so adopters can branch on cause without
/// matching opaque strings.
///
/// # Matching
///
/// The enum is `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching on it, because further variants may be added.
///
/// # Error chaining
///
/// Only [`DocumentError::Io`], [`DocumentError::OutputDir`] and
/// [`DocumentError::Serde`] carry an underlying error object and report it
/// through [`std::error::Error::source`]; every other variant holds its
/// detail as text and has no source.
///
/// # Conversions
///
/// `From<std::io::Error>` and `From<serde_json::Error>` are implemented, so
/// both error types can be propagated with `?` and land in
/// [`DocumentError::Io`] and [`DocumentError::Serde`] respectively.
#[non_exhaustive]
#[derive(Debug)]
pub enum DocumentError {
    /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint.
    TesseractNotFound(String),
    /// Tesseract returned a non-zero exit status. The payload carries the
    /// captured stderr (truncated) so adopters can surface it.
    TesseractFailed {
        /// Exit status reported by the OS.
        status: i32,
        /// Captured stderr (truncated to keep error payloads bounded).
        stderr: String,
    },
    /// pdfium dynamic library could not be loaded. The payload is a per-OS
    /// install hint.
    PdfiumNotFound(String),
    /// pdfium reported an error while parsing or rasterizing a PDF. The
    /// payload is a textual description of the failure.
    PdfRasterFailed(String),
    /// Input file format is not supported by the current build (e.g. PDF
    /// input without the `pdf-input` feature).
    UnsupportedInput {
        /// Path that was rejected.
        path: std::path::PathBuf,
        /// Reason the input was rejected.
        reason: &'static str,
    },
    /// An I/O error while reading the input or writing the bundle. The
    /// wrapped error is also available through `source()`.
    Io(std::io::Error),
    /// The bundle output directory could not be prepared. Carries the
    /// directory path and the underlying I/O error (also available through
    /// `source()`).
    OutputDir(std::path::PathBuf, std::io::Error),
    /// `gaze::Pipeline` construction or invocation failed. The payload is a
    /// textual description of the failure.
    Pipeline(String),
    /// `serde_json` serialization of the bundle report or manifest failed.
    /// The wrapped error is also available through `source()`.
    Serde(serde_json::Error),
    /// The requested operation is part of the public contract but has no
    /// implementation yet. Returned by reserved stubs (`Renderer` trait,
    /// `ReadingOrder`) until follow-up PRs land. The payload names the
    /// missing operation.
    NotImplemented(&'static str),
}
103
104impl core::fmt::Display for DocumentError {
105    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
106        match self {
107            Self::TesseractNotFound(hint) => write!(
108                f,
109                "gaze-document: tesseract binary not found on PATH. {hint}"
110            ),
111            Self::TesseractFailed { status, stderr } => write!(
112                f,
113                "gaze-document: tesseract exited with status {status}: {stderr}"
114            ),
115            Self::PdfiumNotFound(hint) => {
116                write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
117            }
118            Self::PdfRasterFailed(detail) => {
119                write!(f, "gaze-document: pdf rasterization failed: {detail}")
120            }
121            Self::UnsupportedInput { path, reason } => write!(
122                f,
123                "gaze-document: unsupported input `{}`: {reason}",
124                path.display()
125            ),
126            Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
127            Self::OutputDir(path, err) => write!(
128                f,
129                "gaze-document: cannot prepare output dir `{}`: {err}",
130                path.display()
131            ),
132            Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
133            Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
134            Self::NotImplemented(what) => {
135                write!(f, "gaze-document: {what} is not yet implemented")
136            }
137        }
138    }
139}
140
141impl std::error::Error for DocumentError {
142    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
143        match self {
144            Self::Io(err) | Self::OutputDir(_, err) => Some(err),
145            Self::Serde(err) => Some(err),
146            _ => None,
147        }
148    }
149}
150
151impl From<std::io::Error> for DocumentError {
152    fn from(err: std::io::Error) -> Self {
153        Self::Io(err)
154    }
155}
156
157impl From<serde_json::Error> for DocumentError {
158    fn from(err: serde_json::Error) -> Self {
159        Self::Serde(err)
160    }
161}
162
#[cfg(test)]
mod tests {
    use super::DocumentError;

    /// Smoke test: the crate builds and the `NotImplemented` variant renders
    /// its fixed "not yet implemented" wording.
    #[test]
    fn crate_compiles_and_error_renders() {
        let rendered = DocumentError::NotImplemented("smoke").to_string();
        assert!(rendered.contains("not yet implemented"));
    }

    /// The install hint carried by `TesseractNotFound` must appear in the
    /// rendered message alongside the mention of tesseract itself.
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let hint = String::from("Install via `brew install tesseract`.");
        let rendered = DocumentError::TesseractNotFound(hint).to_string();
        for needle in ["tesseract", "brew install"] {
            assert!(rendered.contains(needle));
        }
    }
}