gaze_document/lib.rs
1//! Document ingestion + safe-bundle generation for the Gaze runtime.
2//!
3//! `gaze-document` turns a single image (PNG / JPG) or PDF into a
4//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
5//! structured OCR + PII [`BundleReport`]. Agent-safe files and owner-only
6//! restore material are written to separate output directories. PII detection
7//! flows through the standard [`gaze::Pipeline`] so the manifest stays
8//! canonical and reversible (Axis 2 reversibility).
9//!
10//! # Quickstart
11//!
//! ```no_run
//! use std::path::Path;
//!
//! let bundle = gaze_document::clean(
//!     Path::new("invoice.pdf"),
//!     gaze_document::AgentBundleDir::new("./agent-out")?,
//!     gaze_document::OwnerBundleDir::new("./owner-out")?,
//! )?;
//! assert!(!bundle.clean_markdown.is_empty());
//! # Ok::<(), gaze_document::DocumentError>(())
//! ```
23//!
24//! # Runtime requirements
25//!
26//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
27//! * For PDF input: a pdfium dynamic library available to the process. See
28//! the crate README for per-OS install instructions.
29//!
30//! # Feature flags
31//!
//! | Flag              | Default | What it enables                                                |
//! |-------------------|---------|----------------------------------------------------------------|
//! | `ocr-tesseract`   | yes     | Tesseract subprocess OCR backend.                              |
//! | `pdf-input`       | yes     | PDF text extraction + raster OCR fallback via `pdfium-render`. |
//! | `serde`           | yes     | `Serialize` / `Deserialize` for [`BundleReport`].              |
//! | `extract-docling` | no      | Reserved — future Docling layout adapter (no impl yet).        |
//! | `render-image`    | no      | Reserved — future redacted-preview renderer (no impl yet).     |
//!
//! The `mcp` module is additionally compiled only when the `mcp` feature is
//! enabled.
39
40#![forbid(unsafe_code)]
41#![deny(missing_docs)]
42#![cfg_attr(docsrs, feature(doc_cfg))]
43
44pub mod bundle;
45pub mod extract;
46pub mod layout;
47#[cfg(feature = "mcp")]
48#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
49pub mod mcp;
50pub mod ocr;
51mod postprocess;
52mod preprocess;
53pub mod render;
54
55#[cfg(feature = "ocr-tesseract")]
56#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
57pub use bundle::{clean, clean_with_ocr_backend};
58pub use bundle::{
59 AgentBundleDir, BundleReport, ClassCount, LayoutSummary, OcrSource, OwnerBundleDir, PageReport,
60 Pipeline, SafeBundle, BUNDLE_VERSION,
61};
62pub use layout::ReadingOrder;
63#[cfg(feature = "ocr-tesseract")]
64#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
65pub use ocr::TesseractBackend;
66pub use ocr::{
67 detect_image_format, BBox, ImageFormat, ImageInput, LanguageTag, OcrBackend, OcrError,
68 OcrHints, OcrSpan,
69};
70pub use render::Renderer;
71
72/// Crate-level error type for `gaze-document`.
73///
74/// Fail-closed by construction (Axis 1 reliability): every variant describes
75/// a specific, recoverable surface so adopters can branch on cause without
76/// matching opaque strings.
77#[non_exhaustive]
78#[derive(Debug)]
79pub enum DocumentError {
80 /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint.
81 TesseractNotFound(String),
82 /// Tesseract returned a non-zero exit status. The payload carries the
83 /// captured stderr (truncated) so adopters can surface it.
84 TesseractFailed {
85 /// Exit status reported by the OS.
86 status: i32,
87 /// Captured stderr (truncated to keep error payloads bounded).
88 stderr: String,
89 },
90 /// pdfium dynamic library could not be loaded. The payload is a per-OS
91 /// install hint.
92 PdfiumNotFound(String),
93 /// pdfium reported an error while parsing or rasterizing a PDF.
94 PdfRasterFailed(String),
95 /// Input file format is not supported by the current build (e.g. PDF
96 /// input without the `pdf-input` feature).
97 UnsupportedInput {
98 /// Path that was rejected.
99 path: std::path::PathBuf,
100 /// Reason the input was rejected.
101 reason: &'static str,
102 },
103 /// An I/O error while reading the input or writing the bundle.
104 Io(std::io::Error),
105 /// The bundle output directory could not be prepared.
106 OutputDir(std::path::PathBuf, std::io::Error),
107 /// The requested agent/owner bundle directory pair violates the runtime
108 /// partition contract.
109 BundleLayoutInvalid {
110 /// Machine-readable reason for the layout rejection.
111 reason: BundleLayoutInvalidReason,
112 },
113 /// `gaze::Pipeline` construction or invocation failed.
114 Pipeline(String),
115 /// `serde_json` serialization of the bundle report or manifest failed.
116 Serde(serde_json::Error),
117 /// The requested operation is part of the public contract but has no
118 /// implementation yet. Returned by reserved stubs (Renderer trait,
119 /// ReadingOrder) until follow-up PRs land.
120 NotImplemented(&'static str),
121}
122
/// Machine-readable reasons for rejecting a SafeBundle agent/owner output
/// directory layout.
///
/// Carried by [`DocumentError::BundleLayoutInvalid`]. The enum is
/// `#[non_exhaustive]`, so matches outside this crate need a wildcard arm.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BundleLayoutInvalidReason {
    /// The agent and owner outputs resolve to one and the same directory.
    AgentEqualsOwner,
    /// The agent output directory sits inside the owner output directory.
    AgentNestedInOwner,
    /// The owner output directory sits inside the agent output directory.
    OwnerNestedInAgent,
    /// An output path was empty, so it cannot name a directory.
    EmptyPath,
}
136
137impl core::fmt::Display for DocumentError {
138 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
139 match self {
140 Self::TesseractNotFound(hint) => write!(
141 f,
142 "gaze-document: tesseract binary not found on PATH. {hint}"
143 ),
144 Self::TesseractFailed { status, stderr } => write!(
145 f,
146 "gaze-document: tesseract exited with status {status}: {stderr}"
147 ),
148 Self::PdfiumNotFound(hint) => {
149 write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
150 }
151 Self::PdfRasterFailed(detail) => {
152 write!(f, "gaze-document: pdf rasterization failed: {detail}")
153 }
154 Self::UnsupportedInput { path, reason } => write!(
155 f,
156 "gaze-document: unsupported input `{}`: {reason}",
157 path.display()
158 ),
159 Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
160 Self::OutputDir(path, err) => write!(
161 f,
162 "gaze-document: cannot prepare output dir `{}`: {err}",
163 path.display()
164 ),
165 Self::BundleLayoutInvalid { reason } => {
166 write!(f, "gaze-document: invalid bundle layout: {reason}")
167 }
168 Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
169 Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
170 Self::NotImplemented(what) => {
171 write!(f, "gaze-document: {what} is not yet implemented")
172 }
173 }
174 }
175}
176
177impl core::fmt::Display for BundleLayoutInvalidReason {
178 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
179 match self {
180 Self::AgentEqualsOwner => write!(f, "agent and owner output directories are equal"),
181 Self::AgentNestedInOwner => {
182 write!(
183 f,
184 "agent output directory is nested inside owner output directory"
185 )
186 }
187 Self::OwnerNestedInAgent => {
188 write!(
189 f,
190 "owner output directory is nested inside agent output directory"
191 )
192 }
193 Self::EmptyPath => write!(f, "bundle output directory path is empty"),
194 }
195 }
196}
197
198impl std::error::Error for DocumentError {
199 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
200 match self {
201 Self::Io(err) | Self::OutputDir(_, err) => Some(err),
202 Self::Serde(err) => Some(err),
203 _ => None,
204 }
205 }
206}
207
208impl From<std::io::Error> for DocumentError {
209 fn from(err: std::io::Error) -> Self {
210 Self::Io(err)
211 }
212}
213
214impl From<serde_json::Error> for DocumentError {
215 fn from(err: serde_json::Error) -> Self {
216 Self::Serde(err)
217 }
218}
219
#[cfg(test)]
mod tests {
    use super::DocumentError;

    /// Smoke test: the crate builds and a simple variant renders its message.
    #[test]
    fn crate_compiles_and_error_renders() {
        let rendered = DocumentError::NotImplemented("smoke").to_string();
        assert!(rendered.contains("not yet implemented"));
    }

    /// The install hint carried by `TesseractNotFound` must appear in the
    /// rendered message alongside the fixed description.
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let hint = String::from("Install via `brew install tesseract`.");
        let rendered = DocumentError::TesseractNotFound(hint).to_string();
        for needle in ["tesseract", "brew install"] {
            assert!(rendered.contains(needle));
        }
    }
}