gaze_document/lib.rs
1//! Document ingestion + safe-bundle generation for the Gaze runtime.
2//!
3//! `gaze-document` turns a single image (PNG / JPG) or single-page PDF into a
4//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
5//! structured OCR + PII [`BundleReport`]. PII detection flows through the
6//! standard [`gaze::Pipeline`] so the manifest stays canonical and reversible
7//! (Axis 2 reversibility).
8//!
9//! # Quickstart
10//!
11//! ```no_run
12//! use std::path::Path;
13//!
14//! let bundle = gaze_document::clean(
15//! Path::new("invoice.pdf"),
16//! Path::new("./safe-out"),
17//! )?;
18//! assert!(!bundle.clean_markdown.is_empty());
19//! # Ok::<(), gaze_document::DocumentError>(())
20//! ```
21//!
22//! # Runtime requirements
23//!
24//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
25//! * For PDF input: a pdfium dynamic library available to the process. See
26//! the crate README for per-OS install instructions.
27//!
28//! # Feature flags
29//!
30//! | Flag | Default | What it enables |
31//! |------------------|---------|------------------------------------------------------------|
32//! | `ocr-tesseract` | yes | Tesseract subprocess OCR backend. |
33//! | `pdf-input` | yes | Single-page PDF rasterization via `pdfium-render`. |
34//! | `serde` | yes | `Serialize` / `Deserialize` for [`BundleReport`]. |
35//! | `extract-docling`| no | Reserved — future Docling layout adapter (no impl yet). |
36//! | `render-image` | no | Reserved — future redacted-preview renderer (no impl yet). |
37
38#![forbid(unsafe_code)]
39#![deny(missing_docs)]
40#![cfg_attr(docsrs, feature(doc_cfg))]
41
42pub mod bundle;
43pub mod extract;
44pub mod layout;
45#[cfg(feature = "mcp")]
46#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
47pub mod mcp;
48pub mod ocr;
49pub mod render;
50
51#[cfg(feature = "ocr-tesseract")]
52#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
53pub use bundle::clean;
54pub use bundle::{BundleReport, ClassCount, LayoutSummary, SafeBundle, BUNDLE_VERSION};
55pub use layout::ReadingOrder;
56pub use ocr::OcrAdapter;
57pub use render::Renderer;
58
59/// Crate-level error type for `gaze-document`.
60///
61/// Fail-closed by construction (Axis 1 reliability): every variant describes
62/// a specific, recoverable surface so adopters can branch on cause without
63/// matching opaque strings.
64#[non_exhaustive]
65#[derive(Debug)]
66pub enum DocumentError {
67 /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint.
68 TesseractNotFound(String),
69 /// Tesseract returned a non-zero exit status. The payload carries the
70 /// captured stderr (truncated) so adopters can surface it.
71 TesseractFailed {
72 /// Exit status reported by the OS.
73 status: i32,
74 /// Captured stderr (truncated to keep error payloads bounded).
75 stderr: String,
76 },
77 /// pdfium dynamic library could not be loaded. The payload is a per-OS
78 /// install hint.
79 PdfiumNotFound(String),
80 /// pdfium reported an error while parsing or rasterizing a PDF.
81 PdfRasterFailed(String),
82 /// Input file format is not supported by the current build (e.g. PDF
83 /// input without the `pdf-input` feature).
84 UnsupportedInput {
85 /// Path that was rejected.
86 path: std::path::PathBuf,
87 /// Reason the input was rejected.
88 reason: &'static str,
89 },
90 /// An I/O error while reading the input or writing the bundle.
91 Io(std::io::Error),
92 /// The bundle output directory could not be prepared.
93 OutputDir(std::path::PathBuf, std::io::Error),
94 /// `gaze::Pipeline` construction or invocation failed.
95 Pipeline(String),
96 /// `serde_json` serialization of the bundle report or manifest failed.
97 Serde(serde_json::Error),
98 /// The requested operation is part of the public contract but has no
99 /// implementation yet. Returned by reserved stubs (Renderer trait,
100 /// ReadingOrder) until follow-up PRs land.
101 NotImplemented(&'static str),
102}
103
104impl core::fmt::Display for DocumentError {
105 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
106 match self {
107 Self::TesseractNotFound(hint) => write!(
108 f,
109 "gaze-document: tesseract binary not found on PATH. {hint}"
110 ),
111 Self::TesseractFailed { status, stderr } => write!(
112 f,
113 "gaze-document: tesseract exited with status {status}: {stderr}"
114 ),
115 Self::PdfiumNotFound(hint) => {
116 write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
117 }
118 Self::PdfRasterFailed(detail) => {
119 write!(f, "gaze-document: pdf rasterization failed: {detail}")
120 }
121 Self::UnsupportedInput { path, reason } => write!(
122 f,
123 "gaze-document: unsupported input `{}`: {reason}",
124 path.display()
125 ),
126 Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
127 Self::OutputDir(path, err) => write!(
128 f,
129 "gaze-document: cannot prepare output dir `{}`: {err}",
130 path.display()
131 ),
132 Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
133 Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
134 Self::NotImplemented(what) => {
135 write!(f, "gaze-document: {what} is not yet implemented")
136 }
137 }
138 }
139}
140
141impl std::error::Error for DocumentError {
142 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
143 match self {
144 Self::Io(err) | Self::OutputDir(_, err) => Some(err),
145 Self::Serde(err) => Some(err),
146 _ => None,
147 }
148 }
149}
150
151impl From<std::io::Error> for DocumentError {
152 fn from(err: std::io::Error) -> Self {
153 Self::Io(err)
154 }
155}
156
157impl From<serde_json::Error> for DocumentError {
158 fn from(err: serde_json::Error) -> Self {
159 Self::Serde(err)
160 }
161}
162
#[cfg(test)]
mod tests {
    use super::DocumentError;

    /// Smoke test: the crate builds and `NotImplemented` renders its
    /// fixed "not yet implemented" wording.
    #[test]
    fn crate_compiles_and_error_renders() {
        let rendered = DocumentError::NotImplemented("smoke").to_string();
        assert!(rendered.contains("not yet implemented"));
    }

    /// The install hint carried by `TesseractNotFound` must appear in the
    /// rendered message alongside the mention of tesseract itself.
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let hint = String::from("Install via `brew install tesseract`.");
        let rendered = DocumentError::TesseractNotFound(hint).to_string();
        for needle in ["tesseract", "brew install"] {
            assert!(rendered.contains(needle));
        }
    }
}
180}