gaze_document/lib.rs
1//! Document ingestion + safe-bundle generation for the Gaze runtime.
2//!
3//! `gaze-document` turns a single image (PNG / JPG) or PDF into a
4//! [`SafeBundle`]: tokenized Markdown, a restorable [`gaze::Manifest`], and a
5//! structured OCR + PII [`BundleReport`]. PII detection flows through the
6//! standard [`gaze::Pipeline`] so the manifest stays canonical and reversible
7//! (Axis 2 reversibility).
8//!
9//! # Quickstart
10//!
11//! ```no_run
12//! use std::path::Path;
13//!
14//! let bundle = gaze_document::clean(
15//! Path::new("invoice.pdf"),
16//! Path::new("./safe-out"),
17//! )?;
18//! assert!(!bundle.clean_markdown.is_empty());
19//! # Ok::<(), gaze_document::DocumentError>(())
20//! ```
21//!
22//! # Runtime requirements
23//!
24//! * `tesseract` binary on `PATH` (Tesseract 4.x or 5.x).
25//! * For PDF input: a pdfium dynamic library available to the process. See
26//! the crate README for per-OS install instructions.
27//!
28//! # Feature flags
29//!
30//! | Flag | Default | What it enables |
31//! |------------------|---------|------------------------------------------------------------|
32//! | `ocr-tesseract` | yes | Tesseract subprocess OCR backend. |
33//! | `pdf-input` | yes | PDF text extraction + raster OCR fallback via `pdfium-render`. |
34//! | `serde` | yes | `Serialize` / `Deserialize` for [`BundleReport`]. |
35//! | `extract-docling`| no | Reserved — future Docling layout adapter (no impl yet). |
36//! | `render-image` | no | Reserved — future redacted-preview renderer (no impl yet). |
37
38#![forbid(unsafe_code)]
39#![deny(missing_docs)]
40#![cfg_attr(docsrs, feature(doc_cfg))]
41
42pub mod bundle;
43pub mod extract;
44pub mod layout;
45#[cfg(feature = "mcp")]
46#[cfg_attr(docsrs, doc(cfg(feature = "mcp")))]
47pub mod mcp;
48pub mod ocr;
49mod postprocess;
50mod preprocess;
51pub mod render;
52
53#[cfg(feature = "ocr-tesseract")]
54#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
55pub use bundle::{clean, clean_with_ocr_backend};
56pub use bundle::{
57 BundleReport, ClassCount, LayoutSummary, OcrSource, PageReport, Pipeline, SafeBundle,
58 BUNDLE_VERSION,
59};
60pub use layout::ReadingOrder;
61#[cfg(feature = "ocr-tesseract")]
62#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
63pub use ocr::TesseractBackend;
64pub use ocr::{
65 detect_image_format, BBox, ImageFormat, ImageInput, LanguageTag, OcrBackend, OcrError,
66 OcrHints, OcrSpan,
67};
68pub use render::Renderer;
69
70/// Crate-level error type for `gaze-document`.
71///
72/// Fail-closed by construction (Axis 1 reliability): every variant describes
73/// a specific, recoverable surface so adopters can branch on cause without
74/// matching opaque strings.
75#[non_exhaustive]
76#[derive(Debug)]
77pub enum DocumentError {
78 /// Tesseract CLI is not on `PATH`. The payload is a per-OS install hint.
79 TesseractNotFound(String),
80 /// Tesseract returned a non-zero exit status. The payload carries the
81 /// captured stderr (truncated) so adopters can surface it.
82 TesseractFailed {
83 /// Exit status reported by the OS.
84 status: i32,
85 /// Captured stderr (truncated to keep error payloads bounded).
86 stderr: String,
87 },
88 /// pdfium dynamic library could not be loaded. The payload is a per-OS
89 /// install hint.
90 PdfiumNotFound(String),
91 /// pdfium reported an error while parsing or rasterizing a PDF.
92 PdfRasterFailed(String),
93 /// Input file format is not supported by the current build (e.g. PDF
94 /// input without the `pdf-input` feature).
95 UnsupportedInput {
96 /// Path that was rejected.
97 path: std::path::PathBuf,
98 /// Reason the input was rejected.
99 reason: &'static str,
100 },
101 /// An I/O error while reading the input or writing the bundle.
102 Io(std::io::Error),
103 /// The bundle output directory could not be prepared.
104 OutputDir(std::path::PathBuf, std::io::Error),
105 /// `gaze::Pipeline` construction or invocation failed.
106 Pipeline(String),
107 /// `serde_json` serialization of the bundle report or manifest failed.
108 Serde(serde_json::Error),
109 /// The requested operation is part of the public contract but has no
110 /// implementation yet. Returned by reserved stubs (Renderer trait,
111 /// ReadingOrder) until follow-up PRs land.
112 NotImplemented(&'static str),
113}
114
115impl core::fmt::Display for DocumentError {
116 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
117 match self {
118 Self::TesseractNotFound(hint) => write!(
119 f,
120 "gaze-document: tesseract binary not found on PATH. {hint}"
121 ),
122 Self::TesseractFailed { status, stderr } => write!(
123 f,
124 "gaze-document: tesseract exited with status {status}: {stderr}"
125 ),
126 Self::PdfiumNotFound(hint) => {
127 write!(f, "gaze-document: pdfium dynamic library not found. {hint}")
128 }
129 Self::PdfRasterFailed(detail) => {
130 write!(f, "gaze-document: pdf rasterization failed: {detail}")
131 }
132 Self::UnsupportedInput { path, reason } => write!(
133 f,
134 "gaze-document: unsupported input `{}`: {reason}",
135 path.display()
136 ),
137 Self::Io(err) => write!(f, "gaze-document: io error: {err}"),
138 Self::OutputDir(path, err) => write!(
139 f,
140 "gaze-document: cannot prepare output dir `{}`: {err}",
141 path.display()
142 ),
143 Self::Pipeline(detail) => write!(f, "gaze-document: pipeline error: {detail}"),
144 Self::Serde(err) => write!(f, "gaze-document: serialize error: {err}"),
145 Self::NotImplemented(what) => {
146 write!(f, "gaze-document: {what} is not yet implemented")
147 }
148 }
149 }
150}
151
152impl std::error::Error for DocumentError {
153 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
154 match self {
155 Self::Io(err) | Self::OutputDir(_, err) => Some(err),
156 Self::Serde(err) => Some(err),
157 _ => None,
158 }
159 }
160}
161
162impl From<std::io::Error> for DocumentError {
163 fn from(err: std::io::Error) -> Self {
164 Self::Io(err)
165 }
166}
167
168impl From<serde_json::Error> for DocumentError {
169 fn from(err: serde_json::Error) -> Self {
170 Self::Serde(err)
171 }
172}
173
#[cfg(test)]
mod tests {
    use super::DocumentError;

    /// Smoke test: the crate builds and a `NotImplemented` error renders its
    /// fixed "not yet implemented" wording.
    #[test]
    fn crate_compiles_and_error_renders() {
        let rendered = DocumentError::NotImplemented("smoke").to_string();
        assert!(rendered.contains("not yet implemented"));
    }

    /// The install hint passed to `TesseractNotFound` must appear in the
    /// rendered message, alongside the word "tesseract".
    #[test]
    fn tesseract_not_found_error_includes_hint() {
        let hint = String::from("Install via `brew install tesseract`.");
        let rendered = DocumentError::TesseractNotFound(hint).to_string();
        for needle in ["tesseract", "brew install"] {
            assert!(rendered.contains(needle));
        }
    }
}