Skip to main content

gaze_document/bundle/
mod.rs

1//! SafeBundle generation: OCR + Gaze redact → on-disk artifacts.
2//!
3//! The top-level [`clean`] function is the public adopter entry point. It
4//! routes any supported input (PNG / JPG / single-page PDF) through OCR,
5//! pipes the extracted text through a [`gaze::Pipeline`], and persists the
6//! result as three files in a target directory:
7//!
8//! ```text
9//! out/
10//!   clean.md        # OCR text with PII replaced by reversible tokens
11//!   manifest.json   # gaze::Manifest — restorable, canonical
12//!   report.json     # BundleReport — OCR + PII counts + provenance
13//! ```
14//!
15//! The manifest contract is the same one the rest of the gaze runtime
16//! uses (`gaze::Manifest`). Adopters can pair `clean.md` with `manifest.json`
17//! and restore via the standard gaze session APIs.
18
19use std::path::PathBuf;
20
21use gaze::Manifest;
22use serde::{Deserialize, Serialize};
23
24use crate::ocr::OcrResult;
25
26#[cfg(feature = "ocr-tesseract")]
27use std::collections::BTreeMap;
28#[cfg(feature = "ocr-tesseract")]
29use std::fs;
30#[cfg(feature = "ocr-tesseract")]
31use std::path::Path;
32
33#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
34use gaze::{
35    Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline, RawDocument, Scope, Session,
36};
37#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
38use gaze_recognizers::{
39    AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
40};
41#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
42use gaze_types::{EmittedTokenSpan, PiiClass};
43
44#[cfg(feature = "ocr-tesseract")]
45use crate::extract::InputKind;
46#[cfg(feature = "ocr-tesseract")]
47use crate::DocumentError;
48
/// Schema tag stamped into `report.json`. Bump it whenever the report shape
/// changes in a way older readers cannot parse.
pub const BUNDLE_VERSION: u32 = 1;

/// Name of the tokenized Markdown artifact written into the `--out` directory.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// Name of the restorable manifest artifact written into the `--out` directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// Name of the OCR + PII provenance report written into the `--out` directory.
pub const REPORT_FILE: &str = "report.json";
58
59/// Post-ingestion artifact paired with a Gaze [`Manifest`].
60#[non_exhaustive]
61#[derive(Debug, Clone)]
62pub struct SafeBundle {
63    /// Tokenized Markdown safe to hand to an LLM.
64    pub clean_markdown: String,
65    /// Reversible manifest produced by the gaze pipeline.
66    pub manifest: Manifest,
67    /// Opaque layout summary (reserved — single-page in v0.0.x).
68    pub layout: LayoutSummary,
69    /// Optional rasterized preview of the source document (reserved).
70    pub preview_png: Option<Vec<u8>>,
71    /// Per-bundle audit + provenance report.
72    pub report: BundleReport,
73    /// Absolute path of the input that produced this bundle.
74    pub source_path: PathBuf,
75    /// Absolute path of the output directory that received this bundle.
76    pub out_dir: PathBuf,
77}
78
79impl SafeBundle {
80    /// Build a [`SafeBundle`] from its component parts.
81    pub fn new(
82        clean_markdown: String,
83        manifest: Manifest,
84        layout: LayoutSummary,
85        preview_png: Option<Vec<u8>>,
86        report: BundleReport,
87        source_path: PathBuf,
88        out_dir: PathBuf,
89    ) -> Self {
90        Self {
91            clean_markdown,
92            manifest,
93            layout,
94            preview_png,
95            report,
96            source_path,
97            out_dir,
98        }
99    }
100}
101
102/// Per-class PII detection count for [`BundleReport`].
103#[non_exhaustive]
104#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
105pub struct ClassCount {
106    /// Audit-canonical class name (e.g., `"email"`, `"custom:phone"`).
107    pub class: String,
108    /// Number of token spans emitted for that class.
109    pub count: u32,
110}
111
112impl ClassCount {
113    /// Build a class-count entry.
114    pub fn new(class: impl Into<String>, count: u32) -> Self {
115        Self {
116            class: class.into(),
117            count,
118        }
119    }
120}
121
122/// Bundle audit + provenance report serialized to `report.json`.
123///
124/// Schema versioned via [`BUNDLE_VERSION`]; older readers can branch on the
125/// `bundle_version` field. Field set is `#[non_exhaustive]` so additive
126/// extensions are SemVer-safe.
127#[non_exhaustive]
128#[derive(Debug, Clone, Serialize, Deserialize)]
129pub struct BundleReport {
130    /// Schema version (currently [`BUNDLE_VERSION`]).
131    pub bundle_version: u32,
132    /// Input kind detected from the source path.
133    pub input_kind: String,
134    /// Mean per-word Tesseract confidence (0..100). `None` when zero words.
135    pub ocr_mean_confidence: Option<f32>,
136    /// Number of words Tesseract emitted with non-negative confidence.
137    pub ocr_word_count: usize,
138    /// Tesseract language code used for OCR (e.g., `"eng"`).
139    pub ocr_lang: String,
140    /// Character count of the tokenized Markdown output.
141    pub clean_char_count: usize,
142    /// Total PII token spans across all classes.
143    pub pii_token_count: u32,
144    /// Per-class breakdown of PII token counts.
145    pub pii_tokens_by_class: Vec<ClassCount>,
146    /// PDF page count when the input was a PDF. `None` for image inputs.
147    pub pdf_page_count: Option<i32>,
148    /// PDF page index that was rasterized. `None` for image inputs.
149    pub pdf_page_index: Option<i32>,
150}
151
152impl BundleReport {
153    /// Build a [`BundleReport`] from its component parts.
154    #[allow(clippy::too_many_arguments)]
155    pub fn new(
156        input_kind: impl Into<String>,
157        ocr: &OcrResult,
158        clean_char_count: usize,
159        pii_token_count: u32,
160        pii_tokens_by_class: Vec<ClassCount>,
161        pdf_page_count: Option<i32>,
162        pdf_page_index: Option<i32>,
163    ) -> Self {
164        Self {
165            bundle_version: BUNDLE_VERSION,
166            input_kind: input_kind.into(),
167            ocr_mean_confidence: ocr.mean_confidence,
168            ocr_word_count: ocr.word_count,
169            ocr_lang: ocr.lang.clone(),
170            clean_char_count,
171            pii_token_count,
172            pii_tokens_by_class,
173            pdf_page_count,
174            pdf_page_index,
175        }
176    }
177}
178
/// Placeholder describing the layout of the ingested document.
///
/// Reserved until the multi-page + reading-order PR lands; for now it only
/// carries a page count.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages handed to the OCR pass (always `1` in v0.0.x).
    pub page_count: u32,
}

impl LayoutSummary {
    /// Layout summary for a one-page document (`page_count == 1`).
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Layout summary carrying the given page count, stored as-is.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
201
/// Top-level entry point: ingest one document, write a [`SafeBundle`] to disk.
///
/// `input` must be a regular file with extension `.png`, `.jpg`, `.jpeg`, or
/// `.pdf`. `out_dir` is created if missing and populated with three files
/// (see module docs).
///
/// The steps, in order: detect the input kind, create `out_dir`, run OCR,
/// normalize known OCR artifacts, redact through the document pipeline,
/// build the manifest and report, then write the three artifacts.
///
/// The returned bundle's `source_path` and `out_dir` are resolved against
/// the current working directory when the arguments are relative.
///
/// # Errors
///
/// Returns [`DocumentError`] for any failure in the OCR → redact → write
/// chain. Fail-closed: every error variant carries enough context to
/// diagnose without inspecting partial bundle state.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(input: &Path, out_dir: &Path) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let absolute_out = absolutize(out_dir);

    // The clone is required: `absolute_out` is moved into the returned
    // bundle on the success path.
    fs::create_dir_all(out_dir)
        .map_err(|err| DocumentError::OutputDir(absolute_out.clone(), err))?;

    let (ocr_result, pdf_page_count, pdf_page_index) = run_ocr(input, kind)?;
    // Repair known narrow OCR artifacts (e.g. spurious whitespace around
    // `@` in emails) before the redact pipeline sees the text. See
    // `crate::ocr::normalize` for the documented rule set. Axis 1
    // (never leak) requires this — the OCR pass occasionally inserts a
    // single space inside an email that would otherwise slip past strict
    // recognizers and survive into clean.md.
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&ocr_result.text);
    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the third tuple element (leak report) is discarded here —
    // confirm that is intentional and that nothing in it needs surfacing.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    // Text went in, so anything other than text coming out is a pipeline
    // contract violation and is reported as an error.
    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally per-class counts while the spans are still borrowed, then move
    // them into the manifest; this avoids cloning the whole span list.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &ocr_result,
        // Counted on the redacted text, before the Markdown header is added.
        clean_text.chars().count(),
        pii_token_count,
        counts,
        pdf_page_count,
        pdf_page_index,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(out_dir, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::single_page(),
        None,
        report,
        absolute_input,
        absolute_out,
    ))
}
274
/// Run Tesseract OCR over `input` according to its detected `kind`.
///
/// Returns the OCR result together with `(page_count, page_index)` for PDF
/// inputs; both are `None` for PNG / JPEG inputs. PDFs are handled by
/// rasterizing the first page to PNG bytes and running OCR on those bytes.
///
/// # Errors
///
/// Propagates OCR and rasterization failures. When the crate is built
/// without the `pdf-input` feature, PDF inputs are rejected with
/// [`DocumentError::UnsupportedInput`].
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    use crate::ocr::TesseractOcr;

    let engine = TesseractOcr::new();
    match kind {
        // Raster images go straight to Tesseract; there is no page metadata.
        InputKind::Png | InputKind::Jpeg => Ok((engine.extract_from_file(input)?, None, None)),
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{rasterize_first_page, PdfRasterConfig};

                let page = rasterize_first_page(input, PdfRasterConfig::new())?;
                let text = engine.extract_from_bytes(&page.png_bytes, "png")?;
                Ok((text, Some(page.page_count), Some(page.page_index)))
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
    }
}
305
/// Build the redaction [`Pipeline`] used for OCR'd documents.
///
/// The pipeline combines an email detector, a conservative phone-number
/// regex (class `custom:phone`), and an anchored recognizer for recipient
/// names that follow invoice / shipping cues. Email, phone and name classes
/// are tokenized; everything else is preserved.
///
/// # Errors
///
/// Returns [`DocumentError::Pipeline`] (tagged with the failing stage) when a
/// detector pattern fails to compile or the builder rejects the configuration.
// A single gate is enough: the body depends on `pipeline_err` and
// `DocumentError`, both of which exist only under `ocr-tesseract`. The extra
// `any(ocr-tesseract, mcp)` gate stacked on top added nothing.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn build_document_pipeline() -> Result<Pipeline, DocumentError> {
    let email = RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    // Conservative phone pattern: optional `+CC`, area, exchange, line, with
    // common separators. Synthetic fixture uses `+1-555-0142`-style numbers.
    // NOTE(review): the trailing `[-.\s]?\d{0,4}` can match a lone separator
    // (space, newline, `.` or `-`) after the last digit group, so the token
    // span may include one trailing separator character — confirm that is
    // acceptable before tightening the pattern.
    let phone = RegexDetector::new(
        r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}",
        PiiClass::custom("phone"),
    )
    .map_err(|err| pipeline_err("phone-regex", err))?;
    // Invoice / shipping recipient block names. Scope is intentionally
    // local to gaze-document (rather than extending the locale-en
    // `forward_markers` bucket): forwarded-email cues and document
    // recipient blocks are semantically distinct anchors and should not
    // share a bucket. `LineEnd` boundary stops the name span at the
    // newline that ends the recipient line so a follow-up `Email:` row
    // cannot be absorbed into the Name match.
    // NOTE(review): the numeric arguments (48, 2, 0.88, 110) are positional;
    // their meaning is defined by `AnchoredMatchRecognizer::new` in
    // `gaze_recognizers` — check that signature before changing them.
    let recipient_name = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        vec![
            "Bill to".to_string(),
            "Invoice to".to_string(),
            "Ship to".to_string(),
            "Attention".to_string(),
            "Attn".to_string(),
        ],
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );
    Pipeline::builder()
        .detector(email)
        .detector(phone)
        .recognizer(recipient_name)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        // Anything not matched by a class rule above passes through untouched.
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
353
/// Tally emitted token spans per PII class.
///
/// Classes are keyed by their canonical string form. The result is ordered
/// by that string because the tally is kept in a `BTreeMap`.
#[cfg(feature = "ocr-tesseract")]
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let mut tally: BTreeMap<String, u32> = BTreeMap::new();
    for span in spans {
        let slot = tally.entry(span.class.to_canonical_str()).or_default();
        *slot += 1;
    }

    let mut rows = Vec::with_capacity(tally.len());
    for (class, count) in tally {
        rows.push(ClassCount::new(class, count));
    }
    rows
}
365
/// Persist the three bundle artifacts into `out_dir`.
///
/// Writes `clean.md`, `manifest.json` and `report.json` (pretty-printed
/// JSON), in that order. `out_dir` must already exist.
///
/// # Errors
///
/// Returns a [`DocumentError`] converted from the underlying serialization
/// or I/O error. The writes are not atomic: an I/O failure partway through
/// can leave a subset of the files on disk.
#[cfg(feature = "ocr-tesseract")]
fn write_bundle(
    out_dir: &Path,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    // Serialize both JSON payloads before touching the filesystem so that a
    // serialization failure cannot leave a lone `clean.md` behind without
    // the manifest needed to restore it.
    let manifest_json = serde_json::to_vec_pretty(manifest)?;
    let report_json = serde_json::to_vec_pretty(report)?;

    fs::write(out_dir.join(CLEAN_MARKDOWN_FILE), clean_markdown)?;
    fs::write(out_dir.join(MANIFEST_FILE), manifest_json)?;
    fs::write(out_dir.join(REPORT_FILE), report_json)?;
    Ok(())
}
380
/// Wrap redacted text in the bundle's Markdown envelope.
///
/// The output is a fixed title, a `Source kind` line naming the input kind,
/// a horizontal rule, and then `text`. A newline is appended only when
/// `text` does not already end with one.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let source_line = format!("Source kind: `{}`\n\n", kind_label(kind));
    let terminator = if text.ends_with('\n') { "" } else { "\n" };
    [
        "# gaze-document safe bundle\n\n",
        source_line.as_str(),
        "---\n\n",
        text,
        terminator,
    ]
    .concat()
}
393
/// Lowercase label for an [`InputKind`], used in `report.json` and in the
/// `Source kind` line of `clean.md`.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    // Exhaustive on purpose: a new `InputKind` variant must pick its label.
    match kind {
        InputKind::Pdf => "pdf",
        InputKind::Jpeg => "jpeg",
        InputKind::Png => "png",
    }
}
402
403#[cfg(feature = "ocr-tesseract")]
/// Resolve `path` against the current working directory.
///
/// Absolute paths are returned unchanged. Relative paths are joined onto the
/// working directory; if the working directory cannot be determined, the
/// path is returned as given. No filesystem access or canonicalization is
/// performed on `path` itself.
fn absolutize(path: &Path) -> PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
413
/// Wrap a displayable error as [`DocumentError::Pipeline`], prefixing the
/// message with the pipeline `stage` that failed.
#[cfg(feature = "ocr-tesseract")]
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    let message = format!("{stage}: {err}");
    DocumentError::Pipeline(message)
}
418
#[cfg(all(test, feature = "ocr-tesseract"))]
mod tests {
    use super::*;

    /// Two email spans plus one custom phone span collapse into two rows.
    #[test]
    fn count_pii_by_class_groups_email_and_phone() {
        let first_email = EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email);
        let second_email = EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email);
        let phone = EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone"));

        let counts = count_pii_by_class(&[first_email, second_email, phone]);
        assert_eq!(counts.len(), 2);

        let count_for = |name: &str| {
            counts
                .iter()
                .find(|entry| entry.class == name)
                .map(|entry| entry.count)
        };
        assert_eq!(count_for("email"), Some(2));
        assert_eq!(count_for("custom:phone"), Some(1));
    }

    /// The serialized report carries the schema version and caller-supplied fields.
    #[test]
    fn report_serializes_with_bundle_version() {
        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
        let by_class = vec![
            ClassCount::new("email", 2),
            ClassCount::new("custom:phone", 1),
        ];
        let report = BundleReport::new("png", &ocr, 42, 3, by_class, None, None);

        let json = serde_json::to_value(&report).expect("serialize");
        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
        assert_eq!(json["input_kind"], "png");
        assert_eq!(json["pii_token_count"], 3);
    }

    /// Text without a trailing newline gets one, and the header names the kind.
    #[test]
    fn format_clean_markdown_appends_trailing_newline() {
        let rendered = format_clean_markdown("hello", InputKind::Png);
        assert!(rendered.ends_with('\n'));
        assert!(rendered.contains("Source kind: `png`"));
        assert!(rendered.contains("hello"));
    }
}
465}