gaze_document/bundle/
mod.rs

1//! SafeBundle generation: OCR + Gaze redact → on-disk artifacts.
2//!
3//! The top-level [`clean`] function is the public adopter entry point. It
4//! routes any supported input (PNG / JPG / single-page PDF) through OCR,
5//! pipes the extracted text through a [`gaze::Pipeline`], and persists the
6//! result as three files split across agent and owner target directories:
7//!
8//! ```text
9//! agent_out/
10//!   clean.md        # OCR text with PII replaced by reversible tokens
11//!   report.json     # BundleReport — OCR + PII counts + provenance
12//!
13//! owner_out/
14//!   manifest.json   # gaze::Manifest — restorable, canonical
15//! ```
16//!
17//! The manifest contract is the same one the rest of the gaze runtime
18//! uses (`gaze::Manifest`). Because it carries restore material, it is written
19//! only to the owner output directory.
20
21use std::path::{Component, Path, PathBuf};
22
23use gaze::Manifest;
24use serde::{Deserialize, Serialize};
25
26use crate::ocr::{
27    detect_image_format, ImageFormat, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult,
28};
29
30#[cfg(feature = "ocr-tesseract")]
31use std::collections::BTreeMap;
32#[cfg(feature = "ocr-tesseract")]
33use std::fs;
34
35#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
36use gaze::{
37    Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline as GazePipeline,
38    RawDocument, Scope, Session,
39};
40#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
41use gaze_recognizers::{
42    AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
43};
44#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
45use gaze_types::{EmittedTokenSpan, PiiClass};
46
47#[cfg(feature = "ocr-tesseract")]
48use crate::extract::InputKind;
49#[cfg(feature = "ocr-tesseract")]
50use crate::{BundleLayoutInvalidReason, DocumentError};
51
/// Versioned `report.json` schema tag (bump on breaking shape changes).
pub const BUNDLE_VERSION: u32 = 2;
/// Default confidence threshold used by [`Pipeline::new`] and as the serde
/// default for [`BundleReport::low_confidence_threshold`]. A page whose
/// confidence is strictly below the threshold is flagged low-confidence.
const DEFAULT_LOW_CONFIDENCE_THRESHOLD: f32 = 0.65;

/// Bundle filename written into the agent output directory for tokenized Markdown.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// Bundle filename written into the owner output directory for the restorable manifest.
pub const MANIFEST_FILE: &str = "manifest.json";
/// Bundle filename written into the agent output directory for the OCR + PII provenance report.
pub const REPORT_FILE: &str = "report.json";
62
63/// Agent-visible SafeBundle output directory.
64#[non_exhaustive]
65#[derive(Debug, Clone, PartialEq, Eq)]
66pub struct AgentBundleDir(PathBuf);
67
68impl AgentBundleDir {
69    /// Build an agent output directory wrapper.
70    ///
71    /// The directory is created later, after the paired owner directory has
72    /// been validated so equal and nested layouts fail before any bundle write.
73    pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentError> {
74        let path = path.into();
75        validate_non_empty_path(&path)?;
76        Ok(Self(path))
77    }
78
79    /// Return the wrapped filesystem path.
80    pub fn as_path(&self) -> &Path {
81        &self.0
82    }
83}
84
85/// Owner-only SafeBundle output directory.
86#[non_exhaustive]
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub struct OwnerBundleDir(PathBuf);
89
90impl OwnerBundleDir {
91    /// Build an owner output directory wrapper.
92    ///
93    /// The directory is created later, after the paired agent directory has
94    /// been validated so equal and nested layouts fail before any bundle write.
95    pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentError> {
96        let path = path.into();
97        validate_non_empty_path(&path)?;
98        Ok(Self(path))
99    }
100
101    /// Return the wrapped filesystem path.
102    pub fn as_path(&self) -> &Path {
103        &self.0
104    }
105}
106
107/// Post-ingestion artifact paired with a Gaze [`Manifest`].
108#[non_exhaustive]
109#[derive(Debug, Clone)]
110pub struct SafeBundle {
111    /// Tokenized Markdown safe to hand to an LLM.
112    pub clean_markdown: String,
113    /// Reversible manifest produced by the gaze pipeline.
114    pub manifest: Manifest,
115    /// Opaque layout summary (reserved — single-page in v0.0.x).
116    pub layout: LayoutSummary,
117    /// Optional rasterized preview of the source document (reserved).
118    pub preview_png: Option<Vec<u8>>,
119    /// Per-bundle audit + provenance report.
120    pub report: BundleReport,
121    /// Absolute path of the input that produced this bundle.
122    pub source_path: PathBuf,
123    /// Absolute path of the agent-visible output directory.
124    pub agent_out_dir: PathBuf,
125    /// Absolute path of the owner-only output directory.
126    pub owner_out_dir: PathBuf,
127}
128
129impl SafeBundle {
130    /// Build a [`SafeBundle`] from its component parts.
131    #[allow(clippy::too_many_arguments)]
132    pub fn new(
133        clean_markdown: String,
134        manifest: Manifest,
135        layout: LayoutSummary,
136        preview_png: Option<Vec<u8>>,
137        report: BundleReport,
138        source_path: PathBuf,
139        agent_out_dir: PathBuf,
140        owner_out_dir: PathBuf,
141    ) -> Self {
142        Self {
143            clean_markdown,
144            manifest,
145            layout,
146            preview_png,
147            report,
148            source_path,
149            agent_out_dir,
150            owner_out_dir,
151        }
152    }
153}
154
155/// Per-class PII detection count for [`BundleReport`].
156#[non_exhaustive]
157#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
158pub struct ClassCount {
159    /// Audit-canonical class name (e.g., `"email"`, `"custom:phone"`).
160    pub class: String,
161    /// Number of token spans emitted for that class.
162    pub count: u32,
163}
164
165impl ClassCount {
166    /// Build a class-count entry.
167    pub fn new(class: impl Into<String>, count: u32) -> Self {
168        Self {
169            class: class.into(),
170            count,
171        }
172    }
173}
174
/// Per-page extraction source.
///
/// Serialized with snake_case variant names (`"vector_pdf"`, `"ocr"`) via the
/// `rename_all` attribute below.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OcrSource {
    /// Selectable text extracted directly from a PDF page.
    VectorPdf,
    /// Raster OCR from an image page.
    Ocr,
}
184
185/// Per-page OCR/layout provenance.
186#[non_exhaustive]
187#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
188pub struct PageReport {
189    /// Zero-based page index.
190    pub page_index: i32,
191    /// Extraction path used for this page.
192    pub ocr_source: OcrSource,
193    /// OCR backend name when [`OcrSource::Ocr`] produced the page.
194    pub ocr_backend: Option<String>,
195    /// Aggregated page confidence in `0.0..=1.0`. `None` for vector PDF text.
196    pub confidence: Option<f32>,
197    /// True when confidence is present and below the configured threshold.
198    pub low_confidence: bool,
199    /// Detected text column count. `1` means single-column.
200    pub column_count: u32,
201    /// Number of OCR words with confidence for this page.
202    pub ocr_word_count: usize,
203    /// Legacy percent-scale mean confidence for this page.
204    pub ocr_mean_confidence: Option<f32>,
205}
206
207impl PageReport {
208    fn new(
209        page_index: i32,
210        ocr_source: OcrSource,
211        ocr_backend: Option<String>,
212        ocr: &OcrResult,
213        column_count: u32,
214        low_confidence_threshold: f32,
215    ) -> Self {
216        let confidence = ocr.mean_confidence_unit();
217        Self {
218            page_index,
219            ocr_source,
220            ocr_backend,
221            confidence,
222            low_confidence: confidence
223                .map(|confidence| confidence < low_confidence_threshold)
224                .unwrap_or(false),
225            column_count,
226            ocr_word_count: ocr.word_count,
227            ocr_mean_confidence: ocr.mean_confidence,
228        }
229    }
230}
231
232/// Bundle audit + provenance report serialized to `report.json`.
233///
234/// Schema versioned via [`BUNDLE_VERSION`]; older readers can branch on the
235/// `bundle_version` field. Field set is `#[non_exhaustive]` so additive
236/// extensions are SemVer-safe.
237#[non_exhaustive]
238#[derive(Debug, Clone, Serialize, Deserialize)]
239pub struct BundleReport {
240    /// Schema version (currently [`BUNDLE_VERSION`]).
241    pub bundle_version: u32,
242    /// Input kind detected from the source path.
243    pub input_kind: String,
244    /// Mean per-word Tesseract confidence (0..100). `None` when zero words.
245    pub ocr_mean_confidence: Option<f32>,
246    /// Number of words Tesseract emitted with non-negative confidence.
247    pub ocr_word_count: usize,
248    /// Tesseract language code used for OCR (e.g., `"eng"`).
249    pub ocr_lang: String,
250    /// Character count of the tokenized Markdown output.
251    pub clean_char_count: usize,
252    /// Total PII token spans across all classes.
253    pub pii_token_count: u32,
254    /// Per-class breakdown of PII token counts.
255    pub pii_tokens_by_class: Vec<ClassCount>,
256    /// PDF page count when the input was a PDF. `None` for image inputs.
257    pub pdf_page_count: Option<i32>,
258    /// PDF page index that was rasterized. `None` for image inputs.
259    pub pdf_page_index: Option<i32>,
260    /// Per-page extraction, confidence, and layout provenance.
261    #[serde(default)]
262    pub pages: Vec<PageReport>,
263    /// Confidence threshold used to set [`PageReport::low_confidence`].
264    #[serde(default = "default_low_confidence_threshold")]
265    pub low_confidence_threshold: f32,
266}
267
268impl BundleReport {
269    /// Build a [`BundleReport`] from its component parts.
270    #[allow(clippy::too_many_arguments)]
271    pub fn new(
272        input_kind: impl Into<String>,
273        ocr: &OcrResult,
274        clean_char_count: usize,
275        pii_token_count: u32,
276        pii_tokens_by_class: Vec<ClassCount>,
277        pdf_page_count: Option<i32>,
278        pdf_page_index: Option<i32>,
279        pages: Vec<PageReport>,
280        low_confidence_threshold: f32,
281    ) -> Self {
282        Self {
283            bundle_version: BUNDLE_VERSION,
284            input_kind: input_kind.into(),
285            ocr_mean_confidence: ocr.mean_confidence,
286            ocr_word_count: ocr.word_count,
287            ocr_lang: ocr.lang.clone(),
288            clean_char_count,
289            pii_token_count,
290            pii_tokens_by_class,
291            pdf_page_count,
292            pdf_page_index,
293            pages,
294            low_confidence_threshold,
295        }
296    }
297}
298
/// Serde default provider for [`BundleReport::low_confidence_threshold`].
///
/// Referenced by name from the field's `#[serde(default = "...")]` attribute,
/// so reports that lack the field deserialize with
/// [`DEFAULT_LOW_CONFIDENCE_THRESHOLD`].
fn default_low_confidence_threshold() -> f32 {
    DEFAULT_LOW_CONFIDENCE_THRESHOLD
}
302
303/// Configurable document-cleaning pipeline.
304#[non_exhaustive]
305#[derive(Debug, Clone, Copy)]
306pub struct Pipeline {
307    low_confidence_threshold: f32,
308    column_detection: bool,
309}
310
311impl Pipeline {
312    /// Build a pipeline with conservative defaults.
313    pub fn new() -> Self {
314        Self {
315            low_confidence_threshold: DEFAULT_LOW_CONFIDENCE_THRESHOLD,
316            column_detection: true,
317        }
318    }
319
320    /// Override the per-page low-confidence threshold.
321    pub fn with_low_confidence_threshold(mut self, threshold: f32) -> Self {
322        self.low_confidence_threshold = threshold.clamp(0.0, 1.0);
323        self
324    }
325
326    /// Enable or disable multi-column OCR span ordering.
327    pub fn with_column_detection(mut self, enabled: bool) -> Self {
328        self.column_detection = enabled;
329        self
330    }
331
332    /// Clean one document with an adopter-supplied OCR backend.
333    ///
334    /// # Errors
335    /// Returns [`DocumentError`] for any extraction, OCR, redaction, or write failure.
336    #[cfg(feature = "ocr-tesseract")]
337    #[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
338    pub fn clean_with_ocr_backend(
339        &self,
340        input: &Path,
341        agent_out: AgentBundleDir,
342        owner_out: OwnerBundleDir,
343        ocr_backend: &dyn OcrBackend,
344    ) -> Result<SafeBundle, DocumentError> {
345        clean_with_options(input, agent_out, owner_out, ocr_backend, *self)
346    }
347}
348
349impl Default for Pipeline {
350    fn default() -> Self {
351        Self::new()
352    }
353}
354
/// Opaque layout summary placeholder.
///
/// Reserved for richer layout data; for now it carries only the page count
/// surfaced by the input layer.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages recorded for the document.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Summary for a document consisting of exactly one page.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Summary for a document with `page_count` pages.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
377
/// Top-level entry point: ingest one document and write a [`SafeBundle`] to
/// disk using the built-in Tesseract backend and default [`Pipeline`] settings.
///
/// `input` must be a regular file with extension `.png`, `.jpg`, `.jpeg`, or
/// `.pdf`. `agent_out` and `owner_out` are created if missing and populated
/// with the split artifact layout described in the module docs.
///
/// # Errors
///
/// Returns [`DocumentError`] for any failure in the OCR → redact → write
/// chain.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
) -> Result<SafeBundle, DocumentError> {
    let tesseract = crate::ocr::TesseractBackend::new();
    let pipeline = Pipeline::new();
    pipeline.clean_with_ocr_backend(input, agent_out, owner_out, &tesseract)
}
399
/// Top-level entry point with an adopter-supplied OCR backend and default
/// [`Pipeline`] settings.
///
/// The backend receives finalized single-image bytes. PDF rasterization and
/// downstream layout/report generation remain owned by `gaze-document`.
///
/// # Errors
///
/// Returns [`DocumentError`] for any failure in the OCR → redact → write
/// chain. OCR backend errors are mapped into the existing document error
/// surface so current callers keep the same high-level behavior.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean_with_ocr_backend(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
    ocr_backend: &dyn OcrBackend,
) -> Result<SafeBundle, DocumentError> {
    let defaults = Pipeline::new();
    clean_with_options(input, agent_out, owner_out, ocr_backend, defaults)
}
420
/// Run the full extract → normalize → redact → write chain for one document.
///
/// Steps, in order:
/// 1. detect the input kind and validate/create the agent and owner
///    directories (layout errors surface before any extraction work);
/// 2. extract text through `ocr_backend` (or directly from vector PDF pages);
/// 3. normalize known OCR artifacts and run the Gaze redaction pipeline;
/// 4. build the manifest and report, then write the three bundle files.
///
/// # Errors
/// Returns [`DocumentError`] when any step fails, including the case where
/// the redaction pipeline returns a non-text document for text input.
#[cfg(feature = "ocr-tesseract")]
fn clean_with_options(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let (absolute_agent_out, absolute_owner_out) = prepare_bundle_dirs(&agent_out, &owner_out)?;

    let extraction = run_document_extraction(input, kind, ocr_backend, options)?;
    // Repair known narrow OCR artifacts (e.g. spurious whitespace around
    // `@` in emails) before the redact pipeline sees the text. See
    // `crate::ocr::normalize` for the documented rule set. Axis 1
    // (never leak) requires this — the OCR pass occasionally inserts a
    // single space inside an email that would otherwise slip past strict
    // recognizers and survive into clean.md.
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&extraction.ocr_result.text);
    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally the per-class counts while `spans` can still be borrowed, then
    // move the spans into the manifest instead of cloning them.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &extraction.ocr_result,
        clean_text.chars().count(),
        pii_token_count,
        counts,
        extraction.pdf_page_count,
        extraction.pdf_page_index,
        extraction.pages,
        options.low_confidence_threshold,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(&agent_out, &owner_out, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::new(extraction.page_count),
        None,
        report,
        absolute_input,
        absolute_agent_out,
        absolute_owner_out,
    ))
}
487
/// Intermediate result of the extraction stage, before redaction.
#[cfg(feature = "ocr-tesseract")]
struct DocumentExtraction {
    /// Document-level result: the single image's result, or the merge of all
    /// per-page results for a PDF.
    ocr_result: OcrResult,
    /// Page count reported by the PDF payloads; `None` for image inputs.
    pdf_page_count: Option<i32>,
    /// Page index of the first PDF payload; `None` for image inputs.
    pdf_page_index: Option<i32>,
    /// One report per extracted page, in extraction order.
    pages: Vec<PageReport>,
    /// Number of page reports (`1` for image inputs).
    page_count: u32,
}
496
/// Crate-internal helper that runs extraction with default [`Pipeline`]
/// settings and returns only the merged result plus the PDF page count and
/// first page index. The per-page reports are discarded.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(not(feature = "mcp"), allow(dead_code))]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    let DocumentExtraction {
        ocr_result,
        pdf_page_count,
        pdf_page_index,
        ..
    } = run_document_extraction(input, kind, ocr_backend, Pipeline::new())?;
    Ok((ocr_result, pdf_page_count, pdf_page_index))
}
511
/// Extract text and per-page provenance from `input` according to `kind`.
///
/// * PNG / JPEG: the file is read, its format is detected from the bytes, and
///   a single OCR pass produces one page report with index `0`.
/// * PDF (with the `pdf-input` feature): each payload is either vector text,
///   used directly without OCR, or a rasterized page that goes through
///   `ocr_backend`. The per-page results are merged into one document-level
///   result.
/// * PDF (without `pdf-input`): returns `DocumentError::UnsupportedInput`.
///
/// # Errors
/// Propagates file-read, format-detection, PDF-extraction, and OCR failures.
#[cfg(feature = "ocr-tesseract")]
fn run_document_extraction(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<DocumentExtraction, DocumentError> {
    match kind {
        InputKind::Png | InputKind::Jpeg => {
            let bytes = fs::read(input)?;
            // The format comes from the file contents, not from `kind`.
            let format = detect_image_format(&bytes)?;
            let (result, column_count) = recognize_image(
                ocr_backend,
                ImageInput {
                    bytes,
                    format,
                    dpi: None,
                },
                options.column_detection,
            )?;
            let page_report = PageReport::new(
                0,
                OcrSource::Ocr,
                Some(ocr_backend.name().to_string()),
                &result,
                column_count,
                options.low_confidence_threshold,
            );
            Ok(DocumentExtraction {
                ocr_result: result,
                pdf_page_count: None,
                pdf_page_index: None,
                pages: vec![page_report],
                page_count: 1,
            })
        }
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{extract_pages, PdfPagePayload, PdfRasterConfig};
                let payloads = extract_pages(input, PdfRasterConfig::new())?;
                let mut page_results = Vec::with_capacity(payloads.len());
                let mut pages = Vec::with_capacity(payloads.len());
                let mut pdf_page_count = None;
                let mut first_page_index = None;

                for payload in payloads {
                    // Overwritten on every iteration, so this ends up holding
                    // the count reported by the last payload.
                    pdf_page_count = Some(payload.page_count());
                    // Only the first payload's index is recorded.
                    if first_page_index.is_none() {
                        first_page_index = Some(payload.page_index());
                    }
                    match payload {
                        PdfPagePayload::VectorText {
                            text, page_index, ..
                        } => {
                            // Vector text bypasses the OCR backend; the result
                            // is built with no confidence value and the
                            // "vector-pdf" label.
                            let result = OcrResult::new(text, None, 0, "vector-pdf".to_string());
                            pages.push(PageReport::new(
                                page_index,
                                OcrSource::VectorPdf,
                                None,
                                &result,
                                1,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                        PdfPagePayload::Raster(raster) => {
                            let (result, column_count) = recognize_image(
                                ocr_backend,
                                ImageInput {
                                    bytes: raster.png_bytes,
                                    format: ImageFormat::Png,
                                    dpi: None,
                                },
                                options.column_detection,
                            )?;
                            pages.push(PageReport::new(
                                raster.page_index,
                                OcrSource::Ocr,
                                Some(ocr_backend.name().to_string()),
                                &result,
                                column_count,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                    }
                }

                Ok(DocumentExtraction {
                    ocr_result: merge_page_results(&page_results),
                    pdf_page_count,
                    pdf_page_index: first_page_index,
                    // NOTE(review): `as u32` truncates silently if the page
                    // list ever exceeds `u32::MAX` entries.
                    page_count: pages.len() as u32,
                    pages,
                })
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
    }
}
619
/// Preprocess one image, run it through `ocr_backend` with default hints, and
/// assemble the returned spans into an [`OcrResult`] plus a column count.
///
/// The language label of the result is the primary language of the default
/// hints. Backend errors are translated into [`DocumentError`].
#[cfg(feature = "ocr-tesseract")]
fn recognize_image(
    ocr_backend: &dyn OcrBackend,
    image: ImageInput,
    column_detection: bool,
) -> Result<(OcrResult, u32), DocumentError> {
    let ocr_hints = OcrHints::default();
    // Capture the language before the hints are moved into the backend call.
    let language = ocr_hints.primary_language().to_string();
    let prepared = crate::preprocess::preprocess_image(image);
    let recognized = match ocr_backend.recognize(prepared, ocr_hints) {
        Ok(spans) => spans,
        Err(err) => return Err(map_ocr_error_to_document_error(err)),
    };
    let assembled =
        OcrResult::from_spans_with_column_detection(&recognized, language, column_detection);
    Ok(assembled)
}
638
/// Combine per-page results into one document-level result.
///
/// Page texts are joined with a blank line between them. The merged mean
/// confidence is the word-count-weighted mean over the pages that carry a
/// confidence value, and `None` when no such page contributes any words. The
/// merged word count covers only those pages, and the language label is
/// always `"mixed"`.
#[cfg(feature = "ocr-tesseract")]
fn merge_page_results(results: &[OcrResult]) -> OcrResult {
    let page_texts: Vec<&str> = results.iter().map(|page| page.text.as_str()).collect();
    let merged_text = page_texts.join("\n\n");

    let (weighted_sum, counted_words) = results
        .iter()
        .filter_map(|page| page.mean_confidence.map(|mean| (mean, page.word_count)))
        .fold((0.0f64, 0usize), |(sum, words), (mean, count)| {
            (sum + mean as f64 * count as f64, words + count)
        });

    let merged_confidence = if counted_words > 0 {
        Some((weighted_sum / counted_words as f64) as f32)
    } else {
        None
    };
    OcrResult::new(
        merged_text,
        merged_confidence,
        counted_words,
        "mixed".to_string(),
    )
}
661
/// Translate a backend-level [`OcrError`] into the document error surface.
///
/// Initialization and recognition failures map onto the Tesseract-named
/// variants, with a status of `-1` for recognition failures. Unsupported
/// formats become `UnsupportedInput` with an empty path, because the source
/// path is not available at this level. Internal errors become `Pipeline`
/// errors prefixed with `ocr: `.
#[cfg(feature = "ocr-tesseract")]
fn map_ocr_error_to_document_error(err: OcrError) -> DocumentError {
    match err {
        OcrError::InitFailed(hint) => DocumentError::TesseractNotFound(hint),
        OcrError::RecognizeFailed(detail) => {
            let status = -1;
            DocumentError::TesseractFailed {
                status,
                stderr: detail,
            }
        }
        OcrError::UnsupportedFormat(format) => {
            let reason = match format {
                ImageFormat::Png => "png image format is not supported by the OCR backend",
                ImageFormat::Jpeg => "jpeg image format is not supported by the OCR backend",
                ImageFormat::Tiff => "tiff image format is not supported by the OCR backend",
            };
            DocumentError::UnsupportedInput {
                path: PathBuf::new(),
                reason,
            }
        }
        OcrError::Internal(detail) => DocumentError::Pipeline(format!("ocr: {detail}")),
    }
}
681
/// Build the Gaze redaction pipeline used for document text.
///
/// Registers an email detector, a regex phone detector (custom class
/// `phone`), and an anchored recipient-name recognizer. Email, phone, and
/// name matches are tokenized; everything else is preserved by the default
/// rule.
///
/// # Errors
/// Returns a stage-tagged `DocumentError::Pipeline` when a detector regex
/// fails to build or the pipeline builder rejects the configuration.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn build_document_pipeline() -> Result<GazePipeline, DocumentError> {
    let email = RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    // Conservative phone pattern: optional `+CC`, area, exchange, line, with
    // common separators. Synthetic fixture uses `+1-555-0142`-style numbers.
    let phone = RegexDetector::new(
        r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}",
        PiiClass::custom("phone"),
    )
    .map_err(|err| pipeline_err("phone-regex", err))?;
    // Invoice / shipping recipient block names. Scope is intentionally
    // local to gaze-document (rather than extending the locale-en
    // `forward_markers` bucket): forwarded-email cues and document
    // recipient blocks are semantically distinct anchors and should not
    // share a bucket. `LineEnd` boundary stops the name span at the
    // newline that ends the recipient line so a follow-up `Email:` row
    // cannot be absorbed into the Name match.
    let recipient_name = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        vec![
            "Bill to".to_string(),
            "Invoice to".to_string(),
            "Ship to".to_string(),
            "Attention".to_string(),
            "Attn".to_string(),
        ],
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );
    GazePipeline::builder()
        .detector(email)
        .detector(phone)
        .recognizer(recipient_name)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
729
/// Tally emitted token spans by canonical class name.
///
/// The result holds one entry per class, ordered by class name because the
/// tally is kept in a `BTreeMap`.
#[cfg(feature = "ocr-tesseract")]
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let tally = spans
        .iter()
        .fold(BTreeMap::<String, u32>::new(), |mut acc, span| {
            *acc.entry(span.class.to_canonical_str()).or_default() += 1;
            acc
        });
    tally
        .into_iter()
        .map(|(class, count)| ClassCount::new(class, count))
        .collect()
}
741
/// Persist the three bundle artifacts into their target directories.
///
/// `clean.md` and `report.json` go to the agent directory and
/// `manifest.json` goes to the owner directory. Both directories must already
/// exist.
///
/// The manifest and report are serialized before the first file is written,
/// so a serialization failure cannot leave a partially written bundle behind.
/// The write order itself is unchanged: clean markdown, manifest, report.
///
/// # Errors
/// Returns [`DocumentError`] when JSON serialization or any file write fails.
#[cfg(feature = "ocr-tesseract")]
fn write_bundle(
    agent_out: &AgentBundleDir,
    owner_out: &OwnerBundleDir,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    // Do all fallible in-memory work up front.
    let manifest_json = serde_json::to_vec_pretty(manifest)?;
    let report_json = serde_json::to_vec_pretty(report)?;

    fs::write(
        agent_out.as_path().join(CLEAN_MARKDOWN_FILE),
        clean_markdown,
    )?;
    fs::write(owner_out.as_path().join(MANIFEST_FILE), manifest_json)?;
    fs::write(agent_out.as_path().join(REPORT_FILE), report_json)?;
    Ok(())
}
760
/// Validate the agent/owner layout, create both directories, and return their
/// resolved paths as `(agent, owner)`.
///
/// The layout is checked twice: first on lexically normalized paths, before
/// anything is created, and again on canonicalized paths after creation. If
/// canonicalization fails, the lexically normalized path is used instead.
///
/// # Errors
/// Returns a layout error for equal or nested directories, or
/// `DocumentError::OutputDir` when a directory cannot be created.
#[cfg(feature = "ocr-tesseract")]
fn prepare_bundle_dirs(
    agent_out: &AgentBundleDir,
    owner_out: &OwnerBundleDir,
) -> Result<(PathBuf, PathBuf), DocumentError> {
    let agent_path = agent_out.as_path();
    let owner_path = owner_out.as_path();

    let lexical_agent = normalize_for_layout(agent_path);
    let lexical_owner = normalize_for_layout(owner_path);
    validate_bundle_layout(&lexical_agent, &lexical_owner)?;

    if let Err(err) = fs::create_dir_all(agent_path) {
        return Err(DocumentError::OutputDir(lexical_agent, err));
    }
    if let Err(err) = fs::create_dir_all(owner_path) {
        return Err(DocumentError::OutputDir(lexical_owner, err));
    }

    let resolved_agent = fs::canonicalize(agent_path).unwrap_or(lexical_agent);
    let resolved_owner = fs::canonicalize(owner_path).unwrap_or(lexical_owner);
    validate_bundle_layout(&resolved_agent, &resolved_owner)?;
    Ok((resolved_agent, resolved_owner))
}
780
781fn validate_non_empty_path(path: &Path) -> Result<(), DocumentError> {
782    if path.as_os_str().is_empty() {
783        return Err(DocumentError::BundleLayoutInvalid {
784            reason: BundleLayoutInvalidReason::EmptyPath,
785        });
786    }
787    Ok(())
788}
789
790fn validate_bundle_layout(agent: &Path, owner: &Path) -> Result<(), DocumentError> {
791    if agent == owner {
792        return Err(DocumentError::BundleLayoutInvalid {
793            reason: BundleLayoutInvalidReason::AgentEqualsOwner,
794        });
795    }
796    if agent.starts_with(owner) {
797        return Err(DocumentError::BundleLayoutInvalid {
798            reason: BundleLayoutInvalidReason::AgentNestedInOwner,
799        });
800    }
801    if owner.starts_with(agent) {
802        return Err(DocumentError::BundleLayoutInvalid {
803            reason: BundleLayoutInvalidReason::OwnerNestedInAgent,
804        });
805    }
806    Ok(())
807}
808
809fn normalize_for_layout(path: &Path) -> PathBuf {
810    let absolute = absolutize(path);
811    let mut normalized = PathBuf::new();
812    for component in absolute.components() {
813        match component {
814            Component::CurDir => {}
815            Component::ParentDir => {
816                normalized.pop();
817            }
818            Component::Prefix(prefix) => normalized.push(prefix.as_os_str()),
819            Component::RootDir => normalized.push(component.as_os_str()),
820            Component::Normal(part) => normalized.push(part),
821        }
822    }
823    normalized
824}
825
/// Render the `clean.md` body: a fixed header, the source-kind line, a
/// horizontal rule, then `text`, with a trailing newline added when `text`
/// does not already end with one.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let trailing_newline = if text.ends_with('\n') { "" } else { "\n" };
    format!(
        "# gaze-document safe bundle\n\nSource kind: `{}`\n\n---\n\n{}{}",
        kind_label(kind),
        text,
        trailing_newline
    )
}
838
/// Lowercase label for an input kind.
///
/// Used as the report's `input_kind` value and in the `Source kind:` line of
/// `clean.md`.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    match kind {
        InputKind::Png => "png",
        InputKind::Jpeg => "jpeg",
        InputKind::Pdf => "pdf",
    }
}
847
/// Make `path` absolute by joining it onto the current working directory.
///
/// Absolute paths are returned unchanged. If the working directory cannot be
/// determined, the path is returned as given. No normalization or symlink
/// resolution is performed.
fn absolutize(path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path),
        Err(_) => path.to_path_buf(),
    }
}
857
/// Builds a [`DocumentError::Pipeline`] whose message is `"<stage>: <err>"`.
///
/// `stage` names the step that failed; `err` is rendered through its
/// `Display` implementation.
#[cfg(feature = "ocr-tesseract")]
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    let message = format!("{stage}: {err}");
    DocumentError::Pipeline(message)
}
862
863#[cfg(all(test, feature = "ocr-tesseract"))]
864mod tests {
865    use super::*;
866    use crate::ocr::{BBox, OcrSpan};
867
868    #[derive(Debug)]
869    struct MockBackend {
870        spans: Vec<OcrSpan>,
871    }
872
873    impl OcrBackend for MockBackend {
874        fn name(&self) -> &str {
875            "mock-ocr"
876        }
877
878        fn recognize(
879            &self,
880            _image: ImageInput,
881            _hints: OcrHints,
882        ) -> Result<Vec<OcrSpan>, OcrError> {
883            Ok(self.spans.clone())
884        }
885    }
886
887    fn span(text: &str, x: u32, y: u32, confidence: f32) -> OcrSpan {
888        OcrSpan {
889            text: text.to_string(),
890            bbox: BBox { x, y, w: 90, h: 16 },
891            confidence: Some(confidence),
892        }
893    }
894
895    fn bundle_dirs(tmp: &tempfile::TempDir) -> (AgentBundleDir, OwnerBundleDir) {
896        (
897            AgentBundleDir::new(tmp.path().join("agent")).expect("agent dir"),
898            OwnerBundleDir::new(tmp.path().join("owner")).expect("owner dir"),
899        )
900    }
901
902    #[test]
903    fn count_pii_by_class_groups_email_and_phone() {
904        let spans = vec![
905            EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email),
906            EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email),
907            EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone")),
908        ];
909        let counts = count_pii_by_class(&spans);
910        assert_eq!(counts.len(), 2);
911        let by_class: BTreeMap<_, _> = counts.iter().map(|c| (c.class.as_str(), c.count)).collect();
912        assert_eq!(by_class.get("email"), Some(&2));
913        assert_eq!(by_class.get("custom:phone"), Some(&1));
914    }
915
916    #[test]
917    fn report_serializes_with_bundle_version() {
918        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
919        let report = BundleReport::new(
920            "png",
921            &ocr,
922            42,
923            3,
924            vec![
925                ClassCount::new("email", 2),
926                ClassCount::new("custom:phone", 1),
927            ],
928            None,
929            None,
930            vec![PageReport::new(
931                0,
932                OcrSource::Ocr,
933                Some("tesseract".to_string()),
934                &ocr,
935                1,
936                DEFAULT_LOW_CONFIDENCE_THRESHOLD,
937            )],
938            DEFAULT_LOW_CONFIDENCE_THRESHOLD,
939        );
940        let json = serde_json::to_value(&report).expect("serialize");
941        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
942        assert_eq!(json["input_kind"], "png");
943        assert_eq!(json["pii_token_count"], 3);
944        assert_eq!(json["pages"][0]["ocr_source"], "ocr");
945        assert_eq!(
946            json["low_confidence_threshold"],
947            DEFAULT_LOW_CONFIDENCE_THRESHOLD
948        );
949    }
950
951    #[test]
952    fn v1_report_without_page_fields_still_deserializes() {
953        let json = serde_json::json!({
954            "bundle_version": 1,
955            "input_kind": "png",
956            "ocr_mean_confidence": 90.0,
957            "ocr_word_count": 2,
958            "ocr_lang": "eng",
959            "clean_char_count": 12,
960            "pii_token_count": 1,
961            "pii_tokens_by_class": [{ "class": "email", "count": 1 }],
962            "pdf_page_count": null,
963            "pdf_page_index": null
964        });
965
966        let report: BundleReport = serde_json::from_value(json).expect("v1 parses");
967
968        assert_eq!(report.bundle_version, 1);
969        assert!(report.pages.is_empty());
970        assert_eq!(
971            report.low_confidence_threshold,
972            DEFAULT_LOW_CONFIDENCE_THRESHOLD
973        );
974    }
975
976    #[test]
977    fn clean_with_mock_backend_flags_low_confidence_and_columns() {
978        let backend = MockBackend {
979            spans: vec![
980                span("Bill", 20, 10, 0.50),
981                span("to:", 116, 10, 0.50),
982                span("Jane", 20, 36, 0.50),
983                span("Doe", 116, 36, 0.50),
984                span("Email:", 360, 10, 0.50),
985                span("alice@example.invalid", 360, 36, 0.50),
986            ],
987        };
988        let tmp = tempfile::tempdir().expect("tempdir");
989        let input = tmp.path().join("input.png");
990        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
991        let (agent_out, owner_out) = bundle_dirs(&tmp);
992
993        let bundle = Pipeline::new()
994            .with_low_confidence_threshold(0.65)
995            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
996            .expect("clean succeeds");
997
998        assert_eq!(bundle.report.bundle_version, 2);
999        assert_eq!(bundle.report.pages.len(), 1);
1000        let page = &bundle.report.pages[0];
1001        assert_eq!(page.ocr_backend.as_deref(), Some("mock-ocr"));
1002        assert_eq!(page.column_count, 2);
1003        assert_eq!(page.confidence, Some(0.5));
1004        assert!(page.low_confidence);
1005        assert!(
1006            bundle.clean_markdown.contains(":Email_"),
1007            "{}",
1008            bundle.clean_markdown
1009        );
1010        assert!(
1011            !bundle.clean_markdown.contains("alice@example.invalid"),
1012            "{}",
1013            bundle.clean_markdown
1014        );
1015    }
1016
1017    #[test]
1018    fn clean_with_mock_backend_preserves_table_cell_context() {
1019        let backend = MockBackend {
1020            spans: vec![
1021                span("Field", 20, 10, 0.92),
1022                span("Value", 160, 10, 0.92),
1023                span("Bill", 20, 40, 0.92),
1024                span("Jane", 160, 40, 0.92),
1025                span("Email", 20, 70, 0.92),
1026                span("alice@example.invalid", 160, 70, 0.92),
1027            ],
1028        };
1029        let tmp = tempfile::tempdir().expect("tempdir");
1030        let input = tmp.path().join("input.png");
1031        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
1032        let (agent_out, owner_out) = bundle_dirs(&tmp);
1033
1034        let bundle = Pipeline::new()
1035            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
1036            .expect("clean succeeds");
1037
1038        assert_eq!(bundle.report.pages[0].column_count, 1);
1039        assert!(
1040            bundle.clean_markdown.contains("Field\nValue\n\nBill\nJane"),
1041            "{}",
1042            bundle.clean_markdown
1043        );
1044        assert!(
1045            bundle.clean_markdown.contains(":Email_"),
1046            "{}",
1047            bundle.clean_markdown
1048        );
1049        assert!(
1050            !bundle.clean_markdown.contains("alice@example.invalid"),
1051            "{}",
1052            bundle.clean_markdown
1053        );
1054    }
1055
1056    #[cfg(feature = "pdf-input")]
1057    #[test]
1058    fn clean_preprocesses_rotated_image_before_backend_ocr() {
1059        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
1060
1061        #[derive(Debug)]
1062        struct OrientationSensitiveBackend;
1063
1064        impl OcrBackend for OrientationSensitiveBackend {
1065            fn name(&self) -> &str {
1066                "orientation-sensitive"
1067            }
1068
1069            fn recognize(
1070                &self,
1071                image: ImageInput,
1072                _hints: OcrHints,
1073            ) -> Result<Vec<OcrSpan>, OcrError> {
1074                let decoded = image::load_from_memory(&image.bytes)
1075                    .map_err(|err| OcrError::Internal(err.to_string()))?;
1076                if decoded.width() <= decoded.height() {
1077                    return Ok(Vec::new());
1078                }
1079                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
1080            }
1081        }
1082
1083        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
1084        for y in 38..42 {
1085            for x in 16..104 {
1086                image.put_pixel(x, y, Luma([0]));
1087            }
1088        }
1089        let sideways = image::imageops::rotate90(&image);
1090        let mut bytes = Vec::new();
1091        sideways
1092            .write_to(
1093                &mut std::io::Cursor::new(&mut bytes),
1094                EncodedImageFormat::Png,
1095            )
1096            .expect("encode png");
1097        let tmp = tempfile::tempdir().expect("tempdir");
1098        let input = tmp.path().join("input.png");
1099        fs::write(&input, bytes).expect("write input");
1100        let (agent_out, owner_out) = bundle_dirs(&tmp);
1101
1102        let bundle = Pipeline::new()
1103            .clean_with_ocr_backend(&input, agent_out, owner_out, &OrientationSensitiveBackend)
1104            .expect("clean succeeds");
1105
1106        assert!(
1107            bundle.clean_markdown.contains(":Email_"),
1108            "{}",
1109            bundle.clean_markdown
1110        );
1111        assert!(
1112            !bundle.clean_markdown.contains("alice@example.invalid"),
1113            "{}",
1114            bundle.clean_markdown
1115        );
1116    }
1117
    /// A slightly skewed page must be straightened before it reaches the
    /// backend: the backend used here reports text only when the image's
    /// horizontal score is clearly above the score of the raw skewed payload.
    #[cfg(feature = "pdf-input")]
    #[test]
    fn clean_deskews_image_before_backend_ocr() {
        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
        use imageproc::geometric_transformations::{rotate_about_center, Interpolation};

        /// Sums, over all rows, the squared number of dark pixels (luma below
        /// 200) in that row. Squaring rewards rows that hold many dark pixels,
        /// so the score grows when the dark pixels are packed into few rows.
        fn horizontal_score(bytes: &[u8]) -> Result<u64, OcrError> {
            let decoded = image::load_from_memory(bytes)
                .map_err(|err| OcrError::Internal(err.to_string()))?
                .to_luma8();
            let mut score = 0u64;
            for y in 0..decoded.height() {
                let mut dark = 0u64;
                for x in 0..decoded.width() {
                    if decoded.get_pixel(x, y).0[0] < 200 {
                        dark += 1;
                    }
                }
                score = score.saturating_add(dark.saturating_mul(dark));
            }
            Ok(score)
        }

        /// Backend that returns an email span only when the image scores at
        /// least `minimum_score`, and no spans otherwise.
        #[derive(Debug)]
        struct DeskewSensitiveBackend {
            minimum_score: u64,
        }

        impl OcrBackend for DeskewSensitiveBackend {
            fn name(&self) -> &str {
                "deskew-sensitive"
            }

            fn recognize(
                &self,
                image: ImageInput,
                _hints: OcrHints,
            ) -> Result<Vec<OcrSpan>, OcrError> {
                if horizontal_score(&image.bytes)? < self.minimum_score {
                    return Ok(Vec::new());
                }
                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
            }
        }

        // White 120x80 canvas with a black bar covering x in 16..104, y in 38..42.
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 38..42 {
            for x in 16..104 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        // Skew the bar by 4 degrees, filling uncovered corners with white.
        let skewed = rotate_about_center(
            &image,
            4.0_f32.to_radians(),
            Interpolation::Nearest,
            Luma([255]),
        );
        let mut bytes = Vec::new();
        skewed
            .write_to(
                &mut std::io::Cursor::new(&mut bytes),
                EncodedImageFormat::Png,
            )
            .expect("encode png");
        // Calibrate the threshold just above the raw payload's score, so the
        // backend can only succeed if preprocessing raises the score.
        let raw_score = horizontal_score(&bytes).expect("raw score");
        let backend = DeskewSensitiveBackend {
            minimum_score: raw_score + 1_000,
        };
        // Sanity check: fed the untouched bytes directly, the backend finds nothing.
        assert!(
            backend
                .recognize(
                    ImageInput {
                        bytes: bytes.clone(),
                        format: ImageFormat::Png,
                        dpi: None
                    },
                    OcrHints::default()
                )
                .expect("raw recognize")
                .is_empty(),
            "raw skewed payload should miss before preprocessing"
        );
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, bytes).expect("write input");
        let (agent_out, owner_out) = bundle_dirs(&tmp);

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
            .expect("clean succeeds");

        // The email token appearing means the backend saw an image that cleared
        // the threshold; the raw address itself must not leak into the markdown.
        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }
1220
1221    #[test]
1222    fn format_clean_markdown_appends_trailing_newline() {
1223        let md = format_clean_markdown("hello", InputKind::Png);
1224        assert!(md.ends_with('\n'));
1225        assert!(md.contains("Source kind: `png`"));
1226        assert!(md.contains("hello"));
1227    }
1228}
gaze_document/bundle/mod.rs

gaze_document/bundle/
mod.rs