gaze_document/bundle/
mod.rs

1//! SafeBundle generation: OCR + Gaze redact → on-disk artifacts.
2//!
3//! The top-level [`clean`] function is the public adopter entry point. It
4//! routes any supported input (PNG / JPG / single-page PDF) through OCR,
5//! pipes the extracted text through a [`gaze::Pipeline`], and persists the
6//! result as three files in a target directory:
7//!
8//! ```text
9//! out/
10//!   clean.md        # OCR text with PII replaced by reversible tokens
11//!   manifest.json   # gaze::Manifest — restorable, canonical
12//!   report.json     # BundleReport — OCR + PII counts + provenance
13//! ```
14//!
15//! The manifest contract is the same one the rest of the gaze runtime
16//! uses (`gaze::Manifest`). Adopters can pair `clean.md` with `manifest.json`
17//! and restore via the standard gaze session APIs.
18
19use std::path::PathBuf;
20
21use gaze::Manifest;
22use serde::{Deserialize, Serialize};
23
24use crate::ocr::{
25    detect_image_format, ImageFormat, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult,
26};
27
28#[cfg(feature = "ocr-tesseract")]
29use std::collections::BTreeMap;
30#[cfg(feature = "ocr-tesseract")]
31use std::fs;
32#[cfg(feature = "ocr-tesseract")]
33use std::path::Path;
34
35#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
36use gaze::{
37    Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline as GazePipeline,
38    RawDocument, Scope, Session,
39};
40#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
41use gaze_recognizers::{
42    AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
43};
44#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
45use gaze_types::{EmittedTokenSpan, PiiClass};
46
47#[cfg(feature = "ocr-tesseract")]
48use crate::extract::InputKind;
49#[cfg(feature = "ocr-tesseract")]
50use crate::DocumentError;
51
/// Schema version stamped into `report.json`; bump it whenever the report
/// shape changes incompatibly.
pub const BUNDLE_VERSION: u32 = 2;
/// Low-confidence threshold used when the caller does not supply one.
const DEFAULT_LOW_CONFIDENCE_THRESHOLD: f32 = 0.65;

/// File name of the tokenized Markdown inside the bundle output directory.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// File name of the restorable manifest inside the bundle output directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// File name of the OCR + PII provenance report inside the bundle output directory.
pub const REPORT_FILE: &str = "report.json";
62
63/// Post-ingestion artifact paired with a Gaze [`Manifest`].
64#[non_exhaustive]
65#[derive(Debug, Clone)]
66pub struct SafeBundle {
67    /// Tokenized Markdown safe to hand to an LLM.
68    pub clean_markdown: String,
69    /// Reversible manifest produced by the gaze pipeline.
70    pub manifest: Manifest,
71    /// Opaque layout summary (reserved — single-page in v0.0.x).
72    pub layout: LayoutSummary,
73    /// Optional rasterized preview of the source document (reserved).
74    pub preview_png: Option<Vec<u8>>,
75    /// Per-bundle audit + provenance report.
76    pub report: BundleReport,
77    /// Absolute path of the input that produced this bundle.
78    pub source_path: PathBuf,
79    /// Absolute path of the output directory that received this bundle.
80    pub out_dir: PathBuf,
81}
82
83impl SafeBundle {
84    /// Build a [`SafeBundle`] from its component parts.
85    pub fn new(
86        clean_markdown: String,
87        manifest: Manifest,
88        layout: LayoutSummary,
89        preview_png: Option<Vec<u8>>,
90        report: BundleReport,
91        source_path: PathBuf,
92        out_dir: PathBuf,
93    ) -> Self {
94        Self {
95            clean_markdown,
96            manifest,
97            layout,
98            preview_png,
99            report,
100            source_path,
101            out_dir,
102        }
103    }
104}
105
106/// Per-class PII detection count for [`BundleReport`].
107#[non_exhaustive]
108#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
109pub struct ClassCount {
110    /// Audit-canonical class name (e.g., `"email"`, `"custom:phone"`).
111    pub class: String,
112    /// Number of token spans emitted for that class.
113    pub count: u32,
114}
115
116impl ClassCount {
117    /// Build a class-count entry.
118    pub fn new(class: impl Into<String>, count: u32) -> Self {
119        Self {
120            class: class.into(),
121            count,
122        }
123    }
124}
125
126/// Per-page extraction source.
127#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
128#[serde(rename_all = "snake_case")]
129pub enum OcrSource {
130    /// Selectable text extracted directly from a PDF page.
131    VectorPdf,
132    /// Raster OCR from an image page.
133    Ocr,
134}
135
136/// Per-page OCR/layout provenance.
137#[non_exhaustive]
138#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
139pub struct PageReport {
140    /// Zero-based page index.
141    pub page_index: i32,
142    /// Extraction path used for this page.
143    pub ocr_source: OcrSource,
144    /// OCR backend name when [`OcrSource::Ocr`] produced the page.
145    pub ocr_backend: Option<String>,
146    /// Aggregated page confidence in `0.0..=1.0`. `None` for vector PDF text.
147    pub confidence: Option<f32>,
148    /// True when confidence is present and below the configured threshold.
149    pub low_confidence: bool,
150    /// Detected text column count. `1` means single-column.
151    pub column_count: u32,
152    /// Number of OCR words with confidence for this page.
153    pub ocr_word_count: usize,
154    /// Legacy percent-scale mean confidence for this page.
155    pub ocr_mean_confidence: Option<f32>,
156}
157
158impl PageReport {
159    fn new(
160        page_index: i32,
161        ocr_source: OcrSource,
162        ocr_backend: Option<String>,
163        ocr: &OcrResult,
164        column_count: u32,
165        low_confidence_threshold: f32,
166    ) -> Self {
167        let confidence = ocr.mean_confidence_unit();
168        Self {
169            page_index,
170            ocr_source,
171            ocr_backend,
172            confidence,
173            low_confidence: confidence
174                .map(|confidence| confidence < low_confidence_threshold)
175                .unwrap_or(false),
176            column_count,
177            ocr_word_count: ocr.word_count,
178            ocr_mean_confidence: ocr.mean_confidence,
179        }
180    }
181}
182
183/// Bundle audit + provenance report serialized to `report.json`.
184///
185/// Schema versioned via [`BUNDLE_VERSION`]; older readers can branch on the
186/// `bundle_version` field. Field set is `#[non_exhaustive]` so additive
187/// extensions are SemVer-safe.
188#[non_exhaustive]
189#[derive(Debug, Clone, Serialize, Deserialize)]
190pub struct BundleReport {
191    /// Schema version (currently [`BUNDLE_VERSION`]).
192    pub bundle_version: u32,
193    /// Input kind detected from the source path.
194    pub input_kind: String,
195    /// Mean per-word Tesseract confidence (0..100). `None` when zero words.
196    pub ocr_mean_confidence: Option<f32>,
197    /// Number of words Tesseract emitted with non-negative confidence.
198    pub ocr_word_count: usize,
199    /// Tesseract language code used for OCR (e.g., `"eng"`).
200    pub ocr_lang: String,
201    /// Character count of the tokenized Markdown output.
202    pub clean_char_count: usize,
203    /// Total PII token spans across all classes.
204    pub pii_token_count: u32,
205    /// Per-class breakdown of PII token counts.
206    pub pii_tokens_by_class: Vec<ClassCount>,
207    /// PDF page count when the input was a PDF. `None` for image inputs.
208    pub pdf_page_count: Option<i32>,
209    /// PDF page index that was rasterized. `None` for image inputs.
210    pub pdf_page_index: Option<i32>,
211    /// Per-page extraction, confidence, and layout provenance.
212    #[serde(default)]
213    pub pages: Vec<PageReport>,
214    /// Confidence threshold used to set [`PageReport::low_confidence`].
215    #[serde(default = "default_low_confidence_threshold")]
216    pub low_confidence_threshold: f32,
217}
218
219impl BundleReport {
220    /// Build a [`BundleReport`] from its component parts.
221    #[allow(clippy::too_many_arguments)]
222    pub fn new(
223        input_kind: impl Into<String>,
224        ocr: &OcrResult,
225        clean_char_count: usize,
226        pii_token_count: u32,
227        pii_tokens_by_class: Vec<ClassCount>,
228        pdf_page_count: Option<i32>,
229        pdf_page_index: Option<i32>,
230        pages: Vec<PageReport>,
231        low_confidence_threshold: f32,
232    ) -> Self {
233        Self {
234            bundle_version: BUNDLE_VERSION,
235            input_kind: input_kind.into(),
236            ocr_mean_confidence: ocr.mean_confidence,
237            ocr_word_count: ocr.word_count,
238            ocr_lang: ocr.lang.clone(),
239            clean_char_count,
240            pii_token_count,
241            pii_tokens_by_class,
242            pdf_page_count,
243            pdf_page_index,
244            pages,
245            low_confidence_threshold,
246        }
247    }
248}
249
250fn default_low_confidence_threshold() -> f32 {
251    DEFAULT_LOW_CONFIDENCE_THRESHOLD
252}
253
254/// Configurable document-cleaning pipeline.
255#[non_exhaustive]
256#[derive(Debug, Clone, Copy)]
257pub struct Pipeline {
258    low_confidence_threshold: f32,
259    column_detection: bool,
260}
261
262impl Pipeline {
263    /// Build a pipeline with conservative defaults.
264    pub fn new() -> Self {
265        Self {
266            low_confidence_threshold: DEFAULT_LOW_CONFIDENCE_THRESHOLD,
267            column_detection: true,
268        }
269    }
270
271    /// Override the per-page low-confidence threshold.
272    pub fn with_low_confidence_threshold(mut self, threshold: f32) -> Self {
273        self.low_confidence_threshold = threshold.clamp(0.0, 1.0);
274        self
275    }
276
277    /// Enable or disable multi-column OCR span ordering.
278    pub fn with_column_detection(mut self, enabled: bool) -> Self {
279        self.column_detection = enabled;
280        self
281    }
282
283    /// Clean one document with an adopter-supplied OCR backend.
284    ///
285    /// # Errors
286    /// Returns [`DocumentError`] for any extraction, OCR, redaction, or write failure.
287    #[cfg(feature = "ocr-tesseract")]
288    #[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
289    pub fn clean_with_ocr_backend(
290        &self,
291        input: &Path,
292        out_dir: &Path,
293        ocr_backend: &dyn OcrBackend,
294    ) -> Result<SafeBundle, DocumentError> {
295        clean_with_options(input, out_dir, ocr_backend, *self)
296    }
297}
298
299impl Default for Pipeline {
300    fn default() -> Self {
301        Self::new()
302    }
303}
304
/// Opaque layout summary placeholder.
///
/// For now it records nothing but a page count; richer layout data is
/// reserved for later.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages covered by this summary.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Summary for a document with exactly one page.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Summary for a document with `page_count` pages.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
327
/// Top-level entry point: clean one document and write its [`SafeBundle`]
/// to disk using the built-in Tesseract backend and default [`Pipeline`]
/// settings.
///
/// `input` must be a regular file with extension `.png`, `.jpg`, `.jpeg`, or
/// `.pdf`. `out_dir` is created if missing and receives the three bundle
/// files described in the module docs.
///
/// # Errors
///
/// Returns [`DocumentError`] for any failure in the OCR → redact → write
/// chain.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(input: &Path, out_dir: &Path) -> Result<SafeBundle, DocumentError> {
    let tesseract = crate::ocr::TesseractBackend::new();
    let pipeline = Pipeline::new();
    pipeline.clean_with_ocr_backend(input, out_dir, &tesseract)
}
345
/// Top-level entry point that cleans one document with a caller-provided
/// OCR backend and default [`Pipeline`] settings.
///
/// The backend is handed single-image bytes; PDF handling and the
/// downstream layout/report generation stay inside `gaze-document`.
///
/// # Errors
///
/// Returns [`DocumentError`] for any failure in the OCR → redact → write
/// chain. OCR backend errors are translated into the document error surface.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean_with_ocr_backend(
    input: &Path,
    out_dir: &Path,
    ocr_backend: &dyn OcrBackend,
) -> Result<SafeBundle, DocumentError> {
    let pipeline = Pipeline::new();
    pipeline.clean_with_ocr_backend(input, out_dir, ocr_backend)
}
365
/// Run the full extract → normalize → redact → write chain for one document.
///
/// Steps, in order:
/// 1. detect the input kind and create `out_dir` (including parents),
/// 2. extract text via `run_document_extraction`,
/// 3. repair known OCR artifacts, then tokenize PII with the document pipeline,
/// 4. build the manifest and report, write the bundle files, and return the
///    in-memory [`SafeBundle`].
///
/// # Errors
/// Returns [`DocumentError`] when input detection, directory creation,
/// extraction, redaction, or writing fails, or when the pipeline yields a
/// non-text document for text input.
#[cfg(feature = "ocr-tesseract")]
fn clean_with_options(
    input: &Path,
    out_dir: &Path,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let absolute_out = absolutize(out_dir);

    // The clone only happens on the error path; `absolute_out` is still
    // needed for the returned bundle.
    fs::create_dir_all(out_dir)
        .map_err(|err| DocumentError::OutputDir(absolute_out.clone(), err))?;

    let extraction = run_document_extraction(input, kind, ocr_backend, options)?;
    // Repair known narrow OCR artifacts (e.g. spurious whitespace around
    // `@` in emails) before the redact pipeline sees the text. See
    // `crate::ocr::normalize` for the documented rule set. Axis 1
    // (never leak) requires this — the OCR pass occasionally inserts a
    // single space inside an email that would otherwise slip past strict
    // recognizers and survive into clean.md.
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&extraction.ocr_result.text);
    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the leak report returned by the safety net is discarded
    // here — confirm that ignoring it is intentional.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally the per-class counts while the spans are still borrowed, then
    // move the spans into the manifest instead of cloning them.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &extraction.ocr_result,
        clean_text.chars().count(),
        pii_token_count,
        counts,
        extraction.pdf_page_count,
        extraction.pdf_page_index,
        extraction.pages,
        options.low_confidence_threshold,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(out_dir, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::new(extraction.page_count),
        None,
        report,
        absolute_input,
        absolute_out,
    ))
}
433
/// Everything the extraction stage hands to the redaction stage.
#[cfg(feature = "ocr-tesseract")]
struct DocumentExtraction {
    /// OCR result for the whole document (merged across pages for PDFs).
    ocr_result: OcrResult,
    /// Page count reported for a PDF input; `None` for image inputs.
    pdf_page_count: Option<i32>,
    /// Index of the first extracted PDF page; `None` for image inputs.
    pdf_page_index: Option<i32>,
    /// One report per extracted page.
    pages: Vec<PageReport>,
    /// Number of extracted pages.
    page_count: u32,
}
442
/// Extract text from `input` with default [`Pipeline`] options and return
/// only the OCR result plus the PDF page count and page index.
///
/// # Errors
/// Propagates any [`DocumentError`] raised by the extraction step.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(not(feature = "mcp"), allow(dead_code))]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    let defaults = Pipeline::new();
    // Per-page reports and the page count are not part of this return shape.
    let DocumentExtraction {
        ocr_result,
        pdf_page_count,
        pdf_page_index,
        ..
    } = run_document_extraction(input, kind, ocr_backend, defaults)?;
    Ok((ocr_result, pdf_page_count, pdf_page_index))
}
457
/// Extract the text of `input` according to its detected `kind`.
///
/// Image inputs are read from disk and OCR'd as a single page (index `0`).
/// PDF inputs are delegated to `extract_pdf_document` when the `pdf-input`
/// feature is enabled and rejected as unsupported otherwise.
///
/// # Errors
/// Returns [`DocumentError`] when reading the file, detecting the image
/// format, OCR, or PDF extraction fails, or when PDF support is compiled out.
#[cfg(feature = "ocr-tesseract")]
fn run_document_extraction(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<DocumentExtraction, DocumentError> {
    match kind {
        InputKind::Png | InputKind::Jpeg => {
            let bytes = fs::read(input)?;
            let format = detect_image_format(&bytes)?;
            let image = ImageInput {
                bytes,
                format,
                dpi: None,
            };
            let (result, page_report) = ocr_single_page(ocr_backend, image, 0, options)?;
            Ok(DocumentExtraction {
                ocr_result: result,
                pdf_page_count: None,
                pdf_page_index: None,
                pages: vec![page_report],
                page_count: 1,
            })
        }
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                extract_pdf_document(input, ocr_backend, options)
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
    }
}

/// OCR one raster page and build its [`PageReport`], tagging the report with
/// the backend's name.
#[cfg(feature = "ocr-tesseract")]
fn ocr_single_page(
    ocr_backend: &dyn OcrBackend,
    image: ImageInput,
    page_index: i32,
    options: Pipeline,
) -> Result<(OcrResult, PageReport), DocumentError> {
    let (result, column_count) = recognize_image(ocr_backend, image, options.column_detection)?;
    let report = PageReport::new(
        page_index,
        OcrSource::Ocr,
        Some(ocr_backend.name().to_string()),
        &result,
        column_count,
        options.low_confidence_threshold,
    );
    Ok((result, report))
}

/// Extract every page payload of a PDF: vector-text pages are taken as-is,
/// raster pages go through OCR, and the per-page results are merged.
#[cfg(all(feature = "ocr-tesseract", feature = "pdf-input"))]
fn extract_pdf_document(
    input: &Path,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<DocumentExtraction, DocumentError> {
    use crate::extract::pdf::{extract_pages, PdfPagePayload, PdfRasterConfig};

    let payloads = extract_pages(input, PdfRasterConfig::new())?;
    let mut page_results = Vec::with_capacity(payloads.len());
    let mut pages = Vec::with_capacity(payloads.len());
    let mut pdf_page_count = None;
    let mut first_page_index = None;

    for payload in payloads {
        // Read the payload-level facts before the match below consumes it.
        pdf_page_count = Some(payload.page_count());
        if first_page_index.is_none() {
            first_page_index = Some(payload.page_index());
        }
        match payload {
            PdfPagePayload::VectorText {
                text, page_index, ..
            } => {
                // Vector text carries no OCR confidence or word count.
                let result = OcrResult::new(text, None, 0, "vector-pdf".to_string());
                pages.push(PageReport::new(
                    page_index,
                    OcrSource::VectorPdf,
                    None,
                    &result,
                    1,
                    options.low_confidence_threshold,
                ));
                page_results.push(result);
            }
            PdfPagePayload::Raster(raster) => {
                let page_index = raster.page_index;
                let image = ImageInput {
                    bytes: raster.png_bytes,
                    format: ImageFormat::Png,
                    dpi: None,
                };
                let (result, report) = ocr_single_page(ocr_backend, image, page_index, options)?;
                pages.push(report);
                page_results.push(result);
            }
        }
    }

    // Saturate instead of silently wrapping if the page count ever exceeds
    // `u32::MAX`.
    let page_count = pages.len().min(u32::MAX as usize) as u32;

    Ok(DocumentExtraction {
        ocr_result: merge_page_results(&page_results),
        pdf_page_count,
        pdf_page_index: first_page_index,
        pages,
        page_count,
    })
}
565
/// Preprocess one image, run it through `ocr_backend` with default hints,
/// and assemble the resulting spans into an [`OcrResult`].
///
/// Returns the OCR result together with the column count produced by
/// `OcrResult::from_spans_with_column_detection`.
///
/// # Errors
/// Backend failures are mapped into [`DocumentError`].
#[cfg(feature = "ocr-tesseract")]
fn recognize_image(
    ocr_backend: &dyn OcrBackend,
    image: ImageInput,
    column_detection: bool,
) -> Result<(OcrResult, u32), DocumentError> {
    let hints = OcrHints::default();
    // Capture the language label first: `hints` is moved into the backend call.
    let language = hints.primary_language().to_string();
    let prepared = crate::preprocess::preprocess_image(image);
    match ocr_backend.recognize(prepared, hints) {
        Ok(spans) => Ok(OcrResult::from_spans_with_column_detection(
            &spans,
            language,
            column_detection,
        )),
        Err(err) => Err(map_ocr_error_to_document_error(err)),
    }
}
584
/// Combine per-page OCR results into one document-level result.
///
/// Page texts are joined with a blank line. The mean confidence is the
/// word-count-weighted average over the pages that report a confidence
/// (`None` when those pages contribute no words); the word count is the sum
/// over the same pages. The language label is always `"mixed"`.
#[cfg(feature = "ocr-tesseract")]
fn merge_page_results(results: &[OcrResult]) -> OcrResult {
    let page_texts: Vec<&str> = results.iter().map(|page| page.text.as_str()).collect();
    let text = page_texts.join("\n\n");

    let (weighted_sum, word_total) = results
        .iter()
        .filter_map(|page| page.mean_confidence.map(|mean| (mean, page.word_count)))
        .fold((0.0f64, 0usize), |(sum, words), (mean, count)| {
            (sum + mean as f64 * count as f64, words + count)
        });

    let mean_confidence = match word_total {
        0 => None,
        total => Some((weighted_sum / total as f64) as f32),
    };
    OcrResult::new(text, mean_confidence, word_total, "mixed".to_string())
}
607
/// Translate an [`OcrError`] from a backend into the [`DocumentError`]
/// surface.
///
/// Recognition failures carry no process status, so `-1` is reported; the
/// unsupported-format case has no input path available here and uses an
/// empty one.
#[cfg(feature = "ocr-tesseract")]
fn map_ocr_error_to_document_error(err: OcrError) -> DocumentError {
    match err {
        OcrError::InitFailed(hint) => DocumentError::TesseractNotFound(hint),
        OcrError::RecognizeFailed(detail) => {
            let stderr = detail;
            DocumentError::TesseractFailed { status: -1, stderr }
        }
        OcrError::UnsupportedFormat(format) => {
            let reason = match format {
                ImageFormat::Png => "png image format is not supported by the OCR backend",
                ImageFormat::Jpeg => "jpeg image format is not supported by the OCR backend",
                ImageFormat::Tiff => "tiff image format is not supported by the OCR backend",
            };
            DocumentError::UnsupportedInput {
                path: PathBuf::new(),
                reason,
            }
        }
        OcrError::Internal(detail) => DocumentError::Pipeline(format!("ocr: {detail}")),
    }
}
627
/// Build the gaze pipeline used for document text.
///
/// It registers an email detector, a phone regex detector, and an anchored
/// recipient-name recognizer. Emails, phones, and names are tokenized;
/// everything else is preserved.
///
/// # Errors
/// Returns [`DocumentError::Pipeline`] when a detector fails to compile or
/// the pipeline builder rejects the configuration.
// Gated on `ocr-tesseract` only: the body relies on `pipeline_err`, which
// exists only under that feature, so a second `any(ocr-tesseract, mcp)` gate
// would add nothing.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn build_document_pipeline() -> Result<GazePipeline, DocumentError> {
    let email = RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    // Conservative phone pattern: optional `+CC`, area, exchange, line, with
    // common separators. Synthetic fixture uses `+1-555-0142`-style numbers.
    let phone = RegexDetector::new(
        r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}",
        PiiClass::custom("phone"),
    )
    .map_err(|err| pipeline_err("phone-regex", err))?;
    // Invoice / shipping recipient block names. Scope is intentionally
    // local to gaze-document (rather than extending the locale-en
    // `forward_markers` bucket): forwarded-email cues and document
    // recipient blocks are semantically distinct anchors and should not
    // share a bucket. `LineEnd` boundary stops the name span at the
    // newline that ends the recipient line so a follow-up `Email:` row
    // cannot be absorbed into the Name match.
    // NOTE(review): the bare numeric arguments (48, 2, 0.88, 110) are
    // positional tuning values of `AnchoredMatchRecognizer::new`; check its
    // signature before changing any of them.
    let recipient_name = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        vec![
            "Bill to".to_string(),
            "Invoice to".to_string(),
            "Ship to".to_string(),
            "Attention".to_string(),
            "Attn".to_string(),
        ],
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );
    GazePipeline::builder()
        .detector(email)
        .detector(phone)
        .recognizer(recipient_name)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
675
/// Count emitted token spans per PII class.
///
/// Classes are keyed by their canonical string; the returned entries are
/// sorted by that string because a `BTreeMap` is used for the tally.
#[cfg(feature = "ocr-tesseract")]
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let mut tally: BTreeMap<String, u32> = BTreeMap::new();
    for token in spans {
        let slot = tally.entry(token.class.to_canonical_str()).or_default();
        *slot += 1;
    }

    let mut counts = Vec::with_capacity(tally.len());
    for (class, count) in tally {
        counts.push(ClassCount::new(class, count));
    }
    counts
}
687
/// Write the three bundle files (`clean.md`, `manifest.json`, `report.json`)
/// into `out_dir`.
///
/// Both JSON payloads are serialized before anything is written, so a
/// serialization failure cannot leave tokenized Markdown on disk without its
/// manifest. The directory itself must already exist.
///
/// # Errors
/// Returns [`DocumentError`] when serialization or any file write fails. A
/// write failure may still leave earlier files of the bundle in place.
#[cfg(feature = "ocr-tesseract")]
fn write_bundle(
    out_dir: &Path,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    // Do all fallible in-memory work up front; only I/O remains afterwards.
    let manifest_json = serde_json::to_vec_pretty(manifest)?;
    let report_json = serde_json::to_vec_pretty(report)?;

    fs::write(out_dir.join(CLEAN_MARKDOWN_FILE), clean_markdown)?;
    fs::write(out_dir.join(MANIFEST_FILE), manifest_json)?;
    fs::write(out_dir.join(REPORT_FILE), report_json)?;
    Ok(())
}
702
/// Wrap tokenized `text` in the bundle's Markdown frame: a title, the
/// source-kind line, a horizontal rule, then the text itself. A trailing
/// newline is appended when `text` does not already end with one.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let mut markdown = format!(
        "# gaze-document safe bundle\n\nSource kind: `{}`\n\n---\n\n{text}",
        kind_label(kind)
    );
    // The check is on `text`, not on the assembled document: empty text
    // still gets its own terminating newline after the frame.
    if !text.ends_with('\n') {
        markdown.push('\n');
    }
    markdown
}
715
/// Lower-case label for an input kind, as used in the report and in the
/// Markdown header.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    match kind {
        InputKind::Pdf => "pdf",
        InputKind::Jpeg => "jpeg",
        InputKind::Png => "png",
    }
}
724
725#[cfg(feature = "ocr-tesseract")]
/// Make `path` absolute by joining it onto the current working directory.
///
/// Absolute paths are returned unchanged. When the working directory cannot
/// be determined, the path is returned as given. No filesystem access or
/// normalization of `..` components takes place.
fn absolutize(path: &Path) -> PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
735
/// Wrap a displayable error as [`DocumentError::Pipeline`], prefixing the
/// message with the pipeline `stage` it came from.
#[cfg(feature = "ocr-tesseract")]
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    let message = format!("{stage}: {err}");
    DocumentError::Pipeline(message)
}
740
741#[cfg(all(test, feature = "ocr-tesseract"))]
742mod tests {
743    use super::*;
744    use crate::ocr::{BBox, OcrSpan};
745
746    #[derive(Debug)]
747    struct MockBackend {
748        spans: Vec<OcrSpan>,
749    }
750
751    impl OcrBackend for MockBackend {
752        fn name(&self) -> &str {
753            "mock-ocr"
754        }
755
756        fn recognize(
757            &self,
758            _image: ImageInput,
759            _hints: OcrHints,
760        ) -> Result<Vec<OcrSpan>, OcrError> {
761            Ok(self.spans.clone())
762        }
763    }
764
765    fn span(text: &str, x: u32, y: u32, confidence: f32) -> OcrSpan {
766        OcrSpan {
767            text: text.to_string(),
768            bbox: BBox { x, y, w: 90, h: 16 },
769            confidence: Some(confidence),
770        }
771    }
772
773    #[test]
774    fn count_pii_by_class_groups_email_and_phone() {
775        let spans = vec![
776            EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email),
777            EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email),
778            EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone")),
779        ];
780        let counts = count_pii_by_class(&spans);
781        assert_eq!(counts.len(), 2);
782        let by_class: BTreeMap<_, _> = counts.iter().map(|c| (c.class.as_str(), c.count)).collect();
783        assert_eq!(by_class.get("email"), Some(&2));
784        assert_eq!(by_class.get("custom:phone"), Some(&1));
785    }
786
787    #[test]
788    fn report_serializes_with_bundle_version() {
789        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
790        let report = BundleReport::new(
791            "png",
792            &ocr,
793            42,
794            3,
795            vec![
796                ClassCount::new("email", 2),
797                ClassCount::new("custom:phone", 1),
798            ],
799            None,
800            None,
801            vec![PageReport::new(
802                0,
803                OcrSource::Ocr,
804                Some("tesseract".to_string()),
805                &ocr,
806                1,
807                DEFAULT_LOW_CONFIDENCE_THRESHOLD,
808            )],
809            DEFAULT_LOW_CONFIDENCE_THRESHOLD,
810        );
811        let json = serde_json::to_value(&report).expect("serialize");
812        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
813        assert_eq!(json["input_kind"], "png");
814        assert_eq!(json["pii_token_count"], 3);
815        assert_eq!(json["pages"][0]["ocr_source"], "ocr");
816        assert_eq!(
817            json["low_confidence_threshold"],
818            DEFAULT_LOW_CONFIDENCE_THRESHOLD
819        );
820    }
821
822    #[test]
823    fn v1_report_without_page_fields_still_deserializes() {
824        let json = serde_json::json!({
825            "bundle_version": 1,
826            "input_kind": "png",
827            "ocr_mean_confidence": 90.0,
828            "ocr_word_count": 2,
829            "ocr_lang": "eng",
830            "clean_char_count": 12,
831            "pii_token_count": 1,
832            "pii_tokens_by_class": [{ "class": "email", "count": 1 }],
833            "pdf_page_count": null,
834            "pdf_page_index": null
835        });
836
837        let report: BundleReport = serde_json::from_value(json).expect("v1 parses");
838
839        assert_eq!(report.bundle_version, 1);
840        assert!(report.pages.is_empty());
841        assert_eq!(
842            report.low_confidence_threshold,
843            DEFAULT_LOW_CONFIDENCE_THRESHOLD
844        );
845    }
846
847    #[test]
848    fn clean_with_mock_backend_flags_low_confidence_and_columns() {
849        let backend = MockBackend {
850            spans: vec![
851                span("Bill", 20, 10, 0.50),
852                span("to:", 116, 10, 0.50),
853                span("Jane", 20, 36, 0.50),
854                span("Doe", 116, 36, 0.50),
855                span("Email:", 360, 10, 0.50),
856                span("alice@example.invalid", 360, 36, 0.50),
857            ],
858        };
859        let tmp = tempfile::tempdir().expect("tempdir");
860        let input = tmp.path().join("input.png");
861        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
862
863        let bundle = Pipeline::new()
864            .with_low_confidence_threshold(0.65)
865            .clean_with_ocr_backend(&input, tmp.path(), &backend)
866            .expect("clean succeeds");
867
868        assert_eq!(bundle.report.bundle_version, 2);
869        assert_eq!(bundle.report.pages.len(), 1);
870        let page = &bundle.report.pages[0];
871        assert_eq!(page.ocr_backend.as_deref(), Some("mock-ocr"));
872        assert_eq!(page.column_count, 2);
873        assert_eq!(page.confidence, Some(0.5));
874        assert!(page.low_confidence);
875        assert!(
876            bundle.clean_markdown.contains(":Email_"),
877            "{}",
878            bundle.clean_markdown
879        );
880        assert!(
881            !bundle.clean_markdown.contains("alice@example.invalid"),
882            "{}",
883            bundle.clean_markdown
884        );
885    }
886
887    #[test]
888    fn clean_with_mock_backend_preserves_table_cell_context() {
889        let backend = MockBackend {
890            spans: vec![
891                span("Field", 20, 10, 0.92),
892                span("Value", 160, 10, 0.92),
893                span("Bill", 20, 40, 0.92),
894                span("Jane", 160, 40, 0.92),
895                span("Email", 20, 70, 0.92),
896                span("alice@example.invalid", 160, 70, 0.92),
897            ],
898        };
899        let tmp = tempfile::tempdir().expect("tempdir");
900        let input = tmp.path().join("input.png");
901        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
902
903        let bundle = Pipeline::new()
904            .clean_with_ocr_backend(&input, tmp.path(), &backend)
905            .expect("clean succeeds");
906
907        assert_eq!(bundle.report.pages[0].column_count, 1);
908        assert!(
909            bundle.clean_markdown.contains("Field\nValue\n\nBill\nJane"),
910            "{}",
911            bundle.clean_markdown
912        );
913        assert!(
914            bundle.clean_markdown.contains(":Email_"),
915            "{}",
916            bundle.clean_markdown
917        );
918        assert!(
919            !bundle.clean_markdown.contains("alice@example.invalid"),
920            "{}",
921            bundle.clean_markdown
922        );
923    }
924
925    #[cfg(feature = "pdf-input")]
926    #[test]
927    fn clean_preprocesses_rotated_image_before_backend_ocr() {
928        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
929
930        #[derive(Debug)]
931        struct OrientationSensitiveBackend;
932
933        impl OcrBackend for OrientationSensitiveBackend {
934            fn name(&self) -> &str {
935                "orientation-sensitive"
936            }
937
938            fn recognize(
939                &self,
940                image: ImageInput,
941                _hints: OcrHints,
942            ) -> Result<Vec<OcrSpan>, OcrError> {
943                let decoded = image::load_from_memory(&image.bytes)
944                    .map_err(|err| OcrError::Internal(err.to_string()))?;
945                if decoded.width() <= decoded.height() {
946                    return Ok(Vec::new());
947                }
948                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
949            }
950        }
951
952        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
953        for y in 38..42 {
954            for x in 16..104 {
955                image.put_pixel(x, y, Luma([0]));
956            }
957        }
958        let sideways = image::imageops::rotate90(&image);
959        let mut bytes = Vec::new();
960        sideways
961            .write_to(
962                &mut std::io::Cursor::new(&mut bytes),
963                EncodedImageFormat::Png,
964            )
965            .expect("encode png");
966        let tmp = tempfile::tempdir().expect("tempdir");
967        let input = tmp.path().join("input.png");
968        fs::write(&input, bytes).expect("write input");
969
970        let bundle = Pipeline::new()
971            .clean_with_ocr_backend(&input, tmp.path(), &OrientationSensitiveBackend)
972            .expect("clean succeeds");
973
974        assert!(
975            bundle.clean_markdown.contains(":Email_"),
976            "{}",
977            bundle.clean_markdown
978        );
979        assert!(
980            !bundle.clean_markdown.contains("alice@example.invalid"),
981            "{}",
982            bundle.clean_markdown
983        );
984    }
985
986    #[cfg(feature = "pdf-input")]
987    #[test]
988    fn clean_deskews_image_before_backend_ocr() {
989        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
990        use imageproc::geometric_transformations::{rotate_about_center, Interpolation};
991
992        fn horizontal_score(bytes: &[u8]) -> Result<u64, OcrError> {
993            let decoded = image::load_from_memory(bytes)
994                .map_err(|err| OcrError::Internal(err.to_string()))?
995                .to_luma8();
996            let mut score = 0u64;
997            for y in 0..decoded.height() {
998                let mut dark = 0u64;
999                for x in 0..decoded.width() {
1000                    if decoded.get_pixel(x, y).0[0] < 200 {
1001                        dark += 1;
1002                    }
1003                }
1004                score = score.saturating_add(dark.saturating_mul(dark));
1005            }
1006            Ok(score)
1007        }
1008
1009        #[derive(Debug)]
1010        struct DeskewSensitiveBackend {
1011            minimum_score: u64,
1012        }
1013
1014        impl OcrBackend for DeskewSensitiveBackend {
1015            fn name(&self) -> &str {
1016                "deskew-sensitive"
1017            }
1018
1019            fn recognize(
1020                &self,
1021                image: ImageInput,
1022                _hints: OcrHints,
1023            ) -> Result<Vec<OcrSpan>, OcrError> {
1024                if horizontal_score(&image.bytes)? < self.minimum_score {
1025                    return Ok(Vec::new());
1026                }
1027                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
1028            }
1029        }
1030
1031        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
1032        for y in 38..42 {
1033            for x in 16..104 {
1034                image.put_pixel(x, y, Luma([0]));
1035            }
1036        }
1037        let skewed = rotate_about_center(
1038            &image,
1039            4.0_f32.to_radians(),
1040            Interpolation::Nearest,
1041            Luma([255]),
1042        );
1043        let mut bytes = Vec::new();
1044        skewed
1045            .write_to(
1046                &mut std::io::Cursor::new(&mut bytes),
1047                EncodedImageFormat::Png,
1048            )
1049            .expect("encode png");
1050        let raw_score = horizontal_score(&bytes).expect("raw score");
1051        let backend = DeskewSensitiveBackend {
1052            minimum_score: raw_score + 1_000,
1053        };
1054        assert!(
1055            backend
1056                .recognize(
1057                    ImageInput {
1058                        bytes: bytes.clone(),
1059                        format: ImageFormat::Png,
1060                        dpi: None
1061                    },
1062                    OcrHints::default()
1063                )
1064                .expect("raw recognize")
1065                .is_empty(),
1066            "raw skewed payload should miss before preprocessing"
1067        );
1068        let tmp = tempfile::tempdir().expect("tempdir");
1069        let input = tmp.path().join("input.png");
1070        fs::write(&input, bytes).expect("write input");
1071
1072        let bundle = Pipeline::new()
1073            .clean_with_ocr_backend(&input, tmp.path(), &backend)
1074            .expect("clean succeeds");
1075
1076        assert!(
1077            bundle.clean_markdown.contains(":Email_"),
1078            "{}",
1079            bundle.clean_markdown
1080        );
1081        assert!(
1082            !bundle.clean_markdown.contains("alice@example.invalid"),
1083            "{}",
1084            bundle.clean_markdown
1085        );
1086    }
1087
1088    #[test]
1089    fn format_clean_markdown_appends_trailing_newline() {
1090        let md = format_clean_markdown("hello", InputKind::Png);
1091        assert!(md.ends_with('\n'));
1092        assert!(md.contains("Source kind: `png`"));
1093        assert!(md.contains("hello"));
1094    }
1095}
gaze_document/bundle/mod.rs

gaze_document/bundle/
mod.rs