1use std::path::{Component, Path, PathBuf};
22
23use gaze::Manifest;
24use serde::{Deserialize, Serialize};
25
26use crate::ocr::{
27 detect_image_format, ImageFormat, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult,
28};
29
30#[cfg(feature = "ocr-tesseract")]
31use std::collections::BTreeMap;
32#[cfg(feature = "ocr-tesseract")]
33use std::fs;
34
35#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
36use gaze::{
37 Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline as GazePipeline,
38 RawDocument, Scope, Session,
39};
40#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
41use gaze_recognizers::{
42 AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
43};
44#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
45use gaze_types::{EmittedTokenSpan, PiiClass};
46
47#[cfg(feature = "ocr-tesseract")]
48use crate::extract::InputKind;
49#[cfg(feature = "ocr-tesseract")]
50use crate::{BundleLayoutInvalidReason, DocumentError};
51
/// Schema version that [`BundleReport::new`] stamps into `BundleReport::bundle_version`.
pub const BUNDLE_VERSION: u32 = 2;
/// Threshold used when the caller configures none: the initial value set by
/// [`Pipeline::new`] and the serde default for `BundleReport::low_confidence_threshold`.
const DEFAULT_LOW_CONFIDENCE_THRESHOLD: f32 = 0.65;

/// File name of the cleaned markdown document; `write_bundle` puts it in the agent directory.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// File name of the serialized [`Manifest`]; `write_bundle` puts it in the owner directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// File name of the serialized [`BundleReport`]; `write_bundle` puts it in the agent directory.
pub const REPORT_FILE: &str = "report.json";
62
63#[non_exhaustive]
65#[derive(Debug, Clone, PartialEq, Eq)]
66pub struct AgentBundleDir(PathBuf);
67
68impl AgentBundleDir {
69 pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentError> {
74 let path = path.into();
75 validate_non_empty_path(&path)?;
76 Ok(Self(path))
77 }
78
79 pub fn as_path(&self) -> &Path {
81 &self.0
82 }
83}
84
85#[non_exhaustive]
87#[derive(Debug, Clone, PartialEq, Eq)]
88pub struct OwnerBundleDir(PathBuf);
89
90impl OwnerBundleDir {
91 pub fn new(path: impl Into<PathBuf>) -> Result<Self, DocumentError> {
96 let path = path.into();
97 validate_non_empty_path(&path)?;
98 Ok(Self(path))
99 }
100
101 pub fn as_path(&self) -> &Path {
103 &self.0
104 }
105}
106
/// In-memory result of a cleaning run, mirroring what was written to the bundle directories.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct SafeBundle {
    /// Cleaned markdown text (the content written to [`CLEAN_MARKDOWN_FILE`]).
    pub clean_markdown: String,
    /// Manifest built from the token spans emitted while cleaning.
    pub manifest: Manifest,
    /// Page-level layout summary of the input.
    pub layout: LayoutSummary,
    /// Optional PNG preview bytes; `clean_with_options` currently always passes `None`.
    pub preview_png: Option<Vec<u8>>,
    /// Report describing OCR quality and PII counts for this run.
    pub report: BundleReport,
    /// Input path; `clean_with_options` passes the absolutized input path here.
    pub source_path: PathBuf,
    /// Agent output directory as resolved by `prepare_bundle_dirs`.
    pub agent_out_dir: PathBuf,
    /// Owner output directory as resolved by `prepare_bundle_dirs`.
    pub owner_out_dir: PathBuf,
}

impl SafeBundle {
    /// Assembles a bundle from its parts; every argument is stored in the field of the same
    /// name without any validation or transformation.
    // One argument per public field, so the long parameter list is intentional.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        clean_markdown: String,
        manifest: Manifest,
        layout: LayoutSummary,
        preview_png: Option<Vec<u8>>,
        report: BundleReport,
        source_path: PathBuf,
        agent_out_dir: PathBuf,
        owner_out_dir: PathBuf,
    ) -> Self {
        Self {
            clean_markdown,
            manifest,
            layout,
            preview_png,
            report,
            source_path,
            agent_out_dir,
            owner_out_dir,
        }
    }
}
154
/// Number of emitted PII tokens for a single PII class.
#[non_exhaustive]
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ClassCount {
    /// Class label; `count_pii_by_class` fills it with the class's canonical string.
    pub class: String,
    /// How many token spans of this class were emitted.
    pub count: u32,
}

impl ClassCount {
    /// Creates a count entry for `class` with the given `count`.
    pub fn new(class: impl Into<String>, count: u32) -> Self {
        Self {
            class: class.into(),
            count,
        }
    }
}
174
/// Where the text of a page came from. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OcrSource {
    /// Text taken directly from a PDF page's vector text payload (no OCR backend involved).
    VectorPdf,
    /// Text recognized from an image by an OCR backend.
    Ocr,
}
184
185#[non_exhaustive]
187#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
188pub struct PageReport {
189 pub page_index: i32,
191 pub ocr_source: OcrSource,
193 pub ocr_backend: Option<String>,
195 pub confidence: Option<f32>,
197 pub low_confidence: bool,
199 pub column_count: u32,
201 pub ocr_word_count: usize,
203 pub ocr_mean_confidence: Option<f32>,
205}
206
207impl PageReport {
208 fn new(
209 page_index: i32,
210 ocr_source: OcrSource,
211 ocr_backend: Option<String>,
212 ocr: &OcrResult,
213 column_count: u32,
214 low_confidence_threshold: f32,
215 ) -> Self {
216 let confidence = ocr.mean_confidence_unit();
217 Self {
218 page_index,
219 ocr_source,
220 ocr_backend,
221 confidence,
222 low_confidence: confidence
223 .map(|confidence| confidence < low_confidence_threshold)
224 .unwrap_or(false),
225 column_count,
226 ocr_word_count: ocr.word_count,
227 ocr_mean_confidence: ocr.mean_confidence,
228 }
229 }
230}
231
/// Machine-readable summary of a cleaning run, serialized to [`REPORT_FILE`].
///
/// `pages` and `low_confidence_threshold` carry serde defaults, so a serialized report that
/// lacks those two fields still deserializes.
#[non_exhaustive]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BundleReport {
    /// Schema version; [`BundleReport::new`] always writes [`BUNDLE_VERSION`].
    pub bundle_version: u32,
    /// Label of the input kind (for example the value returned by `kind_label`).
    pub input_kind: String,
    /// Mean confidence copied from the document-level `OcrResult`.
    pub ocr_mean_confidence: Option<f32>,
    /// Word count copied from the document-level `OcrResult`.
    pub ocr_word_count: usize,
    /// Language label copied from the document-level `OcrResult`.
    pub ocr_lang: String,
    /// Number of characters in the cleaned text.
    pub clean_char_count: usize,
    /// Total number of emitted PII tokens.
    pub pii_token_count: u32,
    /// Emitted PII tokens broken down by class.
    pub pii_tokens_by_class: Vec<ClassCount>,
    /// Page count reported by the PDF extractor; `None` for image inputs.
    pub pdf_page_count: Option<i32>,
    /// Index of the first extracted PDF page; `None` for image inputs.
    pub pdf_page_index: Option<i32>,
    /// Per-page reports; defaults to empty when the field is missing on deserialization.
    #[serde(default)]
    pub pages: Vec<PageReport>,
    /// Threshold used to flag low-confidence pages; defaults to
    /// `DEFAULT_LOW_CONFIDENCE_THRESHOLD` when missing on deserialization.
    #[serde(default = "default_low_confidence_threshold")]
    pub low_confidence_threshold: f32,
}

impl BundleReport {
    /// Builds a report for the current [`BUNDLE_VERSION`].
    ///
    /// `ocr_mean_confidence`, `ocr_word_count` and `ocr_lang` are copied from `ocr`; all other
    /// arguments are stored as given.
    // One argument per reported value, so the long parameter list is intentional.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        input_kind: impl Into<String>,
        ocr: &OcrResult,
        clean_char_count: usize,
        pii_token_count: u32,
        pii_tokens_by_class: Vec<ClassCount>,
        pdf_page_count: Option<i32>,
        pdf_page_index: Option<i32>,
        pages: Vec<PageReport>,
        low_confidence_threshold: f32,
    ) -> Self {
        Self {
            bundle_version: BUNDLE_VERSION,
            input_kind: input_kind.into(),
            ocr_mean_confidence: ocr.mean_confidence,
            ocr_word_count: ocr.word_count,
            ocr_lang: ocr.lang.clone(),
            clean_char_count,
            pii_token_count,
            pii_tokens_by_class,
            pdf_page_count,
            pdf_page_index,
            pages,
            low_confidence_threshold,
        }
    }
}
298
/// Serde default provider for `BundleReport::low_confidence_threshold`
/// (serde's `default = "..."` attribute needs a function path, not a constant).
fn default_low_confidence_threshold() -> f32 {
    DEFAULT_LOW_CONFIDENCE_THRESHOLD
}
302
303#[non_exhaustive]
305#[derive(Debug, Clone, Copy)]
306pub struct Pipeline {
307 low_confidence_threshold: f32,
308 column_detection: bool,
309}
310
311impl Pipeline {
312 pub fn new() -> Self {
314 Self {
315 low_confidence_threshold: DEFAULT_LOW_CONFIDENCE_THRESHOLD,
316 column_detection: true,
317 }
318 }
319
320 pub fn with_low_confidence_threshold(mut self, threshold: f32) -> Self {
322 self.low_confidence_threshold = threshold.clamp(0.0, 1.0);
323 self
324 }
325
326 pub fn with_column_detection(mut self, enabled: bool) -> Self {
328 self.column_detection = enabled;
329 self
330 }
331
332 #[cfg(feature = "ocr-tesseract")]
337 #[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
338 pub fn clean_with_ocr_backend(
339 &self,
340 input: &Path,
341 agent_out: AgentBundleDir,
342 owner_out: OwnerBundleDir,
343 ocr_backend: &dyn OcrBackend,
344 ) -> Result<SafeBundle, DocumentError> {
345 clean_with_options(input, agent_out, owner_out, ocr_backend, *self)
346 }
347}
348
349impl Default for Pipeline {
350 fn default() -> Self {
351 Self::new()
352 }
353}
354
/// Minimal layout description of a cleaned document.
#[non_exhaustive]
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages in the document.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Layout summary for a document consisting of exactly one page.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Layout summary for a document with `page_count` pages.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
377
/// Cleans `input` with a default [`Pipeline`] and a freshly constructed Tesseract backend.
///
/// # Errors
///
/// Returns the error produced by [`clean_with_ocr_backend`].
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
) -> Result<SafeBundle, DocumentError> {
    let tesseract = crate::ocr::TesseractBackend::new();
    // The free function runs the same default pipeline as `Pipeline::new()`.
    clean_with_ocr_backend(input, agent_out, owner_out, &tesseract)
}
399
/// Cleans `input` with a default [`Pipeline`] and the caller-supplied OCR backend.
///
/// # Errors
///
/// Returns the error produced by [`Pipeline::clean_with_ocr_backend`].
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean_with_ocr_backend(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
    ocr_backend: &dyn OcrBackend,
) -> Result<SafeBundle, DocumentError> {
    // `Pipeline::default()` delegates to `Pipeline::new()`.
    let default_pipeline = Pipeline::default();
    default_pipeline.clean_with_ocr_backend(input, agent_out, owner_out, ocr_backend)
}
420
/// Runs the full cleaning flow: detect the input kind, prepare the output directories,
/// extract text, redact it, write the bundle files and return the in-memory [`SafeBundle`].
///
/// # Errors
///
/// Fails when the input kind cannot be detected, the bundle directory layout is invalid or
/// cannot be created, extraction/OCR fails, the redaction pipeline fails or returns a
/// non-text document, or the bundle files cannot be written.
#[cfg(feature = "ocr-tesseract")]
fn clean_with_options(
    input: &Path,
    agent_out: AgentBundleDir,
    owner_out: OwnerBundleDir,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    // Validate and create the output directories before doing any expensive OCR work.
    let (absolute_agent_out, absolute_owner_out) = prepare_bundle_dirs(&agent_out, &owner_out)?;

    let extraction = run_document_extraction(input, kind, ocr_backend, options)?;
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&extraction.ocr_result.text);
    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the third tuple element (leak report) is intentionally unused here —
    // confirm that discarding it is the desired policy.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally the spans while they are still borrowed, then move them into the manifest;
    // this avoids cloning the whole span list.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &extraction.ocr_result,
        clean_text.chars().count(),
        pii_token_count,
        counts,
        extraction.pdf_page_count,
        extraction.pdf_page_index,
        extraction.pages,
        options.low_confidence_threshold,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(&agent_out, &owner_out, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::new(extraction.page_count),
        None,
        report,
        absolute_input,
        absolute_agent_out,
        absolute_owner_out,
    ))
}
487
/// Intermediate result of `run_document_extraction`: merged text plus per-page metadata.
#[cfg(feature = "ocr-tesseract")]
struct DocumentExtraction {
    /// Document-level OCR result (a single page's result, or the merge of all PDF pages).
    ocr_result: OcrResult,
    /// Page count reported by the PDF extractor; `None` for image inputs.
    pdf_page_count: Option<i32>,
    /// Index of the first extracted PDF page; `None` for image inputs.
    pdf_page_index: Option<i32>,
    /// One report per processed page, in processing order.
    pages: Vec<PageReport>,
    /// Number of processed pages (`1` for image inputs).
    page_count: u32,
}
496
/// Extracts text from `input` with default [`Pipeline`] options and returns only the
/// document-level OCR result together with the PDF page count and first page index.
///
/// # Errors
///
/// Returns the error produced by `run_document_extraction`.
#[cfg(feature = "ocr-tesseract")]
#[cfg_attr(not(feature = "mcp"), allow(dead_code))]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    // Per-page reports and the page count are not needed by this caller-facing shape.
    let DocumentExtraction {
        ocr_result,
        pdf_page_count,
        pdf_page_index,
        ..
    } = run_document_extraction(input, kind, ocr_backend, Pipeline::new())?;

    Ok((ocr_result, pdf_page_count, pdf_page_index))
}
511
/// Extracts text and per-page reports from `input` according to its detected `kind`.
///
/// * PNG/JPEG: the file is read, its format is sniffed from the bytes, and the single image
///   is recognized through `recognize_image`.
/// * PDF (only with the `pdf-input` feature): every page payload is either taken as vector
///   text or rasterized and recognized; page results are merged with `merge_page_results`.
///
/// # Errors
///
/// Fails on I/O errors, image format detection errors, PDF extraction errors, OCR backend
/// errors, or with `DocumentError::UnsupportedInput` for PDFs when `pdf-input` is disabled.
#[cfg(feature = "ocr-tesseract")]
fn run_document_extraction(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<DocumentExtraction, DocumentError> {
    match kind {
        InputKind::Png | InputKind::Jpeg => {
            let bytes = fs::read(input)?;
            // The format is derived from the file content, not from `kind`.
            let format = detect_image_format(&bytes)?;
            let (result, column_count) = recognize_image(
                ocr_backend,
                ImageInput {
                    bytes,
                    format,
                    dpi: None,
                },
                options.column_detection,
            )?;
            // A plain image is reported as a single page with index 0.
            let page_report = PageReport::new(
                0,
                OcrSource::Ocr,
                Some(ocr_backend.name().to_string()),
                &result,
                column_count,
                options.low_confidence_threshold,
            );
            Ok(DocumentExtraction {
                ocr_result: result,
                pdf_page_count: None,
                pdf_page_index: None,
                pages: vec![page_report],
                page_count: 1,
            })
        }
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{extract_pages, PdfPagePayload, PdfRasterConfig};
                let payloads = extract_pages(input, PdfRasterConfig::new())?;
                let mut page_results = Vec::with_capacity(payloads.len());
                let mut pages = Vec::with_capacity(payloads.len());
                let mut pdf_page_count = None;
                let mut first_page_index = None;

                for payload in payloads {
                    // Overwritten on every iteration; stays `None` when there are no payloads.
                    pdf_page_count = Some(payload.page_count());
                    // Remember only the index of the first payload.
                    if first_page_index.is_none() {
                        first_page_index = Some(payload.page_index());
                    }
                    match payload {
                        PdfPagePayload::VectorText {
                            text, page_index, ..
                        } => {
                            // Vector text carries no confidence and a word count of 0.
                            let result = OcrResult::new(text, None, 0, "vector-pdf".to_string());
                            pages.push(PageReport::new(
                                page_index,
                                OcrSource::VectorPdf,
                                None,
                                &result,
                                1,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                        PdfPagePayload::Raster(raster) => {
                            // Rasterized pages go through the same OCR path as plain images.
                            let (result, column_count) = recognize_image(
                                ocr_backend,
                                ImageInput {
                                    bytes: raster.png_bytes,
                                    format: ImageFormat::Png,
                                    dpi: None,
                                },
                                options.column_detection,
                            )?;
                            pages.push(PageReport::new(
                                raster.page_index,
                                OcrSource::Ocr,
                                Some(ocr_backend.name().to_string()),
                                &result,
                                column_count,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                    }
                }

                Ok(DocumentExtraction {
                    ocr_result: merge_page_results(&page_results),
                    pdf_page_count,
                    pdf_page_index: first_page_index,
                    // `page_count` must be read before `pages` is moved into the struct.
                    // NOTE(review): `as u32` truncates if the page count ever exceeds u32::MAX.
                    page_count: pages.len() as u32,
                    pages,
                })
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
    }
}
619
/// Preprocesses `image`, runs it through `ocr_backend` with default hints, and assembles the
/// recognized spans into an `OcrResult` plus the column count reported alongside it.
///
/// # Errors
///
/// Backend failures are converted with `map_ocr_error_to_document_error`.
#[cfg(feature = "ocr-tesseract")]
fn recognize_image(
    ocr_backend: &dyn OcrBackend,
    image: ImageInput,
    column_detection: bool,
) -> Result<(OcrResult, u32), DocumentError> {
    let hints = OcrHints::default();
    // Capture the language label first: `hints` is moved into `recognize` below.
    let language = hints.primary_language().to_string();
    let prepared = crate::preprocess::preprocess_image(image);

    match ocr_backend.recognize(prepared, hints) {
        Ok(spans) => Ok(OcrResult::from_spans_with_column_detection(
            &spans,
            language,
            column_detection,
        )),
        Err(err) => Err(map_ocr_error_to_document_error(err)),
    }
}
638
/// Merges per-page OCR results into one document-level result.
///
/// Page texts are joined with a blank line. The mean confidence is the word-count-weighted
/// average over pages that have a confidence; pages without one contribute neither to the
/// average nor to the merged word count. The language label is always `"mixed"`.
#[cfg(feature = "ocr-tesseract")]
fn merge_page_results(results: &[OcrResult]) -> OcrResult {
    let mut text = String::new();
    for (position, page) in results.iter().enumerate() {
        if position > 0 {
            text.push_str("\n\n");
        }
        text.push_str(&page.text);
    }

    // Accumulate in f64, visiting pages in order so the summation order is stable.
    let (weighted_sum, weighted_words) = results
        .iter()
        .filter_map(|page| page.mean_confidence.map(|mean| (mean, page.word_count)))
        .fold((0.0f64, 0usize), |(sum, words), (mean, count)| {
            (sum + mean as f64 * count as f64, words + count)
        });

    let mean_confidence = match weighted_words {
        0 => None,
        words => Some((weighted_sum / words as f64) as f32),
    };

    OcrResult::new(text, mean_confidence, weighted_words, "mixed".to_string())
}
661
/// Translates an OCR backend error into the crate-level [`DocumentError`].
///
/// The OCR layer does not know the input path, so `UnsupportedInput` is reported with an
/// empty path.
#[cfg(feature = "ocr-tesseract")]
fn map_ocr_error_to_document_error(err: OcrError) -> DocumentError {
    match err {
        OcrError::Internal(detail) => DocumentError::Pipeline(format!("ocr: {}", detail)),
        OcrError::UnsupportedFormat(format) => {
            let reason = match format {
                ImageFormat::Png => "png image format is not supported by the OCR backend",
                ImageFormat::Jpeg => "jpeg image format is not supported by the OCR backend",
                ImageFormat::Tiff => "tiff image format is not supported by the OCR backend",
            };
            DocumentError::UnsupportedInput {
                path: PathBuf::new(),
                reason,
            }
        }
        OcrError::RecognizeFailed(stderr) => DocumentError::TesseractFailed {
            // No process exit status is available for this error, so -1 is reported.
            status: -1,
            stderr,
        },
        OcrError::InitFailed(hint) => DocumentError::TesseractNotFound(hint),
    }
}
681
/// Builds the redaction pipeline used for documents: an e-mail detector, a phone-number
/// regex detector and a recipient-name recognizer anchored on invoice cue phrases.
/// E-mail, phone and name classes are tokenized; everything else is preserved.
///
/// # Errors
///
/// Returns `DocumentError::Pipeline` (via `pipeline_err`) when a detector or the pipeline
/// itself fails to build.
// NOTE(review): the first `cfg` already requires `ocr-tesseract`, which makes the second,
// broader `cfg` redundant; both are kept as found.
#[cfg(feature = "ocr-tesseract")]
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
pub(crate) fn build_document_pipeline() -> Result<GazePipeline, DocumentError> {
    const PHONE_PATTERN: &str = r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}";
    const RECIPIENT_CUES: [&str; 5] = ["Bill to", "Invoice to", "Ship to", "Attention", "Attn"];

    let email_detector =
        RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    let phone_detector = RegexDetector::new(PHONE_PATTERN, PiiClass::custom("phone"))
        .map_err(|err| pipeline_err("phone-regex", err))?;

    let cues = RECIPIENT_CUES
        .iter()
        .map(|cue| cue.to_string())
        .collect::<Vec<String>>();
    // NOTE(review): the numeric arguments (48, 2, 0.88, 110) are positional tuning values of
    // `AnchoredMatchRecognizer::new`; their meaning is not visible here — confirm against
    // the recognizer's documentation before changing them.
    let recipient_recognizer = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        cues,
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );

    let built = GazePipeline::builder()
        .detector(email_detector)
        .detector(phone_detector)
        .recognizer(recipient_recognizer)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        .rule(DefaultRule::new(Action::Preserve))
        .build();

    built.map_err(|err| pipeline_err("build", err))
}
729
/// Counts emitted token spans per PII class.
///
/// The result is ordered by the class's canonical string because the tally is kept in a
/// `BTreeMap`.
#[cfg(feature = "ocr-tesseract")]
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    spans
        .iter()
        .fold(BTreeMap::<String, u32>::new(), |mut tally, span| {
            *tally.entry(span.class.to_canonical_str()).or_default() += 1;
            tally
        })
        .into_iter()
        .map(|(class, count)| ClassCount::new(class, count))
        .collect()
}
741
/// Writes the three bundle files to disk.
///
/// The cleaned markdown and the report go to the agent directory; the manifest goes to the
/// owner directory. Files are written in the order markdown, manifest, report, and the first
/// failure aborts the remaining writes.
///
/// # Errors
///
/// Fails on any I/O error or JSON serialization error.
#[cfg(feature = "ocr-tesseract")]
fn write_bundle(
    agent_out: &AgentBundleDir,
    owner_out: &OwnerBundleDir,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    let agent_dir = agent_out.as_path();
    let owner_dir = owner_out.as_path();

    fs::write(agent_dir.join(CLEAN_MARKDOWN_FILE), clean_markdown)?;
    fs::write(
        owner_dir.join(MANIFEST_FILE),
        serde_json::to_vec_pretty(manifest)?,
    )?;
    fs::write(
        agent_dir.join(REPORT_FILE),
        serde_json::to_vec_pretty(report)?,
    )?;
    Ok(())
}
760
/// Validates the agent/owner directory layout, creates both directories, and returns their
/// resolved paths as `(agent, owner)`.
///
/// The layout is checked twice: once on lexically normalized paths before anything is
/// created, and once more on canonicalized paths afterwards. When canonicalization fails,
/// the lexically normalized path is used instead.
///
/// # Errors
///
/// Returns `DocumentError::BundleLayoutInvalid` when the directories are equal or nested,
/// and `DocumentError::OutputDir` when a directory cannot be created.
#[cfg(feature = "ocr-tesseract")]
fn prepare_bundle_dirs(
    agent_out: &AgentBundleDir,
    owner_out: &OwnerBundleDir,
) -> Result<(PathBuf, PathBuf), DocumentError> {
    let agent_dir = agent_out.as_path();
    let owner_dir = owner_out.as_path();

    let lexical_agent = normalize_for_layout(agent_dir);
    let lexical_owner = normalize_for_layout(owner_dir);
    validate_bundle_layout(&lexical_agent, &lexical_owner)?;

    if let Err(err) = fs::create_dir_all(agent_dir) {
        return Err(DocumentError::OutputDir(lexical_agent, err));
    }
    if let Err(err) = fs::create_dir_all(owner_dir) {
        return Err(DocumentError::OutputDir(lexical_owner, err));
    }

    // Best effort: fall back to the lexical form if the path cannot be canonicalized.
    let resolved_agent = match fs::canonicalize(agent_dir) {
        Ok(canonical) => canonical,
        Err(_) => lexical_agent,
    };
    let resolved_owner = match fs::canonicalize(owner_dir) {
        Ok(canonical) => canonical,
        Err(_) => lexical_owner,
    };
    validate_bundle_layout(&resolved_agent, &resolved_owner)?;

    Ok((resolved_agent, resolved_owner))
}
780
781fn validate_non_empty_path(path: &Path) -> Result<(), DocumentError> {
782 if path.as_os_str().is_empty() {
783 return Err(DocumentError::BundleLayoutInvalid {
784 reason: BundleLayoutInvalidReason::EmptyPath,
785 });
786 }
787 Ok(())
788}
789
790fn validate_bundle_layout(agent: &Path, owner: &Path) -> Result<(), DocumentError> {
791 if agent == owner {
792 return Err(DocumentError::BundleLayoutInvalid {
793 reason: BundleLayoutInvalidReason::AgentEqualsOwner,
794 });
795 }
796 if agent.starts_with(owner) {
797 return Err(DocumentError::BundleLayoutInvalid {
798 reason: BundleLayoutInvalidReason::AgentNestedInOwner,
799 });
800 }
801 if owner.starts_with(agent) {
802 return Err(DocumentError::BundleLayoutInvalid {
803 reason: BundleLayoutInvalidReason::OwnerNestedInAgent,
804 });
805 }
806 Ok(())
807}
808
809fn normalize_for_layout(path: &Path) -> PathBuf {
810 let absolute = absolutize(path);
811 let mut normalized = PathBuf::new();
812 for component in absolute.components() {
813 match component {
814 Component::CurDir => {}
815 Component::ParentDir => {
816 normalized.pop();
817 }
818 Component::Prefix(prefix) => normalized.push(prefix.as_os_str()),
819 Component::RootDir => normalized.push(component.as_os_str()),
820 Component::Normal(part) => normalized.push(part),
821 }
822 }
823 normalized
824}
825
/// Renders the cleaned text as the bundle's markdown document.
///
/// The output is a fixed title, a `Source kind` line with the label of `kind`, a horizontal
/// rule, and then `text`, which is guaranteed to end with a newline.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
    let source_line = format!("Source kind: `{}`\n\n", kind_label(kind));
    // Only add a newline when the text does not already end with one.
    let trailing_newline = if text.ends_with('\n') { "" } else { "\n" };

    [
        "# gaze-document safe bundle\n\n",
        source_line.as_str(),
        "---\n\n",
        text,
        trailing_newline,
    ]
    .concat()
}
838
/// Returns the lowercase label used for `kind` in the report and in the markdown header.
#[cfg(feature = "ocr-tesseract")]
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    match kind {
        InputKind::Png => "png",
        InputKind::Jpeg => "jpeg",
        InputKind::Pdf => "pdf",
    }
}
847
/// Makes `path` absolute by joining it onto the current working directory.
///
/// Absolute paths are returned unchanged. If the working directory cannot be determined,
/// the path is returned as given (still relative).
fn absolutize(path: &Path) -> PathBuf {
    if path.is_absolute() {
        return path.to_path_buf();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path),
        Err(_) => path.to_path_buf(),
    }
}
857
/// Wraps `err` in `DocumentError::Pipeline`, prefixing its message with the failing `stage`
/// (formatted as `"<stage>: <err>"`).
#[cfg(feature = "ocr-tesseract")]
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    DocumentError::Pipeline(format!("{stage}: {err}"))
}
862
// Unit tests for the bundle pipeline. They drive the public entry points with in-memory
// OCR backends, so no real Tesseract installation is needed.
#[cfg(all(test, feature = "ocr-tesseract"))]
mod tests {
    use super::*;
    use crate::ocr::{BBox, OcrSpan};

    /// OCR backend that ignores the image and returns a fixed list of spans.
    #[derive(Debug)]
    struct MockBackend {
        spans: Vec<OcrSpan>,
    }

    impl OcrBackend for MockBackend {
        fn name(&self) -> &str {
            "mock-ocr"
        }

        fn recognize(
            &self,
            _image: ImageInput,
            _hints: OcrHints,
        ) -> Result<Vec<OcrSpan>, OcrError> {
            Ok(self.spans.clone())
        }
    }

    /// Builds a span with a fixed 90x16 bounding box at `(x, y)` and the given confidence.
    fn span(text: &str, x: u32, y: u32, confidence: f32) -> OcrSpan {
        OcrSpan {
            text: text.to_string(),
            bbox: BBox { x, y, w: 90, h: 16 },
            confidence: Some(confidence),
        }
    }

    /// Returns sibling `agent` / `owner` bundle directories inside the temp dir.
    fn bundle_dirs(tmp: &tempfile::TempDir) -> (AgentBundleDir, OwnerBundleDir) {
        (
            AgentBundleDir::new(tmp.path().join("agent")).expect("agent dir"),
            OwnerBundleDir::new(tmp.path().join("owner")).expect("owner dir"),
        )
    }

    // Two e-mail spans and one custom phone span must collapse into two class entries.
    #[test]
    fn count_pii_by_class_groups_email_and_phone() {
        let spans = vec![
            EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email),
            EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email),
            EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone")),
        ];
        let counts = count_pii_by_class(&spans);
        assert_eq!(counts.len(), 2);
        let by_class: BTreeMap<_, _> = counts.iter().map(|c| (c.class.as_str(), c.count)).collect();
        assert_eq!(by_class.get("email"), Some(&2));
        assert_eq!(by_class.get("custom:phone"), Some(&1));
    }

    // The serialized report must carry the current bundle version and the v2 page fields.
    #[test]
    fn report_serializes_with_bundle_version() {
        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
        let report = BundleReport::new(
            "png",
            &ocr,
            42,
            3,
            vec![
                ClassCount::new("email", 2),
                ClassCount::new("custom:phone", 1),
            ],
            None,
            None,
            vec![PageReport::new(
                0,
                OcrSource::Ocr,
                Some("tesseract".to_string()),
                &ocr,
                1,
                DEFAULT_LOW_CONFIDENCE_THRESHOLD,
            )],
            DEFAULT_LOW_CONFIDENCE_THRESHOLD,
        );
        let json = serde_json::to_value(&report).expect("serialize");
        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
        assert_eq!(json["input_kind"], "png");
        assert_eq!(json["pii_token_count"], 3);
        assert_eq!(json["pages"][0]["ocr_source"], "ocr");
        assert_eq!(
            json["low_confidence_threshold"],
            DEFAULT_LOW_CONFIDENCE_THRESHOLD
        );
    }

    // Backward compatibility: a report without `pages` / `low_confidence_threshold` must
    // still parse and pick up the serde defaults.
    #[test]
    fn v1_report_without_page_fields_still_deserializes() {
        let json = serde_json::json!({
            "bundle_version": 1,
            "input_kind": "png",
            "ocr_mean_confidence": 90.0,
            "ocr_word_count": 2,
            "ocr_lang": "eng",
            "clean_char_count": 12,
            "pii_token_count": 1,
            "pii_tokens_by_class": [{ "class": "email", "count": 1 }],
            "pdf_page_count": null,
            "pdf_page_index": null
        });

        let report: BundleReport = serde_json::from_value(json).expect("v1 parses");

        assert_eq!(report.bundle_version, 1);
        assert!(report.pages.is_empty());
        assert_eq!(
            report.low_confidence_threshold,
            DEFAULT_LOW_CONFIDENCE_THRESHOLD
        );
    }

    // Spans laid out in two horizontal groups with confidence 0.50 must yield a two-column,
    // low-confidence page, and the e-mail address must be tokenized.
    #[test]
    fn clean_with_mock_backend_flags_low_confidence_and_columns() {
        let backend = MockBackend {
            spans: vec![
                span("Bill", 20, 10, 0.50),
                span("to:", 116, 10, 0.50),
                span("Jane", 20, 36, 0.50),
                span("Doe", 116, 36, 0.50),
                span("Email:", 360, 10, 0.50),
                span("alice@example.invalid", 360, 36, 0.50),
            ],
        };
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        // Only the PNG signature matters here: the mock backend never decodes the image.
        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
        let (agent_out, owner_out) = bundle_dirs(&tmp);

        let bundle = Pipeline::new()
            .with_low_confidence_threshold(0.65)
            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
            .expect("clean succeeds");

        assert_eq!(bundle.report.bundle_version, 2);
        assert_eq!(bundle.report.pages.len(), 1);
        let page = &bundle.report.pages[0];
        assert_eq!(page.ocr_backend.as_deref(), Some("mock-ocr"));
        assert_eq!(page.column_count, 2);
        assert_eq!(page.confidence, Some(0.5));
        assert!(page.low_confidence);
        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }

    // A two-cell-per-row table must stay a single column with each row's cells kept together.
    #[test]
    fn clean_with_mock_backend_preserves_table_cell_context() {
        let backend = MockBackend {
            spans: vec![
                span("Field", 20, 10, 0.92),
                span("Value", 160, 10, 0.92),
                span("Bill", 20, 40, 0.92),
                span("Jane", 160, 40, 0.92),
                span("Email", 20, 70, 0.92),
                span("alice@example.invalid", 160, 70, 0.92),
            ],
        };
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");
        let (agent_out, owner_out) = bundle_dirs(&tmp);

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
            .expect("clean succeeds");

        assert_eq!(bundle.report.pages[0].column_count, 1);
        assert!(
            bundle.clean_markdown.contains("Field\nValue\n\nBill\nJane"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }

    // The backend only "sees" text when the image it receives is wider than tall, so this
    // passes only if preprocessing rotates the sideways input before OCR.
    #[cfg(feature = "pdf-input")]
    #[test]
    fn clean_preprocesses_rotated_image_before_backend_ocr() {
        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};

        #[derive(Debug)]
        struct OrientationSensitiveBackend;

        impl OcrBackend for OrientationSensitiveBackend {
            fn name(&self) -> &str {
                "orientation-sensitive"
            }

            fn recognize(
                &self,
                image: ImageInput,
                _hints: OcrHints,
            ) -> Result<Vec<OcrSpan>, OcrError> {
                let decoded = image::load_from_memory(&image.bytes)
                    .map_err(|err| OcrError::Internal(err.to_string()))?;
                if decoded.width() <= decoded.height() {
                    return Ok(Vec::new());
                }
                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
            }
        }

        // White 120x80 canvas with one dark horizontal bar, then rotated by 90 degrees.
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 38..42 {
            for x in 16..104 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        let sideways = image::imageops::rotate90(&image);
        let mut bytes = Vec::new();
        sideways
            .write_to(
                &mut std::io::Cursor::new(&mut bytes),
                EncodedImageFormat::Png,
            )
            .expect("encode png");
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, bytes).expect("write input");
        let (agent_out, owner_out) = bundle_dirs(&tmp);

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, agent_out, owner_out, &OrientationSensitiveBackend)
            .expect("clean succeeds");

        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }

    // The backend only "sees" text when the image's horizontal-projection score exceeds the
    // score of the raw skewed input, so this passes only if preprocessing deskews first.
    #[cfg(feature = "pdf-input")]
    #[test]
    fn clean_deskews_image_before_backend_ocr() {
        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
        use imageproc::geometric_transformations::{rotate_about_center, Interpolation};

        /// Sum over all rows of (dark pixels in the row)^2; higher when dark pixels are
        /// concentrated in fewer rows.
        fn horizontal_score(bytes: &[u8]) -> Result<u64, OcrError> {
            let decoded = image::load_from_memory(bytes)
                .map_err(|err| OcrError::Internal(err.to_string()))?
                .to_luma8();
            let mut score = 0u64;
            for y in 0..decoded.height() {
                let mut dark = 0u64;
                for x in 0..decoded.width() {
                    if decoded.get_pixel(x, y).0[0] < 200 {
                        dark += 1;
                    }
                }
                score = score.saturating_add(dark.saturating_mul(dark));
            }
            Ok(score)
        }

        #[derive(Debug)]
        struct DeskewSensitiveBackend {
            minimum_score: u64,
        }

        impl OcrBackend for DeskewSensitiveBackend {
            fn name(&self) -> &str {
                "deskew-sensitive"
            }

            fn recognize(
                &self,
                image: ImageInput,
                _hints: OcrHints,
            ) -> Result<Vec<OcrSpan>, OcrError> {
                if horizontal_score(&image.bytes)? < self.minimum_score {
                    return Ok(Vec::new());
                }
                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
            }
        }

        // White 120x80 canvas with one dark horizontal bar, skewed by 4 degrees.
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 38..42 {
            for x in 16..104 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        let skewed = rotate_about_center(
            &image,
            4.0_f32.to_radians(),
            Interpolation::Nearest,
            Luma([255]),
        );
        let mut bytes = Vec::new();
        skewed
            .write_to(
                &mut std::io::Cursor::new(&mut bytes),
                EncodedImageFormat::Png,
            )
            .expect("encode png");
        let raw_score = horizontal_score(&bytes).expect("raw score");
        let backend = DeskewSensitiveBackend {
            minimum_score: raw_score + 1_000,
        };
        // Sanity check: without preprocessing the backend must not recognize anything.
        assert!(
            backend
                .recognize(
                    ImageInput {
                        bytes: bytes.clone(),
                        format: ImageFormat::Png,
                        dpi: None
                    },
                    OcrHints::default()
                )
                .expect("raw recognize")
                .is_empty(),
            "raw skewed payload should miss before preprocessing"
        );
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, bytes).expect("write input");
        let (agent_out, owner_out) = bundle_dirs(&tmp);

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, agent_out, owner_out, &backend)
            .expect("clean succeeds");

        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }

    // Text without a trailing newline must get exactly one appended.
    #[test]
    fn format_clean_markdown_appends_trailing_newline() {
        let md = format_clean_markdown("hello", InputKind::Png);
        assert!(md.ends_with('\n'));
        assert!(md.contains("Source kind: `png`"));
        assert!(md.contains("hello"));
    }
}