1use std::path::PathBuf;
20
21use gaze::Manifest;
22use serde::{Deserialize, Serialize};
23
24use crate::ocr::{
25 detect_image_format, ImageFormat, ImageInput, OcrBackend, OcrError, OcrHints, OcrResult,
26};
27
28#[cfg(feature = "ocr-tesseract")]
29use std::collections::BTreeMap;
30#[cfg(feature = "ocr-tesseract")]
31use std::fs;
32#[cfg(feature = "ocr-tesseract")]
33use std::path::Path;
34
35#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
36use gaze::{
37 Action, ClassRule, CleanDocument, DefaultRule, LocaleTag, Pipeline as GazePipeline,
38 RawDocument, Scope, Session,
39};
40#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
41use gaze_recognizers::{
42 AnchoredBoundary, AnchoredMatchRecognizer, CuePosition, NameShape, RegexDetector,
43};
44#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
45use gaze_types::{EmittedTokenSpan, PiiClass};
46
47#[cfg(feature = "ocr-tesseract")]
48use crate::extract::InputKind;
49#[cfg(feature = "ocr-tesseract")]
50use crate::DocumentError;
51
/// Schema version that [`BundleReport::new`] stores in `bundle_version`.
pub const BUNDLE_VERSION: u32 = 2;
/// Threshold used by [`Pipeline::new`] and as the serde default for
/// `BundleReport::low_confidence_threshold` when a report omits the field.
const DEFAULT_LOW_CONFIDENCE_THRESHOLD: f32 = 0.65;

/// File name of the cleaned markdown written into the output directory.
pub const CLEAN_MARKDOWN_FILE: &str = "clean.md";
/// File name of the serialized [`Manifest`] written into the output directory.
pub const MANIFEST_FILE: &str = "manifest.json";
/// File name of the serialized [`BundleReport`] written into the output directory.
pub const REPORT_FILE: &str = "report.json";
62
#[non_exhaustive]
/// In-memory result of a clean run: the cleaned markdown plus its metadata.
#[derive(Debug, Clone)]
pub struct SafeBundle {
    /// Markdown rendering of the cleaned text.
    pub clean_markdown: String,
    /// Manifest built from the spans emitted while cleaning.
    pub manifest: Manifest,
    /// Page-level layout summary.
    pub layout: LayoutSummary,
    /// Optional PNG preview bytes; the clean path in this file always passes `None`.
    pub preview_png: Option<Vec<u8>>,
    /// Report describing OCR quality and PII token counts.
    pub report: BundleReport,
    /// Path of the input document (absolutized by the clean path in this file).
    pub source_path: PathBuf,
    /// Directory the bundle files were written to (absolutized by the clean path in this file).
    pub out_dir: PathBuf,
}

impl SafeBundle {
    /// Assembles a bundle from already-computed parts; performs no I/O.
    pub fn new(
        clean_markdown: String,
        manifest: Manifest,
        layout: LayoutSummary,
        preview_png: Option<Vec<u8>>,
        report: BundleReport,
        source_path: PathBuf,
        out_dir: PathBuf,
    ) -> Self {
        Self {
            clean_markdown,
            manifest,
            layout,
            preview_png,
            report,
            source_path,
            out_dir,
        }
    }
}
105
#[non_exhaustive]
/// Number of PII tokens emitted for one PII class.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ClassCount {
    /// Class name as a string (the tests in this file use `"email"` and `"custom:phone"`).
    pub class: String,
    /// How many tokens of that class were emitted.
    pub count: u32,
}

impl ClassCount {
    /// Builds a count entry, converting `class` into an owned `String`.
    pub fn new(class: impl Into<String>, count: u32) -> Self {
        Self {
            class: class.into(),
            count,
        }
    }
}
125
/// Where the text of a page came from. Serialized in `snake_case`
/// (`"vector_pdf"` / `"ocr"`).
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum OcrSource {
    /// Text taken from a PDF page's vector text payload; no OCR backend involved.
    VectorPdf,
    /// Text recognized by an OCR backend from an image or rasterized page.
    Ocr,
}
135
136#[non_exhaustive]
138#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
139pub struct PageReport {
140 pub page_index: i32,
142 pub ocr_source: OcrSource,
144 pub ocr_backend: Option<String>,
146 pub confidence: Option<f32>,
148 pub low_confidence: bool,
150 pub column_count: u32,
152 pub ocr_word_count: usize,
154 pub ocr_mean_confidence: Option<f32>,
156}
157
158impl PageReport {
159 fn new(
160 page_index: i32,
161 ocr_source: OcrSource,
162 ocr_backend: Option<String>,
163 ocr: &OcrResult,
164 column_count: u32,
165 low_confidence_threshold: f32,
166 ) -> Self {
167 let confidence = ocr.mean_confidence_unit();
168 Self {
169 page_index,
170 ocr_source,
171 ocr_backend,
172 confidence,
173 low_confidence: confidence
174 .map(|confidence| confidence < low_confidence_threshold)
175 .unwrap_or(false),
176 column_count,
177 ocr_word_count: ocr.word_count,
178 ocr_mean_confidence: ocr.mean_confidence,
179 }
180 }
181}
182
#[non_exhaustive]
/// Machine-readable summary of a clean run, serialized to the report file.
///
/// `pages` and `low_confidence_threshold` carry serde defaults so that reports
/// lacking those fields still deserialize.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BundleReport {
    /// Report schema version; [`BundleReport::new`] always writes [`BUNDLE_VERSION`].
    pub bundle_version: u32,
    /// Label of the input kind (for example `"png"`, `"jpeg"` or `"pdf"`).
    pub input_kind: String,
    /// Document-level `OcrResult::mean_confidence`.
    pub ocr_mean_confidence: Option<f32>,
    /// Document-level `OcrResult::word_count`.
    pub ocr_word_count: usize,
    /// Document-level `OcrResult::lang`.
    pub ocr_lang: String,
    /// Character count of the cleaned text, as supplied by the caller.
    pub clean_char_count: usize,
    /// Total number of PII tokens emitted.
    pub pii_token_count: u32,
    /// PII token counts broken down by class.
    pub pii_tokens_by_class: Vec<ClassCount>,
    /// Page count of the source PDF, if the input was a PDF.
    pub pdf_page_count: Option<i32>,
    /// Page index associated with the source PDF, if the input was a PDF.
    pub pdf_page_index: Option<i32>,
    /// Per-page reports; empty when the field is missing on deserialization.
    #[serde(default)]
    pub pages: Vec<PageReport>,
    /// Threshold used to flag low-confidence pages; falls back to the default
    /// threshold when the field is missing on deserialization.
    #[serde(default = "default_low_confidence_threshold")]
    pub low_confidence_threshold: f32,
}

impl BundleReport {
    /// Builds a report, stamping it with [`BUNDLE_VERSION`] and copying the
    /// confidence, word count and language from `ocr`.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        input_kind: impl Into<String>,
        ocr: &OcrResult,
        clean_char_count: usize,
        pii_token_count: u32,
        pii_tokens_by_class: Vec<ClassCount>,
        pdf_page_count: Option<i32>,
        pdf_page_index: Option<i32>,
        pages: Vec<PageReport>,
        low_confidence_threshold: f32,
    ) -> Self {
        Self {
            bundle_version: BUNDLE_VERSION,
            input_kind: input_kind.into(),
            ocr_mean_confidence: ocr.mean_confidence,
            ocr_word_count: ocr.word_count,
            ocr_lang: ocr.lang.clone(),
            clean_char_count,
            pii_token_count,
            pii_tokens_by_class,
            pdf_page_count,
            pdf_page_index,
            pages,
            low_confidence_threshold,
        }
    }
}

/// Serde default provider for `BundleReport::low_confidence_threshold`.
fn default_low_confidence_threshold() -> f32 {
    DEFAULT_LOW_CONFIDENCE_THRESHOLD
}
253
254#[non_exhaustive]
256#[derive(Debug, Clone, Copy)]
257pub struct Pipeline {
258 low_confidence_threshold: f32,
259 column_detection: bool,
260}
261
262impl Pipeline {
263 pub fn new() -> Self {
265 Self {
266 low_confidence_threshold: DEFAULT_LOW_CONFIDENCE_THRESHOLD,
267 column_detection: true,
268 }
269 }
270
271 pub fn with_low_confidence_threshold(mut self, threshold: f32) -> Self {
273 self.low_confidence_threshold = threshold.clamp(0.0, 1.0);
274 self
275 }
276
277 pub fn with_column_detection(mut self, enabled: bool) -> Self {
279 self.column_detection = enabled;
280 self
281 }
282
283 #[cfg(feature = "ocr-tesseract")]
288 #[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
289 pub fn clean_with_ocr_backend(
290 &self,
291 input: &Path,
292 out_dir: &Path,
293 ocr_backend: &dyn OcrBackend,
294 ) -> Result<SafeBundle, DocumentError> {
295 clean_with_options(input, out_dir, ocr_backend, *self)
296 }
297}
298
299impl Default for Pipeline {
300 fn default() -> Self {
301 Self::new()
302 }
303}
304
#[non_exhaustive]
/// Minimal layout description of a processed document.
#[derive(Debug, Clone)]
pub struct LayoutSummary {
    /// Number of pages in the document.
    pub page_count: u32,
}

impl LayoutSummary {
    /// Layout of a one-page document.
    pub fn single_page() -> Self {
        Self::new(1)
    }

    /// Layout of a document with `page_count` pages.
    pub fn new(page_count: u32) -> Self {
        LayoutSummary { page_count }
    }
}
327
#[cfg(feature = "ocr-tesseract")]
/// Cleans `input` into `out_dir` using a freshly constructed
/// `TesseractBackend` and the default [`Pipeline`] configuration.
///
/// # Errors
///
/// Propagates any [`DocumentError`] from the clean run.
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean(input: &Path, out_dir: &Path) -> Result<SafeBundle, DocumentError> {
    let tesseract = crate::ocr::TesseractBackend::new();
    let default_pipeline = Pipeline::new();
    default_pipeline.clean_with_ocr_backend(input, out_dir, &tesseract)
}
345
#[cfg(feature = "ocr-tesseract")]
/// Cleans `input` into `out_dir` with a caller-supplied OCR backend and the
/// default [`Pipeline`] configuration.
///
/// # Errors
///
/// Propagates any [`DocumentError`] from the clean run.
#[cfg_attr(docsrs, doc(cfg(feature = "ocr-tesseract")))]
pub fn clean_with_ocr_backend(
    input: &Path,
    out_dir: &Path,
    ocr_backend: &dyn OcrBackend,
) -> Result<SafeBundle, DocumentError> {
    let default_pipeline = Pipeline::new();
    default_pipeline.clean_with_ocr_backend(input, out_dir, ocr_backend)
}
365
#[cfg(feature = "ocr-tesseract")]
/// Runs the full clean flow for one document and writes the bundle files.
///
/// Steps, in order: detect the input kind, create `out_dir`, extract text
/// (OCR or PDF text), normalize OCR artifacts, run the redaction pipeline,
/// build the manifest and report, write the bundle files, and return the
/// in-memory [`SafeBundle`].
///
/// # Errors
///
/// Returns a [`DocumentError`] when input detection, output-directory
/// creation, extraction, pipeline construction, redaction, or writing the
/// bundle files fails, or when the pipeline returns a non-text document.
fn clean_with_options(
    input: &Path,
    out_dir: &Path,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<SafeBundle, DocumentError> {
    let kind = InputKind::detect(input)?;
    let absolute_input = absolutize(input);
    let absolute_out = absolutize(out_dir);

    fs::create_dir_all(out_dir)
        .map_err(|err| DocumentError::OutputDir(absolute_out.clone(), err))?;

    let extraction = run_document_extraction(input, kind, ocr_backend, options)?;
    let normalized_text = crate::ocr::normalize_ocr_artifacts(&extraction.ocr_result.text);
    let pipeline = build_document_pipeline()?;
    let session = Session::new(Scope::Ephemeral).map_err(|err| pipeline_err("session", err))?;
    let locale_chain = [LocaleTag::Global];
    // NOTE(review): the third tuple element (leak report) is intentionally
    // unused here — confirm that discarding it is the desired policy.
    let (clean_doc, spans, _leak_report) = pipeline
        .clean_with_safety_net(&session, RawDocument::Text(normalized_text), &locale_chain)
        .map_err(|err| pipeline_err("redact", err))?;

    let clean_text = match clean_doc {
        CleanDocument::Text(text) => text,
        _ => {
            return Err(DocumentError::Pipeline(
                "pipeline returned non-text variant for text input".to_string(),
            ));
        }
    };

    // Tally first: counting only borrows the spans, so they can then be moved
    // into the manifest instead of being cloned.
    let counts = count_pii_by_class(&spans);
    let pii_token_count: u32 = counts.iter().map(|c| c.count).sum();
    let manifest = Manifest::from_spans(spans);

    let report = BundleReport::new(
        kind_label(kind),
        &extraction.ocr_result,
        clean_text.chars().count(),
        pii_token_count,
        counts,
        extraction.pdf_page_count,
        extraction.pdf_page_index,
        extraction.pages,
        options.low_confidence_threshold,
    );

    let clean_markdown = format_clean_markdown(&clean_text, kind);
    write_bundle(out_dir, &clean_markdown, &manifest, &report)?;

    Ok(SafeBundle::new(
        clean_markdown,
        manifest,
        LayoutSummary::new(extraction.page_count),
        None,
        report,
        absolute_input,
        absolute_out,
    ))
}
433
#[cfg(feature = "ocr-tesseract")]
/// Intermediate result of text extraction, before redaction.
struct DocumentExtraction {
    /// Document-level OCR result (merged across pages for PDFs).
    ocr_result: OcrResult,
    /// Page count reported for the PDF; `None` for image inputs.
    pdf_page_count: Option<i32>,
    /// Index of the first extracted PDF page; `None` for image inputs.
    pdf_page_index: Option<i32>,
    /// One report per extracted page.
    pages: Vec<PageReport>,
    /// Number of extracted pages (`1` for image inputs).
    page_count: u32,
}
442
#[cfg(feature = "ocr-tesseract")]
/// Extracts text from `input` with the default [`Pipeline`] options and
/// returns `(ocr_result, pdf_page_count, pdf_page_index)`, dropping the
/// per-page reports.
#[cfg_attr(not(feature = "mcp"), allow(dead_code))]
pub(crate) fn run_ocr(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
) -> Result<(OcrResult, Option<i32>, Option<i32>), DocumentError> {
    let DocumentExtraction {
        ocr_result,
        pdf_page_count,
        pdf_page_index,
        ..
    } = run_document_extraction(input, kind, ocr_backend, Pipeline::new())?;

    Ok((ocr_result, pdf_page_count, pdf_page_index))
}
457
#[cfg(feature = "ocr-tesseract")]
/// Extracts text and per-page reports from `input` according to `kind`.
///
/// * PNG/JPEG: the file is read, its format detected from the bytes, and the
///   image is recognized as a single page with index `0`.
/// * PDF (with the `pdf-input` feature): each page payload is either vector
///   text (used as-is, no OCR backend) or a raster that is recognized as PNG;
///   page results are merged into one document-level result.
/// * PDF (without `pdf-input`): returns `DocumentError::UnsupportedInput`.
///
/// # Errors
///
/// Propagates I/O, format-detection, PDF-extraction and OCR errors.
fn run_document_extraction(
    input: &Path,
    kind: InputKind,
    ocr_backend: &dyn OcrBackend,
    options: Pipeline,
) -> Result<DocumentExtraction, DocumentError> {
    match kind {
        InputKind::Png | InputKind::Jpeg => {
            let bytes = fs::read(input)?;
            // The format is taken from the bytes, not from `kind`.
            let format = detect_image_format(&bytes)?;
            let (result, column_count) = recognize_image(
                ocr_backend,
                ImageInput {
                    bytes,
                    format,
                    dpi: None,
                },
                options.column_detection,
            )?;
            let page_report = PageReport::new(
                0,
                OcrSource::Ocr,
                Some(ocr_backend.name().to_string()),
                &result,
                column_count,
                options.low_confidence_threshold,
            );
            Ok(DocumentExtraction {
                ocr_result: result,
                pdf_page_count: None,
                pdf_page_index: None,
                pages: vec![page_report],
                page_count: 1,
            })
        }
        InputKind::Pdf => {
            #[cfg(feature = "pdf-input")]
            {
                use crate::extract::pdf::{extract_pages, PdfPagePayload, PdfRasterConfig};
                let payloads = extract_pages(input, PdfRasterConfig::new())?;
                let mut page_results = Vec::with_capacity(payloads.len());
                let mut pages = Vec::with_capacity(payloads.len());
                let mut pdf_page_count = None;
                let mut first_page_index = None;

                for payload in payloads {
                    // Overwritten on every iteration; the last payload's value wins.
                    pdf_page_count = Some(payload.page_count());
                    if first_page_index.is_none() {
                        first_page_index = Some(payload.page_index());
                    }
                    match payload {
                        PdfPagePayload::VectorText {
                            text, page_index, ..
                        } => {
                            // Vector text carries no confidence and a word count of 0.
                            let result = OcrResult::new(text, None, 0, "vector-pdf".to_string());
                            pages.push(PageReport::new(
                                page_index,
                                OcrSource::VectorPdf,
                                None,
                                &result,
                                1,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                        PdfPagePayload::Raster(raster) => {
                            let (result, column_count) = recognize_image(
                                ocr_backend,
                                ImageInput {
                                    bytes: raster.png_bytes,
                                    format: ImageFormat::Png,
                                    dpi: None,
                                },
                                options.column_detection,
                            )?;
                            pages.push(PageReport::new(
                                raster.page_index,
                                OcrSource::Ocr,
                                Some(ocr_backend.name().to_string()),
                                &result,
                                column_count,
                                options.low_confidence_threshold,
                            ));
                            page_results.push(result);
                        }
                    }
                }

                Ok(DocumentExtraction {
                    ocr_result: merge_page_results(&page_results),
                    pdf_page_count,
                    pdf_page_index: first_page_index,
                    // Read the length before `pages` is moved into the struct.
                    page_count: pages.len() as u32,
                    pages,
                })
            }
            #[cfg(not(feature = "pdf-input"))]
            {
                Err(DocumentError::UnsupportedInput {
                    path: input.to_path_buf(),
                    reason: "rebuild gaze-document with `--features pdf-input` for PDF support",
                })
            }
        }
    }
}
565
#[cfg(feature = "ocr-tesseract")]
/// Preprocesses `image`, runs it through `ocr_backend` with default hints,
/// and builds an [`OcrResult`] (plus column count) from the returned spans.
///
/// # Errors
///
/// Backend errors are translated with `map_ocr_error_to_document_error`.
fn recognize_image(
    ocr_backend: &dyn OcrBackend,
    image: ImageInput,
    column_detection: bool,
) -> Result<(OcrResult, u32), DocumentError> {
    let hints = OcrHints::default();
    // Capture the language before `hints` is moved into the backend call.
    let lang = hints.primary_language().to_string();
    let prepared = crate::preprocess::preprocess_image(image);

    match ocr_backend.recognize(prepared, hints) {
        Ok(spans) => Ok(OcrResult::from_spans_with_column_detection(
            &spans,
            lang,
            column_detection,
        )),
        Err(err) => Err(map_ocr_error_to_document_error(err)),
    }
}
584
585#[cfg(feature = "ocr-tesseract")]
586fn merge_page_results(results: &[OcrResult]) -> OcrResult {
587 let text = results
588 .iter()
589 .map(|result| result.text.as_str())
590 .collect::<Vec<_>>()
591 .join("\n\n");
592 let mut conf_sum = 0.0f64;
593 let mut conf_count = 0usize;
594 for result in results {
595 if let Some(confidence) = result.mean_confidence {
596 conf_sum += confidence as f64 * result.word_count as f64;
597 conf_count += result.word_count;
598 }
599 }
600 let mean_confidence = if conf_count == 0 {
601 None
602 } else {
603 Some((conf_sum / conf_count as f64) as f32)
604 };
605 OcrResult::new(text, mean_confidence, conf_count, "mixed".to_string())
606}
607
#[cfg(feature = "ocr-tesseract")]
/// Translates an [`OcrError`] into the matching [`DocumentError`].
///
/// `RecognizeFailed` is reported with status `-1`, and `UnsupportedFormat`
/// with an empty path because no path is available at this layer.
fn map_ocr_error_to_document_error(err: OcrError) -> DocumentError {
    match err {
        OcrError::InitFailed(hint) => DocumentError::TesseractNotFound(hint),
        OcrError::RecognizeFailed(detail) => {
            let stderr = detail;
            DocumentError::TesseractFailed { status: -1, stderr }
        }
        OcrError::UnsupportedFormat(format) => {
            let reason = match format {
                ImageFormat::Png => "png image format is not supported by the OCR backend",
                ImageFormat::Jpeg => "jpeg image format is not supported by the OCR backend",
                ImageFormat::Tiff => "tiff image format is not supported by the OCR backend",
            };
            DocumentError::UnsupportedInput {
                path: PathBuf::new(),
                reason,
            }
        }
        OcrError::Internal(detail) => DocumentError::Pipeline(format!("ocr: {detail}")),
    }
}
627
// The second `cfg` is subsumed by the first: both must hold, so this function
// is only compiled when `ocr-tesseract` is enabled.
#[cfg(feature = "ocr-tesseract")]
#[cfg(any(feature = "ocr-tesseract", feature = "mcp"))]
/// Builds the redaction pipeline used for documents.
///
/// The pipeline detects emails and phone numbers with regex detectors and
/// recipient names with an anchored recognizer, tokenizes the `Email`,
/// custom `phone` and `Name` classes, and preserves everything else.
///
/// # Errors
///
/// Returns `DocumentError::Pipeline` (tagged with the failing stage) when a
/// detector cannot be constructed or the pipeline fails to build.
pub(crate) fn build_document_pipeline() -> Result<GazePipeline, DocumentError> {
    let email = RegexDetector::emails().map_err(|err| pipeline_err("email-regex", err))?;
    // Pattern: optional `+`, 1–3 digits, a mandatory separator, three digits
    // optionally wrapped in parentheses, then 3–4 digits and up to four more,
    // with optional `-`, `.` or whitespace separators in between.
    let phone = RegexDetector::new(
        r"\+?\d{1,3}[-.\s]\(?\d{3}\)?[-.\s]?\d{3,4}[-.\s]?\d{0,4}",
        PiiClass::custom("phone"),
    )
    .map_err(|err| pipeline_err("phone-regex", err))?;
    // NOTE(review): the arguments are positional; the meaning of the numeric
    // values (48, 2, 0.88, 110) is defined by `AnchoredMatchRecognizer::new` —
    // confirm against the `gaze_recognizers` documentation before changing them.
    let recipient_name = AnchoredMatchRecognizer::new(
        "gaze_document.name.recipient".to_string(),
        // Cue phrases that anchor a recipient name.
        vec![
            "Bill to".to_string(),
            "Invoice to".to_string(),
            "Ship to".to_string(),
            "Attention".to_string(),
            "Attn".to_string(),
        ],
        AnchoredBoundary::LineEnd,
        48,
        NameShape::PersonName,
        CuePosition::Before,
        "invoice_recipient".to_string(),
        2,
        0.88,
        110,
    );
    GazePipeline::builder()
        .detector(email)
        .detector(phone)
        .recognizer(recipient_name)
        .rule(ClassRule::new(PiiClass::Email, Action::Tokenize))
        .rule(ClassRule::new(PiiClass::custom("phone"), Action::Tokenize))
        .rule(ClassRule::new(PiiClass::Name, Action::Tokenize))
        // Anything not covered by a class rule is left untouched.
        .rule(DefaultRule::new(Action::Preserve))
        .build()
        .map_err(|err| pipeline_err("build", err))
}
675
#[cfg(feature = "ocr-tesseract")]
/// Counts emitted spans per canonical class name.
///
/// The result is ordered by class name because the tally is kept in a
/// `BTreeMap`.
fn count_pii_by_class(spans: &[EmittedTokenSpan]) -> Vec<ClassCount> {
    let tally = spans
        .iter()
        .fold(BTreeMap::<String, u32>::new(), |mut tally, span| {
            *tally.entry(span.class.to_canonical_str()).or_default() += 1;
            tally
        });

    tally
        .into_iter()
        .map(|(class, count)| ClassCount::new(class, count))
        .collect()
}
687
#[cfg(feature = "ocr-tesseract")]
/// Writes the three bundle files into `out_dir`: the cleaned markdown, then
/// the pretty-printed manifest JSON, then the pretty-printed report JSON.
///
/// # Errors
///
/// Stops at the first serialization or write failure; files written before
/// the failure are left in place.
fn write_bundle(
    out_dir: &Path,
    clean_markdown: &str,
    manifest: &Manifest,
    report: &BundleReport,
) -> Result<(), DocumentError> {
    let target = |file_name: &str| out_dir.join(file_name);

    fs::write(target(CLEAN_MARKDOWN_FILE), clean_markdown)?;
    fs::write(target(MANIFEST_FILE), serde_json::to_vec_pretty(manifest)?)?;
    fs::write(target(REPORT_FILE), serde_json::to_vec_pretty(report)?)?;
    Ok(())
}
702
703#[cfg(feature = "ocr-tesseract")]
704pub(crate) fn format_clean_markdown(text: &str, kind: InputKind) -> String {
705 let mut out = String::new();
706 out.push_str("# gaze-document safe bundle\n\n");
707 out.push_str(&format!("Source kind: `{}`\n\n", kind_label(kind)));
708 out.push_str("---\n\n");
709 out.push_str(text);
710 if !text.ends_with('\n') {
711 out.push('\n');
712 }
713 out
714}
715
#[cfg(feature = "ocr-tesseract")]
/// Returns the lowercase label for an input kind (`"png"`, `"jpeg"` or
/// `"pdf"`), as used in the report and the markdown header.
pub(crate) fn kind_label(kind: InputKind) -> &'static str {
    match kind {
        InputKind::Png => "png",
        InputKind::Jpeg => "jpeg",
        InputKind::Pdf => "pdf",
    }
}
724
725#[cfg(feature = "ocr-tesseract")]
/// Makes `path` absolute by joining it onto the current directory.
///
/// Absolute paths are returned unchanged. If the current directory cannot be
/// determined, the path is returned as given (best effort, no error).
fn absolutize(path: &Path) -> PathBuf {
    if !path.is_absolute() {
        if let Ok(cwd) = std::env::current_dir() {
            return cwd.join(path);
        }
    }
    path.to_path_buf()
}
735
#[cfg(feature = "ocr-tesseract")]
/// Wraps `err` in `DocumentError::Pipeline`, prefixed with the stage name
/// (`"<stage>: <err>"`).
fn pipeline_err(stage: &'static str, err: impl std::fmt::Display) -> DocumentError {
    DocumentError::Pipeline(format!("{stage}: {err}"))
}
740
741#[cfg(all(test, feature = "ocr-tesseract"))]
742mod tests {
743 use super::*;
744 use crate::ocr::{BBox, OcrSpan};
745
    /// OCR backend stub that returns a fixed list of spans for any image.
    #[derive(Debug)]
    struct MockBackend {
        spans: Vec<OcrSpan>,
    }

    impl OcrBackend for MockBackend {
        fn name(&self) -> &str {
            "mock-ocr"
        }

        // Both the image and the hints are ignored; the canned spans are cloned out.
        fn recognize(
            &self,
            _image: ImageInput,
            _hints: OcrHints,
        ) -> Result<Vec<OcrSpan>, OcrError> {
            Ok(self.spans.clone())
        }
    }

    /// Builds a span at `(x, y)` with a fixed 90x16 box and the given confidence.
    fn span(text: &str, x: u32, y: u32, confidence: f32) -> OcrSpan {
        OcrSpan {
            text: text.to_string(),
            bbox: BBox { x, y, w: 90, h: 16 },
            confidence: Some(confidence),
        }
    }
772
    /// Two email spans and one custom phone span collapse into two class
    /// entries keyed by their canonical names.
    #[test]
    fn count_pii_by_class_groups_email_and_phone() {
        let spans = vec![
            EmittedTokenSpan::new(0..10, 0..10, PiiClass::Email),
            EmittedTokenSpan::new(20..28, 20..28, PiiClass::Email),
            EmittedTokenSpan::new(40..50, 40..50, PiiClass::custom("phone")),
        ];
        let counts = count_pii_by_class(&spans);
        assert_eq!(counts.len(), 2);
        let by_class: BTreeMap<_, _> = counts.iter().map(|c| (c.class.as_str(), c.count)).collect();
        assert_eq!(by_class.get("email"), Some(&2));
        assert_eq!(by_class.get("custom:phone"), Some(&1));
    }
786
    /// A freshly built report serializes the current bundle version, the
    /// snake_case OCR source, and the configured threshold.
    #[test]
    fn report_serializes_with_bundle_version() {
        let ocr = OcrResult::new("body".into(), Some(91.5), 2, "eng".into());
        let report = BundleReport::new(
            "png",
            &ocr,
            42,
            3,
            vec![
                ClassCount::new("email", 2),
                ClassCount::new("custom:phone", 1),
            ],
            None,
            None,
            vec![PageReport::new(
                0,
                OcrSource::Ocr,
                Some("tesseract".to_string()),
                &ocr,
                1,
                DEFAULT_LOW_CONFIDENCE_THRESHOLD,
            )],
            DEFAULT_LOW_CONFIDENCE_THRESHOLD,
        );
        let json = serde_json::to_value(&report).expect("serialize");
        assert_eq!(json["bundle_version"], BUNDLE_VERSION);
        assert_eq!(json["input_kind"], "png");
        assert_eq!(json["pii_token_count"], 3);
        assert_eq!(json["pages"][0]["ocr_source"], "ocr");
        assert_eq!(
            json["low_confidence_threshold"],
            DEFAULT_LOW_CONFIDENCE_THRESHOLD
        );
    }
821
    /// Backward compatibility: a version-1 report that lacks `pages` and
    /// `low_confidence_threshold` still parses, with serde defaults filled in.
    #[test]
    fn v1_report_without_page_fields_still_deserializes() {
        let json = serde_json::json!({
            "bundle_version": 1,
            "input_kind": "png",
            "ocr_mean_confidence": 90.0,
            "ocr_word_count": 2,
            "ocr_lang": "eng",
            "clean_char_count": 12,
            "pii_token_count": 1,
            "pii_tokens_by_class": [{ "class": "email", "count": 1 }],
            "pdf_page_count": null,
            "pdf_page_index": null
        });

        let report: BundleReport = serde_json::from_value(json).expect("v1 parses");

        assert_eq!(report.bundle_version, 1);
        assert!(report.pages.is_empty());
        assert_eq!(
            report.low_confidence_threshold,
            DEFAULT_LOW_CONFIDENCE_THRESHOLD
        );
    }
846
    /// End-to-end run with canned spans: confidence 0.50 is below the 0.65
    /// threshold, the spans are reported as two columns, and the email is
    /// replaced by a token in the output.
    #[test]
    fn clean_with_mock_backend_flags_low_confidence_and_columns() {
        let backend = MockBackend {
            spans: vec![
                span("Bill", 20, 10, 0.50),
                span("to:", 116, 10, 0.50),
                span("Jane", 20, 36, 0.50),
                span("Doe", 116, 36, 0.50),
                span("Email:", 360, 10, 0.50),
                span("alice@example.invalid", 360, 36, 0.50),
            ],
        };
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        // Starts with the PNG signature; the rest is not a real image, and the
        // mock backend ignores the image it is given.
        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");

        let bundle = Pipeline::new()
            .with_low_confidence_threshold(0.65)
            .clean_with_ocr_backend(&input, tmp.path(), &backend)
            .expect("clean succeeds");

        assert_eq!(bundle.report.bundle_version, 2);
        assert_eq!(bundle.report.pages.len(), 1);
        let page = &bundle.report.pages[0];
        assert_eq!(page.ocr_backend.as_deref(), Some("mock-ocr"));
        assert_eq!(page.column_count, 2);
        assert_eq!(page.confidence, Some(0.5));
        assert!(page.low_confidence);
        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }
886
    /// A two-column table layout is reported as a single column, keeps its
    /// row pairing in the output, and still has the email tokenized.
    #[test]
    fn clean_with_mock_backend_preserves_table_cell_context() {
        let backend = MockBackend {
            spans: vec![
                span("Field", 20, 10, 0.92),
                span("Value", 160, 10, 0.92),
                span("Bill", 20, 40, 0.92),
                span("Jane", 160, 40, 0.92),
                span("Email", 20, 70, 0.92),
                span("alice@example.invalid", 160, 70, 0.92),
            ],
        };
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        // PNG signature followed by filler; the mock backend ignores the image.
        fs::write(&input, b"\x89PNG\r\n\x1A\nnot-real-image").expect("write input");

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, tmp.path(), &backend)
            .expect("clean succeeds");

        assert_eq!(bundle.report.pages[0].column_count, 1);
        assert!(
            bundle.clean_markdown.contains("Field\nValue\n\nBill\nJane"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }
924
    /// The backend only "sees" text when the image it receives is wider than
    /// tall, so the test passes only if the sideways input is rotated back
    /// before the backend is called.
    #[cfg(feature = "pdf-input")]
    #[test]
    fn clean_preprocesses_rotated_image_before_backend_ocr() {
        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};

        #[derive(Debug)]
        struct OrientationSensitiveBackend;

        impl OcrBackend for OrientationSensitiveBackend {
            fn name(&self) -> &str {
                "orientation-sensitive"
            }

            fn recognize(
                &self,
                image: ImageInput,
                _hints: OcrHints,
            ) -> Result<Vec<OcrSpan>, OcrError> {
                let decoded = image::load_from_memory(&image.bytes)
                    .map_err(|err| OcrError::Internal(err.to_string()))?;
                // Portrait (or square) input yields no spans.
                if decoded.width() <= decoded.height() {
                    return Ok(Vec::new());
                }
                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
            }
        }

        // White 120x80 canvas with a dark horizontal bar, then rotated 90 degrees.
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 38..42 {
            for x in 16..104 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        let sideways = image::imageops::rotate90(&image);
        let mut bytes = Vec::new();
        sideways
            .write_to(
                &mut std::io::Cursor::new(&mut bytes),
                EncodedImageFormat::Png,
            )
            .expect("encode png");
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, bytes).expect("write input");

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, tmp.path(), &OrientationSensitiveBackend)
            .expect("clean succeeds");

        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }
985
    /// The backend only "sees" text when the image's horizontal score exceeds
    /// that of the raw skewed input, so the test passes only if the image is
    /// straightened before the backend is called.
    #[cfg(feature = "pdf-input")]
    #[test]
    fn clean_deskews_image_before_backend_ocr() {
        use image::{GrayImage, ImageFormat as EncodedImageFormat, Luma};
        use imageproc::geometric_transformations::{rotate_about_center, Interpolation};

        /// Sum over rows of (dark pixels in the row) squared; a pixel counts
        /// as dark when its luma is below 200.
        fn horizontal_score(bytes: &[u8]) -> Result<u64, OcrError> {
            let decoded = image::load_from_memory(bytes)
                .map_err(|err| OcrError::Internal(err.to_string()))?
                .to_luma8();
            let mut score = 0u64;
            for y in 0..decoded.height() {
                let mut dark = 0u64;
                for x in 0..decoded.width() {
                    if decoded.get_pixel(x, y).0[0] < 200 {
                        dark += 1;
                    }
                }
                score = score.saturating_add(dark.saturating_mul(dark));
            }
            Ok(score)
        }

        #[derive(Debug)]
        struct DeskewSensitiveBackend {
            minimum_score: u64,
        }

        impl OcrBackend for DeskewSensitiveBackend {
            fn name(&self) -> &str {
                "deskew-sensitive"
            }

            fn recognize(
                &self,
                image: ImageInput,
                _hints: OcrHints,
            ) -> Result<Vec<OcrSpan>, OcrError> {
                // Images scoring below the minimum yield no spans.
                if horizontal_score(&image.bytes)? < self.minimum_score {
                    return Ok(Vec::new());
                }
                Ok(vec![span("alice@example.invalid", 20, 20, 0.91)])
            }
        }

        // White 120x80 canvas with a dark horizontal bar, then skewed by 4 degrees.
        let mut image = GrayImage::from_pixel(120, 80, Luma([255]));
        for y in 38..42 {
            for x in 16..104 {
                image.put_pixel(x, y, Luma([0]));
            }
        }
        let skewed = rotate_about_center(
            &image,
            4.0_f32.to_radians(),
            Interpolation::Nearest,
            Luma([255]),
        );
        let mut bytes = Vec::new();
        skewed
            .write_to(
                &mut std::io::Cursor::new(&mut bytes),
                EncodedImageFormat::Png,
            )
            .expect("encode png");
        // Require a score strictly better than the raw skewed image achieves.
        let raw_score = horizontal_score(&bytes).expect("raw score");
        let backend = DeskewSensitiveBackend {
            minimum_score: raw_score + 1_000,
        };
        // Sanity check: without preprocessing the backend finds nothing.
        assert!(
            backend
                .recognize(
                    ImageInput {
                        bytes: bytes.clone(),
                        format: ImageFormat::Png,
                        dpi: None
                    },
                    OcrHints::default()
                )
                .expect("raw recognize")
                .is_empty(),
            "raw skewed payload should miss before preprocessing"
        );
        let tmp = tempfile::tempdir().expect("tempdir");
        let input = tmp.path().join("input.png");
        fs::write(&input, bytes).expect("write input");

        let bundle = Pipeline::new()
            .clean_with_ocr_backend(&input, tmp.path(), &backend)
            .expect("clean succeeds");

        assert!(
            bundle.clean_markdown.contains(":Email_"),
            "{}",
            bundle.clean_markdown
        );
        assert!(
            !bundle.clean_markdown.contains("alice@example.invalid"),
            "{}",
            bundle.clean_markdown
        );
    }
1087
    /// Text without a trailing newline gets one, and the header names the
    /// source kind.
    #[test]
    fn format_clean_markdown_appends_trailing_newline() {
        let md = format_clean_markdown("hello", InputKind::Png);
        assert!(md.ends_with('\n'));
        assert!(md.contains("Source kind: `png`"));
        assert!(md.contains("hello"));
    }
1095}