1use crate::Result;
8use crate::error::Error;
9use crate::ingest::SourceProvenance;
10use async_trait::async_trait;
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13use std::collections::{BTreeMap, HashMap};
14use std::path::PathBuf;
15use tokio::process::Command;
16
/// Identifies the OCR engine associated with a backend or a produced document.
///
/// Serialized by serde with `snake_case` variant names.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrEngine {
    /// Reported by `AppleVisionOcrBackend::engine`.
    AppleVision,
    /// Reported by `TesseractOcrBackend::engine`.
    Tesseract,
    /// Engine used by `FixtureOcrBackend::new`.
    Mock,
    /// NOTE(review): not referenced by the code visible here; presumably an
    /// engine outside this module — confirm against callers.
    External,
}
30
/// Kind of image an OCR request targets.
///
/// The Tesseract backend uses this to pick a default page segmentation mode
/// (see `default_psm_for_target`). Serialized with `snake_case` variant names.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrTargetKind {
    /// Maps to Tesseract `--psm 11` when no explicit PSM is configured.
    Screenshot,
    /// Maps to Tesseract `--psm 3` when no explicit PSM is configured.
    Photo,
    /// No default PSM is applied for this kind.
    GenericImage,
}
42
43#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
45pub struct BoundingBox {
46 pub x: f32,
48 pub y: f32,
50 pub width: f32,
52 pub height: f32,
54}
55
56impl BoundingBox {
57 pub fn is_normalized(&self) -> bool {
59 self.x >= 0.0
60 && self.y >= 0.0
61 && self.width >= 0.0
62 && self.height >= 0.0
63 && self.x <= 1.0
64 && self.y <= 1.0
65 && self.width <= 1.0
66 && self.height <= 1.0
67 && self.x + self.width <= 1.0
68 && self.y + self.height <= 1.0
69 }
70}
71
/// Granularity / role of a recognized text block.
///
/// Serialized with `snake_case` variant names. The backends visible in this
/// file only ever emit [`OcrBlockKind::Line`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrBlockKind {
    /// NOTE(review): not produced by visible code; presumably a paragraph-level block.
    Paragraph,
    /// A single line of text; used by `OcrTextBlock::line` and both CLI backends.
    Line,
    /// NOTE(review): not produced by visible code; presumably a single word.
    Word,
    /// NOTE(review): not produced by visible code; presumably UI decoration text — confirm.
    UiChrome,
    /// NOTE(review): not produced by visible code; presumably an unclassified block.
    Unknown,
}
87
88#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
90pub struct OcrTextBlock {
91 pub text: String,
93 pub confidence: Option<f32>,
95 pub bbox: Option<BoundingBox>,
97 pub kind: OcrBlockKind,
99}
100
101impl OcrTextBlock {
102 pub fn line(text: impl Into<String>) -> Self {
104 Self {
105 text: text.into(),
106 confidence: None,
107 bbox: None,
108 kind: OcrBlockKind::Line,
109 }
110 }
111}
112
113#[derive(Debug, Clone, Serialize, Deserialize)]
115pub struct ImageOcrRequest {
116 pub path: PathBuf,
118 pub target_kind: OcrTargetKind,
120 pub provenance: SourceProvenance,
122 pub language_hints: Vec<String>,
124 pub min_confidence: Option<f32>,
126 pub metadata: HashMap<String, String>,
128}
129
130impl ImageOcrRequest {
131 pub fn new(
133 path: impl Into<PathBuf>,
134 target_kind: OcrTargetKind,
135 provenance: SourceProvenance,
136 ) -> Self {
137 Self {
138 path: path.into(),
139 target_kind,
140 provenance,
141 language_hints: Vec::new(),
142 min_confidence: None,
143 metadata: HashMap::new(),
144 }
145 }
146
147 pub fn with_language_hint(mut self, language: impl Into<String>) -> Self {
149 self.language_hints.push(language.into());
150 self
151 }
152
153 pub fn with_min_confidence(mut self, min_confidence: f32) -> Self {
155 self.min_confidence = Some(min_confidence);
156 self
157 }
158}
159
160#[derive(Debug, Clone, Serialize, Deserialize)]
162pub struct OcrDocument {
163 pub engine: OcrEngine,
165 pub target_kind: OcrTargetKind,
167 pub full_text: String,
169 pub blocks: Vec<OcrTextBlock>,
171 pub extracted_at: DateTime<Utc>,
173 pub provenance: SourceProvenance,
175}
176
177impl OcrDocument {
178 pub fn new(
180 engine: OcrEngine,
181 target_kind: OcrTargetKind,
182 provenance: SourceProvenance,
183 ) -> Self {
184 Self {
185 engine,
186 target_kind,
187 full_text: String::new(),
188 blocks: Vec::new(),
189 extracted_at: Utc::now(),
190 provenance,
191 }
192 }
193
194 pub fn effective_text(&self) -> String {
198 let trimmed = self.full_text.trim();
199 if !trimmed.is_empty() {
200 return trimmed.to_string();
201 }
202
203 self.blocks
204 .iter()
205 .map(|block| block.text.trim())
206 .filter(|text| !text.is_empty())
207 .collect::<Vec<_>>()
208 .join("\n")
209 }
210
211 pub fn filtered_text(&self, min_confidence: f32) -> String {
213 self.blocks
214 .iter()
215 .filter(|block| block.confidence.unwrap_or(1.0) >= min_confidence)
216 .map(|block| block.text.trim())
217 .filter(|text| !text.is_empty())
218 .collect::<Vec<_>>()
219 .join("\n")
220 }
221}
222
/// Common interface implemented by every OCR backend in this module.
///
/// Implementations must be `Send + Sync`; the async method is provided via
/// the `async_trait` macro.
#[async_trait]
pub trait OcrBackend: Send + Sync {
    /// Returns the engine this backend represents.
    fn engine(&self) -> OcrEngine;

    /// Runs OCR for `request` and returns the recognized document.
    ///
    /// # Errors
    ///
    /// Returns an error when the backend cannot produce a document; the exact
    /// conditions are backend-specific.
    async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument>;
}
232
/// Settings for [`TesseractOcrBackend`].
#[derive(Debug, Clone)]
pub struct TesseractOcrConfig {
    /// Program name or path used to launch tesseract.
    pub binary_path: String,
    /// Languages used when a request carries no language hints.
    pub default_languages: Vec<String>,
    /// Value passed as `--oem`, when set.
    pub oem: Option<u8>,
    /// Value passed as `--psm`; when unset a per-target default may apply.
    pub psm: Option<u8>,
    /// Additional arguments appended after the built-in options.
    pub extra_args: Vec<String>,
    /// Whether surrounding whitespace is stripped from the recognized text.
    pub trim_output: bool,
    /// Whether plain-text output is split into one block per non-empty line.
    pub synthesize_line_blocks: bool,
}

impl Default for TesseractOcrConfig {
    /// `tesseract` from `PATH`, English, no explicit OEM/PSM, no extra
    /// arguments, trimming and line-block synthesis enabled.
    fn default() -> Self {
        Self {
            binary_path: String::from("tesseract"),
            default_languages: vec![String::from("eng")],
            oem: None,
            psm: None,
            extra_args: Vec::new(),
            trim_output: true,
            synthesize_line_blocks: true,
        }
    }
}

impl TesseractOcrConfig {
    /// Replaces the tesseract program name/path.
    pub fn with_binary_path(self, binary_path: impl Into<String>) -> Self {
        Self {
            binary_path: binary_path.into(),
            ..self
        }
    }

    /// Replaces the default language list.
    pub fn with_default_languages(
        self,
        langs: impl IntoIterator<Item = impl Into<String>>,
    ) -> Self {
        let mut default_languages = Vec::new();
        for lang in langs {
            default_languages.push(lang.into());
        }
        Self {
            default_languages,
            ..self
        }
    }

    /// Sets the `--oem` value.
    pub fn with_oem(self, oem: u8) -> Self {
        Self {
            oem: Some(oem),
            ..self
        }
    }

    /// Sets the `--psm` value, overriding any per-target default.
    pub fn with_psm(self, psm: u8) -> Self {
        Self {
            psm: Some(psm),
            ..self
        }
    }

    /// Appends one extra command-line argument.
    pub fn with_extra_arg(self, arg: impl Into<String>) -> Self {
        let mut extra_args = self.extra_args;
        extra_args.push(arg.into());
        Self { extra_args, ..self }
    }

    /// Disables synthesizing per-line blocks from plain-text output.
    pub fn without_line_blocks(self) -> Self {
        Self {
            synthesize_line_blocks: false,
            ..self
        }
    }
}
306
/// OCR backend that shells out to the tesseract command-line program.
///
/// The `OcrBackend` implementation first requests TSV output and falls back
/// to plain-text output when that attempt yields nothing usable.
#[derive(Debug, Clone, Default)]
pub struct TesseractOcrBackend {
    /// Settings controlling how tesseract is invoked and how output is post-processed.
    config: TesseractOcrConfig,
}
316
/// Output format requested from a tesseract invocation.
#[derive(Debug, Clone, Copy)]
enum TesseractOutputMode {
    /// Plain recognized text on stdout (no extra config argument).
    PlainText,
    /// Tab-separated output; selected by appending the `tsv` argument.
    Tsv,
}
322
/// Identifies one text line in tesseract TSV output.
///
/// Used as a `BTreeMap` key; the derived ordering compares the fields in
/// declaration order (page, block, paragraph, line), which fixes the order in
/// which lines are emitted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct TesseractLineKey {
    /// Value of the TSV `page_num` column.
    page_num: u32,
    /// Value of the TSV `block_num` column.
    block_num: u32,
    /// Value of the TSV `par_num` column.
    par_num: u32,
    /// Value of the TSV `line_num` column.
    line_num: u32,
}
330
/// Page size taken from a level-1 row of the tesseract TSV output.
///
/// Used as the divisor when normalizing word/line boxes; the parser only
/// stores entries where both values are greater than zero.
#[derive(Debug, Clone, Copy)]
struct TesseractPageDimensions {
    /// Value of the row's `width` column.
    width: u32,
    /// Value of the row's `height` column.
    height: u32,
}
336
/// Running aggregate of the words that belong to one TSV text line.
#[derive(Debug, Clone)]
struct TesseractLineAccumulator {
    /// `(word_num, text)` pairs in the order they were added.
    words: Vec<(u32, String)>,
    /// Sum of the confidences of all added words.
    confidence_sum: f32,
    /// Number of words added so far.
    confidence_count: usize,
    /// Smallest left edge seen.
    left: u32,
    /// Smallest top edge seen.
    top: u32,
    /// Largest right edge (`left + width`) seen.
    right: u32,
    /// Largest bottom edge (`top + height`) seen.
    bottom: u32,
}

impl TesseractLineAccumulator {
    /// Starts a line from its first word.
    ///
    /// The right/bottom edges are computed with saturating addition so that
    /// oversized values cannot overflow.
    fn new(
        word_num: u32,
        text: String,
        confidence: f32,
        left: u32,
        top: u32,
        width: u32,
        height: u32,
    ) -> Self {
        let right = left.saturating_add(width);
        let bottom = top.saturating_add(height);
        Self {
            words: vec![(word_num, text)],
            confidence_sum: confidence,
            confidence_count: 1,
            left,
            top,
            right,
            bottom,
        }
    }

    /// Adds another word: records its text and confidence and grows the
    /// line's box to the union of the current box and the word's box.
    fn add_word(
        &mut self,
        word_num: u32,
        text: String,
        confidence: f32,
        left: u32,
        top: u32,
        width: u32,
        height: u32,
    ) {
        let word_right = left.saturating_add(width);
        let word_bottom = top.saturating_add(height);

        if left < self.left {
            self.left = left;
        }
        if top < self.top {
            self.top = top;
        }
        if word_right > self.right {
            self.right = word_right;
        }
        if word_bottom > self.bottom {
            self.bottom = word_bottom;
        }

        self.confidence_count += 1;
        self.confidence_sum += confidence;
        self.words.push((word_num, text));
    }
}
388
/// Result of parsing tesseract TSV output.
#[derive(Debug, Clone)]
struct ParsedTesseractTsv {
    /// Non-empty line texts joined with newlines.
    full_text: String,
    /// One block per recognized text line.
    blocks: Vec<OcrTextBlock>,
}
394
395impl TesseractOcrBackend {
396 pub fn new() -> Self {
398 Self::default()
399 }
400
401 pub fn with_config(config: TesseractOcrConfig) -> Self {
403 Self { config }
404 }
405
406 pub fn binary_path(&self) -> &str {
408 &self.config.binary_path
409 }
410
411 pub async fn is_available(&self) -> bool {
413 self.version().await.is_ok()
414 }
415
416 pub async fn version(&self) -> Result<String> {
418 let output = Command::new(&self.config.binary_path)
419 .arg("--version")
420 .output()
421 .await
422 .map_err(|err| {
423 Error::ingest(format!(
424 "failed to execute tesseract binary '{}': {err}",
425 self.config.binary_path
426 ))
427 })?;
428
429 if !output.status.success() {
430 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
431 return Err(Error::ingest(format!(
432 "tesseract --version failed (status {}): {stderr}",
433 output.status
434 )));
435 }
436
437 let version = String::from_utf8_lossy(&output.stdout).trim().to_string();
438 if version.is_empty() {
439 return Err(Error::ingest("tesseract --version returned empty output"));
440 }
441
442 Ok(version)
443 }
444
445 fn build_args(&self, request: &ImageOcrRequest) -> Vec<String> {
446 self.build_args_for_mode(request, TesseractOutputMode::PlainText)
447 }
448
449 fn build_tsv_args(&self, request: &ImageOcrRequest) -> Vec<String> {
450 self.build_args_for_mode(request, TesseractOutputMode::Tsv)
451 }
452
453 fn build_args_for_mode(
454 &self,
455 request: &ImageOcrRequest,
456 mode: TesseractOutputMode,
457 ) -> Vec<String> {
458 let mut args = vec![
459 request.path.to_string_lossy().into_owned(),
460 "stdout".to_string(),
461 ];
462
463 let languages = if request.language_hints.is_empty() {
464 self.config.default_languages.clone()
465 } else {
466 request.language_hints.clone()
467 };
468 let languages = normalize_tesseract_language_hints(languages);
469
470 if !languages.is_empty() {
471 args.push("-l".to_string());
472 args.push(languages.join("+"));
473 }
474
475 if let Some(oem) = self.config.oem {
476 args.push("--oem".to_string());
477 args.push(oem.to_string());
478 }
479
480 if let Some(psm) = self
481 .config
482 .psm
483 .or_else(|| default_psm_for_target(request.target_kind))
484 {
485 args.push("--psm".to_string());
486 args.push(psm.to_string());
487 }
488
489 args.extend(self.config.extra_args.iter().cloned());
490
491 if matches!(mode, TesseractOutputMode::Tsv) {
492 args.push("tsv".to_string());
493 }
494
495 args
496 }
497
498 fn normalize_output_text(&self, stdout: &[u8]) -> String {
499 let text = String::from_utf8_lossy(stdout).to_string();
500 if self.config.trim_output {
501 text.trim().to_string()
502 } else {
503 text
504 }
505 }
506
507 fn synthesize_blocks(&self, text: &str) -> Vec<OcrTextBlock> {
508 if !self.config.synthesize_line_blocks {
509 return Vec::new();
510 }
511
512 text.lines()
513 .map(str::trim)
514 .filter(|line| !line.is_empty())
515 .map(OcrTextBlock::line)
516 .collect()
517 }
518
519 async fn run_tesseract(
520 &self,
521 request: &ImageOcrRequest,
522 mode: TesseractOutputMode,
523 ) -> Result<Vec<u8>> {
524 let args = match mode {
525 TesseractOutputMode::PlainText => self.build_args(request),
526 TesseractOutputMode::Tsv => self.build_tsv_args(request),
527 };
528
529 let output = Command::new(&self.config.binary_path)
530 .args(&args)
531 .output()
532 .await
533 .map_err(|err| {
534 Error::ingest(format!(
535 "failed to execute tesseract binary '{}': {err}",
536 self.config.binary_path
537 ))
538 })?;
539
540 if !output.status.success() {
541 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
542 let mode_label = match mode {
543 TesseractOutputMode::PlainText => "plain-text",
544 TesseractOutputMode::Tsv => "tsv",
545 };
546 return Err(Error::ingest(format!(
547 "tesseract OCR ({mode_label}) failed for '{}'(status {}): {}",
548 request.path.display(),
549 output.status,
550 if stderr.is_empty() {
551 "no stderr output".to_string()
552 } else {
553 stderr
554 }
555 )));
556 }
557
558 Ok(output.stdout)
559 }
560
561 fn parse_tsv_output(&self, stdout: &[u8]) -> Result<ParsedTesseractTsv> {
562 let tsv = String::from_utf8_lossy(stdout);
563 parse_tesseract_tsv(&tsv)
564 }
565
566 fn build_document_from_plain_text(
567 &self,
568 request: &ImageOcrRequest,
569 full_text: String,
570 ) -> OcrDocument {
571 let mut doc = OcrDocument::new(
572 self.engine(),
573 request.target_kind,
574 request.provenance.clone(),
575 );
576 doc.full_text = full_text.clone();
577 doc.blocks = self.synthesize_blocks(&full_text);
578 doc
579 }
580
581 fn build_document_from_tsv(
582 &self,
583 request: &ImageOcrRequest,
584 parsed: ParsedTesseractTsv,
585 ) -> OcrDocument {
586 let mut doc = OcrDocument::new(
587 self.engine(),
588 request.target_kind,
589 request.provenance.clone(),
590 );
591 doc.full_text = if self.config.trim_output {
592 parsed.full_text.trim().to_string()
593 } else {
594 parsed.full_text
595 };
596 doc.blocks = parsed.blocks;
597 doc
598 }
599}
600
601fn normalize_tesseract_language_hints(hints: Vec<String>) -> Vec<String> {
602 let mut normalized = Vec::with_capacity(hints.len());
603 for hint in hints {
604 let code = normalize_tesseract_language_hint(&hint);
605 if !code.is_empty() && !normalized.iter().any(|existing| existing == &code) {
606 normalized.push(code);
607 }
608 }
609 normalized
610}
611
/// Converts one language hint into the code tesseract expects after `-l`.
///
/// The hint is trimmed and lower-cased (ASCII). Its primary subtag — the part
/// before the first `-` or `_` — is then looked up in a table of two-letter
/// codes, so `en`, `en-GB` and `en_US` all become `eng`. Anything whose
/// primary subtag is not in the table is returned lower-cased and otherwise
/// untouched, which keeps existing tesseract names such as `eng` or
/// `chi_sim` intact. A blank hint yields an empty string.
fn normalize_tesseract_language_hint(hint: &str) -> String {
    let trimmed = hint.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    let lowered = trimmed.to_ascii_lowercase();

    // Resolve the primary subtag before any pass-through decision so that
    // underscore-separated locales (`en_US`) are mapped just like
    // hyphen-separated ones (`en-US`) instead of leaking through verbatim.
    let mapped: Option<&'static str> = {
        let primary = lowered.split(['-', '_']).next().unwrap_or(&lowered);
        match primary {
            "en" => Some("eng"),
            "de" => Some("deu"),
            "fr" => Some("fra"),
            "es" => Some("spa"),
            "it" => Some("ita"),
            "pt" => Some("por"),
            "nl" => Some("nld"),
            "sv" => Some("swe"),
            "da" => Some("dan"),
            "fi" => Some("fin"),
            "no" | "nb" | "nn" => Some("nor"),
            "pl" => Some("pol"),
            "cs" => Some("ces"),
            "sk" => Some("slk"),
            "sl" => Some("slv"),
            "hr" => Some("hrv"),
            "sr" => Some("srp"),
            "ro" => Some("ron"),
            "hu" => Some("hun"),
            "tr" => Some("tur"),
            "el" => Some("ell"),
            "ru" => Some("rus"),
            "uk" => Some("ukr"),
            "bg" => Some("bul"),
            "he" | "iw" => Some("heb"),
            "ar" => Some("ara"),
            "fa" => Some("fas"),
            "hi" => Some("hin"),
            "bn" => Some("ben"),
            "ta" => Some("tam"),
            "te" => Some("tel"),
            "ml" => Some("mal"),
            "mr" => Some("mar"),
            "gu" => Some("guj"),
            "pa" => Some("pan"),
            "ur" => Some("urd"),
            "ja" => Some("jpn"),
            "ko" => Some("kor"),
            "zh" => Some("chi_sim"),
            "id" => Some("ind"),
            "ms" => Some("msa"),
            "vi" => Some("vie"),
            "th" => Some("tha"),
            "ca" => Some("cat"),
            "et" => Some("est"),
            "lv" => Some("lav"),
            "lt" => Some("lit"),
            _ => None,
        }
    };

    match mapped {
        Some(code) => code.to_string(),
        // Unknown primary subtag: assume the caller already supplied a
        // tesseract language name and pass it through lower-cased.
        None => lowered,
    }
}
683
684#[async_trait]
685impl OcrBackend for TesseractOcrBackend {
686 fn engine(&self) -> OcrEngine {
687 OcrEngine::Tesseract
688 }
689
690 async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
691 if let Ok(tsv_stdout) = self.run_tesseract(request, TesseractOutputMode::Tsv).await {
692 if let Ok(parsed) = self.parse_tsv_output(&tsv_stdout) {
693 if !parsed.full_text.trim().is_empty() || !parsed.blocks.is_empty() {
694 return Ok(self.build_document_from_tsv(request, parsed));
695 }
696 }
697 }
698
699 let plain_stdout = self
700 .run_tesseract(request, TesseractOutputMode::PlainText)
701 .await?;
702 let full_text = self.normalize_output_text(&plain_stdout);
703 Ok(self.build_document_from_plain_text(request, full_text))
704 }
705}
706
707fn default_psm_for_target(target: OcrTargetKind) -> Option<u8> {
708 match target {
709 OcrTargetKind::Screenshot => Some(11), OcrTargetKind::Photo => Some(3), OcrTargetKind::GenericImage => None,
712 }
713}
714
715#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
717#[serde(rename_all = "snake_case")]
718pub enum AppleVisionRecognitionLevel {
719 Fast,
721 Accurate,
723}
724
725impl AppleVisionRecognitionLevel {
726 fn as_str(self) -> &'static str {
727 match self {
728 Self::Fast => "fast",
729 Self::Accurate => "accurate",
730 }
731 }
732}
733
734#[derive(Debug, Clone)]
739pub struct AppleVisionOcrConfig {
740 pub binary_path: String,
742 pub runner_prefix_args: Vec<String>,
744 pub default_languages: Vec<String>,
746 pub recognition_level: AppleVisionRecognitionLevel,
748 pub uses_language_correction: bool,
750 pub trim_output: bool,
752}
753
754impl Default for AppleVisionOcrConfig {
755 fn default() -> Self {
756 Self {
757 binary_path: "xcrun".to_string(),
758 runner_prefix_args: vec!["swift".to_string()],
759 default_languages: vec!["en-US".to_string()],
760 recognition_level: AppleVisionRecognitionLevel::Accurate,
761 uses_language_correction: true,
762 trim_output: true,
763 }
764 }
765}
766
767impl AppleVisionOcrConfig {
768 pub fn with_binary_path(mut self, binary_path: impl Into<String>) -> Self {
770 self.binary_path = binary_path.into();
771 self
772 }
773
774 pub fn with_runner_prefix_args(
776 mut self,
777 args: impl IntoIterator<Item = impl Into<String>>,
778 ) -> Self {
779 self.runner_prefix_args = args.into_iter().map(Into::into).collect();
780 self
781 }
782
783 pub fn with_default_languages(
785 mut self,
786 langs: impl IntoIterator<Item = impl Into<String>>,
787 ) -> Self {
788 self.default_languages = langs.into_iter().map(Into::into).collect();
789 self
790 }
791
792 pub fn with_recognition_level(mut self, level: AppleVisionRecognitionLevel) -> Self {
794 self.recognition_level = level;
795 self
796 }
797
798 pub fn with_language_correction(mut self, enabled: bool) -> Self {
800 self.uses_language_correction = enabled;
801 self
802 }
803}
804
/// OCR backend that runs an embedded Swift script using Apple's Vision
/// framework through an external runner program.
///
/// Its operations fail unless the crate was compiled for macOS.
#[derive(Debug, Clone, Default)]
pub struct AppleVisionOcrBackend {
    /// Settings controlling how the script is launched and its output handled.
    config: AppleVisionOcrConfig,
}
814
/// JSON payload printed on stdout by the embedded Swift script.
///
/// Both fields default to empty when absent from the JSON.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptResponse {
    /// Whole recognized text as reported by the script.
    #[serde(default)]
    full_text: String,
    /// Recognized blocks as reported by the script.
    #[serde(default)]
    blocks: Vec<AppleVisionScriptBlock>,
}
822
/// One recognized block inside the script's JSON payload.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptBlock {
    /// Recognized text (required).
    text: String,
    /// Confidence reported by the script, if any.
    confidence: Option<f32>,
    /// Bounding box reported by the script, if any.
    bbox: Option<AppleVisionScriptBoundingBox>,
}
829
/// Bounding box as emitted by the script's JSON payload.
///
/// The embedded script writes normalized coordinates with a top-left origin.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptBoundingBox {
    /// Horizontal origin.
    x: f32,
    /// Vertical origin (top-left based).
    y: f32,
    /// Horizontal extent.
    width: f32,
    /// Vertical extent.
    height: f32,
}
837
838impl AppleVisionOcrBackend {
839 pub fn new() -> Self {
841 Self::default()
842 }
843
844 pub fn with_config(config: AppleVisionOcrConfig) -> Self {
846 Self { config }
847 }
848
849 pub fn binary_path(&self) -> &str {
851 &self.config.binary_path
852 }
853
854 pub async fn is_available(&self) -> bool {
856 self.version().await.is_ok()
857 }
858
859 pub async fn version(&self) -> Result<String> {
861 self.ensure_macos_runtime()?;
862
863 let mut args = self.config.runner_prefix_args.clone();
864 args.push("--version".to_string());
865
866 let output = Command::new(&self.config.binary_path)
867 .args(&args)
868 .output()
869 .await
870 .map_err(|err| {
871 Error::ingest(format!(
872 "failed to execute Apple Vision OCR runner '{}': {err}",
873 self.config.binary_path
874 ))
875 })?;
876
877 if !output.status.success() {
878 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
879 return Err(Error::ingest(format!(
880 "Apple Vision OCR runner version command failed (status {}): {stderr}",
881 output.status
882 )));
883 }
884
885 let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
886 let version = if stdout.is_empty() {
887 String::from_utf8_lossy(&output.stderr).trim().to_string()
888 } else {
889 stdout
890 };
891
892 if version.is_empty() {
893 return Err(Error::ingest(
894 "Apple Vision OCR runner version command returned empty output",
895 ));
896 }
897
898 Ok(version)
899 }
900
901 fn ensure_macos_runtime(&self) -> Result<()> {
902 if !cfg!(target_os = "macos") {
903 return Err(Error::ingest(
904 "Apple Vision OCR backend requires macOS runtime",
905 ));
906 }
907 Ok(())
908 }
909
910 fn normalize_languages(&self, request: &ImageOcrRequest) -> String {
911 let langs = if request.language_hints.is_empty() {
912 self.config.default_languages.clone()
913 } else {
914 request.language_hints.clone()
915 };
916
917 langs
918 .into_iter()
919 .map(|lang| lang.trim().to_string())
920 .filter(|lang| !lang.is_empty())
921 .collect::<Vec<_>>()
922 .join("+")
923 }
924
925 fn build_args(&self, script_path: &std::path::Path, request: &ImageOcrRequest) -> Vec<String> {
926 let mut args = self.config.runner_prefix_args.clone();
927 args.push(script_path.to_string_lossy().into_owned());
928 args.push(request.path.to_string_lossy().into_owned());
929 args.push(self.normalize_languages(request));
930 args.push(self.config.recognition_level.as_str().to_string());
931 args.push(self.config.uses_language_correction.to_string());
932 args
933 }
934
935 async fn write_temp_script(&self) -> Result<PathBuf> {
936 let ts = Utc::now()
937 .timestamp_nanos_opt()
938 .unwrap_or_else(|| Utc::now().timestamp_micros() * 1000);
939 let path = std::env::temp_dir().join(format!(
940 "converge_apple_vision_ocr_{}_{}.swift",
941 std::process::id(),
942 ts
943 ));
944
945 tokio::fs::write(&path, apple_vision_swift_script())
946 .await
947 .map_err(Error::from)?;
948
949 Ok(path)
950 }
951
952 async fn run_script(
953 &self,
954 request: &ImageOcrRequest,
955 script_path: &std::path::Path,
956 ) -> Result<Vec<u8>> {
957 let args = self.build_args(script_path, request);
958 let output = Command::new(&self.config.binary_path)
959 .args(&args)
960 .output()
961 .await
962 .map_err(|err| {
963 Error::ingest(format!(
964 "failed to execute Apple Vision OCR runner '{}': {err}",
965 self.config.binary_path
966 ))
967 })?;
968
969 if !output.status.success() {
970 let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
971 return Err(Error::ingest(format!(
972 "Apple Vision OCR failed for '{}'(status {}): {}",
973 request.path.display(),
974 output.status,
975 if stderr.is_empty() {
976 "no stderr output".to_string()
977 } else {
978 stderr
979 }
980 )));
981 }
982
983 Ok(output.stdout)
984 }
985
986 fn parse_json_output(&self, stdout: &[u8]) -> Result<OcrDocumentParts> {
987 let response: AppleVisionScriptResponse =
988 serde_json::from_slice(stdout).map_err(|err| {
989 Error::ingest(format!(
990 "failed to parse Apple Vision OCR JSON output: {err}"
991 ))
992 })?;
993
994 let mut blocks = Vec::with_capacity(response.blocks.len());
995 for block in response.blocks {
996 let text = block.text.trim().to_string();
997 if text.is_empty() {
998 continue;
999 }
1000
1001 let confidence = block.confidence.map(|c| c.clamp(0.0, 1.0));
1002 let bbox = block.bbox.and_then(|bbox| {
1003 let normalized = BoundingBox {
1004 x: bbox.x.clamp(0.0, 1.0),
1005 y: bbox.y.clamp(0.0, 1.0),
1006 width: bbox.width.clamp(0.0, 1.0),
1007 height: bbox.height.clamp(0.0, 1.0),
1008 };
1009 normalized.is_normalized().then_some(normalized)
1010 });
1011
1012 blocks.push(OcrTextBlock {
1013 text,
1014 confidence,
1015 bbox,
1016 kind: OcrBlockKind::Line,
1017 });
1018 }
1019
1020 let full_text = if response.full_text.trim().is_empty() {
1021 blocks
1022 .iter()
1023 .map(|block| block.text.as_str())
1024 .collect::<Vec<_>>()
1025 .join("\n")
1026 } else if self.config.trim_output {
1027 response.full_text.trim().to_string()
1028 } else {
1029 response.full_text
1030 };
1031
1032 Ok(OcrDocumentParts { full_text, blocks })
1033 }
1034}
1035
1036#[async_trait]
1037impl OcrBackend for AppleVisionOcrBackend {
1038 fn engine(&self) -> OcrEngine {
1039 OcrEngine::AppleVision
1040 }
1041
1042 async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
1043 self.ensure_macos_runtime()?;
1044
1045 let script_path = self.write_temp_script().await?;
1046 let stdout = self.run_script(request, &script_path).await;
1047 let _ = tokio::fs::remove_file(&script_path).await;
1048 let stdout = stdout?;
1049
1050 let parts = self.parse_json_output(&stdout)?;
1051 let mut doc = OcrDocument::new(
1052 self.engine(),
1053 request.target_kind,
1054 request.provenance.clone(),
1055 );
1056 doc.full_text = parts.full_text;
1057 doc.blocks = parts.blocks;
1058 Ok(doc)
1059 }
1060}
1061
/// Text and blocks extracted from backend output, before they are attached
/// to an `OcrDocument`.
#[derive(Debug)]
struct OcrDocumentParts {
    /// Whole recognized text.
    full_text: String,
    /// Recognized blocks.
    blocks: Vec<OcrTextBlock>,
}
1067
/// Returns the source of the Swift script executed by the Apple Vision backend.
///
/// Script arguments: `<image_path> [languages] [recognition_level]
/// [language_correction]`, where `languages` is a `+`-separated list.
/// On success the script prints a JSON object with `full_text` and `blocks`
/// (each block has `text`, `confidence` and a normalized, top-left-origin
/// `bbox`) to stdout; on failure it prints a message to stderr and exits
/// with status 1.
fn apple_vision_swift_script() -> &'static str {
    r#"
import Foundation
import Vision
import CoreGraphics
import ImageIO

func fail(_ message: String) -> Never {
    if let data = (message + "\n").data(using: .utf8) {
        FileHandle.standardError.write(data)
    }
    exit(1)
}

func boolFromArg(_ raw: String) -> Bool {
    switch raw.lowercased() {
    case "1", "true", "yes", "y":
        return true
    default:
        return false
    }
}

let args = CommandLine.arguments
guard args.count >= 2 else {
    fail("usage: <script> <image_path> [languages] [recognition_level] [language_correction]")
}

let imagePath = args[1]
let languagesArg = args.count > 2 ? args[2] : ""
let recognitionLevelArg = args.count > 3 ? args[3].lowercased() : "accurate"
let languageCorrectionArg = args.count > 4 ? args[4] : "true"

let imageURL = URL(fileURLWithPath: imagePath)
guard let imageSource = CGImageSourceCreateWithURL(imageURL as CFURL, nil) else {
    fail("failed to create image source for \(imagePath)")
}
guard let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) else {
    fail("failed to decode image at \(imagePath)")
}

let request = VNRecognizeTextRequest()
request.recognitionLevel = (recognitionLevelArg == "fast") ? .fast : .accurate
request.usesLanguageCorrection = boolFromArg(languageCorrectionArg)

let languages = languagesArg
    .split(separator: "+")
    .map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }
    .filter { !$0.isEmpty }
if !languages.isEmpty {
    request.recognitionLanguages = languages
}

do {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    try handler.perform([request])

    let observations = request.results ?? []
    var blocks: [[String: Any]] = []

    for observation in observations {
        guard let candidate = observation.topCandidates(1).first else { continue }
        let text = candidate.string.trimmingCharacters(in: .whitespacesAndNewlines)
        if text.isEmpty { continue }

        let rect = observation.boundingBox
        // Vision uses a bottom-left origin; convert to top-left normalized coordinates.
        let topLeftY = 1.0 - rect.origin.y - rect.size.height

        blocks.append([
            "text": text,
            "confidence": Double(candidate.confidence),
            "bbox": [
                "x": Double(rect.origin.x),
                "y": Double(topLeftY),
                "width": Double(rect.size.width),
                "height": Double(rect.size.height)
            ]
        ])
    }

    let fullText = blocks.compactMap { $0["text"] as? String }.joined(separator: "\n")
    let payload: [String: Any] = [
        "full_text": fullText,
        "blocks": blocks
    ]

    let data = try JSONSerialization.data(withJSONObject: payload, options: [])
    FileHandle.standardOutput.write(data)
} catch {
    fail(String(describing: error))
}
"#
}
1162
1163fn parse_tesseract_tsv(tsv: &str) -> Result<ParsedTesseractTsv> {
1164 let mut lines = tsv.lines();
1165 let header = lines
1166 .next()
1167 .ok_or_else(|| Error::ingest("empty tesseract TSV output"))?;
1168
1169 let header_cols: Vec<&str> = header.split('\t').collect();
1170 if header_cols.is_empty() {
1171 return Err(Error::ingest("invalid tesseract TSV header"));
1172 }
1173
1174 let mut header_index = HashMap::<String, usize>::new();
1175 for (idx, col) in header_cols.iter().enumerate() {
1176 header_index.insert((*col).to_string(), idx);
1177 }
1178
1179 for required in [
1180 "level",
1181 "page_num",
1182 "block_num",
1183 "par_num",
1184 "line_num",
1185 "word_num",
1186 "left",
1187 "top",
1188 "width",
1189 "height",
1190 "conf",
1191 "text",
1192 ] {
1193 if !header_index.contains_key(required) {
1194 return Err(Error::ingest(format!(
1195 "tesseract TSV missing required column '{required}'"
1196 )));
1197 }
1198 }
1199
1200 let mut pages = HashMap::<u32, TesseractPageDimensions>::new();
1201 let mut lines_by_key = BTreeMap::<TesseractLineKey, TesseractLineAccumulator>::new();
1202
1203 for raw_line in lines {
1204 if raw_line.trim().is_empty() {
1205 continue;
1206 }
1207
1208 let cols = split_tsv_row(raw_line, header_cols.len());
1209
1210 let level = parse_tsv_u32(&cols, &header_index, "level")?;
1211 let page_num = parse_tsv_u32(&cols, &header_index, "page_num")?;
1212
1213 if level == 1 {
1214 let width = parse_tsv_u32(&cols, &header_index, "width")?;
1215 let height = parse_tsv_u32(&cols, &header_index, "height")?;
1216 if width > 0 && height > 0 {
1217 pages.insert(page_num, TesseractPageDimensions { width, height });
1218 }
1219 continue;
1220 }
1221
1222 if level != 5 {
1223 continue;
1224 }
1225
1226 let text = tsv_field(&cols, &header_index, "text")?.trim().to_string();
1227 if text.is_empty() {
1228 continue;
1229 }
1230
1231 let conf_raw = parse_tsv_f32(&cols, &header_index, "conf")?;
1232 if conf_raw < 0.0 {
1233 continue;
1234 }
1235
1236 let key = TesseractLineKey {
1237 page_num,
1238 block_num: parse_tsv_u32(&cols, &header_index, "block_num")?,
1239 par_num: parse_tsv_u32(&cols, &header_index, "par_num")?,
1240 line_num: parse_tsv_u32(&cols, &header_index, "line_num")?,
1241 };
1242
1243 let word_num = parse_tsv_u32(&cols, &header_index, "word_num")?;
1244 let left = parse_tsv_u32(&cols, &header_index, "left")?;
1245 let top = parse_tsv_u32(&cols, &header_index, "top")?;
1246 let width = parse_tsv_u32(&cols, &header_index, "width")?;
1247 let height = parse_tsv_u32(&cols, &header_index, "height")?;
1248 let confidence = (conf_raw / 100.0).clamp(0.0, 1.0);
1249
1250 use std::collections::btree_map::Entry;
1251 match lines_by_key.entry(key) {
1252 Entry::Vacant(entry) => {
1253 entry.insert(TesseractLineAccumulator::new(
1254 word_num, text, confidence, left, top, width, height,
1255 ));
1256 }
1257 Entry::Occupied(mut entry) => {
1258 entry
1259 .get_mut()
1260 .add_word(word_num, text, confidence, left, top, width, height);
1261 }
1262 }
1263 }
1264
1265 let mut blocks = Vec::with_capacity(lines_by_key.len());
1266
1267 for (key, mut line) in lines_by_key {
1268 line.words.sort_by_key(|(word_num, _)| *word_num);
1269 let text = line
1270 .words
1271 .into_iter()
1272 .map(|(_, text)| text)
1273 .collect::<Vec<_>>()
1274 .join(" ");
1275
1276 if text.trim().is_empty() {
1277 continue;
1278 }
1279
1280 let confidence = if line.confidence_count > 0 {
1281 Some((line.confidence_sum / line.confidence_count as f32).clamp(0.0, 1.0))
1282 } else {
1283 None
1284 };
1285
1286 let bbox = pages
1287 .get(&key.page_num)
1288 .and_then(|page| normalize_bbox(line.left, line.top, line.right, line.bottom, *page));
1289
1290 blocks.push(OcrTextBlock {
1291 text,
1292 confidence,
1293 bbox,
1294 kind: OcrBlockKind::Line,
1295 });
1296 }
1297
1298 let full_text = blocks
1299 .iter()
1300 .map(|block| block.text.trim())
1301 .filter(|line| !line.is_empty())
1302 .collect::<Vec<_>>()
1303 .join("\n");
1304
1305 Ok(ParsedTesseractTsv { full_text, blocks })
1306}
1307
/// Splits one TSV row into at most `expected_cols` fields.
///
/// Rows with more tab-separated parts than expected keep the surplus —
/// including its tabs — in the last field, so a tab inside the final column
/// does not shift earlier columns. Rows with fewer parts are returned as-is.
/// When `expected_cols` is zero the result is empty rather than a panic.
fn split_tsv_row(line: &str, expected_cols: usize) -> Vec<String> {
    // `splitn` stops splitting once `expected_cols` pieces exist and leaves
    // the unsplit remainder in the final piece.
    line.splitn(expected_cols, '\t')
        .map(str::to_string)
        .collect()
}
1321
1322fn tsv_field<'a>(
1323 cols: &'a [String],
1324 header_index: &HashMap<String, usize>,
1325 name: &str,
1326) -> Result<&'a str> {
1327 let idx = *header_index
1328 .get(name)
1329 .ok_or_else(|| Error::ingest(format!("missing TSV header index for '{name}'")))?;
1330
1331 cols.get(idx)
1332 .map(String::as_str)
1333 .ok_or_else(|| Error::ingest(format!("missing TSV field '{name}' in row")))
1334}
1335
1336fn parse_tsv_u32(
1337 cols: &[String],
1338 header_index: &HashMap<String, usize>,
1339 name: &str,
1340) -> Result<u32> {
1341 let value = tsv_field(cols, header_index, name)?;
1342 value.parse::<u32>().map_err(|err| {
1343 Error::ingest(format!(
1344 "invalid tesseract TSV integer field '{name}' value '{value}': {err}"
1345 ))
1346 })
1347}
1348
1349fn parse_tsv_f32(
1350 cols: &[String],
1351 header_index: &HashMap<String, usize>,
1352 name: &str,
1353) -> Result<f32> {
1354 let value = tsv_field(cols, header_index, name)?;
1355 value.parse::<f32>().map_err(|err| {
1356 Error::ingest(format!(
1357 "invalid tesseract TSV float field '{name}' value '{value}': {err}"
1358 ))
1359 })
1360}
1361
1362fn normalize_bbox(
1363 left: u32,
1364 top: u32,
1365 right: u32,
1366 bottom: u32,
1367 page: TesseractPageDimensions,
1368) -> Option<BoundingBox> {
1369 if page.width == 0 || page.height == 0 || right <= left || bottom <= top {
1370 return None;
1371 }
1372
1373 let page_w = page.width as f32;
1374 let page_h = page.height as f32;
1375 let x = (left as f32 / page_w).clamp(0.0, 1.0);
1376 let y = (top as f32 / page_h).clamp(0.0, 1.0);
1377 let width = ((right.saturating_sub(left)) as f32 / page_w).clamp(0.0, 1.0);
1378 let height = ((bottom.saturating_sub(top)) as f32 / page_h).clamp(0.0, 1.0);
1379
1380 let bbox = BoundingBox {
1381 x,
1382 y,
1383 width,
1384 height,
1385 };
1386
1387 bbox.is_normalized().then_some(bbox)
1388}
1389
/// OCR backend that answers requests from pre-registered [`OcrDocument`]
/// fixtures held in memory.
///
/// Fixtures are stored under string keys and matched against the request
/// path by `lookup`, with an optional default document as the fallback.
#[derive(Debug, Clone)]
pub struct FixtureOcrBackend {
    /// Engine reported by `engine()` and stamped onto every returned document.
    engine: OcrEngine,
    /// Registered fixture documents, indexed by their lookup key.
    documents: HashMap<String, OcrDocument>,
    /// Document returned when no registered key matches the request path.
    default_document: Option<OcrDocument>,
}
1401
1402impl FixtureOcrBackend {
1403 pub fn new() -> Self {
1405 Self {
1406 engine: OcrEngine::Mock,
1407 documents: HashMap::new(),
1408 default_document: None,
1409 }
1410 }
1411
1412 pub fn with_engine(engine: OcrEngine) -> Self {
1414 Self {
1415 engine,
1416 documents: HashMap::new(),
1417 default_document: None,
1418 }
1419 }
1420
1421 pub fn with_document(mut self, key: impl Into<String>, document: OcrDocument) -> Self {
1428 self.documents.insert(key.into(), document);
1429 self
1430 }
1431
1432 pub fn with_default_document(mut self, document: OcrDocument) -> Self {
1434 self.default_document = Some(document);
1435 self
1436 }
1437
1438 fn lookup(&self, path: &std::path::Path) -> Option<&OcrDocument> {
1439 let exact = path.to_string_lossy();
1440 if let Some(doc) = self.documents.get(exact.as_ref()) {
1441 return Some(doc);
1442 }
1443
1444 if let Some(file_name) = path.file_name().and_then(|name| name.to_str())
1445 && let Some(doc) = self.documents.get(file_name)
1446 {
1447 return Some(doc);
1448 }
1449
1450 if let Some(file_stem) = path.file_stem().and_then(|stem| stem.to_str())
1451 && let Some(doc) = self.documents.get(file_stem)
1452 {
1453 return Some(doc);
1454 }
1455
1456 self.default_document.as_ref()
1457 }
1458}
1459
1460impl Default for FixtureOcrBackend {
1461 fn default() -> Self {
1462 Self::new()
1463 }
1464}
1465
1466#[async_trait]
1467impl OcrBackend for FixtureOcrBackend {
1468 fn engine(&self) -> OcrEngine {
1469 self.engine.clone()
1470 }
1471
1472 async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
1473 let Some(template) = self.lookup(&request.path) else {
1474 return Err(Error::ingest(format!(
1475 "no OCR fixture registered for path '{}'",
1476 request.path.display()
1477 )));
1478 };
1479
1480 let mut doc = template.clone();
1481 doc.engine = self.engine();
1482 doc.target_kind = request.target_kind;
1483 doc.provenance = request.provenance.clone();
1484 doc.extracted_at = Utc::now();
1485 Ok(doc)
1486 }
1487}
1488
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingest::{SourceKind, SourceProvenance};

    /// `effective_text` is expected to be the pushed line blocks joined by newlines.
    #[test]
    fn effective_text_falls_back_to_blocks() {
        let mut doc = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Screenshot,
            SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png"),
        );
        for text in ["First line", "Second line"] {
            doc.blocks.push(OcrTextBlock::line(text));
        }

        assert_eq!(doc.effective_text(), "First line\nSecond line");
    }

    /// Blocks whose confidence is below the threshold must not appear in `filtered_text`.
    #[test]
    fn filtered_text_respects_confidence() {
        let word = |text: &str, confidence: f32| OcrTextBlock {
            text: text.into(),
            confidence: Some(confidence),
            bbox: None,
            kind: OcrBlockKind::Word,
        };

        let mut doc = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Photo,
            SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg"),
        );
        doc.blocks.push(word("keep", 0.95));
        doc.blocks.push(word("drop", 0.20));

        assert_eq!(doc.filtered_text(0.5), "keep");
    }

    /// A box that overruns the unit square on either axis is not normalized.
    #[test]
    fn bounding_box_validation_checks_normalized_range() {
        let inside = BoundingBox {
            x: 0.1,
            y: 0.2,
            width: 0.3,
            height: 0.4,
        };
        assert!(inside.is_normalized());

        let overflowing = BoundingBox {
            x: 0.8,
            y: 0.9,
            width: 0.4,
            height: 0.2,
        };
        assert!(!overflowing.is_normalized());
    }

    /// A fixture keyed by file name is found for a full path, and the response
    /// carries the request's provenance rather than the fixture's.
    #[tokio::test]
    async fn fixture_backend_uses_filename_lookup_and_overrides_provenance() {
        let mut registered = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Screenshot,
            SourceProvenance::new(SourceKind::Screenshot, "fixture://shot"),
        );
        registered.full_text = "fixture text".into();
        let backend = FixtureOcrBackend::new().with_document("test-shot.png", registered);

        let expected_provenance =
            SourceProvenance::new(SourceKind::Screenshot, "file:///tmp/test-shot.png")
                .with_origin_id("shot-123");
        let request = ImageOcrRequest::new(
            "/tmp/test-shot.png",
            OcrTargetKind::Screenshot,
            expected_provenance.clone(),
        );

        let response = backend.extract(&request).await.unwrap();

        assert_eq!(response.full_text, "fixture text");
        assert_eq!(
            response.provenance.source_uri,
            expected_provenance.source_uri
        );
        assert_eq!(response.provenance.origin_id, expected_provenance.origin_id);
    }

    /// Pins the full plain-text argument list for a configured Tesseract backend.
    #[test]
    fn tesseract_backend_builds_expected_command_args() {
        let config = TesseractOcrConfig::default()
            .with_default_languages(["eng", "deu"])
            .with_oem(1)
            .with_psm(6)
            .with_extra_arg("--dpi")
            .with_extra_arg("300");
        let backend = TesseractOcrBackend::with_config(config);

        let request = ImageOcrRequest::new(
            "/tmp/shot.png",
            OcrTargetKind::Screenshot,
            SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png"),
        )
        .with_language_hint("fra");

        let expected = vec![
            "/tmp/shot.png",
            "stdout",
            "-l",
            "fra",
            "--oem",
            "1",
            "--psm",
            "6",
            "--dpi",
            "300",
        ];
        assert_eq!(backend.build_args(&request), expected);
    }

    /// Language hints are mapped to Tesseract codes and duplicates collapse.
    #[test]
    fn tesseract_backend_normalizes_and_dedupes_language_hints() {
        let request = ImageOcrRequest::new(
            "/tmp/shot.png",
            OcrTargetKind::Screenshot,
            SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png"),
        )
        .with_language_hint("en")
        .with_language_hint("en-US")
        .with_language_hint("de-DE")
        .with_language_hint("chi_sim");

        let args = TesseractOcrBackend::new().build_args(&request);

        // The language list is the argument right after the `-l` flag.
        let flag_pos = args.iter().position(|arg| arg == "-l").unwrap();
        assert_eq!(args[flag_pos + 1], "eng+deu+chi_sim");
    }

    /// The TSV argument list starts with input path and `stdout` and ends with `tsv`.
    #[test]
    fn tesseract_backend_builds_tsv_command_args() {
        let request = ImageOcrRequest::new(
            "/tmp/photo.jpg",
            OcrTargetKind::Photo,
            SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg"),
        );

        let args = TesseractOcrBackend::new().build_tsv_args(&request);

        assert_eq!(args.last().map(String::as_str), Some("tsv"));
        assert_eq!(args[0], "/tmp/photo.jpg");
        assert_eq!(args[1], "stdout");
    }

    /// Blank lines are skipped and surrounding whitespace is trimmed.
    #[test]
    fn tesseract_backend_synthesizes_line_blocks_from_plain_text() {
        let blocks = TesseractOcrBackend::new().synthesize_blocks("Line 1\n\n  Line 2  \n");

        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].text, "Line 1");
        assert_eq!(blocks[1].text, "Line 2");
        assert!(blocks.iter().all(|b| b.kind == OcrBlockKind::Line));
    }

    /// Word rows are merged into line blocks with averaged confidence and a
    /// bounding box normalized against the page row (200x100 here).
    #[test]
    fn parses_tesseract_tsv_into_line_blocks_with_bbox_and_confidence() {
        let tsv = concat!(
            "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n",
            "1\t1\t0\t0\t0\t0\t0\t0\t200\t100\t-1\t\n",
            "4\t1\t1\t1\t1\t0\t10\t10\t100\t20\t-1\t\n",
            "5\t1\t1\t1\t1\t1\t10\t10\t40\t20\t92\tHello\n",
            "5\t1\t1\t1\t1\t2\t60\t10\t50\t20\t88\tworld\n",
            "4\t1\t1\t1\t2\t0\t10\t40\t80\t20\t-1\t\n",
            "5\t1\t1\t1\t2\t1\t10\t40\t80\t20\t75\tSecond\n",
        );
        let near = |actual: f32, expected: f32| (actual - expected).abs() < 1e-6;

        let parsed = parse_tesseract_tsv(tsv).unwrap();
        assert_eq!(parsed.blocks.len(), 2);
        assert_eq!(parsed.full_text, "Hello world\nSecond");

        let first = &parsed.blocks[0];
        assert_eq!(first.kind, OcrBlockKind::Line);
        assert_eq!(first.text, "Hello world");

        let conf = first.confidence.unwrap();
        assert!(near(conf, 0.90), "unexpected confidence: {conf}");

        let bbox = first.bbox.unwrap();
        assert!(near(bbox.x, 0.05));
        assert!(near(bbox.y, 0.10));
        assert!(near(bbox.width, 0.50));
        assert!(near(bbox.height, 0.20));
    }

    /// A header lacking the required columns is reported as an error.
    #[test]
    fn tesseract_parser_rejects_missing_header_columns() {
        let message = parse_tesseract_tsv("level\tpage_num\n1\t1\n")
            .unwrap_err()
            .to_string();
        assert!(message.contains("missing required column"));
    }

    /// Pins the runner argument list: prefix args, script, image, languages,
    /// recognition level, language-correction flag.
    #[test]
    fn apple_vision_backend_builds_runner_args() {
        let config = AppleVisionOcrConfig::default()
            .with_binary_path("xcrun")
            .with_runner_prefix_args(["swift"])
            .with_default_languages(["en-US"])
            .with_recognition_level(AppleVisionRecognitionLevel::Fast)
            .with_language_correction(false);
        let backend = AppleVisionOcrBackend::with_config(config);

        let request = ImageOcrRequest::new(
            "/tmp/photo.jpg",
            OcrTargetKind::Photo,
            SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg"),
        )
        .with_language_hint("en-US")
        .with_language_hint("de-DE");

        let script = std::path::Path::new("/tmp/vision.swift");
        let expected = vec![
            "swift",
            "/tmp/vision.swift",
            "/tmp/photo.jpg",
            "en-US+de-DE",
            "fast",
            "false",
        ];
        assert_eq!(backend.build_args(script, &request), expected);
    }

    /// JSON blocks become line blocks; an out-of-range confidence is capped at 1.0.
    #[test]
    fn parses_apple_vision_json_into_line_blocks() {
        let backend = AppleVisionOcrBackend::with_config(AppleVisionOcrConfig::default());
        let json = br#"{
 "full_text": "Hello\nWorld",
 "blocks": [
 {
 "text": "Hello",
 "confidence": 0.93,
 "bbox": { "x": 0.1, "y": 0.2, "width": 0.3, "height": 0.1 }
 },
 {
 "text": "World",
 "confidence": 1.1,
 "bbox": { "x": 0.4, "y": 0.5, "width": 0.4, "height": 0.2 }
 }
 ]
 }"#;

        let parsed = backend.parse_json_output(json).unwrap();

        assert_eq!(parsed.full_text, "Hello\nWorld");
        assert_eq!(parsed.blocks.len(), 2);
        assert_eq!(parsed.blocks[0].kind, OcrBlockKind::Line);
        assert_eq!(parsed.blocks[0].confidence, Some(0.93));
        // The second block's input confidence is 1.1.
        assert_eq!(parsed.blocks[1].confidence, Some(1.0));
        assert!(parsed.blocks[0].bbox.unwrap().is_normalized());
    }

    /// A `blocks` value of the wrong JSON type yields a parse error.
    #[test]
    fn apple_vision_parser_rejects_invalid_json() {
        let message = AppleVisionOcrBackend::new()
            .parse_json_output(br#"{"blocks":"nope"}"#)
            .unwrap_err()
            .to_string();
        assert!(message.contains("parse Apple Vision OCR JSON"));
    }

    /// Each target kind maps to its expected default page-segmentation mode.
    #[test]
    fn default_psm_matches_target_kind() {
        let expectations = [
            (OcrTargetKind::Screenshot, Some(11)),
            (OcrTargetKind::Photo, Some(3)),
            (OcrTargetKind::GenericImage, None),
        ];
        for (kind, expected) in expectations {
            assert_eq!(default_psm_for_target(kind), expected, "{kind:?}");
        }
    }
}