// converge_knowledge/ingest/ocr.rs
1//! OCR contracts for screenshots and photos.
2//!
3//! This module defines shared request/response types and backend traits used by
4//! Phase 2 image ingestion specialists. Backends (Apple Vision, Tesseract, mock)
5//! can implement the trait without changing ingestion orchestration logic.
6
7use crate::Result;
8use crate::error::Error;
9use crate::ingest::SourceProvenance;
10use async_trait::async_trait;
11use chrono::{DateTime, Utc};
12use serde::{Deserialize, Serialize};
13use std::collections::{BTreeMap, HashMap};
14use std::path::PathBuf;
15use tokio::process::Command;
16
/// Supported OCR engine families.
///
/// Serialized in `snake_case` (e.g. `apple_vision`, `tesseract`) for
/// configuration and wire formats.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrEngine {
    /// Apple Vision OCR on macOS.
    AppleVision,
    /// Tesseract OCR.
    Tesseract,
    /// Test/mock backend.
    Mock,
    /// Any external/custom provider.
    External,
}
30
/// The source image type being processed.
///
/// Backends may tune recognition per kind (e.g. the Tesseract backend derives
/// a default page-segmentation mode from it).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrTargetKind {
    /// A screen capture where UI chrome may be present.
    Screenshot,
    /// A photo captured by a camera.
    Photo,
    /// Generic image input where no source-specific assumptions apply.
    GenericImage,
}
42
/// Bounding box for OCR blocks in normalized coordinates (0.0..=1.0).
///
/// All fields are fractions of the image dimensions rather than pixels;
/// see [`BoundingBox::is_normalized`] for the validity check.
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct BoundingBox {
    /// Left X coordinate.
    pub x: f32,
    /// Top Y coordinate.
    pub y: f32,
    /// Width.
    pub width: f32,
    /// Height.
    pub height: f32,
}
55
56impl BoundingBox {
57    /// Returns `true` when the box is non-negative and inside normalized bounds.
58    pub fn is_normalized(&self) -> bool {
59        self.x >= 0.0
60            && self.y >= 0.0
61            && self.width >= 0.0
62            && self.height >= 0.0
63            && self.x <= 1.0
64            && self.y <= 1.0
65            && self.width <= 1.0
66            && self.height <= 1.0
67            && self.x + self.width <= 1.0
68            && self.y + self.height <= 1.0
69    }
70}
71
/// OCR block classification.
///
/// Granularity and role of a recognized text region; backends that cannot
/// classify emit [`OcrBlockKind::Unknown`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OcrBlockKind {
    /// Paragraph-level text block.
    Paragraph,
    /// Line-level text block.
    Line,
    /// Word/token-level OCR block.
    Word,
    /// UI chrome text (menus, buttons, labels) in screenshots.
    UiChrome,
    /// Unclassified OCR block.
    Unknown,
}
87
/// A single OCR-detected text block.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct OcrTextBlock {
    /// The recognized text.
    pub text: String,
    /// Confidence from 0.0 to 1.0 when provided by the backend.
    pub confidence: Option<f32>,
    /// Normalized bounding box when provided by the backend.
    pub bbox: Option<BoundingBox>,
    /// Classification for this block.
    pub kind: OcrBlockKind,
}
100
101impl OcrTextBlock {
102    /// Create a basic line block with no confidence or bounding box.
103    pub fn line(text: impl Into<String>) -> Self {
104        Self {
105            text: text.into(),
106            confidence: None,
107            bbox: None,
108            kind: OcrBlockKind::Line,
109        }
110    }
111}
112
/// Request passed to OCR backends.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageOcrRequest {
    /// Path to the source image file.
    pub path: PathBuf,
    /// High-level target kind for backend tuning.
    pub target_kind: OcrTargetKind,
    /// Shared provenance metadata for this image.
    pub provenance: SourceProvenance,
    /// Language hints (e.g., `en`, `de`) to improve OCR.
    ///
    /// When non-empty these take precedence over any backend-configured
    /// default languages.
    pub language_hints: Vec<String>,
    /// Optional minimum confidence filter hint.
    pub min_confidence: Option<f32>,
    /// Source-specific request metadata.
    pub metadata: HashMap<String, String>,
}
129
130impl ImageOcrRequest {
131    /// Create a new OCR request.
132    pub fn new(
133        path: impl Into<PathBuf>,
134        target_kind: OcrTargetKind,
135        provenance: SourceProvenance,
136    ) -> Self {
137        Self {
138            path: path.into(),
139            target_kind,
140            provenance,
141            language_hints: Vec::new(),
142            min_confidence: None,
143            metadata: HashMap::new(),
144        }
145    }
146
147    /// Add a language hint.
148    pub fn with_language_hint(mut self, language: impl Into<String>) -> Self {
149        self.language_hints.push(language.into());
150        self
151    }
152
153    /// Set a minimum confidence hint.
154    pub fn with_min_confidence(mut self, min_confidence: f32) -> Self {
155        self.min_confidence = Some(min_confidence);
156        self
157    }
158}
159
/// OCR extraction result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OcrDocument {
    /// OCR backend used to produce this result.
    pub engine: OcrEngine,
    /// The source target kind.
    pub target_kind: OcrTargetKind,
    /// Full-text result if the backend produced one directly.
    ///
    /// May be empty; [`OcrDocument::effective_text`] falls back to joining
    /// block text in that case.
    pub full_text: String,
    /// Structured OCR blocks.
    pub blocks: Vec<OcrTextBlock>,
    /// Extraction timestamp.
    pub extracted_at: DateTime<Utc>,
    /// Shared provenance for the source image.
    pub provenance: SourceProvenance,
}
176
177impl OcrDocument {
178    /// Create an empty OCR document for incremental population.
179    pub fn new(
180        engine: OcrEngine,
181        target_kind: OcrTargetKind,
182        provenance: SourceProvenance,
183    ) -> Self {
184        Self {
185            engine,
186            target_kind,
187            full_text: String::new(),
188            blocks: Vec::new(),
189            extracted_at: Utc::now(),
190            provenance,
191        }
192    }
193
194    /// Returns the best available text for indexing.
195    ///
196    /// Uses `full_text` when present, otherwise concatenates non-empty blocks.
197    pub fn effective_text(&self) -> String {
198        let trimmed = self.full_text.trim();
199        if !trimmed.is_empty() {
200            return trimmed.to_string();
201        }
202
203        self.blocks
204            .iter()
205            .map(|block| block.text.trim())
206            .filter(|text| !text.is_empty())
207            .collect::<Vec<_>>()
208            .join("\n")
209    }
210
211    /// Returns text from blocks whose confidence meets the threshold.
212    pub fn filtered_text(&self, min_confidence: f32) -> String {
213        self.blocks
214            .iter()
215            .filter(|block| block.confidence.unwrap_or(1.0) >= min_confidence)
216            .map(|block| block.text.trim())
217            .filter(|text| !text.is_empty())
218            .collect::<Vec<_>>()
219            .join("\n")
220    }
221}
222
/// OCR backend contract used by screenshot/photo ingestion pipelines.
///
/// Implementations must be `Send + Sync` so a backend can be shared across
/// async ingestion tasks.
#[async_trait]
pub trait OcrBackend: Send + Sync {
    /// The engine identifier for this backend.
    fn engine(&self) -> OcrEngine;

    /// Extract OCR text and block metadata from an image.
    async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument>;
}
232
/// Configuration for the Tesseract OCR backend.
#[derive(Debug, Clone)]
pub struct TesseractOcrConfig {
    /// Path to the `tesseract` binary.
    pub binary_path: String,
    /// Default OCR language codes (e.g., `["eng"]`, `["eng", "deu"]`).
    ///
    /// Only used when a request carries no language hints of its own.
    pub default_languages: Vec<String>,
    /// Optional OCR engine mode (`--oem`).
    pub oem: Option<u8>,
    /// Optional page segmentation mode (`--psm`).
    ///
    /// When unset, a default is derived from the request's target kind.
    pub psm: Option<u8>,
    /// Additional CLI arguments passed through to `tesseract`.
    pub extra_args: Vec<String>,
    /// Trim trailing whitespace/newlines from output text.
    pub trim_output: bool,
    /// Synthesize line blocks from plain-text output when no structured output is parsed.
    pub synthesize_line_blocks: bool,
}
251
252impl Default for TesseractOcrConfig {
253    fn default() -> Self {
254        Self {
255            binary_path: "tesseract".to_string(),
256            default_languages: vec!["eng".to_string()],
257            oem: None,
258            psm: None,
259            extra_args: Vec::new(),
260            trim_output: true,
261            synthesize_line_blocks: true,
262        }
263    }
264}
265
266impl TesseractOcrConfig {
267    /// Set the Tesseract binary path.
268    pub fn with_binary_path(mut self, binary_path: impl Into<String>) -> Self {
269        self.binary_path = binary_path.into();
270        self
271    }
272
273    /// Set the default OCR languages.
274    pub fn with_default_languages(
275        mut self,
276        langs: impl IntoIterator<Item = impl Into<String>>,
277    ) -> Self {
278        self.default_languages = langs.into_iter().map(Into::into).collect();
279        self
280    }
281
282    /// Set OCR engine mode (`--oem`).
283    pub fn with_oem(mut self, oem: u8) -> Self {
284        self.oem = Some(oem);
285        self
286    }
287
288    /// Set page segmentation mode (`--psm`).
289    pub fn with_psm(mut self, psm: u8) -> Self {
290        self.psm = Some(psm);
291        self
292    }
293
294    /// Append an extra CLI argument.
295    pub fn with_extra_arg(mut self, arg: impl Into<String>) -> Self {
296        self.extra_args.push(arg.into());
297        self
298    }
299
300    /// Disable synthesized line blocks (text-only output).
301    pub fn without_line_blocks(mut self) -> Self {
302        self.synthesize_line_blocks = false;
303        self
304    }
305}
306
/// Real OCR backend using the `tesseract` CLI binary.
///
/// This backend executes `tesseract <image> stdout ...` and returns the OCR text.
/// The initial implementation parses plain text output and can synthesize line
/// blocks from it; structured confidence/bounding box parsing can be added later.
#[derive(Debug, Clone, Default)]
pub struct TesseractOcrBackend {
    // Behavior knobs (binary path, languages, --oem/--psm, trimming).
    config: TesseractOcrConfig,
}
316
/// Which tesseract output format a single CLI invocation requests.
#[derive(Debug, Clone, Copy)]
enum TesseractOutputMode {
    /// Default stdout text (no structure).
    PlainText,
    /// TSV config output with per-word confidence and geometry.
    Tsv,
}
322
/// Identity of one text line inside tesseract TSV output.
///
/// The derived `Ord` follows page → block → paragraph → line, so sorting keys
/// reproduces reading order. Presumably consumed by `parse_tesseract_tsv`
/// (defined elsewhere in this module) to group word rows into lines — confirm
/// against that function.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct TesseractLineKey {
    page_num: u32,
    block_num: u32,
    par_num: u32,
    line_num: u32,
}
330
/// Page dimensions reported in tesseract TSV output.
///
/// NOTE(review): presumably pixel width/height used to normalize word
/// geometry into 0..=1 bounding boxes — verify in `parse_tesseract_tsv`,
/// which is defined outside this chunk.
#[derive(Debug, Clone, Copy)]
struct TesseractPageDimensions {
    width: u32,
    height: u32,
}
336
/// Accumulates word-level TSV rows into a single line block.
#[derive(Debug, Clone)]
struct TesseractLineAccumulator {
    // (word_num, text) pairs; word_num allows ordering words within the line.
    words: Vec<(u32, String)>,
    // Running sum/count for averaging per-word confidences.
    confidence_sum: f32,
    confidence_count: usize,
    // Bounding rectangle of all words seen so far, in page coordinates.
    left: u32,
    top: u32,
    right: u32,
    bottom: u32,
}
347
348impl TesseractLineAccumulator {
349    fn new(
350        word_num: u32,
351        text: String,
352        confidence: f32,
353        left: u32,
354        top: u32,
355        width: u32,
356        height: u32,
357    ) -> Self {
358        Self {
359            words: vec![(word_num, text)],
360            confidence_sum: confidence,
361            confidence_count: 1,
362            left,
363            top,
364            right: left.saturating_add(width),
365            bottom: top.saturating_add(height),
366        }
367    }
368
369    fn add_word(
370        &mut self,
371        word_num: u32,
372        text: String,
373        confidence: f32,
374        left: u32,
375        top: u32,
376        width: u32,
377        height: u32,
378    ) {
379        self.words.push((word_num, text));
380        self.confidence_sum += confidence;
381        self.confidence_count += 1;
382        self.left = self.left.min(left);
383        self.top = self.top.min(top);
384        self.right = self.right.max(left.saturating_add(width));
385        self.bottom = self.bottom.max(top.saturating_add(height));
386    }
387}
388
/// Result of parsing tesseract TSV output: reconstructed full text plus
/// structured line blocks.
#[derive(Debug, Clone)]
struct ParsedTesseractTsv {
    full_text: String,
    blocks: Vec<OcrTextBlock>,
}
394
395impl TesseractOcrBackend {
396    /// Create a Tesseract backend with default configuration.
397    pub fn new() -> Self {
398        Self::default()
399    }
400
401    /// Create a Tesseract backend with custom configuration.
402    pub fn with_config(config: TesseractOcrConfig) -> Self {
403        Self { config }
404    }
405
406    /// Return the configured binary path.
407    pub fn binary_path(&self) -> &str {
408        &self.config.binary_path
409    }
410
411    /// Check whether the configured `tesseract` binary is available.
412    pub async fn is_available(&self) -> bool {
413        self.version().await.is_ok()
414    }
415
416    /// Read the `tesseract` version string.
417    pub async fn version(&self) -> Result<String> {
418        let output = Command::new(&self.config.binary_path)
419            .arg("--version")
420            .output()
421            .await
422            .map_err(|err| {
423                Error::ingest(format!(
424                    "failed to execute tesseract binary '{}': {err}",
425                    self.config.binary_path
426                ))
427            })?;
428
429        if !output.status.success() {
430            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
431            return Err(Error::ingest(format!(
432                "tesseract --version failed (status {}): {stderr}",
433                output.status
434            )));
435        }
436
437        let version = String::from_utf8_lossy(&output.stdout).trim().to_string();
438        if version.is_empty() {
439            return Err(Error::ingest("tesseract --version returned empty output"));
440        }
441
442        Ok(version)
443    }
444
445    fn build_args(&self, request: &ImageOcrRequest) -> Vec<String> {
446        self.build_args_for_mode(request, TesseractOutputMode::PlainText)
447    }
448
449    fn build_tsv_args(&self, request: &ImageOcrRequest) -> Vec<String> {
450        self.build_args_for_mode(request, TesseractOutputMode::Tsv)
451    }
452
453    fn build_args_for_mode(
454        &self,
455        request: &ImageOcrRequest,
456        mode: TesseractOutputMode,
457    ) -> Vec<String> {
458        let mut args = vec![
459            request.path.to_string_lossy().into_owned(),
460            "stdout".to_string(),
461        ];
462
463        let languages = if request.language_hints.is_empty() {
464            self.config.default_languages.clone()
465        } else {
466            request.language_hints.clone()
467        };
468        let languages = normalize_tesseract_language_hints(languages);
469
470        if !languages.is_empty() {
471            args.push("-l".to_string());
472            args.push(languages.join("+"));
473        }
474
475        if let Some(oem) = self.config.oem {
476            args.push("--oem".to_string());
477            args.push(oem.to_string());
478        }
479
480        if let Some(psm) = self
481            .config
482            .psm
483            .or_else(|| default_psm_for_target(request.target_kind))
484        {
485            args.push("--psm".to_string());
486            args.push(psm.to_string());
487        }
488
489        args.extend(self.config.extra_args.iter().cloned());
490
491        if matches!(mode, TesseractOutputMode::Tsv) {
492            args.push("tsv".to_string());
493        }
494
495        args
496    }
497
498    fn normalize_output_text(&self, stdout: &[u8]) -> String {
499        let text = String::from_utf8_lossy(stdout).to_string();
500        if self.config.trim_output {
501            text.trim().to_string()
502        } else {
503            text
504        }
505    }
506
507    fn synthesize_blocks(&self, text: &str) -> Vec<OcrTextBlock> {
508        if !self.config.synthesize_line_blocks {
509            return Vec::new();
510        }
511
512        text.lines()
513            .map(str::trim)
514            .filter(|line| !line.is_empty())
515            .map(OcrTextBlock::line)
516            .collect()
517    }
518
519    async fn run_tesseract(
520        &self,
521        request: &ImageOcrRequest,
522        mode: TesseractOutputMode,
523    ) -> Result<Vec<u8>> {
524        let args = match mode {
525            TesseractOutputMode::PlainText => self.build_args(request),
526            TesseractOutputMode::Tsv => self.build_tsv_args(request),
527        };
528
529        let output = Command::new(&self.config.binary_path)
530            .args(&args)
531            .output()
532            .await
533            .map_err(|err| {
534                Error::ingest(format!(
535                    "failed to execute tesseract binary '{}': {err}",
536                    self.config.binary_path
537                ))
538            })?;
539
540        if !output.status.success() {
541            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
542            let mode_label = match mode {
543                TesseractOutputMode::PlainText => "plain-text",
544                TesseractOutputMode::Tsv => "tsv",
545            };
546            return Err(Error::ingest(format!(
547                "tesseract OCR ({mode_label}) failed for '{}'(status {}): {}",
548                request.path.display(),
549                output.status,
550                if stderr.is_empty() {
551                    "no stderr output".to_string()
552                } else {
553                    stderr
554                }
555            )));
556        }
557
558        Ok(output.stdout)
559    }
560
561    fn parse_tsv_output(&self, stdout: &[u8]) -> Result<ParsedTesseractTsv> {
562        let tsv = String::from_utf8_lossy(stdout);
563        parse_tesseract_tsv(&tsv)
564    }
565
566    fn build_document_from_plain_text(
567        &self,
568        request: &ImageOcrRequest,
569        full_text: String,
570    ) -> OcrDocument {
571        let mut doc = OcrDocument::new(
572            self.engine(),
573            request.target_kind,
574            request.provenance.clone(),
575        );
576        doc.full_text = full_text.clone();
577        doc.blocks = self.synthesize_blocks(&full_text);
578        doc
579    }
580
581    fn build_document_from_tsv(
582        &self,
583        request: &ImageOcrRequest,
584        parsed: ParsedTesseractTsv,
585    ) -> OcrDocument {
586        let mut doc = OcrDocument::new(
587            self.engine(),
588            request.target_kind,
589            request.provenance.clone(),
590        );
591        doc.full_text = if self.config.trim_output {
592            parsed.full_text.trim().to_string()
593        } else {
594            parsed.full_text
595        };
596        doc.blocks = parsed.blocks;
597        doc
598    }
599}
600
601fn normalize_tesseract_language_hints(hints: Vec<String>) -> Vec<String> {
602    let mut normalized = Vec::with_capacity(hints.len());
603    for hint in hints {
604        let code = normalize_tesseract_language_hint(&hint);
605        if !code.is_empty() && !normalized.iter().any(|existing| existing == &code) {
606            normalized.push(code);
607        }
608    }
609    normalized
610}
611
/// Normalize one language hint into a Tesseract language-pack code.
///
/// Accepts ISO-639-1 tags with optional region (`en`, `en-US`, `en_US`),
/// explicit pack identifiers (`eng`, `chi_sim`), or arbitrary strings, which
/// pass through lowercased. Returns an empty string for blank input.
fn normalize_tesseract_language_hint(hint: &str) -> String {
    let trimmed = hint.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    // Preserve explicit Tesseract pack identifiers (e.g. `eng`, `chi_sim`).
    // A pack id has a 3-letter base, optionally with a script/variant suffix
    // (`aze_cyrl`). Locale-style hints with an underscore separator (`en_US`,
    // `zh_CN`) have a 2-letter base and must NOT be preserved verbatim —
    // previously they short-circuited here and produced invalid codes like
    // `en_us` instead of mapping to `eng` below.
    let base = trimmed.split(['-', '_']).next().unwrap_or(trimmed);
    if base.len() == 3 && base.chars().all(|c| c.is_ascii_alphabetic()) {
        return trimmed.to_ascii_lowercase();
    }

    let primary = base.to_ascii_lowercase();

    match primary.as_str() {
        "en" => "eng",
        "de" => "deu",
        "fr" => "fra",
        "es" => "spa",
        "it" => "ita",
        "pt" => "por",
        "nl" => "nld",
        "sv" => "swe",
        "da" => "dan",
        "fi" => "fin",
        "no" | "nb" | "nn" => "nor",
        "pl" => "pol",
        "cs" => "ces",
        "sk" => "slk",
        "sl" => "slv",
        "hr" => "hrv",
        "sr" => "srp",
        "ro" => "ron",
        "hu" => "hun",
        "tr" => "tur",
        "el" => "ell",
        "ru" => "rus",
        "uk" => "ukr",
        "bg" => "bul",
        "he" | "iw" => "heb",
        "ar" => "ara",
        "fa" => "fas",
        "hi" => "hin",
        "bn" => "ben",
        "ta" => "tam",
        "te" => "tel",
        "ml" => "mal",
        "mr" => "mar",
        "gu" => "guj",
        "pa" => "pan",
        "ur" => "urd",
        "ja" => "jpn",
        "ko" => "kor",
        "zh" => "chi_sim",
        "id" => "ind",
        "ms" => "msa",
        "vi" => "vie",
        "th" => "tha",
        "ca" => "cat",
        "et" => "est",
        "lv" => "lav",
        "lt" => "lit",
        _ => return trimmed.to_ascii_lowercase(),
    }
    .to_string()
}
683
684#[async_trait]
685impl OcrBackend for TesseractOcrBackend {
686    fn engine(&self) -> OcrEngine {
687        OcrEngine::Tesseract
688    }
689
690    async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
691        if let Ok(tsv_stdout) = self.run_tesseract(request, TesseractOutputMode::Tsv).await {
692            if let Ok(parsed) = self.parse_tsv_output(&tsv_stdout) {
693                if !parsed.full_text.trim().is_empty() || !parsed.blocks.is_empty() {
694                    return Ok(self.build_document_from_tsv(request, parsed));
695                }
696            }
697        }
698
699        let plain_stdout = self
700            .run_tesseract(request, TesseractOutputMode::PlainText)
701            .await?;
702        let full_text = self.normalize_output_text(&plain_stdout);
703        Ok(self.build_document_from_plain_text(request, full_text))
704    }
705}
706
707fn default_psm_for_target(target: OcrTargetKind) -> Option<u8> {
708    match target {
709        OcrTargetKind::Screenshot => Some(11), // sparse text can work well for UI screens
710        OcrTargetKind::Photo => Some(3),       // fully automatic page segmentation
711        OcrTargetKind::GenericImage => None,
712    }
713}
714
/// Recognition quality mode for Apple Vision OCR.
///
/// Serialized in `snake_case` (`fast` / `accurate`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum AppleVisionRecognitionLevel {
    /// Faster recognition with potentially lower accuracy.
    Fast,
    /// Higher accuracy recognition (default).
    Accurate,
}
724
725impl AppleVisionRecognitionLevel {
726    fn as_str(self) -> &'static str {
727        match self {
728            Self::Fast => "fast",
729            Self::Accurate => "accurate",
730        }
731    }
732}
733
/// Configuration for the Apple Vision OCR backend.
///
/// The backend runs a small Swift script via `xcrun swift` (default) and parses
/// JSON output into `OcrDocument` blocks.
#[derive(Debug, Clone)]
pub struct AppleVisionOcrConfig {
    /// Command used to run Swift (`xcrun` by default).
    pub binary_path: String,
    /// Prefix args before the script path (defaults to `["swift"]`).
    pub runner_prefix_args: Vec<String>,
    /// Default recognition language codes (e.g., `["en-US"]`).
    ///
    /// Only used when a request carries no language hints of its own.
    pub default_languages: Vec<String>,
    /// Recognition level (`fast` or `accurate`).
    pub recognition_level: AppleVisionRecognitionLevel,
    /// Whether Vision should use language correction.
    pub uses_language_correction: bool,
    /// Trim trailing whitespace/newlines from the output text.
    pub trim_output: bool,
}
753
754impl Default for AppleVisionOcrConfig {
755    fn default() -> Self {
756        Self {
757            binary_path: "xcrun".to_string(),
758            runner_prefix_args: vec!["swift".to_string()],
759            default_languages: vec!["en-US".to_string()],
760            recognition_level: AppleVisionRecognitionLevel::Accurate,
761            uses_language_correction: true,
762            trim_output: true,
763        }
764    }
765}
766
767impl AppleVisionOcrConfig {
768    /// Set the command used to run Swift.
769    pub fn with_binary_path(mut self, binary_path: impl Into<String>) -> Self {
770        self.binary_path = binary_path.into();
771        self
772    }
773
774    /// Set the runner prefix args before the script path.
775    pub fn with_runner_prefix_args(
776        mut self,
777        args: impl IntoIterator<Item = impl Into<String>>,
778    ) -> Self {
779        self.runner_prefix_args = args.into_iter().map(Into::into).collect();
780        self
781    }
782
783    /// Set default recognition languages.
784    pub fn with_default_languages(
785        mut self,
786        langs: impl IntoIterator<Item = impl Into<String>>,
787    ) -> Self {
788        self.default_languages = langs.into_iter().map(Into::into).collect();
789        self
790    }
791
792    /// Set the recognition level.
793    pub fn with_recognition_level(mut self, level: AppleVisionRecognitionLevel) -> Self {
794        self.recognition_level = level;
795        self
796    }
797
798    /// Set whether language correction is enabled.
799    pub fn with_language_correction(mut self, enabled: bool) -> Self {
800        self.uses_language_correction = enabled;
801        self
802    }
803}
804
/// Real OCR backend using Apple's Vision framework via a Swift script.
///
/// This backend is macOS-only at runtime. It invokes Swift (`xcrun swift` by
/// default), runs a small Vision OCR script, and parses JSON output with line
/// blocks, confidence, and normalized bounding boxes.
#[derive(Debug, Clone, Default)]
pub struct AppleVisionOcrBackend {
    // Runner command, languages, recognition level, and output policy.
    config: AppleVisionOcrConfig,
}
814
/// Top-level JSON payload emitted by the Vision Swift script.
///
/// Both fields default when absent so partial script output still parses.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptResponse {
    #[serde(default)]
    full_text: String,
    #[serde(default)]
    blocks: Vec<AppleVisionScriptBlock>,
}
822
/// One recognized text block in the script's JSON output.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptBlock {
    text: String,
    // Optional: the script may omit confidence for some observations.
    confidence: Option<f32>,
    // Optional: normalized geometry when the script provides it.
    bbox: Option<AppleVisionScriptBoundingBox>,
}
829
/// Bounding-box geometry in the script's JSON output.
///
/// Values are clamped into the normalized 0..=1 range during parsing; see
/// `AppleVisionOcrBackend::parse_json_output`.
#[derive(Debug, Deserialize)]
struct AppleVisionScriptBoundingBox {
    x: f32,
    y: f32,
    width: f32,
    height: f32,
}
837
impl AppleVisionOcrBackend {
    /// Create an Apple Vision backend with default configuration.
    pub fn new() -> Self {
        Self::default()
    }

    /// Create an Apple Vision backend with custom configuration.
    pub fn with_config(config: AppleVisionOcrConfig) -> Self {
        Self { config }
    }

    /// Return the configured runner binary path.
    pub fn binary_path(&self) -> &str {
        &self.config.binary_path
    }

    /// Check whether the configured Swift runner is available.
    pub async fn is_available(&self) -> bool {
        self.version().await.is_ok()
    }

    /// Read the Swift runner version string (`xcrun swift --version` by default).
    ///
    /// # Errors
    ///
    /// Fails off macOS, when the runner cannot be spawned, exits non-zero, or
    /// produces no version text on stdout or stderr.
    pub async fn version(&self) -> Result<String> {
        self.ensure_macos_runtime()?;

        let mut args = self.config.runner_prefix_args.clone();
        args.push("--version".to_string());

        let output = Command::new(&self.config.binary_path)
            .args(&args)
            .output()
            .await
            .map_err(|err| {
                Error::ingest(format!(
                    "failed to execute Apple Vision OCR runner '{}': {err}",
                    self.config.binary_path
                ))
            })?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
            return Err(Error::ingest(format!(
                "Apple Vision OCR runner version command failed (status {}): {stderr}",
                output.status
            )));
        }

        // Some toolchains print version info to stderr; fall back to it when
        // stdout is empty.
        let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
        let version = if stdout.is_empty() {
            String::from_utf8_lossy(&output.stderr).trim().to_string()
        } else {
            stdout
        };

        if version.is_empty() {
            return Err(Error::ingest(
                "Apple Vision OCR runner version command returned empty output",
            ));
        }

        Ok(version)
    }

    /// Reject use on non-macOS targets, where the Vision framework is absent.
    fn ensure_macos_runtime(&self) -> Result<()> {
        if !cfg!(target_os = "macos") {
            return Err(Error::ingest(
                "Apple Vision OCR backend requires macOS runtime",
            ));
        }
        Ok(())
    }

    /// Join the effective languages into one `+`-separated CLI argument.
    ///
    /// Request hints take precedence over configured defaults; blank entries
    /// are dropped after trimming.
    fn normalize_languages(&self, request: &ImageOcrRequest) -> String {
        let langs = if request.language_hints.is_empty() {
            self.config.default_languages.clone()
        } else {
            request.language_hints.clone()
        };

        langs
            .into_iter()
            .map(|lang| lang.trim().to_string())
            .filter(|lang| !lang.is_empty())
            .collect::<Vec<_>>()
            .join("+")
    }

    /// Build the runner arg list: prefix args, script path, then the script's
    /// positional arguments (image path, languages, recognition level,
    /// language-correction flag) — the order is a contract with the script.
    fn build_args(&self, script_path: &std::path::Path, request: &ImageOcrRequest) -> Vec<String> {
        let mut args = self.config.runner_prefix_args.clone();
        args.push(script_path.to_string_lossy().into_owned());
        args.push(request.path.to_string_lossy().into_owned());
        args.push(self.normalize_languages(request));
        args.push(self.config.recognition_level.as_str().to_string());
        args.push(self.config.uses_language_correction.to_string());
        args
    }

    /// Write the Vision Swift script to a unique temp path and return it.
    ///
    /// NOTE(review): nothing in this impl removes the temp file afterwards —
    /// confirm the caller (e.g. `extract`) cleans it up.
    async fn write_temp_script(&self) -> Result<PathBuf> {
        // Process id + nanosecond timestamp keep concurrent writers from
        // colliding on the same path. `timestamp_nanos_opt` is None outside
        // chrono's representable range; the fallback rebuilds nanos from
        // microseconds (NOTE(review): that multiply could itself overflow i64
        // for extreme timestamps — confirm acceptable).
        let ts = Utc::now()
            .timestamp_nanos_opt()
            .unwrap_or_else(|| Utc::now().timestamp_micros() * 1000);
        let path = std::env::temp_dir().join(format!(
            "converge_apple_vision_ocr_{}_{}.swift",
            std::process::id(),
            ts
        ));

        tokio::fs::write(&path, apple_vision_swift_script())
            .await
            .map_err(Error::from)?;

        Ok(path)
    }

    /// Execute the script against the request image and return raw stdout.
    ///
    /// # Errors
    ///
    /// Fails when the runner cannot be spawned or exits non-zero; the error
    /// carries the image path and captured stderr.
    async fn run_script(
        &self,
        request: &ImageOcrRequest,
        script_path: &std::path::Path,
    ) -> Result<Vec<u8>> {
        let args = self.build_args(script_path, request);
        let output = Command::new(&self.config.binary_path)
            .args(&args)
            .output()
            .await
            .map_err(|err| {
                Error::ingest(format!(
                    "failed to execute Apple Vision OCR runner '{}': {err}",
                    self.config.binary_path
                ))
            })?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
            return Err(Error::ingest(format!(
                "Apple Vision OCR failed for '{}'(status {}): {}",
                request.path.display(),
                output.status,
                if stderr.is_empty() {
                    "no stderr output".to_string()
                } else {
                    stderr
                }
            )));
        }

        Ok(output.stdout)
    }

    /// Parse the script's JSON stdout into full text plus line blocks.
    ///
    /// Blocks with empty text are dropped; confidence and bounding-box values
    /// are clamped to 0..=1, and boxes that still fail the normalized check
    /// (e.g. x + width > 1) are discarded. When the script's `full_text` is
    /// blank, the joined block text is used instead.
    fn parse_json_output(&self, stdout: &[u8]) -> Result<OcrDocumentParts> {
        let response: AppleVisionScriptResponse =
            serde_json::from_slice(stdout).map_err(|err| {
                Error::ingest(format!(
                    "failed to parse Apple Vision OCR JSON output: {err}"
                ))
            })?;

        let mut blocks = Vec::with_capacity(response.blocks.len());
        for block in response.blocks {
            let text = block.text.trim().to_string();
            if text.is_empty() {
                continue;
            }

            let confidence = block.confidence.map(|c| c.clamp(0.0, 1.0));
            let bbox = block.bbox.and_then(|bbox| {
                let normalized = BoundingBox {
                    x: bbox.x.clamp(0.0, 1.0),
                    y: bbox.y.clamp(0.0, 1.0),
                    width: bbox.width.clamp(0.0, 1.0),
                    height: bbox.height.clamp(0.0, 1.0),
                };
                normalized.is_normalized().then_some(normalized)
            });

            blocks.push(OcrTextBlock {
                text,
                confidence,
                bbox,
                kind: OcrBlockKind::Line,
            });
        }

        let full_text = if response.full_text.trim().is_empty() {
            blocks
                .iter()
                .map(|block| block.text.as_str())
                .collect::<Vec<_>>()
                .join("\n")
        } else if self.config.trim_output {
            response.full_text.trim().to_string()
        } else {
            response.full_text
        };

        // `OcrDocumentParts` is declared elsewhere in this module.
        Ok(OcrDocumentParts { full_text, blocks })
    }
}
1035
1036#[async_trait]
1037impl OcrBackend for AppleVisionOcrBackend {
1038    fn engine(&self) -> OcrEngine {
1039        OcrEngine::AppleVision
1040    }
1041
1042    async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
1043        self.ensure_macos_runtime()?;
1044
1045        let script_path = self.write_temp_script().await?;
1046        let stdout = self.run_script(request, &script_path).await;
1047        let _ = tokio::fs::remove_file(&script_path).await;
1048        let stdout = stdout?;
1049
1050        let parts = self.parse_json_output(&stdout)?;
1051        let mut doc = OcrDocument::new(
1052            self.engine(),
1053            request.target_kind,
1054            request.provenance.clone(),
1055        );
1056        doc.full_text = parts.full_text;
1057        doc.blocks = parts.blocks;
1058        Ok(doc)
1059    }
1060}
1061
/// Intermediate OCR result shared by backend parsers before being merged
/// into an `OcrDocument`.
#[derive(Debug)]
struct OcrDocumentParts {
    // Full recognized text (newline-joined from blocks when synthesized).
    full_text: String,
    // Per-line text blocks with optional confidence and bounding boxes.
    blocks: Vec<OcrTextBlock>,
}
1067
/// Embedded Swift source for the Apple Vision OCR runner.
///
/// The script takes `<image_path> [languages] [recognition_level]
/// [language_correction]` on the command line, runs a
/// `VNRecognizeTextRequest` over the decoded image, and prints a JSON object
/// with `full_text` and per-line `blocks` (each with `text`, `confidence`,
/// and a `bbox` converted from Vision's bottom-left origin to top-left
/// normalized coordinates) on stdout. Failures are reported on stderr with
/// exit code 1. Languages arrive `+`-separated, matching `build_args`.
fn apple_vision_swift_script() -> &'static str {
    r#"
import Foundation
import Vision
import CoreGraphics
import ImageIO

func fail(_ message: String) -> Never {
    if let data = (message + "\n").data(using: .utf8) {
        FileHandle.standardError.write(data)
    }
    exit(1)
}

func boolFromArg(_ raw: String) -> Bool {
    switch raw.lowercased() {
    case "1", "true", "yes", "y":
        return true
    default:
        return false
    }
}

let args = CommandLine.arguments
guard args.count >= 2 else {
    fail("usage: <script> <image_path> [languages] [recognition_level] [language_correction]")
}

let imagePath = args[1]
let languagesArg = args.count > 2 ? args[2] : ""
let recognitionLevelArg = args.count > 3 ? args[3].lowercased() : "accurate"
let languageCorrectionArg = args.count > 4 ? args[4] : "true"

let imageURL = URL(fileURLWithPath: imagePath)
guard let imageSource = CGImageSourceCreateWithURL(imageURL as CFURL, nil) else {
    fail("failed to create image source for \(imagePath)")
}
guard let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) else {
    fail("failed to decode image at \(imagePath)")
}

let request = VNRecognizeTextRequest()
request.recognitionLevel = (recognitionLevelArg == "fast") ? .fast : .accurate
request.usesLanguageCorrection = boolFromArg(languageCorrectionArg)

let languages = languagesArg
    .split(separator: "+")
    .map { String($0).trimmingCharacters(in: .whitespacesAndNewlines) }
    .filter { !$0.isEmpty }
if !languages.isEmpty {
    request.recognitionLanguages = languages
}

do {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    try handler.perform([request])

    let observations = request.results ?? []
    var blocks: [[String: Any]] = []

    for observation in observations {
        guard let candidate = observation.topCandidates(1).first else { continue }
        let text = candidate.string.trimmingCharacters(in: .whitespacesAndNewlines)
        if text.isEmpty { continue }

        let rect = observation.boundingBox
        // Vision uses a bottom-left origin; convert to top-left normalized coordinates.
        let topLeftY = 1.0 - rect.origin.y - rect.size.height

        blocks.append([
            "text": text,
            "confidence": Double(candidate.confidence),
            "bbox": [
                "x": Double(rect.origin.x),
                "y": Double(topLeftY),
                "width": Double(rect.size.width),
                "height": Double(rect.size.height)
            ]
        ])
    }

    let fullText = blocks.compactMap { $0["text"] as? String }.joined(separator: "\n")
    let payload: [String: Any] = [
        "full_text": fullText,
        "blocks": blocks
    ]

    let data = try JSONSerialization.data(withJSONObject: payload, options: [])
    FileHandle.standardOutput.write(data)
} catch {
    fail(String(describing: error))
}
"#
}
1162
/// Parse Tesseract TSV output into line-level OCR blocks.
///
/// Level-1 rows carry page dimensions (used to normalize bounding boxes);
/// level-5 rows are words, which are grouped into lines by their
/// (page, block, paragraph, line) key and re-joined with single spaces.
/// Word confidences are averaged per line; bounding boxes are the union of
/// the line's word boxes, normalized against the page dimensions.
///
/// # Errors
///
/// Fails when the TSV is empty, the header lacks a required column, or a
/// numeric field cannot be parsed.
fn parse_tesseract_tsv(tsv: &str) -> Result<ParsedTesseractTsv> {
    let mut lines = tsv.lines();
    let header = lines
        .next()
        .ok_or_else(|| Error::ingest("empty tesseract TSV output"))?;

    let header_cols: Vec<&str> = header.split('\t').collect();
    if header_cols.is_empty() {
        return Err(Error::ingest("invalid tesseract TSV header"));
    }

    // Map column names to positions so rows can be addressed by name.
    let mut header_index = HashMap::<String, usize>::new();
    for (idx, col) in header_cols.iter().enumerate() {
        header_index.insert((*col).to_string(), idx);
    }

    // Validate the standard Tesseract TSV schema up front so per-row lookups
    // below cannot fail on a missing column.
    for required in [
        "level",
        "page_num",
        "block_num",
        "par_num",
        "line_num",
        "word_num",
        "left",
        "top",
        "width",
        "height",
        "conf",
        "text",
    ] {
        if !header_index.contains_key(required) {
            return Err(Error::ingest(format!(
                "tesseract TSV missing required column '{required}'"
            )));
        }
    }

    // Page dimensions keyed by page number, filled from level-1 rows.
    let mut pages = HashMap::<u32, TesseractPageDimensions>::new();
    // Word accumulators keyed by (page, block, paragraph, line); a BTreeMap
    // keeps lines in document order when emitted.
    let mut lines_by_key = BTreeMap::<TesseractLineKey, TesseractLineAccumulator>::new();

    for raw_line in lines {
        if raw_line.trim().is_empty() {
            continue;
        }

        let cols = split_tsv_row(raw_line, header_cols.len());

        let level = parse_tsv_u32(&cols, &header_index, "level")?;
        let page_num = parse_tsv_u32(&cols, &header_index, "page_num")?;

        // Level 1 = page row: record its dimensions for bbox normalization.
        if level == 1 {
            let width = parse_tsv_u32(&cols, &header_index, "width")?;
            let height = parse_tsv_u32(&cols, &header_index, "height")?;
            if width > 0 && height > 0 {
                pages.insert(page_num, TesseractPageDimensions { width, height });
            }
            continue;
        }

        // Only level-5 (word) rows contribute text; block/paragraph/line
        // rows (levels 2-4) are skipped.
        if level != 5 {
            continue;
        }

        let text = tsv_field(&cols, &header_index, "text")?.trim().to_string();
        if text.is_empty() {
            continue;
        }

        // Negative confidence marks non-text rows; skip them.
        let conf_raw = parse_tsv_f32(&cols, &header_index, "conf")?;
        if conf_raw < 0.0 {
            continue;
        }

        let key = TesseractLineKey {
            page_num,
            block_num: parse_tsv_u32(&cols, &header_index, "block_num")?,
            par_num: parse_tsv_u32(&cols, &header_index, "par_num")?,
            line_num: parse_tsv_u32(&cols, &header_index, "line_num")?,
        };

        let word_num = parse_tsv_u32(&cols, &header_index, "word_num")?;
        let left = parse_tsv_u32(&cols, &header_index, "left")?;
        let top = parse_tsv_u32(&cols, &header_index, "top")?;
        let width = parse_tsv_u32(&cols, &header_index, "width")?;
        let height = parse_tsv_u32(&cols, &header_index, "height")?;
        // Tesseract confidence is 0..=100; normalize to 0.0..=1.0.
        let confidence = (conf_raw / 100.0).clamp(0.0, 1.0);

        use std::collections::btree_map::Entry;
        match lines_by_key.entry(key) {
            Entry::Vacant(entry) => {
                entry.insert(TesseractLineAccumulator::new(
                    word_num, text, confidence, left, top, width, height,
                ));
            }
            Entry::Occupied(mut entry) => {
                entry
                    .get_mut()
                    .add_word(word_num, text, confidence, left, top, width, height);
            }
        }
    }

    let mut blocks = Vec::with_capacity(lines_by_key.len());

    for (key, mut line) in lines_by_key {
        // Restore reading order within the line before joining words.
        line.words.sort_by_key(|(word_num, _)| *word_num);
        let text = line
            .words
            .into_iter()
            .map(|(_, text)| text)
            .collect::<Vec<_>>()
            .join(" ");

        if text.trim().is_empty() {
            continue;
        }

        // Average the word confidences collected for this line.
        let confidence = if line.confidence_count > 0 {
            Some((line.confidence_sum / line.confidence_count as f32).clamp(0.0, 1.0))
        } else {
            None
        };

        // A bbox is only emitted when the page dimensions are known.
        let bbox = pages
            .get(&key.page_num)
            .and_then(|page| normalize_bbox(line.left, line.top, line.right, line.bottom, *page));

        blocks.push(OcrTextBlock {
            text,
            confidence,
            bbox,
            kind: OcrBlockKind::Line,
        });
    }

    let full_text = blocks
        .iter()
        .map(|block| block.text.trim())
        .filter(|line| !line.is_empty())
        .collect::<Vec<_>>()
        .join("\n");

    Ok(ParsedTesseractTsv { full_text, blocks })
}
1307
/// Split a TSV row into at most `expected_cols` fields.
///
/// The final (text) column may itself contain tab characters; any surplus
/// fields are folded back into the last column so the row always lines up
/// with the header. Rows with fewer fields are returned as-is.
fn split_tsv_row(line: &str, expected_cols: usize) -> Vec<String> {
    let raw: Vec<&str> = line.split('\t').collect();
    if raw.len() <= expected_cols {
        return raw.iter().map(|field| (*field).to_string()).collect();
    }

    // Keep the first `expected_cols - 1` fields verbatim and rejoin the
    // remainder with tabs as the final field.
    let (head, tail) = raw.split_at(expected_cols - 1);
    let mut fields: Vec<String> = head.iter().map(|field| (*field).to_string()).collect();
    fields.push(tail.join("\t"));
    fields
}
1321
1322fn tsv_field<'a>(
1323    cols: &'a [String],
1324    header_index: &HashMap<String, usize>,
1325    name: &str,
1326) -> Result<&'a str> {
1327    let idx = *header_index
1328        .get(name)
1329        .ok_or_else(|| Error::ingest(format!("missing TSV header index for '{name}'")))?;
1330
1331    cols.get(idx)
1332        .map(String::as_str)
1333        .ok_or_else(|| Error::ingest(format!("missing TSV field '{name}' in row")))
1334}
1335
1336fn parse_tsv_u32(
1337    cols: &[String],
1338    header_index: &HashMap<String, usize>,
1339    name: &str,
1340) -> Result<u32> {
1341    let value = tsv_field(cols, header_index, name)?;
1342    value.parse::<u32>().map_err(|err| {
1343        Error::ingest(format!(
1344            "invalid tesseract TSV integer field '{name}' value '{value}': {err}"
1345        ))
1346    })
1347}
1348
1349fn parse_tsv_f32(
1350    cols: &[String],
1351    header_index: &HashMap<String, usize>,
1352    name: &str,
1353) -> Result<f32> {
1354    let value = tsv_field(cols, header_index, name)?;
1355    value.parse::<f32>().map_err(|err| {
1356        Error::ingest(format!(
1357            "invalid tesseract TSV float field '{name}' value '{value}': {err}"
1358        ))
1359    })
1360}
1361
1362fn normalize_bbox(
1363    left: u32,
1364    top: u32,
1365    right: u32,
1366    bottom: u32,
1367    page: TesseractPageDimensions,
1368) -> Option<BoundingBox> {
1369    if page.width == 0 || page.height == 0 || right <= left || bottom <= top {
1370        return None;
1371    }
1372
1373    let page_w = page.width as f32;
1374    let page_h = page.height as f32;
1375    let x = (left as f32 / page_w).clamp(0.0, 1.0);
1376    let y = (top as f32 / page_h).clamp(0.0, 1.0);
1377    let width = ((right.saturating_sub(left)) as f32 / page_w).clamp(0.0, 1.0);
1378    let height = ((bottom.saturating_sub(top)) as f32 / page_h).clamp(0.0, 1.0);
1379
1380    let bbox = BoundingBox {
1381        x,
1382        y,
1383        width,
1384        height,
1385    };
1386
1387    bbox.is_normalized().then_some(bbox)
1388}
1389
/// Fixture-driven OCR backend for tests and local scaffolding.
///
/// This backend returns pre-registered OCR results keyed by exact file path,
/// file name, or file stem (in that order). It is intended for ingestion
/// pipeline development when native OCR runtimes are unavailable.
#[derive(Debug, Clone)]
pub struct FixtureOcrBackend {
    // Engine label reported by `engine()` and stamped onto returned documents.
    engine: OcrEngine,
    // Fixtures keyed by exact path string, file name, or file stem.
    documents: HashMap<String, OcrDocument>,
    // Fallback fixture used when no key-specific fixture matches.
    default_document: Option<OcrDocument>,
}
1401
1402impl FixtureOcrBackend {
1403    /// Create an empty fixture backend using the mock engine label.
1404    pub fn new() -> Self {
1405        Self {
1406            engine: OcrEngine::Mock,
1407            documents: HashMap::new(),
1408            default_document: None,
1409        }
1410    }
1411
1412    /// Create an empty fixture backend with a custom engine label.
1413    pub fn with_engine(engine: OcrEngine) -> Self {
1414        Self {
1415            engine,
1416            documents: HashMap::new(),
1417            default_document: None,
1418        }
1419    }
1420
1421    /// Register a fixture document for a given path-like lookup key.
1422    ///
1423    /// The key can be:
1424    /// - exact path string
1425    /// - file name (e.g., `Screenshot.png`)
1426    /// - file stem (e.g., `Screenshot`)
1427    pub fn with_document(mut self, key: impl Into<String>, document: OcrDocument) -> Self {
1428        self.documents.insert(key.into(), document);
1429        self
1430    }
1431
1432    /// Register a default fixture returned when no key-specific fixture exists.
1433    pub fn with_default_document(mut self, document: OcrDocument) -> Self {
1434        self.default_document = Some(document);
1435        self
1436    }
1437
1438    fn lookup(&self, path: &std::path::Path) -> Option<&OcrDocument> {
1439        let exact = path.to_string_lossy();
1440        if let Some(doc) = self.documents.get(exact.as_ref()) {
1441            return Some(doc);
1442        }
1443
1444        if let Some(file_name) = path.file_name().and_then(|name| name.to_str())
1445            && let Some(doc) = self.documents.get(file_name)
1446        {
1447            return Some(doc);
1448        }
1449
1450        if let Some(file_stem) = path.file_stem().and_then(|stem| stem.to_str())
1451            && let Some(doc) = self.documents.get(file_stem)
1452        {
1453            return Some(doc);
1454        }
1455
1456        self.default_document.as_ref()
1457    }
1458}
1459
impl Default for FixtureOcrBackend {
    /// Equivalent to [`FixtureOcrBackend::new`]: a mock-engine backend with
    /// no registered fixtures.
    fn default() -> Self {
        Self::new()
    }
}
1465
1466#[async_trait]
1467impl OcrBackend for FixtureOcrBackend {
1468    fn engine(&self) -> OcrEngine {
1469        self.engine.clone()
1470    }
1471
1472    async fn extract(&self, request: &ImageOcrRequest) -> Result<OcrDocument> {
1473        let Some(template) = self.lookup(&request.path) else {
1474            return Err(Error::ingest(format!(
1475                "no OCR fixture registered for path '{}'",
1476                request.path.display()
1477            )));
1478        };
1479
1480        let mut doc = template.clone();
1481        doc.engine = self.engine();
1482        doc.target_kind = request.target_kind;
1483        doc.provenance = request.provenance.clone();
1484        doc.extracted_at = Utc::now();
1485        Ok(doc)
1486    }
1487}
1488
#[cfg(test)]
mod tests {
    //! Unit tests covering document text helpers, the fixture backend, and
    //! the Tesseract/Apple Vision argument builders and output parsers.

    use super::*;
    use crate::ingest::{SourceKind, SourceProvenance};

    // When `full_text` is empty, `effective_text` joins block texts with '\n'.
    #[test]
    fn effective_text_falls_back_to_blocks() {
        let provenance = SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png");
        let mut doc = OcrDocument::new(OcrEngine::Mock, OcrTargetKind::Screenshot, provenance);
        doc.blocks.push(OcrTextBlock::line("First line"));
        doc.blocks.push(OcrTextBlock::line("Second line"));

        assert_eq!(doc.effective_text(), "First line\nSecond line");
    }

    // Blocks below the confidence threshold are excluded from filtered text.
    #[test]
    fn filtered_text_respects_confidence() {
        let provenance = SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg");
        let mut doc = OcrDocument::new(OcrEngine::Mock, OcrTargetKind::Photo, provenance);
        doc.blocks.push(OcrTextBlock {
            text: "keep".into(),
            confidence: Some(0.95),
            bbox: None,
            kind: OcrBlockKind::Word,
        });
        doc.blocks.push(OcrTextBlock {
            text: "drop".into(),
            confidence: Some(0.20),
            bbox: None,
            kind: OcrBlockKind::Word,
        });

        assert_eq!(doc.filtered_text(0.5), "keep");
    }

    // The invalid box extends past 1.0 (0.8 + 0.4 and 0.9 + 0.2).
    #[test]
    fn bounding_box_validation_checks_normalized_range() {
        let valid = BoundingBox {
            x: 0.1,
            y: 0.2,
            width: 0.3,
            height: 0.4,
        };
        let invalid = BoundingBox {
            x: 0.8,
            y: 0.9,
            width: 0.4,
            height: 0.2,
        };

        assert!(valid.is_normalized());
        assert!(!invalid.is_normalized());
    }

    // Fixture registered by file name should be found for a full path, and
    // request provenance should replace the fixture's own provenance.
    #[tokio::test]
    async fn fixture_backend_uses_filename_lookup_and_overrides_provenance() {
        let fixture_provenance = SourceProvenance::new(SourceKind::Screenshot, "fixture://shot");
        let mut fixture_doc = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Screenshot,
            fixture_provenance,
        );
        fixture_doc.full_text = "fixture text".into();

        let backend = FixtureOcrBackend::new().with_document("test-shot.png", fixture_doc);

        let request_provenance =
            SourceProvenance::new(SourceKind::Screenshot, "file:///tmp/test-shot.png")
                .with_origin_id("shot-123");
        let request = ImageOcrRequest::new(
            "/tmp/test-shot.png",
            OcrTargetKind::Screenshot,
            request_provenance.clone(),
        );

        let response = backend.extract(&request).await.unwrap();
        assert_eq!(response.full_text, "fixture text");
        assert_eq!(
            response.provenance.source_uri,
            request_provenance.source_uri
        );
        assert_eq!(response.provenance.origin_id, request_provenance.origin_id);
    }

    // Request language hints take precedence over configured defaults.
    #[test]
    fn tesseract_backend_builds_expected_command_args() {
        let backend = TesseractOcrBackend::with_config(
            TesseractOcrConfig::default()
                .with_default_languages(["eng", "deu"])
                .with_oem(1)
                .with_psm(6)
                .with_extra_arg("--dpi")
                .with_extra_arg("300"),
        );

        let provenance = SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png");
        let request = ImageOcrRequest::new("/tmp/shot.png", OcrTargetKind::Screenshot, provenance)
            .with_language_hint("fra");

        let args = backend.build_args(&request);
        assert_eq!(
            args,
            vec![
                "/tmp/shot.png",
                "stdout",
                "-l",
                "fra",
                "--oem",
                "1",
                "--psm",
                "6",
                "--dpi",
                "300",
            ]
        );
    }

    // BCP-47 style hints map to Tesseract codes; duplicates collapse.
    #[test]
    fn tesseract_backend_normalizes_and_dedupes_language_hints() {
        let backend = TesseractOcrBackend::new();
        let provenance = SourceProvenance::new(SourceKind::Screenshot, "file:///shot.png");
        let request = ImageOcrRequest::new("/tmp/shot.png", OcrTargetKind::Screenshot, provenance)
            .with_language_hint("en")
            .with_language_hint("en-US")
            .with_language_hint("de-DE")
            .with_language_hint("chi_sim");

        let args = backend.build_args(&request);
        let lang_idx = args.iter().position(|arg| arg == "-l").unwrap();
        assert_eq!(args[lang_idx + 1], "eng+deu+chi_sim");
    }

    #[test]
    fn tesseract_backend_builds_tsv_command_args() {
        let backend = TesseractOcrBackend::new();
        let provenance = SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg");
        let request = ImageOcrRequest::new("/tmp/photo.jpg", OcrTargetKind::Photo, provenance);

        let args = backend.build_tsv_args(&request);
        assert!(args.ends_with(&["tsv".to_string()]));
        assert_eq!(args[0], "/tmp/photo.jpg");
        assert_eq!(args[1], "stdout");
    }

    // Blank lines are dropped and each surviving line is trimmed.
    #[test]
    fn tesseract_backend_synthesizes_line_blocks_from_plain_text() {
        let backend = TesseractOcrBackend::new();
        let blocks = backend.synthesize_blocks("Line 1\n\n Line 2 \n");
        assert_eq!(blocks.len(), 2);
        assert_eq!(blocks[0].text, "Line 1");
        assert_eq!(blocks[1].text, "Line 2");
        assert!(blocks.iter().all(|b| b.kind == OcrBlockKind::Line));
    }

    // TSV fixture: a 200x100 page (level 1), one line with two words
    // (conf 92 and 88, averaging 0.90), and a second single-word line.
    #[test]
    fn parses_tesseract_tsv_into_line_blocks_with_bbox_and_confidence() {
        let tsv = concat!(
            "level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n",
            "1\t1\t0\t0\t0\t0\t0\t0\t200\t100\t-1\t\n",
            "4\t1\t1\t1\t1\t0\t10\t10\t100\t20\t-1\t\n",
            "5\t1\t1\t1\t1\t1\t10\t10\t40\t20\t92\tHello\n",
            "5\t1\t1\t1\t2\t60\t10\t50\t20\t88\tworld\n",
            "4\t1\t1\t1\t2\t0\t10\t40\t80\t20\t-1\t\n",
            "5\t1\t1\t1\t2\t1\t10\t40\t80\t20\t75\tSecond\n",
        );

        let parsed = parse_tesseract_tsv(tsv).unwrap();
        assert_eq!(parsed.blocks.len(), 2);
        assert_eq!(parsed.full_text, "Hello world\nSecond");

        let first = &parsed.blocks[0];
        assert_eq!(first.kind, OcrBlockKind::Line);
        assert_eq!(first.text, "Hello world");
        let conf = first.confidence.unwrap();
        assert!((conf - 0.90).abs() < 1e-6, "unexpected confidence: {conf}");

        // Union box of both words, normalized against the 200x100 page.
        let bbox = first.bbox.unwrap();
        assert!((bbox.x - 0.05).abs() < 1e-6);
        assert!((bbox.y - 0.10).abs() < 1e-6);
        assert!((bbox.width - 0.50).abs() < 1e-6);
        assert!((bbox.height - 0.20).abs() < 1e-6);
    }

    #[test]
    fn tesseract_parser_rejects_missing_header_columns() {
        let err = parse_tesseract_tsv("level\tpage_num\n1\t1\n").unwrap_err();
        assert!(err.to_string().contains("missing required column"));
    }

    // Runner prefix args come first, then script path, image path, merged
    // languages, recognition level, and language-correction flag.
    #[test]
    fn apple_vision_backend_builds_runner_args() {
        let backend = AppleVisionOcrBackend::with_config(
            AppleVisionOcrConfig::default()
                .with_binary_path("xcrun")
                .with_runner_prefix_args(["swift"])
                .with_default_languages(["en-US"])
                .with_recognition_level(AppleVisionRecognitionLevel::Fast)
                .with_language_correction(false),
        );

        let provenance = SourceProvenance::new(SourceKind::Photo, "file:///photo.jpg");
        let request = ImageOcrRequest::new("/tmp/photo.jpg", OcrTargetKind::Photo, provenance)
            .with_language_hint("en-US")
            .with_language_hint("de-DE");

        let args = backend.build_args(std::path::Path::new("/tmp/vision.swift"), &request);
        assert_eq!(
            args,
            vec![
                "swift",
                "/tmp/vision.swift",
                "/tmp/photo.jpg",
                "en-US+de-DE",
                "fast",
                "false",
            ]
        );
    }

    #[test]
    fn parses_apple_vision_json_into_line_blocks() {
        let backend = AppleVisionOcrBackend::with_config(AppleVisionOcrConfig::default());
        let json = br#"{
            "full_text": "Hello\nWorld",
            "blocks": [
                {
                    "text": "Hello",
                    "confidence": 0.93,
                    "bbox": { "x": 0.1, "y": 0.2, "width": 0.3, "height": 0.1 }
                },
                {
                    "text": "World",
                    "confidence": 1.1,
                    "bbox": { "x": 0.4, "y": 0.5, "width": 0.4, "height": 0.2 }
                }
            ]
        }"#;

        let parsed = backend.parse_json_output(json).unwrap();
        assert_eq!(parsed.full_text, "Hello\nWorld");
        assert_eq!(parsed.blocks.len(), 2);
        assert_eq!(parsed.blocks[0].kind, OcrBlockKind::Line);
        assert_eq!(parsed.blocks[0].confidence, Some(0.93));
        assert_eq!(parsed.blocks[1].confidence, Some(1.0)); // clamped
        assert!(parsed.blocks[0].bbox.unwrap().is_normalized());
    }

    #[test]
    fn apple_vision_parser_rejects_invalid_json() {
        let backend = AppleVisionOcrBackend::new();
        let err = backend
            .parse_json_output(br#"{"blocks":"nope"}"#)
            .unwrap_err();
        assert!(err.to_string().contains("parse Apple Vision OCR JSON"));
    }

    #[test]
    fn default_psm_matches_target_kind() {
        assert_eq!(default_psm_for_target(OcrTargetKind::Screenshot), Some(11));
        assert_eq!(default_psm_for_target(OcrTargetKind::Photo), Some(3));
        assert_eq!(default_psm_for_target(OcrTargetKind::GenericImage), None);
    }
}