Skip to main content

converge_knowledge/ingest/
photos.rs

//! Photo OCR ingestion scaffold.
//!
//! This module turns photo image files into searchable text chunks using an
//! `OcrBackend` implementation (e.g., Apple Vision, Tesseract, or fixtures).
//! It preserves photo-specific file metadata and produces a `KnowledgeEntry`
//! scaffold for indexing pipelines.
7
8use crate::core::KnowledgeEntry;
9use crate::error::{Error, Result};
10use crate::ingest::{
11    AppleVisionOcrBackend, AppleVisionOcrConfig, ImageOcrRequest, OcrBackend, OcrBlockKind,
12    OcrDocument, OcrTargetKind, SourceKind, SourceProvenance, TesseractOcrBackend,
13    TesseractOcrConfig,
14};
15use chrono::{DateTime, Utc};
16use serde::{Deserialize, Serialize};
17use std::collections::HashMap;
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::sync::Arc;
21
/// A photo text chunk derived from OCR output.
///
/// Chunks are produced by `PhotoIngester::build_chunks`: normally one chunk per
/// retained OCR block, or a single chunk holding the document's effective text
/// when the OCR result contains no blocks at all.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhotoTextChunk {
    /// The text content for this chunk.
    pub content: String,
    /// The OCR block kind this chunk originated from
    /// (`OcrBlockKind::Unknown` for the no-blocks fallback chunk).
    pub block_kind: OcrBlockKind,
    /// OCR confidence when available; `None` for the no-blocks fallback chunk.
    pub confidence: Option<f32>,
    /// Relative weight hint for later ranking/indexing logic, copied from
    /// `PhotoIngesterConfig::content_weight` when the chunk is built.
    pub weight: f32,
}
34
/// Structured photo OCR ingestion result.
///
/// Returned by `PhotoIngester::ingest_file`; bundles the raw OCR output with
/// the file-level metadata and the text chunks derived from it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PhotoDocument {
    /// Original photo path, as passed to the ingester.
    pub path: PathBuf,
    /// Human-readable title inferred from the file stem (underscores become
    /// spaces); `"Photo"` when no usable stem is available.
    pub title: String,
    /// OCR extraction result as returned by the backend.
    pub ocr: OcrDocument,
    /// Photo-specific metadata: `filename`, `extension`, `file_size_bytes`,
    /// plus `min_confidence` when a confidence threshold is configured.
    /// Keys are stored unprefixed; `to_knowledge_entry` adds the `photo.` prefix.
    pub metadata: HashMap<String, String>,
    /// Text chunks prepared for indexing.
    pub chunks: Vec<PhotoTextChunk>,
}
49
50impl PhotoDocument {
51    /// Best-effort text for indexing, concatenating chunks in order.
52    pub fn indexing_text(&self) -> String {
53        self.chunks
54            .iter()
55            .map(|chunk| chunk.content.trim())
56            .filter(|content| !content.is_empty())
57            .collect::<Vec<_>>()
58            .join("\n")
59    }
60
61    /// Convert this photo OCR result into a knowledge entry scaffold.
62    pub fn to_knowledge_entry(&self) -> KnowledgeEntry {
63        let content = self.indexing_text();
64        let mut entry = KnowledgeEntry::new(self.title.clone(), content)
65            .with_category("Photo")
66            .with_tags(["photo", "ocr", "image"])
67            .with_source(self.path.to_string_lossy().into_owned());
68
69        for (key, value) in self.ocr.provenance.metadata_pairs() {
70            entry = entry.with_metadata(key, value);
71        }
72
73        for (key, value) in &self.metadata {
74            entry = entry.with_metadata(format!("photo.{key}"), value.clone());
75        }
76
77        entry
78            .with_metadata("photo.chunk_count", self.chunks.len().to_string())
79            .with_metadata(
80                "photo.ocr_engine",
81                match self.ocr.engine {
82                    crate::ingest::OcrEngine::AppleVision => "apple_vision",
83                    crate::ingest::OcrEngine::Tesseract => "tesseract",
84                    crate::ingest::OcrEngine::Mock => "mock",
85                    crate::ingest::OcrEngine::External => "external",
86                },
87            )
88    }
89}
90
/// Tunable settings for photo ingestion.
#[derive(Debug, Clone)]
pub struct PhotoIngesterConfig {
    /// Language hints copied onto every OCR request.
    pub language_hints: Vec<String>,
    /// When set, OCR blocks whose confidence is below this value are dropped.
    pub min_confidence: Option<f32>,
    /// Weight stamped on every chunk derived from OCR content.
    pub content_weight: f32,
}

impl Default for PhotoIngesterConfig {
    /// English language hint, a 0.5 confidence threshold, and unit weight.
    fn default() -> Self {
        let language_hints = vec![String::from("en")];
        Self {
            language_hints,
            min_confidence: Some(0.5),
            content_weight: 1.0,
        }
    }
}

impl PhotoIngesterConfig {
    /// Return this config with the confidence threshold removed, leaving all
    /// other settings untouched.
    pub fn without_confidence_filter(self) -> Self {
        Self {
            min_confidence: None,
            ..self
        }
    }
}
119
/// Photo ingester backed by an OCR backend implementation.
///
/// The backend is held behind an `Arc`, so cloning an ingester shares the same
/// backend instance while copying the configuration.
#[derive(Clone)]
pub struct PhotoIngester {
    /// OCR backend used to extract text from each photo.
    backend: Arc<dyn OcrBackend>,
    /// Ingestion settings (language hints, confidence threshold, chunk weight).
    config: PhotoIngesterConfig,
}
126
127impl PhotoIngester {
128    /// Create a new photo ingester with default config.
129    pub fn new(backend: Arc<dyn OcrBackend>) -> Self {
130        Self {
131            backend,
132            config: PhotoIngesterConfig::default(),
133        }
134    }
135
136    /// Create a photo ingester with custom config.
137    pub fn with_config(backend: Arc<dyn OcrBackend>, config: PhotoIngesterConfig) -> Self {
138        Self { backend, config }
139    }
140
141    /// Create a photo ingester backed by the real Tesseract OCR backend.
142    pub fn with_tesseract() -> Self {
143        Self::new(Arc::new(TesseractOcrBackend::new()))
144    }
145
146    /// Create a photo ingester backed by Tesseract with custom OCR config.
147    pub fn with_tesseract_config(ocr_config: TesseractOcrConfig) -> Self {
148        Self::new(Arc::new(TesseractOcrBackend::with_config(ocr_config)))
149    }
150
151    /// Create a photo ingester with custom ingestion + Tesseract OCR configs.
152    pub fn with_tesseract_and_config(
153        ocr_config: TesseractOcrConfig,
154        config: PhotoIngesterConfig,
155    ) -> Self {
156        Self::with_config(
157            Arc::new(TesseractOcrBackend::with_config(ocr_config)),
158            config,
159        )
160    }
161
162    /// Create a photo ingester backed by Apple's Vision OCR (macOS-only at runtime).
163    pub fn with_apple_vision() -> Self {
164        Self::new(Arc::new(AppleVisionOcrBackend::new()))
165    }
166
167    /// Create a photo ingester backed by Apple Vision with custom OCR config.
168    pub fn with_apple_vision_config(ocr_config: AppleVisionOcrConfig) -> Self {
169        Self::new(Arc::new(AppleVisionOcrBackend::with_config(ocr_config)))
170    }
171
172    /// Create a photo ingester with custom ingestion + Apple Vision OCR configs.
173    pub fn with_apple_vision_and_config(
174        ocr_config: AppleVisionOcrConfig,
175        config: PhotoIngesterConfig,
176    ) -> Self {
177        Self::with_config(
178            Arc::new(AppleVisionOcrBackend::with_config(ocr_config)),
179            config,
180        )
181    }
182
183    /// Ingest a photo image file into structured OCR chunks.
184    pub async fn ingest_file(&self, path: &Path) -> Result<PhotoDocument> {
185        if !path.exists() {
186            return Err(Error::ingest(format!(
187                "photo file does not exist: {}",
188                path.display()
189            )));
190        }
191
192        let metadata = fs::metadata(path)?;
193        if !metadata.is_file() {
194            return Err(Error::ingest(format!(
195                "photo path is not a file: {}",
196                path.display()
197            )));
198        }
199
200        let provenance = self.build_provenance(path, &metadata);
201        let mut request =
202            ImageOcrRequest::new(path.to_path_buf(), OcrTargetKind::Photo, provenance);
203        request.min_confidence = self.config.min_confidence;
204        request.language_hints = self.config.language_hints.clone();
205
206        let ocr = self.backend.extract(&request).await?;
207        let photo_metadata = self.build_photo_metadata(path, &metadata);
208        let chunks = self.build_chunks(&ocr);
209        let title = infer_title(path);
210
211        Ok(PhotoDocument {
212            path: path.to_path_buf(),
213            title,
214            ocr,
215            metadata: photo_metadata,
216            chunks,
217        })
218    }
219
220    /// Ingest a photo and convert directly to a knowledge entry scaffold.
221    pub async fn ingest_as_entry(&self, path: &Path) -> Result<KnowledgeEntry> {
222        let doc = self.ingest_file(path).await?;
223        Ok(doc.to_knowledge_entry())
224    }
225
226    fn build_provenance(&self, path: &Path, fs_meta: &fs::Metadata) -> SourceProvenance {
227        let mut provenance =
228            SourceProvenance::new(SourceKind::Photo, path.to_string_lossy().into_owned())
229                .with_metadata("filename", file_name_string(path))
230                .with_metadata("extension", file_extension_string(path))
231                .with_metadata("file_size_bytes", fs_meta.len().to_string());
232
233        if let Some(captured_at) = system_time_to_utc(fs_meta.modified().ok()) {
234            provenance = provenance.with_captured_at(captured_at);
235        }
236
237        provenance
238    }
239
240    fn build_photo_metadata(&self, path: &Path, fs_meta: &fs::Metadata) -> HashMap<String, String> {
241        let mut out = HashMap::new();
242        out.insert("filename".to_string(), file_name_string(path));
243        out.insert("extension".to_string(), file_extension_string(path));
244        out.insert("file_size_bytes".to_string(), fs_meta.len().to_string());
245        if let Some(min_conf) = self.config.min_confidence {
246            out.insert("min_confidence".to_string(), min_conf.to_string());
247        }
248        out
249    }
250
251    fn build_chunks(&self, ocr: &OcrDocument) -> Vec<PhotoTextChunk> {
252        if ocr.blocks.is_empty() {
253            let text = ocr.effective_text();
254            if text.trim().is_empty() {
255                return Vec::new();
256            }
257            return vec![PhotoTextChunk {
258                content: text,
259                block_kind: OcrBlockKind::Unknown,
260                confidence: None,
261                weight: self.config.content_weight,
262            }];
263        }
264
265        ocr.blocks
266            .iter()
267            .filter(|block| {
268                self.config
269                    .min_confidence
270                    .is_none_or(|min| block.confidence.unwrap_or(1.0) >= min)
271            })
272            .filter_map(|block| {
273                let content = block.text.trim();
274                if content.is_empty() {
275                    return None;
276                }
277
278                Some(PhotoTextChunk {
279                    content: content.to_string(),
280                    block_kind: block.kind,
281                    confidence: block.confidence,
282                    weight: self.config.content_weight,
283                })
284            })
285            .collect()
286    }
287}
288
/// Derive a display title from the file stem of `path`.
///
/// Falls back to `"Photo"` when the path has no stem, the stem is not valid
/// UTF-8, or the cleaned stem is empty.
fn infer_title(path: &Path) -> String {
    const FALLBACK: &str = "Photo";

    match path.file_stem().and_then(|stem| stem.to_str()) {
        Some(stem) => {
            let title = clean_photo_title(stem);
            if title.is_empty() {
                FALLBACK.to_string()
            } else {
                title
            }
        }
        None => FALLBACK.to_string(),
    }
}

/// Replace every underscore with a space, then strip surrounding whitespace.
fn clean_photo_title(raw: &str) -> String {
    // Splitting on '_' and re-joining with ' ' is a one-for-one substitution;
    // trimming must happen afterwards so leading/trailing underscores vanish.
    let spaced = raw.split('_').collect::<Vec<_>>().join(" ");
    spaced.trim().to_owned()
}
300
/// Return the final path component as an owned string.
///
/// Yields an empty string when the path has no file name or the name is not
/// valid UTF-8.
fn file_name_string(path: &Path) -> String {
    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(name) => name.to_owned(),
        None => String::new(),
    }
}
307
/// Return the path's extension (without the leading dot) as an owned string.
///
/// Yields an empty string when there is no extension or it is not valid UTF-8.
fn file_extension_string(path: &Path) -> String {
    let Some(ext) = path.extension() else {
        return String::new();
    };
    ext.to_str().map(str::to_owned).unwrap_or_default()
}
314
315fn system_time_to_utc(time: Option<std::time::SystemTime>) -> Option<DateTime<Utc>> {
316    time.map(DateTime::<Utc>::from)
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingest::{FixtureOcrBackend, OcrDocument, OcrEngine, OcrTextBlock};
    use tempfile::TempDir;

    /// Write a small placeholder file named `name` inside `temp_dir` and return
    /// its path. The bytes are not a real image; the fixture backend supplies
    /// the OCR result.
    async fn create_temp_photo(temp_dir: &TempDir, name: &str) -> PathBuf {
        let path = temp_dir.path().join(name);
        tokio::fs::write(&path, b"fakejpg").await.unwrap();
        path
    }

    /// Build an empty mock-engine photo OCR document whose provenance points at
    /// `source`.
    fn mock_ocr_document(source: &str) -> OcrDocument {
        let provenance = SourceProvenance::new(SourceKind::Photo, source);
        OcrDocument::new(OcrEngine::Mock, OcrTargetKind::Photo, provenance)
    }

    #[tokio::test]
    async fn ingest_photo_builds_chunks_and_entry_metadata() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "IMG_2042.jpg").await;

        let mut fixture_doc = mock_ocr_document("fixture://p1");
        fixture_doc.blocks = vec![
            OcrTextBlock {
                text: "Menu".into(),
                confidence: Some(0.98),
                bbox: None,
                kind: OcrBlockKind::Line,
            },
            // Below the default 0.5 threshold, so it must be filtered out.
            OcrTextBlock {
                text: "blurry".into(),
                confidence: Some(0.20),
                bbox: None,
                kind: OcrBlockKind::Word,
            },
        ];

        let backend = Arc::new(FixtureOcrBackend::new().with_document("IMG_2042.jpg", fixture_doc));
        let ingester = PhotoIngester::new(backend);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.title, "IMG 2042");
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "Menu");
        assert!(doc.indexing_text().contains("Menu"));

        let entry = doc.to_knowledge_entry();
        assert_eq!(entry.category.as_deref(), Some("Photo"));
        assert_eq!(entry.metadata.get("source.kind"), Some("photo"));
        assert_eq!(entry.metadata.get("photo.chunk_count"), Some("1"));
        assert_eq!(entry.metadata.get("photo.ocr_engine"), Some("mock"));
    }

    #[tokio::test]
    async fn ingest_photo_falls_back_to_full_text_when_no_blocks() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "receipt.png").await;

        let mut fixture_doc = mock_ocr_document("fixture://p2");
        fixture_doc.full_text = "Receipt total 12.95".into();

        let backend = Arc::new(FixtureOcrBackend::new().with_default_document(fixture_doc));
        let ingester = PhotoIngester::new(backend);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "Receipt total 12.95");
    }

    #[tokio::test]
    async fn ingest_photo_can_disable_confidence_filter() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "label.jpg").await;

        let mut fixture_doc = mock_ocr_document("fixture://p3");
        fixture_doc.blocks = vec![OcrTextBlock {
            text: "low".into(),
            confidence: Some(0.1),
            bbox: None,
            kind: OcrBlockKind::Word,
        }];

        let backend = Arc::new(FixtureOcrBackend::new().with_default_document(fixture_doc));
        let config = PhotoIngesterConfig::default().without_confidence_filter();
        let ingester = PhotoIngester::with_config(backend, config);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "low");
    }

    #[tokio::test]
    async fn ingest_photo_errors_for_missing_file() {
        let backend = Arc::new(FixtureOcrBackend::new());
        let ingester = PhotoIngester::new(backend);

        // Use a never-created path inside a fresh temp dir instead of a fixed
        // `/tmp/...` location: it is guaranteed absent and works on platforms
        // without `/tmp`.
        let temp_dir = TempDir::new().unwrap();
        let missing = temp_dir.path().join("does-not-exist-photo.jpg");

        let err = ingester.ingest_file(&missing).await.unwrap_err();
        assert!(err.to_string().contains("does not exist"));
    }
}
423}