converge_knowledge/ingest/
photos.rs1use crate::core::KnowledgeEntry;
9use crate::error::{Error, Result};
10use crate::ingest::{
11 AppleVisionOcrBackend, AppleVisionOcrConfig, ImageOcrRequest, OcrBackend, OcrBlockKind,
12 OcrDocument, OcrTargetKind, SourceKind, SourceProvenance, TesseractOcrBackend,
13 TesseractOcrConfig,
14};
15use chrono::{DateTime, Utc};
16use serde::{Deserialize, Serialize};
17use std::collections::HashMap;
18use std::fs;
19use std::path::{Path, PathBuf};
20use std::sync::Arc;
21
22#[derive(Debug, Clone, Serialize, Deserialize)]
24pub struct PhotoTextChunk {
25 pub content: String,
27 pub block_kind: OcrBlockKind,
29 pub confidence: Option<f32>,
31 pub weight: f32,
33}
34
35#[derive(Debug, Clone, Serialize, Deserialize)]
37pub struct PhotoDocument {
38 pub path: PathBuf,
40 pub title: String,
42 pub ocr: OcrDocument,
44 pub metadata: HashMap<String, String>,
46 pub chunks: Vec<PhotoTextChunk>,
48}
49
50impl PhotoDocument {
51 pub fn indexing_text(&self) -> String {
53 self.chunks
54 .iter()
55 .map(|chunk| chunk.content.trim())
56 .filter(|content| !content.is_empty())
57 .collect::<Vec<_>>()
58 .join("\n")
59 }
60
61 pub fn to_knowledge_entry(&self) -> KnowledgeEntry {
63 let content = self.indexing_text();
64 let mut entry = KnowledgeEntry::new(self.title.clone(), content)
65 .with_category("Photo")
66 .with_tags(["photo", "ocr", "image"])
67 .with_source(self.path.to_string_lossy().into_owned());
68
69 for (key, value) in self.ocr.provenance.metadata_pairs() {
70 entry = entry.with_metadata(key, value);
71 }
72
73 for (key, value) in &self.metadata {
74 entry = entry.with_metadata(format!("photo.{key}"), value.clone());
75 }
76
77 entry
78 .with_metadata("photo.chunk_count", self.chunks.len().to_string())
79 .with_metadata(
80 "photo.ocr_engine",
81 match self.ocr.engine {
82 crate::ingest::OcrEngine::AppleVision => "apple_vision",
83 crate::ingest::OcrEngine::Tesseract => "tesseract",
84 crate::ingest::OcrEngine::Mock => "mock",
85 crate::ingest::OcrEngine::External => "external",
86 },
87 )
88 }
89}
90
/// Settings that control how [`PhotoIngester`] issues OCR requests and builds chunks.
///
/// `PartialEq` is derived so configurations can be compared directly
/// (for example in assertions). `Eq` is not available because of the `f32` fields.
#[derive(Debug, Clone, PartialEq)]
pub struct PhotoIngesterConfig {
    /// Language hints copied onto every OCR request.
    pub language_hints: Vec<String>,
    /// Minimum OCR confidence a block needs to be kept; `None` disables the filter.
    pub min_confidence: Option<f32>,
    /// Weight assigned to every chunk produced by the ingester.
    pub content_weight: f32,
}
101
102impl Default for PhotoIngesterConfig {
103 fn default() -> Self {
104 Self {
105 language_hints: vec!["en".to_string()],
106 min_confidence: Some(0.5),
107 content_weight: 1.0,
108 }
109 }
110}
111
112impl PhotoIngesterConfig {
113 pub fn without_confidence_filter(mut self) -> Self {
115 self.min_confidence = None;
116 self
117 }
118}
119
120#[derive(Clone)]
122pub struct PhotoIngester {
123 backend: Arc<dyn OcrBackend>,
124 config: PhotoIngesterConfig,
125}
126
127impl PhotoIngester {
128 pub fn new(backend: Arc<dyn OcrBackend>) -> Self {
130 Self {
131 backend,
132 config: PhotoIngesterConfig::default(),
133 }
134 }
135
136 pub fn with_config(backend: Arc<dyn OcrBackend>, config: PhotoIngesterConfig) -> Self {
138 Self { backend, config }
139 }
140
141 pub fn with_tesseract() -> Self {
143 Self::new(Arc::new(TesseractOcrBackend::new()))
144 }
145
146 pub fn with_tesseract_config(ocr_config: TesseractOcrConfig) -> Self {
148 Self::new(Arc::new(TesseractOcrBackend::with_config(ocr_config)))
149 }
150
151 pub fn with_tesseract_and_config(
153 ocr_config: TesseractOcrConfig,
154 config: PhotoIngesterConfig,
155 ) -> Self {
156 Self::with_config(
157 Arc::new(TesseractOcrBackend::with_config(ocr_config)),
158 config,
159 )
160 }
161
162 pub fn with_apple_vision() -> Self {
164 Self::new(Arc::new(AppleVisionOcrBackend::new()))
165 }
166
167 pub fn with_apple_vision_config(ocr_config: AppleVisionOcrConfig) -> Self {
169 Self::new(Arc::new(AppleVisionOcrBackend::with_config(ocr_config)))
170 }
171
172 pub fn with_apple_vision_and_config(
174 ocr_config: AppleVisionOcrConfig,
175 config: PhotoIngesterConfig,
176 ) -> Self {
177 Self::with_config(
178 Arc::new(AppleVisionOcrBackend::with_config(ocr_config)),
179 config,
180 )
181 }
182
183 pub async fn ingest_file(&self, path: &Path) -> Result<PhotoDocument> {
185 if !path.exists() {
186 return Err(Error::ingest(format!(
187 "photo file does not exist: {}",
188 path.display()
189 )));
190 }
191
192 let metadata = fs::metadata(path)?;
193 if !metadata.is_file() {
194 return Err(Error::ingest(format!(
195 "photo path is not a file: {}",
196 path.display()
197 )));
198 }
199
200 let provenance = self.build_provenance(path, &metadata);
201 let mut request =
202 ImageOcrRequest::new(path.to_path_buf(), OcrTargetKind::Photo, provenance);
203 request.min_confidence = self.config.min_confidence;
204 request.language_hints = self.config.language_hints.clone();
205
206 let ocr = self.backend.extract(&request).await?;
207 let photo_metadata = self.build_photo_metadata(path, &metadata);
208 let chunks = self.build_chunks(&ocr);
209 let title = infer_title(path);
210
211 Ok(PhotoDocument {
212 path: path.to_path_buf(),
213 title,
214 ocr,
215 metadata: photo_metadata,
216 chunks,
217 })
218 }
219
220 pub async fn ingest_as_entry(&self, path: &Path) -> Result<KnowledgeEntry> {
222 let doc = self.ingest_file(path).await?;
223 Ok(doc.to_knowledge_entry())
224 }
225
226 fn build_provenance(&self, path: &Path, fs_meta: &fs::Metadata) -> SourceProvenance {
227 let mut provenance =
228 SourceProvenance::new(SourceKind::Photo, path.to_string_lossy().into_owned())
229 .with_metadata("filename", file_name_string(path))
230 .with_metadata("extension", file_extension_string(path))
231 .with_metadata("file_size_bytes", fs_meta.len().to_string());
232
233 if let Some(captured_at) = system_time_to_utc(fs_meta.modified().ok()) {
234 provenance = provenance.with_captured_at(captured_at);
235 }
236
237 provenance
238 }
239
240 fn build_photo_metadata(&self, path: &Path, fs_meta: &fs::Metadata) -> HashMap<String, String> {
241 let mut out = HashMap::new();
242 out.insert("filename".to_string(), file_name_string(path));
243 out.insert("extension".to_string(), file_extension_string(path));
244 out.insert("file_size_bytes".to_string(), fs_meta.len().to_string());
245 if let Some(min_conf) = self.config.min_confidence {
246 out.insert("min_confidence".to_string(), min_conf.to_string());
247 }
248 out
249 }
250
251 fn build_chunks(&self, ocr: &OcrDocument) -> Vec<PhotoTextChunk> {
252 if ocr.blocks.is_empty() {
253 let text = ocr.effective_text();
254 if text.trim().is_empty() {
255 return Vec::new();
256 }
257 return vec![PhotoTextChunk {
258 content: text,
259 block_kind: OcrBlockKind::Unknown,
260 confidence: None,
261 weight: self.config.content_weight,
262 }];
263 }
264
265 ocr.blocks
266 .iter()
267 .filter(|block| {
268 self.config
269 .min_confidence
270 .is_none_or(|min| block.confidence.unwrap_or(1.0) >= min)
271 })
272 .filter_map(|block| {
273 let content = block.text.trim();
274 if content.is_empty() {
275 return None;
276 }
277
278 Some(PhotoTextChunk {
279 content: content.to_string(),
280 block_kind: block.kind,
281 confidence: block.confidence,
282 weight: self.config.content_weight,
283 })
284 })
285 .collect()
286 }
287}
288
289fn infer_title(path: &Path) -> String {
290 path.file_stem()
291 .and_then(|stem| stem.to_str())
292 .map(clean_photo_title)
293 .filter(|title| !title.is_empty())
294 .unwrap_or_else(|| "Photo".to_string())
295}
296
/// Replaces every underscore in `raw` with a space and trims surrounding whitespace.
fn clean_photo_title(raw: &str) -> String {
    let spaced: String = raw
        .chars()
        .map(|ch| if ch == '_' { ' ' } else { ch })
        .collect();
    spaced.trim().to_owned()
}
300
/// Returns the final component of `path` as a `String`, or an empty string
/// when there is no file name or it is not valid UTF-8.
fn file_name_string(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_owned(),
        None => String::new(),
    }
}
307
/// Returns the extension of `path` (without the dot) as a `String`, or an
/// empty string when there is no extension or it is not valid UTF-8.
fn file_extension_string(path: &Path) -> String {
    let extension = path.extension().and_then(|ext| ext.to_str());
    extension.map(String::from).unwrap_or_default()
}
314
315fn system_time_to_utc(time: Option<std::time::SystemTime>) -> Option<DateTime<Utc>> {
316 time.map(DateTime::<Utc>::from)
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingest::{FixtureOcrBackend, OcrDocument, OcrEngine, OcrTextBlock};
    use tempfile::TempDir;

    /// Writes a small placeholder file called `name` into `temp_dir` and returns its path.
    async fn create_temp_photo(temp_dir: &TempDir, name: &str) -> PathBuf {
        let photo_path = temp_dir.path().join(name);
        tokio::fs::write(&photo_path, b"fakejpg").await.unwrap();
        photo_path
    }

    /// Builds an empty mock OCR document for a photo with the given provenance URI.
    fn mock_photo_ocr(source_uri: &'static str) -> OcrDocument {
        let provenance = SourceProvenance::new(SourceKind::Photo, source_uri);
        OcrDocument::new(OcrEngine::Mock, OcrTargetKind::Photo, provenance)
    }

    /// Builds an OCR text block with no bounding box.
    fn text_block(text: &'static str, confidence: f32, kind: OcrBlockKind) -> OcrTextBlock {
        OcrTextBlock {
            text: text.into(),
            confidence: Some(confidence),
            bbox: None,
            kind,
        }
    }

    #[tokio::test]
    async fn ingest_photo_builds_chunks_and_entry_metadata() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "IMG_2042.jpg").await;

        let mut fixture_doc = mock_photo_ocr("fixture://p1");
        fixture_doc.blocks = vec![
            text_block("Menu", 0.98, OcrBlockKind::Line),
            text_block("blurry", 0.20, OcrBlockKind::Word),
        ];

        let backend = Arc::new(FixtureOcrBackend::new().with_document("IMG_2042.jpg", fixture_doc));
        let ingester = PhotoIngester::new(backend);

        // Only the high-confidence block survives the default 0.5 threshold.
        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.title, "IMG 2042");
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "Menu");
        assert!(doc.indexing_text().contains("Menu"));

        let entry = doc.to_knowledge_entry();
        assert_eq!(entry.category.as_deref(), Some("Photo"));
        assert_eq!(entry.metadata.get("source.kind"), Some("photo"));
        assert_eq!(entry.metadata.get("photo.chunk_count"), Some("1"));
        assert_eq!(entry.metadata.get("photo.ocr_engine"), Some("mock"));
    }

    #[tokio::test]
    async fn ingest_photo_falls_back_to_full_text_when_no_blocks() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "receipt.png").await;

        let mut fixture_doc = mock_photo_ocr("fixture://p2");
        fixture_doc.full_text = "Receipt total 12.95".into();

        let backend = Arc::new(FixtureOcrBackend::new().with_default_document(fixture_doc));
        let ingester = PhotoIngester::new(backend);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "Receipt total 12.95");
    }

    #[tokio::test]
    async fn ingest_photo_can_disable_confidence_filter() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_photo(&temp_dir, "label.jpg").await;

        let mut fixture_doc = mock_photo_ocr("fixture://p3");
        fixture_doc.blocks = vec![text_block("low", 0.1, OcrBlockKind::Word)];

        let backend = Arc::new(FixtureOcrBackend::new().with_default_document(fixture_doc));
        let config = PhotoIngesterConfig::default().without_confidence_filter();
        let ingester = PhotoIngester::with_config(backend, config);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "low");
    }

    #[tokio::test]
    async fn ingest_photo_errors_for_missing_file() {
        let ingester = PhotoIngester::new(Arc::new(FixtureOcrBackend::new()));
        let missing = Path::new("/tmp/does-not-exist-photo.jpg");

        let err = ingester.ingest_file(missing).await.unwrap_err();
        assert!(err.to_string().contains("does not exist"));
    }
}