// converge_knowledge/ingest/screenshots.rs

1//! Screenshot OCR ingestion scaffold.
2//!
3//! This module turns screenshot image files into searchable text chunks using an
4//! `OcrBackend` implementation (e.g., Apple Vision, Tesseract, or fixtures).
5//! The current implementation is intentionally lightweight and testable:
6//! it extracts OCR text, preserves screenshot-specific metadata, and can build
7//! a `KnowledgeEntry` for indexing pipelines.
8
9use crate::core::KnowledgeEntry;
10use crate::error::{Error, Result};
11use crate::ingest::{
12    AppleVisionOcrBackend, AppleVisionOcrConfig, ImageOcrRequest, OcrBackend, OcrBlockKind,
13    OcrDocument, OcrTargetKind, SourceKind, SourceProvenance, TesseractOcrBackend,
14    TesseractOcrConfig,
15};
16use chrono::{DateTime, Utc};
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::fs;
20use std::path::{Path, PathBuf};
21use std::sync::Arc;
22
23/// A screenshot text chunk derived from OCR output.
/// A screenshot text chunk derived from OCR output.
///
/// One chunk is produced per retained OCR block, or a single fallback chunk
/// covering the whole document when the backend reports no block structure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ScreenshotTextChunk {
    /// The text content for this chunk.
    pub content: String,
    /// The OCR block kind this chunk originated from.
    pub block_kind: OcrBlockKind,
    /// OCR confidence when available (`None` for the whole-document fallback).
    pub confidence: Option<f32>,
    /// Relative weight hint for later ranking/indexing logic.
    pub weight: f32,
}
35
36/// Structured screenshot OCR ingestion result.
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct ScreenshotDocument {
39    /// Original screenshot path.
40    pub path: PathBuf,
41    /// Human-readable title inferred from path/metadata.
42    pub title: String,
43    /// OCR extraction result.
44    pub ocr: OcrDocument,
45    /// Screenshot-specific metadata.
46    pub metadata: HashMap<String, String>,
47    /// Text chunks prepared for indexing.
48    pub chunks: Vec<ScreenshotTextChunk>,
49}
50
51impl ScreenshotDocument {
52    /// Best-effort text for indexing, concatenating chunks in order.
53    pub fn indexing_text(&self) -> String {
54        self.chunks
55            .iter()
56            .map(|chunk| chunk.content.trim())
57            .filter(|content| !content.is_empty())
58            .collect::<Vec<_>>()
59            .join("\n")
60    }
61
62    /// Convert this screenshot OCR result into a knowledge entry scaffold.
63    pub fn to_knowledge_entry(&self) -> KnowledgeEntry {
64        let content = self.indexing_text();
65        let mut entry = KnowledgeEntry::new(self.title.clone(), content)
66            .with_category("Screenshot")
67            .with_tags(["screenshot", "ocr"])
68            .with_source(self.path.to_string_lossy().into_owned());
69
70        for (key, value) in self.ocr.provenance.metadata_pairs() {
71            entry = entry.with_metadata(key, value);
72        }
73
74        for (key, value) in &self.metadata {
75            entry = entry.with_metadata(format!("screenshot.{key}"), value.clone());
76        }
77
78        entry
79            .with_metadata("screenshot.chunk_count", self.chunks.len().to_string())
80            .with_metadata(
81                "screenshot.ocr_engine",
82                match self.ocr.engine {
83                    crate::ingest::OcrEngine::AppleVision => "apple_vision",
84                    crate::ingest::OcrEngine::Tesseract => "tesseract",
85                    crate::ingest::OcrEngine::Mock => "mock",
86                    crate::ingest::OcrEngine::External => "external",
87                },
88            )
89    }
90}
91
92/// Configuration for screenshot ingestion.
/// Configuration for screenshot ingestion.
#[derive(Debug, Clone)]
pub struct ScreenshotIngesterConfig {
    /// Language hints passed to OCR backends by default.
    pub language_hints: Vec<String>,
    /// Drop OCR blocks below this confidence when set; `None` disables filtering.
    pub min_confidence: Option<f32>,
    /// Whether UI chrome text blocks should be included in output chunks.
    pub include_ui_chrome: bool,
    /// Weight applied to UI chrome text blocks.
    pub ui_chrome_weight: f32,
    /// Weight applied to non-UI OCR blocks.
    pub content_weight: f32,
}
106
107impl Default for ScreenshotIngesterConfig {
108    fn default() -> Self {
109        Self {
110            language_hints: vec!["en".to_string()],
111            min_confidence: Some(0.5),
112            include_ui_chrome: true,
113            ui_chrome_weight: 0.4,
114            content_weight: 1.0,
115        }
116    }
117}
118
119impl ScreenshotIngesterConfig {
120    /// Disable default confidence filtering.
121    pub fn without_confidence_filter(mut self) -> Self {
122        self.min_confidence = None;
123        self
124    }
125
126    /// Set whether UI chrome blocks are included.
127    pub fn with_ui_chrome(mut self, include: bool) -> Self {
128        self.include_ui_chrome = include;
129        self
130    }
131}
132
133/// Screenshot ingester backed by an OCR backend implementation.
/// Screenshot ingester backed by an OCR backend implementation.
#[derive(Clone)]
pub struct ScreenshotIngester {
    // Shared OCR backend; `Arc` keeps `Clone` cheap and the backend reusable.
    backend: Arc<dyn OcrBackend>,
    // Chunking/filtering knobs applied to every ingested screenshot.
    config: ScreenshotIngesterConfig,
}
139
140impl ScreenshotIngester {
141    /// Create a new screenshot ingester with default config.
142    pub fn new(backend: Arc<dyn OcrBackend>) -> Self {
143        Self {
144            backend,
145            config: ScreenshotIngesterConfig::default(),
146        }
147    }
148
149    /// Create a screenshot ingester with custom config.
150    pub fn with_config(backend: Arc<dyn OcrBackend>, config: ScreenshotIngesterConfig) -> Self {
151        Self { backend, config }
152    }
153
154    /// Create a screenshot ingester backed by the real Tesseract OCR backend.
155    pub fn with_tesseract() -> Self {
156        Self::new(Arc::new(TesseractOcrBackend::new()))
157    }
158
159    /// Create a screenshot ingester backed by Tesseract with custom OCR config.
160    pub fn with_tesseract_config(ocr_config: TesseractOcrConfig) -> Self {
161        Self::new(Arc::new(TesseractOcrBackend::with_config(ocr_config)))
162    }
163
164    /// Create a screenshot ingester with custom ingestion + Tesseract OCR configs.
165    pub fn with_tesseract_and_config(
166        ocr_config: TesseractOcrConfig,
167        config: ScreenshotIngesterConfig,
168    ) -> Self {
169        Self::with_config(
170            Arc::new(TesseractOcrBackend::with_config(ocr_config)),
171            config,
172        )
173    }
174
175    /// Create a screenshot ingester backed by Apple's Vision OCR (macOS-only at runtime).
176    pub fn with_apple_vision() -> Self {
177        Self::new(Arc::new(AppleVisionOcrBackend::new()))
178    }
179
180    /// Create a screenshot ingester backed by Apple Vision with custom OCR config.
181    pub fn with_apple_vision_config(ocr_config: AppleVisionOcrConfig) -> Self {
182        Self::new(Arc::new(AppleVisionOcrBackend::with_config(ocr_config)))
183    }
184
185    /// Create a screenshot ingester with custom ingestion + Apple Vision OCR configs.
186    pub fn with_apple_vision_and_config(
187        ocr_config: AppleVisionOcrConfig,
188        config: ScreenshotIngesterConfig,
189    ) -> Self {
190        Self::with_config(
191            Arc::new(AppleVisionOcrBackend::with_config(ocr_config)),
192            config,
193        )
194    }
195
196    /// Ingest a screenshot image file into structured OCR chunks.
197    pub async fn ingest_file(&self, path: &Path) -> Result<ScreenshotDocument> {
198        if !path.exists() {
199            return Err(Error::ingest(format!(
200                "screenshot file does not exist: {}",
201                path.display()
202            )));
203        }
204
205        let metadata = fs::metadata(path)?;
206        if !metadata.is_file() {
207            return Err(Error::ingest(format!(
208                "screenshot path is not a file: {}",
209                path.display()
210            )));
211        }
212
213        let provenance = self.build_provenance(path, &metadata);
214        let mut request =
215            ImageOcrRequest::new(path.to_path_buf(), OcrTargetKind::Screenshot, provenance);
216        request.min_confidence = self.config.min_confidence;
217        request.language_hints = self.config.language_hints.clone();
218
219        let ocr = self.backend.extract(&request).await?;
220        let screenshot_metadata = self.build_screenshot_metadata(path, &metadata);
221        let chunks = self.build_chunks(&ocr);
222        let title = infer_title(path);
223
224        Ok(ScreenshotDocument {
225            path: path.to_path_buf(),
226            title,
227            ocr,
228            metadata: screenshot_metadata,
229            chunks,
230        })
231    }
232
233    /// Ingest a screenshot and convert directly to a knowledge entry scaffold.
234    pub async fn ingest_as_entry(&self, path: &Path) -> Result<KnowledgeEntry> {
235        let doc = self.ingest_file(path).await?;
236        Ok(doc.to_knowledge_entry())
237    }
238
239    fn build_provenance(&self, path: &Path, fs_meta: &fs::Metadata) -> SourceProvenance {
240        let mut provenance =
241            SourceProvenance::new(SourceKind::Screenshot, path.to_string_lossy().into_owned())
242                .with_metadata("filename", file_name_string(path))
243                .with_metadata("extension", file_extension_string(path))
244                .with_metadata("file_size_bytes", fs_meta.len().to_string());
245
246        if let Some(captured_at) = system_time_to_utc(fs_meta.modified().ok()) {
247            provenance = provenance.with_captured_at(captured_at);
248        }
249
250        provenance
251    }
252
253    fn build_screenshot_metadata(
254        &self,
255        path: &Path,
256        fs_meta: &fs::Metadata,
257    ) -> HashMap<String, String> {
258        let mut out = HashMap::new();
259        out.insert("filename".to_string(), file_name_string(path));
260        out.insert("extension".to_string(), file_extension_string(path));
261        out.insert("file_size_bytes".to_string(), fs_meta.len().to_string());
262        out.insert(
263            "ui_chrome_included".to_string(),
264            self.config.include_ui_chrome.to_string(),
265        );
266        if let Some(min_conf) = self.config.min_confidence {
267            out.insert("min_confidence".to_string(), min_conf.to_string());
268        }
269        out
270    }
271
272    fn build_chunks(&self, ocr: &OcrDocument) -> Vec<ScreenshotTextChunk> {
273        if ocr.blocks.is_empty() {
274            let text = ocr.effective_text();
275            if text.trim().is_empty() {
276                return Vec::new();
277            }
278            return vec![ScreenshotTextChunk {
279                content: text,
280                block_kind: OcrBlockKind::Unknown,
281                confidence: None,
282                weight: self.config.content_weight,
283            }];
284        }
285
286        ocr.blocks
287            .iter()
288            .filter(|block| {
289                self.config.include_ui_chrome || !matches!(block.kind, OcrBlockKind::UiChrome)
290            })
291            .filter(|block| {
292                self.config
293                    .min_confidence
294                    .is_none_or(|min| block.confidence.unwrap_or(1.0) >= min)
295            })
296            .filter_map(|block| {
297                let content = block.text.trim();
298                if content.is_empty() {
299                    return None;
300                }
301
302                let weight = if matches!(block.kind, OcrBlockKind::UiChrome) {
303                    self.config.ui_chrome_weight
304                } else {
305                    self.config.content_weight
306                };
307
308                Some(ScreenshotTextChunk {
309                    content: content.to_string(),
310                    block_kind: block.kind,
311                    confidence: block.confidence,
312                    weight,
313                })
314            })
315            .collect()
316    }
317}
318
319fn infer_title(path: &Path) -> String {
320    path.file_stem()
321        .and_then(|stem| stem.to_str())
322        .map(clean_screenshot_title)
323        .filter(|title| !title.is_empty())
324        .unwrap_or_else(|| "Screenshot".to_string())
325}
326
/// Normalize a raw file stem into a title: underscores become spaces and
/// surrounding whitespace is trimmed.
fn clean_screenshot_title(raw: &str) -> String {
    let spaced: String = raw
        .chars()
        .map(|ch| if ch == '_' { ' ' } else { ch })
        .collect();
    spaced.trim().to_owned()
}
330
/// Final path component as an owned `String`; empty when the path has no
/// file name or it is not valid UTF-8.
fn file_name_string(path: &Path) -> String {
    match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name.to_owned(),
        None => String::new(),
    }
}
337
/// File extension (without the dot) as an owned `String`; empty when the
/// path has no extension or it is not valid UTF-8.
fn file_extension_string(path: &Path) -> String {
    path.extension()
        .and_then(|ext| ext.to_str())
        .map_or_else(String::new, |ext| ext.to_string())
}
344
345fn system_time_to_utc(time: Option<std::time::SystemTime>) -> Option<DateTime<Utc>> {
346    time.map(DateTime::<Utc>::from)
347}
348
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ingest::{FixtureOcrBackend, OcrDocument, OcrEngine, OcrTextBlock};
    use tempfile::TempDir;

    // Write a tiny placeholder file into `temp_dir` and return its path.
    // The bytes are never decoded: the fixture OCR backend keys off the
    // filename (or a default document), so any content suffices.
    async fn create_temp_screenshot(temp_dir: &TempDir, name: &str) -> PathBuf {
        let path = temp_dir.path().join(name);
        tokio::fs::write(&path, b"fakepng").await.unwrap();
        path
    }

    // End-to-end: fixture OCR blocks become weighted chunks (UI chrome at
    // reduced weight, sub-threshold confidence dropped) and the knowledge
    // entry carries provenance + screenshot metadata.
    #[tokio::test]
    async fn ingest_screenshot_builds_chunks_and_entry_metadata() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_screenshot(&temp_dir, "Screenshot_2026-02-22.png").await;

        let fixture_provenance = SourceProvenance::new(SourceKind::Screenshot, "fixture://s1");
        let mut fixture_doc = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Screenshot,
            fixture_provenance,
        );
        fixture_doc.blocks = vec![
            OcrTextBlock {
                text: "Browser".into(),
                confidence: Some(0.99),
                bbox: None,
                kind: OcrBlockKind::UiChrome,
            },
            OcrTextBlock {
                text: "Important error message".into(),
                confidence: Some(0.93),
                bbox: None,
                kind: OcrBlockKind::Paragraph,
            },
            // Below the default 0.5 min_confidence — must be filtered out.
            OcrTextBlock {
                text: "low conf".into(),
                confidence: Some(0.2),
                bbox: None,
                kind: OcrBlockKind::Word,
            },
        ];

        let backend = Arc::new(
            FixtureOcrBackend::new().with_document("Screenshot_2026-02-22.png", fixture_doc),
        );
        let ingester = ScreenshotIngester::new(backend);

        let doc = ingester.ingest_file(&path).await.unwrap();
        // Title derives from the file stem with underscores turned to spaces.
        assert_eq!(doc.title, "Screenshot 2026-02-22");
        assert_eq!(doc.chunks.len(), 2); // UI chrome + paragraph, low confidence dropped
        assert_eq!(doc.chunks[0].weight, ingester.config.ui_chrome_weight);
        assert_eq!(doc.chunks[1].weight, ingester.config.content_weight);
        assert!(doc.indexing_text().contains("Important error message"));

        // NOTE(review): these comparisons assume `entry.metadata.get` values
        // compare against `&str` — confirm the metadata map's value type.
        let entry = doc.to_knowledge_entry();
        assert_eq!(entry.category.as_deref(), Some("Screenshot"));
        assert_eq!(entry.metadata.get("source.kind"), Some("screenshot"));
        assert_eq!(
            entry.metadata.get("screenshot.ui_chrome_included"),
            Some("true")
        );
        assert_eq!(entry.metadata.get("screenshot.chunk_count"), Some("2"));
    }

    // With include_ui_chrome=false, chrome blocks are excluded from chunks
    // and from the concatenated indexing text.
    #[tokio::test]
    async fn ingest_screenshot_can_exclude_ui_chrome() {
        let temp_dir = TempDir::new().unwrap();
        let path = create_temp_screenshot(&temp_dir, "shot.png").await;

        let fixture_provenance = SourceProvenance::new(SourceKind::Screenshot, "fixture://s2");
        let mut fixture_doc = OcrDocument::new(
            OcrEngine::Mock,
            OcrTargetKind::Screenshot,
            fixture_provenance,
        );
        fixture_doc.blocks = vec![
            OcrTextBlock {
                text: "Back".into(),
                confidence: Some(0.95),
                bbox: None,
                kind: OcrBlockKind::UiChrome,
            },
            OcrTextBlock {
                text: "Actual content".into(),
                confidence: Some(0.95),
                bbox: None,
                kind: OcrBlockKind::Paragraph,
            },
        ];

        let backend = Arc::new(FixtureOcrBackend::new().with_default_document(fixture_doc));
        let config = ScreenshotIngesterConfig::default().with_ui_chrome(false);
        let ingester = ScreenshotIngester::with_config(backend, config);

        let doc = ingester.ingest_file(&path).await.unwrap();
        assert_eq!(doc.chunks.len(), 1);
        assert_eq!(doc.chunks[0].content, "Actual content");
        assert!(!doc.indexing_text().contains("Back"));
    }

    // A missing path must fail fast with the "does not exist" ingest error
    // before the OCR backend is ever consulted.
    #[tokio::test]
    async fn ingest_screenshot_errors_for_missing_file() {
        let backend = Arc::new(FixtureOcrBackend::new());
        let ingester = ScreenshotIngester::new(backend);

        let err = ingester
            .ingest_file(Path::new("/tmp/does-not-exist-screenshot.png"))
            .await
            .unwrap_err();
        assert!(err.to_string().contains("does not exist"));
    }
}
462}