Skip to main content

trueno_rag/loader/
image.rs

1//! Image loader with OCR via tesseract.
2//!
3//! Extracts text from `.png`, `.jpg`, `.jpeg`, `.tiff`, `.bmp` images
4//! by shelling out to the `tesseract` CLI. Feature-gated behind `ocr`.
5//!
6//! # Requirements
7//!
8//! - `tesseract` must be installed and on `$PATH`
9//! - English language data: `tesseract-ocr-eng` package
10
11use crate::{Document, Result};
12use std::path::Path;
13use std::process::Command;
14
15use super::DocumentLoader;
16
/// OCR-backed [`DocumentLoader`] for raster images.
///
/// This is a stateless unit type: it carries no configuration, so it is
/// cheap to copy around. Text is recovered from PNG, JPEG, TIFF, and BMP
/// files by handing them to Tesseract.
#[derive(Clone, Copy, Debug)]
pub struct ImageLoader;
22
23impl DocumentLoader for ImageLoader {
24    fn supported_extensions(&self) -> Vec<&str> {
25        vec!["png", "jpg", "jpeg", "tiff", "bmp"]
26    }
27
28    fn load(&self, path: &Path) -> Result<Document> {
29        let output = Command::new("tesseract")
30            .arg(path.as_os_str())
31            .arg("stdout")
32            .arg("--psm")
33            .arg("6") // Assume a single uniform block of text
34            .output()
35            .map_err(|e| {
36                crate::Error::InvalidConfig(format!(
37                    "Failed to run tesseract (is it installed?): {e}"
38                ))
39            })?;
40
41        if !output.status.success() {
42            let stderr = String::from_utf8_lossy(&output.stderr);
43            return Err(crate::Error::InvalidConfig(format!(
44                "Tesseract failed on {}: {}",
45                path.display(),
46                stderr.trim()
47            )));
48        }
49
50        let content = String::from_utf8_lossy(&output.stdout).trim().to_string();
51
52        let title = path.file_stem().and_then(|s| s.to_str()).unwrap_or("Untitled").to_string();
53
54        // Extract frame number from filename if it matches pattern like "frame_00123.png"
55        let mut doc = Document::new(content).with_title(title).with_source(path.to_string_lossy());
56
57        // If the filename contains timing info (e.g., "frame_00120s.png"), extract it
58        if let Some(secs) = extract_timestamp_from_filename(path) {
59            doc.metadata.insert(
60                "frame_time_secs".to_string(),
61                serde_json::Value::Number(
62                    serde_json::Number::from_f64(secs)
63                        .unwrap_or_else(|| serde_json::Number::from(0)),
64                ),
65            );
66        }
67
68        Ok(doc)
69    }
70}
71
/// Try to extract a timestamp (in seconds) from a filename like
/// "frame_00120s.png" or "frame_120.5.png".
///
/// The file stem must be `frame_` followed by a plain decimal number (ASCII
/// digits with an optional `.`), optionally terminated by a single `s`.
///
/// Returns `None` when the stem is not valid UTF-8, lacks the `frame_`
/// prefix, or the remainder is not a finite plain decimal number.
fn extract_timestamp_from_filename(path: &Path) -> Option<f64> {
    let stem = path.file_stem()?.to_str()?;
    let rest = stem.strip_prefix("frame_")?;

    // Drop at most one trailing unit marker; "120ss" is not a valid timestamp.
    let numeric = rest.strip_suffix('s').unwrap_or(rest);

    // `f64::from_str` also accepts "inf", "nan", signs and exponents, none of
    // which are part of the frame naming pattern, so restrict the alphabet
    // before parsing.
    if !numeric.bytes().all(|b| b.is_ascii_digit() || b == b'.') {
        return None;
    }

    // A very long digit run still parses successfully, but to infinity.
    numeric.parse::<f64>().ok().filter(|secs| secs.is_finite())
}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Every advertised image extension is reported by the loader.
    #[test]
    fn test_image_loader_extensions() {
        let loader = ImageLoader;
        let reported = loader.supported_extensions();
        for expected in ["png", "jpg", "jpeg", "tiff", "bmp"].iter() {
            assert!(reported.contains(expected));
        }
    }

    /// `can_load` accepts image paths and turns away other file types.
    #[test]
    fn test_image_loader_can_load() {
        let loader = ImageLoader;

        let accepted = ["slide.png", "photo.JPG"];
        for name in accepted.iter() {
            assert!(loader.can_load(Path::new(name)));
        }

        let rejected = ["video.mp4", "notes.txt"];
        for name in rejected.iter() {
            assert!(!loader.can_load(Path::new(name)));
        }
    }

    /// Timestamp parsing over a table of file names and expected results.
    #[test]
    fn test_extract_timestamp_from_filename() {
        let cases: [(&str, Option<f64>); 5] = [
            ("frame_120s.png", Some(120.0)),
            ("frame_45.5s.png", Some(45.5)),
            ("frame_0.png", Some(0.0)),
            ("slide_01.png", None),
            ("photo.jpg", None),
        ];
        for &(name, expected) in &cases {
            assert_eq!(extract_timestamp_from_filename(Path::new(name)), expected);
        }
    }

    /// Loading a path that does not exist must surface an error, whether
    /// because tesseract is absent or because it cannot open the file.
    #[test]
    fn test_image_loader_missing_file() {
        let outcome = ImageLoader.load(Path::new("/nonexistent/image.png"));
        assert!(outcome.is_err());
    }
}