Skip to main content

trueno_rag/loader/
mod.rs

//! Document loading abstraction for pluggable file format support.
//!
//! The [`DocumentLoader`] trait decouples file format handling from the
//! RAG pipeline. Built-in loaders handle text (`.txt`, `.md`) and
//! subtitle (`.srt`, `.vtt`) formats. Third parties can implement
//! `DocumentLoader` for any format.
//!
//! The [`LoaderRegistry`] dispatches loading to the appropriate loader
//! based on file extension, with support for sidecar subtitle files
//! adjacent to media files.
//!
//! # Example
//!
//! ```rust
//! use trueno_rag::loader::LoaderRegistry;
//! use std::path::Path;
//!
//! let registry = LoaderRegistry::new();
//! let extensions = registry.supported_extensions();
//! assert!(extensions.contains(&"txt"));
//! assert!(extensions.contains(&"srt"));
//! ```
23
24#[cfg(feature = "ocr")]
25mod image;
26mod subtitle;
27mod text;
28#[cfg(feature = "transcription")]
29pub mod transcription;
30
31#[cfg(feature = "ocr")]
32pub use image::ImageLoader;
33pub use subtitle::SubtitleLoader;
34pub use text::TextLoader;
35#[cfg(feature = "transcription")]
36pub use transcription::TranscriptionLoader;
37
38use crate::{Document, Error, Result};
39use std::path::Path;
40
41/// Abstraction for loading files of any format into Documents.
42///
43/// Implementors handle format detection, parsing, and conversion
44/// to the standard `Document` representation. A loader may support
45/// multiple file extensions.
46pub trait DocumentLoader: Send + Sync {
47    /// File extensions this loader handles (lowercase, without dot).
48    fn supported_extensions(&self) -> Vec<&str>;
49
50    /// Returns true if this loader can handle the given path.
51    ///
52    /// Default implementation checks the file extension against
53    /// [`supported_extensions()`](DocumentLoader::supported_extensions).
54    fn can_load(&self, path: &Path) -> bool {
55        path.extension()
56            .and_then(|ext| ext.to_str())
57            .map(|ext| {
58                let lower = ext.to_lowercase();
59                self.supported_extensions().iter().any(|s| *s == lower)
60            })
61            .unwrap_or(false)
62    }
63
64    /// Load a file and produce a Document.
65    ///
66    /// The returned Document should have:
67    /// - `content`: The extracted text
68    /// - `source`: The file path
69    /// - `title`: Derived from filename or embedded metadata
70    /// - `metadata`: Format-specific fields
71    fn load(&self, path: &Path) -> Result<Document>;
72}
73
74/// Registry that dispatches file loading to the appropriate [`DocumentLoader`].
75///
76/// Comes pre-loaded with [`TextLoader`] and [`SubtitleLoader`].
77/// Register additional loaders with [`register`](LoaderRegistry::register).
78pub struct LoaderRegistry {
79    loaders: Vec<Box<dyn DocumentLoader>>,
80}
81
82impl LoaderRegistry {
83    /// Create a registry with default loaders (text and subtitle).
84    #[must_use]
85    pub fn new() -> Self {
86        let mut registry = Self { loaders: Vec::new() };
87        registry.register(Box::new(TextLoader));
88        registry.register(Box::new(SubtitleLoader));
89        #[cfg(feature = "ocr")]
90        registry.register(Box::new(ImageLoader));
91        registry
92    }
93
94    /// Register a custom loader.
95    pub fn register(&mut self, loader: Box<dyn DocumentLoader>) {
96        self.loaders.push(loader);
97    }
98
99    /// Find the first loader that can handle the given path.
100    #[must_use]
101    pub fn loader_for(&self, path: &Path) -> Option<&dyn DocumentLoader> {
102        self.loaders.iter().find(|l| l.can_load(path)).map(|l| l.as_ref())
103    }
104
105    /// Load a document, selecting the appropriate loader automatically.
106    pub fn load(&self, path: &Path) -> Result<Document> {
107        let loader = self.loader_for(path).ok_or_else(|| {
108            Error::InvalidInput(format!("No loader registered for: {}", path.display()))
109        })?;
110        loader.load(path)
111    }
112
113    /// Check if a sidecar subtitle file exists for a media file.
114    ///
115    /// Returns the sidecar path if found (prefers `.srt` over `.vtt`).
116    #[must_use]
117    pub fn find_sidecar(media_path: &Path) -> Option<std::path::PathBuf> {
118        for ext in &["srt", "vtt"] {
119            let sidecar = media_path.with_extension(ext);
120            if sidecar.exists() {
121                return Some(sidecar);
122            }
123        }
124        None
125    }
126
127    /// All supported extensions across all registered loaders.
128    #[must_use]
129    pub fn supported_extensions(&self) -> Vec<&str> {
130        self.loaders.iter().flat_map(|l| l.supported_extensions()).collect()
131    }
132}
133
134impl Default for LoaderRegistry {
135    fn default() -> Self {
136        Self::new()
137    }
138}
139
140impl std::fmt::Debug for LoaderRegistry {
141    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
142        f.debug_struct("LoaderRegistry")
143            .field("loader_count", &self.loaders.len())
144            .field("extensions", &self.supported_extensions())
145            .finish()
146    }
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Creates an empty scratch directory for one test.
    ///
    /// The directory name combines `label` with the current process id so
    /// that two test runs sharing the same temp dir cannot delete each
    /// other's files. Panics if the directory cannot be created, so setup
    /// failures surface where they happen.
    fn scratch_dir(label: &str) -> std::path::PathBuf {
        let dir = std::env::temp_dir()
            .join(format!("trueno_rag_test_{label}_{}", std::process::id()));
        // Clear anything left behind by an earlier run that reused this pid.
        let _ = std::fs::remove_dir_all(&dir);
        std::fs::create_dir_all(&dir).expect("failed to create scratch directory");
        dir
    }

    #[test]
    fn test_registry_default_loaders() {
        let registry = LoaderRegistry::new();
        let exts = registry.supported_extensions();
        assert!(exts.contains(&"txt"));
        assert!(exts.contains(&"md"));
        assert!(exts.contains(&"srt"));
        assert!(exts.contains(&"vtt"));
    }

    #[test]
    fn test_registry_loader_for_txt() {
        let registry = LoaderRegistry::new();
        assert!(registry.loader_for(Path::new("file.txt")).is_some());
        // Extension matching is case-insensitive.
        assert!(registry.loader_for(Path::new("file.TXT")).is_some());
    }

    #[test]
    fn test_registry_loader_for_srt() {
        let registry = LoaderRegistry::new();
        assert!(registry.loader_for(Path::new("file.srt")).is_some());
    }

    #[test]
    fn test_registry_no_loader_for_unknown() {
        let registry = LoaderRegistry::new();
        assert!(registry.loader_for(Path::new("file.xyz")).is_none());
    }

    #[test]
    fn test_registry_load_missing_file() {
        let registry = LoaderRegistry::new();
        let result = registry.load(Path::new("/nonexistent/file.txt"));
        assert!(result.is_err());
    }

    #[test]
    fn test_registry_load_unsupported_format() {
        let registry = LoaderRegistry::new();
        let result = registry.load(Path::new("file.mp4"));
        assert!(result.is_err());
    }

    #[test]
    fn test_find_sidecar_none() {
        // An empty scratch directory guarantees that no sidecar can exist.
        let dir = scratch_dir("sidecar_none");
        let found = LoaderRegistry::find_sidecar(&dir.join("nonexistent_video_12345.mp4"));
        let _ = std::fs::remove_dir_all(&dir);
        assert!(found.is_none());
    }

    #[test]
    fn test_registry_custom_loader() {
        struct DummyLoader;
        impl DocumentLoader for DummyLoader {
            fn supported_extensions(&self) -> Vec<&str> {
                vec!["xyz"]
            }
            fn load(&self, path: &Path) -> Result<Document> {
                Ok(Document::new("dummy").with_source(path.to_string_lossy()))
            }
        }

        let mut registry = LoaderRegistry::new();
        registry.register(Box::new(DummyLoader));
        assert!(registry.loader_for(Path::new("test.xyz")).is_some());
    }

    #[test]
    fn test_registry_debug() {
        let registry = LoaderRegistry::new();
        let debug = format!("{registry:?}");
        assert!(debug.contains("LoaderRegistry"));
        assert!(debug.contains("loader_count"));
    }

    #[test]
    fn test_registry_default() {
        let registry = LoaderRegistry::default();
        assert!(!registry.supported_extensions().is_empty());
    }

    #[test]
    fn test_find_sidecar_srt_preferred() {
        // Create a media file with both kinds of sidecar next to it.
        let dir = scratch_dir("sidecar");
        let video = dir.join("lecture.mp4");
        let srt = dir.join("lecture.srt");
        let vtt = dir.join("lecture.vtt");
        std::fs::write(&video, b"").unwrap();
        std::fs::write(&srt, b"").unwrap();
        std::fs::write(&vtt, b"").unwrap();

        let found = LoaderRegistry::find_sidecar(&video);

        // Clean up before asserting so a failure does not leave files behind.
        let _ = std::fs::remove_dir_all(&dir);

        // SRT is preferred over VTT.
        assert_eq!(found, Some(srt));
    }

    #[test]
    fn test_can_load_no_extension() {
        let loader = TextLoader;
        assert!(!loader.can_load(Path::new("Makefile")));
    }

    #[test]
    fn test_load_real_txt_file() {
        let dir = scratch_dir("load_txt");
        let file = dir.join("test.txt");
        std::fs::write(&file, "Hello from test file.").unwrap();

        let registry = LoaderRegistry::new();
        let loaded = registry.load(&file);

        // Clean up before asserting so a failure does not leave files behind.
        let _ = std::fs::remove_dir_all(&dir);

        let doc = loaded.unwrap();
        assert_eq!(doc.content, "Hello from test file.");
        assert!(doc.title.is_some());
    }

    #[test]
    fn test_load_real_srt_file() {
        let dir = scratch_dir("load_srt");
        let file = dir.join("test.srt");
        std::fs::write(&file, "1\n00:00:01,000 --> 00:00:04,500\nHello from subtitle.\n").unwrap();

        let registry = LoaderRegistry::new();
        let loaded = registry.load(&file);

        // Clean up before asserting so a failure does not leave files behind.
        let _ = std::fs::remove_dir_all(&dir);

        let doc = loaded.unwrap();
        assert!(doc.content.contains("Hello from subtitle"));
        assert!(doc.metadata.contains_key("subtitle_cues"));
    }
}