Skip to main content

trueno_rag/loader/
subtitle.rs

1//! Subtitle file loader for `.srt` and `.vtt` files.
2
3use crate::media::parse_subtitles;
4use crate::{Document, Result};
5use std::collections::HashMap;
6use std::path::Path;
7
8use super::DocumentLoader;
9
/// Document loader for subtitle files.
///
/// Handles the `.srt` (SubRip) and `.vtt` (WebVTT) formats. A loaded document's
/// content is the plain text of the parsed track; the individual cues are kept
/// in `metadata["subtitle_cues"]` so that later stages can chunk by timestamp.
/// The `duration_secs`, `format` and `cue_count` metadata keys are filled in
/// as well.
///
/// The loader is a stateless unit type, so it is cheap to copy, compare and
/// construct via [`Default`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct SubtitleLoader;
18
19impl DocumentLoader for SubtitleLoader {
20    fn supported_extensions(&self) -> Vec<&str> {
21        vec!["srt", "vtt"]
22    }
23
24    fn load(&self, path: &Path) -> Result<Document> {
25        let raw = std::fs::read_to_string(path).map_err(crate::Error::Io)?;
26        let track = parse_subtitles(&raw)?;
27
28        let title = path.file_stem().and_then(|s| s.to_str()).unwrap_or("Untitled").to_string();
29
30        let mut metadata = HashMap::new();
31        metadata.insert("duration_secs".into(), serde_json::json!(track.duration_secs()));
32        metadata.insert("format".into(), serde_json::json!(track.format.to_string()));
33        metadata.insert("cue_count".into(), serde_json::json!(track.cues.len()));
34        metadata.insert(
35            "subtitle_cues".into(),
36            serde_json::to_value(&track.cues).map_err(crate::Error::Serialization)?,
37        );
38
39        let mut doc = Document::new(track.to_plain_text())
40            .with_title(title)
41            .with_source(path.to_string_lossy());
42        doc.metadata = metadata;
43        Ok(doc)
44    }
45}
46
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// A subtitle file written into its own scratch directory below the system
    /// temp dir.
    ///
    /// The directory is deleted when the fixture is dropped, so cleanup also
    /// happens when an assertion in the test panics.
    struct Fixture {
        dir: PathBuf,
        file: PathBuf,
    }

    impl Fixture {
        /// Creates `<tmp>/<dir_name>_<pid>/<file_name>` holding `contents`.
        ///
        /// The process id keeps concurrently running test binaries from
        /// sharing (and deleting) each other's scratch directory.
        fn new(dir_name: &str, file_name: &str, contents: &str) -> Self {
            let dir = std::env::temp_dir().join(format!("{}_{}", dir_name, std::process::id()));
            std::fs::create_dir_all(&dir).unwrap();
            let file = dir.join(file_name);
            std::fs::write(&file, contents).unwrap();
            Self { dir, file }
        }

        /// Loads the fixture file through `SubtitleLoader`, panicking on error.
        fn load(&self) -> Document {
            SubtitleLoader.load(&self.file).unwrap()
        }
    }

    impl Drop for Fixture {
        fn drop(&mut self) {
            // Best effort: a leftover scratch directory must not fail the test.
            let _ = std::fs::remove_dir_all(&self.dir);
        }
    }

    #[test]
    fn test_subtitle_loader_extensions() {
        let loader = SubtitleLoader;
        let exts = loader.supported_extensions();
        assert!(exts.contains(&"srt"));
        assert!(exts.contains(&"vtt"));
    }

    #[test]
    fn test_subtitle_loader_can_load() {
        let loader = SubtitleLoader;
        assert!(loader.can_load(Path::new("lecture.srt")));
        assert!(loader.can_load(Path::new("captions.VTT")));
        assert!(!loader.can_load(Path::new("file.txt")));
    }

    #[test]
    fn test_subtitle_loader_load_srt() {
        let fixture = Fixture::new(
            "trueno_rag_test_sub_loader_srt",
            "lecture.srt",
            concat!(
                "1\n",
                "00:00:01,000 --> 00:00:04,500\n",
                "First cue text.\n",
                "\n",
                "2\n",
                "00:00:05,000 --> 00:00:09,200\n",
                "Second cue text.\n",
            ),
        );

        let doc = fixture.load();

        assert!(doc.content.contains("First cue text."));
        assert!(doc.content.contains("Second cue text."));
        assert_eq!(doc.title.as_deref(), Some("lecture"));
        assert!(doc.metadata.contains_key("duration_secs"));
        assert!(doc.metadata.contains_key("subtitle_cues"));
        assert_eq!(doc.metadata["cue_count"], serde_json::json!(2));
        assert_eq!(doc.metadata["format"], serde_json::json!("srt"));
    }

    #[test]
    fn test_subtitle_loader_load_vtt() {
        let fixture = Fixture::new(
            "trueno_rag_test_sub_loader_vtt",
            "captions.vtt",
            concat!(
                "WEBVTT\n",
                "\n",
                "00:00:01.000 --> 00:00:04.500\n",
                "VTT cue text.\n",
            ),
        );

        let doc = fixture.load();

        assert!(doc.content.contains("VTT cue text"));
        assert_eq!(doc.metadata["format"], serde_json::json!("vtt"));
    }

    #[test]
    fn test_subtitle_loader_metadata_duration() {
        let fixture = Fixture::new(
            "trueno_rag_test_sub_duration",
            "timed.srt",
            concat!(
                "1\n",
                "00:01:00,000 --> 00:02:30,000\n",
                "One minute in.\n",
            ),
        );

        let doc = fixture.load();

        // The only cue ends at 00:02:30, i.e. 150 seconds into the track.
        let duration = doc.metadata["duration_secs"].as_f64().unwrap();
        assert!((duration - 150.0).abs() < 0.1);
    }

    #[test]
    fn test_subtitle_loader_missing_file() {
        let loader = SubtitleLoader;
        let result = loader.load(Path::new("/nonexistent/file.srt"));
        assert!(result.is_err());
    }

    #[test]
    fn test_subtitle_loader_cues_deserializable() {
        let fixture = Fixture::new(
            "trueno_rag_test_sub_cues_deser",
            "test.srt",
            "1\n00:00:01,000 --> 00:00:04,500\nHello.\n\n2\n00:00:05,000 --> 00:00:09,000\nWorld.\n",
        );

        let doc = fixture.load();

        // The stored JSON must round-trip back into the cue type.
        let cues: Vec<crate::media::SubtitleCue> =
            serde_json::from_value(doc.metadata["subtitle_cues"].clone()).unwrap();
        assert_eq!(cues.len(), 2);
        assert_eq!(cues[0].text, "Hello.");
        assert_eq!(cues[1].text, "World.");
    }
}