substudy/
srt.rs

1//! SRT-format subtitle support.
2
3use std::{fs::File, io::Read as _, path::Path};
4
5use anyhow::Context as _;
6use serde::Serialize;
7
8use crate::{
9    clean::{clean_subtitle_file, strip_formatting},
10    decode::smart_decode,
11    grammar,
12    lang::Lang,
13    time::Period,
14    Result,
15};
16
/// Format seconds using the standard SRT time format (`HH:MM:SS,mmm`).
///
/// The value is rounded to the nearest millisecond *before* it is split
/// into fields, so a rounding carry propagates into the seconds, minutes
/// and hours: `59.9996` becomes `00:01:00,000`, never `00:00:60,000`.
///
/// SRT has no notation for negative timestamps, so negative (and NaN)
/// inputs are clamped to `00:00:00,000`.
pub fn format_time(time: f32) -> String {
    // Widening to `f64` keeps the multiplication by 1000 exact for any
    // finite `f32`.  The float-to-integer cast saturates, which is what
    // maps negative and NaN values to zero.
    let total_ms = (f64::from(time) * 1000.0).round() as u64;
    let (h, rem) = (total_ms / 3_600_000, total_ms % 3_600_000);
    let (m, rem) = (rem / 60_000, rem % 60_000);
    let (s, ms) = (rem / 1_000, rem % 1_000);
    format!("{:02}:{:02}:{:02},{:03}", h, m, s, ms)
}
23
24/// A single SRT-format subtitle, minus some of the optional fields used in
25/// various versions of the file format.
26#[derive(Debug, PartialEq, Clone, Serialize)]
27pub struct Subtitle {
28    /// The index of this subtitle.  We should normalize these to start
29    /// with 1 on output.
30    pub index: usize,
31
32    /// The time period during which this subtitle is shown.
33    pub period: Period,
34
35    /// The lines of text in this subtitle.
36    pub lines: Vec<String>,
37}
38
39impl Subtitle {
40    /// Return a string representation of this subtitle.
41    pub fn to_string(&self) -> String {
42        format!(
43            "{}\n{} --> {}\n{}\n",
44            self.index,
45            format_time(self.period.begin()),
46            format_time(self.period.end()),
47            self.lines.join("\n")
48        )
49    }
50
51    /// Return a plain-text version of this subtitle.
52    pub fn plain_text(&self) -> String {
53        strip_formatting(&self.lines.join(" ")).into_owned()
54    }
55}
56
57/// The contents of an SRT-format subtitle file.
58#[derive(Debug, PartialEq)]
59pub struct SubtitleFile {
60    /// The subtitles in this file.
61    pub subtitles: Vec<Subtitle>,
62}
63
64impl SubtitleFile {
65    /// Parse raw subtitle text into an appropriate structure.
66    pub fn from_str(data: &str) -> Result<SubtitleFile> {
67        // Use `trim_left_matches` to remove the leading BOM ("byte order mark")
68        // that's present in much Windows UTF-8 data. Note that if it appears
69        // multiple times, we would remove all the copies, but we've never seen
70        // that in the wild.
71        Ok(grammar::subtitle_file(data.trim_start_matches("\u{FEFF}"))
72            .context("could not parse subtitles")?)
73    }
74
75    /// Parse the subtitle file found at the specified path.
76    pub fn from_path(path: &Path) -> Result<SubtitleFile> {
77        let mut file = File::open(path)
78            .with_context(|| format!("could not open {}", path.display()))?;
79        let mut bytes = Vec::new();
80        file.read_to_end(&mut bytes)
81            .with_context(|| format!("could not read {}", path.display()))?;
82        let data = smart_decode(&bytes)
83            .with_context(|| format!("could not read {}", path.display()))?;
84        Ok(SubtitleFile::from_str(&data)
85            .with_context(|| format!("could not parse {}", path.display()))?)
86    }
87
88    /// Parse and normalize the subtitle file found at the specified path.
89    pub fn cleaned_from_path(path: &Path) -> Result<SubtitleFile> {
90        let raw = SubtitleFile::from_path(path)?;
91        Ok(clean_subtitle_file(&raw)?)
92    }
93
94    /// Convert subtitles to a string.
95    pub fn to_string(&self) -> String {
96        let subs: Vec<String> = self.subtitles.iter().map(|s| s.to_string()).collect();
97        // The BOM (byte-order mark) is generally discouraged on Linux, but
98        // it's sometimes needed to get good results under Windows.  We
99        // include it here because Wikipedia says that SRT files files
100        // default to various legacy encoding, but that the BOM can be used
101        // for Unicode.
102        format!("\u{FEFF}{}", subs.join("\n"))
103    }
104
105    /// Find the subtitle with the given index.
106    pub fn find(&self, index: usize) -> Option<&Subtitle> {
107        self.subtitles.iter().find(|s| s.index == index)
108    }
109
110    /// Detect the language used in these subtitles.
111    pub fn detect_language(&self) -> Option<Lang> {
112        let subs: Vec<_> = self.subtitles.iter().map(|s| s.plain_text()).collect();
113        let text = subs.join("\n");
114        Lang::for_text(&text)
115    }
116}
117
#[cfg(test)]
mod test {
    use std::path::Path;

    use crate::{
        lang::Lang,
        srt::{Subtitle, SubtitleFile},
        time::Period,
    };

    /// Parsing the Spanish fixture yields five subtitles; spot-check the
    /// first entry's index, timing and text, and the third entry's
    /// two-line text.
    #[test]
    fn subtitle_file_from_path() {
        let parsed = SubtitleFile::from_path(Path::new("fixtures/sample.es.srt")).unwrap();
        assert_eq!(5, parsed.subtitles.len());

        let first = &parsed.subtitles[0];
        assert_eq!(16, first.index);
        assert_eq!(62.328, first.period.begin());
        assert_eq!(64.664, first.period.end());
        assert_eq!(vec![String::from("¡Si! ¡Aang ha vuelto!")], first.lines);

        let third = &parsed.subtitles[2];
        let third_lines = vec![
            String::from("Tu diste la señal a la armada"),
            String::from("del fuego con la bengala,"),
        ];
        assert_eq!(third_lines, third.lines);
    }

    /// A single subtitle renders as index, time range, then its lines
    /// (markup left intact), each terminated by a newline.
    #[test]
    fn subtitle_to_string() {
        let cue = Subtitle {
            index: 4,
            period: Period::new(61.5, 63.75).unwrap(),
            lines: vec![String::from("Line 1"), String::from("<i>Line 2</i>")],
        };
        let expected = String::from(
            r"4
00:01:01,500 --> 00:01:03,750
Line 1
<i>Line 2</i>
",
        );
        assert_eq!(expected, cue.to_string());
    }

    /// Parsing a BOM-prefixed file and rendering it again reproduces the
    /// input exactly.
    #[test]
    fn subtitle_file_to_string() {
        let input = "\u{FEFF}16
00:01:02,328 --> 00:01:04,664
Line 1.1

17
00:01:12,839 --> 00:01:13,839
Line 2.1
";
        let round_tripped = SubtitleFile::from_str(input).unwrap().to_string();
        assert_eq!(input, round_tripped.as_str());
    }

    /// A subtitle whose begin and end timestamps are identical still
    /// parses; the resulting period reports an end of 1.001.
    #[test]
    fn zero_duration_subtitle() {
        let input = "\u{FEFF}16
00:00:01,000 --> 00:00:01,000
Text
";
        let parsed = SubtitleFile::from_str(input).unwrap();
        assert_eq!(parsed.subtitles.len(), 1);

        let only = &parsed.subtitles[0];
        assert_eq!(only.period.begin(), 1.0);
        assert_eq!(only.period.end(), 1.001);
    }

    /// Language detection identifies the Spanish and English fixtures.
    #[test]
    fn detect_language() {
        let cases = [
            ("fixtures/sample.es.srt", "es"),
            ("fixtures/sample.en.srt", "en"),
        ];
        for &(fixture, code) in cases.iter() {
            let parsed = SubtitleFile::from_path(Path::new(fixture)).unwrap();
            assert_eq!(Some(Lang::iso639(code).unwrap()), parsed.detect_language());
        }
    }
}