subparse/formats/
microdvd.rs

1// This Source Code Form is subject to the terms of the Mozilla Public
2// License, v. 2.0. If a copy of the MPL was not distributed with this
3// file, You can obtain one at http://mozilla.org/MPL/2.0/.
4
5use self::errors::ErrorKind::*;
6use self::errors::*;
7use crate::{SubtitleEntry, SubtitleFileInterface};
8
9use crate::errors::Result as SubtitleParserResult;
10use crate::formats::common::*;
11use combine::char::char;
12use combine::combinator::{eof, many, parser as p, satisfy, sep_by};
13use combine::primitives::Parser;
14
15use itertools::Itertools;
16use std::borrow::Cow;
17use std::collections::HashSet;
18
19use failure::ResultExt;
20
21use crate::timetypes::{TimePoint, TimeSpan};
22use std::collections::LinkedList;
23
24/// Errors specific to `.sub`(`MicroDVD`)-parsing.
25#[allow(missing_docs)]
26pub mod errors {
27    pub type Result<T> = std::result::Result<T, Error>;
28
29    define_error!(Error, ErrorKind);
30
31    #[derive(PartialEq, Debug, Fail)]
32    pub enum ErrorKind {
33        #[fail(display = "expected subtittle line, found `{}`", line)]
34        ExpectedSubtitleLine { line: String },
35        #[fail(display = "parse error at line `{}`", line_num)]
36        ErrorAtLine { line_num: usize },
37    }
38}
39
/// A formatting directive such as "{y:i}" (display text in italics).
///
/// The stored string always starts with a lowercase character (see the `From<String>`
/// implementation); whether it is written as single-line or multi-line formatting is
/// decided when it is serialized with `to_formatting_string`.
///
/// TODO: `MdvdFormatting` is a stub for the future where this enum holds specialized variants for different options.
#[derive(Debug, Clone, Hash, PartialEq, Eq)]
enum MdvdFormatting {
    /// A format option that is not directly supported.
    Unknown(String),
}

impl From<String> for MdvdFormatting {
    /// Wraps a raw formatting string (the text between `{` and `}`), lowercasing its
    /// first character so that `Y:i` and `y:i` compare equal.
    fn from(f: String) -> Self {
        let normalized = Self::lowercase_first_char(&f);
        MdvdFormatting::Unknown(normalized)
    }
}

impl MdvdFormatting {
    /// Returns `true` for a multi-line formatting (e.g. `Y:i`, first character uppercase)
    /// and `false` for a single-line formatting (e.g. `y:i`) or an empty string.
    fn is_container_line_formatting(f: &str) -> bool {
        match f.chars().next() {
            Some(first) => first.is_uppercase(),
            None => false,
        }
    }

    /// Returns a copy of `s` whose first character went through `to_lowercase()`;
    /// all following characters are copied unchanged.
    fn lowercase_first_char(s: &str) -> String {
        let mut rest = s.chars();
        if let Some(first) = rest.next() {
            let mut out: String = first.to_lowercase().collect();
            out.push_str(rest.as_str());
            out
        } else {
            String::new()
        }
    }

    /// Returns a copy of `s` whose first character went through `to_uppercase()`;
    /// all following characters are copied unchanged.
    fn uppercase_first_char(s: &str) -> String {
        let mut rest = s.chars();
        if let Some(first) = rest.next() {
            let mut out: String = first.to_uppercase().collect();
            out.push_str(rest.as_str());
            out
        } else {
            String::new()
        }
    }

    /// Returns the stored formatting string without adjusting the case of its first character.
    fn to_formatting_string_intern(&self) -> String {
        match self {
            MdvdFormatting::Unknown(raw) => raw.clone(),
        }
    }

    /// Convert a `MdvdFormatting` to a string which can be used in `.sub` files.
    ///
    /// With `multiline == true` the first character is uppercased (formatting for the
    /// whole container line), otherwise it is lowercased (formatting for a single line).
    fn to_formatting_string(&self, multiline: bool) -> String {
        let adjust_case: fn(&str) -> String = if multiline {
            Self::uppercase_first_char
        } else {
            Self::lowercase_first_char
        };
        adjust_case(&self.to_formatting_string_intern())
    }
}
95
#[derive(Debug, Clone)]
/// Represents a reconstructable `.sub`(`MicroDVD`) file.
///
/// The file stores frame numbers, not timestamps; `fps` is what allows converting
/// between the two.
pub struct MdvdFile {
    /// Number of frames per second of the associated video, as passed to `MdvdFile::parse`
    /// (the original note mentions a default of 25; no default is applied in this file)
    /// -> start/end frames can be converted to timestamps
    fps: f64,

    /// All lines of the file; a file line containing several `|`-separated parts is stored
    /// as one `MdvdLine` per part (all sharing the same start/end frame).
    v: Vec<MdvdLine>,
}
106
/// Holds the description of a single subtitle line, i.e. one `|`-separated part of a
/// file line like "{0}{25}{y:i}Hello!|Hello2!".
#[derive(Debug, Clone)]
struct MdvdLine {
    /// The start frame.
    start_frame: i64,

    /// The end frame.
    end_frame: i64,

    /// All formatting that applies to this line: the multi-line (container) formatting of
    /// the file line it came from, followed by its own single-line formatting.
    formatting: Vec<MdvdFormatting>,

    /// The (dialog) text of the line.
    text: String,
}
122
123impl MdvdLine {
124    fn to_subtitle_entry(&self, fps: f64) -> SubtitleEntry {
125        SubtitleEntry {
126            timespan: TimeSpan::new(
127                TimePoint::from_msecs((self.start_frame as f64 * 1000.0 / fps) as i64),
128                TimePoint::from_msecs((self.end_frame as f64 * 1000.0 / fps) as i64),
129            ),
130            line: Some(self.text.clone()),
131        }
132    }
133}
134
135impl MdvdFile {
136    /// Parse a `MicroDVD` `.sub` subtitle string to `MdvdFile`.
137    pub fn parse(s: &str, fps: f64) -> SubtitleParserResult<MdvdFile> {
138        Ok(Self::parse_file(s, fps).with_context(|_| crate::ErrorKind::ParsingError)?)
139    }
140}
141
142/// Implements parse functions.
143impl MdvdFile {
144    fn parse_file(i: &str, fps: f64) -> Result<MdvdFile> {
145        let mut result: Vec<MdvdLine> = Vec::new();
146
147        // remove utf-8 bom
148        let (_, s) = split_bom(i);
149
150        for (line_num, line) in s.lines().enumerate() {
151            // a line looks like "{0}{25}{c:$0000ff}{y:b,u}{f:DeJaVuSans}{s:12}Hello!|{y:i}Hello2!" where
152            // 0 and 25 are the start and end frames and the other information is the formatting.
153            let mut lines: Vec<MdvdLine> = Self::parse_line(line_num, line)?;
154            result.append(&mut lines);
155        }
156
157        Ok(MdvdFile { fps: fps, v: result })
158    }
159
160    // Parses something like "{0}{25}{C:$0000ff}{y:b,u}{f:DeJaVuSans}{s:12}Hello!|{s:15}Hello2!"
161    fn parse_line(line_num: usize, line: &str) -> Result<Vec<MdvdLine>> {
162        // Matches the regex "\{[^}]*\}"; parses something like "{some_info}".
163        let sub_info = (char('{'), many(satisfy(|c| c != '}')), char('}'))
164            .map(|(_, info, _): (_, String, _)| info)
165            .expected("MicroDVD info");
166
167        // Parse a single line (until separator '|'), something like "{C:$0000ff}{y:b,u}{f:DeJaVuSans}{s:12}Hello!"
168        // Returns the a tuple of the multiline-formatting, the single-line formatting and the text of the single line.
169        let single_line = (many(sub_info), many(satisfy(|c| c != '|')));
170
171        // the '|' char splits single lines
172        (
173            char('{'),
174            p(number_i64),
175            char('}'),
176            char('{'),
177            p(number_i64),
178            char('}'),
179            sep_by(single_line, char('|')),
180            eof(),
181        )
182            .map(|(_, start_frame, _, _, end_frame, _, fmt_strs_and_lines, ())| (start_frame, end_frame, fmt_strs_and_lines))
183            .map(|(start_frame, end_frame, fmt_strs_and_lines): (i64, i64, Vec<(Vec<String>, String)>)| {
184                Self::construct_mdvd_lines(start_frame, end_frame, fmt_strs_and_lines)
185            })
186            .parse(line)
187            .map(|x| x.0)
188            .map_err(|_| Error::from(ExpectedSubtitleLine { line: line.to_string() }))
189            .with_context(|_| ErrorAtLine { line_num })
190            .map_err(Error::from)
191    }
192
193    /// Construct (possibly multiple) `MdvdLines` from a deconstructed file line
194    /// like "{C:$0000ff}{y:b,u}{f:DeJaVuSans}{s:12}Hello!|{s:15}Hello2!".
195    ///
196    /// The third parameter is for the example
197    /// like `[(["C:$0000ff", "y:b,u", "f:DeJaVuSans", "s:12"], "Hello!"), (["s:15"], "Hello2!")].
198    fn construct_mdvd_lines(start_frame: i64, end_frame: i64, fmt_strs_and_lines: Vec<(Vec<String>, String)>) -> Vec<MdvdLine> {
199        // saves all multiline formatting
200        let mut cline_fmts: Vec<MdvdFormatting> = Vec::new();
201
202        // convert the formatting strings to `MdvdFormatting` objects and split between multi-line and single-line formatting
203        let fmts_and_lines = fmt_strs_and_lines
204            .into_iter()
205            .map(|(fmts, text)| (Self::string_to_formatting(&mut cline_fmts, fmts), text))
206            .collect::<Vec<_>>();
207
208        // now we also have all multi-line formattings in `cline_fmts`
209
210        // finish creation of `MdvdLine`s
211        fmts_and_lines
212            .into_iter()
213            .map(|(sline_fmts, text)| MdvdLine {
214                start_frame: start_frame,
215                end_frame: end_frame,
216                text: text,
217                formatting: cline_fmts.clone().into_iter().chain(sline_fmts.into_iter()).collect(),
218            })
219            .collect()
220    }
221
222    /// Convert `MicroDVD` formatting strings to `MdvdFormatting` objects.
223    ///
224    /// Move multiline formattings and single line formattings into different vectors.
225    fn string_to_formatting(multiline_formatting: &mut Vec<MdvdFormatting>, fmts: Vec<String>) -> Vec<MdvdFormatting> {
226        // split multiline-formatting (e.g "Y:b") and single-line formatting (e.g "y:b")
227        let (cline_fmts_str, sline_fmts_str): (Vec<_>, Vec<_>) = fmts
228            .into_iter()
229            .partition(|fmt_str| MdvdFormatting::is_container_line_formatting(fmt_str));
230
231        multiline_formatting.extend(&mut cline_fmts_str.into_iter().map(MdvdFormatting::from));
232        sline_fmts_str.into_iter().map(MdvdFormatting::from).collect()
233    }
234}
235
236impl SubtitleFileInterface for MdvdFile {
237    fn get_subtitle_entries(&self) -> SubtitleParserResult<Vec<SubtitleEntry>> {
238        Ok(self.v.iter().map(|line| line.to_subtitle_entry(self.fps)).collect())
239    }
240
241    fn update_subtitle_entries(&mut self, new_subtitle_entries: &[SubtitleEntry]) -> SubtitleParserResult<()> {
242        assert_eq!(new_subtitle_entries.len(), self.v.len());
243
244        let mut iter = new_subtitle_entries.iter().peekable();
245        for line in &mut self.v {
246            let peeked = iter.next().unwrap();
247
248            line.start_frame = (peeked.timespan.start.secs_f64() * self.fps) as i64;
249            line.end_frame = (peeked.timespan.end.secs_f64() * self.fps) as i64;
250
251            if let Some(ref text) = peeked.line {
252                line.text = text.clone();
253            }
254        }
255
256        Ok(())
257    }
258
259    fn to_data(&self) -> SubtitleParserResult<Vec<u8>> {
260        let mut sorted_list = self.v.clone();
261        sorted_list.sort_by_key(|line| (line.start_frame, line.end_frame));
262
263        let mut result: LinkedList<Cow<'static, str>> = LinkedList::new();
264
265        for (gi, group_iter) in sorted_list
266            .into_iter()
267            .group_by(|line| (line.start_frame, line.end_frame))
268            .into_iter()
269            .enumerate()
270        {
271            if gi != 0 {
272                result.push_back("\n".into());
273            }
274
275            let group: Vec<MdvdLine> = group_iter.1.collect();
276            let group_len = group.len();
277
278            let (start_frame, end_frame) = group_iter.0;
279            let (formattings, texts): (Vec<HashSet<MdvdFormatting>>, Vec<String>) =
280                group.into_iter().map(|line| (line.formatting.into_iter().collect(), line.text)).unzip();
281
282            // all single lines in the container line "cline" have the same start and end time
283            //  -> the .sub file format let's them be on the same line with "{0}{1000}Text1|Text2"
284
285            // find common formatting in all lines
286            let common_formatting = if group_len == 1 {
287                // if this "group" only has a single line, let's say that every formatting is individual
288                HashSet::new()
289            } else {
290                formattings
291                    .iter()
292                    .fold(None, |acc, set| match acc {
293                        None => Some(set.clone()),
294                        Some(acc_set) => Some(acc_set.intersection(set).cloned().collect()),
295                    })
296                    .unwrap()
297            };
298
299            let individual_formattings = formattings
300                .into_iter()
301                .map(|formatting| formatting.difference(&common_formatting).cloned().collect())
302                .collect::<Vec<HashSet<MdvdFormatting>>>();
303
304            result.push_back("{".into());
305            result.push_back(start_frame.to_string().into());
306            result.push_back("}".into());
307
308            result.push_back("{".into());
309            result.push_back(end_frame.to_string().into());
310            result.push_back("}".into());
311
312            for formatting in &common_formatting {
313                result.push_back("{".into());
314                result.push_back(formatting.to_formatting_string(true).into());
315                result.push_back("}".into());
316            }
317
318            for (i, (individual_formatting, text)) in individual_formattings.into_iter().zip(texts.into_iter()).enumerate() {
319                if i != 0 {
320                    result.push_back("|".into());
321                }
322
323                for formatting in individual_formatting {
324                    result.push_back("{".into());
325                    result.push_back(formatting.to_formatting_string(false).into());
326                    result.push_back("}".into());
327                }
328
329                result.push_back(text.into());
330            }
331
332            // ends "group-by-frametime"-loop
333        }
334
335        Ok(result.into_iter().map(|cow| cow.to_string()).collect::<String>().into_bytes())
336    }
337}
338
#[cfg(test)]
mod tests {
    use super::*;
    use SubtitleFileInterface;

    /// Parse string with `MdvdFile`, and reencode it with `MdvdFile`.
    fn mdvd_reconstruct(s: &str) -> String {
        let file = MdvdFile::parse(s, 25.0).unwrap();
        let data = file.to_data().unwrap();
        String::from_utf8(data).unwrap()
    }

    /// Parse and re-construct `MicroDVD` files and test them against expected output.
    fn test_mdvd(input: &str, expected: &str) {
        // if we put the `input` into the parser, we expect a specific (cleaned-up) output
        assert_eq!(mdvd_reconstruct(input), expected);

        // if we reconstruct the cleaned-up output, we expect that nothing changes
        assert_eq!(mdvd_reconstruct(expected), expected);
    }

    #[test]
    fn mdvd_test_reconstruction() {
        // simple examples
        test_mdvd("{0}{25}Hello!", "{0}{25}Hello!");
        test_mdvd("{0}{25}{y:i}Hello!", "{0}{25}{y:i}Hello!");
        test_mdvd("{0}{25}{Y:i}Hello!", "{0}{25}{y:i}Hello!");
        test_mdvd("{0}{25}{Y:i}\n", "{0}{25}{y:i}");

        // cleanup formattings in a file
        test_mdvd("{0}{25}{y:i}Text1|{y:i}Text2", "{0}{25}{Y:i}Text1|Text2");
        test_mdvd("{0}{25}{y:i}Text1\n{0}{25}{y:i}Text2", "{0}{25}{Y:i}Text1|Text2");
        test_mdvd("{0}{25}{y:i}{y:b}Text1\n{0}{25}{y:i}Text2", "{0}{25}{Y:i}{y:b}Text1|Text2");
        // mirrored case: the additional formatting belongs to the second line
        test_mdvd("{0}{25}{y:i}Text1\n{0}{25}{y:i}{y:b}Text2", "{0}{25}{Y:i}Text1|{y:b}Text2");

        // these can't be condensed, because the lines have different times
        test_mdvd("{0}{25}{y:i}Text1\n{0}{26}{y:i}Text2", "{0}{25}{y:i}Text1\n{0}{26}{y:i}Text2");
    }
}
377}