// subrip 0.1.1 - Docs.rs

use std::time::Duration;

use nom::{
    bytes::complete::{is_not, tag},
    character::complete::{line_ending, one_of},
    combinator::{map, map_res, opt, recognize},
    multi::{many0, many1},
    sequence::{preceded, terminated, tuple},
    IResult,
};

use crate::{Error, Subtitle};

/// Recognizes a run of one or more ASCII decimal digits and returns the
/// matched slice unchanged (no numeric conversion happens here).
fn decimal(input: &str) -> IResult<&str, &str> {
    const DIGITS: &str = "0123456789";
    recognize(many1(one_of(DIGITS)))(input)
}

/// Matches a single Unicode byte order mark (U+FEFF) at the start of `input`.
fn bom(input: &str) -> IResult<&str, &str> {
    const BYTE_ORDER_MARK: &str = "\u{feff}";
    tag(BYTE_ORDER_MARK)(input)
}

/// Parses a run of decimal digits into a `u32`.
///
/// Fails (as a regular parser error) when there are no digits or when the
/// digits do not fit into a `u32`.
fn ascii_u32(input: &str) -> IResult<&str, u32> {
    map_res(decimal, str::parse::<u32>)(input)
}

/// Parses an SRT timestamp of the form `hours:minutes:seconds,millis`
/// (for example `02:14:08,050`) into a [`Duration`].
///
/// Each of the four fields is parsed with `ascii_u32`, so any digit count
/// that fits into a `u32` is accepted and the fields are not range-checked
/// (e.g. `90` minutes is simply added as 90 minutes).
fn timestamp(input: &str) -> IResult<&str, Duration> {
    map(
        tuple((
            ascii_u32,
            tag(":"),
            ascii_u32,
            tag(":"),
            ascii_u32,
            tag(","),
            ascii_u32,
        )),
        |(hours, _, minutes, _, seconds, _, millis)| {
            // Widen to u64 *before* multiplying: in u32 the millisecond total
            // overflows once the timestamp passes roughly 1193 hours, which
            // panics in debug builds and wraps in release builds. With four
            // u32 fields the u64 computation below cannot overflow.
            let total_secs =
                (u64::from(hours) * 60 + u64::from(minutes)) * 60 + u64::from(seconds);
            Duration::from_millis(total_secs * 1000 + u64::from(millis))
        },
    )(input)
}

fn timespan(input: &str) -> IResult<&str, (Duration, Duration)> {
    map(
        tuple((
            timestamp,
            many1(tag(" ")),
            tag("-->"),
            many1(tag(" ")),
            timestamp,
        )),
        |t| (t.0, t.4),
    )(input)
}

fn text(input: &str) -> IResult<&str, &str> {
    recognize(many0(terminated(is_not("\n\r"), line_ending)))(input)
}

fn subtitle(input: &str) -> IResult<&str, Subtitle> {
    map(
        tuple((
            ascii_u32,
            line_ending,
            timespan,
            many0(line_ending),
            text,
            many0(line_ending),
        )),
        |(idx, _, (start, end), _, text, _)| {
            let better_newlines = text.replace("\r\n", "\n");
            Subtitle {
                idx,
                start,
                end,
                text: better_newlines,
            }
        },
    )(input)
}

fn srt_file(input: &str) -> IResult<&str, Vec<Subtitle>> {
    preceded(tuple((opt(bom), many0(line_ending))), many0(subtitle))(input)
}

/// Parse some SRT formatted text.
///
/// On success every byte of `input` was consumed and the subtitles are
/// returned in the order they appear.
///
/// # Errors
///
/// Returns [`Error::ParseError`] when the input could not be consumed
/// completely. This is a common and partially recoverable outcome: the error
/// carries the subtitles that were parsed successfully, together with the
/// byte offset into `input` at which the unparsed remainder starts.
pub fn parse(input: &str) -> Result<Vec<Subtitle>, Error> {
    let (leftover, results) = srt_file(input).map_err(|_| Error::ParseError(vec![], 0))?;

    if leftover.is_empty() {
        return Ok(results);
    }

    // `leftover` is the unparsed tail of `input`, so the length difference
    // is the byte offset at which parsing stopped.
    let start_remainder = input.len() - leftover.len();
    Err(Error::ParseError(results, start_remainder))
}

// Unit tests for the individual parsers and for the public `parse` entry point.
#[cfg(test)]
mod tests {
    use super::*;

    /// A bare number is parsed completely, leaving no remaining input.
    #[test]
    fn parse_numberline() {
        let input = "231";
        let (r, n) = ascii_u32(input).unwrap();
        assert_eq!(n, 231);
        assert_eq!(r, "");
    }

    /// An all-zero timestamp maps to a zero duration.
    #[test]
    fn timestamp_zero() {
        let input = "00:00:00,000";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(t, Duration::from_millis(0));
    }

    /// Only the millisecond field is set.
    #[test]
    fn timestamp_millis() {
        let input = "00:00:00,050";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(t, Duration::from_millis(50));
    }
    /// Only the seconds field is set.
    #[test]
    fn timestamp_secs() {
        let input = "00:00:08,000";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(t, Duration::from_secs(8));
    }
    /// Only the minutes field is set.
    #[test]
    fn timestamp_min() {
        let input = "00:14:00,000";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(t, Duration::from_secs(14 * 60));
    }
    /// Only the hours field is set.
    #[test]
    fn timestamp_hrs() {
        let input = "02:00:00,000";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(t, Duration::from_secs(2 * 60 * 60));
    }
    /// All four fields are set at once.
    #[test]
    fn timestamp_random() {
        let input = "02:14:08,050";
        let (_, t) = timestamp(input).unwrap();
        assert_eq!(
            t,
            Duration::from_millis(((2 * 60 + 14) * 60 + 8) * 1000 + 50)
        );
    }

    /// Fixture describing one subtitle entry, both as raw input pieces and
    /// as the values the parser is expected to produce.
    struct TimeSpanTestCase {
        // Subtitle index written on the first line of the entry.
        unit: u32,
        // The timespan line exactly as it appears in the input.
        raw: &'static str,
        // Expected start time for `raw`.
        start: Duration,
        // Expected end time for `raw`.
        end: Duration,
        // Text lines of the entry, without line endings.
        lines: &'static [&'static str],
    }

    /// Line ending flavour used when rendering a fixture to input text.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum LineEnding {
        Unix,
        Windows,
    }
    impl LineEnding {
        /// Returns the literal line terminator for this flavour.
        fn as_str(&self) -> &'static str {
            match self {
                LineEnding::Unix => "\n",
                LineEnding::Windows => "\r\n",
            }
        }
    }

    impl TimeSpanTestCase {
        /// Renders this fixture as SRT input text.
        ///
        /// `text_newline` inserts an extra blank line between the timespan
        /// line and the text; `final_newline` appends a blank line after the
        /// last text line.
        fn input(&self, ending: LineEnding, text_newline: bool, final_newline: bool) -> String {
            let newline = ending.as_str();
            let mut res = format!("{}", self.unit);
            res.push_str(newline);
            res.push_str(self.raw);
            res.push_str(newline);
            if text_newline {
                res.push_str(newline);
            }
            for line in self.lines {
                res.push_str(line);
                res.push_str(newline);
            }
            if final_newline {
                res.push_str(newline);
            }
            res
        }
        /// Builds the `Subtitle` the parser is expected to return for this
        /// fixture: lines joined with `\n` plus one trailing `\n`, regardless
        /// of the line ending used in the input.
        fn sub(&self) -> Subtitle {
            let mut text = self.lines.join("\n");
            text.push('\n');
            Subtitle {
                idx: self.unit,
                start: self.start,
                end: self.end,
                text,
            }
        }
    }

    // Two-line entry.
    const EX_TS_1: TimeSpanTestCase = TimeSpanTestCase {
        unit: 1,
        raw: "00:00:02,002 --> 00:00:05,403",
        start: Duration::from_millis(2 * 1000 + 2),
        end: Duration::from_millis(5 * 1000 + 403),
        lines: &[
            "<i>Now the story of a wealthy family</i>",
            "<i>who lost everything...</i>",
        ],
    };

    // Two-line entry.
    const EX_TS_2: TimeSpanTestCase = TimeSpanTestCase {
        unit: 2,
        raw: "00:00:05,505 --> 00:00:07,496",
        start: Duration::from_millis(5 * 1000 + 505),
        end: Duration::from_millis(7 * 1000 + 496),
        lines: &["<i>and the one son</i>", "<i>who had no choice...</i>"],
    };

    // Single-line entry.
    const EX_TS_3: TimeSpanTestCase = TimeSpanTestCase {
        unit: 3,
        raw: "00:00:07,607 --> 00:00:09,598",
        start: Duration::from_millis(7 * 1000 + 607),
        end: Duration::from_millis(9 * 1000 + 598),
        lines: &["<i>but to keep them all together.</i>"],
    };

    /// Builds a three-entry SRT document and the subtitles expected from it.
    ///
    /// The first two entries always end with a blank line; the last entry
    /// only does so when `short_ending` is false. `leading_bom` prefixes the
    /// document with a byte order mark.
    fn build_srt(
        ending: LineEnding,
        extra_text_line: bool,
        short_ending: bool,
        leading_bom: bool,
    ) -> (String, Vec<Subtitle>) {
        let mut input = String::new();
        if leading_bom {
            input.push('\u{feff}')
        }
        input.push_str(EX_TS_1.input(ending, extra_text_line, true).as_str());
        input.push_str(EX_TS_2.input(ending, extra_text_line, true).as_str());
        input.push_str(
            EX_TS_3
                .input(ending, extra_text_line, !short_ending)
                .as_str(),
        );
        let expected = vec![EX_TS_1.sub(), EX_TS_2.sub(), EX_TS_3.sub()];
        (input, expected)
    }

    /// Runs `srt_file` on a generated document and checks that everything is
    /// consumed and that the parsed subtitles match the fixtures.
    fn test_srt(ending: LineEnding, extra_text_line: bool, short_ending: bool, leading_bom: bool) {
        let (input, expected) = build_srt(ending, extra_text_line, short_ending, leading_bom);
        let (subt_rem, res) = srt_file(input.as_str()).unwrap();
        assert_eq!(subt_rem, "");
        assert_eq!(res, expected);
    }

    /// Runs `subtitle` on each fixture individually and checks that the whole
    /// entry is consumed and parsed to the expected value.
    fn test_each_sub(ending: LineEnding, extra_text_line: bool, final_newline: bool) {
        for tc in &[EX_TS_1, EX_TS_2, EX_TS_3] {
            let input = tc.input(ending, extra_text_line, final_newline);
            let (subt_rem, sub) = subtitle(input.as_str()).unwrap();
            assert_eq!(subt_rem, "");
            assert_eq!(sub, tc.sub());
        }
    }

    // Single entries: with a trailing blank line.
    #[test]
    fn parse_sub() {
        test_each_sub(LineEnding::Unix, false, true);
    }
    #[test]
    fn parse_sub_windows() {
        test_each_sub(LineEnding::Windows, false, true);
    }
    // Single entries: without a trailing blank line (as for a last entry).
    #[test]
    fn parse_sub_last() {
        test_each_sub(LineEnding::Unix, false, false);
    }
    #[test]
    fn parse_sub_windows_last() {
        test_each_sub(LineEnding::Windows, false, false);
    }
    // Single entries: extra blank line between the timespan and the text.
    #[test]
    fn parse_sub_text_newline() {
        test_each_sub(LineEnding::Unix, true, false);
    }
    #[test]
    fn parse_sub_windows_text_newline() {
        test_each_sub(LineEnding::Windows, true, false);
    }

    // Whole documents: every combination below must parse without remainder.
    #[test]
    fn parse_srt_leading_byte_order_mark() {
        test_srt(LineEnding::Unix, false, false, true)
    }
    #[test]
    fn parse_srt_unix() {
        test_srt(LineEnding::Unix, false, false, false)
    }
    #[test]
    fn parse_srt_windows() {
        test_srt(LineEnding::Windows, false, false, false)
    }
    #[test]
    fn parse_srt_unix_text_newline() {
        test_srt(LineEnding::Unix, true, false, false)
    }
    #[test]
    fn parse_srt_windows_text_newline() {
        test_srt(LineEnding::Windows, true, false, false)
    }
    // "short" variants: the last entry has no trailing blank line.
    #[test]
    fn parse_srt_unix_short() {
        test_srt(LineEnding::Unix, false, true, false)
    }
    #[test]
    fn parse_srt_windows_short() {
        test_srt(LineEnding::Windows, false, true, false)
    }
    #[test]
    fn parse_srt_unix_text_newline_short() {
        test_srt(LineEnding::Unix, true, true, false)
    }
    #[test]
    fn parse_srt_windows_text_newline_short() {
        test_srt(LineEnding::Windows, true, true, false)
    }

    /// Trailing junk after valid entries makes `parse` fail, but the error
    /// still carries the parsed subtitles and the byte offset of the junk.
    #[test]
    fn parse_incomplete() {
        let (mut input, expected) = build_srt(LineEnding::Windows, true, true, false);
        let junk = "this is extra junk at the end of the file";
        input.push_str(junk);

        match parse(input.as_str()) {
            Ok(_) => panic!("expected error"),
            Err(Error::ParseError(res, offset)) => {
                assert_eq!(res, expected);
                // The reported offset must point exactly at the start of the junk.
                let remainder = std::str::from_utf8(&input.as_bytes()[offset..]).unwrap();
                assert_eq!(remainder, junk);
            }
        }
    }
}