title_parser/
lib.rs

1#![warn(missing_docs)]
2
3//! Parser for SRT and WebVTT
4//!
5//! Provides a parser that will extract a sequence of Cues
6//! from text that conforms to SRT or WebVTT standards
7
8pub mod timecode;
9use regex::{Captures, Regex};
10use timecode::{TimeCode, TimeCodeTrait};
11// use std::{error, fs};
12
13/// A Cue represents a single SRT / WebVTT cue extracted from
14/// a subtitle file:
15///
16/// ```vtt
17/// 14
18/// 00:01:14.815 --> 00:01:18.114
19/// - This line belongs to a subtitle cue.
20/// - This line is also a member of the same cue.
21/// ```
22///
23pub struct Cue {
24    /// timestamp for cue to appear
25    pub start: TimeCode,
26    /// timestamp for cue to disappear
27    pub end: TimeCode,
28    /// text for cue to display
29    pub text: String,
30}
31
32/// trait to implement for types that can be converted to
33/// a `Cue`
34pub trait CueTrait {
35    /// Attempts to create a cue from a string
36    ///
37    /// ```vtt
38    /// 1 - Cue Identifier
39    /// 00:01:14.815 --> 00:01:18.114
40    /// - I'm text for a cue
41    /// - Me too!
42    /// ```
43    ///
44    /// ```
45    /// use title_parser::{CueTrait};
46    ///
47    /// // with cue identifier
48    /// let text = "1 - Cue\n00:01:14.815 --> 00:01:18.114\n- I'm text for a cue\n- Me too!";
49    /// let cue = text.to_cue().unwrap();
50    /// assert_eq!(cue.text, "I'm text for a cue\nMe too!");
51    ///
52    /// // without cue identifier
53    /// let text = "00:01:14.815 --> 00:01:18.114\n- I'm text for a cue\n- Me too!";
54    /// let cue = text.to_cue().unwrap();
55    /// assert_eq!(cue.text, "I'm text for a cue\nMe too!");
56    /// ```
57    fn to_cue(&self) -> Result<Cue, String>;
58}
59
60impl CueTrait for str {
61    fn to_cue(&self) -> Result<Cue, String> {
62        let re = Regex::new(r"(.+\n)?(([0-9:\.,]{9,}) --> ([0-9:\.,]{9,})( .*)?)((\n.*)+)")
63            .expect("failed to compile regex");
64        let caps = re
65            .captures(self)
66            .ok_or_else(|| "not a valid cue".to_string())?;
67        let cues = caps.get(6).unwrap().as_str();
68        let (start, end) = generate_timecodes(caps).ok_or_else(|| "not a valid cue".to_string())?;
69        let lines: Vec<&str> = cues.trim().split('\n').into_iter().collect();
70        let clean_lines: Vec<String> = lines.iter().map(|i| sanitize_text(i)).collect();
71        let text = clean_lines.join("\n");
72        Ok(Cue { start, end, text })
73    }
74}
75
76// Attempts to extract TimeCodes from input, ignores css formatting text
77fn generate_timecodes(caps: Captures) -> Option<(TimeCode, TimeCode)> {
78    let start = caps.get(3)?.as_str().to_timecode().ok()?;
79    let end = caps.get(4)?.as_str().to_timecode().ok()?;
80    Some((start, end))
81}
82
83static REGEX_TO_PRUNE: [&str; 3] = [r"<[0-9a-zA-Z\.,:_\-]+>", r"</[0-9a-zA-Z\.,:_\-]+>", r"^\- "];
84
85static ES_TO_PRUNE: [&str; 6] = ["&amp;", "&lt;", "&gt;", "&lrm;", "&rlm;", "&nbsp;"];
86
87// Removes leading hyphens, HTML tags, CSS tags, etc. from input
88fn sanitize_text(input: &str) -> String {
89    let mut text: String = input.to_string();
90    for regex in REGEX_TO_PRUNE.iter() {
91        let re = Regex::new(regex).expect("unable to compile regex");
92        text = re.replace_all(&text, "").to_string();
93    }
94    for es in ES_TO_PRUNE.iter() {
95        text = text.replace(es, "");
96    }
97    text
98}
99
#[cfg(test)]
mod tests {
    use super::sanitize_text;

    // Nested class tags and a left-to-right mark must both disappear,
    // leaving only the visible caption text.
    #[test]
    fn private_sanitize_text() -> Result<(), String> {
        let raw = "<c.japanese><c.bg_some>&lrm;(聖弥)フフッ</c.bg_some></c.japanese>";
        let cleaned = sanitize_text(raw);
        assert_eq!(cleaned, "(聖弥)フフッ");
        Ok(())
    }
}