use std::borrow::Cow;
use regex::Regex;
use crate::{
srt::{Subtitle, SubtitleFile},
Result,
};
pub fn strip_formatting(line: &str) -> Cow<str> {
let formatting = Regex::new(r"<[a-z/][^>]*>").unwrap();
formatting.replace_all(&line, "")
}
fn clean_line(line: &str) -> String {
let clutter = Regex::new(r"(\([^)]*\)|♪[^♪]*♪|[A-Z]{2,} ?:)").unwrap();
let whitespace = Regex::new(r"\s+").unwrap();
whitespace
.replace_all(&clutter.replace_all(line, ""), " ")
.trim()
.to_string()
}
/// Clean every line of `sub` with `clean_line`, discarding lines that end up
/// empty.
///
/// Returns `None` when no line survives cleaning. Otherwise returns a new
/// `Subtitle` that keeps the original `index` and `period` and carries only
/// the surviving, cleaned lines.
fn clean_subtitle(sub: &Subtitle) -> Option<Subtitle> {
    // `clean_line` already yields owned `String`s, so they can be collected
    // directly without copying each one again.
    let lines: Vec<String> = sub
        .lines
        .iter()
        .map(|line| clean_line(line))
        .filter(|line| !line.is_empty())
        .collect();

    if lines.is_empty() {
        None
    } else {
        Some(Subtitle {
            index: sub.index,
            period: sub.period,
            lines,
        })
    }
}
/// Produce a cleaned-up copy of `file`.
///
/// The steps, in order:
///
/// 1. Each subtitle is passed through `clean_subtitle`; subtitles with no
///    text left are dropped.
/// 2. The survivors are sorted by `period.begin()`.
/// 3. Each subtitle's period is passed to `end_before` with the following
///    subtitle's begin value, so a subtitle does not run into the next one.
/// 4. Indices are reassigned sequentially, starting at 1.
///
/// # Errors
///
/// Propagates the first error returned by `end_before`.
///
/// # Panics
///
/// Panics if two begin values cannot be ordered, i.e. `partial_cmp` returns
/// `None`.
pub fn clean_subtitle_file(file: &SubtitleFile) -> Result<SubtitleFile> {
    let mut cleaned: Vec<Subtitle> = file
        .subtitles
        .iter()
        .filter_map(clean_subtitle)
        .collect();

    cleaned.sort_by(|lhs, rhs| {
        lhs.period
            .begin()
            .partial_cmp(&rhs.period.begin())
            .unwrap()
    });

    // Walk adjacent pairs (previous, next). The range is empty when there are
    // fewer than two subtitles, so no separate length check is required.
    for next in 1..cleaned.len() {
        let limit = cleaned[next].period.begin();
        cleaned[next - 1].period.end_before(limit)?;
    }

    // Renumber from 1 in the final order.
    for (position, sub) in cleaned.iter_mut().enumerate() {
        sub.index = position + 1;
    }

    Ok(SubtitleFile { subtitles: cleaned })
}
// End-to-end check of `clean_subtitle_file`: parses a deliberately messy
// subtitle file, cleans it, and compares the serialized result with the
// expected text.
#[test]
fn test_clean_subtitle_file() {
// The input mixes several kinds of clutter: a parenthesized sound cue after
// real dialogue (19), cues that contain nothing but clutter (20 and 21), a
// speaker label plus a parenthesized aside (18), and two cues that are out of
// order and overlap in time (53 and 52).
let dirty = SubtitleFile::from_str(
r"19
00:01:03,163 --> 00:01:04,664
They've arrived.
( <i>door slams</i> )
20
00:01:07,700 --> 00:01:10,736
( <i>cheering</i> )
21
00:01:12,839 --> 00:01:13,840
♪ ♪
18
00:01:02,328 --> 00:01:03,162
JOE: Hey! ( waves arms )
53
00:02:47,965 --> 00:02:50,684
Out of order.
52
00:02:42,658 --> 00:02:48,865
Overlapping.
",
)
.unwrap();
// Expected output: the clutter-only cues are gone, the rest are ordered by
// start time and renumbered from 1, and the "Overlapping." cue now ends at
// 00:02:47,964, just before the next cue starts at 00:02:47,965.
// The leading U+FEFF (byte order mark) is presumably emitted by
// `SubtitleFile::to_string` — confirm against its implementation.
let cleaned = "\u{FEFF}1
00:01:02,328 --> 00:01:03,162
Hey!
2
00:01:03,163 --> 00:01:04,664
They've arrived.
3
00:02:42,658 --> 00:02:47,964
Overlapping.
4
00:02:47,965 --> 00:02:50,684
Out of order.
";
assert_eq!(cleaned, &clean_subtitle_file(&dirty).unwrap().to_string());
}