use crate::Result;
use crate::core::formats::line_endings::{normalize_line_endings, raw_blocks};
use crate::core::formats::{Subtitle, SubtitleEntry, SubtitleFormatType, SubtitleMetadata};
use crate::error::SubXError;
use regex::Regex;
use super::time::parse_vtt_time;
/// Upper bound, in bytes, on a single cue block (1 MiB). `parse` rejects the
/// whole input with a format error when any cue block is larger than this.
pub(super) const MAX_CUE_BYTES: usize = 1024 * 1024;
/// Format label passed to `SubXError::subtitle_format` for every error raised here.
const FORMAT_NAME: &str = "VTT";
/// UTF-8 byte-order mark (U+FEFF); stripped from the start of the input before
/// the `WEBVTT` signature is checked.
const UTF8_BOM: &str = "\u{feff}";
pub(super) fn parse(content: &str) -> Result<Subtitle> {
if content.is_empty() {
return Err(SubXError::subtitle_format(FORMAT_NAME, "empty input"));
}
let content = content.strip_prefix(UTF8_BOM).unwrap_or(content);
if !content.trim_start().starts_with("WEBVTT") {
return Err(SubXError::subtitle_format(
FORMAT_NAME,
"missing WEBVTT signature",
));
}
for raw_block in raw_blocks(content) {
let trimmed = raw_block.trim();
if trimmed.is_empty()
|| trimmed.starts_with("WEBVTT")
|| trimmed.starts_with("NOTE")
|| trimmed.starts_with("STYLE")
{
continue;
}
if raw_block.len() > MAX_CUE_BYTES {
return Err(SubXError::subtitle_format(
FORMAT_NAME,
format!(
"cue block of {} bytes exceeds per-cue cap of {} bytes",
raw_block.len(),
MAX_CUE_BYTES
),
));
}
}
let normalized = normalize_line_endings(content);
let content: &str = &normalized;
let time_re =
Regex::new(r"(?m)^(\d{2}):(\d{2}):(\d{2})\.(\d{3}) --> (\d{2}):(\d{2}):(\d{2})\.(\d{3})")
.map_err(|e: regex::Error| SubXError::subtitle_format(FORMAT_NAME, e.to_string()))?;
let neg_signed_re = Regex::new(r"(^|\s|>)-\d{1,}:\d{2}:\d{2}\.\d{3}")
.map_err(|e: regex::Error| SubXError::subtitle_format(FORMAT_NAME, e.to_string()))?;
let mut entries = Vec::new();
for block in content.split("\n\n") {
let block = block.trim();
if block.is_empty()
|| block.starts_with("WEBVTT")
|| block.starts_with("NOTE")
|| block.starts_with("STYLE")
{
continue;
}
if block.len() > MAX_CUE_BYTES {
return Err(SubXError::subtitle_format(
FORMAT_NAME,
format!(
"cue block of {} bytes exceeds per-cue cap of {} bytes",
block.len(),
MAX_CUE_BYTES
),
));
}
let lines: Vec<&str> = block.lines().collect();
let mut idx = 0;
if !time_re.is_match(lines[0]) {
idx = 1;
if idx >= lines.len() {
continue;
}
}
let marker_line = lines[idx];
if neg_signed_re.is_match(marker_line) {
log::debug!(
"Skipping VTT cue with negative timestamp: {:?}",
marker_line
);
continue;
}
if let Some(caps) = time_re.captures(marker_line) {
let start = parse_vtt_time(&caps, 1)?;
let end = parse_vtt_time(&caps, 5)?;
let text = lines[(idx + 1)..].join("\n");
if text.len() > MAX_CUE_BYTES {
return Err(SubXError::subtitle_format(
FORMAT_NAME,
format!(
"cue body of {} bytes exceeds per-cue cap of {} bytes",
text.len(),
MAX_CUE_BYTES
),
));
}
entries.push(SubtitleEntry {
index: entries.len() + 1,
start_time: start,
end_time: end,
text,
styling: None,
});
}
}
Ok(Subtitle {
entries,
metadata: SubtitleMetadata {
title: None,
language: None,
encoding: "utf-8".to_string(),
frame_rate: None,
original_format: SubtitleFormatType::Vtt,
},
format: SubtitleFormatType::Vtt,
})
}
/// Reports whether `content` looks like WebVTT: after an optional UTF-8 BOM and
/// any leading whitespace, the text must begin with `WEBVTT`.
pub(super) fn detect(content: &str) -> bool {
    let without_bom = match content.strip_prefix(UTF8_BOM) {
        Some(rest) => rest,
        None => content,
    };
    without_bom.trim_start().starts_with("WEBVTT")
}