use std::sync::OnceLock;
use regex::Regex;
use crate::error::{AppError, AppResult};
/// Upper bound, in bytes, on a subtitle body accepted for conversion (5 MiB).
///
/// Both converters in this file compare the length of the *trimmed* input
/// against this limit and reject larger bodies with `AppError::SubtitleTooLarge`.
pub const MAX_BODY_BYTES: usize = 5 * 1024 * 1024;
/// Returns the process-wide regex that matches a single `<text>` cue element.
///
/// The regex is built on first use and cached in a `OnceLock`, so every later
/// call hands back the same compiled instance.
///
/// Capture groups:
/// 1. the `start` attribute value (digits and dots),
/// 2. the `dur` attribute value (digits and dots),
/// 3. the raw element content, matched lazily up to the first `</text>`.
///
/// The `(?s)` flag lets `.` match newlines so cue content may span lines.
///
/// # Panics
///
/// Panics if the hard-coded pattern fails to compile.
fn srv3_text_re() -> &'static Regex {
    const CUE_PATTERN: &str =
        r#"(?s)<text\s+start="([0-9.]+)"\s+dur="([0-9.]+)"[^>]*>(.*?)</text>"#;
    static COMPILED: OnceLock<Regex> = OnceLock::new();

    COMPILED.get_or_init(|| Regex::new(CUE_PATTERN).expect("static srv3 text regex is valid"))
}
/// Converts an srv3 timed-text XML body into SRT.
///
/// Every `<text start=".." dur="..">…</text>` element found by
/// [`srv3_text_re`] becomes one SRT cue running from `start` to
/// `start + dur` (both in seconds). Cues are numbered from 1 in document
/// order and each cue is terminated by a blank line, as the SRT format
/// requires. Elements whose content is empty or whitespace-only are skipped,
/// matching how [`json3_to_srt`] treats events without text.
///
/// The element content is copied as-is (after [`sanitize_cue_text`]); XML
/// entities and nested markup are not decoded here.
///
/// # Errors
///
/// * `AppError::InvalidInput` if the trimmed body is empty, if a `start` or
///   `dur` attribute does not parse as `f64`, or if no cue was produced.
/// * `AppError::SubtitleTooLarge` if the trimmed body is longer than
///   [`MAX_BODY_BYTES`].
#[tracing::instrument(level = "debug", err, skip(xml), fields(len_bytes = xml.len()))]
pub fn srv3_to_srt(xml: &str) -> AppResult<String> {
    let body = xml.trim();
    if body.is_empty() {
        return Err(AppError::InvalidInput("empty srv3 body".to_string()));
    }
    if body.len() > MAX_BODY_BYTES {
        return Err(AppError::SubtitleTooLarge(body.len()));
    }

    let mut out = String::new();
    let mut index: usize = 0;
    for cap in srv3_text_re().captures_iter(body) {
        let start = cap.get(1).map_or("", |m| m.as_str());
        let dur = cap.get(2).map_or("", |m| m.as_str());
        let text = cap.get(3).map_or("", |m| m.as_str());

        // Timing attributes are validated before the blank-text check so a
        // malformed attribute is still reported even on an empty cue.
        let start_secs: f64 = start
            .parse()
            .map_err(|_| AppError::InvalidInput(format!("srv3 start={start:?} not a float")))?;
        let dur_secs: f64 = dur
            .parse()
            .map_err(|_| AppError::InvalidInput(format!("srv3 dur={dur:?} not a float")))?;

        // A cue without visible text carries no information; skip it.
        if text.trim().is_empty() {
            continue;
        }

        let end_secs = start_secs + dur_secs;
        index += 1;
        // The trailing "\n\n" ends the text line and adds the blank line that
        // separates this cue from the next one.
        out.push_str(&format!(
            "{index}\n{} --> {}\n{}\n\n",
            format_timestamp(start_secs),
            format_timestamp(end_secs),
            sanitize_cue_text(text)
        ));
    }

    if index == 0 {
        return Err(AppError::InvalidInput(
            "srv3 body has no <text> cues".to_string(),
        ));
    }
    Ok(out)
}
/// Converts a json3 subtitle body into SRT.
///
/// The body must be a JSON object with an `events` array. For each event:
///
/// * `tStartMs` (integer milliseconds) is required; events without it are
///   skipped.
/// * `dDurationMs` (integer milliseconds) defaults to 0 when missing or not an
///   integer, and negative values are clamped to 0 so a cue never ends before
///   it starts.
/// * `segs` must be an array; the non-empty `utf8` strings of its entries are
///   joined with `'\n'`. Events without `segs`, or whose joined text is empty
///   or whitespace-only, are skipped.
///
/// Emitted cues are numbered from 1 and each one is terminated by a blank
/// line, as the SRT format requires.
///
/// # Errors
///
/// * `AppError::InvalidInput` if the trimmed body is empty, has no `events`
///   array, or yields no cue.
/// * `AppError::SubtitleTooLarge` if the trimmed body is longer than
///   [`MAX_BODY_BYTES`].
/// * `AppError::Serde` if the body is not valid JSON.
#[tracing::instrument(level = "debug", err, skip(json), fields(len_bytes = json.len()))]
pub fn json3_to_srt(json: &str) -> AppResult<String> {
    let body = json.trim();
    if body.is_empty() {
        return Err(AppError::InvalidInput("empty json3 body".to_string()));
    }
    if body.len() > MAX_BODY_BYTES {
        return Err(AppError::SubtitleTooLarge(body.len()));
    }

    let value: serde_json::Value = serde_json::from_str(body).map_err(AppError::Serde)?;
    let events = value
        .get("events")
        .and_then(serde_json::Value::as_array)
        .ok_or_else(|| AppError::InvalidInput("json3 body has no events[] array".to_string()))?;

    let mut out = String::new();
    let mut index: usize = 0;
    for event in events {
        let Some(start_ms) = event.get("tStartMs").and_then(serde_json::Value::as_i64) else {
            continue;
        };
        let dur_ms = event
            .get("dDurationMs")
            .and_then(serde_json::Value::as_i64)
            .unwrap_or(0)
            .max(0);
        let Some(segs) = event.get("segs").and_then(serde_json::Value::as_array) else {
            continue;
        };

        // Join only non-empty segments so an empty leading segment cannot
        // introduce a stray newline at the start of the cue text.
        let text = segs
            .iter()
            .filter_map(|seg| seg.get("utf8").and_then(serde_json::Value::as_str))
            .filter(|piece| !piece.is_empty())
            .collect::<Vec<_>>()
            .join("\n");
        if text.trim().is_empty() {
            continue;
        }

        let start_secs = start_ms as f64 / 1000.0;
        // Both values come from untrusted JSON; saturate instead of overflowing.
        let end_secs = start_ms.saturating_add(dur_ms) as f64 / 1000.0;

        index += 1;
        // The trailing "\n\n" ends the text line and adds the blank line that
        // separates this cue from the next one.
        out.push_str(&format!(
            "{index}\n{} --> {}\n{}\n\n",
            format_timestamp(start_secs),
            format_timestamp(end_secs),
            sanitize_cue_text(&text)
        ));
    }

    if index == 0 {
        return Err(AppError::InvalidInput(
            "json3 body has no usable events".to_string(),
        ));
    }
    Ok(out)
}
/// Renders a time offset in seconds as an SRT timestamp, `HH:MM:SS,mmm`.
///
/// The value is rounded to the nearest millisecond. Non-finite or negative
/// inputs produce `00:00:00,000`. The hours field is zero-padded to two
/// digits but grows wider for offsets of 100 hours or more.
fn format_timestamp(seconds: f64) -> String {
    if seconds.is_finite() && seconds >= 0.0 {
        let millis_total = (seconds * 1000.0).round() as u64;
        // Peel off one unit at a time: ms -> s -> min -> h.
        let (whole_secs, ms) = (millis_total / 1000, millis_total % 1000);
        let (whole_mins, secs) = (whole_secs / 60, whole_secs % 60);
        let (hours, minutes) = (whole_mins / 60, whole_mins % 60);
        format!("{hours:02}:{minutes:02}:{secs:02},{ms:03}")
    } else {
        "00:00:00,000".to_string()
    }
}
/// Makes raw cue text safe to embed as the body of an SRT cue.
///
/// Steps, in order:
/// 1. `\r\n` and bare `\r` are normalized to `\n`.
/// 2. Lines that are empty or whitespace-only are dropped. A blank line ends
///    a cue in SRT, so leaving one inside the text would cut the cue short;
///    this also removes leading and trailing blank lines.
/// 3. Every ` -->` (with the leading space) is rewritten to a zero-width
///    space followed by `-->`, so cue text cannot be mistaken for a timing
///    line. A `-->` that is not preceded by a space is left untouched.
///
/// Whitespace inside non-blank lines is preserved.
fn sanitize_cue_text(text: &str) -> String {
    let normalized = text.replace("\r\n", "\n").replace('\r', "\n");
    let body = normalized
        .split('\n')
        .filter(|line| !line.trim().is_empty())
        .collect::<Vec<_>>()
        .join("\n");
    body.replace(" -->", "\u{200B}-->")
}
// Unit tests for the srv3 and json3 converters. The raw-string fixtures are
// whitespace-sensitive: their line breaks end up verbatim in the cue text.
#[cfg(test)]
mod tests {
use super::*;
// Two consecutive cues: checks numbering and that end = start + dur.
#[test]
fn parses_minimal_srv3() {
let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="0.0" dur="2.5">Hello world</text>
<text start="2.5" dur="3.0">Second cue</text>
</transcript>"#;
let srt = srv3_to_srt(xml).expect("srv3 parses");
assert!(srt.contains("1\n00:00:00,000 --> 00:00:02,500\nHello world\n"));
assert!(srt.contains("2\n00:00:02,500 --> 00:00:05,500\nSecond cue\n"));
}
// Cue content spanning several lines must keep its line breaks.
#[test]
fn parses_multiline_cue() {
let xml = r#"<?xml version="1.0"?>
<transcript>
<text start="0.0" dur="4.0">Line 1
Line 2
Line 3</text>
</transcript>"#;
let srt = srv3_to_srt(xml).expect("multiline parses");
assert!(srt.contains("Line 1\nLine 2\nLine 3"));
assert!(srt.contains("00:00:00,000 --> 00:00:04,000"));
}
// Non-ASCII text must pass through unchanged.
#[test]
fn parses_unicode_cue() {
let xml = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="0.0" dur="2.0">café 日本</text>
</transcript>"#;
let srt = srv3_to_srt(xml).expect("unicode parses");
assert!(srt.contains("café 日本"));
}
// An empty body is rejected as invalid input rather than yielding empty SRT.
#[test]
fn rejects_empty_body() {
let err = srv3_to_srt("").unwrap_err();
assert!(matches!(err, AppError::InvalidInput(_)));
}
// json3 events: millisecond timings are converted, and multiple segments of
// one event are joined with a newline.
#[test]
fn parses_json3_format() {
let json = r#"{
"events": [
{"tStartMs": 0, "dDurationMs": 2500, "segs": [{"utf8": "Hello world"}]},
{"tStartMs": 2500, "dDurationMs": 3000, "segs": [{"utf8": "Line 1"}, {"utf8": "Line 2"}]}
]
}"#;
let srt = json3_to_srt(json).expect("json3 parses");
assert!(srt.contains("1\n00:00:00,000 --> 00:00:02,500\nHello world\n"));
assert!(srt.contains("2\n00:00:02,500 --> 00:00:05,500\nLine 1\nLine 2\n"));
}
}