pub mod srv3;
pub mod video_id;
use crate::error::{AppError, AppResult, NoSubtitleReason};
use crate::text::normalize_nfc;
use regex::Regex;
use std::sync::OnceLock;
/// Process-wide cache for the SRT cue-timing regex; filled on first use by `timestamp_re()`.
static TIMESTAMP_RE: OnceLock<Regex> = OnceLock::new();
/// Process-wide cache for the SRT cue-index regex; filled on first use by `index_re()`.
static INDEX_RE: OnceLock<Regex> = OnceLock::new();
/// Process-wide cache for the noteey leading-timestamp regex; filled on first use by
/// `noteey_ts_re()`.
static NOTEEY_TS_RE: OnceLock<Regex> = OnceLock::new();
/// Returns the shared regex that recognises an SRT cue-timing line.
///
/// The pattern is anchored at the start of the line only and expects
/// `NN:NN:NN`, an optional `,` or `.`, three digits, a `-->` arrow surrounded
/// by optional whitespace, and a second timestamp of the same shape. Anything
/// after the second timestamp is ignored.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn timestamp_re() -> &'static Regex {
    const PATTERN: &str = r"^\d{2}:\d{2}:\d{2}[,.]?\d{3}\s*-->\s*\d{2}:\d{2}:\d{2}[,.]?\d{3}";

    // Compiled at most once; later calls reuse the cached instance.
    let compile = || Regex::new(PATTERN).expect("static SRT timestamp regex is valid");
    TIMESTAMP_RE.get_or_init(compile)
}
/// Returns the shared regex that matches a line consisting solely of ASCII-style
/// digits (`^\d+$`), the shape of an SRT cue index.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn index_re() -> &'static Regex {
    const PATTERN: &str = r"^\d+$";

    // Compiled at most once; later calls reuse the cached instance.
    let compile = || Regex::new(PATTERN).expect("static SRT index regex is valid");
    INDEX_RE.get_or_init(compile)
}
/// Returns the shared regex that matches a timestamp at the start of a noteey
/// transcript line.
///
/// The pattern accepts `NN:NN` or `NN:NN:NN`, optionally followed by a `.` or
/// `,` and any number of further digits. It is anchored at the start of the
/// line only, so it also matches when text follows the timestamp.
///
/// # Panics
///
/// Panics on first use if the hard-coded pattern fails to compile.
fn noteey_ts_re() -> &'static Regex {
    const PATTERN: &str = r"^\d{2}:\d{2}(?::\d{2})?[.,]?\d*";

    // Compiled at most once; later calls reuse the cached instance.
    let compile = || Regex::new(PATTERN).expect("static noteey timestamp regex is valid");
    NOTEEY_TS_RE.get_or_init(compile)
}
/// Converts an SRT subtitle document into plain text, one paragraph per cue.
///
/// Processing steps:
///
/// 1. A leading UTF-8 byte-order mark is dropped and `\r\n` / lone `\r` line
///    endings are normalised to `\n`.
/// 2. Every line is trimmed. Blank lines terminate the current cue.
/// 3. Cue-timing lines are discarded. A digits-only line is discarded as a cue
///    index only when the line directly after it is a timing line, so numeric
///    cue text such as `42` is preserved.
/// 4. The remaining lines of each cue are joined with a single space and passed
///    through `normalize_nfc`; cues are then joined with a blank line (`"\n\n"`).
///
/// # Errors
///
/// * [`AppError::InvalidInput`] if `srt` is empty or contains no cue text.
/// * [`AppError::SubtitleTooLarge`] (carrying the byte length) if `srt` is
///   larger than 50 MiB.
#[tracing::instrument(level = "debug", err, skip(srt), fields(len_bytes = srt.len()))]
pub fn srt_to_text(srt: &str) -> AppResult<String> {
    /// Largest accepted input, in bytes (50 MiB).
    const MAX_SRT_BYTES: usize = 50 * 1024 * 1024;

    /// Moves the pending cue lines (if any) into `cues` as one joined,
    /// normalised string and leaves `pending` empty.
    fn flush_cue(pending: &mut Vec<String>, cues: &mut Vec<String>) {
        if pending.is_empty() {
            return;
        }
        let text = join_cue_lines(pending.as_slice());
        if !text.is_empty() {
            cues.push(normalize_nfc(&text));
        }
        pending.clear();
    }

    if srt.is_empty() {
        return Err(AppError::InvalidInput("empty srt body".to_string()));
    }
    if srt.len() > MAX_SRT_BYTES {
        return Err(AppError::SubtitleTooLarge(srt.len()));
    }

    // `str::trim` does not treat U+FEFF as whitespace, so a BOM left in place
    // would stop the first index line from matching and leak into the output.
    let body = srt.strip_prefix('\u{feff}').unwrap_or(srt);
    let normalized = body.replace("\r\n", "\n").replace('\r', "\n");

    let mut cues: Vec<String> = Vec::new();
    let mut current_lines: Vec<String> = Vec::new();

    // Peekable so that an index line can be told apart from numeric cue text
    // by checking whether a timing line follows it.
    let mut lines = normalized.lines().map(str::trim).peekable();
    while let Some(line) = lines.next() {
        if line.is_empty() {
            flush_cue(&mut current_lines, &mut cues);
            continue;
        }
        if timestamp_re().is_match(line) {
            continue;
        }
        let precedes_timing = lines
            .peek()
            .is_some_and(|&next| timestamp_re().is_match(next));
        if precedes_timing && index_re().is_match(line) {
            continue;
        }
        current_lines.push(line.to_string());
    }
    // The last cue is not necessarily followed by a blank line.
    flush_cue(&mut current_lines, &mut cues);

    if cues.is_empty() {
        return Err(AppError::InvalidInput("srt has no valid cues".to_string()));
    }
    Ok(cues.join("\n\n"))
}
/// Cleans a raw noteey transcript into plain text, one output line per kept
/// input line.
///
/// Processing steps:
///
/// 1. A leading UTF-8 byte-order mark is dropped and `\r\n` / lone `\r` line
///    endings are normalised to `\n`.
/// 2. Every line is trimmed; blank lines are skipped.
/// 3. A timestamp at the start of the line (as matched by `noteey_ts_re`) is
///    removed. Lines that contained nothing but a timestamp are skipped.
/// 4. Lines whose remaining text starts with `[` and ends with `]`, or starts
///    with `(` and ends with `)`, are skipped as markers (e.g. `[Music]`).
/// 5. Surviving lines are passed through `normalize_nfc` and joined with `"\n"`.
///
/// # Errors
///
/// * [`AppError::NoSubtitle`] with [`NoSubtitleReason::NotPublished`] if `raw`
///   is empty or no line survives the filtering above.
/// * [`AppError::SubtitleTooLarge`] (carrying the byte length) if `raw` is
///   larger than 50 MiB.
#[tracing::instrument(level = "debug", err, skip(raw), fields(len_bytes = raw.len()))]
pub fn noteey_to_text(raw: &str) -> AppResult<String> {
    /// Largest accepted input, in bytes (50 MiB).
    const MAX_RAW_BYTES: usize = 50 * 1024 * 1024;

    if raw.is_empty() {
        return Err(AppError::NoSubtitle(NoSubtitleReason::NotPublished));
    }
    if raw.len() > MAX_RAW_BYTES {
        return Err(AppError::SubtitleTooLarge(raw.len()));
    }

    // `str::trim` does not treat U+FEFF as whitespace, so a BOM left in place
    // would defeat the `^`-anchored timestamp match on the first line.
    let body = raw.strip_prefix('\u{feff}').unwrap_or(raw);
    let normalized = body.replace("\r\n", "\n").replace('\r', "\n");

    let mut out: Vec<String> = Vec::new();
    for line in normalized.lines().map(str::trim) {
        if line.is_empty() {
            continue;
        }
        // One regex pass: a timestamp-only line becomes empty here, so no
        // separate "standalone timestamp" check is needed.
        let stripped = noteey_ts_re().replace(line, "").trim().to_owned();
        let is_marker_only = (stripped.starts_with('[') && stripped.ends_with(']'))
            || (stripped.starts_with('(') && stripped.ends_with(')'));
        if stripped.is_empty() || is_marker_only {
            continue;
        }
        out.push(normalize_nfc(&stripped));
    }

    if out.is_empty() {
        return Err(AppError::NoSubtitle(NoSubtitleReason::NotPublished));
    }
    Ok(out.join("\n"))
}
/// Collapses the text lines of a single cue into one string, separating
/// consecutive lines with exactly one space. An empty slice yields `""`.
fn join_cue_lines(lines: &[String]) -> String {
    let mut joined = String::new();
    for (position, line) in lines.iter().enumerate() {
        // The separator goes between lines, never before the first one.
        if position > 0 {
            joined.push(' ');
        }
        joined.push_str(line);
    }
    joined
}
/// Unit tests for the SRT and noteey plain-text converters.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- srt_to_text ----------------------------------------------------

    #[test]
    fn parses_basic_srt() {
        let srt = "1\n00:00:01,000 --> 00:00:02,000\nHello world\n\n2\n00:00:03,000 --> 00:00:04,000\nSecond cue\n";
        let text = srt_to_text(srt).unwrap();
        assert_eq!(text, "Hello world\n\nSecond cue");
    }

    #[test]
    fn handles_crlf() {
        let srt = "1\r\n00:00:01,000 --> 00:00:02,000\r\nLine one\r\nLine two\r\n";
        let text = srt_to_text(srt).unwrap();
        assert_eq!(text, "Line one Line two");
    }

    #[test]
    fn handles_multiline_cue() {
        let srt = "1\n00:00:01,000 --> 00:00:05,000\nLine one\nLine two\nLine three\n";
        let text = srt_to_text(srt).unwrap();
        assert_eq!(text, "Line one Line two Line three");
    }

    #[test]
    fn rejects_empty() {
        let err = srt_to_text("").unwrap_err();
        assert!(matches!(err, AppError::InvalidInput(_)));
    }

    #[test]
    fn rejects_over_50mb() {
        let big = "a".repeat(51 * 1024 * 1024);
        let err = srt_to_text(&big).unwrap_err();
        assert!(matches!(err, AppError::SubtitleTooLarge(_)));
    }

    #[test]
    fn handles_accented_text() {
        let srt = "1\n00:00:01,000 --> 00:00:02,000\nOlá mundo com acentuação\n";
        let text = srt_to_text(srt).unwrap();
        assert_eq!(text, "Olá mundo com acentuação");
    }

    // ---- noteey_to_text -------------------------------------------------

    #[test]
    fn noteey_clean_strips_leading_timestamp_prefix() {
        let raw = "00:00 hello\n00:03 world\n00:05 again\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "hello\nworld\nagain");
    }

    #[test]
    fn noteey_clean_handles_milliseconds() {
        let raw = "00:00.123 hello\n00:03.456 world\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "hello\nworld");
    }

    #[test]
    fn noteey_clean_handles_hh_mm_ss_format() {
        let raw = "01:02:03 long video cue\n01:02:08 next cue\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "long video cue\nnext cue");
    }

    #[test]
    fn noteey_clean_skips_empty_lines() {
        let raw = "00:00 first\n\n00:05 second\n\n\n00:10 third\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "first\nsecond\nthird");
    }

    #[test]
    fn noteey_clean_skips_marker_only_lines() {
        let raw = "00:00 [Music]\n00:03 hello\n00:05 (Applause)\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "hello");
    }

    #[test]
    fn noteey_clean_handles_accented_text() {
        let raw = "00:00 Olá mundo\n00:03 ção não\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "Olá mundo\nção não");
    }

    #[test]
    fn noteey_clean_handles_crlf() {
        let raw = "00:00 first\r\n00:03 second\r\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "first\nsecond");
    }

    #[test]
    fn noteey_clean_joins_alternating_timestamp_and_text_lines() {
        let raw = "00:00\nhello\n00:03\nworld\n00:05\nagain\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "hello\nworld\nagain");
    }

    #[test]
    fn noteey_clean_drops_pending_ts_without_followup_text() {
        let raw = "00:00 hello\n00:03 world\n00:05\n";
        let text = noteey_to_text(raw).unwrap();
        assert_eq!(text, "hello\nworld");
    }

    #[test]
    fn noteey_clean_rejects_empty() {
        let err = noteey_to_text("").unwrap_err();
        assert!(matches!(err, AppError::NoSubtitle(_)));
    }

    #[test]
    fn noteey_clean_rejects_only_whitespace() {
        let err = noteey_to_text(" \n\n \t\n").unwrap_err();
        assert!(matches!(err, AppError::NoSubtitle(_)));
    }

    // Input made of bare timestamps only: every line is dropped, so the
    // transcript counts as having no subtitle text.
    #[test]
    fn noteey_clean_rejects_only_timestamps() {
        let raw = "00:00\n00:05\n01:02:03.500\n";
        let err = noteey_to_text(raw).unwrap_err();
        assert!(matches!(err, AppError::NoSubtitle(_)));
    }

    // Input whose every line is a timestamp followed by a bracketed marker.
    #[test]
    fn noteey_clean_rejects_only_markers() {
        let raw = "00:00 [Music]\n00:05 (Applause)\n";
        let err = noteey_to_text(raw).unwrap_err();
        assert!(matches!(err, AppError::NoSubtitle(_)));
    }

    #[test]
    fn noteey_clean_respects_50mb_cap() {
        let big = format!("00:00 {}\n", "a".repeat(51 * 1024 * 1024));
        let err = noteey_to_text(&big).unwrap_err();
        assert!(matches!(err, AppError::SubtitleTooLarge(_)));
    }
}