1use crate::error::{YdlError, YdlResult};
2use crate::types::{ParsedSubtitles, SubtitleEntry, SubtitleType};
3use encoding_rs::UTF_8;
4use regex::Regex;
5use std::time::Duration;
6use tracing::{debug, warn};
7
/// Parses raw subtitle payloads and re-renders them in a requested format.
///
/// The patterns below are compiled once in `ContentProcessor::new` and reused
/// by the parsing and cleaning methods.
pub struct ContentProcessor {
    /// Matches an SRT timing line: `HH:MM:SS,mmm --> HH:MM:SS,mmm`.
    srt_time_regex: Regex,
    /// Matches a WebVTT timing line: `HH:MM:SS.mmm --> HH:MM:SS.mmm`.
    vtt_time_regex: Regex,
    /// Matches any `<...>` markup tag so it can be removed from cue text.
    html_tag_regex: Regex,
}
17
18impl Default for ContentProcessor {
19 fn default() -> Self {
20 Self::new()
21 }
22}
23
24impl ContentProcessor {
25 pub fn new() -> Self {
27 let srt_time_regex =
28 Regex::new(r"(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})")
29 .expect("Valid SRT time regex");
30
31 let vtt_time_regex =
32 Regex::new(r"(\d{2}):(\d{2}):(\d{2})\.(\d{3}) --> (\d{2}):(\d{2}):(\d{2})\.(\d{3})")
33 .expect("Valid VTT time regex");
34
35 let html_tag_regex = Regex::new(r"<[^>]*>").expect("Valid HTML tag regex");
36
37 Self {
38 srt_time_regex,
39 vtt_time_regex,
40 html_tag_regex,
41 }
42 }
43
44 pub fn process_content(
46 &self,
47 raw_content: &str,
48 target_format: SubtitleType,
49 language: &str,
50 clean_content: bool,
51 validate_timing: bool,
52 ) -> YdlResult<String> {
53 debug!(
54 "Processing subtitle content, target format: {:?}",
55 target_format
56 );
57
58 let content = self.ensure_utf8(raw_content)?;
60
61 let parsed = self.parse_subtitle_content(&content, language)?;
63
64 if validate_timing {
66 self.validate_timing(&parsed.entries)?;
67 }
68
69 let entries = if clean_content {
71 self.clean_subtitle_entries(parsed.entries)
72 } else {
73 parsed.entries
74 };
75
76 self.convert_to_format(&entries, target_format, language)
78 }
79
80 fn ensure_utf8(&self, content: &str) -> YdlResult<String> {
82 let (decoded, _encoding_used, had_errors) = UTF_8.decode(content.as_bytes());
84
85 if had_errors {
86 warn!("Encoding errors detected, attempting to fix");
87 let encodings = [
89 encoding_rs::WINDOWS_1252,
90 encoding_rs::ISO_8859_2,
91 encoding_rs::UTF_16LE,
92 encoding_rs::UTF_16BE,
93 ];
94
95 for encoding in &encodings {
96 let (decoded, _, had_errors) = encoding.decode(content.as_bytes());
97 if !had_errors {
98 debug!("Successfully decoded using {:?}", encoding.name());
99 return Ok(decoded.to_string());
100 }
101 }
102
103 Ok(decoded.to_string())
105 } else {
106 Ok(content.to_string())
107 }
108 }
109
110 fn parse_subtitle_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
112 debug!("Parsing subtitle content, {} bytes", content.len());
113
114 if content.contains("WEBVTT") {
116 self.parse_vtt_content(content, language)
117 } else if content.contains("<?xml") || content.contains("<transcript") {
118 self.parse_youtube_xml_content(content, language)
119 } else if self.srt_time_regex.is_match(content) {
120 self.parse_srt_content(content, language)
121 } else if content.contains("-->") {
122 self.parse_vtt_content(content, language)
124 } else {
125 self.parse_plain_text_content(content, language)
127 }
128 }
129
130 fn parse_srt_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
132 let mut entries = Vec::new();
133 let blocks = content.split("\n\n");
134
135 for block in blocks {
136 let block = block.trim();
137 if block.is_empty() {
138 continue;
139 }
140
141 let lines: Vec<&str> = block.lines().collect();
142 if lines.len() < 3 {
143 continue;
144 }
145
146 let timing_line = lines[1];
148 let text_lines = &lines[2..];
149
150 if let Some(captures) = self.srt_time_regex.captures(timing_line) {
151 let start = self.parse_srt_time(&captures, 1)?;
152 let end = self.parse_srt_time(&captures, 5)?;
153 let text = text_lines.join("\n");
154
155 entries.push(SubtitleEntry::new(start, end, text));
156 }
157 }
158
159 if entries.is_empty() {
160 return Err(YdlError::SubtitleParsing {
161 message: "No valid SRT entries found".to_string(),
162 });
163 }
164
165 Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Srt))
166 }
167
168 fn parse_vtt_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
170 let mut entries = Vec::new();
171 let lines: Vec<&str> = content.lines().collect();
172 let mut i = 0;
173
174 while i < lines.len() {
176 let line = lines[i].trim();
177 if line.is_empty() || line.starts_with("WEBVTT") || line.starts_with("NOTE") {
178 i += 1;
179 continue;
180 }
181 break;
182 }
183
184 while i < lines.len() {
186 let line = lines[i].trim();
187
188 if line.is_empty() {
189 i += 1;
190 continue;
191 }
192
193 if let Some(captures) = self.vtt_time_regex.captures(line) {
195 let start = self.parse_vtt_time(&captures, 1)?;
196 let end = self.parse_vtt_time(&captures, 5)?;
197
198 i += 1;
200 let mut text_lines = Vec::new();
201 while i < lines.len() && !lines[i].trim().is_empty() {
202 text_lines.push(lines[i]);
203 i += 1;
204 }
205
206 let text = text_lines.join("\n");
207 entries.push(SubtitleEntry::new(start, end, text));
208 } else {
209 i += 1;
211 }
212 }
213
214 if entries.is_empty() {
215 return Err(YdlError::SubtitleParsing {
216 message: "No valid VTT entries found".to_string(),
217 });
218 }
219
220 Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Vtt))
221 }
222
223 fn parse_youtube_xml_content(
225 &self,
226 content: &str,
227 language: &str,
228 ) -> YdlResult<ParsedSubtitles> {
229 let mut entries = Vec::new();
230
231 let p_regex =
233 Regex::new(r#"<p\s+t="(\d+)"(?:\s+d="(\d+)")?[^>]*>(.*?)</p>"#).map_err(|e| {
234 YdlError::SubtitleParsing {
235 message: format!("Invalid XML regex: {}", e),
236 }
237 })?;
238
239 let s_regex =
240 Regex::new(r"<s[^>]*>([^<]*)</s>").map_err(|e| YdlError::SubtitleParsing {
241 message: format!("Invalid s tag regex: {}", e),
242 })?;
243
244 for captures in p_regex.captures_iter(content) {
245 let start_str = captures.get(1).unwrap().as_str();
246 let duration_str = captures.get(2).map(|m| m.as_str()).unwrap_or("1000");
247 let inner_content = captures.get(3).unwrap().as_str();
248
249 let start_ms: u64 = start_str.parse().unwrap_or(0);
251 let duration_ms: u64 = duration_str.parse().unwrap_or(1000);
252
253 let start = Duration::from_millis(start_ms);
254 let end = Duration::from_millis(start_ms + duration_ms);
255
256 let text = if inner_content.contains("<s") {
258 let mut words = Vec::new();
259 for s_capture in s_regex.captures_iter(inner_content) {
260 if let Some(word) = s_capture.get(1) {
261 words.push(word.as_str());
262 }
263 }
264 words.join("")
265 } else {
266 inner_content.to_string()
267 };
268
269 let decoded_text = html_escape::decode_html_entities(&text)
271 .to_string()
272 .trim()
273 .to_string();
274
275 if !decoded_text.is_empty() {
277 entries.push(SubtitleEntry::new(start, end, decoded_text));
278 }
279 }
280
281 if entries.is_empty() {
283 let text_regex =
284 Regex::new(r#"<text start="([^"]+)"(?:\s+dur="([^"]+)")?>([^<]*)</text>"#)
285 .map_err(|e| YdlError::SubtitleParsing {
286 message: format!("Invalid XML regex: {}", e),
287 })?;
288
289 for captures in text_regex.captures_iter(content) {
290 let start_str = captures.get(1).unwrap().as_str();
291 let duration_str = captures.get(2).map(|m| m.as_str()).unwrap_or("1");
292 let text = captures.get(3).unwrap().as_str();
293
294 let start_secs: f64 = start_str.parse().unwrap_or(0.0);
296 let duration_secs: f64 = duration_str.parse().unwrap_or(1.0);
297
298 let start = Duration::from_secs_f64(start_secs);
299 let end = Duration::from_secs_f64(start_secs + duration_secs);
300
301 let decoded_text = html_escape::decode_html_entities(text).to_string();
303
304 entries.push(SubtitleEntry::new(start, end, decoded_text));
305 }
306 }
307
308 if entries.is_empty() {
309 return Err(YdlError::SubtitleParsing {
310 message: "No valid XML transcript entries found".to_string(),
311 });
312 }
313
314 Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Raw))
315 }
316
317 fn parse_plain_text_content(
319 &self,
320 content: &str,
321 language: &str,
322 ) -> YdlResult<ParsedSubtitles> {
323 let lines: Vec<&str> = content.lines().filter(|l| !l.trim().is_empty()).collect();
325
326 if lines.is_empty() {
327 return Err(YdlError::SubtitleParsing {
328 message: "No content found in plain text".to_string(),
329 });
330 }
331
332 let mut entries = Vec::new();
333 let avg_duration = Duration::from_secs(3); for (i, line) in lines.iter().enumerate() {
336 let start = Duration::from_secs((i as u64) * 3);
337 let end = start + avg_duration;
338
339 entries.push(SubtitleEntry::new(start, end, line.to_string()));
340 }
341
342 Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Txt))
343 }
344
345 fn parse_srt_time(
347 &self,
348 captures: ®ex::Captures,
349 start_group: usize,
350 ) -> YdlResult<Duration> {
351 let hours: u64 = captures
352 .get(start_group)
353 .unwrap()
354 .as_str()
355 .parse()
356 .map_err(|_| YdlError::SubtitleParsing {
357 message: "Invalid SRT hour format".to_string(),
358 })?;
359 let minutes: u64 = captures
360 .get(start_group + 1)
361 .unwrap()
362 .as_str()
363 .parse()
364 .map_err(|_| YdlError::SubtitleParsing {
365 message: "Invalid SRT minute format".to_string(),
366 })?;
367 let seconds: u64 = captures
368 .get(start_group + 2)
369 .unwrap()
370 .as_str()
371 .parse()
372 .map_err(|_| YdlError::SubtitleParsing {
373 message: "Invalid SRT second format".to_string(),
374 })?;
375 let millis: u64 = captures
376 .get(start_group + 3)
377 .unwrap()
378 .as_str()
379 .parse()
380 .map_err(|_| YdlError::SubtitleParsing {
381 message: "Invalid SRT millisecond format".to_string(),
382 })?;
383
384 Ok(Duration::from_millis(
385 hours * 3_600_000 + minutes * 60_000 + seconds * 1000 + millis,
386 ))
387 }
388
389 fn parse_vtt_time(
391 &self,
392 captures: ®ex::Captures,
393 start_group: usize,
394 ) -> YdlResult<Duration> {
395 let hours: u64 = captures
396 .get(start_group)
397 .unwrap()
398 .as_str()
399 .parse()
400 .map_err(|_| YdlError::SubtitleParsing {
401 message: "Invalid VTT hour format".to_string(),
402 })?;
403 let minutes: u64 = captures
404 .get(start_group + 1)
405 .unwrap()
406 .as_str()
407 .parse()
408 .map_err(|_| YdlError::SubtitleParsing {
409 message: "Invalid VTT minute format".to_string(),
410 })?;
411 let seconds: u64 = captures
412 .get(start_group + 2)
413 .unwrap()
414 .as_str()
415 .parse()
416 .map_err(|_| YdlError::SubtitleParsing {
417 message: "Invalid VTT second format".to_string(),
418 })?;
419 let millis: u64 = captures
420 .get(start_group + 3)
421 .unwrap()
422 .as_str()
423 .parse()
424 .map_err(|_| YdlError::SubtitleParsing {
425 message: "Invalid VTT millisecond format".to_string(),
426 })?;
427
428 Ok(Duration::from_millis(
429 hours * 3_600_000 + minutes * 60_000 + seconds * 1000 + millis,
430 ))
431 }
432
433 fn clean_subtitle_entries(&self, entries: Vec<SubtitleEntry>) -> Vec<SubtitleEntry> {
435 entries
436 .into_iter()
437 .map(|mut entry| {
438 entry.text = self.html_tag_regex.replace_all(&entry.text, "").to_string();
440
441 entry.text = entry.text.split_whitespace().collect::<Vec<_>>().join(" ");
443
444 entry.text = entry
446 .text
447 .replace("<", "<")
448 .replace(">", ">")
449 .replace("&", "&")
450 .replace(""", "\"")
451 .replace("'", "'");
452
453 entry
454 })
455 .collect()
456 }
457
458 fn validate_timing(&self, entries: &[SubtitleEntry]) -> YdlResult<()> {
460 if entries.is_empty() {
461 return Ok(());
462 }
463
464 let mut prev_end = Duration::from_secs(0);
465
466 for (i, entry) in entries.iter().enumerate() {
467 if entry.start >= entry.end {
469 return Err(YdlError::SubtitleParsing {
470 message: format!("Invalid timing at entry {}: start >= end", i + 1),
471 });
472 }
473
474 let duration = entry.duration();
476 if duration < Duration::from_millis(100) {
477 warn!(
478 "Very short subtitle duration at entry {}: {:?}",
479 i + 1,
480 duration
481 );
482 } else if duration > Duration::from_secs(30) {
483 warn!(
484 "Very long subtitle duration at entry {}: {:?}",
485 i + 1,
486 duration
487 );
488 }
489
490 if entry.start < prev_end {
492 warn!("Overlapping subtitles at entry {}", i + 1);
493 }
494
495 prev_end = entry.end;
496 }
497
498 Ok(())
499 }
500
501 fn convert_to_format(
503 &self,
504 entries: &[SubtitleEntry],
505 format: SubtitleType,
506 language: &str,
507 ) -> YdlResult<String> {
508 match format {
509 SubtitleType::Srt => self.to_srt_format(entries),
510 SubtitleType::Vtt => self.to_vtt_format(entries),
511 SubtitleType::Txt => self.to_txt_format(entries),
512 SubtitleType::Json => self.to_json_format(entries, language),
513 SubtitleType::Raw => {
514 if entries.is_empty() {
516 Ok(String::new())
517 } else {
518 self.to_srt_format(entries) }
520 }
521 }
522 }
523
524 fn to_srt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
526 let mut result = String::new();
527
528 for (i, entry) in entries.iter().enumerate() {
529 result.push_str(&format!("{}\n", i + 1));
530 result.push_str(&format!(
531 "{} --> {}\n",
532 entry.start_as_srt(),
533 entry.end_as_srt()
534 ));
535 result.push_str(&entry.text);
536 result.push_str("\n\n");
537 }
538
539 Ok(result)
540 }
541
542 fn to_vtt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
544 let mut result = String::from("WEBVTT\n\n");
545
546 for entry in entries {
547 result.push_str(&format!(
548 "{} --> {}\n",
549 entry.start_as_vtt(),
550 entry.end_as_vtt()
551 ));
552 result.push_str(&entry.text);
553 result.push_str("\n\n");
554 }
555
556 Ok(result)
557 }
558
559 fn to_txt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
561 let texts: Vec<String> = entries.iter().map(|e| e.text.clone()).collect();
562 Ok(texts.join("\n"))
563 }
564
565 fn to_json_format(&self, entries: &[SubtitleEntry], language: &str) -> YdlResult<String> {
567 let json_entries: Vec<serde_json::Value> = entries
568 .iter()
569 .map(|entry| {
570 serde_json::json!({
571 "start": entry.start.as_secs_f64(),
572 "end": entry.end.as_secs_f64(),
573 "text": entry.text
574 })
575 })
576 .collect();
577
578 let result = serde_json::json!({
579 "language": language,
580 "entries": json_entries
581 });
582
583 serde_json::to_string_pretty(&result).map_err(YdlError::from)
584 }
585}
586
/// Minimal HTML entity decoding for subtitle text.
mod html_escape {
    /// Entities understood by [`decode_html_entities`] and their replacements.
    const ENTITIES: [(&str, &str); 7] = [
        ("&amp;", "&"),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", "\""),
        ("&#x27;", "'"),
        ("&#39;", "'"),
        ("&apos;", "'"),
    ];

    /// Decodes the basic HTML entities in `text`.
    ///
    /// The input is scanned once from left to right, so every entity is
    /// decoded exactly one time: `&amp;lt;` becomes the literal text `&lt;`,
    /// not `<`. Ampersands that do not start a known entity are kept as-is.
    ///
    /// Returns a borrowed value when `text` contains no `&` at all, and an
    /// owned string otherwise.
    pub fn decode_html_entities(text: &str) -> std::borrow::Cow<'_, str> {
        if !text.contains('&') {
            return std::borrow::Cow::Borrowed(text);
        }

        let mut decoded = String::with_capacity(text.len());
        let mut rest = text;

        while let Some(pos) = rest.find('&') {
            decoded.push_str(&rest[..pos]);
            let tail = &rest[pos..];

            match ENTITIES
                .iter()
                .find(|(entity, _)| tail.starts_with(*entity))
            {
                Some((entity, replacement)) => {
                    decoded.push_str(replacement);
                    rest = &tail[entity.len()..];
                }
                None => {
                    // A bare ampersand: emit it and continue after it.
                    decoded.push('&');
                    rest = &tail[1..];
                }
            }
        }

        decoded.push_str(rest);
        std::borrow::Cow::Owned(decoded)
    }
}
603
#[cfg(test)]
mod tests {
    use super::*;

    /// Fresh processor for each test.
    fn test_processor() -> ContentProcessor {
        ContentProcessor::new()
    }

    /// A single cue running from `start_secs` to `end_secs`.
    fn cue(start_secs: u64, end_secs: u64, text: &str) -> SubtitleEntry {
        SubtitleEntry::new(
            Duration::from_secs(start_secs),
            Duration::from_secs(end_secs),
            text.to_string(),
        )
    }

    #[test]
    fn test_parse_srt_content() {
        let srt_content = r"1
00:00:01,000 --> 00:00:03,000
Hello, world!

2
00:00:04,000 --> 00:00:06,000
This is a test.
";

        let parsed = test_processor()
            .parse_srt_content(srt_content, "en")
            .expect("SRT content should parse");

        assert_eq!(parsed.entries.len(), 2);
        assert_eq!(parsed.entries[0].text, "Hello, world!");
        assert_eq!(parsed.entries[1].text, "This is a test.");
    }

    #[test]
    fn test_parse_vtt_content() {
        let vtt_content = r"WEBVTT

00:00:01.000 --> 00:00:03.000
Hello, world!

00:00:04.000 --> 00:00:06.000
This is a test.
";

        let parsed = test_processor()
            .parse_vtt_content(vtt_content, "en")
            .expect("VTT content should parse");

        assert_eq!(parsed.entries.len(), 2);
        assert_eq!(parsed.entries[0].text, "Hello, world!");
        assert_eq!(parsed.entries[1].text, "This is a test.");
    }

    #[test]
    fn test_convert_to_srt() {
        let entries = vec![cue(1, 3, "Hello, world!")];

        let srt = test_processor()
            .to_srt_format(&entries)
            .expect("SRT rendering should succeed");

        assert!(srt.contains("1\n"));
        assert!(srt.contains("00:00:01,000 --> 00:00:03,000"));
        assert!(srt.contains("Hello, world!"));
    }

    #[test]
    fn test_convert_to_vtt() {
        let entries = vec![cue(1, 3, "Hello, world!")];

        let vtt = test_processor()
            .to_vtt_format(&entries)
            .expect("VTT rendering should succeed");

        assert!(vtt.starts_with("WEBVTT"));
        assert!(vtt.contains("00:00:01.000 --> 00:00:03.000"));
        assert!(vtt.contains("Hello, world!"));
    }

    #[test]
    fn test_convert_to_txt() {
        let entries = vec![cue(1, 3, "Hello, world!"), cue(4, 6, "This is a test.")];

        let txt = test_processor()
            .to_txt_format(&entries)
            .expect("TXT rendering should succeed");

        assert_eq!(txt, "Hello, world!\nThis is a test.");
    }

    #[test]
    fn test_clean_subtitle_entries() {
        let entries = vec![cue(1, 3, "<b>Hello</b>, &amp; world!")];

        let cleaned = test_processor().clean_subtitle_entries(entries);

        assert_eq!(cleaned[0].text, "Hello, & world!");
    }

    #[test]
    fn test_validate_timing() {
        let processor = test_processor();

        // Well-ordered, non-overlapping cues pass.
        let valid_entries = vec![cue(1, 3, "Test"), cue(4, 6, "Test")];
        assert!(processor.validate_timing(&valid_entries).is_ok());

        // A cue that ends before it starts is rejected.
        let invalid_entries = vec![cue(3, 1, "Test")];
        assert!(processor.validate_timing(&invalid_entries).is_err());
    }

    #[test]
    fn test_parse_youtube_xml() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="1.5" dur="2.5">Hello world</text>
<text start="4.0" dur="3.0">This is a test</text>
</transcript>"#;

        let parsed = test_processor()
            .parse_youtube_xml_content(xml_content, "en")
            .expect("XML transcript should parse");

        assert_eq!(parsed.entries.len(), 2);
        assert_eq!(parsed.entries[0].text, "Hello world");
        assert_eq!(parsed.entries[1].text, "This is a test");
    }
}