ydl/
processor.rs

1use crate::error::{YdlError, YdlResult};
2use crate::types::{ParsedSubtitles, SubtitleEntry, SubtitleType};
3use encoding_rs::UTF_8;
4use regex::Regex;
5use std::time::Duration;
6use tracing::{debug, warn};
7
8/// Content processor for parsing and converting subtitle formats
9pub struct ContentProcessor {
10    /// Regex for parsing SRT timestamps
11    srt_time_regex: Regex,
12    /// Regex for parsing VTT timestamps
13    vtt_time_regex: Regex,
14    /// Regex for cleaning HTML tags
15    html_tag_regex: Regex,
16}
17
18impl Default for ContentProcessor {
19    fn default() -> Self {
20        Self::new()
21    }
22}
23
24impl ContentProcessor {
25    /// Create a new content processor
26    pub fn new() -> Self {
27        let srt_time_regex =
28            Regex::new(r"(\d{2}):(\d{2}):(\d{2}),(\d{3}) --> (\d{2}):(\d{2}):(\d{2}),(\d{3})")
29                .expect("Valid SRT time regex");
30
31        let vtt_time_regex =
32            Regex::new(r"(\d{2}):(\d{2}):(\d{2})\.(\d{3}) --> (\d{2}):(\d{2}):(\d{2})\.(\d{3})")
33                .expect("Valid VTT time regex");
34
35        let html_tag_regex = Regex::new(r"<[^>]*>").expect("Valid HTML tag regex");
36
37        Self {
38            srt_time_regex,
39            vtt_time_regex,
40            html_tag_regex,
41        }
42    }
43
44    /// Process raw subtitle content and convert to the desired format
45    pub fn process_content(
46        &self,
47        raw_content: &str,
48        target_format: SubtitleType,
49        language: &str,
50        clean_content: bool,
51        validate_timing: bool,
52    ) -> YdlResult<String> {
53        debug!(
54            "Processing subtitle content, target format: {:?}",
55            target_format
56        );
57
58        // First, detect encoding and convert to UTF-8 if needed
59        let content = self.ensure_utf8(raw_content)?;
60
61        // Parse the content to determine the source format and extract entries
62        let parsed = self.parse_subtitle_content(&content, language)?;
63
64        // Validate timing if requested
65        if validate_timing {
66            self.validate_timing(&parsed.entries)?;
67        }
68
69        // Clean content if requested
70        let entries = if clean_content {
71            self.clean_subtitle_entries(parsed.entries)
72        } else {
73            parsed.entries
74        };
75
76        // Convert to target format
77        self.convert_to_format(&entries, target_format, language)
78    }
79
80    /// Ensure content is valid UTF-8
81    fn ensure_utf8(&self, content: &str) -> YdlResult<String> {
82        // Try to detect encoding if not UTF-8
83        let (decoded, _encoding_used, had_errors) = UTF_8.decode(content.as_bytes());
84
85        if had_errors {
86            warn!("Encoding errors detected, attempting to fix");
87            // Try common encodings for subtitles
88            let encodings = [
89                encoding_rs::WINDOWS_1252,
90                encoding_rs::ISO_8859_2,
91                encoding_rs::UTF_16LE,
92                encoding_rs::UTF_16BE,
93            ];
94
95            for encoding in &encodings {
96                let (decoded, _, had_errors) = encoding.decode(content.as_bytes());
97                if !had_errors {
98                    debug!("Successfully decoded using {:?}", encoding.name());
99                    return Ok(decoded.to_string());
100                }
101            }
102
103            // If all else fails, use the UTF-8 decode with replacement chars
104            Ok(decoded.to_string())
105        } else {
106            Ok(content.to_string())
107        }
108    }
109
110    /// Parse subtitle content and determine format
111    fn parse_subtitle_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
112        debug!("Parsing subtitle content, {} bytes", content.len());
113
114        // Try different parsers based on content characteristics
115        if content.contains("WEBVTT") {
116            self.parse_vtt_content(content, language)
117        } else if content.contains("<?xml") || content.contains("<transcript") {
118            self.parse_youtube_xml_content(content, language)
119        } else if self.srt_time_regex.is_match(content) {
120            self.parse_srt_content(content, language)
121        } else if content.contains("-->") {
122            // Might be VTT without header
123            self.parse_vtt_content(content, language)
124        } else {
125            // Try to parse as plain text with timing info
126            self.parse_plain_text_content(content, language)
127        }
128    }
129
130    /// Parse SRT format content
131    fn parse_srt_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
132        let mut entries = Vec::new();
133        let blocks = content.split("\n\n");
134
135        for block in blocks {
136            let block = block.trim();
137            if block.is_empty() {
138                continue;
139            }
140
141            let lines: Vec<&str> = block.lines().collect();
142            if lines.len() < 3 {
143                continue;
144            }
145
146            // Skip sequence number (first line)
147            let timing_line = lines[1];
148            let text_lines = &lines[2..];
149
150            if let Some(captures) = self.srt_time_regex.captures(timing_line) {
151                let start = self.parse_srt_time(&captures, 1)?;
152                let end = self.parse_srt_time(&captures, 5)?;
153                let text = text_lines.join("\n");
154
155                entries.push(SubtitleEntry::new(start, end, text));
156            }
157        }
158
159        if entries.is_empty() {
160            return Err(YdlError::SubtitleParsing {
161                message: "No valid SRT entries found".to_string(),
162            });
163        }
164
165        Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Srt))
166    }
167
168    /// Parse VTT format content
169    fn parse_vtt_content(&self, content: &str, language: &str) -> YdlResult<ParsedSubtitles> {
170        let mut entries = Vec::new();
171        let lines: Vec<&str> = content.lines().collect();
172        let mut i = 0;
173
174        // Skip WEBVTT header and metadata
175        while i < lines.len() {
176            let line = lines[i].trim();
177            if line.is_empty() || line.starts_with("WEBVTT") || line.starts_with("NOTE") {
178                i += 1;
179                continue;
180            }
181            break;
182        }
183
184        // Parse cue blocks
185        while i < lines.len() {
186            let line = lines[i].trim();
187
188            if line.is_empty() {
189                i += 1;
190                continue;
191            }
192
193            // Check if this line contains timing
194            if let Some(captures) = self.vtt_time_regex.captures(line) {
195                let start = self.parse_vtt_time(&captures, 1)?;
196                let end = self.parse_vtt_time(&captures, 5)?;
197
198                // Collect text lines
199                i += 1;
200                let mut text_lines = Vec::new();
201                while i < lines.len() && !lines[i].trim().is_empty() {
202                    text_lines.push(lines[i]);
203                    i += 1;
204                }
205
206                let text = text_lines.join("\n");
207                entries.push(SubtitleEntry::new(start, end, text));
208            } else {
209                // Skip cue identifier line
210                i += 1;
211            }
212        }
213
214        if entries.is_empty() {
215            return Err(YdlError::SubtitleParsing {
216                message: "No valid VTT entries found".to_string(),
217            });
218        }
219
220        Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Vtt))
221    }
222
223    /// Parse YouTube XML transcript format
224    fn parse_youtube_xml_content(
225        &self,
226        content: &str,
227        language: &str,
228    ) -> YdlResult<ParsedSubtitles> {
229        let mut entries = Vec::new();
230
231        // Try the newer srv3 format first (uses <p> tags)
232        let p_regex =
233            Regex::new(r#"<p\s+t="(\d+)"(?:\s+d="(\d+)")?[^>]*>(.*?)</p>"#).map_err(|e| {
234                YdlError::SubtitleParsing {
235                    message: format!("Invalid XML regex: {}", e),
236                }
237            })?;
238
239        let s_regex =
240            Regex::new(r"<s[^>]*>([^<]*)</s>").map_err(|e| YdlError::SubtitleParsing {
241                message: format!("Invalid s tag regex: {}", e),
242            })?;
243
244        for captures in p_regex.captures_iter(content) {
245            let start_str = captures.get(1).unwrap().as_str();
246            let duration_str = captures.get(2).map(|m| m.as_str()).unwrap_or("1000");
247            let inner_content = captures.get(3).unwrap().as_str();
248
249            // Parse start time (in milliseconds for srv3 format)
250            let start_ms: u64 = start_str.parse().unwrap_or(0);
251            let duration_ms: u64 = duration_str.parse().unwrap_or(1000);
252
253            let start = Duration::from_millis(start_ms);
254            let end = Duration::from_millis(start_ms + duration_ms);
255
256            // Extract text from <s> tags or use the inner content directly
257            let text = if inner_content.contains("<s") {
258                let mut words = Vec::new();
259                for s_capture in s_regex.captures_iter(inner_content) {
260                    if let Some(word) = s_capture.get(1) {
261                        words.push(word.as_str());
262                    }
263                }
264                words.join("")
265            } else {
266                inner_content.to_string()
267            };
268
269            // Decode HTML entities
270            let decoded_text = html_escape::decode_html_entities(&text)
271                .to_string()
272                .trim()
273                .to_string();
274
275            // Skip empty entries
276            if !decoded_text.is_empty() {
277                entries.push(SubtitleEntry::new(start, end, decoded_text));
278            }
279        }
280
281        // If no <p> tags found, try the older <text> format
282        if entries.is_empty() {
283            let text_regex =
284                Regex::new(r#"<text start="([^"]+)"(?:\s+dur="([^"]+)")?>([^<]*)</text>"#)
285                    .map_err(|e| YdlError::SubtitleParsing {
286                        message: format!("Invalid XML regex: {}", e),
287                    })?;
288
289            for captures in text_regex.captures_iter(content) {
290                let start_str = captures.get(1).unwrap().as_str();
291                let duration_str = captures.get(2).map(|m| m.as_str()).unwrap_or("1");
292                let text = captures.get(3).unwrap().as_str();
293
294                // Parse start time (usually in seconds as float)
295                let start_secs: f64 = start_str.parse().unwrap_or(0.0);
296                let duration_secs: f64 = duration_str.parse().unwrap_or(1.0);
297
298                let start = Duration::from_secs_f64(start_secs);
299                let end = Duration::from_secs_f64(start_secs + duration_secs);
300
301                // Decode HTML entities
302                let decoded_text = html_escape::decode_html_entities(text).to_string();
303
304                entries.push(SubtitleEntry::new(start, end, decoded_text));
305            }
306        }
307
308        if entries.is_empty() {
309            return Err(YdlError::SubtitleParsing {
310                message: "No valid XML transcript entries found".to_string(),
311            });
312        }
313
314        Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Raw))
315    }
316
317    /// Parse plain text with minimal timing information
318    fn parse_plain_text_content(
319        &self,
320        content: &str,
321        language: &str,
322    ) -> YdlResult<ParsedSubtitles> {
323        // For plain text, create artificial timing
324        let lines: Vec<&str> = content.lines().filter(|l| !l.trim().is_empty()).collect();
325
326        if lines.is_empty() {
327            return Err(YdlError::SubtitleParsing {
328                message: "No content found in plain text".to_string(),
329            });
330        }
331
332        let mut entries = Vec::new();
333        let avg_duration = Duration::from_secs(3); // 3 seconds per line
334
335        for (i, line) in lines.iter().enumerate() {
336            let start = Duration::from_secs((i as u64) * 3);
337            let end = start + avg_duration;
338
339            entries.push(SubtitleEntry::new(start, end, line.to_string()));
340        }
341
342        Ok(ParsedSubtitles::new(entries, language.to_string()).with_format(SubtitleType::Txt))
343    }
344
345    /// Parse SRT timestamp from regex captures
346    fn parse_srt_time(
347        &self,
348        captures: &regex::Captures,
349        start_group: usize,
350    ) -> YdlResult<Duration> {
351        let hours: u64 = captures
352            .get(start_group)
353            .unwrap()
354            .as_str()
355            .parse()
356            .map_err(|_| YdlError::SubtitleParsing {
357                message: "Invalid SRT hour format".to_string(),
358            })?;
359        let minutes: u64 = captures
360            .get(start_group + 1)
361            .unwrap()
362            .as_str()
363            .parse()
364            .map_err(|_| YdlError::SubtitleParsing {
365                message: "Invalid SRT minute format".to_string(),
366            })?;
367        let seconds: u64 = captures
368            .get(start_group + 2)
369            .unwrap()
370            .as_str()
371            .parse()
372            .map_err(|_| YdlError::SubtitleParsing {
373                message: "Invalid SRT second format".to_string(),
374            })?;
375        let millis: u64 = captures
376            .get(start_group + 3)
377            .unwrap()
378            .as_str()
379            .parse()
380            .map_err(|_| YdlError::SubtitleParsing {
381                message: "Invalid SRT millisecond format".to_string(),
382            })?;
383
384        Ok(Duration::from_millis(
385            hours * 3_600_000 + minutes * 60_000 + seconds * 1000 + millis,
386        ))
387    }
388
389    /// Parse VTT timestamp from regex captures
390    fn parse_vtt_time(
391        &self,
392        captures: &regex::Captures,
393        start_group: usize,
394    ) -> YdlResult<Duration> {
395        let hours: u64 = captures
396            .get(start_group)
397            .unwrap()
398            .as_str()
399            .parse()
400            .map_err(|_| YdlError::SubtitleParsing {
401                message: "Invalid VTT hour format".to_string(),
402            })?;
403        let minutes: u64 = captures
404            .get(start_group + 1)
405            .unwrap()
406            .as_str()
407            .parse()
408            .map_err(|_| YdlError::SubtitleParsing {
409                message: "Invalid VTT minute format".to_string(),
410            })?;
411        let seconds: u64 = captures
412            .get(start_group + 2)
413            .unwrap()
414            .as_str()
415            .parse()
416            .map_err(|_| YdlError::SubtitleParsing {
417                message: "Invalid VTT second format".to_string(),
418            })?;
419        let millis: u64 = captures
420            .get(start_group + 3)
421            .unwrap()
422            .as_str()
423            .parse()
424            .map_err(|_| YdlError::SubtitleParsing {
425                message: "Invalid VTT millisecond format".to_string(),
426            })?;
427
428        Ok(Duration::from_millis(
429            hours * 3_600_000 + minutes * 60_000 + seconds * 1000 + millis,
430        ))
431    }
432
433    /// Clean subtitle entries by removing HTML tags and normalizing text
434    fn clean_subtitle_entries(&self, entries: Vec<SubtitleEntry>) -> Vec<SubtitleEntry> {
435        entries
436            .into_iter()
437            .map(|mut entry| {
438                // Remove HTML tags
439                entry.text = self.html_tag_regex.replace_all(&entry.text, "").to_string();
440
441                // Normalize whitespace
442                entry.text = entry.text.split_whitespace().collect::<Vec<_>>().join(" ");
443
444                // Remove common subtitle formatting
445                entry.text = entry
446                    .text
447                    .replace("&lt;", "<")
448                    .replace("&gt;", ">")
449                    .replace("&amp;", "&")
450                    .replace("&quot;", "\"")
451                    .replace("&#39;", "'");
452
453                entry
454            })
455            .collect()
456    }
457
458    /// Validate timing consistency
459    fn validate_timing(&self, entries: &[SubtitleEntry]) -> YdlResult<()> {
460        if entries.is_empty() {
461            return Ok(());
462        }
463
464        let mut prev_end = Duration::from_secs(0);
465
466        for (i, entry) in entries.iter().enumerate() {
467            // Check that start < end
468            if entry.start >= entry.end {
469                return Err(YdlError::SubtitleParsing {
470                    message: format!("Invalid timing at entry {}: start >= end", i + 1),
471                });
472            }
473
474            // Check for reasonable duration (not too short or too long)
475            let duration = entry.duration();
476            if duration < Duration::from_millis(100) {
477                warn!(
478                    "Very short subtitle duration at entry {}: {:?}",
479                    i + 1,
480                    duration
481                );
482            } else if duration > Duration::from_secs(30) {
483                warn!(
484                    "Very long subtitle duration at entry {}: {:?}",
485                    i + 1,
486                    duration
487                );
488            }
489
490            // Check for overlaps or gaps (warning only)
491            if entry.start < prev_end {
492                warn!("Overlapping subtitles at entry {}", i + 1);
493            }
494
495            prev_end = entry.end;
496        }
497
498        Ok(())
499    }
500
501    /// Convert subtitle entries to target format
502    fn convert_to_format(
503        &self,
504        entries: &[SubtitleEntry],
505        format: SubtitleType,
506        language: &str,
507    ) -> YdlResult<String> {
508        match format {
509            SubtitleType::Srt => self.to_srt_format(entries),
510            SubtitleType::Vtt => self.to_vtt_format(entries),
511            SubtitleType::Txt => self.to_txt_format(entries),
512            SubtitleType::Json => self.to_json_format(entries, language),
513            SubtitleType::Raw => {
514                // For raw format, return as is if we have entries
515                if entries.is_empty() {
516                    Ok(String::new())
517                } else {
518                    self.to_srt_format(entries) // Default to SRT for raw
519                }
520            }
521        }
522    }
523
524    /// Convert to SRT format
525    fn to_srt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
526        let mut result = String::new();
527
528        for (i, entry) in entries.iter().enumerate() {
529            result.push_str(&format!("{}\n", i + 1));
530            result.push_str(&format!(
531                "{} --> {}\n",
532                entry.start_as_srt(),
533                entry.end_as_srt()
534            ));
535            result.push_str(&entry.text);
536            result.push_str("\n\n");
537        }
538
539        Ok(result)
540    }
541
542    /// Convert to VTT format
543    fn to_vtt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
544        let mut result = String::from("WEBVTT\n\n");
545
546        for entry in entries {
547            result.push_str(&format!(
548                "{} --> {}\n",
549                entry.start_as_vtt(),
550                entry.end_as_vtt()
551            ));
552            result.push_str(&entry.text);
553            result.push_str("\n\n");
554        }
555
556        Ok(result)
557    }
558
559    /// Convert to plain text format
560    fn to_txt_format(&self, entries: &[SubtitleEntry]) -> YdlResult<String> {
561        let texts: Vec<String> = entries.iter().map(|e| e.text.clone()).collect();
562        Ok(texts.join("\n"))
563    }
564
565    /// Convert to JSON format
566    fn to_json_format(&self, entries: &[SubtitleEntry], language: &str) -> YdlResult<String> {
567        let json_entries: Vec<serde_json::Value> = entries
568            .iter()
569            .map(|entry| {
570                serde_json::json!({
571                    "start": entry.start.as_secs_f64(),
572                    "end": entry.end.as_secs_f64(),
573                    "text": entry.text
574                })
575            })
576            .collect();
577
578        let result = serde_json::json!({
579            "language": language,
580            "entries": json_entries
581        });
582
583        serde_json::to_string_pretty(&result).map_err(YdlError::from)
584    }
585}
586
// Simple HTML entity decoder (subset of common entities)
mod html_escape {
    use std::borrow::Cow;

    /// The entities this decoder understands and the character each stands for.
    const ENTITIES: [(&str, char); 7] = [
        ("&amp;", '&'),
        ("&lt;", '<'),
        ("&gt;", '>'),
        ("&quot;", '"'),
        ("&#39;", '\''),
        ("&#x27;", '\''),
        ("&apos;", '\''),
    ];

    /// Decode the supported HTML entities in `text`.
    ///
    /// The input is scanned once from left to right, so every entity is
    /// decoded exactly once: `&amp;lt;` becomes the literal text `&lt;`, not
    /// `<`. Unknown entities and stray `&` characters are left untouched.
    /// The input is returned borrowed when it contains no `&` at all.
    pub fn decode_html_entities(text: &str) -> Cow<'_, str> {
        if !text.contains('&') {
            return Cow::Borrowed(text);
        }

        let mut decoded = String::with_capacity(text.len());
        let mut rest = text;

        while let Some(pos) = rest.find('&') {
            decoded.push_str(&rest[..pos]);
            let tail = &rest[pos..];

            match ENTITIES.iter().find(|(name, _)| tail.starts_with(*name)) {
                Some((name, replacement)) => {
                    decoded.push(*replacement);
                    rest = &tail[name.len()..];
                }
                None => {
                    // Not a known entity: keep the ampersand and move past it.
                    decoded.push('&');
                    rest = &tail[1..];
                }
            }
        }

        decoded.push_str(rest);
        Cow::Owned(decoded)
    }
}
603
#[cfg(test)]
mod tests {
    use super::*;

    /// Processor under test.
    fn test_processor() -> ContentProcessor {
        ContentProcessor::new()
    }

    /// Build an entry spanning whole seconds.
    fn entry(start_secs: u64, end_secs: u64, text: &str) -> SubtitleEntry {
        SubtitleEntry::new(
            Duration::from_secs(start_secs),
            Duration::from_secs(end_secs),
            text.to_string(),
        )
    }

    /// Collect the texts of the parsed entries in order.
    fn texts(parsed: &ParsedSubtitles) -> Vec<&str> {
        parsed.entries.iter().map(|e| e.text.as_str()).collect()
    }

    #[test]
    fn test_parse_srt_content() {
        let srt_content = r"1
00:00:01,000 --> 00:00:03,000
Hello, world!

2
00:00:04,000 --> 00:00:06,000
This is a test.
";

        let parsed = test_processor()
            .parse_srt_content(srt_content, "en")
            .expect("SRT content should parse");

        assert_eq!(texts(&parsed), ["Hello, world!", "This is a test."]);
    }

    #[test]
    fn test_parse_vtt_content() {
        let vtt_content = r"WEBVTT

00:00:01.000 --> 00:00:03.000
Hello, world!

00:00:04.000 --> 00:00:06.000
This is a test.
";

        let parsed = test_processor()
            .parse_vtt_content(vtt_content, "en")
            .expect("VTT content should parse");

        assert_eq!(texts(&parsed), ["Hello, world!", "This is a test."]);
    }

    #[test]
    fn test_convert_to_srt() {
        let cues = vec![entry(1, 3, "Hello, world!")];

        let srt = test_processor()
            .to_srt_format(&cues)
            .expect("SRT conversion should succeed");

        for expected in ["1\n", "00:00:01,000 --> 00:00:03,000", "Hello, world!"] {
            assert!(srt.contains(expected));
        }
    }

    #[test]
    fn test_convert_to_vtt() {
        let cues = vec![entry(1, 3, "Hello, world!")];

        let vtt = test_processor()
            .to_vtt_format(&cues)
            .expect("VTT conversion should succeed");

        assert!(vtt.starts_with("WEBVTT"));
        for expected in ["00:00:01.000 --> 00:00:03.000", "Hello, world!"] {
            assert!(vtt.contains(expected));
        }
    }

    #[test]
    fn test_convert_to_txt() {
        let cues = vec![entry(1, 3, "Hello, world!"), entry(4, 6, "This is a test.")];

        let txt = test_processor()
            .to_txt_format(&cues)
            .expect("text conversion should succeed");

        assert_eq!(txt, "Hello, world!\nThis is a test.");
    }

    #[test]
    fn test_clean_subtitle_entries() {
        let dirty = vec![entry(1, 3, "<b>Hello</b>, &amp; world!")];

        let cleaned = test_processor().clean_subtitle_entries(dirty);

        assert_eq!(cleaned[0].text, "Hello, & world!");
    }

    #[test]
    fn test_validate_timing() {
        let processor = test_processor();

        // Two well-formed, non-overlapping cues are accepted.
        let well_formed = vec![entry(1, 3, "Test"), entry(4, 6, "Test")];
        assert!(processor.validate_timing(&well_formed).is_ok());

        // A cue that ends before it starts is rejected.
        let reversed = vec![entry(3, 1, "Test")];
        assert!(processor.validate_timing(&reversed).is_err());
    }

    #[test]
    fn test_parse_youtube_xml() {
        let xml_content = r#"<?xml version="1.0" encoding="utf-8"?>
<transcript>
<text start="1.5" dur="2.5">Hello world</text>
<text start="4.0" dur="3.0">This is a test</text>
</transcript>"#;

        let parsed = test_processor()
            .parse_youtube_xml_content(xml_content, "en")
            .expect("XML transcript should parse");

        assert_eq!(texts(&parsed), ["Hello world", "This is a test"]);
    }
}