Skip to main content

batuta/oracle/coursera/
transcript.rs

1//! Transcript parsing for whisper-apr JSON and plain text formats
2
3use anyhow::{Context, Result};
4use std::path::Path;
5
6use super::types::{TranscriptInput, TranscriptSegment};
7
/// Whisper-apr JSON transcript format.
///
/// This is the top-level object `parse_transcript` tries to deserialize the
/// file content into before falling back to plain text.
#[derive(serde::Deserialize)]
struct WhisperTranscript {
    /// Full transcript text. Required: there is no serde default, so JSON
    /// without this key does not deserialize as a `WhisperTranscript`.
    text: String,
    /// Language code; filled in by `default_language()` when the key is absent.
    #[serde(default = "default_language")]
    language: String,
    /// Timed segments; an absent key yields an empty list.
    #[serde(default)]
    segments: Vec<WhisperSegment>,
}
17
/// One timed entry of the `segments` array in a whisper-apr transcript.
#[derive(serde::Deserialize)]
struct WhisperSegment {
    /// Segment start time (presumably seconds — confirm against whisper-apr output).
    start: f64,
    /// Segment end time, in the same unit as `start`.
    end: f64,
    /// Text of this segment.
    text: String,
    /// Token entries, accepted as arbitrary JSON values; an absent key yields
    /// an empty list. `parse_transcript` does not copy this field into its
    /// output, but a present key must still be a JSON array for the segment
    /// to deserialize.
    #[serde(default)]
    tokens: Vec<serde_json::Value>,
}
26
/// Language code substituted by serde when a whisper-apr transcript has no
/// `language` key.
fn default_language() -> String {
    String::from("en")
}
30
31/// Parse a transcript file, auto-detecting whisper-apr JSON vs plain text.
32pub fn parse_transcript(path: &Path) -> Result<TranscriptInput> {
33    let content = std::fs::read_to_string(path)
34        .with_context(|| format!("Failed to read transcript: {}", path.display()))?;
35
36    let source_path = path.display().to_string();
37
38    // Try JSON first
39    if let Ok(whisper) = serde_json::from_str::<WhisperTranscript>(&content) {
40        let segments = whisper
41            .segments
42            .into_iter()
43            .map(|s| TranscriptSegment { start: s.start, end: s.end, text: s.text })
44            .collect();
45
46        return Ok(TranscriptInput {
47            text: whisper.text,
48            language: whisper.language,
49            segments,
50            source_path,
51        });
52    }
53
54    // Fall back to plain text
55    Ok(TranscriptInput {
56        text: content,
57        language: "en".to_string(),
58        segments: Vec::new(),
59        source_path,
60    })
61}
62
63/// Parse all transcript files in a directory.
64pub fn parse_transcript_dir(dir: &Path) -> Result<Vec<TranscriptInput>> {
65    let mut transcripts = Vec::new();
66
67    let entries: Vec<_> = std::fs::read_dir(dir)
68        .with_context(|| format!("Failed to read directory: {}", dir.display()))?
69        .filter_map(|e| e.ok())
70        .collect();
71
72    for entry in entries {
73        let path = entry.path();
74        if path.is_file() {
75            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
76            if matches!(ext, "json" | "txt" | "md") {
77                match parse_transcript(&path) {
78                    Ok(t) => transcripts.push(t),
79                    Err(e) => {
80                        eprintln!("Warning: skipping {}: {}", path.display(), e);
81                    }
82                }
83            }
84        }
85    }
86
87    transcripts.sort_by(|a, b| a.source_path.cmp(&b.source_path));
88    Ok(transcripts)
89}
90
/// Render a time offset given in seconds as `M:SS`.
///
/// Minutes are not zero-padded and never roll over into hours (`3661.0`
/// becomes `"61:01"`); the seconds part is always two digits. Fractional
/// seconds are truncated. The float-to-integer casts saturate, so negative
/// and NaN inputs render as `"0:00"`.
pub fn format_timestamp(seconds: f64) -> String {
    let (whole_minutes, leftover_seconds) = ((seconds / 60.0) as u64, (seconds % 60.0) as u64);
    format!("{}:{:02}", whole_minutes, leftover_seconds)
}
97
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Create a named temporary file ending in `suffix` that holds `contents`.
    fn temp_file_with(suffix: &str, contents: &str) -> NamedTempFile {
        let mut file = NamedTempFile::with_suffix(suffix).expect("tempfile creation failed");
        write!(file, "{contents}").expect("write failed");
        file
    }

    /// A full whisper-apr document yields its text, language and segments.
    #[test]
    fn test_parse_whisper_json() {
        let file = temp_file_with(
            ".json",
            r#"{
            "text": "MLOps combines ML and DevOps practices.",
            "language": "en",
            "segments": [
                {"start": 0.0, "end": 3.5, "text": "MLOps combines ML", "tokens": []},
                {"start": 3.5, "end": 6.0, "text": "and DevOps practices.", "tokens": []}
            ]
        }"#,
        );

        let parsed = parse_transcript(file.path()).expect("unexpected failure");
        assert_eq!(parsed.language, "en");
        assert_eq!(parsed.segments.len(), 2);
        assert!(parsed.text.contains("MLOps"));

        let first_end = parsed.segments[0].end;
        assert!((first_end - 3.5).abs() < f64::EPSILON);
    }

    /// Non-JSON content falls back to plain text with no segments.
    #[test]
    fn test_parse_plain_text() {
        let file = temp_file_with(
            ".txt",
            "This is a plain text transcript about machine learning.",
        );

        let parsed = parse_transcript(file.path()).expect("unexpected failure");
        assert_eq!(parsed.language, "en");
        assert!(parsed.segments.is_empty());
        assert!(parsed.text.contains("machine learning"));
    }

    /// Only files with a transcript extension are picked up from a directory.
    #[test]
    fn test_parse_transcript_dir() {
        let root = tempfile::tempdir().expect("tempdir creation failed");

        // Two transcript files, one per supported flavour.
        std::fs::write(
            root.path().join("lesson1.json"),
            r#"{"text":"Lesson one content.","language":"en","segments":[]}"#,
        )
        .expect("unexpected failure");
        std::fs::write(root.path().join("lesson2.txt"), "Lesson two content.")
            .expect("fs write failed");

        // A file with an unrelated extension must be ignored.
        std::fs::write(root.path().join("notes.rs"), "fn main() {}").expect("fs write failed");

        let found = parse_transcript_dir(root.path()).expect("unexpected failure");
        assert_eq!(found.len(), 2);
    }

    /// A missing file is reported as an error.
    #[test]
    fn test_parse_nonexistent_file() {
        assert!(parse_transcript(Path::new("/nonexistent/file.json")).is_err());
    }

    /// Minutes are unpadded and do not roll over into hours.
    #[test]
    fn test_format_timestamp() {
        assert_eq!(format_timestamp(0.0), "0:00");
        assert_eq!(format_timestamp(65.0), "1:05");
        assert_eq!(format_timestamp(3661.0), "61:01");
    }

    /// A document without a `language` key gets the default language.
    #[test]
    fn test_parse_whisper_json_missing_language() {
        let file = temp_file_with(".json", r#"{"text": "Hello", "segments": []}"#);

        let parsed = parse_transcript(file.path()).expect("unexpected failure");
        assert_eq!(parsed.language, "en");
    }

    /// A missing directory is reported as an error.
    #[test]
    fn test_parse_transcript_dir_nonexistent() {
        assert!(parse_transcript_dir(Path::new("/nonexistent/dir")).is_err());
    }
}