// batuta/oracle/coursera/transcript.rs
use anyhow::{Context, Result};
use std::path::Path;

use super::types::{TranscriptInput, TranscriptSegment};
7
8#[derive(serde::Deserialize)]
10struct WhisperTranscript {
11 text: String,
12 #[serde(default = "default_language")]
13 language: String,
14 #[serde(default)]
15 segments: Vec<WhisperSegment>,
16}
17
18#[derive(serde::Deserialize)]
19struct WhisperSegment {
20 start: f64,
21 end: f64,
22 text: String,
23 #[serde(default)]
24 tokens: Vec<serde_json::Value>,
25}
26
/// Returns the language tag (`"en"`) used when a transcript does not name one.
fn default_language() -> String {
    "en".to_string()
}
30
31pub fn parse_transcript(path: &Path) -> Result<TranscriptInput> {
33 let content = std::fs::read_to_string(path)
34 .with_context(|| format!("Failed to read transcript: {}", path.display()))?;
35
36 let source_path = path.display().to_string();
37
38 if let Ok(whisper) = serde_json::from_str::<WhisperTranscript>(&content) {
40 let segments = whisper
41 .segments
42 .into_iter()
43 .map(|s| TranscriptSegment { start: s.start, end: s.end, text: s.text })
44 .collect();
45
46 return Ok(TranscriptInput {
47 text: whisper.text,
48 language: whisper.language,
49 segments,
50 source_path,
51 });
52 }
53
54 Ok(TranscriptInput {
56 text: content,
57 language: "en".to_string(),
58 segments: Vec::new(),
59 source_path,
60 })
61}
62
63pub fn parse_transcript_dir(dir: &Path) -> Result<Vec<TranscriptInput>> {
65 let mut transcripts = Vec::new();
66
67 let entries: Vec<_> = std::fs::read_dir(dir)
68 .with_context(|| format!("Failed to read directory: {}", dir.display()))?
69 .filter_map(|e| e.ok())
70 .collect();
71
72 for entry in entries {
73 let path = entry.path();
74 if path.is_file() {
75 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
76 if matches!(ext, "json" | "txt" | "md") {
77 match parse_transcript(&path) {
78 Ok(t) => transcripts.push(t),
79 Err(e) => {
80 eprintln!("Warning: skipping {}: {}", path.display(), e);
81 }
82 }
83 }
84 }
85 }
86
87 transcripts.sort_by(|a, b| a.source_path.cmp(&b.source_path));
88 Ok(transcripts)
89}
90
/// Formats a duration given in seconds as `M:SS`.
///
/// Minutes are not wrapped into hours (`3661.0` becomes `"61:01"`) and any
/// fractional part of a second is truncated. Negative and NaN inputs render as
/// `"0:00"` because float-to-integer `as` casts saturate at zero.
pub fn format_timestamp(seconds: f64) -> String {
    let mins = (seconds / 60.0) as u64;
    let secs = (seconds % 60.0) as u64;
    format!("{mins}:{secs:02}")
}
97
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// A well-formed Whisper document yields its text, language and segments.
    #[test]
    fn test_parse_whisper_json() {
        let json = r#"{
            "text": "MLOps combines ML and DevOps practices.",
            "language": "en",
            "segments": [
                {"start": 0.0, "end": 3.5, "text": "MLOps combines ML", "tokens": []},
                {"start": 3.5, "end": 6.0, "text": "and DevOps practices.", "tokens": []}
            ]
        }"#;

        let mut f = NamedTempFile::with_suffix(".json").expect("tempfile creation failed");
        write!(f, "{json}").expect("write failed");

        let transcript = parse_transcript(f.path()).expect("unexpected failure");
        assert_eq!(transcript.language, "en");
        assert_eq!(transcript.segments.len(), 2);
        assert!(transcript.text.contains("MLOps"));
        assert!((transcript.segments[0].end - 3.5).abs() < f64::EPSILON);
    }

    /// Non-JSON content falls back to a plain-text transcript without segments.
    #[test]
    fn test_parse_plain_text() {
        let text = "This is a plain text transcript about machine learning.";
        let mut f = NamedTempFile::with_suffix(".txt").expect("tempfile creation failed");
        write!(f, "{text}").expect("write failed");

        let transcript = parse_transcript(f.path()).expect("unexpected failure");
        assert_eq!(transcript.language, "en");
        assert!(transcript.segments.is_empty());
        assert!(transcript.text.contains("machine learning"));
    }

    /// Only supported extensions are picked up, and results come back sorted by path.
    #[test]
    fn test_parse_transcript_dir() {
        let dir = tempfile::tempdir().expect("tempdir creation failed");

        let json_path = dir.path().join("lesson1.json");
        std::fs::write(
            &json_path,
            r#"{"text":"Lesson one content.","language":"en","segments":[]}"#,
        )
        .expect("unexpected failure");

        let txt_path = dir.path().join("lesson2.txt");
        std::fs::write(&txt_path, "Lesson two content.").expect("fs write failed");

        // Unsupported extension: must be ignored by the directory scan.
        std::fs::write(dir.path().join("notes.rs"), "fn main() {}").expect("fs write failed");

        let transcripts = parse_transcript_dir(dir.path()).expect("unexpected failure");
        assert_eq!(transcripts.len(), 2);
        assert!(transcripts[0].source_path.ends_with("lesson1.json"));
        assert!(transcripts[1].source_path.ends_with("lesson2.txt"));
    }

    /// A missing file is reported as an error rather than an empty transcript.
    #[test]
    fn test_parse_nonexistent_file() {
        let result = parse_transcript(Path::new("/nonexistent/file.json"));
        assert!(result.is_err());
    }

    /// Minutes are not wrapped into hours.
    #[test]
    fn test_format_timestamp() {
        assert_eq!(format_timestamp(0.0), "0:00");
        assert_eq!(format_timestamp(65.0), "1:05");
        assert_eq!(format_timestamp(3661.0), "61:01");
    }

    /// A Whisper document without `language` gets the default language.
    #[test]
    fn test_parse_whisper_json_missing_language() {
        let json = r#"{"text": "Hello", "segments": []}"#;
        let mut f = NamedTempFile::with_suffix(".json").expect("tempfile creation failed");
        write!(f, "{json}").expect("write failed");

        let transcript = parse_transcript(f.path()).expect("unexpected failure");
        assert_eq!(transcript.language, "en");
    }

    /// A missing directory is an error, not an empty result.
    #[test]
    fn test_parse_transcript_dir_nonexistent() {
        let result = parse_transcript_dir(Path::new("/nonexistent/dir"));
        assert!(result.is_err());
    }
}