use clap::Parser;
use regex::Regex;
use std::fs;
use std::sync::LazyLock;
// Regexes are compiled once, on first use, through `LazyLock`. The `unwrap()`
// calls can only fail if one of the pattern literals below is itself invalid.
/// Matches a `v=` parameter introduced by `?` or `&` and captures the 11
/// ID characters (`a-z`, `A-Z`, `0-9`, `_`, `-`) that follow it.
static RE_VIDEO_ID_QUERY: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"[?&]v=([a-zA-Z0-9_-]{11})").unwrap());
/// Matches `youtu.be/` followed by 11 ID characters, which are captured.
static RE_VIDEO_ID_SHORT_URL: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"youtu\.be/([a-zA-Z0-9_-]{11})").unwrap());
/// Matches `shorts/` followed by 11 ID characters, which are captured.
static RE_VIDEO_ID_SHORTS: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"shorts/([a-zA-Z0-9_-]{11})").unwrap());
/// Matches a `"INNERTUBE_API_KEY": "<key>"` pair (optional whitespace after
/// the colon) and captures the key; used on the watch-page HTML.
static RE_INNERTUBE_API_KEY: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r#""INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)""#).unwrap());
/// Matches anything shaped like a markup tag (`<` ... `>`); used to strip
/// inline tags from caption text.
static RE_HTML_TAGS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"<[^>]+>").unwrap());
// Command-line arguments for `ytx`, parsed with clap's derive API.
//
// This header is deliberately a plain `//` comment: clap turns `///` doc
// comments into help text, and the program description is already supplied
// explicitly through `about` below. The `///` comments on the fields, on the
// other hand, are intentional — they become the per-argument `--help` text.
#[derive(Parser)]
#[command(name = "ytx")]
#[command(about = "Fetch YouTube video transcripts")]
struct Cli {
    /// YouTube URL (watch, youtu.be or shorts link) or a bare video ID
    url_or_id: String,
    /// Write the transcript to this file instead of standard output
    #[arg(short, long)]
    output: Option<String>,
    /// Prefix every line with its [MM:SS] or [HH:MM:SS] start time
    #[arg(short, long)]
    timestamps: bool,
    /// Preferred caption language code; another track is used if it is unavailable
    #[arg(short, long, default_value = "en")]
    lang: String,
}
/// Pulls the video ID out of `input`.
///
/// The patterns are tried in a fixed order — `v=` query parameter, then
/// `youtu.be/` link, then `shorts/` link — and the first capture wins. When
/// none of them matches, `input` is returned unchanged, so a bare ID (or any
/// other string) is passed through as-is.
fn extract_video_id(input: &str) -> String {
    RE_VIDEO_ID_QUERY
        .captures(input)
        // Later patterns are only consulted when the earlier ones found nothing.
        .or_else(|| RE_VIDEO_ID_SHORT_URL.captures(input))
        .or_else(|| RE_VIDEO_ID_SHORTS.captures(input))
        .map_or_else(|| input.to_string(), |found| found[1].to_string())
}
/// Renders a time offset as a bracketed clock string.
///
/// Fractional seconds are dropped. The result is `[MM:SS]` while the offset is
/// below one hour and `[HH:MM:SS]` from one hour upwards; every field is
/// zero-padded to at least two digits.
fn format_timestamp(seconds: f64) -> String {
    // A float-to-integer `as` cast truncates toward zero and saturates, so
    // negative and NaN inputs end up as 0.
    let whole = seconds as u64;
    let (hours, within_hour) = (whole / 3600, whole % 3600);
    let (mins, secs) = (within_hour / 60, within_hour % 60);
    match hours {
        0 => format!("[{mins:02}:{secs:02}]"),
        _ => format!("[{hours:02}:{mins:02}:{secs:02}]"),
    }
}
/// One caption entry produced by `parse_caption_xml`.
#[derive(Debug, PartialEq)]
struct Segment {
/// Caption text after entity decoding, tag stripping and trimming.
text: String,
/// Start offset in seconds; this is the value `main` hands to `format_timestamp`.
start: f64,
}
/// Finds the InnerTube API key embedded in a watch page.
///
/// # Errors
///
/// Returns an error message when `html` contains no
/// `"INNERTUBE_API_KEY": "<key>"` pair.
fn extract_api_key(html: &str) -> Result<String, String> {
    let key = RE_INNERTUBE_API_KEY
        .captures(html)
        .and_then(|caps| caps.get(1));
    match key {
        Some(found) => Ok(found.as_str().to_string()),
        None => Err("Could not extract INNERTUBE_API_KEY from page.".to_string()),
    }
}
fn find_caption_url(data: &serde_json::Value, lang: &str) -> Result<String, String> {
let status = data
.pointer("/playabilityStatus/status")
.and_then(|v| v.as_str())
.unwrap_or("UNKNOWN");
if status != "OK" {
let reason = data
.pointer("/playabilityStatus/reason")
.and_then(|v| v.as_str())
.unwrap_or("Unknown reason");
return Err(format!("Video not playable: {reason}"));
}
let tracks = data
.pointer("/captions/playerCaptionsTracklistRenderer/captionTracks")
.and_then(|v| v.as_array())
.ok_or("No captions available for this video.")?;
let track = tracks
.iter()
.find(|t| {
t.get("languageCode")
.and_then(|v| v.as_str())
.is_some_and(|code| code == lang)
})
.or_else(|| {
let prefix = lang.split('-').next().unwrap_or(lang);
tracks.iter().find(|t| {
t.get("languageCode")
.and_then(|v| v.as_str())
.is_some_and(|code| code.starts_with(prefix))
})
})
.or_else(|| tracks.first())
.ok_or("No caption tracks found.")?;
let base_url = track
.get("baseUrl")
.and_then(|v| v.as_str())
.ok_or("Caption track has no URL.")?;
if base_url.contains("&exp=xpe") {
return Err(
"Caption URL requires PoToken authentication. This video's captions may be restricted."
.to_string(),
);
}
Ok(base_url.to_string())
}
/// Downloads the caption track of `video_id` and parses it into segments.
///
/// The work happens in three HTTP round trips on one cookie-enabled client:
/// 1. fetch the watch page and extract the InnerTube API key from its HTML;
/// 2. POST to the InnerTube `player` endpoint and pick a caption track for
///    `lang` (see `find_caption_url`);
/// 3. download the caption XML and hand it to `parse_caption_xml`.
///
/// # Errors
///
/// Returns an error message when any request fails or answers with a 4xx/5xx
/// status, when the consent page is served instead of the watch page, when
/// the API key or a caption URL cannot be found, when the caption response
/// is empty, or when the caption XML cannot be parsed.
async fn fetch_transcript(video_id: &str, lang: &str) -> Result<Vec<Segment>, String> {
    let client = reqwest::Client::builder()
        .cookie_store(true)
        .build()
        .map_err(|e| format!("Failed to create HTTP client: {e}"))?;

    let page_url = format!("https://www.youtube.com/watch?v={video_id}");
    let html = client
        .get(&page_url)
        .header(
            "User-Agent",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        )
        .header("Accept-Language", "en-US,en;q=0.9")
        .send()
        .await
        // Surface HTTP errors here instead of letting an error page fall
        // through to the "could not extract key" path below.
        .and_then(|resp| resp.error_for_status())
        .map_err(|e| format!("Failed to fetch video page: {e}"))?
        .text()
        .await
        .map_err(|e| format!("Failed to read response: {e}"))?;
    if html.contains(r#"action="https://consent.youtube.com/s""#) {
        return Err("YouTube consent page detected. Try again or use a VPN.".to_string());
    }

    let api_key = extract_api_key(&html)?;
    let innertube_url = format!("https://www.youtube.com/youtubei/v1/player?key={api_key}");
    let payload = serde_json::json!({
        "context": {
            "client": {
                "clientName": "ANDROID",
                "clientVersion": "20.10.38"
            }
        },
        "videoId": video_id
    });
    let innertube_data: serde_json::Value = client
        .post(&innertube_url)
        // `.json()` serializes the body and sets `Content-Type: application/json`
        // itself; adding the header again would append a duplicate.
        .json(&payload)
        .header(
            "User-Agent",
            "com.google.android.youtube/20.10.38 (Linux; U; Android 11) gzip",
        )
        .send()
        .await
        .and_then(|resp| resp.error_for_status())
        .map_err(|e| format!("InnerTube request failed: {e}"))?
        .json()
        .await
        .map_err(|e| format!("Failed to parse InnerTube response: {e}"))?;

    let caption_url = find_caption_url(&innertube_data, lang)?;
    let xml = client
        .get(&caption_url)
        .send()
        .await
        .and_then(|resp| resp.error_for_status())
        .map_err(|e| format!("Failed to fetch captions: {e}"))?
        .text()
        .await
        .map_err(|e| format!("Failed to read captions: {e}"))?;
    if xml.is_empty() {
        return Err("Caption server returned empty response.".to_string());
    }
    parse_caption_xml(&xml)
}
/// Converts a caption XML document into a list of segments.
///
/// Every `<text>` or `<p>` element becomes one segment. Its start time is read
/// from the `t` attribute (divided by 1000) when that attribute is present,
/// otherwise from `start` (used as-is). Elements with neither attribute, with
/// an unparsable time, or with no text left after cleaning are skipped.
///
/// # Errors
///
/// Returns an error message when `xml` is not a parsable XML document.
fn parse_caption_xml(xml: &str) -> Result<Vec<Segment>, String> {
    let doc = match roxmltree::Document::parse(xml) {
        Ok(parsed) => parsed,
        Err(e) => return Err(format!("Failed to parse XML: {e}")),
    };

    let mut segments = Vec::new();
    for node in doc.descendants() {
        if !(node.has_tag_name("text") || node.has_tag_name("p")) {
            continue;
        }

        // `t` takes precedence; a malformed `t` does not fall back to `start`.
        let start: f64 = match (node.attribute("t"), node.attribute("start")) {
            (Some(raw_millis), _) => {
                let Ok(millis) = raw_millis.parse::<f64>() else {
                    continue;
                };
                millis / 1000.0
            }
            (None, Some(raw_secs)) => {
                let Ok(secs) = raw_secs.parse::<f64>() else {
                    continue;
                };
                secs
            }
            (None, None) => continue,
        };

        // Concatenate every text node below the element, nested ones included.
        let mut raw_text = String::new();
        for piece in node.descendants() {
            if !piece.is_text() {
                continue;
            }
            if let Some(chunk) = piece.text() {
                raw_text.push_str(chunk);
            }
        }

        // Entities are decoded first, so escaped markup is stripped as well.
        let decoded = html_escape::decode_html_entities(&raw_text);
        let without_tags = RE_HTML_TAGS.replace_all(&decoded, "");
        let cleaned = without_tags.trim();
        if cleaned.is_empty() {
            continue;
        }
        segments.push(Segment {
            text: cleaned.to_string(),
            start,
        });
    }
    Ok(segments)
}
/// Entry point: parse the arguments, fetch the transcript, then write it to
/// the requested file or print it to standard output.
///
/// Exits with status 1 when the transcript cannot be fetched or the output
/// file cannot be written. Diagnostics go to standard error.
#[tokio::main]
async fn main() {
    let cli = Cli::parse();
    let video_id = extract_video_id(&cli.url_or_id);

    let segments = fetch_transcript(&video_id, &cli.lang)
        .await
        .unwrap_or_else(|e| {
            eprintln!("Error: {e}");
            std::process::exit(1)
        });
    if segments.is_empty() {
        eprintln!("Warning: transcript is empty.");
    }

    // One output line per segment, optionally prefixed with its timestamp.
    let mut lines: Vec<String> = Vec::with_capacity(segments.len());
    for seg in &segments {
        let line = if cli.timestamps {
            format!("{} {}", format_timestamp(seg.start), seg.text)
        } else {
            seg.text.clone()
        };
        lines.push(line);
    }
    let text = lines.join("\n");

    match &cli.output {
        Some(output_path) => {
            if let Err(e) = fs::write(output_path, &text) {
                eprintln!("Failed to write file: {e}");
                std::process::exit(1);
            }
            eprintln!(
                "Transcript saved to {output_path} ({} segments)",
                segments.len()
            );
        }
        None => print!("{text}"),
    }
}
/// Unit tests for the pure helpers: ID extraction, timestamp formatting,
/// API-key extraction, caption-track selection and caption XML parsing.
/// Nothing here touches the network.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_id_from_watch_url() {
        assert_eq!(
            extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"),
            "dQw4w9WgXcQ"
        );
    }

    #[test]
    fn test_extract_id_from_watch_url_with_extra_params() {
        assert_eq!(
            extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=120"),
            "dQw4w9WgXcQ"
        );
    }

    #[test]
    fn test_extract_id_from_short_url() {
        assert_eq!(
            extract_video_id("https://youtu.be/dQw4w9WgXcQ"),
            "dQw4w9WgXcQ"
        );
    }

    #[test]
    fn test_extract_id_from_shorts_url() {
        assert_eq!(
            extract_video_id("https://www.youtube.com/shorts/dQw4w9WgXcQ"),
            "dQw4w9WgXcQ"
        );
    }

    #[test]
    fn test_extract_id_bare() {
        assert_eq!(extract_video_id("dQw4w9WgXcQ"), "dQw4w9WgXcQ");
    }

    #[test]
    fn test_extract_id_with_hyphens_underscores() {
        assert_eq!(extract_video_id("abc-_def12AB"), "abc-_def12AB");
    }

    // Input that matches no URL pattern is returned unchanged.
    #[test]
    fn test_extract_id_invalid_passthrough() {
        assert_eq!(extract_video_id("not-a-valid-id"), "not-a-valid-id");
    }

    #[test]
    fn test_timestamp_zero() {
        assert_eq!(format_timestamp(0.0), "[00:00]");
    }

    // Fractional seconds are truncated, not rounded.
    #[test]
    fn test_timestamp_seconds_only() {
        assert_eq!(format_timestamp(45.7), "[00:45]");
    }

    #[test]
    fn test_timestamp_minutes_and_seconds() {
        assert_eq!(format_timestamp(754.0), "[12:34]");
    }

    #[test]
    fn test_timestamp_with_hours() {
        assert_eq!(format_timestamp(3661.0), "[01:01:01]");
    }

    #[test]
    fn test_timestamp_large_hours() {
        assert_eq!(format_timestamp(36000.0), "[10:00:00]");
    }

    #[test]
    fn test_extract_api_key_found() {
        let html = r#"var stuff = {"INNERTUBE_API_KEY":"AIzaSyABC123_def"};"#;
        assert_eq!(extract_api_key(html).unwrap(), "AIzaSyABC123_def");
    }

    #[test]
    fn test_extract_api_key_missing() {
        assert!(extract_api_key("no key here").is_err());
    }

    /// Builds a minimal InnerTube player response with the given playability
    /// status and, optionally, caption tracks given as `(languageCode, baseUrl)`.
    fn make_innertube_response(
        status: &str,
        tracks: Option<Vec<(&str, &str)>>,
    ) -> serde_json::Value {
        let mut data = serde_json::json!({
            "playabilityStatus": { "status": status }
        });
        if let Some(tracks) = tracks {
            let track_array: Vec<serde_json::Value> = tracks
                .into_iter()
                .map(|(lang, url)| {
                    serde_json::json!({
                        "languageCode": lang,
                        "baseUrl": url
                    })
                })
                .collect();
            data["captions"] = serde_json::json!({
                "playerCaptionsTracklistRenderer": {
                    "captionTracks": track_array
                }
            });
        }
        data
    }

    #[test]
    fn test_find_caption_url_ok() {
        let data = make_innertube_response(
            "OK",
            Some(vec![("en", "https://example.com/captions?lang=en")]),
        );
        assert_eq!(
            find_caption_url(&data, "en").unwrap(),
            "https://example.com/captions?lang=en"
        );
    }

    #[test]
    fn test_find_caption_url_not_playable() {
        let data = make_innertube_response("ERROR", None);
        let err = find_caption_url(&data, "en").unwrap_err();
        assert!(err.contains("not playable"));
    }

    #[test]
    fn test_find_caption_url_no_captions() {
        let data = make_innertube_response("OK", None);
        let err = find_caption_url(&data, "en").unwrap_err();
        assert!(err.contains("No captions"));
    }

    // An exact match wins over list order; an unknown language falls back to
    // the first track.
    #[test]
    fn test_find_caption_url_lang_fallback() {
        let data = make_innertube_response(
            "OK",
            Some(vec![
                ("ja", "https://example.com/ja"),
                ("en", "https://example.com/en"),
            ]),
        );
        assert_eq!(
            find_caption_url(&data, "en").unwrap(),
            "https://example.com/en"
        );
        assert_eq!(
            find_caption_url(&data, "fr").unwrap(),
            "https://example.com/ja"
        );
    }

    #[test]
    fn test_find_caption_url_lang_prefix_match() {
        let data =
            make_innertube_response("OK", Some(vec![("en-US", "https://example.com/en-US")]));
        assert_eq!(
            find_caption_url(&data, "en").unwrap(),
            "https://example.com/en-US"
        );
    }

    #[test]
    fn test_find_caption_url_rejects_xpe() {
        let data = make_innertube_response(
            "OK",
            Some(vec![("en", "https://example.com/captions?lang=en&exp=xpe")]),
        );
        let err = find_caption_url(&data, "en").unwrap_err();
        assert!(err.contains("PoToken"));
    }

    #[test]
    fn test_parse_legacy_xml() {
        let xml = r#"<transcript><text start="0" dur="5">Hello world</text><text start="5" dur="3">Second line</text></transcript>"#;
        let segments = parse_caption_xml(xml).unwrap();
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].text, "Hello world");
        assert_eq!(segments[0].start, 0.0);
        assert_eq!(segments[1].text, "Second line");
        assert_eq!(segments[1].start, 5.0);
    }

    #[test]
    fn test_parse_srv3_xml() {
        let xml = r#"<timedtext><body><p t="0" d="5000">Hello</p><p t="5000" d="3000">World</p></body></timedtext>"#;
        let segments = parse_caption_xml(xml).unwrap();
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].text, "Hello");
        assert_eq!(segments[0].start, 0.0);
        assert_eq!(segments[1].text, "World");
        assert_eq!(segments[1].start, 5.0);
    }

    // The fixture is doubly escaped on purpose: the XML parser resolves the
    // outer `&amp;`, leaving `&amp;` / `&#39;` for the HTML-entity pass. A bare
    // `&` would not be well-formed XML and would fail before that pass runs.
    #[test]
    fn test_parse_xml_html_entities() {
        let xml = r#"<transcript><text start="0" dur="1">rock &amp;amp; roll it&amp;#39;s great</text></transcript>"#;
        let segments = parse_caption_xml(xml).unwrap();
        assert_eq!(segments[0].text, "rock & roll it's great");
    }

    #[test]
    fn test_parse_xml_strips_inner_tags() {
        let xml =
            r#"<transcript><text start="0" dur="1">Hello <b>bold</b> text</text></transcript>"#;
        let segments = parse_caption_xml(xml).unwrap();
        assert_eq!(segments[0].text, "Hello bold text");
    }

    #[test]
    fn test_parse_xml_skips_empty_segments() {
        let xml = r#"<transcript><text start="0" dur="1"></text><text start="1" dur="1">hello</text></transcript>"#;
        let segments = parse_caption_xml(xml).unwrap();
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0].text, "hello");
    }

    #[test]
    fn test_parse_xml_invalid() {
        assert!(parse_caption_xml("not xml at all").is_err());
    }
}