use regex::Regex;
use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
/// Extracts the JSON object assigned to a named JavaScript variable that is
/// embedded in an HTML document.
#[derive(Debug, Clone)]
pub struct JsVarParser {
    /// Name of the JavaScript variable whose value should be extracted.
    var_name: String,
}
impl JsVarParser {
    /// Creates a parser for the JavaScript variable called `var_name`.
    pub fn new(var_name: &str) -> Self {
        Self {
            var_name: var_name.to_string(),
        }
    }

    /// Extracts the JSON value assigned to the configured variable in `html`.
    ///
    /// A brace-matching scan is tried first; if it fails for any reason, a set
    /// of regular-expression patterns is tried instead.
    ///
    /// # Errors
    ///
    /// Returns [`CouldNotRetrieveTranscript`] carrying `video_id` and a
    /// `YouTubeDataUnparsable` reason when neither strategy yields valid JSON.
    /// The error reported is the one produced by the regex fallback.
    pub fn parse(
        &self,
        html: &str,
        video_id: &str,
    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
        self.parse_char_by_char(html, video_id)
            .or_else(|_| self.parse_with_regex(html, video_id))
    }

    /// Builds the `YouTubeDataUnparsable` error shared by every failure path.
    fn unparsable_error(video_id: &str, message: String) -> CouldNotRetrieveTranscript {
        CouldNotRetrieveTranscript {
            video_id: video_id.to_string(),
            reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(message)),
        }
    }

    /// Locates the variable in `html` and extracts the brace-balanced object
    /// that follows it, then parses that text as JSON.
    ///
    /// The variable is searched for as `var <name>` first and as the bare
    /// `<name>` second. Braces inside double-quoted strings are ignored, and a
    /// backslash causes the following character to be skipped.
    ///
    /// # Errors
    ///
    /// Fails when the variable is absent, when no `{` follows it, when the
    /// input ends before the braces balance, or when the extracted text is not
    /// valid JSON.
    fn parse_char_by_char(
        &self,
        html: &str,
        video_id: &str,
    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
        let var_marker = format!("var {}", self.var_name);

        // Byte offset just past the variable name. Everything after the first
        // occurrence is kept, so a later repetition of the name cannot cut the
        // object short, and the bare-name fallback is actually used.
        let value_start = html
            .find(var_marker.as_str())
            .map(|pos| pos + var_marker.len())
            .or_else(|| {
                html.find(self.var_name.as_str())
                    .map(|pos| pos + self.var_name.len())
            })
            .ok_or_else(|| {
                Self::unparsable_error(
                    video_id,
                    format!("JavaScript variable '{}' not found in HTML", self.var_name),
                )
            })?;
        let after_var = &html[value_start..];

        let brace_pos = after_var.find('{').ok_or_else(|| {
            Self::unparsable_error(
                video_id,
                format!(
                    "Opening brace not found after JavaScript variable '{}'",
                    self.var_name
                ),
            )
        })?;
        let candidate = &after_var[brace_pos..];

        // `candidate` starts with '{', so depth reaches 1 on the first
        // character and can only return to 0 on the matching '}'.
        let mut depth: usize = 0;
        let mut escaped = false;
        let mut in_quotes = false;
        let mut object_len = None;
        for (idx, c) in candidate.char_indices() {
            if escaped {
                escaped = false;
                continue;
            }
            match c {
                '\\' => escaped = true,
                '"' => in_quotes = !in_quotes,
                '{' if !in_quotes => depth += 1,
                '}' if !in_quotes => {
                    depth -= 1;
                    if depth == 0 {
                        // '}' is one byte wide, so idx + 1 is a char boundary.
                        object_len = Some(idx + 1);
                        break;
                    }
                }
                _ => {}
            }
        }

        let object_len = object_len.ok_or_else(|| {
            Self::unparsable_error(
                video_id,
                "Unexpected end of HTML while parsing JavaScript variable".to_string(),
            )
        })?;

        serde_json::from_str(&candidate[..object_len]).map_err(|_| {
            Self::unparsable_error(
                video_id,
                "Extracted JavaScript variable is not valid JSON".to_string(),
            )
        })
    }

    /// Tries a fixed list of regular expressions against `html` and returns the
    /// first captured group that parses as JSON.
    ///
    /// Patterns are attempted in order, from most to least specific. A pattern
    /// that fails to compile, does not match, or captures text that is not
    /// valid JSON is skipped.
    ///
    /// # Errors
    ///
    /// Fails when no pattern produces valid JSON.
    fn parse_with_regex(
        &self,
        html: &str,
        video_id: &str,
    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
        let name = regex::escape(&self.var_name);
        // Plain spaces are used instead of `\ `: they match the same text, and
        // the escaped form is not accepted by every release of the regex crate.
        let patterns = [
            format!(r"{} = (.*?);</script>", name),
            format!(r"{}=(.*?);</script>", name),
            format!(r"{} = (.*?);", name),
            format!(r"{}=(.*?);", name),
        ];

        patterns
            .iter()
            .find_map(|pattern| {
                let re = Regex::new(pattern).ok()?;
                let caps = re.captures(html)?;
                let body = caps.get(1)?;
                serde_json::from_str::<serde_json::Value>(body.as_str()).ok()
            })
            .ok_or_else(|| {
                Self::unparsable_error(
                    video_id,
                    format!(
                        "Could not find or parse JavaScript variable '{}' using regex patterns",
                        self.var_name
                    ),
                )
            })
    }
}