yt_transcript_rs/js_var_parser.rs
//! JavaScript variable parsing from HTML content.
//!
//! This module provides functionality to extract and parse JavaScript variables
//! embedded in HTML content, which is essential for extracting transcript data
//! from YouTube pages. YouTube stores various metadata and configuration options
//! in JavaScript variables within the page source.
//!
//! The parser supports multiple extraction strategies:
//! 1. Character-by-character parsing (primary method, more robust)
//! 2. Regular expression fallback (used when the primary method fails)
//!
//! This module is primarily used internally to extract transcript metadata from
//! the YouTube video page.
14use regex::Regex;
15
16use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
17
/// Parser for extracting JavaScript variables from HTML content.
///
/// This parser is designed to extract and parse JavaScript object literals
/// assigned to variables in HTML source code, specifically targeting YouTube's
/// page structure. It handles nested objects, escaping, and proper JSON parsing.
///
/// The type derives `Debug` and `Clone` so a configured parser can be logged
/// and duplicated by callers.
///
/// # Features
///
/// * Extracts JavaScript variables by name from HTML content
/// * Handles nested objects with proper brace matching
/// * Supports both character-by-character parsing and regex fallbacks
/// * Converts extracted JavaScript objects to Rust values via serde_json
///
/// # Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::js_var_parser::JsVarParser;
/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Create a parser for the "ytInitialPlayerResponse" variable
/// let parser = JsVarParser::new("ytInitialPlayerResponse");
///
/// // HTML content containing the JavaScript variable
/// let html = r#"
///   <script>
///     var ytInitialPlayerResponse = {"captions": {"playerCaptionsTracklistRenderer":
///       {"captionTracks": [{"baseUrl": "https://example.com", "name": {"simpleText": "English"}}]}}};
///   </script>
/// "#;
///
/// // Parse the variable
/// let json = parser.parse(html, "dQw4w9WgXcQ")?;
///
/// // Access extracted data
/// if let Some(captions) = json.get("captions") {
///     println!("Found captions data: {}", captions);
/// }
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct JsVarParser {
    /// The name of the JavaScript variable to extract
    var_name: String,
}
61
62impl JsVarParser {
63 /// Creates a new JavaScript variable parser for the specified variable name.
64 ///
65 /// # Parameters
66 ///
67 /// * `var_name` - The name of the JavaScript variable to extract (e.g., "ytInitialPlayerResponse")
68 ///
69 /// # Returns
70 ///
71 /// A new `JsVarParser` instance configured to extract the specified variable.
72 ///
73 /// # Example
74 ///
75 /// ```rust
76 /// # use yt_transcript_rs::js_var_parser::JsVarParser;
77 /// // Create a parser for YouTube's initial player response
78 /// let player_response_parser = JsVarParser::new("ytInitialPlayerResponse");
79 ///
80 /// // Create a parser for YouTube's initial data
81 /// let initial_data_parser = JsVarParser::new("ytInitialData");
82 /// ```
83 pub fn new(var_name: &str) -> Self {
84 Self {
85 var_name: var_name.to_string(),
86 }
87 }
88
89 /// Parses a JavaScript variable from HTML content and converts it to a JSON value.
90 ///
91 /// This method tries multiple parsing strategies:
92 /// 1. First, it attempts a character-by-character approach for precise extraction
93 /// 2. If that fails, it falls back to regular expression patterns
94 ///
95 /// # Parameters
96 ///
97 /// * `html` - The HTML content containing the JavaScript variable
98 /// * `video_id` - The YouTube video ID (used for error reporting)
99 ///
100 /// # Returns
101 ///
102 /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
103 ///
104 /// # Errors
105 ///
106 /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
107 /// - The variable is not found in the HTML
108 /// - The variable value cannot be parsed as valid JSON
109 /// - The braces in the JavaScript object are mismatched
110 ///
111 /// # Example
112 ///
113 /// ```rust,no_run
114 /// # use yt_transcript_rs::js_var_parser::JsVarParser;
115 /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
116 /// let parser = JsVarParser::new("ytInitialPlayerResponse");
117 /// let html = r#"<script>var ytInitialPlayerResponse = {"captions": {"available": true}};</script>"#;
118 ///
119 /// match parser.parse(html, "dQw4w9WgXcQ") {
120 /// Ok(json) => {
121 /// println!("Successfully extracted variable: {}", json);
122 ///
123 /// // Access nested properties
124 /// if let Some(captions) = json.get("captions") {
125 /// if let Some(available) = captions.get("available") {
126 /// println!("Captions available: {}", available);
127 /// }
128 /// }
129 /// },
130 /// Err(e) => {
131 /// println!("Failed to parse: {:?}", e.reason);
132 /// }
133 /// }
134 /// # Ok(())
135 /// # }
136 /// ```
137 pub fn parse(
138 &self,
139 html: &str,
140 video_id: &str,
141 ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
142 // First try to find the variable using a character-by-character approach
143 // (similar to the Python implementation)
144 if let Ok(json_value) = self.parse_char_by_char(html, video_id) {
145 return Ok(json_value);
146 }
147
148 // Fall back to regex as a backup strategy
149 self.parse_with_regex(html, video_id)
150 }
151
152 /// Parses a JavaScript variable using a character-by-character approach.
153 ///
154 /// This method mimics the character-by-character approach used in the Python
155 /// implementation. It carefully tracks braces, quotes, and escape sequences
156 /// to extract nested JavaScript objects correctly.
157 ///
158 /// The approach:
159 /// 1. Finds the variable name in the HTML
160 /// 2. Locates the opening brace of the object
161 /// 3. Tracks nested braces to find the matching closing brace
162 /// 4. Handles string literals and escape sequences properly
163 /// 5. Parses the extracted object as JSON
164 ///
165 /// # Parameters
166 ///
167 /// * `html` - The HTML content containing the JavaScript variable
168 /// * `video_id` - The YouTube video ID (used for error reporting)
169 ///
170 /// # Returns
171 ///
172 /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
173 ///
174 /// # Errors
175 ///
176 /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
177 /// - The variable name is not found in the HTML
178 /// - No opening brace is found after the variable name
179 /// - The HTML ends before finding a matching closing brace
180 /// - The extracted text is not valid JSON
181 ///
182 /// # Note
183 ///
184 /// This is an internal method used by `parse()` and typically should not
185 /// be called directly.
186 fn parse_char_by_char(
187 &self,
188 html: &str,
189 video_id: &str,
190 ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
191 // Step 1: Split by "var {var_name}"
192 let var_marker = format!("var {}", self.var_name);
193 let parts: Vec<&str> = html.split(&var_marker).collect();
194
195 if parts.len() <= 1 {
196 // Try with just the var name (without 'var' prefix)
197 let parts: Vec<&str> = html.split(&self.var_name).collect();
198 if parts.len() <= 1 {
199 return Err(CouldNotRetrieveTranscript {
200 video_id: video_id.to_string(),
201 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
202 format!("JavaScript variable '{}' not found in HTML", self.var_name),
203 )),
204 });
205 }
206 }
207
208 // Take the part after the variable name
209 let after_var = if parts.len() > 1 { parts[1] } else { "" };
210
211 // Step 2: Create iterator over the characters after the variable name
212 let mut chars = after_var.chars();
213
214 // Step 3: Find the opening brace
215 loop {
216 match chars.next() {
217 Some('{') => break,
218 Some(_) => continue,
219 None => {
220 return Err(CouldNotRetrieveTranscript {
221 video_id: video_id.to_string(),
222 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
223 format!(
224 "Opening brace not found after JavaScript variable '{}'",
225 self.var_name
226 ),
227 )),
228 });
229 }
230 }
231 }
232
233 // Step 4: Find the matching closing brace
234 let mut json_chars = vec!['{'];
235 let mut depth = 1;
236 let mut escaped = false;
237 let mut in_quotes = false;
238
239 while depth > 0 {
240 match chars.next() {
241 Some(c) => {
242 json_chars.push(c);
243
244 if escaped {
245 escaped = false;
246 } else if c == '\\' {
247 escaped = true;
248 } else if c == '"' {
249 in_quotes = !in_quotes;
250 } else if !in_quotes {
251 if c == '{' {
252 depth += 1;
253 } else if c == '}' {
254 depth -= 1;
255 }
256 }
257 }
258 None => {
259 // Unexpected end of string
260 return Err(CouldNotRetrieveTranscript {
261 video_id: video_id.to_string(),
262 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
263 "Unexpected end of HTML while parsing JavaScript variable".to_string(),
264 )),
265 });
266 }
267 }
268 }
269
270 // Step 5: Parse the extracted JSON string
271 let json_str: String = json_chars.into_iter().collect();
272
273 match serde_json::from_str(&json_str) {
274 Ok(json) => Ok(json),
275 Err(_) => Err(CouldNotRetrieveTranscript {
276 video_id: video_id.to_string(),
277 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
278 "Extracted JavaScript variable is not valid JSON".to_string(),
279 )),
280 }),
281 }
282 }
283
284 /// Parses a JavaScript variable using regular expressions as a fallback method.
285 ///
286 /// This method tries multiple regex patterns to extract the variable value when
287 /// the character-by-character approach fails. It's less precise but can handle
288 /// some edge cases.
289 ///
290 /// # Parameters
291 ///
292 /// * `html` - The HTML content containing the JavaScript variable
293 /// * `video_id` - The YouTube video ID (used for error reporting)
294 ///
295 /// # Returns
296 ///
297 /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
298 ///
299 /// # Errors
300 ///
301 /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
302 /// - None of the regex patterns match the HTML content
303 /// - The matched content cannot be parsed as valid JSON
304 ///
305 /// # Note
306 ///
307 /// This is an internal method used by `parse()` as a fallback when the primary
308 /// parsing method fails. It typically should not be called directly.
309 fn parse_with_regex(
310 &self,
311 html: &str,
312 video_id: &str,
313 ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
314 // Common patterns for finding JavaScript variables
315 let patterns = [
316 format!(r"{}\ =\ (.*?);</script>", regex::escape(&self.var_name)),
317 format!(r"{}=(.*?);</script>", regex::escape(&self.var_name)),
318 format!(r#"{} = (.*?);"#, regex::escape(&self.var_name)),
319 format!(r#"{}=(.*?);"#, regex::escape(&self.var_name)),
320 ];
321
322 for pattern in &patterns {
323 let re = match Regex::new(pattern) {
324 Ok(re) => re,
325 Err(_) => continue,
326 };
327
328 if let Some(cap) = re.captures(html) {
329 if let Some(json_str) = cap.get(1) {
330 match serde_json::from_str(json_str.as_str()) {
331 Ok(json) => return Ok(json),
332 Err(_) => continue,
333 }
334 }
335 }
336 }
337
338 // If we get here, we couldn't find or parse the variable
339 Err(CouldNotRetrieveTranscript {
340 video_id: video_id.to_string(),
341 reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
342 format!(
343 "Could not find or parse JavaScript variable '{}' using regex patterns",
344 self.var_name
345 ),
346 )),
347 })
348 }
349}