yt_transcript_rs/
js_var_parser.rs

//! JavaScript variable parsing from HTML content.
//!
//! This module provides functionality to extract and parse JavaScript variables
//! embedded in HTML content, which is essential for extracting transcript data
//! from YouTube pages. YouTube stores various metadata and configuration options
//! in JavaScript variables within the page source.
//!
//! The parser supports two extraction strategies:
//! 1. Character-by-character parsing (primary method, more robust)
//! 2. Regular expression fallback (used when the primary method fails)
//!
//! This module is primarily used internally to extract transcript metadata from
//! the YouTube video page.
14use regex::Regex;
15
16use crate::errors::{CouldNotRetrieveTranscript, CouldNotRetrieveTranscriptReason};
17
/// Extracts a JavaScript object literal assigned to a named variable inside
/// HTML source and turns it into a `serde_json::Value`.
///
/// The parser is aimed at the inline `<script>` blocks of YouTube pages, where
/// page metadata is stored in variables such as `ytInitialPlayerResponse`.
///
/// # Features
///
/// * Looks a variable up by name in raw HTML text
/// * Matches nested braces while skipping over string literals and escapes
/// * Falls back to regular-expression patterns when brace matching fails
/// * Decodes the extracted text with `serde_json`
///
/// # Example
///
/// ```rust,no_run
/// # use yt_transcript_rs::js_var_parser::JsVarParser;
/// # fn example() -> Result<(), Box<dyn std::error::Error>> {
/// // Create a parser for the "ytInitialPlayerResponse" variable
/// let parser = JsVarParser::new("ytInitialPlayerResponse");
///
/// // HTML content containing the JavaScript variable
/// let html = r#"
///   <script>
///     var ytInitialPlayerResponse = {"captions": {"playerCaptionsTracklistRenderer":
///       {"captionTracks": [{"baseUrl": "https://example.com", "name": {"simpleText": "English"}}]}}};
///   </script>
/// "#;
///
/// // Parse the variable
/// let json = parser.parse(html, "dQw4w9WgXcQ")?;
///
/// // Access extracted data
/// if let Some(captions) = json.get("captions") {
///     println!("Found captions data: {}", captions);
/// }
/// # Ok(())
/// # }
/// ```
// `Debug` and `Clone` are derived so the parser can be logged with `{:?}`,
// stored inside other `Debug`/`Clone` types, and duplicated cheaply.
#[derive(Debug, Clone)]
pub struct JsVarParser {
    /// Name of the JavaScript variable whose value should be extracted.
    var_name: String,
}
61
62impl JsVarParser {
63    /// Creates a new JavaScript variable parser for the specified variable name.
64    ///
65    /// # Parameters
66    ///
67    /// * `var_name` - The name of the JavaScript variable to extract (e.g., "ytInitialPlayerResponse")
68    ///
69    /// # Returns
70    ///
71    /// A new `JsVarParser` instance configured to extract the specified variable.
72    ///
73    /// # Example
74    ///
75    /// ```rust
76    /// # use yt_transcript_rs::js_var_parser::JsVarParser;
77    /// // Create a parser for YouTube's initial player response
78    /// let player_response_parser = JsVarParser::new("ytInitialPlayerResponse");
79    ///
80    /// // Create a parser for YouTube's initial data
81    /// let initial_data_parser = JsVarParser::new("ytInitialData");
82    /// ```
83    pub fn new(var_name: &str) -> Self {
84        Self {
85            var_name: var_name.to_string(),
86        }
87    }
88
89    /// Parses a JavaScript variable from HTML content and converts it to a JSON value.
90    ///
91    /// This method tries multiple parsing strategies:
92    /// 1. First, it attempts a character-by-character approach for precise extraction
93    /// 2. If that fails, it falls back to regular expression patterns
94    ///
95    /// # Parameters
96    ///
97    /// * `html` - The HTML content containing the JavaScript variable
98    /// * `video_id` - The YouTube video ID (used for error reporting)
99    ///
100    /// # Returns
101    ///
102    /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
103    ///
104    /// # Errors
105    ///
106    /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
107    /// - The variable is not found in the HTML
108    /// - The variable value cannot be parsed as valid JSON
109    /// - The braces in the JavaScript object are mismatched
110    ///
111    /// # Example
112    ///
113    /// ```rust,no_run
114    /// # use yt_transcript_rs::js_var_parser::JsVarParser;
115    /// # fn example() -> Result<(), Box<dyn std::error::Error>> {
116    /// let parser = JsVarParser::new("ytInitialPlayerResponse");
117    /// let html = r#"<script>var ytInitialPlayerResponse = {"captions": {"available": true}};</script>"#;
118    ///
119    /// match parser.parse(html, "dQw4w9WgXcQ") {
120    ///     Ok(json) => {
121    ///         println!("Successfully extracted variable: {}", json);
122    ///         
123    ///         // Access nested properties
124    ///         if let Some(captions) = json.get("captions") {
125    ///             if let Some(available) = captions.get("available") {
126    ///                 println!("Captions available: {}", available);
127    ///             }
128    ///         }
129    ///     },
130    ///     Err(e) => {
131    ///         println!("Failed to parse: {:?}", e.reason);
132    ///     }
133    /// }
134    /// # Ok(())
135    /// # }
136    /// ```
137    pub fn parse(
138        &self,
139        html: &str,
140        video_id: &str,
141    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
142        // First try to find the variable using a character-by-character approach
143        // (similar to the Python implementation)
144        if let Ok(json_value) = self.parse_char_by_char(html, video_id) {
145            return Ok(json_value);
146        }
147
148        // Fall back to regex as a backup strategy
149        self.parse_with_regex(html, video_id)
150    }
151
152    /// Parses a JavaScript variable using a character-by-character approach.
153    ///
154    /// This method mimics the character-by-character approach used in the Python
155    /// implementation. It carefully tracks braces, quotes, and escape sequences
156    /// to extract nested JavaScript objects correctly.
157    ///
158    /// The approach:
159    /// 1. Finds the variable name in the HTML
160    /// 2. Locates the opening brace of the object
161    /// 3. Tracks nested braces to find the matching closing brace
162    /// 4. Handles string literals and escape sequences properly
163    /// 5. Parses the extracted object as JSON
164    ///
165    /// # Parameters
166    ///
167    /// * `html` - The HTML content containing the JavaScript variable
168    /// * `video_id` - The YouTube video ID (used for error reporting)
169    ///
170    /// # Returns
171    ///
172    /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
173    ///
174    /// # Errors
175    ///
176    /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
177    /// - The variable name is not found in the HTML
178    /// - No opening brace is found after the variable name
179    /// - The HTML ends before finding a matching closing brace
180    /// - The extracted text is not valid JSON
181    ///
182    /// # Note
183    ///
184    /// This is an internal method used by `parse()` and typically should not
185    /// be called directly.
186    fn parse_char_by_char(
187        &self,
188        html: &str,
189        video_id: &str,
190    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
191        // Step 1: Split by "var {var_name}"
192        let var_marker = format!("var {}", self.var_name);
193        let parts: Vec<&str> = html.split(&var_marker).collect();
194
195        if parts.len() <= 1 {
196            // Try with just the var name (without 'var' prefix)
197            let parts: Vec<&str> = html.split(&self.var_name).collect();
198            if parts.len() <= 1 {
199                return Err(CouldNotRetrieveTranscript {
200                    video_id: video_id.to_string(),
201                    reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
202                        format!("JavaScript variable '{}' not found in HTML", self.var_name),
203                    )),
204                });
205            }
206        }
207
208        // Take the part after the variable name
209        let after_var = if parts.len() > 1 { parts[1] } else { "" };
210
211        // Step 2: Create iterator over the characters after the variable name
212        let mut chars = after_var.chars();
213
214        // Step 3: Find the opening brace
215        loop {
216            match chars.next() {
217                Some('{') => break,
218                Some(_) => continue,
219                None => {
220                    return Err(CouldNotRetrieveTranscript {
221                        video_id: video_id.to_string(),
222                        reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
223                            format!(
224                                "Opening brace not found after JavaScript variable '{}'",
225                                self.var_name
226                            ),
227                        )),
228                    });
229                }
230            }
231        }
232
233        // Step 4: Find the matching closing brace
234        let mut json_chars = vec!['{'];
235        let mut depth = 1;
236        let mut escaped = false;
237        let mut in_quotes = false;
238
239        while depth > 0 {
240            match chars.next() {
241                Some(c) => {
242                    json_chars.push(c);
243
244                    if escaped {
245                        escaped = false;
246                    } else if c == '\\' {
247                        escaped = true;
248                    } else if c == '"' {
249                        in_quotes = !in_quotes;
250                    } else if !in_quotes {
251                        if c == '{' {
252                            depth += 1;
253                        } else if c == '}' {
254                            depth -= 1;
255                        }
256                    }
257                }
258                None => {
259                    // Unexpected end of string
260                    return Err(CouldNotRetrieveTranscript {
261                        video_id: video_id.to_string(),
262                        reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
263                            "Unexpected end of HTML while parsing JavaScript variable".to_string(),
264                        )),
265                    });
266                }
267            }
268        }
269
270        // Step 5: Parse the extracted JSON string
271        let json_str: String = json_chars.into_iter().collect();
272
273        match serde_json::from_str(&json_str) {
274            Ok(json) => Ok(json),
275            Err(_) => Err(CouldNotRetrieveTranscript {
276                video_id: video_id.to_string(),
277                reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
278                    "Extracted JavaScript variable is not valid JSON".to_string(),
279                )),
280            }),
281        }
282    }
283
284    /// Parses a JavaScript variable using regular expressions as a fallback method.
285    ///
286    /// This method tries multiple regex patterns to extract the variable value when
287    /// the character-by-character approach fails. It's less precise but can handle
288    /// some edge cases.
289    ///
290    /// # Parameters
291    ///
292    /// * `html` - The HTML content containing the JavaScript variable
293    /// * `video_id` - The YouTube video ID (used for error reporting)
294    ///
295    /// # Returns
296    ///
297    /// * `Result<serde_json::Value, CouldNotRetrieveTranscript>` - The parsed JSON value or an error
298    ///
299    /// # Errors
300    ///
301    /// Returns a `CouldNotRetrieveTranscript` error with `YouTubeDataUnparsable` reason when:
302    /// - None of the regex patterns match the HTML content
303    /// - The matched content cannot be parsed as valid JSON
304    ///
305    /// # Note
306    ///
307    /// This is an internal method used by `parse()` as a fallback when the primary
308    /// parsing method fails. It typically should not be called directly.
309    fn parse_with_regex(
310        &self,
311        html: &str,
312        video_id: &str,
313    ) -> Result<serde_json::Value, CouldNotRetrieveTranscript> {
314        // Common patterns for finding JavaScript variables
315        let patterns = [
316            format!(r"{}\ =\ (.*?);</script>", regex::escape(&self.var_name)),
317            format!(r"{}=(.*?);</script>", regex::escape(&self.var_name)),
318            format!(r#"{} = (.*?);"#, regex::escape(&self.var_name)),
319            format!(r#"{}=(.*?);"#, regex::escape(&self.var_name)),
320        ];
321
322        for pattern in &patterns {
323            let re = match Regex::new(pattern) {
324                Ok(re) => re,
325                Err(_) => continue,
326            };
327
328            if let Some(cap) = re.captures(html) {
329                if let Some(json_str) = cap.get(1) {
330                    match serde_json::from_str(json_str.as_str()) {
331                        Ok(json) => return Ok(json),
332                        Err(_) => continue,
333                    }
334                }
335            }
336        }
337
338        // If we get here, we couldn't find or parse the variable
339        Err(CouldNotRetrieveTranscript {
340            video_id: video_id.to_string(),
341            reason: Some(CouldNotRetrieveTranscriptReason::YouTubeDataUnparsable(
342                format!(
343                    "Could not find or parse JavaScript variable '{}' using regex patterns",
344                    self.var_name
345                ),
346            )),
347        })
348    }
349}