ydl/
parser.rs

1use crate::error::{YdlError, YdlResult};
2use regex::Regex;
3use url::Url;
4
5/// YouTube URL parser for extracting video IDs from various URL formats
6pub struct YouTubeParser {
7    video_id_regex: Regex,
8    youtube_domains: Vec<&'static str>,
9}
10
11impl Default for YouTubeParser {
12    fn default() -> Self {
13        Self::new()
14    }
15}
16
17impl YouTubeParser {
18    /// Create a new YouTube parser
19    pub fn new() -> Self {
20        // YouTube video ID pattern: 11 characters, alphanumeric plus - and _
21        let video_id_regex = Regex::new(r"^[a-zA-Z0-9_-]{11}$").expect("Valid video ID regex");
22
23        let youtube_domains = vec![
24            "youtube.com",
25            "www.youtube.com",
26            "youtu.be",
27            "m.youtube.com",
28            "youtube-nocookie.com",
29            "www.youtube-nocookie.com",
30        ];
31
32        Self {
33            video_id_regex,
34            youtube_domains,
35        }
36    }
37
38    /// Parse a YouTube URL and extract the video ID
39    pub fn parse_url(&self, url_str: &str) -> YdlResult<String> {
40        // First, try to parse as URL
41        let url = Url::parse(url_str).map_err(|_| YdlError::InvalidUrl {
42            url: url_str.to_string(),
43        })?;
44
45        // Validate it's a YouTube domain
46        self.validate_domain(&url)?;
47
48        // Extract video ID based on URL pattern
49        self.extract_video_id(&url)
50    }
51
52    /// Validate that the URL is from a YouTube domain
53    fn validate_domain(&self, url: &Url) -> YdlResult<()> {
54        let domain = url.domain().ok_or_else(|| YdlError::InvalidUrl {
55            url: url.to_string(),
56        })?;
57
58        if !self.youtube_domains.contains(&domain) {
59            return Err(YdlError::InvalidUrl {
60                url: url.to_string(),
61            });
62        }
63
64        Ok(())
65    }
66
67    /// Extract video ID from various YouTube URL formats
68    fn extract_video_id(&self, url: &Url) -> YdlResult<String> {
69        let domain = url.domain().unwrap();
70
71        match domain {
72            // youtu.be/VIDEO_ID
73            "youtu.be" => {
74                let path = url.path().trim_start_matches('/');
75                // Remove any additional path components
76                let video_id = path.split('/').next().unwrap_or("");
77                self.validate_and_return_video_id(video_id, url)
78            }
79            // youtube.com, www.youtube.com, m.youtube.com, etc.
80            _ => {
81                // Try different patterns
82                if let Ok(id) = self.extract_from_watch_url(url) {
83                    return Ok(id);
84                }
85                if let Ok(id) = self.extract_from_embed_url(url) {
86                    return Ok(id);
87                }
88                if let Ok(id) = self.extract_from_shorts_url(url) {
89                    return Ok(id);
90                }
91
92                Err(YdlError::InvalidUrl {
93                    url: url.to_string(),
94                })
95            }
96        }
97    }
98
99    /// Extract video ID from /watch?v=VIDEO_ID URLs
100    fn extract_from_watch_url(&self, url: &Url) -> YdlResult<String> {
101        if url.path() != "/watch" {
102            return Err(YdlError::InvalidUrl {
103                url: url.to_string(),
104            });
105        }
106
107        let video_id = url
108            .query_pairs()
109            .find(|(key, _)| key == "v")
110            .map(|(_, value)| value.to_string())
111            .ok_or_else(|| YdlError::InvalidUrl {
112                url: url.to_string(),
113            })?;
114
115        self.validate_and_return_video_id(&video_id, url)
116    }
117
118    /// Extract video ID from /embed/VIDEO_ID URLs
119    fn extract_from_embed_url(&self, url: &Url) -> YdlResult<String> {
120        let path_segments: Vec<&str> = url
121            .path_segments()
122            .ok_or_else(|| YdlError::InvalidUrl {
123                url: url.to_string(),
124            })?
125            .collect();
126
127        if path_segments.len() >= 2 && path_segments[0] == "embed" {
128            let video_id = path_segments[1];
129            return self.validate_and_return_video_id(video_id, url);
130        }
131
132        Err(YdlError::InvalidUrl {
133            url: url.to_string(),
134        })
135    }
136
137    /// Extract video ID from /shorts/VIDEO_ID URLs
138    fn extract_from_shorts_url(&self, url: &Url) -> YdlResult<String> {
139        let path_segments: Vec<&str> = url
140            .path_segments()
141            .ok_or_else(|| YdlError::InvalidUrl {
142                url: url.to_string(),
143            })?
144            .collect();
145
146        if path_segments.len() >= 2 && path_segments[0] == "shorts" {
147            let video_id = path_segments[1];
148            return self.validate_and_return_video_id(video_id, url);
149        }
150
151        Err(YdlError::InvalidUrl {
152            url: url.to_string(),
153        })
154    }
155
156    /// Validate video ID format and return if valid
157    fn validate_and_return_video_id(&self, video_id: &str, _url: &Url) -> YdlResult<String> {
158        if self.is_valid_video_id(video_id) {
159            Ok(video_id.to_string())
160        } else {
161            Err(YdlError::InvalidVideoId {
162                video_id: video_id.to_string(),
163            })
164        }
165    }
166
167    /// Validate that a video ID matches YouTube's format requirements
168    pub fn is_valid_video_id(&self, video_id: &str) -> bool {
169        self.video_id_regex.is_match(video_id)
170    }
171
172    /// Normalize a URL to standard YouTube format
173    pub fn normalize_url(&self, url_str: &str) -> YdlResult<String> {
174        let video_id = self.parse_url(url_str)?;
175        Ok(format!("https://www.youtube.com/watch?v={}", video_id))
176    }
177
178    /// Extract video ID directly from string (if it's already a video ID)
179    pub fn extract_video_id_direct(&self, input: &str) -> YdlResult<String> {
180        if self.is_valid_video_id(input) {
181            Ok(input.to_string())
182        } else {
183            self.parse_url(input)
184        }
185    }
186}
187
188/// Convenience function to parse a YouTube URL
189pub fn parse_youtube_url(url: &str) -> YdlResult<String> {
190    YouTubeParser::new().parse_url(url)
191}
192
193/// Convenience function to validate a video ID
194pub fn is_valid_video_id(video_id: &str) -> bool {
195    YouTubeParser::new().is_valid_video_id(video_id)
196}
197
198/// Convenience function to normalize a YouTube URL
199pub fn normalize_youtube_url(url: &str) -> YdlResult<String> {
200    YouTubeParser::new().normalize_url(url)
201}
202
#[cfg(test)]
mod tests {
    use super::*;

    /// Video ID carried by every well-formed fixture URL in this module.
    const VIDEO_ID: &str = "dQw4w9WgXcQ";

    /// Canonical watch URL that normalization is expected to produce.
    const CANONICAL_URL: &str = "https://www.youtube.com/watch?v=dQw4w9WgXcQ";

    fn parser() -> YouTubeParser {
        YouTubeParser::new()
    }

    /// Asserts that each URL in `urls` parses successfully to `VIDEO_ID`.
    fn assert_all_parse(urls: &[&str]) {
        let parser = parser();
        for &url in urls {
            match parser.parse_url(url) {
                Ok(id) => assert_eq!(id, VIDEO_ID),
                Err(_) => panic!("Failed to parse: {}", url),
            }
        }
    }

    #[test]
    fn test_parse_standard_watch_url() {
        // Standard watch URLs
        assert_all_parse(&[
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtube.com/watch?v=dQw4w9WgXcQ",
            "http://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://m.youtube.com/watch?v=dQw4w9WgXcQ",
        ]);
    }

    #[test]
    fn test_parse_short_urls() {
        let parser = parser();

        let urls = [
            "https://youtu.be/dQw4w9WgXcQ",
            "http://youtu.be/dQw4w9WgXcQ",
            "youtu.be/dQw4w9WgXcQ",
        ];

        for &url in &urls {
            let id = match parser.parse_url(url) {
                Ok(id) => id,
                // Handle the case where scheme is missing: retry with an
                // explicit scheme before declaring failure.
                Err(_) => match parser.parse_url(&format!("https://{}", url)) {
                    Ok(id) => id,
                    Err(_) => panic!("Failed to parse: {}", url),
                },
            };
            assert_eq!(id, VIDEO_ID);
        }
    }

    #[test]
    fn test_parse_embed_urls() {
        assert_all_parse(&[
            "https://www.youtube.com/embed/dQw4w9WgXcQ",
            "https://www.youtube-nocookie.com/embed/dQw4w9WgXcQ",
        ]);
    }

    #[test]
    fn test_parse_shorts_urls() {
        let parsed = parser().parse_url("https://www.youtube.com/shorts/dQw4w9WgXcQ");
        assert_eq!(parsed.ok().as_deref(), Some(VIDEO_ID));
    }

    #[test]
    fn test_parse_urls_with_additional_params() {
        assert_all_parse(&[
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s",
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLrCZdFsaG",
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=10s&list=PLrCZdFsaG",
            "https://youtu.be/dQw4w9WgXcQ?t=10s",
        ]);
    }

    #[test]
    fn test_invalid_urls() {
        let parser = parser();

        let invalid_urls = [
            "https://www.google.com/watch?v=dQw4w9WgXcQ", // Wrong domain
            "https://www.youtube.com/watch",              // No video ID
            "https://www.youtube.com/watch?list=PLrCZdFsaG", // No v parameter
            "https://www.youtube.com/user/someuser",      // User page
            "not-a-url-at-all",                           // Invalid URL format
            "",                                           // Empty string
        ];

        for &url in &invalid_urls {
            assert!(parser.parse_url(url).is_err(), "Should fail to parse: {}", url);
        }
    }

    #[test]
    fn test_invalid_video_ids() {
        let parser = parser();

        let invalid_ids = [
            "short",                 // Too short
            "way_too_long_video_id", // Too long
            "invalid-chars!",        // Invalid characters
            "dQw4w9WgXc",            // 10 characters (should be 11)
            "dQw4w9WgXcQQ",          // 12 characters (should be 11)
        ];

        for &id in &invalid_ids {
            assert!(!parser.is_valid_video_id(id), "Should be invalid: {}", id);
        }
    }

    #[test]
    fn test_valid_video_ids() {
        let parser = parser();

        for &id in &["dQw4w9WgXcQ", "aBc_123-XyZ", "0123456789a", "_-_-_-_-_-_"] {
            assert!(parser.is_valid_video_id(id), "Should be valid: {}", id);
        }
    }

    #[test]
    fn test_normalize_url() {
        let parser = parser();

        // Each differently shaped input must normalize to the canonical URL.
        let inputs = [
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/embed/dQw4w9WgXcQ",
            "https://m.youtube.com/watch?v=dQw4w9WgXcQ&t=10s",
        ];

        for &input in &inputs {
            match parser.normalize_url(input) {
                Ok(normalized) => assert_eq!(normalized, CANONICAL_URL),
                Err(_) => panic!("Failed to normalize: {}", input),
            }
        }
    }

    #[test]
    fn test_extract_video_id_direct() {
        let parser = parser();

        // Test with direct video ID
        let from_id = parser.extract_video_id_direct("dQw4w9WgXcQ");
        assert_eq!(from_id.ok().as_deref(), Some(VIDEO_ID));

        // Test with URL
        let from_url = parser.extract_video_id_direct("https://youtu.be/dQw4w9WgXcQ");
        assert_eq!(from_url.ok().as_deref(), Some(VIDEO_ID));

        // Test with invalid input
        assert!(parser.extract_video_id_direct("invalid").is_err());
    }

    #[test]
    fn test_convenience_functions() {
        // Test parse_youtube_url function
        let parsed = parse_youtube_url("https://youtu.be/dQw4w9WgXcQ");
        assert_eq!(parsed.ok().as_deref(), Some(VIDEO_ID));

        // Test is_valid_video_id function
        assert!(is_valid_video_id("dQw4w9WgXcQ"));
        assert!(!is_valid_video_id("invalid"));

        // Test normalize_youtube_url function
        let normalized = normalize_youtube_url("https://youtu.be/dQw4w9WgXcQ");
        assert_eq!(normalized.ok().as_deref(), Some(CANONICAL_URL));
    }
}