halldyll_core/parse/
videos.rs

1//! Videos - Video extraction
2
3use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::types::assets::{VideoAsset, VideoProvider, VideoType};
8
/// Stateless extractor that collects video assets from an HTML document.
///
/// The type holds no data, so it is free to construct, copy and compare.
/// `Default` is derived rather than hand-written; both produce the same
/// unit value.
#[derive(Debug, Clone, Copy, Default)]
pub struct VideoExtractor;
17
18impl VideoExtractor {
19    /// New extractor
20    pub fn new() -> Self {
21        Self
22    }
23
24    /// Extract all videos
25    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<VideoAsset> {
26        let document = Html::parse_document(html);
27        let mut videos = Vec::new();
28
29        // 1. <video><source> tags
30        videos.extend(self.extract_video_tags(&document, base_url));
31
32        // 2. Iframes (YouTube, Vimeo, etc.)
33        videos.extend(self.extract_iframes(&document, base_url));
34
35        // Deduplicate
36        videos.dedup_by(|a, b| a.urls == b.urls);
37
38        videos
39    }
40
41    /// Extract <video> tags
42    fn extract_video_tags(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
43        let video_selector = Selector::parse("video").unwrap();
44        let source_selector = Selector::parse("source").unwrap();
45        let mut videos = Vec::new();
46
47        for video in document.select(&video_selector) {
48            let attrs = video.value();
49            let mut urls = Vec::new();
50
51            // Direct src attribute
52            if let Some(src) = attrs.attr("src") {
53                if let Ok(url) = base_url.join(src) {
54                    urls.push(url);
55                }
56            }
57
58            // Child sources
59            for source in video.select(&source_selector) {
60                if let Some(src) = source.value().attr("src") {
61                    if let Ok(url) = base_url.join(src) {
62                        urls.push(url);
63                    }
64                }
65            }
66
67            if urls.is_empty() {
68                continue;
69            }
70
71            // Poster
72            let poster = attrs.attr("poster").and_then(|p| base_url.join(p).ok());
73
74            // Type
75            let video_type = self.detect_video_type(&urls);
76
77            // Dimensions
78            let width = attrs.attr("width").and_then(|w| w.parse().ok());
79            let height = attrs.attr("height").and_then(|h| h.parse().ok());
80
81            videos.push(VideoAsset {
82                urls,
83                poster,
84                video_type,
85                duration: None,
86                width,
87                height,
88                provider: None,
89                provider_id: None,
90                iframe_src: None,
91            });
92        }
93
94        videos
95    }
96
97    /// Extract video iframes
98    fn extract_iframes(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
99        let iframe_selector = Selector::parse("iframe").unwrap();
100        let mut videos = Vec::new();
101
102        for iframe in document.select(&iframe_selector) {
103            let src = match iframe.value().attr("src").or_else(|| iframe.value().attr("data-src")) {
104                Some(s) => s,
105                None => continue,
106            };
107
108            // Resolve URL
109            let iframe_url = match Url::parse(src).or_else(|_| base_url.join(src)) {
110                Ok(u) => u,
111                Err(_) => continue,
112            };
113
114            // Detect provider
115            let (provider, provider_id) = self.detect_provider(&iframe_url);
116            if provider.is_none() {
117                continue; // Not a known video iframe
118            }
119
120            // Dimensions
121            let width = iframe.value().attr("width").and_then(|w| w.parse().ok());
122            let height = iframe.value().attr("height").and_then(|h| h.parse().ok());
123
124            videos.push(VideoAsset {
125                urls: vec![iframe_url.clone()],
126                poster: None,
127                video_type: VideoType::Embed,
128                duration: None,
129                width,
130                height,
131                provider,
132                provider_id,
133                iframe_src: Some(iframe_url),
134            });
135        }
136
137        videos
138    }
139
140    /// Detect video type
141    fn detect_video_type(&self, urls: &[Url]) -> VideoType {
142        for url in urls {
143            let path = url.path().to_lowercase();
144            if path.ends_with(".mp4") {
145                return VideoType::Mp4;
146            }
147            if path.ends_with(".webm") {
148                return VideoType::WebM;
149            }
150            if path.ends_with(".ogg") || path.ends_with(".ogv") {
151                return VideoType::Ogg;
152            }
153            if path.ends_with(".m3u8") {
154                return VideoType::Hls;
155            }
156            if path.ends_with(".mpd") {
157                return VideoType::Dash;
158            }
159        }
160        VideoType::Unknown
161    }
162
163    /// Detect video provider and ID
164    fn detect_provider(&self, url: &Url) -> (Option<VideoProvider>, Option<String>) {
165        let host = url.host_str().unwrap_or("");
166        let path = url.path();
167
168        // YouTube
169        if host.contains("youtube.com") || host.contains("youtube-nocookie.com") {
170            let video_id = self.extract_youtube_id(url);
171            return (Some(VideoProvider::YouTube), video_id);
172        }
173        if host.contains("youtu.be") {
174            let video_id = path.trim_start_matches('/').split('/').next().map(String::from);
175            return (Some(VideoProvider::YouTube), video_id);
176        }
177
178        // Vimeo
179        if host.contains("vimeo.com") || host.contains("player.vimeo.com") {
180            let video_id = self.extract_vimeo_id(url);
181            return (Some(VideoProvider::Vimeo), video_id);
182        }
183
184        // Dailymotion
185        if host.contains("dailymotion.com") || host.contains("dai.ly") {
186            let video_id = self.extract_dailymotion_id(url);
187            return (Some(VideoProvider::Dailymotion), video_id);
188        }
189
190        // Twitch
191        if host.contains("twitch.tv") || host.contains("player.twitch.tv") {
192            return (Some(VideoProvider::Twitch), None);
193        }
194
195        // Facebook
196        if host.contains("facebook.com") && path.contains("video") {
197            return (Some(VideoProvider::Facebook), None);
198        }
199
200        // Twitter/X
201        if host.contains("twitter.com") || host.contains("x.com") {
202            return (Some(VideoProvider::Twitter), None);
203        }
204
205        // TikTok
206        if host.contains("tiktok.com") {
207            return (Some(VideoProvider::TikTok), None);
208        }
209
210        // Wistia
211        if host.contains("wistia.com") || host.contains("wistia.net") {
212            return (Some(VideoProvider::Wistia), None);
213        }
214
215        (None, None)
216    }
217
218    /// Extrait l'ID YouTube
219    fn extract_youtube_id(&self, url: &Url) -> Option<String> {
220        // /embed/VIDEO_ID
221        if url.path().starts_with("/embed/") {
222            return url.path().strip_prefix("/embed/").map(|s| s.split('/').next().unwrap_or(s).to_string());
223        }
224        // ?v=VIDEO_ID
225        url.query_pairs().find(|(k, _)| k == "v").map(|(_, v)| v.to_string())
226    }
227
228    /// Extrait l'ID Vimeo
229    fn extract_vimeo_id(&self, url: &Url) -> Option<String> {
230        let re = Regex::new(r"/(\d+)").ok()?;
231        re.captures(url.path())
232            .and_then(|c| c.get(1))
233            .map(|m| m.as_str().to_string())
234    }
235
236    /// Extrait l'ID Dailymotion
237    fn extract_dailymotion_id(&self, url: &Url) -> Option<String> {
238        let re = Regex::new(r"/video/([a-zA-Z0-9]+)").ok()?;
239        re.captures(url.path())
240            .and_then(|c| c.get(1))
241            .map(|m| m.as_str().to_string())
242    }
243}
244
245/// Extrait les URLs de manifest HLS/DASH d'une page
246pub fn extract_manifest_urls(html: &str, base_url: &Url) -> Vec<(Url, VideoType)> {
247    let mut manifests = Vec::new();
248    
249    // Regex pour .m3u8 et .mpd
250    let m3u8_re = Regex::new(r#"["']([^"']*\.m3u8[^"']*)["']"#).unwrap();
251    let mpd_re = Regex::new(r#"["']([^"']*\.mpd[^"']*)["']"#).unwrap();
252
253    for cap in m3u8_re.captures_iter(html) {
254        if let Some(m) = cap.get(1) {
255            if let Ok(url) = base_url.join(m.as_str()) {
256                manifests.push((url, VideoType::Hls));
257            }
258        }
259    }
260
261    for cap in mpd_re.captures_iter(html) {
262        if let Some(m) = cap.get(1) {
263            if let Ok(url) = base_url.join(m.as_str()) {
264                manifests.push((url, VideoType::Dash));
265            }
266        }
267    }
268
269    manifests
270}