halldyll_core/parse/
videos.rs1use regex::Regex;
4use scraper::{Html, Selector};
5use url::Url;
6
7use crate::types::assets::{VideoAsset, VideoProvider, VideoType};
8
/// Stateless extractor that collects video assets from an HTML document.
///
/// The type carries no data, so the unit value is its only state. `Default`
/// is derived instead of hand-written, and `Debug`/`Clone`/`Copy` are derived
/// so the extractor can be logged and passed around freely.
#[derive(Debug, Clone, Copy, Default)]
pub struct VideoExtractor;
17
18impl VideoExtractor {
19 pub fn new() -> Self {
21 Self
22 }
23
24 pub fn extract(&self, html: &str, base_url: &Url) -> Vec<VideoAsset> {
26 let document = Html::parse_document(html);
27 let mut videos = Vec::new();
28
29 videos.extend(self.extract_video_tags(&document, base_url));
31
32 videos.extend(self.extract_iframes(&document, base_url));
34
35 videos.dedup_by(|a, b| a.urls == b.urls);
37
38 videos
39 }
40
41 fn extract_video_tags(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
43 let video_selector = Selector::parse("video").unwrap();
44 let source_selector = Selector::parse("source").unwrap();
45 let mut videos = Vec::new();
46
47 for video in document.select(&video_selector) {
48 let attrs = video.value();
49 let mut urls = Vec::new();
50
51 if let Some(src) = attrs.attr("src") {
53 if let Ok(url) = base_url.join(src) {
54 urls.push(url);
55 }
56 }
57
58 for source in video.select(&source_selector) {
60 if let Some(src) = source.value().attr("src") {
61 if let Ok(url) = base_url.join(src) {
62 urls.push(url);
63 }
64 }
65 }
66
67 if urls.is_empty() {
68 continue;
69 }
70
71 let poster = attrs.attr("poster").and_then(|p| base_url.join(p).ok());
73
74 let video_type = self.detect_video_type(&urls);
76
77 let width = attrs.attr("width").and_then(|w| w.parse().ok());
79 let height = attrs.attr("height").and_then(|h| h.parse().ok());
80
81 videos.push(VideoAsset {
82 urls,
83 poster,
84 video_type,
85 duration: None,
86 width,
87 height,
88 provider: None,
89 provider_id: None,
90 iframe_src: None,
91 });
92 }
93
94 videos
95 }
96
97 fn extract_iframes(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
99 let iframe_selector = Selector::parse("iframe").unwrap();
100 let mut videos = Vec::new();
101
102 for iframe in document.select(&iframe_selector) {
103 let src = match iframe.value().attr("src").or_else(|| iframe.value().attr("data-src")) {
104 Some(s) => s,
105 None => continue,
106 };
107
108 let iframe_url = match Url::parse(src).or_else(|_| base_url.join(src)) {
110 Ok(u) => u,
111 Err(_) => continue,
112 };
113
114 let (provider, provider_id) = self.detect_provider(&iframe_url);
116 if provider.is_none() {
117 continue; }
119
120 let width = iframe.value().attr("width").and_then(|w| w.parse().ok());
122 let height = iframe.value().attr("height").and_then(|h| h.parse().ok());
123
124 videos.push(VideoAsset {
125 urls: vec![iframe_url.clone()],
126 poster: None,
127 video_type: VideoType::Embed,
128 duration: None,
129 width,
130 height,
131 provider,
132 provider_id,
133 iframe_src: Some(iframe_url),
134 });
135 }
136
137 videos
138 }
139
140 fn detect_video_type(&self, urls: &[Url]) -> VideoType {
142 for url in urls {
143 let path = url.path().to_lowercase();
144 if path.ends_with(".mp4") {
145 return VideoType::Mp4;
146 }
147 if path.ends_with(".webm") {
148 return VideoType::WebM;
149 }
150 if path.ends_with(".ogg") || path.ends_with(".ogv") {
151 return VideoType::Ogg;
152 }
153 if path.ends_with(".m3u8") {
154 return VideoType::Hls;
155 }
156 if path.ends_with(".mpd") {
157 return VideoType::Dash;
158 }
159 }
160 VideoType::Unknown
161 }
162
163 fn detect_provider(&self, url: &Url) -> (Option<VideoProvider>, Option<String>) {
165 let host = url.host_str().unwrap_or("");
166 let path = url.path();
167
168 if host.contains("youtube.com") || host.contains("youtube-nocookie.com") {
170 let video_id = self.extract_youtube_id(url);
171 return (Some(VideoProvider::YouTube), video_id);
172 }
173 if host.contains("youtu.be") {
174 let video_id = path.trim_start_matches('/').split('/').next().map(String::from);
175 return (Some(VideoProvider::YouTube), video_id);
176 }
177
178 if host.contains("vimeo.com") || host.contains("player.vimeo.com") {
180 let video_id = self.extract_vimeo_id(url);
181 return (Some(VideoProvider::Vimeo), video_id);
182 }
183
184 if host.contains("dailymotion.com") || host.contains("dai.ly") {
186 let video_id = self.extract_dailymotion_id(url);
187 return (Some(VideoProvider::Dailymotion), video_id);
188 }
189
190 if host.contains("twitch.tv") || host.contains("player.twitch.tv") {
192 return (Some(VideoProvider::Twitch), None);
193 }
194
195 if host.contains("facebook.com") && path.contains("video") {
197 return (Some(VideoProvider::Facebook), None);
198 }
199
200 if host.contains("twitter.com") || host.contains("x.com") {
202 return (Some(VideoProvider::Twitter), None);
203 }
204
205 if host.contains("tiktok.com") {
207 return (Some(VideoProvider::TikTok), None);
208 }
209
210 if host.contains("wistia.com") || host.contains("wistia.net") {
212 return (Some(VideoProvider::Wistia), None);
213 }
214
215 (None, None)
216 }
217
218 fn extract_youtube_id(&self, url: &Url) -> Option<String> {
220 if url.path().starts_with("/embed/") {
222 return url.path().strip_prefix("/embed/").map(|s| s.split('/').next().unwrap_or(s).to_string());
223 }
224 url.query_pairs().find(|(k, _)| k == "v").map(|(_, v)| v.to_string())
226 }
227
228 fn extract_vimeo_id(&self, url: &Url) -> Option<String> {
230 let re = Regex::new(r"/(\d+)").ok()?;
231 re.captures(url.path())
232 .and_then(|c| c.get(1))
233 .map(|m| m.as_str().to_string())
234 }
235
236 fn extract_dailymotion_id(&self, url: &Url) -> Option<String> {
238 let re = Regex::new(r"/video/([a-zA-Z0-9]+)").ok()?;
239 re.captures(url.path())
240 .and_then(|c| c.get(1))
241 .map(|m| m.as_str().to_string())
242 }
243}
244
245pub fn extract_manifest_urls(html: &str, base_url: &Url) -> Vec<(Url, VideoType)> {
247 let mut manifests = Vec::new();
248
249 let m3u8_re = Regex::new(r#"["']([^"']*\.m3u8[^"']*)["']"#).unwrap();
251 let mpd_re = Regex::new(r#"["']([^"']*\.mpd[^"']*)["']"#).unwrap();
252
253 for cap in m3u8_re.captures_iter(html) {
254 if let Some(m) = cap.get(1) {
255 if let Ok(url) = base_url.join(m.as_str()) {
256 manifests.push((url, VideoType::Hls));
257 }
258 }
259 }
260
261 for cap in mpd_re.captures_iter(html) {
262 if let Some(m) = cap.get(1) {
263 if let Ok(url) = base_url.join(m.as_str()) {
264 manifests.push((url, VideoType::Dash));
265 }
266 }
267 }
268
269 manifests
270}