use regex::Regex;
use scraper::{Html, Selector};
use url::Url;
use crate::types::assets::{VideoAsset, VideoProvider, VideoType};
/// Extracts video assets from an HTML document.
///
/// The extractor is a stateless unit type: it looks at native `<video>` elements and at
/// `<iframe>` embeds from recognised video providers. Because it carries no data, it is cheap
/// to copy and its `Default` value is the only possible value.
#[derive(Debug, Clone, Copy, Default)]
pub struct VideoExtractor;
impl VideoExtractor {
pub fn new() -> Self {
Self
}
/// Extracts every video asset found in `html`.
///
/// Native `<video>` elements are collected first, followed by `<iframe>` embeds from
/// recognised providers. Relative URLs are resolved against `base_url`.
///
/// An asset whose URL list equals that of an earlier asset is dropped, so each distinct URL
/// list is reported once, in first-seen order.
pub fn extract(&self, html: &str, base_url: &Url) -> Vec<VideoAsset> {
    let document = Html::parse_document(html);

    let mut videos = self.extract_video_tags(&document, base_url);
    videos.extend(self.extract_iframes(&document, base_url));

    // `Vec::dedup_by` only collapses *adjacent* equal elements and this list is never
    // sorted, so remember every URL list seen so far and drop later repeats instead.
    let mut seen = std::collections::HashSet::new();
    videos.retain(|video| seen.insert(video.urls.clone()));
    videos
}
/// Collects one asset per `<video>` element that has at least one usable source URL.
///
/// Source URLs come from the element's own `src` attribute followed by the `src` of every
/// descendant `<source>` element, each resolved against `base_url`. Elements without any
/// resolvable URL are skipped. `poster`, `width` and `height` are filled in when present and
/// parseable; the provider-related fields and `duration` are always `None` here.
fn extract_video_tags(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
    // Resolves an attribute value against `base`, treating blank values as absent.
    // Joining an empty string yields `base` itself, which would report the page URL as a
    // video (or poster) for markup such as `<video src="">`.
    fn resolve(base: &Url, raw: &str) -> Option<Url> {
        let raw = raw.trim();
        if raw.is_empty() {
            None
        } else {
            base.join(raw).ok()
        }
    }

    let video_selector = Selector::parse("video").expect("`video` is a valid CSS selector");
    let source_selector = Selector::parse("source").expect("`source` is a valid CSS selector");

    let mut videos = Vec::new();
    for video in document.select(&video_selector) {
        let attrs = video.value();

        let mut urls = Vec::new();
        if let Some(url) = attrs.attr("src").and_then(|src| resolve(base_url, src)) {
            urls.push(url);
        }
        for source in video.select(&source_selector) {
            if let Some(url) = source
                .value()
                .attr("src")
                .and_then(|src| resolve(base_url, src))
            {
                urls.push(url);
            }
        }
        if urls.is_empty() {
            continue;
        }

        let poster = attrs.attr("poster").and_then(|p| resolve(base_url, p));
        // The container type is guessed from the URL path extensions only.
        let video_type = self.detect_video_type(&urls);
        // Non-numeric dimensions (e.g. "100%") simply become `None`.
        let width = attrs.attr("width").and_then(|w| w.parse().ok());
        let height = attrs.attr("height").and_then(|h| h.parse().ok());

        videos.push(VideoAsset {
            urls,
            poster,
            video_type,
            duration: None,
            width,
            height,
            provider: None,
            provider_id: None,
            iframe_src: None,
        });
    }
    videos
}
/// Collects one `VideoType::Embed` asset per `<iframe>` that points at a recognised provider.
///
/// The `src` attribute is tried first and `data-src` second; the first candidate that is
/// non-blank, resolves to a URL (absolute, or relative to `base_url`) and is recognised by
/// `detect_provider` wins. Iframes with no such candidate are skipped.
fn extract_iframes(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
    let iframe_selector = Selector::parse("iframe").expect("`iframe` is a valid CSS selector");

    let mut videos = Vec::new();
    for iframe in document.select(&iframe_selector) {
        let attrs = iframe.value();

        // Lazy-loading markup often carries an empty or placeholder `src` and the real
        // embed URL in `data-src`, so a useless `src` must not hide the fallback.
        let candidates = [attrs.attr("src"), attrs.attr("data-src")];
        let mut embed = None;
        for raw in candidates.iter().flatten() {
            let raw = raw.trim();
            if raw.is_empty() {
                // A blank value would resolve to `base_url` itself.
                continue;
            }
            let candidate = match Url::parse(raw).or_else(|_| base_url.join(raw)) {
                Ok(url) => url,
                Err(_) => continue,
            };
            if let (Some(provider), provider_id) = self.detect_provider(&candidate) {
                embed = Some((candidate, provider, provider_id));
                break;
            }
        }
        let (iframe_url, provider, provider_id) = match embed {
            Some(found) => found,
            None => continue,
        };

        // Non-numeric dimensions (e.g. "100%") simply become `None`.
        let width = attrs.attr("width").and_then(|w| w.parse().ok());
        let height = attrs.attr("height").and_then(|h| h.parse().ok());

        videos.push(VideoAsset {
            urls: vec![iframe_url.clone()],
            poster: None,
            video_type: VideoType::Embed,
            duration: None,
            width,
            height,
            provider: Some(provider),
            provider_id,
            iframe_src: Some(iframe_url),
        });
    }
    videos
}
/// Guesses the video format from the file extension of the URL paths.
///
/// URLs are inspected in order and the first one whose lower-cased path ends in a known
/// extension decides the result. Query strings and fragments are not part of the path and
/// are therefore ignored. Returns `VideoType::Unknown` when no URL matches.
fn detect_video_type(&self, urls: &[Url]) -> VideoType {
    urls.iter()
        .find_map(|candidate| {
            let lowered = candidate.path().to_lowercase();
            // Everything after the last '.' is the extension; a path without a dot has none.
            let (_, extension) = lowered.rsplit_once('.')?;
            match extension {
                "mp4" => Some(VideoType::Mp4),
                "webm" => Some(VideoType::WebM),
                "ogg" | "ogv" => Some(VideoType::Ogg),
                "m3u8" => Some(VideoType::Hls),
                "mpd" => Some(VideoType::Dash),
                _ => None,
            }
        })
        .unwrap_or(VideoType::Unknown)
}
/// Identifies the video provider behind `url` and, where supported, the provider's video id.
///
/// Matching is done on the URL host: a host matches a provider domain when it is exactly
/// that domain or one of its subdomains. Facebook additionally requires the path to contain
/// `"video"`. An id is only extracted for YouTube, Vimeo and Dailymotion; other providers
/// return `None` for the id. Returns `(None, None)` for unrecognised hosts.
fn detect_provider(&self, url: &Url) -> (Option<VideoProvider>, Option<String>) {
    // Returns `true` when `host` equals one of `domains` or is a subdomain of one.
    // A plain substring test would misfire badly: "netflix.com" and "dropbox.com" both
    // contain "x.com", and "youtube.com.example.org" contains "youtube.com".
    fn host_in(host: &str, domains: &[&str]) -> bool {
        domains.iter().any(|domain| {
            host.strip_suffix(domain)
                .map_or(false, |prefix| prefix.is_empty() || prefix.ends_with('.'))
        })
    }

    // Tolerate a fully-qualified host written with a trailing dot.
    let host = url.host_str().unwrap_or("").trim_end_matches('.');
    let path = url.path();

    if host_in(host, &["youtube.com", "youtube-nocookie.com"]) {
        let video_id = self.extract_youtube_id(url);
        return (Some(VideoProvider::YouTube), video_id);
    }
    if host_in(host, &["youtu.be"]) {
        // Short links carry the id as the first path segment; a bare "/" has no id.
        let video_id = path
            .trim_start_matches('/')
            .split('/')
            .next()
            .filter(|id| !id.is_empty())
            .map(String::from);
        return (Some(VideoProvider::YouTube), video_id);
    }
    if host_in(host, &["vimeo.com"]) {
        let video_id = self.extract_vimeo_id(url);
        return (Some(VideoProvider::Vimeo), video_id);
    }
    if host_in(host, &["dailymotion.com", "dai.ly"]) {
        let video_id = self.extract_dailymotion_id(url);
        return (Some(VideoProvider::Dailymotion), video_id);
    }
    if host_in(host, &["twitch.tv"]) {
        return (Some(VideoProvider::Twitch), None);
    }
    if host_in(host, &["facebook.com"]) && path.contains("video") {
        return (Some(VideoProvider::Facebook), None);
    }
    if host_in(host, &["twitter.com", "x.com"]) {
        return (Some(VideoProvider::Twitter), None);
    }
    if host_in(host, &["tiktok.com"]) {
        return (Some(VideoProvider::TikTok), None);
    }
    if host_in(host, &["wistia.com", "wistia.net"]) {
        return (Some(VideoProvider::Wistia), None);
    }
    (None, None)
}
/// Extracts the video id from a YouTube URL.
///
/// For `/embed/<id>` paths the id is the path segment right after `/embed/`. For any other
/// path the value of the `v` query parameter is used. Returns `None` when the id is missing
/// or empty.
fn extract_youtube_id(&self, url: &Url) -> Option<String> {
    if let Some(rest) = url.path().strip_prefix("/embed/") {
        // Embed URLs never fall back to `?v=`; an empty segment means "no id".
        return rest
            .split('/')
            .next()
            .filter(|id| !id.is_empty())
            .map(str::to_string);
    }

    url.query_pairs()
        .find(|(k, _)| k == "v")
        .map(|(_, v)| v.into_owned())
        .filter(|id| !id.is_empty())
}
fn extract_vimeo_id(&self, url: &Url) -> Option<String> {
let re = Regex::new(r"/(\d+)").ok()?;
re.captures(url.path())
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
fn extract_dailymotion_id(&self, url: &Url) -> Option<String> {
let re = Regex::new(r"/video/([a-zA-Z0-9]+)").ok()?;
re.captures(url.path())
.and_then(|c| c.get(1))
.map(|m| m.as_str().to_string())
}
}
/// Scans raw HTML (including inline scripts) for quoted HLS and DASH manifest URLs.
///
/// Any single- or double-quoted string containing `.m3u8` is reported as `VideoType::Hls`
/// and any containing `.mpd` as `VideoType::Dash`. Each string is resolved against
/// `base_url`; strings that fail to resolve are skipped. All HLS entries come first,
/// followed by all DASH entries, each group in document order with repeated URLs reported
/// only once.
///
/// # Panics
///
/// Never in practice: the two patterns are constants that are valid regular expressions.
pub fn extract_manifest_urls(html: &str, base_url: &Url) -> Vec<(Url, VideoType)> {
    // Returns the distinct URLs captured by `pattern`, in first-seen order.
    fn unique_urls(pattern: &str, html: &str, base_url: &Url) -> Vec<Url> {
        let re = Regex::new(pattern).expect("manifest pattern is a valid regex");
        let mut urls: Vec<Url> = Vec::new();
        for cap in re.captures_iter(html) {
            let raw = match cap.get(1) {
                Some(m) => m.as_str(),
                None => continue,
            };
            // URLs embedded in inline JSON are commonly written with escaped slashes
            // (`https:\/\/cdn\/a.m3u8`); undo that before resolving so the path is not
            // mangled by the URL parser's backslash handling.
            let raw = raw.replace("\\/", "/");
            if let Ok(url) = base_url.join(&raw) {
                // The same manifest is often referenced several times on one page.
                if !urls.contains(&url) {
                    urls.push(url);
                }
            }
        }
        urls
    }

    let hls = unique_urls(r#"["']([^"']*\.m3u8[^"']*)["']"#, html, base_url);
    let dash = unique_urls(r#"["']([^"']*\.mpd[^"']*)["']"#, html, base_url);

    let mut manifests = Vec::with_capacity(hls.len() + dash.len());
    manifests.extend(hls.into_iter().map(|url| (url, VideoType::Hls)));
    manifests.extend(dash.into_iter().map(|url| (url, VideoType::Dash)));
    manifests
}