halldyll-core 0.1.0

Core scraping engine for Halldyll - high-performance async web scraper for AI agents
Documentation
//! Videos - Video extraction

use regex::Regex;
use scraper::{Html, Selector};
use url::Url;

use crate::types::assets::{VideoAsset, VideoProvider, VideoType};

/// Stateless extractor that collects video assets from an HTML document.
///
/// The type carries no data, so every instance is interchangeable; use
/// [`VideoExtractor::new`] or [`Default::default`] to obtain one.
// `Default` is derived rather than hand-written: for a unit struct the derived
// implementation is identical. `Debug` and `Clone` are derived so the type can
// be embedded in other derivable structures.
#[derive(Debug, Clone, Default)]
pub struct VideoExtractor;

impl VideoExtractor {
    /// New extractor
    pub fn new() -> Self {
        Self
    }

    /// Extract all videos
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<VideoAsset> {
        let document = Html::parse_document(html);
        let mut videos = Vec::new();

        // 1. <video><source> tags
        videos.extend(self.extract_video_tags(&document, base_url));

        // 2. Iframes (YouTube, Vimeo, etc.)
        videos.extend(self.extract_iframes(&document, base_url));

        // Deduplicate
        videos.dedup_by(|a, b| a.urls == b.urls);

        videos
    }

    /// Extract <video> tags
    fn extract_video_tags(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
        let video_selector = Selector::parse("video").unwrap();
        let source_selector = Selector::parse("source").unwrap();
        let mut videos = Vec::new();

        for video in document.select(&video_selector) {
            let attrs = video.value();
            let mut urls = Vec::new();

            // Direct src attribute
            if let Some(src) = attrs.attr("src") {
                if let Ok(url) = base_url.join(src) {
                    urls.push(url);
                }
            }

            // Child sources
            for source in video.select(&source_selector) {
                if let Some(src) = source.value().attr("src") {
                    if let Ok(url) = base_url.join(src) {
                        urls.push(url);
                    }
                }
            }

            if urls.is_empty() {
                continue;
            }

            // Poster
            let poster = attrs.attr("poster").and_then(|p| base_url.join(p).ok());

            // Type
            let video_type = self.detect_video_type(&urls);

            // Dimensions
            let width = attrs.attr("width").and_then(|w| w.parse().ok());
            let height = attrs.attr("height").and_then(|h| h.parse().ok());

            videos.push(VideoAsset {
                urls,
                poster,
                video_type,
                duration: None,
                width,
                height,
                provider: None,
                provider_id: None,
                iframe_src: None,
            });
        }

        videos
    }

    /// Extract video iframes
    fn extract_iframes(&self, document: &Html, base_url: &Url) -> Vec<VideoAsset> {
        let iframe_selector = Selector::parse("iframe").unwrap();
        let mut videos = Vec::new();

        for iframe in document.select(&iframe_selector) {
            let src = match iframe.value().attr("src").or_else(|| iframe.value().attr("data-src")) {
                Some(s) => s,
                None => continue,
            };

            // Resolve URL
            let iframe_url = match Url::parse(src).or_else(|_| base_url.join(src)) {
                Ok(u) => u,
                Err(_) => continue,
            };

            // Detect provider
            let (provider, provider_id) = self.detect_provider(&iframe_url);
            if provider.is_none() {
                continue; // Not a known video iframe
            }

            // Dimensions
            let width = iframe.value().attr("width").and_then(|w| w.parse().ok());
            let height = iframe.value().attr("height").and_then(|h| h.parse().ok());

            videos.push(VideoAsset {
                urls: vec![iframe_url.clone()],
                poster: None,
                video_type: VideoType::Embed,
                duration: None,
                width,
                height,
                provider,
                provider_id,
                iframe_src: Some(iframe_url),
            });
        }

        videos
    }

    /// Detect video type
    fn detect_video_type(&self, urls: &[Url]) -> VideoType {
        for url in urls {
            let path = url.path().to_lowercase();
            if path.ends_with(".mp4") {
                return VideoType::Mp4;
            }
            if path.ends_with(".webm") {
                return VideoType::WebM;
            }
            if path.ends_with(".ogg") || path.ends_with(".ogv") {
                return VideoType::Ogg;
            }
            if path.ends_with(".m3u8") {
                return VideoType::Hls;
            }
            if path.ends_with(".mpd") {
                return VideoType::Dash;
            }
        }
        VideoType::Unknown
    }

    /// Detect video provider and ID
    fn detect_provider(&self, url: &Url) -> (Option<VideoProvider>, Option<String>) {
        let host = url.host_str().unwrap_or("");
        let path = url.path();

        // YouTube
        if host.contains("youtube.com") || host.contains("youtube-nocookie.com") {
            let video_id = self.extract_youtube_id(url);
            return (Some(VideoProvider::YouTube), video_id);
        }
        if host.contains("youtu.be") {
            let video_id = path.trim_start_matches('/').split('/').next().map(String::from);
            return (Some(VideoProvider::YouTube), video_id);
        }

        // Vimeo
        if host.contains("vimeo.com") || host.contains("player.vimeo.com") {
            let video_id = self.extract_vimeo_id(url);
            return (Some(VideoProvider::Vimeo), video_id);
        }

        // Dailymotion
        if host.contains("dailymotion.com") || host.contains("dai.ly") {
            let video_id = self.extract_dailymotion_id(url);
            return (Some(VideoProvider::Dailymotion), video_id);
        }

        // Twitch
        if host.contains("twitch.tv") || host.contains("player.twitch.tv") {
            return (Some(VideoProvider::Twitch), None);
        }

        // Facebook
        if host.contains("facebook.com") && path.contains("video") {
            return (Some(VideoProvider::Facebook), None);
        }

        // Twitter/X
        if host.contains("twitter.com") || host.contains("x.com") {
            return (Some(VideoProvider::Twitter), None);
        }

        // TikTok
        if host.contains("tiktok.com") {
            return (Some(VideoProvider::TikTok), None);
        }

        // Wistia
        if host.contains("wistia.com") || host.contains("wistia.net") {
            return (Some(VideoProvider::Wistia), None);
        }

        (None, None)
    }

    /// Extrait l'ID YouTube
    fn extract_youtube_id(&self, url: &Url) -> Option<String> {
        // /embed/VIDEO_ID
        if url.path().starts_with("/embed/") {
            return url.path().strip_prefix("/embed/").map(|s| s.split('/').next().unwrap_or(s).to_string());
        }
        // ?v=VIDEO_ID
        url.query_pairs().find(|(k, _)| k == "v").map(|(_, v)| v.to_string())
    }

    /// Extrait l'ID Vimeo
    fn extract_vimeo_id(&self, url: &Url) -> Option<String> {
        let re = Regex::new(r"/(\d+)").ok()?;
        re.captures(url.path())
            .and_then(|c| c.get(1))
            .map(|m| m.as_str().to_string())
    }

    /// Extrait l'ID Dailymotion
    fn extract_dailymotion_id(&self, url: &Url) -> Option<String> {
        let re = Regex::new(r"/video/([a-zA-Z0-9]+)").ok()?;
        re.captures(url.path())
            .and_then(|c| c.get(1))
            .map(|m| m.as_str().to_string())
    }
}

/// Extracts HLS/DASH manifest URLs from a page.
///
/// Scans the raw `html` text for single- or double-quoted strings containing
/// `.m3u8` (HLS) or `.mpd` (DASH) and resolves each against `base_url`.
/// Strings that cannot be resolved are skipped. All HLS matches come first,
/// followed by all DASH matches, each group in document order.
pub fn extract_manifest_urls(html: &str, base_url: &Url) -> Vec<(Url, VideoType)> {
    // Patterns for quoted strings mentioning .m3u8 and .mpd
    let hls_pattern = Regex::new(r#"["']([^"']*\.m3u8[^"']*)["']"#).unwrap();
    let dash_pattern = Regex::new(r#"["']([^"']*\.mpd[^"']*)["']"#).unwrap();

    let hls_manifests = hls_pattern
        .captures_iter(html)
        .filter_map(|caps| caps.get(1))
        .filter_map(|hit| base_url.join(hit.as_str()).ok())
        .map(|url| (url, VideoType::Hls));

    let dash_manifests = dash_pattern
        .captures_iter(html)
        .filter_map(|caps| caps.get(1))
        .filter_map(|hit| base_url.join(hit.as_str()).ok())
        .map(|url| (url, VideoType::Dash));

    hls_manifests.chain(dash_manifests).collect()
}