// halldyll-media 0.1.0

//! Embedded content extraction for halldyll-media
//!
//! Extracts embedded content from HTML with support for:
//! - iframes (maps, social, widgets)
//! - Objects and embeds
//! - Platform detection (Google Maps, Twitter, Instagram, etc.)

use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;

use crate::types::{
    EmbeddedMedia, EmbedPlatform, MediaResult,
};

// ============================================================================
// REGEX PATTERNS
// ============================================================================

// All patterns are unanchored substring matches over the whole URL. The very
// short hostnames (`x.com`, `fb.com`) are wrapped in `\b` word boundaries so
// that they do not fire on longer hosts ending in the same letters
// (e.g. `netflix.com`, `dropbox.com`, `wix.com`).
lazy_static! {
    /// Google Maps URL pattern
    static ref GOOGLE_MAPS: Regex = Regex::new(
        r"google\.com/maps|maps\.google\."
    ).unwrap();

    /// Twitter/X embed pattern (`x.com` must start at a word boundary)
    static ref TWITTER: Regex = Regex::new(
        r"twitter\.com|\bx\.com\b|platform\.twitter"
    ).unwrap();

    /// Instagram embed pattern
    static ref INSTAGRAM: Regex = Regex::new(
        r"instagram\.com"
    ).unwrap();

    /// Facebook embed pattern (`fb.com` must start at a word boundary)
    static ref FACEBOOK: Regex = Regex::new(
        r"facebook\.com|\bfb\.com\b"
    ).unwrap();

    /// LinkedIn embed pattern
    static ref LINKEDIN: Regex = Regex::new(
        r"linkedin\.com"
    ).unwrap();

    /// Pinterest embed pattern
    static ref PINTEREST: Regex = Regex::new(
        r"pinterest\.com"
    ).unwrap();

    /// TikTok embed pattern
    static ref TIKTOK: Regex = Regex::new(
        r"tiktok\.com"
    ).unwrap();

    /// Reddit embed pattern
    static ref REDDIT: Regex = Regex::new(
        r"reddit\.com|redd\.it"
    ).unwrap();

    /// CodePen embed pattern
    static ref CODEPEN: Regex = Regex::new(
        r"codepen\.io"
    ).unwrap();

    /// JSFiddle embed pattern
    static ref JSFIDDLE: Regex = Regex::new(
        r"jsfiddle\.net"
    ).unwrap();

    /// CodeSandbox embed pattern
    static ref CODESANDBOX: Regex = Regex::new(
        r"codesandbox\.io"
    ).unwrap();

    /// Giphy embed pattern
    static ref GIPHY: Regex = Regex::new(
        r"giphy\.com"
    ).unwrap();

    /// SlideShare embed pattern
    static ref SLIDESHARE: Regex = Regex::new(
        r"slideshare\.net"
    ).unwrap();

    /// Typeform embed pattern
    static ref TYPEFORM: Regex = Regex::new(
        r"typeform\.com"
    ).unwrap();

    /// Calendly embed pattern
    static ref CALENDLY: Regex = Regex::new(
        r"calendly\.com"
    ).unwrap();

    /// Stripe embed pattern
    static ref STRIPE: Regex = Regex::new(
        r"stripe\.com"
    ).unwrap();

    /// PayPal embed pattern
    static ref PAYPAL: Regex = Regex::new(
        r"paypal\.com"
    ).unwrap();
}

// ============================================================================
// EXTRACTION FUNCTIONS
// ============================================================================

/// Extract all embedded content from HTML document
pub fn extract_embeds(document: &Html, base_url: Option<&Url>) -> Vec<EmbeddedMedia> {
    let mut embeds = Vec::new();
    let mut seen_urls: HashSet<String> = HashSet::new();
    
    // Extract from iframes
    if let Ok(sel) = Selector::parse("iframe[src]") {
        for el in document.select(&sel) {
            if let Some(embed) = extract_iframe(&el, base_url) {
                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
                if seen_urls.insert(key) {
                    embeds.push(embed);
                }
            }
        }
    }
    
    // Extract from object elements
    if let Ok(sel) = Selector::parse("object[data]") {
        for el in document.select(&sel) {
            if let Some(embed) = extract_object(&el, base_url) {
                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
                if seen_urls.insert(key) {
                    embeds.push(embed);
                }
            }
        }
    }
    
    // Extract from embed elements
    if let Ok(sel) = Selector::parse("embed[src]") {
        for el in document.select(&sel) {
            if let Some(embed) = extract_embed_tag(&el, base_url) {
                let key = embed.absolute_url.as_ref().unwrap_or(&embed.url).clone();
                if seen_urls.insert(key) {
                    embeds.push(embed);
                }
            }
        }
    }
    
    // Extract social embeds (blockquote/div with data attributes)
    extract_social_embeds(document, base_url, &mut embeds, &mut seen_urls);
    
    embeds
}

/// Build an `EmbeddedMedia` record from an `<iframe>` element.
///
/// Returns `None` when the element has no `src` attribute, when `src` is empty
/// or starts with `javascript:` / `about:`, or when the detected platform is
/// one that `is_video_platform` reports as handled elsewhere.
fn extract_iframe(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
    let node = el.value();
    let src = node.attr("src")?;

    // Placeholder and script URLs carry no embeddable content.
    let unusable = src.is_empty()
        || ["javascript:", "about:"]
            .iter()
            .any(|scheme| src.starts_with(*scheme));
    if unusable {
        return None;
    }

    // Platform detection runs on the raw attribute value, not the resolved URL.
    let platform = detect_embed_platform(src);
    if is_video_platform(&platform) {
        return None;
    }

    // Small accessors for the optional string and numeric attributes.
    let text_attr = |name: &str| node.attr(name).map(str::to_owned);
    let size_attr = |name: &str| node.attr(name).and_then(parse_dimension);

    Some(EmbeddedMedia {
        url: src.to_owned(),
        absolute_url: resolve_url(src, base_url),
        platform,
        title: text_attr("title"),
        width: size_attr("width"),
        height: size_attr("height"),
        allow: text_attr("allow"),
        sandbox: text_attr("sandbox"),
        loading: text_attr("loading"),
        frameborder: text_attr("frameborder"),
    })
}

/// Build an `EmbeddedMedia` record from an `<object>` element.
///
/// Returns `None` when the element has no `data` attribute or when the
/// lower-cased `data` value contains `.pdf`. Only `url`, `absolute_url`,
/// `platform`, `title`, `width` and `height` are populated; the remaining
/// fields keep their `Default` values.
fn extract_object(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
    let node = el.value();
    let data = node.attr("data")?;

    // PDFs are left out here (the original comment says documents.rs handles them).
    let looks_like_pdf = data.to_lowercase().contains(".pdf");
    if looks_like_pdf {
        return None;
    }

    let size_attr = |name: &str| node.attr(name).and_then(parse_dimension);

    let media = EmbeddedMedia {
        url: data.to_owned(),
        absolute_url: resolve_url(data, base_url),
        platform: detect_embed_platform(data),
        title: node.attr("title").map(str::to_owned),
        width: size_attr("width"),
        height: size_attr("height"),
        ..Default::default()
    };
    Some(media)
}

/// Build an `EmbeddedMedia` record from an `<embed>` element.
///
/// Returns `None` when the element has no `src` attribute, when the
/// lower-cased `src` contains `.pdf`, or when the detected platform is one
/// that `is_video_platform` reports as handled elsewhere. The `title` field is
/// always `None` for this element type.
fn extract_embed_tag(el: &ElementRef, base_url: Option<&Url>) -> Option<EmbeddedMedia> {
    let node = el.value();

    // Missing `src` and PDF sources are both rejected in one step.
    let src = node
        .attr("src")
        .filter(|candidate| !candidate.to_lowercase().contains(".pdf"))?;

    let platform = detect_embed_platform(src);
    if is_video_platform(&platform) {
        return None;
    }

    let size_attr = |name: &str| node.attr(name).and_then(parse_dimension);

    Some(EmbeddedMedia {
        url: src.to_owned(),
        absolute_url: resolve_url(src, base_url),
        platform,
        title: None,
        width: size_attr("width"),
        height: size_attr("height"),
        ..Default::default()
    })
}

/// Extract social embeds (Twitter, Instagram, etc.)
fn extract_social_embeds(
    document: &Html,
    _base_url: Option<&Url>,
    embeds: &mut Vec<EmbeddedMedia>,
    seen_urls: &mut HashSet<String>,
) {
    // Twitter embeds
    if let Ok(sel) = Selector::parse("blockquote.twitter-tweet") {
        for el in document.select(&sel) {
            if let Ok(link_sel) = Selector::parse("a") {
                for link in el.select(&link_sel) {
                    if let Some(href) = link.value().attr("href") {
                        if TWITTER.is_match(href) && seen_urls.insert(href.to_string()) {
                            embeds.push(EmbeddedMedia {
                                url: href.to_string(),
                                absolute_url: Some(href.to_string()),
                                platform: EmbedPlatform::Twitter,
                                ..Default::default()
                            });
                            break;
                        }
                    }
                }
            }
        }
    }
    
    // Instagram embeds
    if let Ok(sel) = Selector::parse("blockquote.instagram-media") {
        for el in document.select(&sel) {
            if let Some(permalink) = el.value().attr("data-instgrm-permalink") {
                if seen_urls.insert(permalink.to_string()) {
                    embeds.push(EmbeddedMedia {
                        url: permalink.to_string(),
                        absolute_url: Some(permalink.to_string()),
                        platform: EmbedPlatform::Instagram,
                        ..Default::default()
                    });
                }
            }
        }
    }
    
    // Facebook embeds
    if let Ok(sel) = Selector::parse("div.fb-post, div.fb-video") {
        for el in document.select(&sel) {
            if let Some(href) = el.value().attr("data-href") {
                if seen_urls.insert(href.to_string()) {
                    embeds.push(EmbeddedMedia {
                        url: href.to_string(),
                        absolute_url: Some(href.to_string()),
                        platform: EmbedPlatform::Facebook,
                        ..Default::default()
                    });
                }
            }
        }
    }
    
    // Reddit embeds
    if let Ok(sel) = Selector::parse("blockquote.reddit-embed-bq") {
        for el in document.select(&sel) {
            if let Ok(link_sel) = Selector::parse("a") {
                for link in el.select(&link_sel) {
                    if let Some(href) = link.value().attr("href") {
                        if REDDIT.is_match(href) && seen_urls.insert(href.to_string()) {
                            embeds.push(EmbeddedMedia {
                                url: href.to_string(),
                                absolute_url: Some(href.to_string()),
                                platform: EmbedPlatform::Reddit,
                                ..Default::default()
                            });
                            break;
                        }
                    }
                }
            }
        }
    }
}

/// Detect the embed platform from a URL.
///
/// The module-level regex patterns are tried first, in a fixed order, and the
/// first match wins. URLs that match none of them are then checked against the
/// hosts of the video/audio platforms that `is_video_platform` recognises, so
/// that callers which skip those platforms actually see them instead of
/// `EmbedPlatform::Other`. Anything else is `EmbedPlatform::Other`.
///
/// Matching is a substring test on the whole URL, not only on the host.
pub fn detect_embed_platform(url: &str) -> EmbedPlatform {
    // Host fragments of platforms owned by the video/audio extractors.
    // Checked last so every URL that matched a regex keeps its old result.
    const MEDIA_HOSTS: [(&str, EmbedPlatform); 11] = [
        ("youtube.com", EmbedPlatform::YouTube),
        ("youtube-nocookie.com", EmbedPlatform::YouTube),
        ("youtu.be", EmbedPlatform::YouTube),
        ("vimeo.com", EmbedPlatform::Vimeo),
        ("dailymotion.com", EmbedPlatform::Dailymotion),
        ("twitch.tv", EmbedPlatform::Twitch),
        ("wistia.com", EmbedPlatform::Wistia),
        ("wistia.net", EmbedPlatform::Wistia),
        ("spotify.com", EmbedPlatform::Spotify),
        ("soundcloud.com", EmbedPlatform::SoundCloud),
        ("podcasts.apple.com", EmbedPlatform::ApplePodcasts),
    ];

    if GOOGLE_MAPS.is_match(url) { return EmbedPlatform::GoogleMaps; }
    if TWITTER.is_match(url) { return EmbedPlatform::Twitter; }
    if INSTAGRAM.is_match(url) { return EmbedPlatform::Instagram; }
    if FACEBOOK.is_match(url) { return EmbedPlatform::Facebook; }
    if LINKEDIN.is_match(url) { return EmbedPlatform::LinkedIn; }
    if PINTEREST.is_match(url) { return EmbedPlatform::Pinterest; }
    if TIKTOK.is_match(url) { return EmbedPlatform::TikTok; }
    if REDDIT.is_match(url) { return EmbedPlatform::Reddit; }
    if CODEPEN.is_match(url) { return EmbedPlatform::CodePen; }
    if JSFIDDLE.is_match(url) { return EmbedPlatform::JsFiddle; }
    if CODESANDBOX.is_match(url) { return EmbedPlatform::CodeSandbox; }
    if GIPHY.is_match(url) { return EmbedPlatform::Giphy; }
    if SLIDESHARE.is_match(url) { return EmbedPlatform::SlideShare; }
    if TYPEFORM.is_match(url) { return EmbedPlatform::Typeform; }
    if CALENDLY.is_match(url) { return EmbedPlatform::Calendly; }
    if STRIPE.is_match(url) { return EmbedPlatform::Stripe; }
    if PAYPAL.is_match(url) { return EmbedPlatform::PayPal; }

    MEDIA_HOSTS
        .iter()
        .find(|(host, _)| url.contains(*host))
        .map(|&(_, platform)| platform)
        .unwrap_or(EmbedPlatform::Other)
}

/// Report whether `platform` is one of the video/audio platforms that this
/// module deliberately leaves to other extractors.
fn is_video_platform(platform: &EmbedPlatform) -> bool {
    // Platforms excluded from embed extraction.
    const HANDLED_ELSEWHERE: [EmbedPlatform; 8] = [
        EmbedPlatform::YouTube,
        EmbedPlatform::Vimeo,
        EmbedPlatform::Dailymotion,
        EmbedPlatform::Twitch,
        EmbedPlatform::Wistia,
        EmbedPlatform::Spotify,
        EmbedPlatform::SoundCloud,
        EmbedPlatform::ApplePodcasts,
    ];

    HANDLED_ELSEWHERE.contains(platform)
}

/// Parse an HTML `width`/`height` attribute value into a whole number.
///
/// Surrounding whitespace and one optional leading `+` are ignored, then the
/// leading run of ASCII digits is parsed; whatever follows (a unit such as
/// `px` in any case, a `%` sign, a fractional part, stray spaces) is
/// discarded. This keeps every value the previous implementation accepted
/// (`"600"`, `"600px"`, `"100%"`) and additionally accepts forms such as
/// `"600 px"`, `"300PX"` and `"450.5"`.
///
/// Returns `None` when the value does not start with a digit or does not fit
/// in a `u32`. Note that a percentage is returned as its bare number, so
/// `"100%"` and `"100"` are indistinguishable to the caller.
fn parse_dimension(s: &str) -> Option<u32> {
    let value = s.trim();
    let value = if value.starts_with('+') { &value[1..] } else { value };

    // Byte offset of the first non-digit; ASCII digits are single bytes, so
    // the slice below always ends on a character boundary.
    let digits_end = value
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(value.len());

    // An empty digit run fails to parse and therefore yields `None`.
    value[..digits_end].parse().ok()
}

/// Turn an attribute URL into an absolute URL string where possible.
///
/// - Values starting with `http://` or `https://` are returned unchanged.
/// - Protocol-relative values (`//host/...`) are always prefixed with
///   `https:`, regardless of the scheme of `base_url`.
/// - Anything else is joined onto `base_url`; the result is `None` when no
///   base is available or the join fails.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    let already_absolute = ["http://", "https://"]
        .iter()
        .any(|scheme| href.starts_with(*scheme));

    if already_absolute {
        Some(href.to_owned())
    } else if href.starts_with("//") {
        Some(format!("https:{}", href))
    } else {
        let base = base_url?;
        match base.join(href) {
            Ok(joined) => Some(joined.to_string()),
            Err(_) => None,
        }
    }
}

// ============================================================================
// CONVENIENCE FUNCTIONS
// ============================================================================

/// Parse `html` as a full document and extract its embeds.
///
/// `base_url`, when given, is parsed as a URL and used to resolve relative
/// embed URLs; a base that fails to parse is silently treated as absent.
/// As written, this function always returns `Ok`.
pub fn extract_embeds_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<EmbeddedMedia>> {
    let parsed_base = match base_url {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };
    let document = Html::parse_document(html);
    let embeds = extract_embeds(&document, parsed_base.as_ref());
    Ok(embeds)
}

/// Return the absolute URL of every embed found in `html`.
///
/// Embeds whose URL could not be resolved to an absolute form are omitted.
/// An extraction error yields an empty list.
pub fn get_embed_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    let embeds = match extract_embeds_from_html(html, base_url) {
        Ok(list) => list,
        Err(_) => Vec::new(),
    };

    let mut urls = Vec::new();
    for embed in embeds {
        if let Some(absolute) = embed.absolute_url {
            urls.push(absolute);
        }
    }
    urls
}

/// Cheap check for whether `document` contains any embed candidates.
///
/// Looks for the same element shapes that `extract_embeds` inspects:
/// `<iframe src>`, `<object data>`, `<embed src>`, and the social placeholders
/// (Twitter, Instagram, Facebook and Reddit blockquotes/divs). This is a
/// structural test only: an element that `extract_embeds` would later reject
/// (for example an iframe with an empty `src`) still counts here.
pub fn has_embeds(document: &Html) -> bool {
    // Keep this list in step with the selectors used during extraction so a
    // page whose only embeds are social blockquotes is not reported as empty.
    const CANDIDATES: &str = "iframe[src], object[data], embed[src], \
        blockquote.twitter-tweet, \
        blockquote.instagram-media[data-instgrm-permalink], \
        div.fb-post[data-href], div.fb-video[data-href], \
        blockquote.reddit-embed-bq";

    match Selector::parse(CANDIDATES) {
        Ok(sel) => document.select(&sel).next().is_some(),
        Err(_) => false,
    }
}

/// Return references to the embeds whose platform equals `platform`,
/// preserving their original order.
pub fn filter_by_platform(embeds: &[EmbeddedMedia], platform: EmbedPlatform) -> Vec<&EmbeddedMedia> {
    let mut matching = Vec::new();
    for embed in embeds {
        if embed.platform == platform {
            matching.push(embed);
        }
    }
    matching
}

/// Return references to the embeds detected as Google Maps, in their
/// original order.
pub fn get_maps(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
    embeds
        .iter()
        .filter(|embed| embed.platform == EmbedPlatform::GoogleMaps)
        .collect()
}

/// Get social embeds
pub fn get_social_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
    embeds.iter()
        .filter(|e| matches!(e.platform,
            EmbedPlatform::Twitter |
            EmbedPlatform::Instagram |
            EmbedPlatform::Facebook |
            EmbedPlatform::LinkedIn |
            EmbedPlatform::Pinterest |
            EmbedPlatform::TikTok |
            EmbedPlatform::Reddit
        ))
        .collect()
}

/// Get code embeds (CodePen, JSFiddle, etc.)
pub fn get_code_embeds(embeds: &[EmbeddedMedia]) -> Vec<&EmbeddedMedia> {
    embeds.iter()
        .filter(|e| matches!(e.platform,
            EmbedPlatform::CodePen |
            EmbedPlatform::JsFiddle |
            EmbedPlatform::CodeSandbox
        ))
        .collect()
}

/// Tally how many embeds were found for each platform.
///
/// Platforms with no embeds do not appear in the returned map.
pub fn count_by_platform(embeds: &[EmbeddedMedia]) -> std::collections::HashMap<EmbedPlatform, usize> {
    embeds
        .iter()
        .fold(std::collections::HashMap::new(), |mut tally, embed| {
            *tally.entry(embed.platform).or_insert(0) += 1;
            tally
        })
}

// ============================================================================
// TESTS
// ============================================================================

// Unit tests for embed extraction, platform detection and the convenience
// filters. HTML fixtures are parsed with `Html::parse_document`.
#[cfg(test)]
mod tests {
    use super::*;

    // Shared helper: parse an HTML snippet into a full document.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // A Google Maps iframe is detected and its numeric dimensions are parsed.
    #[test]
    fn test_extract_google_maps_iframe() {
        let html = r#"<iframe src="https://www.google.com/maps/embed?pb=..." width="600" height="450"></iframe>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert_eq!(embeds.len(), 1);
        assert_eq!(embeds[0].platform, EmbedPlatform::GoogleMaps);
        assert_eq!(embeds[0].width, Some(600));
        assert_eq!(embeds[0].height, Some(450));
    }

    // A CodePen iframe is detected and its `title` attribute is carried over.
    #[test]
    fn test_extract_codepen_embed() {
        let html = r#"<iframe src="https://codepen.io/user/embed/pen" title="CodePen"></iframe>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert_eq!(embeds.len(), 1);
        assert_eq!(embeds[0].platform, EmbedPlatform::CodePen);
        assert_eq!(embeds[0].title, Some("CodePen".to_string()));
    }

    // Direct checks of URL-to-platform detection, including the `Other` fallback.
    #[test]
    fn test_detect_platform() {
        assert_eq!(detect_embed_platform("https://www.google.com/maps/embed"), EmbedPlatform::GoogleMaps);
        assert_eq!(detect_embed_platform("https://twitter.com/user/status/123"), EmbedPlatform::Twitter);
        assert_eq!(detect_embed_platform("https://www.instagram.com/p/abc"), EmbedPlatform::Instagram);
        assert_eq!(detect_embed_platform("https://codepen.io/user/pen/abc"), EmbedPlatform::CodePen);
        assert_eq!(detect_embed_platform("https://example.com/widget"), EmbedPlatform::Other);
    }

    // A Typeform iframe on a subdomain is still detected as Typeform.
    #[test]
    fn test_extract_typeform() {
        let html = r#"<iframe src="https://form.typeform.com/to/abc123"></iframe>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert_eq!(embeds.len(), 1);
        assert_eq!(embeds[0].platform, EmbedPlatform::Typeform);
    }

    // A Calendly iframe is detected as Calendly.
    #[test]
    fn test_extract_calendly() {
        let html = r#"<iframe src="https://calendly.com/user/meeting"></iframe>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert_eq!(embeds.len(), 1);
        assert_eq!(embeds[0].platform, EmbedPlatform::Calendly);
    }

    // Iframes with an empty or `javascript:` source produce no embeds.
    #[test]
    fn test_skip_empty_src() {
        let html = r#"<iframe src=""></iframe><iframe src="javascript:void(0)"></iframe>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert!(embeds.is_empty());
    }

    // `has_embeds` is true for a document with an iframe and false without one.
    #[test]
    fn test_has_embeds() {
        let with_embed = r#"<iframe src="https://example.com"></iframe>"#;
        let without_embed = r#"<div>No embed</div>"#;
        
        assert!(has_embeds(&parse_html(with_embed)));
        assert!(!has_embeds(&parse_html(without_embed)));
    }

    // Plain, `px`-suffixed and percentage values parse; non-numeric text does not.
    #[test]
    fn test_parse_dimension() {
        assert_eq!(parse_dimension("600"), Some(600));
        assert_eq!(parse_dimension("600px"), Some(600));
        assert_eq!(parse_dimension("100%"), Some(100));
        assert_eq!(parse_dimension("invalid"), None);
    }

    // Only the Twitter and Instagram entries count as social embeds.
    #[test]
    fn test_get_social_embeds() {
        let embeds = vec![
            EmbeddedMedia { platform: EmbedPlatform::Twitter, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::GoogleMaps, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::Instagram, ..Default::default() },
        ];
        
        let social = get_social_embeds(&embeds);
        assert_eq!(social.len(), 2);
    }

    // Only the CodePen and JSFiddle entries count as code embeds.
    #[test]
    fn test_get_code_embeds() {
        let embeds = vec![
            EmbeddedMedia { platform: EmbedPlatform::CodePen, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::JsFiddle, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::Twitter, ..Default::default() },
        ];
        
        let code = get_code_embeds(&embeds);
        assert_eq!(code.len(), 2);
    }

    // A `blockquote.twitter-tweet` placeholder yields one Twitter embed.
    #[test]
    fn test_twitter_blockquote() {
        let html = r#"<blockquote class="twitter-tweet"><a href="https://twitter.com/user/status/123">Tweet</a></blockquote>"#;
        let doc = parse_html(html);
        let embeds = extract_embeds(&doc, None);
        
        assert_eq!(embeds.len(), 1);
        assert_eq!(embeds[0].platform, EmbedPlatform::Twitter);
    }

    // Per-platform tallies match the number of entries for each platform.
    #[test]
    fn test_count_by_platform() {
        let embeds = vec![
            EmbeddedMedia { platform: EmbedPlatform::Twitter, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::Twitter, ..Default::default() },
            EmbeddedMedia { platform: EmbedPlatform::CodePen, ..Default::default() },
        ];
        
        let counts = count_by_platform(&embeds);
        assert_eq!(counts.get(&EmbedPlatform::Twitter), Some(&2));
        assert_eq!(counts.get(&EmbedPlatform::CodePen), Some(&1));
    }
}