halldyll-media 0.1.0

//! Image extraction for halldyll-media
//!
//! Extracts images from HTML with support for:
//! - Regular img tags
//! - Lazy loading (data-src, data-lazy-src, etc.)
//! - Responsive images (srcset, sizes)
//! - Picture elements with source sets
//! - Background images in inline styles (the `BG_IMAGE_PATTERN` regex is defined, but no extraction path in this file uses it yet)
//! - Figure/figcaption associations

use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector, ElementRef};
use std::collections::HashSet;
use url::Url;

use crate::types::{
    ImageMedia, ImageFormat, ImageLoading, SrcsetEntry, MediaResult,
};

lazy_static! {
    /// Regex for a single srcset candidate: a run of non-space, non-comma
    /// characters (the URL), optionally followed by a `<digits>w` width
    /// descriptor or a `<number>x` density descriptor.
    ///
    /// NOTE(review): nothing in the visible part of this file references this
    /// pattern; `parse_srcset` splits the attribute by hand instead.
    static ref SRCSET_ENTRY: Regex = Regex::new(
        r"([^\s,]+)\s*(\d+w|\d+(?:\.\d+)?x)?"
    ).unwrap();
    
    /// Matches strings that start with `data:image/` (inline image data URLs).
    static ref DATA_URL: Regex = Regex::new(r"^data:image/").unwrap();
    
    /// Case-insensitive match for words that commonly appear in placeholder
    /// image URLs. The match is unanchored, so the word may occur anywhere in
    /// the tested string.
    static ref PLACEHOLDER_PATTERN: Regex = Regex::new(
        r"(?i)(placeholder|blank|spacer|pixel|1x1|loading|lazy)"
    ).unwrap();
    
    /// Captures the URL inside a CSS `background: url(...)` or
    /// `background-image: url(...)` declaration (capture group 1), with
    /// optional single or double quotes around the URL.
    ///
    /// NOTE(review): not referenced by any function visible in this file.
    static ref BG_IMAGE_PATTERN: Regex = Regex::new(
        r#"background(?:-image)?\s*:\s*url\(['"]?([^'")]+)['"]?\)"#
    ).unwrap();
    
    /// Attribute names checked for a lazily loaded image source.
    ///
    /// Callers scan this list front to back and stop at the first attribute
    /// present on the element, so the order is the lookup priority.
    static ref LAZY_ATTRS: Vec<&'static str> = vec![
        "data-src",
        "data-lazy-src",
        "data-original",
        "data-srcset",
        "data-lazy-srcset",
        "data-bg",
        "data-background",
        "data-image",
        "data-url",
    ];
}

// ============================================================================
// EXTRACTION FUNCTIONS
// ============================================================================

/// Extract all images from an HTML document.
///
/// Three passes run over the document:
///
/// 1. every `<img>` element is converted with `extract_image_element`;
/// 2. every `<picture>` element is converted with `extract_picture_element`,
///    and the `<source>` candidates it collected are merged into the matching
///    image from pass 1 (or the image is added if it was not seen before);
/// 3. every `<img>` inside a `<figure>` gets the figure caption as `alt` text
///    when its own `alt` is missing or empty and a caption is available.
///
/// Images are de-duplicated by their absolute URL, falling back to the raw
/// `src` when no absolute URL could be resolved. The first occurrence wins and
/// document order is preserved.
///
/// # Arguments
///
/// * `document` - parsed HTML document to scan.
/// * `base_url` - base used to resolve relative image URLs, if known.
pub fn extract_images(document: &Html, base_url: Option<&Url>) -> Vec<ImageMedia> {
    /// De-duplication key of an image: absolute URL if known, raw `src` otherwise.
    fn dedup_key(img: &ImageMedia) -> &str {
        img.absolute_url.as_deref().unwrap_or(&img.src)
    }

    let mut images: Vec<ImageMedia> = Vec::new();
    let mut seen_urls: HashSet<String> = HashSet::new();

    // Pass 1: plain <img> tags.
    if let Ok(sel) = Selector::parse("img") {
        for el in document.select(&sel) {
            if let Some(img) = extract_image_element(&el, base_url) {
                if seen_urls.insert(dedup_key(&img).to_owned()) {
                    images.push(img);
                }
            }
        }
    }

    // Pass 2: <picture> elements. The fallback <img> was normally collected in
    // pass 1 already, so instead of dropping the picture result as a duplicate
    // (which would lose every <source> candidate) merge its srcset entries
    // into the image that is already in the list.
    if let Ok(sel) = Selector::parse("picture") {
        for el in document.select(&sel) {
            for picture_img in extract_picture_element(&el, base_url) {
                let key = dedup_key(&picture_img).to_owned();

                if !seen_urls.contains(&key) {
                    seen_urls.insert(key);
                    images.push(picture_img);
                    continue;
                }

                if let Some(existing) = images
                    .iter_mut()
                    .find(|known| dedup_key(known) == key.as_str())
                {
                    for entry in picture_img.srcset {
                        let duplicate = existing.srcset.iter().any(|e| {
                            e.url == entry.url
                                && e.width == entry.width
                                && e.density == entry.density
                        });
                        if !duplicate {
                            existing.srcset.push(entry);
                        }
                    }
                }
            }
        }
    }

    // Pass 3: use the figure caption as alt text for images that have none.
    if let Ok(sel) = Selector::parse("figure img") {
        for el in document.select(&sel) {
            if let Some(src) = get_image_src(&el) {
                let abs_url = resolve_url(&src, base_url);
                // Same key as the de-duplication passes. Comparing the two
                // `Option`s directly would make every image without an
                // absolute URL "equal" (`None == None`).
                let key = abs_url.as_deref().unwrap_or(&src);

                if let Some(img) = images.iter_mut().find(|known| dedup_key(known) == key) {
                    let lacks_alt = img.alt.as_deref().map_or(true, str::is_empty);
                    if lacks_alt {
                        // Only overwrite when a caption exists, so an explicit
                        // `alt=""` is not degraded to `None`.
                        if let Some(caption) = get_figcaption(&el) {
                            img.alt = Some(caption);
                        }
                    }
                }
            }
        }
    }

    images
}

/// Build an `ImageMedia` record from one `<img>` element.
///
/// Returns `None` when `get_image_src` finds no source for the element or
/// when that source is a `data:image/` URL. `size_bytes` and `content_hash`
/// are always left as `None` here.
fn extract_image_element(el: &ElementRef, base_url: Option<&Url>) -> Option<ImageMedia> {
    let node = el.value();

    let src = get_image_src(el)?;
    if DATA_URL.is_match(&src) {
        // Inline data URLs are never reported as images.
        return None;
    }

    let absolute_url = resolve_url(&src, base_url);

    // File extension: look at the resolved URL first, then at the raw source.
    let extension = absolute_url
        .as_ref()
        .and_then(|resolved| extract_extension(resolved))
        .or_else(|| extract_extension(&src));
    let format = match extension {
        Some(ext) => ImageFormat::from_extension(&ext),
        None => ImageFormat::Unknown,
    };
    let mime_type = Some(format.mime_type().to_string());

    // `width` / `height` attributes, tolerating a trailing "px".
    let width = node
        .attr("width")
        .and_then(|raw| raw.trim_end_matches("px").parse().ok());
    let height = node
        .attr("height")
        .and_then(|raw| raw.trim_end_matches("px").parse().ok());

    // An `alt` attribute that is present but empty marks the image as decorative.
    let alt = node.attr("alt").map(String::from);
    let is_decorative = alt.as_deref() == Some("");

    // Anything other than loading="lazy" counts as eager.
    let loading = if node.attr("loading") == Some("lazy") {
        ImageLoading::Lazy
    } else {
        ImageLoading::Eager
    };

    let srcset = match node.attr("srcset") {
        Some(raw) => parse_srcset(raw, base_url),
        None => Default::default(),
    };

    // First lazy-loading attribute present on the element, if any.
    let data_src = LAZY_ATTRS
        .iter()
        .find_map(|attr| node.attr(attr))
        .map(String::from);

    let is_placeholder = is_placeholder_image(&src, width, height);

    let classes: Vec<String> = node.classes().map(String::from).collect();
    let id = node.attr("id").map(String::from);
    let title = node.attr("title").map(String::from);
    let sizes = node.attr("sizes").map(String::from);

    Some(ImageMedia {
        src,
        absolute_url,
        alt,
        title,
        width,
        height,
        format,
        mime_type,
        loading,
        is_decorative,
        srcset,
        sizes,
        data_src,
        is_placeholder,
        size_bytes: None,
        content_hash: None,
        classes,
        id,
    })
}

/// Extract the image described by a `<picture>` element.
///
/// The first `<img>` inside the picture is converted with
/// `extract_image_element`; the `srcset` of every `<source>` child is then
/// parsed and appended to that image's srcset. The result holds at most one
/// image and is empty when the picture has no usable `<img>`.
fn extract_picture_element(picture: &ElementRef, base_url: Option<&Url>) -> Vec<ImageMedia> {
    let img_selector = match Selector::parse("img") {
        Ok(selector) => selector,
        Err(_) => return Vec::new(),
    };

    // The fallback <img> carries alt text, dimensions and the default source.
    let fallback_el = match picture.select(&img_selector).next() {
        Some(element) => element,
        None => return Vec::new(),
    };

    let mut image = match extract_image_element(&fallback_el, base_url) {
        Some(image) => image,
        None => return Vec::new(),
    };

    // Append the alternatives offered by the <source> children.
    if let Ok(source_selector) = Selector::parse("source") {
        for source in picture.select(&source_selector) {
            if let Some(raw) = source.value().attr("srcset") {
                image.srcset.extend(parse_srcset(raw, base_url));
            }
        }
    }

    vec![image]
}

/// Pick the source URL of an `<img>` element.
///
/// Priority:
///
/// 1. the `src` attribute, unless it is empty or a short (100 bytes or fewer)
///    inline `data:image/svg+xml` URL, which is typically a lazy-loading stub;
/// 2. the first attribute from `LAZY_ATTRS` that holds a non-blank value.
///
/// Lazy attributes whose name ends in `srcset` hold a candidate list rather
/// than a single URL, so only the URL of their first candidate is returned.
///
/// Returns `None` when neither step yields a value.
fn get_image_src(el: &ElementRef) -> Option<String> {
    let node = el.value();

    // Parenthesised explicitly: a long SVG data URL is still accepted, an
    // empty string never is.
    let usable_src = node.attr("src").filter(|s| {
        !s.is_empty() && (!s.starts_with("data:image/svg+xml") || s.len() > 100)
    });

    usable_src
        .or_else(|| {
            LAZY_ATTRS.iter().find_map(|name| {
                let value = node.attr(name)?.trim();
                let candidate = if name.ends_with("srcset") {
                    // "a.jpg 1x, b.jpg 2x" -> "a.jpg"
                    value.split(',').next()?.split_whitespace().next()?
                } else {
                    value
                };
                // A blank attribute must not shadow a later, usable one.
                if candidate.is_empty() {
                    None
                } else {
                    Some(candidate)
                }
            })
        })
        .map(|s| s.to_string())
}

/// Parse a `srcset` attribute value into its candidate entries.
///
/// The value is split on commas; in each candidate the first
/// whitespace-separated token is the URL and the second, if present, is the
/// descriptor (`<n>w` for a width, `<n>x` for a pixel density). URLs are
/// resolved with `resolve_url` and kept verbatim when resolution fails.
/// Candidates without any token are skipped, and a descriptor that does not
/// parse leaves both `width` and `density` unset.
fn parse_srcset(srcset: &str, base_url: Option<&Url>) -> Vec<SrcsetEntry> {
    srcset
        .split(',')
        .filter_map(|candidate| {
            let mut tokens = candidate.split_whitespace();
            let raw_url = tokens.next()?;
            let descriptor = tokens.next();

            let url = match resolve_url(raw_url, base_url) {
                Some(resolved) => resolved,
                None => raw_url.to_string(),
            };

            let (width, density) = match descriptor {
                Some(d) if d.ends_with('w') => (d.trim_end_matches('w').parse().ok(), None),
                Some(d) if d.ends_with('x') => (None, d.trim_end_matches('x').parse().ok()),
                _ => (None, None),
            };

            Some(SrcsetEntry {
                url,
                width,
                density,
            })
        })
        .collect()
}

/// Get figcaption text from parent figure
fn get_figcaption(img: &ElementRef) -> Option<String> {
    // Navigate up to find figure, then find figcaption
    if let Ok(_sel) = Selector::parse("figcaption") {
        // Check if there's a figure parent with figcaption
        let html = img.html();
        if html.contains("figure") {
            // Simple approach: parse the parent context
            // In practice, we'd need to traverse up the DOM
        }
    }
    None
}

/// Decide whether an image looks like a placeholder.
///
/// An image counts as a placeholder when any of these holds:
/// - `src` matches `PLACEHOLDER_PATTERN`;
/// - both `width` and `height` are known and neither exceeds 10;
/// - `src` contains the name of a known placeholder service.
fn is_placeholder_image(src: &str, width: Option<u32>, height: Option<u32>) -> bool {
    // Hosts of well-known placeholder image services.
    const PLACEHOLDER_HOSTS: [&str; 5] = [
        "placehold.it",
        "placeholder.com",
        "placekitten.com",
        "picsum.photos",
        "via.placeholder.com",
    ];

    // Tracking pixels and spacers: both dimensions must be present and tiny.
    let is_tiny = match (width, height) {
        (Some(w), Some(h)) => w <= 10 && h <= 10,
        _ => false,
    };

    PLACEHOLDER_PATTERN.is_match(src)
        || is_tiny
        || PLACEHOLDER_HOSTS.iter().any(|host| src.contains(host))
}

/// Extract the lower-cased file extension from a URL or path.
///
/// The query string and fragment are ignored, and so is a leading
/// `scheme://authority` or protocol-relative `//authority` part, so that a
/// host name such as `example.com` is never mistaken for a file called
/// `example` with extension `com`.
///
/// Returns `None` when the last path segment has no dot, or when the text
/// after the last dot is empty, longer than five bytes, or contains
/// non-alphanumeric characters.
fn extract_extension(url: &str) -> Option<String> {
    // Everything before the first '?' or '#'.
    let without_suffix = url.split(|c| c == '?' || c == '#').next()?;

    // Text following the authority marker, if the input has one. "://" only
    // counts as a scheme separator when no '/' precedes it.
    let after_authority_marker = if without_suffix.starts_with("//") {
        Some(&without_suffix[2..])
    } else {
        match without_suffix.find("://") {
            Some(idx) if !without_suffix[..idx].contains('/') => {
                Some(&without_suffix[idx + 3..])
            }
            _ => None,
        }
    };

    // Drop the authority itself; a URL with no path has no file name at all.
    let path = match after_authority_marker {
        Some(rest) => &rest[rest.find('/')?..],
        None => without_suffix,
    };

    let filename = path.rsplit('/').next()?;
    let dot = filename.rfind('.')?;
    let ext = &filename[dot + 1..];

    // Reject "image." (empty extension) and anything that does not look like
    // a short alphanumeric extension.
    if !ext.is_empty() && ext.len() <= 5 && ext.chars().all(|c| c.is_alphanumeric()) {
        Some(ext.to_lowercase())
    } else {
        None
    }
}

/// Resolve an image reference to an absolute URL string.
///
/// Surrounding whitespace is ignored. The rules, in order:
///
/// - a blank `href` resolves to `None` (it would otherwise resolve to the
///   base page itself);
/// - `http://`, `https://` and `data:` references are returned as they are;
/// - protocol-relative references (`//host/path`) get the scheme of
///   `base_url` when that scheme is `http`, and `https` in every other case,
///   including when no base is known;
/// - anything else is joined onto `base_url`; the result is `None` when there
///   is no base or the join fails.
fn resolve_url(href: &str, base_url: Option<&Url>) -> Option<String> {
    let href = href.trim();
    if href.is_empty() {
        return None;
    }

    let already_absolute = href.starts_with("http://")
        || href.starts_with("https://")
        || href.starts_with("data:");
    if already_absolute {
        return Some(href.to_string());
    }

    if href.starts_with("//") {
        // Follow the page's own scheme instead of always upgrading to https;
        // a plain-http origin may not serve the resource over TLS.
        let scheme = match base_url.map(|base| base.scheme()) {
            Some("http") => "http",
            _ => "https",
        };
        return Some(format!("{}:{}", scheme, href));
    }

    base_url.and_then(|base| base.join(href).ok().map(|u| u.to_string()))
}

// ============================================================================
// CONVENIENCE FUNCTIONS  
// ============================================================================

/// Parse an HTML string and extract every image it contains.
///
/// `base_url` is parsed as a URL when given; a value that fails to parse is
/// treated the same as no base URL. This function always returns `Ok`.
pub fn extract_images_from_html(html: &str, base_url: Option<&str>) -> MediaResult<Vec<ImageMedia>> {
    let parsed_base = match base_url {
        Some(raw) => Url::parse(raw).ok(),
        None => None,
    };

    let document = Html::parse_document(html);
    let images = extract_images(&document, parsed_base.as_ref());

    Ok(images)
}

/// Collect the absolute URLs of all images found in an HTML string.
///
/// Images whose URL could not be made absolute are left out, and an
/// extraction error yields an empty list.
pub fn get_image_urls(html: &str, base_url: Option<&str>) -> Vec<String> {
    match extract_images_from_html(html, base_url) {
        Ok(images) => images
            .into_iter()
            .filter_map(|image| image.absolute_url)
            .collect(),
        Err(_) => Vec::new(),
    }
}

/// Report whether the document contains at least one `<img>` or `<picture>` element.
///
/// Returns `false` if the selector cannot be built.
pub fn has_images(document: &Html) -> bool {
    match Selector::parse("img, picture") {
        Ok(selector) => document.select(&selector).next().is_some(),
        Err(_) => false,
    }
}

/// Count the `<img>` elements in the document.
///
/// `<picture>` elements are not counted on their own. Returns `0` if the
/// selector cannot be built.
pub fn count_images(document: &Html) -> usize {
    Selector::parse("img")
        .map(|selector| document.select(&selector).count())
        .unwrap_or(0)
}

/// Pick the URL of the highest-resolution variant of an image.
///
/// Preference order:
/// 1. the srcset entry with the largest width descriptor;
/// 2. otherwise the srcset entry with the largest density descriptor;
/// 3. otherwise the image's absolute URL, or its raw `src` if there is none.
pub fn get_best_image_url(img: &ImageMedia) -> &str {
    let widest = img
        .srcset
        .iter()
        .filter(|entry| entry.width.is_some())
        .max_by_key(|entry| entry.width);

    // Densities are floating point, so incomparable values are treated as equal.
    let best = widest.or_else(|| {
        img.srcset
            .iter()
            .filter(|entry| entry.density.is_some())
            .max_by(|a, b| {
                a.density
                    .partial_cmp(&b.density)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
    });

    match best {
        Some(entry) => entry.url.as_str(),
        None => img.absolute_url.as_deref().unwrap_or(&img.src),
    }
}

/// Drop every image flagged as a placeholder, keeping the rest in order.
pub fn filter_placeholders(images: Vec<ImageMedia>) -> Vec<ImageMedia> {
    let mut kept = images;
    kept.retain(|image| !image.is_placeholder);
    kept
}

/// Keep only images that are at least `min_width` by `min_height`.
///
/// A dimension that is unknown (`None`) never disqualifies an image.
pub fn filter_by_dimensions(images: Vec<ImageMedia>, min_width: u32, min_height: u32) -> Vec<ImageMedia> {
    let mut kept = images;
    kept.retain(|image| {
        let wide_enough = image.width.map_or(true, |w| w >= min_width);
        let tall_enough = image.height.map_or(true, |h| h >= min_height);
        wide_enough && tall_enough
    });
    kept
}

// ============================================================================
// TESTS
// ============================================================================

// Unit tests for the extraction entry points and the private parsing helpers
// of this module. Each test builds a small HTML snippet (or calls a helper
// directly) and checks the resulting fields.
#[cfg(test)]
mod tests {
    use super::*;

    // Shorthand for parsing a full HTML document from a string.
    fn parse_html(html: &str) -> Html {
        Html::parse_document(html)
    }

    // A root-relative `src` is kept verbatim and also resolved against the base URL.
    #[test]
    fn test_extract_basic_image() {
        let html = r#"<html><body><img src="/images/test.jpg" alt="Test image"></body></html>"#;
        let doc = parse_html(html);
        let base = Url::parse("https://example.com").unwrap();
        let images = extract_images(&doc, Some(&base));
        
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].src, "/images/test.jpg");
        assert_eq!(images[0].absolute_url, Some("https://example.com/images/test.jpg".to_string()));
        assert_eq!(images[0].alt, Some("Test image".to_string()));
    }

    // `loading="lazy"` is detected and the `data-src` attribute is recorded.
    #[test]
    fn test_extract_lazy_loaded_image() {
        let html = r#"<img src="placeholder.gif" data-src="/real-image.jpg" loading="lazy">"#;
        let doc = parse_html(html);
        let base = Url::parse("https://example.com").unwrap();
        let images = extract_images(&doc, Some(&base));
        
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].loading, ImageLoading::Lazy);
        assert!(images[0].data_src.is_some());
    }

    // Width descriptors in an `<img srcset>` are parsed in document order.
    #[test]
    fn test_extract_srcset() {
        let html = r#"
            <img src="small.jpg" 
                 srcset="small.jpg 300w, medium.jpg 600w, large.jpg 1200w"
                 sizes="(max-width: 600px) 300px, 600px">
        "#;
        let doc = parse_html(html);
        let images = extract_images(&doc, None);
        
        assert_eq!(images.len(), 1);
        assert_eq!(images[0].srcset.len(), 3);
        assert_eq!(images[0].srcset[0].width, Some(300));
        assert_eq!(images[0].srcset[2].width, Some(1200));
    }

    // An empty `alt` attribute marks the image as decorative.
    #[test]
    fn test_decorative_image() {
        let html = r#"<img src="spacer.gif" alt="">"#;
        let doc = parse_html(html);
        let images = extract_images(&doc, None);
        
        assert_eq!(images.len(), 1);
        assert!(images[0].is_decorative);
    }

    // Numeric `width` / `height` attributes are parsed into the image record.
    #[test]
    fn test_image_dimensions() {
        let html = r#"<img src="test.jpg" width="800" height="600">"#;
        let doc = parse_html(html);
        let images = extract_images(&doc, None);
        
        assert_eq!(images[0].width, Some(800));
        assert_eq!(images[0].height, Some(600));
    }

    // A `<picture>` with a fallback `<img>` yields at least one image.
    // NOTE(review): only non-emptiness is asserted; the test does not check
    // that the `<source>` candidates actually end up in `srcset`.
    #[test]
    fn test_picture_element() {
        let html = r#"
            <picture>
                <source srcset="image.webp" type="image/webp">
                <source srcset="image.jpg" type="image/jpeg">
                <img src="image.jpg" alt="Test">
            </picture>
        "#;
        let doc = parse_html(html);
        let images = extract_images(&doc, None);
        
        // Should extract the img and include sources in srcset
        assert!(!images.is_empty());
    }

    // Placeholder heuristics: URL keyword, known service host, tiny dimensions,
    // and a negative case for a normally sized image.
    #[test]
    fn test_placeholder_detection() {
        assert!(is_placeholder_image("https://example.com/placeholder.png", None, None));
        assert!(is_placeholder_image("https://placehold.it/100x100", None, None));
        assert!(is_placeholder_image("/spacer.gif", Some(1), Some(1)));
        assert!(!is_placeholder_image("/real-image.jpg", Some(800), Some(600)));
    }

    // Width descriptors; without a base URL the candidate URLs stay as written.
    #[test]
    fn test_parse_srcset() {
        let srcset = "small.jpg 300w, medium.jpg 600w, large.jpg 1200w";
        let entries = parse_srcset(srcset, None);
        
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].url, "small.jpg");
        assert_eq!(entries[0].width, Some(300));
        assert_eq!(entries[2].width, Some(1200));
    }

    // Density (`x`) descriptors are parsed as floating-point values.
    #[test]
    fn test_parse_srcset_density() {
        let srcset = "image.jpg 1x, image@2x.jpg 2x, image@3x.jpg 3x";
        let entries = parse_srcset(srcset, None);
        
        assert_eq!(entries.len(), 3);
        assert_eq!(entries[0].density, Some(1.0));
        assert_eq!(entries[1].density, Some(2.0));
    }

    // Extensions are lower-cased, query strings are ignored, and a file name
    // without a dot has no extension.
    #[test]
    fn test_extract_extension() {
        assert_eq!(extract_extension("https://example.com/image.jpg"), Some("jpg".to_string()));
        assert_eq!(extract_extension("https://example.com/image.PNG"), Some("png".to_string()));
        assert_eq!(extract_extension("https://example.com/image.jpg?w=100"), Some("jpg".to_string()));
        assert_eq!(extract_extension("https://example.com/image"), None);
    }

    // With width descriptors present, the widest srcset entry is chosen.
    #[test]
    fn test_get_best_image_url() {
        let img = ImageMedia {
            src: "small.jpg".to_string(),
            srcset: vec![
                SrcsetEntry { url: "small.jpg".to_string(), width: Some(300), density: None },
                SrcsetEntry { url: "large.jpg".to_string(), width: Some(1200), density: None },
            ],
            ..Default::default()
        };
        
        assert_eq!(get_best_image_url(&img), "large.jpg");
    }

    // `has_images` is true for a document with an `<img>` and false without one.
    #[test]
    fn test_has_images() {
        let html_with = "<html><body><img src='test.jpg'></body></html>";
        let html_without = "<html><body><p>No images</p></body></html>";
        
        assert!(has_images(&parse_html(html_with)));
        assert!(!has_images(&parse_html(html_without)));
    }

    // Images flagged as placeholders are removed; the others are kept.
    #[test]
    fn test_filter_placeholders() {
        let images = vec![
            ImageMedia { src: "real.jpg".to_string(), is_placeholder: false, ..Default::default() },
            ImageMedia { src: "placeholder.png".to_string(), is_placeholder: true, ..Default::default() },
        ];
        
        let filtered = filter_placeholders(images);
        assert_eq!(filtered.len(), 1);
        assert_eq!(filtered[0].src, "real.jpg");
    }

    // The image format is derived from the file extension of the source.
    #[test]
    fn test_image_format_detection() {
        let html = r#"<img src="test.webp">"#;
        let doc = parse_html(html);
        let images = extract_images(&doc, None);
        
        assert_eq!(images[0].format, ImageFormat::WebP);
    }
}