//! halldyll-core 0.1.0
//!
//! Core scraping engine for Halldyll — a high-performance async web scraper
//! for AI agents.
//!
//! Images — image extraction.

use regex::Regex;
use scraper::{Html, Selector};
use url::Url;

use crate::types::assets::{ImageAsset, ImageSourceType, SrcsetEntry};

/// Extracts image assets from an HTML document.
///
/// Sources covered: `<img>` tags, `<picture>/<source>` elements and,
/// optionally, `background-image` URLs found in inline `style` attributes.
pub struct ImageExtractor {
    /// Prefer lazy-loading attributes (`data-src`, `data-lazy`,
    /// `data-original`) over `src` when present.
    resolve_lazy: bool,
    /// Also scan inline `style` attributes for CSS `background-image` URLs
    /// (disabled by default).
    extract_css_backgrounds: bool,
}

impl Default for ImageExtractor {
    fn default() -> Self {
        Self {
            resolve_lazy: true,
            extract_css_backgrounds: false,
        }
    }
}

impl ImageExtractor {
    /// New extractor
    pub fn new() -> Self {
        Self::default()
    }

    /// Configure options
    pub fn with_options(mut self, resolve_lazy: bool, extract_css_backgrounds: bool) -> Self {
        self.resolve_lazy = resolve_lazy;
        self.extract_css_backgrounds = extract_css_backgrounds;
        self
    }

    /// Collects every image referenced by `html`, resolved against `base_url`.
    ///
    /// Gathers `<img>` tags, `<picture>/<source>` entries and — when enabled —
    /// inline CSS `background-image` URLs, then returns the list sorted by URL
    /// with duplicates removed.
    pub fn extract(&self, html: &str, base_url: &Url) -> Vec<ImageAsset> {
        let document = Html::parse_document(html);

        // 1. <img> tags
        let mut images = self.extract_img_tags(&document, base_url);

        // 2. <picture><source> entries
        images.extend(self.extract_picture_tags(&document, base_url));

        // 3. CSS background-image (optional)
        if self.extract_css_backgrounds {
            images.extend(self.extract_css_backgrounds_from_style(&document, base_url));
        }

        // Sorting groups identical URLs together so dedup_by can drop the
        // duplicates; the result is therefore ordered by URL, not by
        // document position.
        images.sort_by(|a, b| a.url.as_str().cmp(b.url.as_str()));
        images.dedup_by(|a, b| a.url == b.url);

        images
    }

    /// Extracts `<img>` tags as [`ImageAsset`]s.
    ///
    /// When `resolve_lazy` is set, lazy-loading attributes (`data-src`,
    /// `data-lazy`, `data-original`) take precedence over `src`. Empty
    /// sources and inline `data:` URIs are skipped, as are sources that do
    /// not resolve against `base_url`.
    fn extract_img_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
        let selector = Selector::parse("img").expect("static `img` selector is valid");
        let mut images = Vec::new();

        for img in document.select(&selector) {
            let attrs = img.value();

            // Main source
            let src = attrs.attr("src");

            // Lazy-loading sources, in order of prevalence.
            // (A previous `loading`-gated fallback arm was dead code: it
            // could only ever yield `data-src`, which the first arm in this
            // chain already returns.)
            let lazy_src = if self.resolve_lazy {
                attrs
                    .attr("data-src")
                    .or_else(|| attrs.attr("data-lazy"))
                    .or_else(|| attrs.attr("data-original"))
            } else {
                None
            };

            // Prefer the lazy source over `src`; skip empty sources,
            // data: URIs, and anything that fails to resolve.
            let url = match lazy_src.or(src) {
                Some(s) if !s.is_empty() && !s.starts_with("data:") => {
                    match base_url.join(s) {
                        Ok(u) => u,
                        Err(_) => continue,
                    }
                }
                _ => continue,
            };

            // Alt text
            let alt = attrs.attr("alt").map(String::from);

            // Declared dimensions (non-numeric values are ignored)
            let width = attrs.attr("width").and_then(|w| w.parse().ok());
            let height = attrs.attr("height").and_then(|h| h.parse().ok());

            // Responsive candidates
            let srcset = attrs.attr("srcset").map(|s| self.parse_srcset(s, base_url));

            // Record the lazy URL separately only when both a lazy attribute
            // and a regular `src` were present.
            // NOTE(review): in that case `lazy_src` duplicates `url` (the lazy
            // source already won above); kept as-is for callers inspecting it.
            let lazy_src_url = if lazy_src.is_some() && src.is_some() {
                lazy_src.and_then(|s| base_url.join(s).ok())
            } else {
                None
            };

            images.push(ImageAsset {
                url,
                alt,
                width,
                height,
                srcset,
                lazy_src: lazy_src_url,
                file_size: None,
                mime_type: None,
                source_type: ImageSourceType::Img,
            });
        }

        images
    }

    /// Extracts `<source>` elements nested in `<picture>` tags.
    ///
    /// The standard `srcset` attribute is read first, falling back to `src`.
    /// Elements with no parseable srcset entry are skipped; the first entry
    /// becomes the asset's primary URL. (The fallback `<img>` inside a
    /// `<picture>` is picked up separately by `extract_img_tags`.)
    fn extract_picture_tags(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
        let picture_selector = Selector::parse("picture").expect("static selector is valid");
        let source_selector = Selector::parse("source").expect("static selector is valid");
        let mut images = Vec::new();

        for picture in document.select(&picture_selector) {
            for source in picture.select(&source_selector) {
                let attrs = source.value();

                // srcset or src
                let srcset_str = match attrs.attr("srcset").or_else(|| attrs.attr("src")) {
                    Some(s) => s,
                    None => continue,
                };

                let srcset = self.parse_srcset(srcset_str, base_url);

                // First candidate becomes the main URL; `.first()` avoids the
                // panic path of indexing an empty Vec.
                let url = match srcset.first() {
                    Some(entry) => entry.url.clone(),
                    None => continue,
                };

                images.push(ImageAsset {
                    url,
                    alt: None,
                    width: None,
                    height: None,
                    srcset: Some(srcset),
                    lazy_src: None,
                    file_size: None,
                    mime_type: attrs.attr("type").map(String::from),
                    source_type: ImageSourceType::Picture,
                });
            }
        }

        images
    }

    /// Parses a `srcset` attribute into absolute-URL entries.
    ///
    /// Candidates are comma-separated; within each candidate the first token
    /// is the URL and the optional second token its width/density descriptor
    /// (defaulting to `"1x"` per the HTML spec). Candidates whose URL cannot
    /// be resolved against `base_url` are dropped.
    fn parse_srcset(&self, srcset: &str, base_url: &Url) -> Vec<SrcsetEntry> {
        let mut entries = Vec::new();

        for candidate in srcset.split(',') {
            let mut tokens = candidate.trim().split_whitespace();

            let raw_url = match tokens.next() {
                Some(u) => u,
                None => continue, // blank candidate (e.g. trailing comma)
            };

            if let Ok(url) = base_url.join(raw_url) {
                let descriptor = tokens
                    .next()
                    .map(str::to_string)
                    .unwrap_or_else(|| "1x".to_string());
                entries.push(SrcsetEntry { url, descriptor });
            }
        }

        entries
    }

    /// Extracts images referenced via `url(...)` in inline `style` attributes.
    ///
    /// Only inline styles are scanned — `<style>` blocks and external
    /// stylesheets are not. Inline `data:` URIs are skipped, as are URLs that
    /// do not resolve against `base_url`.
    fn extract_css_backgrounds_from_style(&self, document: &Html, base_url: &Url) -> Vec<ImageAsset> {
        // Matches `url(...)`, tolerating whitespace and optional quotes;
        // group 1 captures the bare URL.
        let url_regex = Regex::new(r#"url\s*\(\s*['"]?([^'")\s]+)['"]?\s*\)"#).unwrap();
        let styled_selector = Selector::parse("[style]").unwrap();
        let mut images = Vec::new();

        for element in document.select(&styled_selector) {
            let style = match element.value().attr("style") {
                Some(s) => s,
                None => continue,
            };

            for capture in url_regex.captures_iter(style) {
                let raw = match capture.get(1) {
                    Some(m) => m.as_str(),
                    None => continue,
                };

                // Inline data: URIs are not downloadable assets.
                if raw.starts_with("data:") {
                    continue;
                }

                if let Ok(url) = base_url.join(raw) {
                    images.push(ImageAsset {
                        url,
                        alt: None,
                        width: None,
                        height: None,
                        srcset: None,
                        lazy_src: None,
                        file_size: None,
                        mime_type: None,
                        source_type: ImageSourceType::CssBackground,
                    });
                }
            }
        }

        images
    }
}